In [127]:
import pandas as pd
import matplotlib.pyplot as plt 
import seaborn as sns 
import numpy as np 
from ydata_profiling import ProfileReport


In [128]:
df = pd.read_csv('USA_cars_datasets.csv')

profile = ProfileReport(df)
profile

Summarize dataset:   0%|          | 0/5 [00:00<?, ?it/s]

Generate report structure:   0%|          | 0/1 [00:00<?, ?it/s]

Render HTML:   0%|          | 0/1 [00:00<?, ?it/s]



In [129]:
df['vehicle_name'] = df['brand'] + " " + df['model']

In [130]:
df['vehicle_name'] = df['brand'] + " " + df['model']
cols = ['price','vehicle_name','year','title_status','mileage','color','state','country'] # Cols to choose
df = df[cols]
df

Unnamed: 0,price,vehicle_name,year,title_status,mileage,color,state,country
0,6300,toyota cruiser,2008,clean vehicle,274117.0,black,new jersey,usa
1,2899,ford se,2011,clean vehicle,190552.0,silver,tennessee,usa
2,5350,dodge mpv,2018,clean vehicle,39590.0,silver,georgia,usa
3,25000,ford door,2014,clean vehicle,64146.0,blue,virginia,usa
4,27700,chevrolet 1500,2018,clean vehicle,6654.0,red,florida,usa
...,...,...,...,...,...,...,...,...
2494,7800,nissan versa,2019,clean vehicle,23609.0,red,california,usa
2495,9200,nissan versa,2018,clean vehicle,34553.0,silver,florida,usa
2496,9200,nissan versa,2018,clean vehicle,31594.0,silver,florida,usa
2497,9200,nissan versa,2018,clean vehicle,32557.0,black,florida,usa


In [131]:
df['price'].to_numpy()

array([6300, 2899, 5350, ..., 9200, 9200, 9200], dtype=int64)

In [132]:
from sklearn.preprocessing import MinMaxScaler

# We will pick rows with prices with more 2000
df = df[df['price']>=2000]
df['price'].describe()
normal = MinMaxScaler()
df['mileage'] = normal.fit_transform(df['mileage'].to_numpy().reshape(-1,1))

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df['mileage'] = normal.fit_transform(df['mileage'].to_numpy().reshape(-1,1))


In [133]:
color_price = df.groupby('color')['price'].mean().reset_index()
sns.barplot(data=color_price, x='color', y='price')

<Axes: xlabel='color', ylabel='price'>

In [137]:
from sklearn.model_selection import train_test_split 
X = df.drop('price',axis=1)
y = df['price']
X_train, X_test, y_train, y_test = train_test_split(X,y,test_size=0.33, random_state=42)

In [135]:
# from sklearn.preprocessing import OneHotEncoder

# enc = OneHotEncoder(handle_unknown='ignore')
# X_train = enc.fit_transform(X_train)
# X_test = enc.transform(X_test)

In [138]:
from sklearn.preprocessing import OrdinalEncoder

enc = OrdinalEncoder()
enc.fit(pd.concat([X_test,X_train]))
X_train = enc.transform(X_train)
X_test = enc.transform(X_test)

In [139]:
from sklearn.linear_model import LinearRegression
from sklearn.metrics import mean_absolute_error, mean_squared_error

lr = LinearRegression().fit(X_train,y_train)
y_pred = lr.predict(X_test)
MAE = mean_absolute_error(y_test,y_pred)
MSE = mean_squared_error(y_test,y_pred)
print('MAE: ',MAE)
print('MSE: ',MSE)

MAE:  7558.4791241773255
MSE:  97703008.16371584


In [140]:
from xgboost import XGBRegressor

xgb = XGBRegressor()
xgb.fit(X_train,y_train)
y_pred = xgb.predict(X_test)
MAE = mean_absolute_error(y_test,y_pred)
MSE = mean_squared_error(y_test,y_pred)
print('MAE: ',MAE)
print('MSE: ',MSE)

MAE:  4913.789133215254
MSE:  56436670.76800619


In [141]:
import lightgbm as lgb
params = {"boosting_type": "gbdt", "objective": "regression", "metric": "mae", 
          "learning_rate": 0.1, "num_leaves": 16, "n_estimators": 100000, "random_state": 0, "importance_type": "gain", 
          "early_stopping_rounds": 100, "verbose": 10}

cat_cols = ['brand','model','title_status','color','state','country']
model = lgb.LGBMRegressor(**params)
model.fit(X_train, y_train, eval_set=[(X_train, y_train), (X_test, y_test)])
y_pred = model.predict(X_test)
MAE = mean_absolute_error(y_test,y_pred)
MSE = mean_squared_error(y_test,y_pred)
print('MAE: ',MAE)
print('MSE: ',MSE)


[LightGBM] [Debug] Dataset::GetMultiBinFromAllFeatures: sparse rate 0.164782
[LightGBM] [Debug] init for col-wise cost 0.000006 seconds, init for row-wise cost 0.000131 seconds
[LightGBM] [Info] Auto-choosing col-wise multi-threading, the overhead of testing was 0.000174 seconds.
You can set `force_col_wise=true` to remove the overhead.
[LightGBM] [Info] Total Bins 428
[LightGBM] [Info] Number of data points in the train set: 1592, number of used features: 6
[LightGBM] [Info] Start training from score 19830.623116
[LightGBM] [Debug] Trained a tree with leaves = 16 and depth = 7
Training until validation scores don't improve for 100 rounds
[LightGBM] [Debug] Trained a tree with leaves = 16 and depth = 7
[LightGBM] [Debug] Trained a tree with leaves = 16 and depth = 8
[LightGBM] [Debug] Trained a tree with leaves = 16 and depth = 8
[LightGBM] [Debug] Trained a tree with leaves = 16 and depth = 7
[LightGBM] [Debug] Trained a tree with leaves = 16 and depth = 7
[LightGBM] [Debug] Trained a