In [172]:
import pandas as pd
import numpy as np
import xgboost as xgb
import seaborn as sb
import matplotlib.pyplot as plt

In [173]:
# Load the raw car listings from the CSV next to the notebook and preview
# the first rows.
df = pd.read_csv(filepath_or_buffer="USA_cars_datasets.csv")
df.head(n=5)

Unnamed: 0.1,Unnamed: 0,price,brand,model,year,title_status,mileage,color,vin,lot,state,country,condition
0,0,6300,toyota,cruiser,2008,clean vehicle,274117.0,black,jtezu11f88k007763,159348797,new jersey,usa,10 days left
1,1,2899,ford,se,2011,clean vehicle,190552.0,silver,2fmdk3gc4bbb02217,166951262,tennessee,usa,6 days left
2,2,5350,dodge,mpv,2018,clean vehicle,39590.0,silver,3c4pdcgg5jt346413,167655728,georgia,usa,2 days left
3,3,25000,ford,door,2014,clean vehicle,64146.0,blue,1ftfw1et4efc23745,167753855,virginia,usa,22 hours left
4,4,27700,chevrolet,1500,2018,clean vehicle,6654.0,red,3gcpcrec2jg473991,167763266,florida,usa,22 hours left


In [174]:
# Print the column names, dtypes and non-null counts of the loaded frame.
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 2499 entries, 0 to 2498
Data columns (total 13 columns):
 #   Column        Non-Null Count  Dtype  
---  ------        --------------  -----  
 0   Unnamed: 0    2499 non-null   int64  
 1   price         2499 non-null   int64  
 2   brand         2499 non-null   object 
 3   model         2499 non-null   object 
 4   year          2499 non-null   int64  
 5   title_status  2499 non-null   object 
 6   mileage       2499 non-null   float64
 7   color         2499 non-null   object 
 8   vin           2499 non-null   object 
 9   lot           2499 non-null   int64  
 10  state         2499 non-null   object 
 11  country       2499 non-null   object 
 12  condition     2499 non-null   object 
dtypes: float64(1), int64(4), object(8)
memory usage: 253.9+ KB


In [175]:
# Only the last expression of a cell is rendered automatically, so the
# per-column null counts are displayed explicitly; otherwise their result
# would be computed and silently thrown away.
display(df.isnull().sum())

# Number of listings per brand (rendered as the cell's output).
df["brand"].value_counts()

ford               1235
dodge               432
nissan              312
chevrolet           297
gmc                  42
jeep                 30
chrysler             18
bmw                  17
hyundai              15
kia                  13
buick                13
infiniti             12
honda                12
cadillac             10
mercedes-benz        10
heartland             5
land                  4
peterbilt             4
audi                  4
acura                 3
lincoln               2
lexus                 2
mazda                 2
maserati              1
toyota                1
harley-davidson       1
jaguar                1
ram                   1
Name: brand, dtype: int64

In [176]:
# Remove the leftover index column and the per-listing identifiers
# (vin, lot) from the frame, in place.
columns_to_drop = ["Unnamed: 0", "vin", "lot"]
df.drop(columns=columns_to_drop, inplace=True)

In [177]:
# Preview the first rows of the frame in its current state.
df.head()

Unnamed: 0,price,brand,model,year,title_status,mileage,color,state,country,condition
0,6300,toyota,cruiser,2008,clean vehicle,274117.0,black,new jersey,usa,10 days left
1,2899,ford,se,2011,clean vehicle,190552.0,silver,tennessee,usa,6 days left
2,5350,dodge,mpv,2018,clean vehicle,39590.0,silver,georgia,usa,2 days left
3,25000,ford,door,2014,clean vehicle,64146.0,blue,virginia,usa,22 hours left
4,27700,chevrolet,1500,2018,clean vehicle,6654.0,red,florida,usa,22 hours left


In [178]:
# Convert column `condition` to minutes.
# First step: split the text on spaces once, then keep the first token as
# the amount ('value') and the second token as the time unit ('days').
condition_tokens = df['condition'].str.split(' ')
df['value'] = condition_tokens.str[0]
df['days'] = condition_tokens.str[1]

def days_to_min_converter(time):
    """Return the number of minutes in `time` days.

    `time` is passed through int(), so an integer or a string of digits
    is accepted.
    """
    minutes_per_day = 24 * 60
    whole_days = int(time)
    return whole_days * minutes_per_day

def hours_to_min_converter(time):
    """Return the number of minutes in `time` hours.

    `time` is passed through int(), so an integer or a string of digits
    is accepted.
    """
    minutes_per_hour = 60
    whole_hours = int(time)
    return whole_hours * minutes_per_hour

# Build one Series of minutes-left values, one group per time unit.
# .loc selects rows and column in a single step (no chained indexing), and
# rename() is used for its return value rather than with inplace=True.
temp_data = pd.concat([
    df.loc[df['days'] == 'days', 'value'].apply(days_to_min_converter),
    df.loc[df['days'] == 'hours', 'value'].apply(hours_to_min_converter),
    df.loc[df['days'] == 'minutes', 'value'].astype(int),
]).rename('Minutes_Left')

# Align on the index: rows whose unit is none of days/hours/minutes have no
# entry in temp_data and therefore come out as NaN here.
df = pd.concat([df, temp_data], axis=1)

# Fill those rows with the -200 sentinel. The result is assigned back to the
# column: calling fillna(inplace=True) on df['Minutes_Left'] is a chained
# in-place update that does not reach df under pandas Copy-on-Write.
df['Minutes_Left'] = df['Minutes_Left'].fillna(-200)

# The helper columns and the raw text column are no longer needed.
df = df.drop(columns=['condition', 'value', 'days'])

In [179]:
#label encoding
from sklearn.preprocessing import LabelEncoder
le = LabelEncoder()

# Select the categorical (non-numeric) columns.
# "number" matches every numeric dtype regardless of bit width, so a numeric
# column can never end up in the label-encoded set. The .copy() makes cat_df
# an independent frame, so the assignments below do not write into a
# selection of df.
cat_df = df.select_dtypes(exclude="number").copy()

# fit_transform refits the encoder for each column, replacing its values by
# integer codes.
for col in cat_df.columns:
    cat_df[col] = le.fit_transform(cat_df[col])

# Join the encoded columns back onto the numeric columns.
num_df = df.select_dtypes(include="number")
main_df = pd.concat([num_df, cat_df], axis=1)

In [180]:
# Target is the listing price; every other column is a predictor.
y = main_df["price"]
X = main_df.drop(columns=["price"])

#standard scaler
from sklearn.preprocessing import StandardScaler
scaler = StandardScaler()
# Learn the scaling parameters from X, then apply them to X.
# NOTE(review): the scaler is fit on all rows here, i.e. before any
# train/test split is made - consider fitting on the training rows only.
x_scaled = scaler.fit(X).transform(X)

# Predicting price 

#### Linear Regression

In [181]:
from sklearn.model_selection import train_test_split 
from sklearn.metrics import mean_squared_error, r2_score
# Hold out 20% of the rows for testing; random_state pins the shuffle so the
# same rows are selected on every run.
X_train, X_test, y_train, y_test = train_test_split(
    x_scaled,
    y,
    test_size=0.2,
    random_state=0,
)

#create function to fit models

def fit_model(mod):
    """Fit `mod` on the training split and print its test-set R^2 and MSE.

    Reads the notebook-level X_train, y_train, X_test and y_test.

    Parameters
    ----------
    mod : estimator
        Object exposing fit(X, y) and predict(X).

    Returns
    -------
    None
        The two metrics are printed, not returned.
    """
    mod.fit(X_train, y_train)
    y_pred = mod.predict(X_test)
    r2 = r2_score(y_test, y_pred)
    mse = mean_squared_error(y_test, y_pred)
    # Each label names the metric that is actually computed: r2_score gives
    # R^2 and mean_squared_error gives the (un-rooted) MSE.
    print("The R^2 score is: ", r2)
    print("The mean squared error is: ", mse)


In [186]:
from sklearn.linear_model import LinearRegression

# Create a single LinearRegression instance. It is not fitted here because
# fit_model() fits it on the training data before scoring it on the test data.
model = LinearRegression()

fit_model(model)

The Root mean squared error is:  0.32585116179878126
The root mean squared error is:  87723106.95110983


#### XGBoost

In [183]:
# NOTE(review): XGBRFRegressor is xgboost's random-forest-style regressor;
# if plain gradient boosting was intended, XGBRegressor is the class to use
# - confirm.
xg_model = xgb.XGBRFRegressor()
xg_model.fit(X_train, y_train)

xg_pred = xg_model.predict(X_test)

# r2_score returns the coefficient of determination (R^2), not an error,
# so the variable and the printed label say so.
xg_r2 = r2_score(y_test, xg_pred)
xg_mse = xg_r2  # previous (misleading) name, kept in case other cells read it
print("The R^2 score is: ", xg_r2)


The Root mean squared error is:  0.5220227550991472


#### Random forest

In [184]:
from sklearn.ensemble import RandomForestRegressor

# random_state is fixed so the forest, and therefore the score below, is the
# same on every run.
rf_model = RandomForestRegressor(random_state=0)

rf_model.fit(X_train, y_train)

rf_pred = rf_model.predict(X_test)

# r2_score returns the coefficient of determination (R^2), not an error,
# so the variable and the printed label say so.
rf_r2 = r2_score(y_test, rf_pred)
rf_mse = rf_r2  # previous (misleading) name, kept in case other cells read it
print("The R^2 score is: ", rf_r2)

The Root mean squared error is:  0.6735858249317579


#### KNN

In [185]:
from sklearn.neighbors import KNeighborsRegressor

# k-nearest-neighbours regressor using the 6 closest training rows.
knn_model = KNeighborsRegressor(n_neighbors=6)

knn_model.fit(X_train, y_train)

knn_pred = knn_model.predict(X_test)

# r2_score returns the coefficient of determination (R^2), not an error,
# so the variable and the printed label say so.
knn_r2 = r2_score(y_test, knn_pred)
knn_mse = knn_r2  # previous (misleading) name, kept in case other cells read it
print("The R^2 score is: ", knn_r2)

The Root mean squared error is:  0.4984015207426775
