In [1]:
import pandas as pd
import numpy as np
import xgboost as xgb
import seaborn as sb
import matplotlib.pyplot as plt

In [2]:
import os
for dirname, _, filenames in os.walk('/kaggle/input'):
    for filename in filenames:
        print(os.path.join(dirname, filename))

df = pd.read_csv("/kaggle/input/usa-cers-dataset/USA_cars_datasets.csv")
df.head()

/kaggle/input/usa-cers-dataset/USA_cars_datasets.csv


Unnamed: 0.1,Unnamed: 0,price,brand,model,year,title_status,mileage,color,vin,lot,state,country,condition
0,0,6300,toyota,cruiser,2008,clean vehicle,274117.0,black,jtezu11f88k007763,159348797,new jersey,usa,10 days left
1,1,2899,ford,se,2011,clean vehicle,190552.0,silver,2fmdk3gc4bbb02217,166951262,tennessee,usa,6 days left
2,2,5350,dodge,mpv,2018,clean vehicle,39590.0,silver,3c4pdcgg5jt346413,167655728,georgia,usa,2 days left
3,3,25000,ford,door,2014,clean vehicle,64146.0,blue,1ftfw1et4efc23745,167753855,virginia,usa,22 hours left
4,4,27700,chevrolet,1500,2018,clean vehicle,6654.0,red,3gcpcrec2jg473991,167763266,florida,usa,22 hours left


In [3]:
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 2499 entries, 0 to 2498
Data columns (total 13 columns):
 #   Column        Non-Null Count  Dtype  
---  ------        --------------  -----  
 0   Unnamed: 0    2499 non-null   int64  
 1   price         2499 non-null   int64  
 2   brand         2499 non-null   object 
 3   model         2499 non-null   object 
 4   year          2499 non-null   int64  
 5   title_status  2499 non-null   object 
 6   mileage       2499 non-null   float64
 7   color         2499 non-null   object 
 8   vin           2499 non-null   object 
 9   lot           2499 non-null   int64  
 10  state         2499 non-null   object 
 11  country       2499 non-null   object 
 12  condition     2499 non-null   object 
dtypes: float64(1), int64(4), object(8)
memory usage: 253.9+ KB


In [4]:
df.isnull().sum()
df["brand"].value_counts()

ford               1235
dodge               432
nissan              312
chevrolet           297
gmc                  42
jeep                 30
chrysler             18
bmw                  17
hyundai              15
kia                  13
buick                13
infiniti             12
honda                12
cadillac             10
mercedes-benz        10
heartland             5
land                  4
peterbilt             4
audi                  4
acura                 3
lincoln               2
lexus                 2
mazda                 2
maserati              1
toyota                1
harley-davidson       1
jaguar                1
ram                   1
Name: brand, dtype: int64

In [5]:
#dropping columns 
df.drop(["Unnamed: 0","vin", "lot" ], axis=1, inplace= True)

In [6]:
df.head()

Unnamed: 0,price,brand,model,year,title_status,mileage,color,state,country,condition
0,6300,toyota,cruiser,2008,clean vehicle,274117.0,black,new jersey,usa,10 days left
1,2899,ford,se,2011,clean vehicle,190552.0,silver,tennessee,usa,6 days left
2,5350,dodge,mpv,2018,clean vehicle,39590.0,silver,georgia,usa,2 days left
3,25000,ford,door,2014,clean vehicle,64146.0,blue,virginia,usa,22 hours left
4,27700,chevrolet,1500,2018,clean vehicle,6654.0,red,florida,usa,22 hours left


In [7]:
#convert column condition to minutes
df['value']= df['condition'] .str.split(' ').str[0]
df['days']= df['condition'] .str.split(' ').str[1]

def days_to_min_converter(time):
    return int(time)*1440

def hours_to_min_converter(time):
    return int(time)*60

temp_data=pd.concat([df[df['days']=='days']['value'].apply(days_to_min_converter),
           df[df['days']=='hours']['value'].apply(hours_to_min_converter),
           df[df['days']=='minutes']['value'].astype(int)]).rename('Minutes_Left',inplace=True)



df=pd.concat([df,temp_data],axis=1)
df['Minutes_Left'].fillna(-200,inplace=True)

df.drop(['condition','value','days'],axis=1,inplace=True)

In [8]:
#label encoding 
from sklearn.preprocessing import LabelEncoder 
le = LabelEncoder ()

#select ctegorical columns 
cat_df = df.select_dtypes(exclude=["int", "float"])

for i in cat_df:
    cat_df[i] = le.fit_transform(df[i])

#joining the data to the numeric data
num_df = df.select_dtypes(include=['int', 'float'])
main_df = pd.concat([num_df, cat_df], axis=1)

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  if __name__ == "__main__":


In [9]:
X = main_df.drop(columns=["price"])
y = main_df["price"]

#standard scaler
from sklearn.preprocessing import StandardScaler 
scaler = StandardScaler()
x_scaled=scaler.fit_transform(X) 

# Predicting price 


In [10]:
from sklearn.model_selection import train_test_split 
from sklearn.metrics import mean_squared_error, r2_score
from sklearn.ensemble import RandomForestRegressor
from sklearn.neighbors import KNeighborsRegressor
from sklearn.svm import SVR

#split the data
X_train, X_test, y_train, y_test = train_test_split(x_scaled ,y, test_size=0.2, random_state=0)

#create function to fit models
model_preds = []

def fit_model(model, model_name):
    model.fit(X_train, y_train)
    y_pred = model.predict(X_test)
    r2 = r2_score(y_test, y_pred)
    mse = mean_squared_error(y_test, y_pred)
    model_preds.append([model_name, r2, mse])
    print ("The Root mean squared error is: ", r2)
    print("The root mean squared error is: ", mse)

#model evaluation function
def model_eval():
    preds = pd.DataFrame(model_preds)
    preds.columns = ["Model Name", "R2 Score", "Mean Squared Error"]
    return preds.sort_values(by="R2 Score", ascending=False)

#### Linear Regression

In [11]:
from sklearn .linear_model import LinearRegression 
model = LinearRegression()
model.fit(X_train, y_train)

fit_model(model, "Linear Regression")

The Root mean squared error is:  0.32585116179878126
The root mean squared error is:  87723106.95110983


#### XgBoost

In [12]:
xg_model = xgb.XGBRFRegressor()
fit_model(xg_model, "XG Boost")

The Root mean squared error is:  0.5220227550991472
The root mean squared error is:  62196426.958937


#### Random forest

In [13]:
rf_model = RandomForestRegressor()
fit_model(rf_model, "Random Forest Regressor")

The Root mean squared error is:  0.6861387721252841
The root mean squared error is:  40840954.54962781


#### KNN

In [14]:

knn_model = KNeighborsRegressor(n_neighbors=6)
fit_model(knn_model, "K-Neigbors Regressor")

The Root mean squared error is:  0.4984015207426775
The root mean squared error is:  65270122.18816666


#### Support Vector Regressor

In [15]:
svr_model = SVR()
fit_model(svr_model, "Support Vector Regressor")

The Root mean squared error is:  -0.00588684366125003
The root mean squared error is:  130890263.64363852


### Model Evaluation

In [16]:
model_eval()

Unnamed: 0,Model Name,R2 Score,Mean Squared Error
2,Random Forest Regressor,0.686139,40840950.0
1,XG Boost,0.522023,62196430.0
3,K-Neigbors Regressor,0.498402,65270120.0
0,Linear Regression,0.325851,87723110.0
4,Support Vector Regressor,-0.005887,130890300.0
