In [140]:
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
import numpy as np
import plotly.express as px
import warnings
warnings.filterwarnings('ignore')
%matplotlib inline

In [141]:
# Read the CarDekho CSV, using its first column as the DataFrame index.
df = pd.read_csv(
    'cardekho_imputated.csv',
    index_col=0,
)

In [142]:
# Preview the first five rows of the loaded frame.
df.head(n=5)

Unnamed: 0,car_name,brand,model,vehicle_age,km_driven,seller_type,fuel_type,transmission_type,mileage,engine,max_power,seats,selling_price
0,Maruti Alto,Maruti,Alto,9,120000,Individual,Petrol,Manual,19.7,796,46.3,5,120000
1,Hyundai Grand,Hyundai,Grand,5,20000,Individual,Petrol,Manual,18.9,1197,82.0,5,550000
2,Hyundai i20,Hyundai,i20,11,60000,Individual,Petrol,Manual,17.0,1197,80.0,5,215000
3,Maruti Alto,Maruti,Alto,9,37000,Individual,Petrol,Manual,20.92,998,67.1,5,226000
4,Ford Ecosport,Ford,Ecosport,6,30000,Dealer,Diesel,Manual,22.77,1498,98.59,5,570000


#### Feature Engineering

In [143]:
# Count missing values per column (isna performs the same check as isnull).
df.isna().sum()

car_name             0
brand                0
model                0
vehicle_age          0
km_driven            0
seller_type          0
fuel_type            0
transmission_type    0
mileage              0
engine               0
max_power            0
seats                0
selling_price        0
dtype: int64

In [144]:
# Drop the 'brand' and 'car_name' columns; the 'model' column is left in place.
# Re-binding `df` (instead of inplace=True) together with errors='ignore' keeps this
# cell idempotent: re-running it after the columns are already gone no longer
# raises a KeyError.
df = df.drop(columns=['brand', 'car_name'], errors='ignore')
df.head()

Unnamed: 0,model,vehicle_age,km_driven,seller_type,fuel_type,transmission_type,mileage,engine,max_power,seats,selling_price
0,Alto,9,120000,Individual,Petrol,Manual,19.7,796,46.3,5,120000
1,Grand,5,20000,Individual,Petrol,Manual,18.9,1197,82.0,5,550000
2,i20,11,60000,Individual,Petrol,Manual,17.0,1197,80.0,5,215000
3,Alto,9,37000,Individual,Petrol,Manual,20.92,998,67.1,5,226000
4,Ecosport,6,30000,Dealer,Diesel,Manual,22.77,1498,98.59,5,570000


In [145]:
# Separate the predictors (every column except selling_price) from the target.
X = df.drop(columns=['selling_price'])
y = df.loc[:, 'selling_price']

In [146]:
# Number of rows per car model, most frequent first.
df.loc[:, 'model'].value_counts()

model
i20            906
Swift Dzire    890
Swift          781
Alto           778
City           757
              ... 
Ghibli           1
Altroz           1
GTC4Lusso        1
Aura             1
Gurkha           1
Name: count, Length: 120, dtype: int64

In [147]:
from sklearn.preprocessing import LabelEncoder

# Replace each distinct car-model name with an integer code.
# NOTE(review): scikit-learn documents LabelEncoder for target labels; OrdinalEncoder
# is the feature-side counterpart — consider switching.
le = LabelEncoder().fit(X['model'])
X['model'] = le.transform(X['model'])

In [148]:
# Split the columns by dtype: anything not stored as 'object' is treated as numeric.
num_feature = X.select_dtypes(exclude='object').columns
cat_feature = X.select_dtypes(include='object').columns

# The distinct-value count of each categorical column decides its encoding bucket:
# at most 10 levels -> one-hot, more than 10 -> label encoding.
unique_counts = {column: X[column].nunique() for column in cat_feature}
onehot_features = [column for column, count in unique_counts.items() if count <= 10]
LabelEncoder_features = [column for column, count in unique_counts.items() if count > 10]

for feature_group in (num_feature, onehot_features, LabelEncoder_features):
    print(feature_group)


Index(['model', 'vehicle_age', 'km_driven', 'mileage', 'engine', 'max_power',
       'seats'],
      dtype='object')
['seller_type', 'fuel_type', 'transmission_type']
[]


In [149]:
from sklearn.compose import ColumnTransformer
from sklearn.preprocessing import OneHotEncoder, StandardScaler

# One-hot encode the low-cardinality categoricals (dropping the first level of each)
# and standardise the numeric columns; any column not listed is passed through as-is.
ohe = OneHotEncoder(drop='first')
std_scaler = StandardScaler()

column_steps = [
    ('onehot', ohe, onehot_features),
    ('scaler', std_scaler, num_feature),
]
preprocessor = ColumnTransformer(transformers=column_steps, remainder='passthrough')


In [150]:
# Fit the ColumnTransformer on the feature frame and replace X with the transformed output.
# NOTE(review): this fit runs on the full dataset before the train/test split performed
# afterwards, so the scaler statistics and one-hot categories are learned from rows that
# later land in the test set — consider splitting first and fitting on the training rows only.
# NOTE(review): X is overwritten here, so this cell presumably cannot be re-run on its own
# without first re-running the cells that build X — TODO confirm and consider a separate name.
X=preprocessor.fit_transform(X)

In [151]:
from sklearn.model_selection import train_test_split

# Hold out 20% of the rows for testing; the fixed seed keeps the split repeatable.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.2,
    random_state=43,
)

In [152]:
from sklearn.ensemble import RandomForestRegressor
from sklearn.linear_model import LinearRegression,Ridge,Lasso
from sklearn.tree import DecisionTreeRegressor
from sklearn.neighbors import KNeighborsRegressor
from sklearn.metrics import mean_absolute_error,mean_squared_error,r2_score


In [153]:
def evaluate_model(true, pred):
    """Score regression predictions against the ground truth.

    Parameters
    ----------
    true : array-like
        Observed target values.
    pred : array-like
        Predicted target values, aligned with ``true``.

    Returns
    -------
    tuple
        ``(mae, mse, rmse, r2)`` — mean absolute error, mean squared error,
        the square root of the mean squared error, and the R² score.
    """
    squared_error = mean_squared_error(true, pred)
    return (
        mean_absolute_error(true, pred),
        squared_error,
        np.sqrt(squared_error),
        r2_score(true, pred),
    )

In [154]:
# Baseline regressors with default hyperparameters. The randomised tree-based models
# receive the same seed as the train/test split so their scores are reproducible
# from one run to the next.
models={
    'RandomForest':RandomForestRegressor(random_state=43),
    'LinearRegression':LinearRegression(),
    'Ridge':Ridge(),
    'Lasso':Lasso(),
    'DecisionTree':DecisionTreeRegressor(random_state=43),
    'KNeighbors':KNeighborsRegressor()
}

# Fit every model once and report the same metrics on the training and held-out sets.
# Iterating over items() avoids rebuilding the key/value lists on every pass.
for model_name,model in models.items():
    model.fit(X_train,y_train)
    y_train_pred=model.predict(X_train)
    y_test_pred=model.predict(X_test)
    mae_train,mse_train,rmse_train,r2_train=evaluate_model(y_train,y_train_pred)
    mae_test,mse_test,rmse_test,r2_test=evaluate_model(y_test,y_test_pred)
    print(f"{model_name} Results:")
    print(f"Train MAE: {mae_train}, MSE: {mse_train}, RMSE: {rmse_train}, R2: {r2_train}")
    print('----------------------------------')
    print(f"Test MAE: {mae_test}, MSE: {mse_test}, RMSE: {rmse_test}, R2: {r2_test}")
    print('--------------------------------------------------------------------------')


RandomForest Results:
Train MAE: 39024.410135373044, MSE: 11727874633.10331, RMSE: 108295.31214740235, R2: 0.9848256709655601
----------------------------------
Test MAE: 100413.3770548207, MSE: 96964748507.52708, RMSE: 311391.63204480475, R2: 0.8929193150587623
--------------------------------------------------------------------------
LinearRegression Results:
Train MAE: 265287.33444572956, MSE: 288921093492.94214, RMSE: 537513.8077230594, R2: 0.62617406181362
----------------------------------
Test MAE: 269560.96072889527, MSE: 325279362843.6122, RMSE: 570332.6773415777, R2: 0.6407855689138456
--------------------------------------------------------------------------
Ridge Results:
Train MAE: 265253.2112325706, MSE: 288921380074.72345, RMSE: 537514.0743038488, R2: 0.6261736910144444
----------------------------------
Test MAE: 269536.4101263026, MSE: 325285660058.4738, RMSE: 570338.1979654473, R2: 0.6407786147362606
--------------------------------------------------------------------

In [155]:
# Candidate neighbourhood sizes for the KNN regressor.
knn_params={
    'n_neighbors':[3,5,7,9,11]
}
# Search space for the random forest.
# max_features: 'auto' was deprecated in scikit-learn 1.1 and removed in 1.3, where it
# makes every candidate using it fail to fit. For RandomForestRegressor it meant
# "use all features", which the float 1.0 expresses on every scikit-learn version.
rf_params={
    'n_estimators':[100,200,300],
    'max_depth':[None,10,20,30],
    'min_samples_split':[2,5,10,15],
    'max_features':[5,7,1.0,8]
}

In [156]:
from sklearn.model_selection import RandomizedSearchCV

# (display name, estimator, search space) for every model to tune.
# The forest is seeded so that the selected hyperparameters do not change between
# runs; the search's own random_state only fixes which candidates are sampled.
randomcv_models=[
    ('KNN',KNeighborsRegressor(),knn_params),
    ('RF',RandomForestRegressor(random_state=43),rf_params)
]

# Best hyperparameters per model name, kept so they can be reused programmatically
# instead of being copied by hand from the printed log.
best_params_by_model={}

for name,model,params in randomcv_models:
    print(f"Tuning hyperparameters for {name}")
    # n_iter is an upper bound: for a grid smaller than n_iter, the captured log shows
    # scikit-learn evaluating every combination once (5 candidates for KNN).
    random_search=RandomizedSearchCV(estimator=model,
                                     param_distributions=params,
                                     n_iter=100,
                                     scoring='neg_mean_squared_error',
                                     cv=5,
                                     verbose=2,
                                     n_jobs=-1,
                                     random_state=43)
    random_search.fit(X_train,y_train)
    best_params_by_model[name]=random_search.best_params_
    print(f"Best parameters for {name}: {random_search.best_params_}")
    # Score the refitted best estimator on the held-out test set.
    best_model=random_search.best_estimator_
    y_test_pred=best_model.predict(X_test)
    mae_test,mse_test,rmse_test,r2_test=evaluate_model(y_test,y_test_pred)
    print(f"Test MAE: {mae_test}, MSE: {mse_test}, RMSE: {rmse_test}, R2: {r2_test}")
    print('--------------------------------------------------------------------------')

Tuning hyperparameters for KNN
Fitting 5 folds for each of 5 candidates, totalling 25 fits
[CV] END ......................................n_neighbors=3; total time=   0.0s
[CV] END ......................................n_neighbors=3; total time=   0.0s
[CV] END ......................................n_neighbors=5; total time=   0.0s
[CV] END ......................................n_neighbors=5; total time=   0.0s
[CV] END ......................................n_neighbors=3; total time=   0.0s
[CV] END ......................................n_neighbors=3; total time=   0.0s
[CV] END ......................................n_neighbors=3; total time=   0.0s
[CV] END ......................................n_neighbors=5; total time=   0.0s
[CV] END ......................................n_neighbors=7; total time=   0.0s
[CV] END ......................................n_neighbors=7; total time=   0.0s
[CV] END ......................................n_neighbors=5; total time=   0.0s
[CV] END .........

In [139]:
# Refit the two tuned models with fixed hyperparameters.
# NOTE(review): these values are hard-coded — presumably copied from the randomized
# search output; confirm they still match the latest search before relying on them.
# The forest is seeded (same seed as the train/test split) so the reported scores
# are reproducible.
models={
    'RandomForest':RandomForestRegressor(n_estimators= 300, min_samples_split= 2, max_features= 8, max_depth= 20, random_state=43),
    'KNeighbors':KNeighborsRegressor(n_neighbors=3)
}

# Fit each tuned model and report the same metrics on the training and held-out sets.
# Iterating over items() avoids rebuilding the key/value lists on every pass.
for model_name,model in models.items():
    model.fit(X_train,y_train)
    y_train_pred=model.predict(X_train)
    y_test_pred=model.predict(X_test)
    mae_train,mse_train,rmse_train,r2_train=evaluate_model(y_train,y_train_pred)
    mae_test,mse_test,rmse_test,r2_test=evaluate_model(y_test,y_test_pred)
    print(f"{model_name} Results After Hyperparameter Tuning:")
    print(f"Train MAE: {mae_train}, MSE: {mse_train}, RMSE: {rmse_train}, R2: {r2_train}")
    print('----------------------------------')
    print(f"Test MAE: {mae_test}, MSE: {mse_test}, RMSE: {rmse_test}, R2: {r2_test}")
    print('--------------------------------------------------------------------------')

RandomForest Results After Hyperparameter Tuning:
Train MAE: 40942.37506383795, MSE: 14910294908.001665, RMSE: 122107.71846202707, R2: 0.9807080372179351
----------------------------------
Test MAE: 97549.98253991798, MSE: 87288301218.93607, RMSE: 295445.9362031166, R2: 0.9036052665969099
--------------------------------------------------------------------------
KNeighbors Results After Hyperparameter Tuning:
Train MAE: 76615.7324789098, MSE: 56772712664.87986, RMSE: 238270.251321645, R2: 0.926543568284491
----------------------------------
Test MAE: 116919.66699102605, MSE: 189020772479.9077, RMSE: 434765.19235089154, R2: 0.7912594618476305
--------------------------------------------------------------------------
