In [202]:
import pandas as pd
import numpy as np
import seaborn as sns
import matplotlib.pyplot as plt
import os
import urllib
import warnings

warnings.filterwarnings('ignore')

%matplotlib inline

In [203]:
import urllib.request


download_dir = './data/'
download_url = 'https://raw.githubusercontent.com/aravind9722/datasets-for-ML-projects/main/cardekho_dataset.csv'
os.makedirs(download_dir, exist_ok=True)
filename = os.path.basename(download_url)
download_file_path = os.path.join(download_dir, filename)
urllib.request.urlretrieve(download_url, download_file_path)
df = pd.read_csv(download_file_path, index_col=[0])

In [204]:
df.head()

Unnamed: 0,car_name,brand,model,vehicle_age,km_driven,seller_type,fuel_type,transmission_type,mileage,engine,max_power,seats,selling_price
0,Maruti Alto,Maruti,Alto,9,120000,Individual,Petrol,Manual,19.7,796,46.3,5,120000
1,Hyundai Grand,Hyundai,Grand,5,20000,Individual,Petrol,Manual,18.9,1197,82.0,5,550000
2,Hyundai i20,Hyundai,i20,11,60000,Individual,Petrol,Manual,17.0,1197,80.0,5,215000
3,Maruti Alto,Maruti,Alto,9,37000,Individual,Petrol,Manual,20.92,998,67.1,5,226000
4,Ford Ecosport,Ford,Ecosport,6,30000,Dealer,Diesel,Manual,22.77,1498,98.59,5,570000


In [205]:
df.info()

<class 'pandas.core.frame.DataFrame'>
Index: 15411 entries, 0 to 19543
Data columns (total 13 columns):
 #   Column             Non-Null Count  Dtype  
---  ------             --------------  -----  
 0   car_name           15411 non-null  object 
 1   brand              15411 non-null  object 
 2   model              15411 non-null  object 
 3   vehicle_age        15411 non-null  int64  
 4   km_driven          15411 non-null  int64  
 5   seller_type        15411 non-null  object 
 6   fuel_type          15411 non-null  object 
 7   transmission_type  15411 non-null  object 
 8   mileage            15411 non-null  float64
 9   engine             15411 non-null  int64  
 10  max_power          15411 non-null  float64
 11  seats              15411 non-null  int64  
 12  selling_price      15411 non-null  int64  
dtypes: float64(2), int64(5), object(6)
memory usage: 1.6+ MB


In [206]:
df.drop('car_name', axis=1,inplace=True)
df.drop('brand', axis=1, inplace=True)

In [207]:
df.head()

Unnamed: 0,model,vehicle_age,km_driven,seller_type,fuel_type,transmission_type,mileage,engine,max_power,seats,selling_price
0,Alto,9,120000,Individual,Petrol,Manual,19.7,796,46.3,5,120000
1,Grand,5,20000,Individual,Petrol,Manual,18.9,1197,82.0,5,550000
2,i20,11,60000,Individual,Petrol,Manual,17.0,1197,80.0,5,215000
3,Alto,9,37000,Individual,Petrol,Manual,20.92,998,67.1,5,226000
4,Ecosport,6,30000,Dealer,Diesel,Manual,22.77,1498,98.59,5,570000


In [208]:
num_features = [feature for feature in df.columns if df[feature].dtype != 'O']
print(f'Num Features / count : {num_features} / {len(num_features)}')
cat_features = [feature for feature in df.columns if df[feature].dtype == 'O']
print(f'Cat Features / count : {cat_features} / {len(cat_features)}')
discrete_feature = [feature for feature in num_features if len(df[feature].unique()) <= 25]
print(f'Discrete Features / count : {discrete_feature} / {len(discrete_feature)}')
cont_feature = [feature for feature in num_features if feature not in discrete_feature]
print(f'Continuous Features / count : {cont_feature} / {len(cont_feature)}')


Num Features / count : ['vehicle_age', 'km_driven', 'mileage', 'engine', 'max_power', 'seats', 'selling_price'] / 7
Cat Features / count : ['model', 'seller_type', 'fuel_type', 'transmission_type'] / 4
Discrete Features / count : ['vehicle_age', 'seats'] / 2
Continuous Features / count : ['km_driven', 'mileage', 'engine', 'max_power', 'selling_price'] / 5


In [209]:
X = df.drop('selling_price', axis=1)
y = df['selling_price']

In [210]:
X.head()

Unnamed: 0,model,vehicle_age,km_driven,seller_type,fuel_type,transmission_type,mileage,engine,max_power,seats
0,Alto,9,120000,Individual,Petrol,Manual,19.7,796,46.3,5
1,Grand,5,20000,Individual,Petrol,Manual,18.9,1197,82.0,5
2,i20,11,60000,Individual,Petrol,Manual,17.0,1197,80.0,5
3,Alto,9,37000,Individual,Petrol,Manual,20.92,998,67.1,5
4,Ecosport,6,30000,Dealer,Diesel,Manual,22.77,1498,98.59,5


In [211]:
from sklearn.preprocessing import LabelEncoder
le = LabelEncoder()
X['model'] = le.fit_transform(X['model'])

In [212]:
num_feature = X.select_dtypes(exclude='object').columns
onehot_columns = ['seller_type', 'fuel_type', 'transmission_type']

from sklearn.preprocessing import StandardScaler, OneHotEncoder
from sklearn.compose import ColumnTransformer

numeric_transformer = StandardScaler()
onehot_transformer = OneHotEncoder(drop='first')

ct = ColumnTransformer(
    [('OneHotEncoder', onehot_transformer, onehot_columns),
     ('StandardScaler', numeric_transformer, num_feature)],
     remainder='passthrough')



In [213]:
X = ct.fit_transform(X)


In [214]:
from sklearn.model_selection import train_test_split
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2,random_state=42)
X_train.shape, X_test.shape

((12328, 14), (3083, 14))

In [215]:
pd.DataFrame(X_train)

Unnamed: 0,0,1,2,3,4,5,6,7,8,9,10,11,12,13
0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,1.261053,0.319814,0.283541,-2.038093,1.753906,2.662498,-0.403022
1,1.0,0.0,0.0,0.0,0.0,1.0,1.0,0.793003,-1.339555,-0.883751,0.992261,-0.550880,-0.386028,-0.403022
2,0.0,0.0,1.0,0.0,0.0,0.0,0.0,-1.244390,-1.339555,-0.961245,-0.168096,0.890331,3.274530,-0.403022
3,0.0,0.0,0.0,0.0,0.0,1.0,1.0,-1.024131,0.319814,0.143045,-0.455788,0.020999,0.388902,-0.403022
4,0.0,0.0,0.0,0.0,0.0,1.0,1.0,1.508844,1.315436,0.478051,0.157955,-0.554718,-0.504712,-0.403022
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
12323,0.0,0.0,1.0,0.0,0.0,0.0,1.0,-0.556082,0.319814,1.397111,0.256249,-0.456846,-0.274327,2.073444
12324,0.0,0.0,0.0,0.0,0.0,1.0,1.0,0.903133,1.647309,0.065551,-0.865749,0.214823,0.060778,-0.403022
12325,1.0,0.0,0.0,0.0,0.0,1.0,1.0,1.040794,0.319814,-0.690016,0.193916,-0.936610,-0.780708,-0.403022
12326,0.0,0.0,0.0,0.0,0.0,1.0,1.0,1.536377,-1.339555,-0.786884,-0.263994,-0.554718,-0.435829,-0.403022


In [216]:
# Model training and Model Selection
from sklearn.ensemble import RandomForestRegressor, AdaBoostRegressor
from sklearn.linear_model import LinearRegression, Lasso, Ridge
from sklearn.neighbors import KNeighborsRegressor
from sklearn.tree import DecisionTreeRegressor
from sklearn.metrics import r2_score, mean_absolute_error, mean_squared_error

In [217]:
#Create Function to evaluate model
def evaluate_model(true, predicted):
    mae = mean_absolute_error(true, predicted)
    mse = mean_squared_error(true, predicted)
    rmse = np.sqrt(mean_squared_error(true, predicted))
    r2_square = r2_score(true, predicted)
    return mae, rmse, r2_square

In [218]:
#Begining Model training
models = {
    'Linear Regression'       : LinearRegression(),
    'Lasso Regression'        : Lasso(),
    'Ridge Regression'        : Ridge(),
    'KNeighbors Regressor'    : KNeighborsRegressor(),
    'Decision Tree Regressor' : DecisionTreeRegressor(),
    'Random Forest Regressor' : RandomForestRegressor(),
    'Adaboost Regressor'      : AdaBoostRegressor()
}

for i in range(len(list(models))):
    model = list(models.values())[i]
    model.fit(X_train, y_train)

    #Make Predictions
    y_train_predict = model.predict(X_train)
    y_test_predict = model.predict(X_test)

    #Evaluate Test and train dataset
    model_train_mae, model_train_rmse, model_train_r2 = evaluate_model(y_train, y_train_predict)
    model_test_mae, model_test_rmse, model_test_r2 = evaluate_model(y_test, y_test_predict)

    print(list(models.keys())[i])
    print('Model performance for Training Set')
    print('Mean Absolute error     :     {:.4f}'.format(model_train_mae))
    print('Root Mean Squared Error :     {:.4f}'.format(model_train_rmse))
    print('R2 Square               :     {:.4f}'.format(model_train_r2))
    print('-'*50)
    print('Model performance for Test Set')
    print('Mean Absolute error     :     {:.4f}'.format(model_test_mae))
    print('Root Mean Squared Error :     {:.4f}'.format(model_test_rmse))
    print('R2 Square               :     {:.4f}'.format(model_test_r2))
    print('='*50)
    print('\n')

Linear Regression
Model performance for Training Set
Mean Absolute error     :     268101.6071
Root Mean Squared Error :     553855.6665
R2 Square               :     0.6218
--------------------------------------------------
Model performance for Test Set
Mean Absolute error     :     279618.5794
Root Mean Squared Error :     502543.5930
R2 Square               :     0.6645


Lasso Regression
Model performance for Training Set
Mean Absolute error     :     268099.2226
Root Mean Squared Error :     553855.6710
R2 Square               :     0.6218
--------------------------------------------------
Model performance for Test Set
Mean Absolute error     :     279614.7461
Root Mean Squared Error :     502542.6696
R2 Square               :     0.6645


Ridge Regression
Model performance for Training Set
Mean Absolute error     :     268059.8015
Root Mean Squared Error :     553856.3160
R2 Square               :     0.6218
--------------------------------------------------
Model performance f

In [219]:
knn_params = {'n_neighbors': [2,3,10,20,40,50]}
rf_params = {"max_depth" : [5,8,None,15,10],
             "min_samples_split" : [2,8,15,20],
             "max_features" : [5,7,8,'Auto'],
             "n_estimators" : [100,200,500,1000]}
ada_params = {"n_estimators" : [60,70,80,90,100],
              'loss' : ['linear', 'square', 'exponential']}

In [220]:
#Model List for Hyperparameter Tuning
randomcv = [('KNN', KNeighborsRegressor(), knn_params),
            ('RF', RandomForestRegressor(), rf_params),
            ('AB', AdaBoostRegressor(), ada_params)]

In [221]:
from sklearn.model_selection import RandomizedSearchCV
model_param = {}
for name, model, params in randomcv:
    random = RandomizedSearchCV(estimator=model,
                                param_distributions=params,
                                n_iter=100,
                                cv=3,
                                verbose=2,
                                n_jobs=-1)
    random.fit(X_train, y_train)
    model_param[name] = random.best_params_

for model_name in model_param:
    print(f"-------------Best params for {model_name}-------------")
    print(model_param[model_name])


Fitting 3 folds for each of 6 candidates, totalling 18 fits
Fitting 3 folds for each of 100 candidates, totalling 300 fits
Fitting 3 folds for each of 15 candidates, totalling 45 fits
-------------Best params for KNN-------------
{'n_neighbors': 10}
-------------Best params for RF-------------
{'n_estimators': 100, 'min_samples_split': 2, 'max_features': 5, 'max_depth': 15}
-------------Best params for AB-------------
{'n_estimators': 60, 'loss': 'linear'}


In [223]:
hyper_models = {
    'KNN' : KNeighborsRegressor(n_neighbors=10, n_jobs=-1),
    'Random Forest Regressor' : RandomForestRegressor(n_estimators=200,
                                                      min_samples_split=2,
                                                      max_features=8,
                                                      max_depth=None),
    'AdaBoost Regressor': AdaBoostRegressor(n_estimators=60, loss='linear')
}

for i in range(len(list(hyper_models))):
    model = list(hyper_models.values())[i]
    model.fit(X_train, y_train)

    #Make Predictions
    y_train_predict = model.predict(X_train)
    y_test_predict = model.predict(X_test)

    #Evaluate Test and train dataset
    model_train_mae, model_train_rmse, model_train_r2 = evaluate_model(y_train, y_train_predict)
    model_test_mae, model_test_rmse, model_test_r2 = evaluate_model(y_test, y_test_predict)

    print(list(hyper_models.keys())[i])
    print('Model performance for Training Set')
    print('Mean Absolute error     :     {:.4f}'.format(model_train_mae))
    print('Root Mean Squared Error :     {:.4f}'.format(model_train_rmse))
    print('R2 Square               :     {:.4f}'.format(model_train_r2))
    print('-'*50)
    print('Model performance for Test Set')
    print('Mean Absolute error     :     {:.4f}'.format(model_test_mae))
    print('Root Mean Squared Error :     {:.4f}'.format(model_test_rmse))
    print('R2 Square               :     {:.4f}'.format(model_test_r2))
    print('='*50)
    print('\n')

KNN
Model performance for Training Set
Mean Absolute error     :     103472.0474
Root Mean Squared Error :     363460.7706
R2 Square               :     0.8371
--------------------------------------------------
Model performance for Test Set
Mean Absolute error     :     117496.2131
Root Mean Squared Error :     263888.0623
R2 Square               :     0.9075


Random Forest Regressor
Model performance for Training Set
Mean Absolute error     :     39139.1211
Root Mean Squared Error :     129976.8761
R2 Square               :     0.9792
--------------------------------------------------
Model performance for Test Set
Mean Absolute error     :     99236.6194
Root Mean Squared Error :     217421.0150
R2 Square               :     0.9372


AdaBoost Regressor
Model performance for Training Set
Mean Absolute error     :     377910.9755
Root Mean Squared Error :     486287.4409
R2 Square               :     0.7084
--------------------------------------------------
Model performance for Test