In [1]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt

In [2]:
df = pd.read_csv('cardekho_dataset.csv')
df.drop(['Unnamed: 0', 'car_name', 'brand'], axis=1, inplace=True)

In [3]:
df

Unnamed: 0,model,vehicle_age,km_driven,seller_type,fuel_type,transmission_type,mileage,engine,max_power,seats,selling_price
0,Alto,9,120000,Individual,Petrol,Manual,19.70,796,46.30,5,120000
1,Grand,5,20000,Individual,Petrol,Manual,18.90,1197,82.00,5,550000
2,i20,11,60000,Individual,Petrol,Manual,17.00,1197,80.00,5,215000
3,Alto,9,37000,Individual,Petrol,Manual,20.92,998,67.10,5,226000
4,Ecosport,6,30000,Dealer,Diesel,Manual,22.77,1498,98.59,5,570000
...,...,...,...,...,...,...,...,...,...,...,...
15406,i10,9,10723,Dealer,Petrol,Manual,19.81,1086,68.05,5,250000
15407,Ertiga,2,18000,Dealer,Petrol,Manual,17.50,1373,91.10,7,925000
15408,Rapid,6,67000,Dealer,Diesel,Manual,21.14,1498,103.52,5,425000
15409,XUV500,5,3800000,Dealer,Diesel,Manual,16.00,2179,140.00,7,1225000


In [4]:
df.isnull().sum()

model                0
vehicle_age          0
km_driven            0
seller_type          0
fuel_type            0
transmission_type    0
mileage              0
engine               0
max_power            0
seats                0
selling_price        0
dtype: int64

In [5]:
df['model'].unique()

array(['Alto', 'Grand', 'i20', 'Ecosport', 'Wagon R', 'i10', 'Venue',
       'Swift', 'Verna', 'Duster', 'Cooper', 'Ciaz', 'C-Class', 'Innova',
       'Baleno', 'Swift Dzire', 'Vento', 'Creta', 'City', 'Bolero',
       'Fortuner', 'KWID', 'Amaze', 'Santro', 'XUV500', 'KUV100', 'Ignis',
       'RediGO', 'Scorpio', 'Marazzo', 'Aspire', 'Figo', 'Vitara',
       'Tiago', 'Polo', 'Seltos', 'Celerio', 'GO', '5', 'CR-V',
       'Endeavour', 'KUV', 'Jazz', '3', 'A4', 'Tigor', 'Ertiga', 'Safari',
       'Thar', 'Hexa', 'Rover', 'Eeco', 'A6', 'E-Class', 'Q7', 'Z4', '6',
       'XF', 'X5', 'Hector', 'Civic', 'D-Max', 'Cayenne', 'X1', 'Rapid',
       'Freestyle', 'Superb', 'Nexon', 'XUV300', 'Dzire VXI', 'S90',
       'WR-V', 'XL6', 'Triber', 'ES', 'Wrangler', 'Camry', 'Elantra',
       'Yaris', 'GL-Class', '7', 'S-Presso', 'Dzire LXI', 'Aura', 'XC',
       'Ghibli', 'Continental', 'CR', 'Kicks', 'S-Class', 'Tucson',
       'Harrier', 'X3', 'Octavia', 'Compass', 'CLS', 'redi-GO', 'Glanza',
       

In [6]:
df['model'].dtype

dtype('O')

### Getting all types of features

In [7]:
# Numerical data
numerical_data = [feature for feature in df.columns if df[feature].dtype != 'O']

# categorical data
cat_data = [feature for feature in df.columns if df[feature].dtype == 'O']

# discrete data
dis_data = [feature for feature in numerical_data if len(df[feature].unique()) <= 25]

# continues data
continuous_data = [feature for feature in numerical_data if feature not in dis_data]

In [8]:
from sklearn.model_selection import train_test_split

x = df.drop('selling_price', axis=1)
y = df['selling_price']

In [9]:
x

Unnamed: 0,model,vehicle_age,km_driven,seller_type,fuel_type,transmission_type,mileage,engine,max_power,seats
0,Alto,9,120000,Individual,Petrol,Manual,19.70,796,46.30,5
1,Grand,5,20000,Individual,Petrol,Manual,18.90,1197,82.00,5
2,i20,11,60000,Individual,Petrol,Manual,17.00,1197,80.00,5
3,Alto,9,37000,Individual,Petrol,Manual,20.92,998,67.10,5
4,Ecosport,6,30000,Dealer,Diesel,Manual,22.77,1498,98.59,5
...,...,...,...,...,...,...,...,...,...,...
15406,i10,9,10723,Dealer,Petrol,Manual,19.81,1086,68.05,5
15407,Ertiga,2,18000,Dealer,Petrol,Manual,17.50,1373,91.10,7
15408,Rapid,6,67000,Dealer,Diesel,Manual,21.14,1498,103.52,5
15409,XUV500,5,3800000,Dealer,Diesel,Manual,16.00,2179,140.00,7


In [10]:
y

0         120000
1         550000
2         215000
3         226000
4         570000
          ...   
15406     250000
15407     925000
15408     425000
15409    1225000
15410    1200000
Name: selling_price, Length: 15411, dtype: int64

### Feature Encoding and Scaling

In [11]:
df.head()

Unnamed: 0,model,vehicle_age,km_driven,seller_type,fuel_type,transmission_type,mileage,engine,max_power,seats,selling_price
0,Alto,9,120000,Individual,Petrol,Manual,19.7,796,46.3,5,120000
1,Grand,5,20000,Individual,Petrol,Manual,18.9,1197,82.0,5,550000
2,i20,11,60000,Individual,Petrol,Manual,17.0,1197,80.0,5,215000
3,Alto,9,37000,Individual,Petrol,Manual,20.92,998,67.1,5,226000
4,Ecosport,6,30000,Dealer,Diesel,Manual,22.77,1498,98.59,5,570000


In [12]:
len(df['model'].unique())

120

In [13]:
from sklearn.preprocessing import LabelEncoder

In [14]:
le = LabelEncoder()

In [15]:
x['model'] = le.fit_transform(x['model'])

In [16]:
(x['model'].unique())

array([  7,  54, 118,  38, 100, 117,  96,  88,  97,  32,  29,  24,  16,
        60,  13,  89,  95,  30,  25,  14,  45,  64,  10,  84, 114,  63,
        59,  78,  85,  68,  11,  44,  98,  91,  73,  86,  23,  49,   1,
        19,  41,  62,  61,   0,   4,  92,  42,  83,  90,  58,  79,  39,
         5,  36,  74, 116,   2, 111, 106,  57,  26,  31,  22, 103,  77,
        46,  87,  70, 113,  34,  82,  99, 112,  93,  37, 101,  20,  40,
       115,  47,   3,  81,  33,  12, 107,  51,  28,  18,  65,  80,  94,
        56, 104,  71,  27,  17, 119,  53,  67, 105,  35, 109,  43,   6,
        66,  50,  48, 102, 110, 108,  72,   9,   8,  69,  21,  15,  76,
        52,  75,  55])

In [17]:
le.classes_ # no of categories

array(['3', '5', '6', '7', 'A4', 'A6', 'A8', 'Alto', 'Altroz', 'Alturas',
       'Amaze', 'Aspire', 'Aura', 'Baleno', 'Bolero', 'C', 'C-Class',
       'CLS', 'CR', 'CR-V', 'Camry', 'Carnival', 'Cayenne', 'Celerio',
       'Ciaz', 'City', 'Civic', 'Compass', 'Continental', 'Cooper',
       'Creta', 'D-Max', 'Duster', 'Dzire LXI', 'Dzire VXI', 'Dzire ZXI',
       'E-Class', 'ES', 'Ecosport', 'Eeco', 'Elantra', 'Endeavour',
       'Ertiga', 'F-PACE', 'Figo', 'Fortuner', 'Freestyle', 'GL-Class',
       'GLS', 'GO', 'GTC4Lusso', 'Ghibli', 'Ghost', 'Glanza', 'Grand',
       'Gurkha', 'Harrier', 'Hector', 'Hexa', 'Ignis', 'Innova', 'Jazz',
       'KUV', 'KUV100', 'KWID', 'Kicks', 'MUX', 'Macan', 'Marazzo', 'NX',
       'Nexon', 'Octavia', 'Panamera', 'Polo', 'Q7', 'Quattroporte', 'RX',
       'Rapid', 'RediGO', 'Rover', 'S-Class', 'S-Presso', 'S90', 'Safari',
       'Santro', 'Scorpio', 'Seltos', 'Superb', 'Swift', 'Swift Dzire',
       'Thar', 'Tiago', 'Tigor', 'Triber', 'Tucson', 'Vento', '

In [18]:
print(len(df['seller_type'].unique()))
print(len(df['fuel_type'].unique()))
print(len(df['transmission_type'].unique()))

3
5
2


In [19]:
# One hote encoding on seller_type, fuel_type and transmission_type as they have less number of categories

In [20]:
# Creating Column tranformer

In [21]:
from sklearn.preprocessing import OneHotEncoder, StandardScaler
from sklearn.compose import ColumnTransformer

num_features = x.select_dtypes(exclude='object').columns
onehot_columns = ['seller_type', 'fuel_type', 'transmission_type']

scaler = StandardScaler()
oh_transformer = OneHotEncoder(drop='first')

preprocessor = ColumnTransformer(
    [
        ('OneHotTransformer', oh_transformer, onehot_columns),
        ('Standard Scalar', scaler, num_features)
    ], remainder='passthrough'
)

In [22]:
x = preprocessor.fit_transform(x)

In [23]:
x_train, x_test, y_train, y_test = train_test_split(x, y, test_size=0.25, random_state=42)

In [24]:
x_train

array([[ 0.        ,  0.        ,  0.        , ..., -0.76773286,
        -0.89566754, -0.40302241],
       [ 0.        ,  0.        ,  1.        , ...,  0.92487372,
         0.9404295 , -0.40302241],
       [ 0.        ,  0.        ,  0.        , ..., -0.55087963,
        -0.61874036, -0.40302241],
       ...,
       [ 1.        ,  0.        ,  0.        , ..., -0.9366097 ,
        -0.78070786, -0.40302241],
       [ 0.        ,  0.        ,  0.        , ..., -0.55471774,
        -0.43582879, -0.40302241],
       [ 1.        ,  0.        ,  0.        , ..., -0.04616815,
         0.06194201, -0.40302241]], shape=(11558, 14))

In [25]:
from sklearn.ensemble import RandomForestRegressor
from sklearn.linear_model import Ridge, Lasso, LinearRegression
from sklearn.neighbors import KNeighborsRegressor
from sklearn.tree import DecisionTreeRegressor
from sklearn.metrics import r2_score, mean_absolute_error, mean_squared_error

In [35]:
def evaluate_model(true, predicted):
    mae = mean_absolute_error(true, predicted)
    mse = mean_squared_error(true, predicted)
    rmse = np.sqrt(mean_squared_error(true, predicted))
    r2 = r2_score(true, predicted)

    return mae, mse, rmse, r2

### Model Training

In [39]:
models = {
    'Linear Regression' : LinearRegression(),
    'Ridge' : Ridge(),
    'Lasso' : Lasso(),
    'KNN' : KNeighborsRegressor(),
    'Decision' : DecisionTreeRegressor(),
    'rf' : RandomForestRegressor()
}

for i in range(len(list(models))):
    model = list(models.values())[i]

    model.fit(x_train, y_train)

    # Prediction for train and test
    y_train_pred = model.predict(x_train)
    y_test_pred  = model.predict(x_test)

    # Evaluate the model 
    mea_train, mse_train, rmse_train, r2_score_train = evaluate_model(y_train, y_train_pred)
    mea_test, mse_test, rmse_test, r2_score_test = evaluate_model(y_test, y_test_pred)

    # final output :
    
    print()
    print(list(models.keys())[i])
    print()
    print('------------training set---------------')
    print("MEA Train: ", mea_train)
    print("MSE Train: ", mse_train)
    print("RMSE Train: ", rmse_train)
    print("r2_score Train: ", r2_score_train)

    
    print('------------test set---------------')
    print("MEA Test: ", mea_test)
    print("MSE Test: ", mse_test)
    print("RMSE Test: ", rmse_test)
    print("r2_score Test: ", r2_score_test)


Linear Regression

------------training set---------------
MEA Train:  266675.1075542505
MSE Train:  304874315292.8461
RMSE Train:  552154.249547032
r2_score Train:  0.6219860307551311
------------test set---------------
MEA Test:  284283.44595338294
MSE Test:  270286925822.75293
RMSE Test:  519891.26346069033
r2_score Test:  0.6524693637784766

Ridge

------------training set---------------
MEA Train:  266635.36623235425
MSE Train:  304875008766.8082
RMSE Train:  552154.877517901
r2_score Train:  0.6219851709160702
------------test set---------------
MEA Test:  284241.11291362543
MSE Test:  270275613895.96542
RMSE Test:  519880.38421926
r2_score Test:  0.6524839084742713

Lasso

------------training set---------------
MEA Train:  266674.04718300886
MSE Train:  304874327628.1807
RMSE Train:  552154.2607172208
r2_score Train:  0.6219860154605374
------------test set---------------
MEA Test:  284283.7889963272
MSE Test:  270286207881.65326
RMSE Test:  519890.5729878676
r2_score Test:  0

In [None]:
list(models.values())

In [None]:
x_train

In [None]:
y_train

In [38]:
# Hyperparameter for two best models : knn and rf

In [40]:
knn_params = {
    'n_neighbors' : [2, 3, 10, 20, 40, 50]
}
rf_params = {
    'max_depth' : [5, 8, 15, None, 10],
    'max_features' : [5, 7, 'auto', 8],
    'min_samples_split' : [2, 8, 15, 20],
    'n_estimators' : [100, 200, 500, 1000]
}

In [44]:
randomcv_models = [
    ('knn', KNeighborsRegressor(), knn_params),
    ('rf', RandomForestRegressor(), rf_params)
]

In [42]:
from sklearn.model_selection import RandomizedSearchCV

In [45]:
model_params = {}

for name, model, param in randomcv_models:
    random = RandomizedSearchCV(estimator=model, param_distributions=param, n_iter=100, cv=3, verbose=2, n_jobs=-1)
    random.fit(x_train, y_train)
    model_params[model] = random.best_params_

for model_name in model_params:
    print(model_params[model_name])



Fitting 3 folds for each of 6 candidates, totalling 18 fits
Fitting 3 folds for each of 100 candidates, totalling 300 fits


60 fits failed out of a total of 300.
The score on these train-test partitions for these parameters will be set to nan.
If these failures are not expected, you can try to debug them by setting error_score='raise'.

Below are more details about the failures:
--------------------------------------------------------------------------------
30 fits failed with the following error:
Traceback (most recent call last):
  File "H:\Python-all-in-one\ML-algorithams\myenv\Lib\site-packages\sklearn\model_selection\_validation.py", line 866, in _fit_and_score
    estimator.fit(X_train, y_train, **fit_params)
  File "H:\Python-all-in-one\ML-algorithams\myenv\Lib\site-packages\sklearn\base.py", line 1382, in wrapper
    estimator._validate_params()
  File "H:\Python-all-in-one\ML-algorithams\myenv\Lib\site-packages\sklearn\base.py", line 436, in _validate_params
    validate_parameter_constraints(
  File "H:\Python-all-in-one\ML-algorithams\myenv\Lib\site-packages\sklearn\utils\_param_validation.py", 

{'n_neighbors': 10}
{'n_estimators': 200, 'min_samples_split': 2, 'max_features': 7, 'max_depth': 10}


In [48]:
models = {
    'rf' : RandomForestRegressor(n_estimators= 200, min_samples_split=2, max_features=7, max_depth=10),
    'knn' : KNeighborsRegressor(n_neighbors=10)
}

for i in range(len(list(models))):
    model = list(models.values())[i]
    model.fit(x_train, y_train)

    y_train_pred = model.predict(x_train)
    y_test_pred = model.predict(x_test)

    mea_train, mse_train, rmse_train, r2_score_train = evaluate_model(y_train, y_train_pred)
    mea_test, mse_test, rmse_test, r2_score_test = evaluate_model(y_test, y_test_pred)

    # final output :
    
    print()
    print(list(models.keys())[i])
    print()
    print('------------training set---------------')
    print("MEA Train: ", mea_train)
    print("MSE Train: ", mse_train)
    print("RMSE Train: ", rmse_train)
    print("r2_score Train: ", r2_score_train)

    
    print('------------test set---------------')
    print("MEA Test: ", mea_test)
    print("MSE Test: ", mse_test)
    print("RMSE Test: ", rmse_test)
    print("r2_score Test: ", r2_score_test)


rf

------------training set---------------
MEA Train:  84259.78993036623
MSE Train:  29261621159.747593
RMSE Train:  171060.28516212522
r2_score Train:  0.9637184865818856
------------test set---------------
MEA Test:  108936.29935660063
MSE Test:  57351116853.02524
RMSE Test:  239480.93212827036
r2_score Test:  0.926258844865411

knn

------------training set---------------
MEA Train:  102914.24987021976
MSE Train:  133640227644.05606
RMSE Train:  365568.362476919
r2_score Train:  0.8342993477361558
------------test set---------------
MEA Test:  121765.48793148196
MSE Test:  87994611511.97119
RMSE Test:  296638.85705006885
r2_score Test:  0.8868579261474333
