In [3]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
import warnings
warnings.filterwarnings('ignore')

# Data Cleaning
* Handling Missing values
* Handling Duplicates
* Check for dtypes
* understand dataset

In [None]:
#load dataset
df=pd.read_csv('cardekho_imputated.csv')
df.head()

Unnamed: 0.1,Unnamed: 0,car_name,brand,model,vehicle_age,km_driven,seller_type,fuel_type,transmission_type,mileage,engine,max_power,seats,selling_price
0,0,Maruti Alto,Maruti,Alto,9,120000,Individual,Petrol,Manual,19.7,796,46.3,5,120000
1,1,Hyundai Grand,Hyundai,Grand,5,20000,Individual,Petrol,Manual,18.9,1197,82.0,5,550000
2,2,Hyundai i20,Hyundai,i20,11,60000,Individual,Petrol,Manual,17.0,1197,80.0,5,215000
3,3,Maruti Alto,Maruti,Alto,9,37000,Individual,Petrol,Manual,20.92,998,67.1,5,226000
4,4,Ford Ecosport,Ford,Ecosport,6,30000,Dealer,Diesel,Manual,22.77,1498,98.59,5,570000


In [5]:
#Drop unnecassry columns
df.drop('car_name',axis=1,inplace=True)
df.drop('brand',axis=1,inplace=True)

In [6]:
df.head()

Unnamed: 0.1,Unnamed: 0,model,vehicle_age,km_driven,seller_type,fuel_type,transmission_type,mileage,engine,max_power,seats,selling_price
0,0,Alto,9,120000,Individual,Petrol,Manual,19.7,796,46.3,5,120000
1,1,Grand,5,20000,Individual,Petrol,Manual,18.9,1197,82.0,5,550000
2,2,i20,11,60000,Individual,Petrol,Manual,17.0,1197,80.0,5,215000
3,3,Alto,9,37000,Individual,Petrol,Manual,20.92,998,67.1,5,226000
4,4,Ecosport,6,30000,Dealer,Diesel,Manual,22.77,1498,98.59,5,570000


In [7]:
print('Number of unique values',df['model'].nunique())
print('-------')
print(df['model'].unique())

Number of unique values 120
-------
['Alto' 'Grand' 'i20' 'Ecosport' 'Wagon R' 'i10' 'Venue' 'Swift' 'Verna'
 'Duster' 'Cooper' 'Ciaz' 'C-Class' 'Innova' 'Baleno' 'Swift Dzire'
 'Vento' 'Creta' 'City' 'Bolero' 'Fortuner' 'KWID' 'Amaze' 'Santro'
 'XUV500' 'KUV100' 'Ignis' 'RediGO' 'Scorpio' 'Marazzo' 'Aspire' 'Figo'
 'Vitara' 'Tiago' 'Polo' 'Seltos' 'Celerio' 'GO' '5' 'CR-V' 'Endeavour'
 'KUV' 'Jazz' '3' 'A4' 'Tigor' 'Ertiga' 'Safari' 'Thar' 'Hexa' 'Rover'
 'Eeco' 'A6' 'E-Class' 'Q7' 'Z4' '6' 'XF' 'X5' 'Hector' 'Civic' 'D-Max'
 'Cayenne' 'X1' 'Rapid' 'Freestyle' 'Superb' 'Nexon' 'XUV300' 'Dzire VXI'
 'S90' 'WR-V' 'XL6' 'Triber' 'ES' 'Wrangler' 'Camry' 'Elantra' 'Yaris'
 'GL-Class' '7' 'S-Presso' 'Dzire LXI' 'Aura' 'XC' 'Ghibli' 'Continental'
 'CR' 'Kicks' 'S-Class' 'Tucson' 'Harrier' 'X3' 'Octavia' 'Compass' 'CLS'
 'redi-GO' 'Glanza' 'Macan' 'X4' 'Dzire ZXI' 'XC90' 'F-PACE' 'A8' 'MUX'
 'GTC4Lusso' 'GLS' 'X-Trail' 'XE' 'XC60' 'Panamera' 'Alturas' 'Altroz'
 'NX' 'Carnival' 'C' 'RX' 'Ghost

In [8]:
#Get all different tyopes of feature

num_features=[feature for feature in df.columns if df[feature].dtype!=object]
print('Number of numerical features',len(num_features))


cat_features=[feature for feature in df.columns if df[feature].dtype==object]
print('Number of categorical features',len(cat_features))


dis_features=[feature for feature in num_features if df[feature].nunique() <= 25]
print('Number of discrete features',len(dis_features))

cont_features = [feature for feature in num_features if feature not in dis_features]
print('Number of continuous features', len(cont_features))

Number of numerical features 8
Number of categorical features 4
Number of discrete features 2
Number of continuous features 6


In [9]:
df.isnull().sum() #no missing values

Unnamed: 0           0
model                0
vehicle_age          0
km_driven            0
seller_type          0
fuel_type            0
transmission_type    0
mileage              0
engine               0
max_power            0
seats                0
selling_price        0
dtype: int64

In [10]:
#I/P and D/P features
X=df.drop(['selling_price'],axis=1)
y=df['selling_price']

In [11]:
#Feature encoding and scaling
from sklearn.preprocessing import LabelEncoder

le=LabelEncoder()
X['model']=le.fit_transform(X['model'])

In [12]:
#create a ColumnTransformer with 3 types of transformers
from sklearn.preprocessing import OneHotEncoder
from sklearn.preprocessing import StandardScaler
from sklearn.compose import ColumnTransformer

num_features=X.select_dtypes(exclude=object).columns
oh_column=['seller_type','fuel_type','transmission_type']

numeric_features=StandardScaler()
oh_transformer=OneHotEncoder(drop='first')

preprocessor=ColumnTransformer(
    [
    ('onehotencoder',oh_transformer,oh_column),
    ('standarscaler',numeric_features,num_features)
    ]
    ,remainder='passthrough' #if don gice it erest dataset will be deleted not inclusded thats why passthrough
)

In [13]:
X=preprocessor.fit_transform(X)
pd.DataFrame(X) #done one hot encoding and stnadarscaling 

Unnamed: 0,0,1,2,3,4,5,6,7,8,9,10,11,12,13,14
0,1.0,0.0,0.0,0.0,0.0,1.0,1.0,-1.738694,-1.519714,0.983562,1.247335,-0.000276,-1.324259,-1.263352,-0.403022
1,1.0,0.0,0.0,0.0,0.0,1.0,1.0,-1.738516,-0.225693,-0.343933,-0.690016,-0.192071,-0.554718,-0.432571,-0.403022
2,1.0,0.0,0.0,0.0,0.0,1.0,1.0,-1.738339,1.536377,1.647309,0.084924,-0.647583,-0.554718,-0.479113,-0.403022
3,1.0,0.0,0.0,0.0,0.0,1.0,1.0,-1.738162,-1.519714,0.983562,-0.360667,0.292211,-0.936610,-0.779312,-0.403022
4,0.0,0.0,1.0,0.0,0.0,0.0,1.0,-1.737985,-0.666211,-0.012060,-0.496281,0.735736,0.022918,-0.046502,-0.403022
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
15406,0.0,0.0,0.0,0.0,0.0,1.0,1.0,1.723327,1.508844,0.983562,-0.869744,0.026096,-0.767733,-0.757204,-0.403022
15407,0.0,0.0,0.0,0.0,0.0,1.0,1.0,1.723859,-0.556082,-1.339555,-0.728763,-0.527711,-0.216964,-0.220803,2.073444
15408,0.0,0.0,1.0,0.0,0.0,0.0,1.0,1.724036,0.407551,-0.012060,0.220539,0.344954,0.022918,0.068225,-0.403022
15409,0.0,0.0,1.0,0.0,0.0,0.0,1.0,1.724213,1.426247,-0.343933,72.541850,-0.887326,1.329794,0.917158,2.073444


In [14]:
#train test split
from sklearn.model_selection import train_test_split
X_train,X_test,y_train,y_test=train_test_split(X,y,test_size=0.2,random_state=42)

In [23]:
pd.to_pickle(df,'cleaned_travel.pkl')

In [25]:
pd.to_pickle(X_train, "X_train.pkl")
pd.to_pickle(X_test, "X_test.pkl")
pd.to_pickle(y_train, "y_train.pkl")
pd.to_pickle(y_test, "y_test.pkl")

In [16]:
from sklearn.ensemble import RandomForestRegressor
from sklearn.tree import DecisionTreeRegressor
from sklearn.neighbors import KNeighborsRegressor
from sklearn.linear_model import LinearRegression,Ridge,Lasso
from sklearn.metrics import r2_score,mean_absolute_error,mean_squared_error,accuracy_score,classification_report,roc_auc_score,roc_curve,confusion_matrix,f1_score,precision_score,recall_score

In [17]:
#create a func to evaluate model
def evaluate_model(true,predicted):
    mae=mean_absolute_error(true,predicted)
    mse=mean_squared_error(true,predicted)
    rmse=np.sqrt(mean_squared_error(true,predicted))
    r2__score=r2_score(true,predicted)
    return mae,mse,rmse,r2__score

In [18]:
#Beginning model training
models={
    'RandomForestRegressor':RandomForestRegressor(),
    'DecisionTreeRegressor':DecisionTreeRegressor(),
    'LinearRegression':LinearRegression(),
    'Ridge':Ridge(),
    'Lasso':Lasso(),
    'KNeighborsRegressor':KNeighborsRegressor()
}

for i in range(len(list(models))): #converted to list
    model=list(models.values())[i] #iterating through model names
    model.fit(X_train,y_train)

    y_train_pred=model.predict(X_train)
    y_test_pred=model.predict(X_test)

    # Training metrics
    model_train_mae=mean_absolute_error(y_train,y_train_pred)
    model_train_rmse=np.sqrt(mean_squared_error(y_train,y_train_pred))
    model_train_r2=r2_score(y_train,y_train_pred)

    # Testing metrics
    model_test_mae=mean_absolute_error(y_test,y_test_pred)
    model_test_rmse=np.sqrt(mean_squared_error(y_test,y_test_pred))
    model_test_r2=r2_score(y_test,y_test_pred)

    print(list(models.keys())[i]) #printing the model name with metrics
    
    print('Model program for Training Set')
    print('MAE:{:.4f}'.format(model_train_mae))
    print('RMSE:{:.4f}'.format(model_train_rmse))
    print('R2:{:.4f}'.format(model_train_r2))
    
    print('-----------------------')
    print('Model program for Testing Set')
    print('MAE:{:.4f}'.format(model_test_mae))
    print('RMSE:{:.4f}'.format(model_test_rmse))
    print('R2:{:.4f}'.format(model_test_r2))
    print('='*35)
    print('\n')

RandomForestRegressor
Model program for Training Set
MAE:36204.9116
RMSE:131402.7114
R2:0.9787
-----------------------
Model program for Testing Set
MAE:96841.2277
RMSE:229490.2381
R2:0.9300


DecisionTreeRegressor
Model program for Training Set
MAE:0.0000
RMSE:0.0000
R2:1.0000
-----------------------
Model program for Testing Set
MAE:129923.7350
RMSE:324891.1693
R2:0.8598


LinearRegression
Model program for Training Set
MAE:268104.1303
RMSE:553850.0494
R2:0.6218
-----------------------
Model program for Testing Set
MAE:279686.6479
RMSE:502582.0834
R2:0.6645


Ridge
Model program for Training Set
MAE:268061.4421
RMSE:553850.6941
R2:0.6218
-----------------------
Model program for Testing Set
MAE:279625.1576
RMSE:502572.3576
R2:0.6645


Lasso
Model program for Training Set
MAE:268101.7491
RMSE:553850.0538
R2:0.6218
-----------------------
Model program for Testing Set
MAE:279682.7929
RMSE:502581.1494
R2:0.6645


KNeighborsRegressor
Model program for Training Set
MAE:96905.2401
RMSE:335

In [19]:
#initialize few paramters for Hyperparmater tuning (knn and rf becoaz they performed well)
knn_params = {
    'n_neighbors': [2, 3, 10, 20, 40, 50],
    'weights': ['uniform', 'distance'],
    'p': [1, 2]  # 1=Manhattan, 2=Euclidean
}

rf_params = {
    'n_estimators': [100, 200, 500],
    'max_depth': [None, 5, 10, 20],
    'min_samples_split': [2, 5, 10],
    'min_samples_leaf': [1, 2, 4],
    'max_features': ['auto', 'sqrt', 'log2']
}

In [26]:
#randomized search cv
randomcv_models=[
    ('KNN',KNeighborsRegressor(),knn_params),
    ('RF',RandomForestRegressor(),rf_params)
]

from sklearn.model_selection import RandomizedSearchCV
model_params={}

for name,model,param in randomcv_models:
    random=RandomizedSearchCV(
        estimator=model,
        param_distributions=param,
        n_iter=100,
        cv=3,
        verbose=2,
        n_jobs=-1
    )
    random.fit(X_train,y_train)
    model_params[name]=random.best_params_

for model_name in model_params:
    print(f'-----Best params for {model_name}-----')
    print(model_params[model_name])

Fitting 3 folds for each of 24 candidates, totalling 72 fits
[CV] END ................n_neighbors=2, p=2, weights=uniform; total time=   0.9s
[CV] END ................n_neighbors=2, p=2, weights=uniform; total time=   1.0s
[CV] END ...............n_neighbors=2, p=1, weights=distance; total time=   1.6s
[CV] END ...............n_neighbors=2, p=1, weights=distance; total time=   1.1s
[CV] END ................n_neighbors=3, p=1, weights=uniform; total time=   1.3s
[CV] END ................n_neighbors=2, p=2, weights=uniform; total time=   0.9s
[CV] END ...............n_neighbors=2, p=1, weights=distance; total time=   1.8s
[CV] END ...............n_neighbors=2, p=2, weights=distance; total time=   0.9s
[CV] END ................n_neighbors=3, p=1, weights=uniform; total time=   1.8s
[CV] END ................n_neighbors=2, p=1, weights=uniform; total time=   1.3s
[CV] END ................n_neighbors=2, p=1, weights=uniform; total time=   1.2s
[CV] END ................n_neighbors=3, p=1, wei

KeyboardInterrupt: 

In [None]:
# trying with best params we got from randomizedsearchcv
models={
    'RandomForestRegressor':RandomForestRegressor(
        n_estimators=100,
        min_samples_split=2,
        min_samples_leaf=1,
        max_features='log2',
        max_depth=None
    ),
    'KNeighborsRegressor':KNeighborsRegressor(
        weights='distance',
        p=1,
        n_neighbors=3
    )
}

for i in range(len(list(models))): #converted to list
    model=list(models.values())[i] #iterating through model names
    model.fit(X_train,y_train)

    y_train_pred=model.predict(X_train)
    y_test_pred=model.predict(X_test)

    # Training metrics
    model_train_mae=mean_absolute_error(y_train,y_train_pred)
    model_train_rmse=np.sqrt(mean_squared_error(y_train,y_train_pred))
    model_train_r2=r2_score(y_train,y_train_pred)

    # Testing metrics
    model_test_mae=mean_absolute_error(y_test,y_test_pred)
    model_test_rmse=np.sqrt(mean_squared_error(y_test,y_test_pred))
    model_test_r2=r2_score(y_test,y_test_pred)

    print(list(models.keys())[i]) #printing the model name with metrics
    
    print('Model program for Training Set')
    print('MAE:{:.4f}'.format(model_train_mae))
    print('RMSE:{:.4f}'.format(model_train_rmse))
    print('R2:{:.4f}'.format(model_train_r2))
    
    print('-----------------------')
    print('Model program for Testing Set')
    print('MAE:{:.4f}'.format(model_test_mae))
    print('RMSE:{:.4f}'.format(model_test_rmse))
    print('R2:{:.4f}'.format(model_test_r2))
    print('='*35)
    print('\n')

RandomForestRegressor
Model program for Training Set
MAE:36879.5552
RMSE:130447.1356
R2:0.9790
-----------------------
Model program for Testing Set
MAE:97068.9753
RMSE:215367.8819
R2:0.9384


KNeighborsRegressor
Model program for Training Set
MAE:0.0000
RMSE:0.0000
R2:1.0000
-----------------------
Model program for Testing Set
MAE:114020.5144
RMSE:323224.1238
R2:0.8612


