In [2]:
import numpy as np
import pandas as pd
from sklearn.preprocessing import OneHotEncoder, StandardScaler
from sklearn.compose import ColumnTransformer
from sklearn.model_selection import train_test_split

In [3]:
df = pd.read_csv('../data/cleaned_car_data.csv')
df.head()

Unnamed: 0,Car Brand,Model,Year,Mileage,Fuel Type,Engine Size,Transmission,Body Type,Color,Owner History,Price,Age
0,nissan,model d,2006,244586,diesel,1.1,automatic,coupe,red,third owner,6501.73,17
1,honda,model d,2006,89556,electric,4.4,automatic,coupe,gray,second owner,15860.51,17
2,ford,model e,2007,258273,hybrid,1.4,manual,suv,black,first owner,8136.09,16
3,kia,model e,2023,217592,electric,3.7,automatic,coupe,red,second owner,18556.4,0
4,chevrolet,model c,2002,256919,hybrid,2.8,manual,suv,blue,third owner,7486.0,21


In [4]:
df.isnull().sum()

Car Brand        0
Model            0
Year             0
Mileage          0
Fuel Type        0
Engine Size      0
Transmission     0
Body Type        0
Color            0
Owner History    0
Price            0
Age              0
dtype: int64

In [5]:
df.dtypes

Car Brand         object
Model             object
Year               int64
Mileage            int64
Fuel Type         object
Engine Size      float64
Transmission      object
Body Type         object
Color             object
Owner History     object
Price            float64
Age                int64
dtype: object

In [6]:
X = df.drop('Price',axis=1)
y = df['Price']

In [7]:
num_features = X.select_dtypes(exclude='object').columns
cat_features = ['Car Brand','Model', 'Fuel Type', 'Transmission','Body Type', 'Color', 'Owner History']

In [8]:
preprocesser = ColumnTransformer([
    ('OneHotEncoder', OneHotEncoder(), cat_features),
    ('MinMaxScaler', StandardScaler(), num_features)
])
X_preprocessed = preprocesser.fit_transform(X)
X_preprocessed.shape

(2000, 39)

In [57]:
X_train,X_test,y_train,y_test = train_test_split(X_preprocessed,y, test_size=0.2, random_state=42)
scores = []

### Linear Regresssion

In [58]:
from sklearn.linear_model import LinearRegression
from sklearn.metrics import mean_squared_error,r2_score
from sklearn.model_selection import cross_val_score

linear_model = LinearRegression()
linear_model.fit(X_train, y_train)

y_pred_train = linear_model.predict(X_train)
y_pred_test = linear_model.predict(X_test)

mse = mean_squared_error(y_test, y_pred_test)
r2_train = r2_score(y_train,y_pred_train)
r2_test = r2_score(y_test,y_pred_test)
cv = cross_val_score(linear_model, X_train,y_train,cv=5, scoring='r2')
rmse = np.sqrt(mse)

print("RMSE:", rmse)
print("r2_score_train:", r2_train)
print("r2_socre_test:",r2_test)
print("Cross validation score:", cv.mean())

scores.append({'Linear Regression': {'R2_Score_train': r2_train, 'R2_Score_test':r2_test,'RMSE':rmse,'Cross Validaiton Score': cv.mean()}})

RMSE: 1394.8697752480844
r2_score_train: 0.9203248984204583
r2_socre_test: 0.8934441910972903
Cross validation score: 0.9152470700075016


### Lasso Regresssion

In [59]:
from sklearn.linear_model import Lasso
from sklearn.model_selection import GridSearchCV

estimator = Lasso()
paramgrid = {'alpha': list(range(1,100))}
grid_search_lasso = GridSearchCV(estimator,paramgrid,cv=5,scoring='r2')
grid_search_lasso.fit(X_train,y_train)
lasso_model = grid_search_lasso.best_estimator_

y_pred_train = lasso_model.predict(X_train)
y_pred_test = lasso_model.predict(X_test)

mse = mean_squared_error(y_test,y_pred_test)
r2_train = r2_score(y_train,y_pred_train)
r2_test = r2_score(y_test,y_pred_test)
cv = cross_val_score(lasso_model, X_train,y_train,cv=5,scoring='r2')
rmse = np.sqrt(mse)

print("Best params for Lasso Regresssion:", grid_search_lasso.best_params_)
print("RMSE:", rmse)
print("r2_score_train:",r2_train)
print("r2_score_test:", r2_test)
print("Cross validation score:", cv.mean())

scores.append({'Lasso Regression': {'R2_Score_train':r2_train,'R2_Score_test':r2_test,'RMSE':rmse,'Cross Validaiton Score': cv.mean()}})


Best params for Lasso Regresssion: {'alpha': 16}
RMSE: 1399.0829382437232
r2_score_train: 0.9189628656587738
r2_score_test: 0.8927995216098806
Cross validation score: 0.9163771526227427


### Ridge Regression

In [60]:
from sklearn.linear_model import Ridge

estimator = Ridge()
param_grid = {'alpha': list(range(1,100))}
gird_search_ridge = GridSearchCV(estimator,param_grid, cv=5, scoring='r2')
gird_search_ridge.fit(X_train,y_train)
ridge_model = gird_search_ridge.best_estimator_

y_pred_train = ridge_model.predict(X_train)
y_pred_test = ridge_model.predict(X_test)

mse =mean_squared_error(y_test,y_pred_test)
r2_train = r2_score(y_train,y_pred_train)
r2_test = r2_score(y_test,y_pred_test)
cv = cross_val_score(ridge_model,X_train,y_train,cv=5, scoring='r2')
rmse = np.sqrt(mse)

print("Best params for Ridge Regression:", gird_search_ridge.best_params_)
print("RMSE:", rmse)
print("r2_score_train:", r2_train)
print("r2_score_test:", r2_test)
print("Cross validation score:", cv.mean())

scores.append({'Ridge Regression': {'R2_Score_train': r2_train,'R2_Score_test': r2_test,'RMSE': rmse,'Cross Validaiton Score': cv.mean()}})

Best params for Ridge Regression: {'alpha': 1}
RMSE: 1394.3506365953292
r2_score_train: 0.9203172219675557
r2_score_test: 0.8935234916114349
Cross validation score: 0.9152529429786552


### KNeighbors Regressor

In [64]:
from sklearn.neighbors import KNeighborsRegressor

estimator = KNeighborsRegressor()
param_grid = {'n_neighbors': list(range(1,100))}
grid_search_knn = GridSearchCV(estimator,param_grid, cv=5,scoring='r2')
grid_search_knn.fit(X_train,y_train)
knn_model = grid_search_knn.best_estimator_

y_pred_train = knn_model.predict(X_train)
y_pred_test = knn_model.predict(X_test)

mse = mean_squared_error(y_test,y_pred_test)
r2_train = r2_score(y_train,y_pred_train)
r2_test = r2_score(y_test,y_pred_test)
cv = cross_val_score(knn_model,X_train,y_train,cv=5,scoring='r2')
rmse = np.sqrt(mse)

print("Best params for KNN Regressor:", grid_search_knn.best_params_)
print("RMSE:", rmse)
print("r2_score_train:", r2_train)
print("r2_score_test:", r2_test)
print("Cross validation score:", cv.mean())

scores.append({'KNeighbors Regressor': {'R2_Score_train': r2_train,'R2_Score_test': r2_test,'RMSE':rmse,'Cross Validaiton Score': cv.mean()}})

Best params for KNN Regressor: {'n_neighbors': 11}
RMSE: 2190.817203978746
r2_score_train: 0.8004902440366325
r2_score_test: 0.7371414620573089
Cross validation score: 0.7450997473965313


### Decision Regression

In [65]:
from sklearn.tree import DecisionTreeRegressor

estimator = DecisionTreeRegressor()
param_grid = {'max_depth': list(range(1,10)), 'criterion': ['squared_error','absolute_error']}
grid_search_decision = GridSearchCV(estimator,param_grid,cv=5,scoring='r2')
grid_search_decision.fit(X_train,y_train)
decisiontree_model = grid_search_decision.best_estimator_

y_pred_train = decisiontree_model.predict(X_train)
y_pred_test = decisiontree_model.predict(X_test)

mse = mean_squared_error(y_test,y_pred_test)
r2_train = r2_score(y_train,y_pred_train)
r2_test = r2_score(y_test,y_pred_test)
cv = cross_val_score(decisiontree_model, X_train,y_train,cv=5, scoring='r2')
rmse = np.sqrt(mse)

print("Best param for DecisionTree Regressor:", grid_search_decision.best_params_)
print("RMSE:", rmse)
print("r2_score_train:", r2_train)
print("r2_score_test:", r2_test)
print("Cross validation score:", cv.mean())

scores.append({'KNeighbors Regressor': {'R2_Score_train': r2_train,'R2_Score_test': r2_test,'RMSE':rmse,'Cross Validaiton Score': cv.mean()}})

Best param for DecisionTree Regressor: {'criterion': 'squared_error', 'max_depth': 7}
RMSE: 1487.3142343327984
r2_score_train: 0.9428259991754261
r2_score_test: 0.8788522726892909
Cross validation score: 0.9009731573566582


### Random Forest Regressor

In [66]:
from sklearn.ensemble import RandomForestRegressor

estimator = RandomForestRegressor()
param_grid = {'n_estimators': list(range(1,10)), 'max_depth': list(range(1,10)), 'criterion': ['squared_error', 'absolute_error']}
grid_search_randomforest = GridSearchCV(estimator,param_grid,cv=5, scoring='r2')
grid_search_randomforest.fit(X_train,y_train)
randomforest_model = grid_search_randomforest.best_estimator_

y_pred_train = randomforest_model.predict(X_train)
y_pred_test = randomforest_model.predict(X_test)

mse = mean_squared_error(y_test, y_pred_test)
r2_train = r2_score(y_train, y_pred_train)
r2_test = r2_score(y_test, y_pred_test)
cv = cross_val_score(randomforest_model, X_train, y_train, cv=5, scoring='r2')
rmse = np.sqrt(mse)

print("Best params for RandomForest Regressor:", grid_search_randomforest.best_params_)
print("RMSE:", rmse)
print("r2_score_train:", r2_train)
print("r2_score_test:", r2_test)
print("Cross validation score:", cv.mean())

scores.append({'RandomForest Regresssor': {'R2_Score_train': r2_train, 'R2_Score_test': r2_test, 'RMSE': rmse,'Cross Validaiton Score': cv.mean()}})

Best params for RandomForest Regressor: {'criterion': 'squared_error', 'max_depth': 8, 'n_estimators': 9}
RMSE: 1284.845893235198
r2_score_train: 0.9572000144929567
r2_score_test: 0.9095909545304757
Cross validation score: 0.9214809498345845


### AdaBoost Regressor

In [67]:
from sklearn.ensemble import AdaBoostRegressor

estimator = AdaBoostRegressor()
param_grid = {'n_estimators': list(range(1,10)), 'learning_rate': [0.1,0.5,1.0]}
grid_search_adaboost = GridSearchCV(estimator,param_grid,cv=5, scoring='r2')
grid_search_adaboost.fit(X_train,y_train)
adaboost_model = grid_search_adaboost.best_estimator_

y_pred_train = adaboost_model.predict(X_train)
y_pred_test = adaboost_model.predict(X_test)

mse = mean_squared_error(y_test,y_pred_test)
r2_train = r2_score(y_train, y_pred_train)
r2_test = r2_score(y_test, y_pred_test)
cv = cross_val_score(adaboost_model, X_train, y_train, cv=5, scoring='r2')
rmse = np.sqrt(mse)

print("Best params for AdaBoost Regressor:", grid_search_adaboost.best_params_)
print("RMSE:", rmse)
print("r2_score_train:", r2_train)
print("r2_score_test:", r2_test)
print("Cross validation score:", cv.mean())

scores.append({'AdaBoost Regressor': {'R2_Score_train': r2_train, 'R2_Score_test': r2_test, 'RMSE': rmse,'Cross Validaiton Score': cv.mean()}})

Best params for AdaBoost Regressor: {'learning_rate': 1.0, 'n_estimators': 9}
RMSE: 1909.8605009004098
r2_score_train: 0.8195680472027613
r2_score_test: 0.8002378993272472
Cross validation score: 0.7885921448685498


### Gradient Boosting Regressor

In [68]:
from sklearn.ensemble import GradientBoostingRegressor

estimator = GradientBoostingRegressor(random_state=42)
paramgrid = {'n_estimators': [50, 100, 150, 200,250,300], 'learning_rate': [0.01, 0.05, 0.1, 0.2, 0.5,0.8,0.1], 'max_depth': list(range(1,10))}
grid_search_gradientboost = GridSearchCV(estimator, param_grid,cv=5, scoring='r2')
grid_search_gradientboost.fit(X_train,y_train)
gradientboost_model = grid_search_gradientboost.best_estimator_

y_pred_train = gradientboost_model.predict(X_train)
y_pred_test = gradientboost_model.predict(X_test)

mse = mean_squared_error(y_test, y_pred_test)
r2_train = r2_score(y_train, y_pred_train)
r2_test = r2_score(y_test, y_pred_test)
cv = cross_val_score(gradientboost_model, X_train, y_train, cv=5, scoring='r2')
rmse = np.sqrt(mse)

print("Best params for GradientBoost Regressor:", grid_search_gradientboost.best_params_)
print("RMSE:", rmse)
print("r2_score_train:", r2_train)
print("r2_score_test:", r2_test)
print("Cross validation score:", cv.mean())

scores.append({'GradientBoost Regressor': {'R2_Score_train': r2_train, 'R2_Score_test': r2_test, 'RMSE': rmse,'Cross Validaiton Score': cv.mean()}})

Best params for GradientBoost Regressor: {'learning_rate': 0.5, 'n_estimators': 9}
RMSE: 1281.9832823113363
r2_score_train: 0.9312684056475835
r2_score_test: 0.9099933648356071
Cross validation score: 0.9197621523074947


### Xgboost Regressor

In [69]:
from xgboost import XGBRegressor

estimator = XGBRegressor(random_state=42)
param_grid = {'n_estimators': [50,100,150,200], 'learning_rate': [0.01, 0.05, 0.1, 0.2, 0.5,0.8,0.1], 'max_depth': list(range(1,10)), 'gamma': [0, 0.1, 0.5,1]}
grid_search_xgboost = GridSearchCV(estimator,param_grid,cv=5,scoring='r2')
grid_search_xgboost.fit(X_train,y_train)
xgboost_model = grid_search_xgboost.best_estimator_

y_pred_train = xgboost_model.predict(X_train)
y_pred_test = xgboost_model.predict(X_test)

mse = mean_squared_error(y_test, y_pred_test)
r2_train = r2_score(y_train, y_pred_train)
r2_test = r2_score(y_test, y_pred_test)
cv = cross_val_score(xgboost_model, X_train, y_train, cv=5, scoring='r2')
rmse = np.sqrt(mse)

print('Best params for Xgboost Regressor:', grid_search_xgboost.best_params_)
print("RMSE:", rmse)
print("r2_score_train:", r2_train)
print("r2_score_test:", r2_test)
print("Cross validation score:", cv.mean())

scores.append({'Xgboost Regressor': {'R2_Score_train': r2_train, 'R2_Score_test': r2_test, 'RMSE': rmse,'Cross Validaiton Score': cv.mean()}})

Best params for Xgboost Regressor: {'gamma': 0, 'learning_rate': 0.2, 'max_depth': 1, 'n_estimators': 150}
RMSE: 1156.2196833402325
r2_score_train: 0.9412646916206399
r2_score_test: 0.9267866094794561
Cross validation score: 0.9361449498477281


In [70]:
scores

[{'Linear Regression': {'R2_Score_train': 0.9203248984204583,
   'R2_Score_test': 0.8934441910972903,
   'RMSE': 1394.8697752480844,
   'Cross Validaiton Score': 0.9152470700075016}},
 {'Lasso Regression': {'R2_Score_train': 0.9189628656587738,
   'R2_Score_test': 0.8927995216098806,
   'RMSE': 1399.0829382437232,
   'Cross Validaiton Score': 0.9163771526227427}},
 {'Ridge Regression': {'R2_Score_train': 0.9203172219675557,
   'R2_Score_test': 0.8935234916114349,
   'RMSE': 1394.3506365953292,
   'Cross Validaiton Score': 0.9152529429786552}},
 {'KNeighbors Regressor': {'R2_Score_train': 0.8004902440366325,
   'R2_Score_test': 0.7371414620573089,
   'RMSE': 2190.817203978746,
   'Cross Validaiton Score': 0.7450997473965313}},
 {'KNeighbors Regressor': {'R2_Score_train': 0.9428259991754261,
   'R2_Score_test': 0.8788522726892909,
   'RMSE': 1487.3142343327984,
   'Cross Validaiton Score': 0.9009731573566582}},
 {'RandomForest Regresssor': {'R2_Score_train': 0.9572000144929567,
   'R2_Sc

In [71]:
model_name =[]
r2_train = []
r2_test = []
rmse =[]
cv = []

for model in scores:
    for name, metrics in model.items():
        model_name.append(name)
        r2_train.append(float(metrics['R2_Score_train']))
        r2_test.append(float(metrics['R2_Score_test']))
        rmse.append(float(metrics['RMSE']))
        cv.append(float(metrics['Cross Validaiton Score']))

scores_df = pd.DataFrame({
    'Model': model_name,
    'R2_Train': r2_train,
    'R2_Test': r2_test,
    'RMSE':rmse,
    'Cross Validaiton Score':cv
})
scores_df

Unnamed: 0,Model,R2_Train,R2_Test,RMSE,Cross Validaiton Score
0,Linear Regression,0.920325,0.893444,1394.869775,0.915247
1,Lasso Regression,0.918963,0.8928,1399.082938,0.916377
2,Ridge Regression,0.920317,0.893523,1394.350637,0.915253
3,KNeighbors Regressor,0.80049,0.737141,2190.817204,0.7451
4,KNeighbors Regressor,0.942826,0.878852,1487.314234,0.900973
5,RandomForest Regresssor,0.9572,0.909591,1284.845893,0.921481
6,AdaBoost Regressor,0.819568,0.800238,1909.860501,0.788592
7,GradientBoost Regressor,0.931268,0.909993,1281.983282,0.919762
8,Xgboost Regressor,0.941265,0.926787,1156.219683,0.936145
