## Model_training

In [1]:
import numpy as np
import pandas as pd
from sklearn.preprocessing import OneHotEncoder, StandardScaler
from sklearn.compose import ColumnTransformer
from sklearn.model_selection import train_test_split

In [2]:
df = pd.read_csv('../data/skewed_data.csv')
df.head()

Unnamed: 0,Brand,Battery_capacity(mAh),Screen_size(inches),Touchscreen,Processor,Operating system,Wi-Fi,Bluetooth,GPS,3G,4G/ LTE,Resolution_width(px),Resolution_height(px),Price,Internal_storage(GB),Rear_Camera(MP),Front_Camera(MP),RAM(GB),Number of SIMs
0,oneplus,4085,7,Yes,8,android,Yes,Yes,Yes,Yes,Yes,1440,3120,10.985276,5.549076,4.142627,3.689184,3.180631,16
1,realme,4000,7,Yes,8,android,Yes,Yes,Yes,Yes,Yes,1080,2400,10.23996,4.174387,4.463819,3.689184,2.287895,16
2,apple,3969,7,Yes,6,ios,Yes,Yes,Yes,Yes,Yes,1242,2688,11.579658,4.174387,2.672344,3.254454,1.838961,16
3,apple,3110,6,Yes,6,ios,Yes,Yes,Yes,Yes,Yes,828,1792,11.049317,4.174387,2.672344,3.254454,1.838961,16
4,lg,4000,6,Yes,8,android,Yes,Yes,Yes,No,No,1080,2340,10.819598,4.859812,2.672344,4.858008,2.287895,1


In [3]:
df.dtypes

Brand                     object
Battery_capacity(mAh)      int64
Screen_size(inches)        int64
Touchscreen               object
Processor                  int64
Operating system          object
Wi-Fi                     object
Bluetooth                 object
GPS                       object
3G                        object
4G/ LTE                   object
Resolution_width(px)       int64
Resolution_height(px)      int64
Price                    float64
Internal_storage(GB)     float64
Rear_Camera(MP)          float64
Front_Camera(MP)         float64
RAM(GB)                  float64
Number of SIMs             int64
dtype: object

In [3]:
X = df.drop('Price', axis=1)
y = df['Price']

In [4]:
num_features = X.select_dtypes(exclude='object').columns
categorical_features = ['Brand', 'Touchscreen', 'Operating system', 'Wi-Fi',
       'Bluetooth', 'GPS', '3G', '4G/ LTE']


In [5]:

preprocessor = ColumnTransformer([
    ('OneHotEncoder',OneHotEncoder(), categorical_features),
    ('MinMaxScaler', StandardScaler(), num_features)
])
X_preprocessed = preprocessor.fit_transform(X)
X_preprocessed.shape


(1359, 105)

In [6]:
X_train, X_test, y_train, y_test = train_test_split(X_preprocessed, y, test_size=0.2, random_state=42)
scores = []

## Linear regression

In [8]:
from sklearn.linear_model import LinearRegression
from sklearn.metrics import mean_squared_error, r2_score
from sklearn.model_selection import cross_val_score

linear_model = LinearRegression()
linear_model.fit(X_train, y_train)
y_pred_train = linear_model.predict(X_train)
y_pred_test = linear_model.predict(X_test)

mse = mean_squared_error(y_test, y_pred_test)
r2_train = r2_score(y_train, y_pred_train)
r2_test = r2_score(y_test, y_pred_test)
cv = cross_val_score(linear_model, X_train, y_train, cv=5, scoring='r2')
print('MSE: ', mse)
print('r2_score_train: ', r2_train)
print('r2_score_test: ', r2_test)
print('Cross Validation Score: ', cv.mean())

scores.append({'Linear Regression': {'R2_Score_train': r2_train,'R2_Score_test':r2_test, 'MSE': mse, 'Cross Validation Score': cv.mean()}})



MSE:  0.160258124592934
r2_score_train:  0.7659135946654486
r2_score_test:  0.7373866629579231
Cross Validation Score:  0.7052685889793147


## Lasso Regression

In [9]:
from sklearn.linear_model import Lasso
from sklearn.model_selection import GridSearchCV


estimator = Lasso()
paramgrid = {'alpha': list(range(1,100))}
grid_search_lasso = GridSearchCV(estimator, paramgrid, cv=5, scoring='r2')
grid_search_lasso.fit(X_train, y_train)
lasso_model = grid_search_lasso.best_estimator_

y_pred_train = lasso_model.predict(X_train)
y_pred_test = lasso_model.predict(X_test)
mse = mean_squared_error(y_test, y_pred_test)
r2_train = r2_score(y_train, y_pred_train)
r2_test = r2_score(y_test, y_pred_test)
cv = cross_val_score(lasso_model, X_train, y_train, cv=5, scoring='r2')
print('Best params for Lasso Regression: ', grid_search_lasso.best_params_)
print('MSE: ', mse)
print('r2_score_train: ', r2_train)
print('r2_score_test: ', r2_test)
print('Cross Validation Score: ', cv.mean())

scores.append({'Lasso Regression': {'R2_Score_train': r2_train,'R2_Score_test':r2_test, 'MSE': mse, 'Cross Validation Score': cv.mean()}})


Best params for Lasso Regression:  {'alpha': 1}
MSE:  0.6120362053965209
r2_score_train:  0.0
r2_score_test:  -0.002937421725182965
Cross Validation Score:  -0.0061920203840522435


  _data = np.array(data, dtype=dtype, copy=copy,


## Ridge Regression

In [10]:
from sklearn.linear_model import Ridge

estimator = Ridge()
paramgrid = {'alpha': list(range(1,100))}
grid_search_ridge = GridSearchCV(estimator, paramgrid, cv=5, scoring='r2')
grid_search_ridge.fit(X_train, y_train)
ridge_model = grid_search_ridge.best_estimator_ 

y_pred_train = ridge_model.predict(X_train)
y_pred_test = ridge_model.predict(X_test)
mse = mean_squared_error(y_test, y_pred_test)
r2_train = r2_score(y_train, y_pred_train)
r2_test = r2_score(y_test, y_pred_test)
cv = cross_val_score(ridge_model, X_train, y_train, cv=5, scoring='r2')

print('Best params for Ridge Regression: ', grid_search_ridge.best_params_)
print('MSE: ', mse)
print('r2_score_train: ', r2_train)
print('r2_score_test: ', r2_test)
print('Cross Validation Score: ', cv.mean())

scores.append({'Ridge Regression': {'R2_Score_train': float(r2_train),'R2_Score_test':float(r2_test), 'MSE': float(mse), 'Cross Validation Score': float(cv.mean())}})


Best params for Ridge Regression:  {'alpha': 1}
MSE:  0.15776077915773534
r2_score_train:  0.7607925578139166
r2_score_test:  0.741479037183256
Cross Validation Score:  0.7082008331796612


## KNeighbors Regressor

In [11]:
from sklearn.neighbors import KNeighborsRegressor

estimator = KNeighborsRegressor()
param_grid = {'n_neighbors': list(range(1,100))}
grid_search_knn = GridSearchCV(estimator, param_grid, cv=5, scoring='r2')
grid_search_knn.fit(X_train, y_train)
knn_model = grid_search_knn.best_estimator_

y_pred_train =knn_model.predict(X_train)
y_pred_test =knn_model.predict(X_test)
mse = mean_squared_error(y_test, y_pred_test)
r2_train = r2_score(y_train, y_pred_train)
r2_test = r2_score(y_test, y_pred_test)
cv = cross_val_score(knn_model, X_train, y_train, cv=5, scoring='r2')

print('Best params for KNN Regressor: ', grid_search_knn.best_params_)
print('MSE: ', mse)
print('r2_score_train: ', r2_train)
print('r2_score_test: ', r2_test)
print('Cross Validation Score: ', cv.mean())

scores.append({'KNeighbors Regressor': {'R2_Score_train': r2_train,'R2_Score_test':r2_test, 'MSE': mse, 'Cross Validation Score': cv.mean()}})


Best params for KNN Regressor:  {'n_neighbors': 11}
MSE:  0.19694216855337648
r2_score_train:  0.6853521932714332
r2_score_test:  0.6772728982104551
Cross Validation Score:  0.6206688721108049


## Decision Regression

In [12]:
from sklearn.tree import DecisionTreeRegressor

estimator = DecisionTreeRegressor(random_state=42)
paramgrid = {'max_depth': list(range(1,10)), 'criterion': ['squared_error', 'absolute_error'] }
grid_search_decision = GridSearchCV(estimator, paramgrid, cv=5, scoring='r2')
grid_search_decision.fit(X_train, y_train)
decisiontree_model = grid_search_decision.best_estimator_

y_pred_train = decisiontree_model.predict(X_train)
y_pred_test = decisiontree_model.predict(X_test)
mse = mean_squared_error(y_test, y_pred_test)
r2_train = r2_score(y_train, y_pred_train)
r2_test = r2_score(y_test, y_pred_test)
cv = cross_val_score(decisiontree_model, X_train, y_train, cv=5, scoring='r2')

print('Best params for DecisionTree Regressor: ', grid_search_decision.best_params_)
print('MSE: ', mse)
print('r2_score_train: ', r2_train)
print('r2_score_test: ', r2_test)
print('Cross Validation Score: ', cv.mean())

scores.append({'DecisionTree Regression': {'R2_Score_train': r2_train,'R2_Score_test':r2_test, 'MSE': mse, 'Cross Validation Score': cv.mean()}})



Best params for DecisionTree Regressor:  {'criterion': 'squared_error', 'max_depth': 6}
MSE:  0.2182501299245858
r2_score_train:  0.7511013128924121
r2_score_test:  0.6423557615256815
Cross Validation Score:  0.5664175065761237


## Random Forest Regressor

In [13]:
from sklearn.ensemble import RandomForestRegressor

estimator = RandomForestRegressor(random_state=42)
paramgrid = {'n_estimators': list(range(1,10)),'max_depth': list(range(1,10)), 'criterion': ['squared_error', 'absolute_error']}
grid_search_randomforest = GridSearchCV(estimator, paramgrid, cv=5, scoring='r2')
grid_search_randomforest.fit(X_train, y_train)
randomforest_model = grid_search_randomforest.best_estimator_

y_pred_train = randomforest_model.predict(X_train)
y_pred_test = randomforest_model.predict(X_test)
mse = mean_squared_error(y_test, y_pred_test)
r2_train = r2_score(y_train, y_pred_train)
r2_test = r2_score(y_test, y_pred_test)
cv = cross_val_score(randomforest_model, X_train, y_train, cv=5, scoring='r2')

print('Best params for RandomForest Regressor: ', grid_search_randomforest.best_params_)
print('MSE: ', mse)
print('r2_score_train: ', r2_train)
print('r2_score_test: ', r2_test)
print('Cross Validation Score: ', cv.mean())

scores.append({'RandomForest Regression': {'R2_Score_train': r2_train,'R2_Score_test':r2_test, 'MSE': mse, 'Cross Validation Score': cv.mean()}})



Best params for RandomForest Regressor:  {'criterion': 'squared_error', 'max_depth': 9, 'n_estimators': 9}
MSE:  0.17182733668229727
r2_score_train:  0.8448048649625456
r2_score_test:  0.7184283143471895
Cross Validation Score:  0.6739261182213688


## AdaBoost Regressor

In [14]:
from sklearn.ensemble import AdaBoostRegressor

estimator = AdaBoostRegressor(random_state=42)
paramgrid = {'n_estimators': list(range(1,10)), 'learning_rate': [0.1, 0.5, 1.0]}
grid_search_adaboost = GridSearchCV(estimator, paramgrid, cv=5, scoring='r2')
grid_search_adaboost.fit(X_train, y_train)
adaboost_model = grid_search_adaboost.best_estimator_

y_pred_train = adaboost_model.predict(X_train)
y_pred_test = adaboost_model.predict(X_test)
mse = mean_squared_error(y_test, y_pred_test)
r2_train = r2_score(y_train, y_pred_train)
r2_test = r2_score(y_test, y_pred_test)
cv = cross_val_score(adaboost_model, X_train, y_train, cv=5, scoring='r2')

print('Best params for Adaboost Regressor: ', grid_search_adaboost.best_params_)
print('MSE: ', mse)
print('r2_score_train: ', r2_train)
print('r2_score_test: ', r2_test)
print('Cross Validation Score: ', cv.mean())

scores.append({'AdaBoost Regressor': {'R2_Score_train': r2_train,'R2_Score_test':r2_test, 'MSE': mse, 'Cross Validation Score': cv.mean()}})



Best params for Adaboost Regressor:  {'learning_rate': 0.1, 'n_estimators': 6}
MSE:  0.22319492184043005
r2_score_train:  0.5902981498274031
r2_score_test:  0.6342527819775495
Cross Validation Score:  0.5686903127780095


## Gradient Boosting Regressor

In [11]:
from sklearn.ensemble import GradientBoostingRegressor
from sklearn.model_selection import GridSearchCV
estimator = GradientBoostingRegressor(random_state=42)
paramgrid = {'n_estimators': [50, 100, 150, 200,250,300], 'learning_rate': [0.01, 0.05, 0.1, 0.2, 0.5,0.8,0.1], 'max_depth': list(range(1,10))}
grid_search_gradientboost = GridSearchCV(estimator, paramgrid, cv=5, scoring='r2')
grid_search_gradientboost.fit(X_train, y_train)
gradientboost_model = grid_search_gradientboost.best_estimator_

y_pred_train = gradientboost_model.predict(X_train)
y_pred_test = gradientboost_model.predict(X_test)
mse = mean_squared_error(y_test, y_pred_test)
r2_train = r2_score(y_train, y_pred_train)
r2_test = r2_score(y_test, y_pred_test)
cv = cross_val_score(gradientboost_model, X_train, y_train, cv=5, scoring='r2')

print('Best params for Gradientboost Regressor: ', grid_search_gradientboost.best_params_)
print('MSE: ', mse)
print('r2_score_train: ', r2_train)
print('r2_score_test: ', r2_test)
print('Cross Validation Score: ', cv.mean())

scores.append({'GradientBoost Regressor': {'R2_Score_train': r2_train,'R2_Score_test':r2_test, 'MSE': mse, 'Cross Validation Score': cv.mean()}})


Best params for Gradientboost Regressor:  {'learning_rate': 0.2, 'max_depth': 2, 'n_estimators': 300}
MSE:  0.1480285372182897
r2_score_train:  0.8604998078306849
r2_score_test:  0.7574271617423735
Cross Validation Score:  0.7325138094717951


## Xgboost Regressor

In [16]:
from xgboost import XGBRegressor

estimator = XGBRegressor(random_state=42)
param_grid = {'n_estimators': [50, 100, 150, 200], 'learning_rate': [0.01, 0.05, 0.1, 0.2, 0.5,0.8,0.1], 'max_depth': list(range(1,10)), 'gamma': [0, 0.1, 0.5,1]}
grid_search_xgboost = GridSearchCV(estimator, param_grid, cv=5, scoring='r2')
grid_search_xgboost.fit(X_train, y_train)
xgboost_model = grid_search_xgboost.best_estimator_

y_pred_train = xgboost_model.predict(X_train)
y_pred_test = xgboost_model.predict(X_test)
mse = mean_squared_error(y_test, y_pred_test)
r2_train = r2_score(y_train, y_pred_train)
r2_test = r2_score(y_test, y_pred_test)
cv = cross_val_score(xgboost_model, X_train, y_train, cv=5, scoring='r2')

print('Best params for Xgboost Regressor: ', grid_search_xgboost.best_params_)
print('MSE: ', mse)
print('r2_score_train: ', r2_train)
print('r2_score_test: ', r2_test)
print('Cross Validation Score: ', cv.mean())

scores.append({'Xgboost Regressor': {'R2_Score_train': r2_train,'R2_Score_test':r2_test, 'MSE': mse, 'Cross Validation Score': cv.mean()}})





Best params for Xgboost Regressor:  {'gamma': 0, 'learning_rate': 0.1, 'max_depth': 4, 'n_estimators': 200}
MSE:  0.15235233215345426
r2_score_train:  0.8731713623115025
r2_score_test:  0.7503418035460674
Cross Validation Score:  0.7255026630945629


In [17]:
scores

[{'Linear Regression': {'R2_Score_train': 0.7659135946907334,
   'R2_Score_test': 0.7373863447797128,
   'MSE': np.float64(0.16025831875917038),
   'Cross Validation Score': np.float64(0.7052685825079711)}},
 {'Lasso Regression': {'R2_Score_train': 0.0,
   'R2_Score_test': -0.002937421725182965,
   'MSE': np.float64(0.6120362053965209),
   'Cross Validation Score': np.float64(-0.0061920203840522435)}},
 {'Ridge Regression': {'R2_Score_train': 0.7607925578139166,
   'R2_Score_test': 0.741479037183256,
   'MSE': 0.15776077915773534,
   'Cross Validation Score': 0.7082008331796612}},
 {'KNeighbors Regressor': {'R2_Score_train': 0.6853521932714332,
   'R2_Score_test': 0.6772728982104551,
   'MSE': np.float64(0.19694216855337648),
   'Cross Validation Score': np.float64(0.6206688721108049)}},
 {'DecisionTree Regression': {'R2_Score_train': 0.7511013128924121,
   'R2_Score_test': 0.6423557615256815,
   'MSE': np.float64(0.2182501299245858),
   'Cross Validation Score': np.float64(0.566417506

In [18]:
model_name = []
r2_train = []
r2_test = []
mse = []
cv = []
#pd.reset_option('display.float_format')
for model in scores:
    for name, metrics in model.items():
        model_name.append(name)
        r2_train.append(float(metrics['R2_Score_train']))
        r2_test .append(float(metrics['R2_Score_test']))
        mse.append(float(metrics['MSE']))
        cv.append(float(metrics['Cross Validation Score']))

scores_df = pd.DataFrame({
    'Model': model_name,
    'R2_Train': r2_train,
    'R2_Test': r2_test,
    'MSE': mse,
    'Cross Validation Score': cv
})

scores_df

Unnamed: 0,Model,R2_Train,R2_Test,MSE,Cross Validation Score
0,Linear Regression,0.765914,0.737386,0.160258,0.705269
1,Lasso Regression,0.0,-0.002937,0.612036,-0.006192
2,Ridge Regression,0.760793,0.741479,0.157761,0.708201
3,KNeighbors Regressor,0.685352,0.677273,0.196942,0.620669
4,DecisionTree Regression,0.751101,0.642356,0.21825,0.566418
5,RandomForest Regression,0.844805,0.718428,0.171827,0.673926
6,AdaBoost Regressor,0.590298,0.634253,0.223195,0.56869
7,GradientBoost Regressor,0.833843,0.754273,0.149953,0.727488
8,Xgboost Regressor,0.873171,0.750342,0.152352,0.725503


##### The best model should be XGBoost