## Model_training

In [1]:
import numpy as np
import pandas as pd
from sklearn.preprocessing import OneHotEncoder, StandardScaler
from sklearn.compose import ColumnTransformer
from sklearn.model_selection import train_test_split

In [2]:
df = pd.read_csv('../data/skewed_data.csv')
df.head()

Unnamed: 0,Brand,Battery_capacity(mAh),Screen_size(inches),Processor,Operating system,Resolution_height(px),Price,Internal_storage(GB),Resolution_width(px),Rear_Camera(MP),Front_Camera(MP),RAM(GB),Number of SIMs
0,oneplus,4085,7,8,android,3120.0,10.985276,5.549076,7.273093,3.89182,2.833213,2.564949,16
1,realme,4000,7,8,android,2400.0,10.23996,4.174387,6.985642,4.174387,2.833213,1.94591,16
2,apple,3969,7,6,ios,2688.0,11.579658,4.174387,7.125283,2.564949,2.564949,1.609438,16
3,apple,3110,6,6,ios,1792.0,11.049317,4.174387,6.72022,2.564949,2.564949,1.609438,16
4,lg,4000,6,8,android,2340.0,10.819598,4.859812,6.985642,2.564949,3.496508,1.94591,1


In [3]:
df.dtypes

Brand                     object
Battery_capacity(mAh)      int64
Screen_size(inches)        int64
Processor                  int64
Operating system          object
Resolution_height(px)    float64
Price                    float64
Internal_storage(GB)     float64
Resolution_width(px)     float64
Rear_Camera(MP)          float64
Front_Camera(MP)         float64
RAM(GB)                  float64
Number of SIMs             int64
dtype: object

In [4]:
X = df.drop('Price', axis=1)
y = df['Price']

In [5]:
num_features = X.select_dtypes(exclude='object').columns
categorical_features = ['Brand', 'Operating system']


In [6]:

preprocessor = ColumnTransformer([
    ('OneHotEncoder',OneHotEncoder(), categorical_features),
    ('MinMaxScaler', StandardScaler(), num_features)
])
X_preprocessed = preprocessor.fit_transform(X)
X_preprocessed.shape


(1818, 96)

In [7]:
X_train, X_test, y_train, y_test = train_test_split(X_preprocessed, y, test_size=0.2, random_state=42)
scores = []

## Linear regression

In [8]:
from sklearn.linear_model import LinearRegression
from sklearn.metrics import mean_squared_error, r2_score
from sklearn.model_selection import cross_val_score

linear_model = LinearRegression()
linear_model.fit(X_train, y_train)
y_pred_train = linear_model.predict(X_train)
y_pred_test = linear_model.predict(X_test)

mse = mean_squared_error(y_test, y_pred_test)
r2_train = r2_score(y_train, y_pred_train)
r2_test = r2_score(y_test, y_pred_test)
cv = cross_val_score(linear_model, X_train, y_train, cv=5, scoring='r2')
print('MSE: ', mse)
print('r2_score_train: ', r2_train)
print('r2_score_test: ', r2_test)
print('Cross Validation Score: ', cv.mean())

scores.append({'Linear Regression': {'R2_Score_train': r2_train,'R2_Score_test':r2_test, 'MSE': mse, 'Cross Validation Score': cv.mean()}})



ValueError: Input X contains NaN.
LinearRegression does not accept missing values encoded as NaN natively. For supervised learning, you might want to consider sklearn.ensemble.HistGradientBoostingClassifier and Regressor which accept missing values encoded as NaNs natively. Alternatively, it is possible to preprocess the data, for instance by using an imputer transformer in a pipeline or drop samples with missing values. See https://scikit-learn.org/stable/modules/impute.html You can find a list of all estimators that handle NaN values at the following page: https://scikit-learn.org/stable/modules/impute.html#estimators-that-handle-nan-values

## Lasso Regression

In [22]:
from sklearn.linear_model import Lasso
from sklearn.model_selection import GridSearchCV


estimator = Lasso()
paramgrid = {'alpha': list(range(1,100))}
grid_search_lasso = GridSearchCV(estimator, paramgrid, cv=5, scoring='r2')
grid_search_lasso.fit(X_train, y_train)
lasso_model = grid_search_lasso.best_estimator_

y_pred_train = lasso_model.predict(X_train)
y_pred_test = lasso_model.predict(X_test)
mse = mean_squared_error(y_test, y_pred_test)
r2_train = r2_score(y_train, y_pred_train)
r2_test = r2_score(y_test, y_pred_test)
cv = cross_val_score(lasso_model, X_train, y_train, cv=5, scoring='r2')
print('Best params for Lasso Regression: ', grid_search_lasso.best_params_)
print('MSE: ', mse)
print('r2_score_train: ', r2_train)
print('r2_score_test: ', r2_test)
print('Cross Validation Score: ', cv.mean())

scores.append({'Lasso Regression': {'R2_Score_train': r2_train,'R2_Score_test':r2_test, 'MSE': mse, 'Cross Validation Score': cv.mean()}})


Best params for Lasso Regression:  {'alpha': 1}
MSE:  0.6120362053965209
r2_score_train:  0.0
r2_score_test:  -0.002937421725182965
Cross Validation Score:  -0.0061920203840522435


  _data = np.array(data, dtype=dtype, copy=copy,


## Ridge Regression

In [23]:
from sklearn.linear_model import Ridge

estimator = Ridge()
paramgrid = {'alpha': list(range(1,100))}
grid_search_ridge = GridSearchCV(estimator, paramgrid, cv=5, scoring='r2')
grid_search_ridge.fit(X_train, y_train)
ridge_model = grid_search_ridge.best_estimator_ 

y_pred_train = ridge_model.predict(X_train)
y_pred_test = ridge_model.predict(X_test)
mse = mean_squared_error(y_test, y_pred_test)
r2_train = r2_score(y_train, y_pred_train)
r2_test = r2_score(y_test, y_pred_test)
cv = cross_val_score(ridge_model, X_train, y_train, cv=5, scoring='r2')

print('Best params for Ridge Regression: ', grid_search_ridge.best_params_)
print('MSE: ', mse)
print('r2_score_train: ', r2_train)
print('r2_score_test: ', r2_test)
print('Cross Validation Score: ', cv.mean())

scores.append({'Ridge Regression': {'R2_Score_train': float(r2_train),'R2_Score_test':float(r2_test), 'MSE': float(mse), 'Cross Validation Score': float(cv.mean())}})


Best params for Ridge Regression:  {'alpha': 1}
MSE:  0.1559829330816742
r2_score_train:  0.7554569480560974
r2_score_test:  0.7443923752244161
Cross Validation Score:  0.7051661868608554


## KNeighbors Regressor

In [24]:
from sklearn.neighbors import KNeighborsRegressor

estimator = KNeighborsRegressor()
param_grid = {'n_neighbors': list(range(1,100))}
grid_search_knn = GridSearchCV(estimator, param_grid, cv=5, scoring='r2')
grid_search_knn.fit(X_train, y_train)
knn_model = grid_search_knn.best_estimator_

y_pred_train =knn_model.predict(X_train)
y_pred_test =knn_model.predict(X_test)
mse = mean_squared_error(y_test, y_pred_test)
r2_train = r2_score(y_train, y_pred_train)
r2_test = r2_score(y_test, y_pred_test)
cv = cross_val_score(knn_model, X_train, y_train, cv=5, scoring='r2')

print('Best params for KNN Regressor: ', grid_search_knn.best_params_)
print('MSE: ', mse)
print('r2_score_train: ', r2_train)
print('r2_score_test: ', r2_test)
print('Cross Validation Score: ', cv.mean())

scores.append({'KNeighbors Regressor': {'R2_Score_train': r2_train,'R2_Score_test':r2_test, 'MSE': mse, 'Cross Validation Score': cv.mean()}})


Best params for KNN Regressor:  {'n_neighbors': 8}
MSE:  0.203720365504696
r2_score_train:  0.7120216665972156
r2_score_test:  0.6661655367269992
Cross Validation Score:  0.6277932406813268


## Decision Regression

In [25]:
from sklearn.tree import DecisionTreeRegressor

estimator = DecisionTreeRegressor(random_state=42)
paramgrid = {'max_depth': list(range(1,10)), 'criterion': ['squared_error', 'absolute_error'] }
grid_search_decision = GridSearchCV(estimator, paramgrid, cv=5, scoring='r2')
grid_search_decision.fit(X_train, y_train)
decisiontree_model = grid_search_decision.best_estimator_

y_pred_train = decisiontree_model.predict(X_train)
y_pred_test = decisiontree_model.predict(X_test)
mse = mean_squared_error(y_test, y_pred_test)
r2_train = r2_score(y_train, y_pred_train)
r2_test = r2_score(y_test, y_pred_test)
cv = cross_val_score(decisiontree_model, X_train, y_train, cv=5, scoring='r2')

print('Best params for DecisionTree Regressor: ', grid_search_decision.best_params_)
print('MSE: ', mse)
print('r2_score_train: ', r2_train)
print('r2_score_test: ', r2_test)
print('Cross Validation Score: ', cv.mean())

scores.append({'DecisionTree Regression': {'R2_Score_train': r2_train,'R2_Score_test':r2_test, 'MSE': mse, 'Cross Validation Score': cv.mean()}})



Best params for DecisionTree Regressor:  {'criterion': 'squared_error', 'max_depth': 5}
MSE:  0.21677048949320457
r2_score_train:  0.7008563812762896
r2_score_test:  0.6447804330504133
Cross Validation Score:  0.5596343118346094


## Random Forest Regressor

In [26]:
from sklearn.ensemble import RandomForestRegressor

estimator = RandomForestRegressor(random_state=42)
paramgrid = {'n_estimators': list(range(1,10)),'max_depth': list(range(1,10)), 'criterion': ['squared_error', 'absolute_error']}
grid_search_randomforest = GridSearchCV(estimator, paramgrid, cv=5, scoring='r2')
grid_search_randomforest.fit(X_train, y_train)
randomforest_model = grid_search_randomforest.best_estimator_

y_pred_train = randomforest_model.predict(X_train)
y_pred_test = randomforest_model.predict(X_test)
mse = mean_squared_error(y_test, y_pred_test)
r2_train = r2_score(y_train, y_pred_train)
r2_test = r2_score(y_test, y_pred_test)
cv = cross_val_score(randomforest_model, X_train, y_train, cv=5, scoring='r2')

print('Best params for RandomForest Regressor: ', grid_search_randomforest.best_params_)
print('MSE: ', mse)
print('r2_score_train: ', r2_train)
print('r2_score_test: ', r2_test)
print('Cross Validation Score: ', cv.mean())

scores.append({'RandomForest Regression': {'R2_Score_train': r2_train,'R2_Score_test':r2_test, 'MSE': mse, 'Cross Validation Score': cv.mean()}})



Best params for RandomForest Regressor:  {'criterion': 'squared_error', 'max_depth': 9, 'n_estimators': 9}
MSE:  0.17774577794470325
r2_score_train:  0.8442865748050865
r2_score_test:  0.7087298256499339
Cross Validation Score:  0.6749093959082982


## AdaBoost Regressor

In [27]:
from sklearn.ensemble import AdaBoostRegressor

estimator = AdaBoostRegressor(random_state=42)
paramgrid = {'n_estimators': list(range(1,10)), 'learning_rate': [0.1, 0.5, 1.0]}
grid_search_adaboost = GridSearchCV(estimator, paramgrid, cv=5, scoring='r2')
grid_search_adaboost.fit(X_train, y_train)
adaboost_model = grid_search_adaboost.best_estimator_

y_pred_train = adaboost_model.predict(X_train)
y_pred_test = adaboost_model.predict(X_test)
mse = mean_squared_error(y_test, y_pred_test)
r2_train = r2_score(y_train, y_pred_train)
r2_test = r2_score(y_test, y_pred_test)
cv = cross_val_score(adaboost_model, X_train, y_train, cv=5, scoring='r2')

print('Best params for Adaboost Regressor: ', grid_search_adaboost.best_params_)
print('MSE: ', mse)
print('r2_score_train: ', r2_train)
print('r2_score_test: ', r2_test)
print('Cross Validation Score: ', cv.mean())

scores.append({'AdaBoost Regressor': {'R2_Score_train': r2_train,'R2_Score_test':r2_test, 'MSE': mse, 'Cross Validation Score': cv.mean()}})



Best params for Adaboost Regressor:  {'learning_rate': 0.1, 'n_estimators': 6}
MSE:  0.22319492184043005
r2_score_train:  0.5902981498274031
r2_score_test:  0.6342527819775495
Cross Validation Score:  0.5686903127780095


## Gradient Boosting Regressor

In [28]:
from sklearn.ensemble import GradientBoostingRegressor
from sklearn.model_selection import GridSearchCV
estimator = GradientBoostingRegressor(random_state=42)
paramgrid = {'n_estimators': [50, 100, 150, 200,250,300], 'learning_rate': [0.01, 0.05, 0.1, 0.2, 0.5,0.8,0.1], 'max_depth': list(range(1,10))}
grid_search_gradientboost = GridSearchCV(estimator, paramgrid, cv=5, scoring='r2')
grid_search_gradientboost.fit(X_train, y_train)
gradientboost_model = grid_search_gradientboost.best_estimator_

y_pred_train = gradientboost_model.predict(X_train)
y_pred_test = gradientboost_model.predict(X_test)
mse = mean_squared_error(y_test, y_pred_test)
r2_train = r2_score(y_train, y_pred_train)
r2_test = r2_score(y_test, y_pred_test)
cv = cross_val_score(gradientboost_model, X_train, y_train, cv=5, scoring='r2')

print('Best params for Gradientboost Regressor: ', grid_search_gradientboost.best_params_)
print('MSE: ', mse)
print('r2_score_train: ', r2_train)
print('r2_score_test: ', r2_test)
print('Cross Validation Score: ', cv.mean())

scores.append({'GradientBoost Regressor': {'R2_Score_train': r2_train,'R2_Score_test':r2_test, 'MSE': mse, 'Cross Validation Score': cv.mean()}})


  _data = np.array(data, dtype=dtype, copy=copy,


Best params for Gradientboost Regressor:  {'learning_rate': 0.2, 'max_depth': 2, 'n_estimators': 300}
MSE:  0.15058193849257276
r2_score_train:  0.8581559488924868
r2_score_test:  0.7532429293912822
Cross Validation Score:  0.7324726326805019


## Xgboost Regressor

In [29]:
from xgboost import XGBRegressor

estimator = XGBRegressor(random_state=42)
param_grid = {'n_estimators': [50, 100, 150, 200], 'learning_rate': [0.01, 0.05, 0.1, 0.2, 0.5,0.8,0.1], 'max_depth': list(range(1,10)), 'gamma': [0, 0.1, 0.5,1]}
grid_search_xgboost = GridSearchCV(estimator, param_grid, cv=5, scoring='r2')
grid_search_xgboost.fit(X_train, y_train)
xgboost_model = grid_search_xgboost.best_estimator_

y_pred_train = xgboost_model.predict(X_train)
y_pred_test = xgboost_model.predict(X_test)
mse = mean_squared_error(y_test, y_pred_test)
r2_train = r2_score(y_train, y_pred_train)
r2_test = r2_score(y_test, y_pred_test)
cv = cross_val_score(xgboost_model, X_train, y_train, cv=5, scoring='r2')

print('Best params for Xgboost Regressor: ', grid_search_xgboost.best_params_)
print('MSE: ', mse)
print('r2_score_train: ', r2_train)
print('r2_score_test: ', r2_test)
print('Cross Validation Score: ', cv.mean())

scores.append({'Xgboost Regressor': {'R2_Score_train': r2_train,'R2_Score_test':r2_test, 'MSE': mse, 'Cross Validation Score': cv.mean()}})





Best params for Xgboost Regressor:  {'gamma': 0, 'learning_rate': 0.1, 'max_depth': 4, 'n_estimators': 200}
MSE:  0.1501891998843222
r2_score_train:  0.8724065532193412
r2_score_test:  0.7538865061007933
Cross Validation Score:  0.7299810909096693


In [30]:
scores

[{'Linear Regression': {'R2_Score_train': 0.7604843212165646,
   'R2_Score_test': 0.7411758005674514,
   'MSE': np.float64(0.15794582738073923),
   'Cross Validation Score': np.float64(0.7034285469063831)}},
 {'Lasso Regression': {'R2_Score_train': 0.0,
   'R2_Score_test': -0.002937421725182965,
   'MSE': np.float64(0.6120362053965209),
   'Cross Validation Score': np.float64(-0.0061920203840522435)}},
 {'Ridge Regression': {'R2_Score_train': 0.7554569480560974,
   'R2_Score_test': 0.7443923752244161,
   'MSE': 0.1559829330816742,
   'Cross Validation Score': 0.7051661868608554}},
 {'KNeighbors Regressor': {'R2_Score_train': 0.7120216665972156,
   'R2_Score_test': 0.6661655367269992,
   'MSE': np.float64(0.203720365504696),
   'Cross Validation Score': np.float64(0.6277932406813268)}},
 {'DecisionTree Regression': {'R2_Score_train': 0.7008563812762896,
   'R2_Score_test': 0.6447804330504133,
   'MSE': np.float64(0.21677048949320457),
   'Cross Validation Score': np.float64(0.5596343118

In [31]:
model_name = []
r2_train = []
r2_test = []
mse = []
cv = []
#pd.reset_option('display.float_format')
for model in scores:
    for name, metrics in model.items():
        model_name.append(name)
        r2_train.append(float(metrics['R2_Score_train']))
        r2_test .append(float(metrics['R2_Score_test']))
        mse.append(float(metrics['MSE']))
        cv.append(float(metrics['Cross Validation Score']))

scores_df = pd.DataFrame({
    'Model': model_name,
    'R2_Train': r2_train,
    'R2_Test': r2_test,
    'MSE': mse,
    'Cross Validation Score': cv
})

scores_df

Unnamed: 0,Model,R2_Train,R2_Test,MSE,Cross Validation Score
0,Linear Regression,0.760484,0.741176,0.157946,0.703429
1,Lasso Regression,0.0,-0.002937,0.612036,-0.006192
2,Ridge Regression,0.755457,0.744392,0.155983,0.705166
3,KNeighbors Regressor,0.712022,0.666166,0.20372,0.627793
4,DecisionTree Regression,0.700856,0.64478,0.21677,0.559634
5,RandomForest Regression,0.844287,0.70873,0.177746,0.674909
6,AdaBoost Regressor,0.590298,0.634253,0.223195,0.56869
7,GradientBoost Regressor,0.858156,0.753243,0.150582,0.732473
8,Xgboost Regressor,0.872407,0.753887,0.150189,0.729981


##### The best model should be XGBoost