## Model_training

In [5]:
import numpy as np
import pandas as pd
from sklearn.preprocessing import OneHotEncoder, StandardScaler
from sklearn.compose import ColumnTransformer
from sklearn.model_selection import train_test_split

In [6]:
df = pd.read_csv('../data/skewed_data.csv')
df.head()

Unnamed: 0,Brand,Battery_capacity(mAh),Screen_size(inches),Processor,Operating system,Resolution_height(px),Price,Internal_storage(GB),Resolution_width(px),Rear_Camera(MP),Front_Camera(MP),RAM(GB),Number of SIMs
0,oneplus,4085,7,8,android,3120.0,10.985276,5.549076,7.273093,3.89182,2.833213,2.564949,16
1,realme,4000,7,8,android,2400.0,10.23996,4.174387,6.985642,4.174387,2.833213,1.94591,16
2,apple,3969,7,6,ios,2688.0,11.579658,4.174387,7.125283,2.564949,2.564949,1.609438,16
3,apple,3110,6,6,ios,1792.0,11.049317,4.174387,6.72022,2.564949,2.564949,1.609438,16
4,lg,4000,6,8,android,2340.0,10.819598,4.859812,6.985642,2.564949,3.496508,1.94591,1


In [7]:
df.isnull().sum()

Brand                    0
Battery_capacity(mAh)    0
Screen_size(inches)      0
Processor                0
Operating system         0
Resolution_height(px)    0
Price                    0
Internal_storage(GB)     0
Resolution_width(px)     0
Rear_Camera(MP)          0
Front_Camera(MP)         0
RAM(GB)                  0
Number of SIMs           0
dtype: int64

In [8]:
df.dtypes

Brand                     object
Battery_capacity(mAh)      int64
Screen_size(inches)        int64
Processor                  int64
Operating system          object
Resolution_height(px)    float64
Price                    float64
Internal_storage(GB)     float64
Resolution_width(px)     float64
Rear_Camera(MP)          float64
Front_Camera(MP)         float64
RAM(GB)                  float64
Number of SIMs             int64
dtype: object

In [9]:
X = df.drop('Price', axis=1)
y = df['Price']

In [10]:
num_features = X.select_dtypes(exclude='object').columns
categorical_features = ['Brand', 'Operating system']


In [11]:

preprocessor = ColumnTransformer([
    ('OneHotEncoder',OneHotEncoder(), categorical_features),
    ('MinMaxScaler', StandardScaler(), num_features)
])
X_preprocessed = preprocessor.fit_transform(X)
X_preprocessed.shape


(1818, 96)

In [12]:
X_train, X_test, y_train, y_test = train_test_split(X_preprocessed, y, test_size=0.2, random_state=42)
scores = []

## Linear regression

In [13]:
from sklearn.linear_model import LinearRegression
from sklearn.metrics import mean_squared_error, r2_score
from sklearn.model_selection import cross_val_score

linear_model = LinearRegression()
linear_model.fit(X_train, y_train)
y_pred_train = linear_model.predict(X_train)
y_pred_test = linear_model.predict(X_test)

mse = mean_squared_error(y_test, y_pred_test)
r2_train = r2_score(y_train, y_pred_train)
r2_test = r2_score(y_test, y_pred_test)
cv = cross_val_score(linear_model, X_train, y_train, cv=5, scoring='r2')
print('MSE: ', mse)
print('r2_score_train: ', r2_train)
print('r2_score_test: ', r2_test)
print('Cross Validation Score: ', cv.mean())

scores.append({'Linear Regression': {'R2_Score_train': r2_train,'R2_Score_test':r2_test, 'MSE': mse, 'Cross Validation Score': cv.mean()}})



MSE:  0.13760109268088672
r2_score_train:  0.7934369223510199
r2_score_test:  0.7307507471987069
Cross Validation Score:  0.7550023514811268


## Lasso Regression

In [14]:
from sklearn.linear_model import Lasso
from sklearn.model_selection import GridSearchCV


estimator = Lasso()
paramgrid = {'alpha': list(range(1,100))}
grid_search_lasso = GridSearchCV(estimator, paramgrid, cv=5, scoring='r2')
grid_search_lasso.fit(X_train, y_train)
lasso_model = grid_search_lasso.best_estimator_

y_pred_train = lasso_model.predict(X_train)
y_pred_test = lasso_model.predict(X_test)
mse = mean_squared_error(y_test, y_pred_test)
r2_train = r2_score(y_train, y_pred_train)
r2_test = r2_score(y_test, y_pred_test)
cv = cross_val_score(lasso_model, X_train, y_train, cv=5, scoring='r2')
print('Best params for Lasso Regression: ', grid_search_lasso.best_params_)
print('MSE: ', mse)
print('r2_score_train: ', r2_train)
print('r2_score_test: ', r2_test)
print('Cross Validation Score: ', cv.mean())

scores.append({'Lasso Regression': {'R2_Score_train': r2_train,'R2_Score_test':r2_test, 'MSE': mse, 'Cross Validation Score': cv.mean()}})


Best params for Lasso Regression:  {'alpha': 1}
MSE:  0.512034752293934
r2_score_train:  0.0
r2_score_test:  -0.0019177302840329702
Cross Validation Score:  -0.0036617932830861123


  _data = np.array(data, dtype=dtype, copy=copy,


## Ridge Regression

In [15]:
from sklearn.linear_model import Ridge

estimator = Ridge()
paramgrid = {'alpha': list(range(1,100))}
grid_search_ridge = GridSearchCV(estimator, paramgrid, cv=5, scoring='r2')
grid_search_ridge.fit(X_train, y_train)
ridge_model = grid_search_ridge.best_estimator_ 

y_pred_train = ridge_model.predict(X_train)
y_pred_test = ridge_model.predict(X_test)
mse = mean_squared_error(y_test, y_pred_test)
r2_train = r2_score(y_train, y_pred_train)
r2_test = r2_score(y_test, y_pred_test)
cv = cross_val_score(ridge_model, X_train, y_train, cv=5, scoring='r2')

print('Best params for Ridge Regression: ', grid_search_ridge.best_params_)
print('MSE: ', mse)
print('r2_score_train: ', r2_train)
print('r2_score_test: ', r2_test)
print('Cross Validation Score: ', cv.mean())

scores.append({'Ridge Regression': {'R2_Score_train': float(r2_train),'R2_Score_test':float(r2_test), 'MSE': float(mse), 'Cross Validation Score': float(cv.mean())}})


Best params for Ridge Regression:  {'alpha': 1}
MSE:  0.13799402976525327
r2_score_train:  0.7899212165626078
r2_score_test:  0.7299818723714628
Cross Validation Score:  0.7583180060151923


## KNeighbors Regressor

In [16]:
from sklearn.neighbors import KNeighborsRegressor

estimator = KNeighborsRegressor()
param_grid = {'n_neighbors': list(range(1,100))}
grid_search_knn = GridSearchCV(estimator, param_grid, cv=5, scoring='r2')
grid_search_knn.fit(X_train, y_train)
knn_model = grid_search_knn.best_estimator_

y_pred_train =knn_model.predict(X_train)
y_pred_test =knn_model.predict(X_test)
mse = mean_squared_error(y_test, y_pred_test)
r2_train = r2_score(y_train, y_pred_train)
r2_test = r2_score(y_test, y_pred_test)
cv = cross_val_score(knn_model, X_train, y_train, cv=5, scoring='r2')

print('Best params for KNN Regressor: ', grid_search_knn.best_params_)
print('MSE: ', mse)
print('r2_score_train: ', r2_train)
print('r2_score_test: ', r2_test)
print('Cross Validation Score: ', cv.mean())

scores.append({'KNeighbors Regressor': {'R2_Score_train': r2_train,'R2_Score_test':r2_test, 'MSE': mse, 'Cross Validation Score': cv.mean()}})


Best params for KNN Regressor:  {'n_neighbors': 5}
MSE:  0.1683849428477885
r2_score_train:  0.8073116251919987
r2_score_test:  0.6705148254171311
Cross Validation Score:  0.6881556976106398


## Decision Regression

In [17]:
from sklearn.tree import DecisionTreeRegressor

estimator = DecisionTreeRegressor(random_state=42)
paramgrid = {'max_depth': list(range(1,10)), 'criterion': ['squared_error', 'absolute_error'] }
grid_search_decision = GridSearchCV(estimator, paramgrid, cv=5, scoring='r2')
grid_search_decision.fit(X_train, y_train)
decisiontree_model = grid_search_decision.best_estimator_

y_pred_train = decisiontree_model.predict(X_train)
y_pred_test = decisiontree_model.predict(X_test)
mse = mean_squared_error(y_test, y_pred_test)
r2_train = r2_score(y_train, y_pred_train)
r2_test = r2_score(y_test, y_pred_test)
cv = cross_val_score(decisiontree_model, X_train, y_train, cv=5, scoring='r2')

print('Best params for DecisionTree Regressor: ', grid_search_decision.best_params_)
print('MSE: ', mse)
print('r2_score_train: ', r2_train)
print('r2_score_test: ', r2_test)
print('Cross Validation Score: ', cv.mean())

scores.append({'DecisionTree Regression': {'R2_Score_train': r2_train,'R2_Score_test':r2_test, 'MSE': mse, 'Cross Validation Score': cv.mean()}})



Best params for DecisionTree Regressor:  {'criterion': 'absolute_error', 'max_depth': 9}
MSE:  0.16378385543345522
r2_score_train:  0.8311262023048422
r2_score_test:  0.6795179468622179
Cross Validation Score:  0.6484450331605658


## Random Forest Regressor

In [18]:
from sklearn.ensemble import RandomForestRegressor

estimator = RandomForestRegressor(random_state=42)
paramgrid = {'n_estimators': list(range(1,10)),'max_depth': list(range(1,10)), 'criterion': ['squared_error', 'absolute_error']}
grid_search_randomforest = GridSearchCV(estimator, paramgrid, cv=5, scoring='r2')
grid_search_randomforest.fit(X_train, y_train)
randomforest_model = grid_search_randomforest.best_estimator_

y_pred_train = randomforest_model.predict(X_train)
y_pred_test = randomforest_model.predict(X_test)
mse = mean_squared_error(y_test, y_pred_test)
r2_train = r2_score(y_train, y_pred_train)
r2_test = r2_score(y_test, y_pred_test)
cv = cross_val_score(randomforest_model, X_train, y_train, cv=5, scoring='r2')

print('Best params for RandomForest Regressor: ', grid_search_randomforest.best_params_)
print('MSE: ', mse)
print('r2_score_train: ', r2_train)
print('r2_score_test: ', r2_test)
print('Cross Validation Score: ', cv.mean())

scores.append({'RandomForest Regression': {'R2_Score_train': r2_train,'R2_Score_test':r2_test, 'MSE': mse, 'Cross Validation Score': cv.mean()}})



  _data = np.array(data, dtype=dtype, copy=copy,


Best params for RandomForest Regressor:  {'criterion': 'squared_error', 'max_depth': 9, 'n_estimators': 9}
MSE:  0.1426508715972986
r2_score_train:  0.8641889468454974
r2_score_test:  0.7208696541523835
Cross Validation Score:  0.7245944885716757


## AdaBoost Regressor

In [19]:
from sklearn.ensemble import AdaBoostRegressor

estimator = AdaBoostRegressor(random_state=42)
paramgrid = {'n_estimators': list(range(1,10)), 'learning_rate': [0.1, 0.5, 1.0]}
grid_search_adaboost = GridSearchCV(estimator, paramgrid, cv=5, scoring='r2')
grid_search_adaboost.fit(X_train, y_train)
adaboost_model = grid_search_adaboost.best_estimator_

y_pred_train = adaboost_model.predict(X_train)
y_pred_test = adaboost_model.predict(X_test)
mse = mean_squared_error(y_test, y_pred_test)
r2_train = r2_score(y_train, y_pred_train)
r2_test = r2_score(y_test, y_pred_test)
cv = cross_val_score(adaboost_model, X_train, y_train, cv=5, scoring='r2')

print('Best params for Adaboost Regressor: ', grid_search_adaboost.best_params_)
print('MSE: ', mse)
print('r2_score_train: ', r2_train)
print('r2_score_test: ', r2_test)
print('Cross Validation Score: ', cv.mean())

scores.append({'AdaBoost Regressor': {'R2_Score_train': r2_train,'R2_Score_test':r2_test, 'MSE': mse, 'Cross Validation Score': cv.mean()}})



Best params for Adaboost Regressor:  {'learning_rate': 1.0, 'n_estimators': 8}
MSE:  0.1984010599042337
r2_score_train:  0.6746838716245651
r2_score_test:  0.6117811559964477
Cross Validation Score:  0.6376132285579548


## Gradient Boosting Regressor

In [20]:
from sklearn.ensemble import GradientBoostingRegressor
from sklearn.model_selection import GridSearchCV
estimator = GradientBoostingRegressor(random_state=42)
paramgrid = {'n_estimators': [50, 100, 150, 200,250,300], 'learning_rate': [0.01, 0.05, 0.1, 0.2, 0.5,0.8,0.1], 'max_depth': list(range(1,10))}
grid_search_gradientboost = GridSearchCV(estimator, paramgrid, cv=5, scoring='r2')
grid_search_gradientboost.fit(X_train, y_train)
gradientboost_model = grid_search_gradientboost.best_estimator_

y_pred_train = gradientboost_model.predict(X_train)
y_pred_test = gradientboost_model.predict(X_test)
mse = mean_squared_error(y_test, y_pred_test)
r2_train = r2_score(y_train, y_pred_train)
r2_test = r2_score(y_test, y_pred_test)
cv = cross_val_score(gradientboost_model, X_train, y_train, cv=5, scoring='r2')

print('Best params for Gradientboost Regressor: ', grid_search_gradientboost.best_params_)
print('MSE: ', mse)
print('r2_score_train: ', r2_train)
print('r2_score_test: ', r2_test)
print('Cross Validation Score: ', cv.mean())

scores.append({'GradientBoost Regressor': {'R2_Score_train': r2_train,'R2_Score_test':r2_test, 'MSE': mse, 'Cross Validation Score': cv.mean()}})


  _data = np.array(data, dtype=dtype, copy=copy,


Best params for Gradientboost Regressor:  {'learning_rate': 0.2, 'max_depth': 2, 'n_estimators': 300}
MSE:  0.1192362776749657
r2_score_train:  0.8754774909744089
r2_score_test:  0.7666858740341136
Cross Validation Score:  0.7845426090454938


## Xgboost Regressor

In [21]:
from xgboost import XGBRegressor

estimator = XGBRegressor(random_state=42)
param_grid = {'n_estimators': [50, 100, 150, 200], 'learning_rate': [0.01, 0.05, 0.1, 0.2, 0.5,0.8,0.1], 'max_depth': list(range(1,10)), 'gamma': [0, 0.1, 0.5,1]}
grid_search_xgboost = GridSearchCV(estimator, param_grid, cv=5, scoring='r2')
grid_search_xgboost.fit(X_train, y_train)
xgboost_model = grid_search_xgboost.best_estimator_

y_pred_train = xgboost_model.predict(X_train)
y_pred_test = xgboost_model.predict(X_test)
mse = mean_squared_error(y_test, y_pred_test)
r2_train = r2_score(y_train, y_pred_train)
r2_test = r2_score(y_test, y_pred_test)
cv = cross_val_score(xgboost_model, X_train, y_train, cv=5, scoring='r2')

print('Best params for Xgboost Regressor: ', grid_search_xgboost.best_params_)
print('MSE: ', mse)
print('r2_score_train: ', r2_train)
print('r2_score_test: ', r2_test)
print('Cross Validation Score: ', cv.mean())

scores.append({'Xgboost Regressor': {'R2_Score_train': r2_train,'R2_Score_test':r2_test, 'MSE': mse, 'Cross Validation Score': cv.mean()}})





  _data = np.array(data, dtype=dtype, copy=copy,


Best params for Xgboost Regressor:  {'gamma': 0, 'learning_rate': 0.2, 'max_depth': 3, 'n_estimators': 200}
MSE:  0.12056296697839769
r2_score_train:  0.8916620036024168
r2_score_test:  0.7640898909885652
Cross Validation Score:  0.7813432030343577


In [22]:
scores

[{'Linear Regression': {'R2_Score_train': 0.7934369223510199,
   'R2_Score_test': 0.7307507471987069,
   'MSE': np.float64(0.13760109268088672),
   'Cross Validation Score': np.float64(0.7550023514811268)}},
 {'Lasso Regression': {'R2_Score_train': 0.0,
   'R2_Score_test': -0.0019177302840329702,
   'MSE': np.float64(0.512034752293934),
   'Cross Validation Score': np.float64(-0.0036617932830861123)}},
 {'Ridge Regression': {'R2_Score_train': 0.7899212165626078,
   'R2_Score_test': 0.7299818723714628,
   'MSE': 0.13799402976525327,
   'Cross Validation Score': 0.7583180060151923}},
 {'KNeighbors Regressor': {'R2_Score_train': 0.8073116251919987,
   'R2_Score_test': 0.6705148254171311,
   'MSE': np.float64(0.1683849428477885),
   'Cross Validation Score': np.float64(0.6881556976106398)}},
 {'DecisionTree Regression': {'R2_Score_train': 0.8311262023048422,
   'R2_Score_test': 0.6795179468622179,
   'MSE': np.float64(0.16378385543345522),
   'Cross Validation Score': np.float64(0.64844503

In [23]:
model_name = []
r2_train = []
r2_test = []
mse = []
cv = []
#pd.reset_option('display.float_format')
for model in scores:
    for name, metrics in model.items():
        model_name.append(name)
        r2_train.append(float(metrics['R2_Score_train']))
        r2_test .append(float(metrics['R2_Score_test']))
        mse.append(float(metrics['MSE']))
        cv.append(float(metrics['Cross Validation Score']))

scores_df = pd.DataFrame({
    'Model': model_name,
    'R2_Train': r2_train,
    'R2_Test': r2_test,
    'MSE': mse,
    'Cross Validation Score': cv
})

scores_df

Unnamed: 0,Model,R2_Train,R2_Test,MSE,Cross Validation Score
0,Linear Regression,0.793437,0.730751,0.137601,0.755002
1,Lasso Regression,0.0,-0.001918,0.512035,-0.003662
2,Ridge Regression,0.789921,0.729982,0.137994,0.758318
3,KNeighbors Regressor,0.807312,0.670515,0.168385,0.688156
4,DecisionTree Regression,0.831126,0.679518,0.163784,0.648445
5,RandomForest Regression,0.864189,0.72087,0.142651,0.724594
6,AdaBoost Regressor,0.674684,0.611781,0.198401,0.637613
7,GradientBoost Regressor,0.875477,0.766686,0.119236,0.784543
8,Xgboost Regressor,0.891662,0.76409,0.120563,0.781343


##### The best model should be XGBoost