## Model_training

In [5]:
import numpy as np
import pandas as pd
from sklearn.preprocessing import OneHotEncoder, StandardScaler
from sklearn.compose import ColumnTransformer
from sklearn.model_selection import train_test_split

In [6]:
df = pd.read_csv('../data/skewed_data.csv')
df.head()

Unnamed: 0,Brand,Battery_capacity(mAh),Screen_size(inches),Touchscreen,Processor,Operating system,Wi-Fi,Bluetooth,GPS,3G,4G/ LTE,Resolution_width(px),Resolution_height(px),Price,Internal_storage(GB),Rear_Camera(MP),Front_Camera(MP),RAM(GB),Number of SIMs
0,OnePlus,4085,7,Yes,8,Android,Yes,Yes,Yes,Yes,Yes,1440,3120,10.985276,5.549076,4.197814,3.696808,3.157549,16
1,Realme,4000,7,Yes,8,Android,Yes,Yes,Yes,Yes,Yes,1080,2400,10.23996,4.174387,4.527737,3.696808,2.275481,16
2,Apple,3969,7,Yes,6,iOS,Yes,Yes,Yes,Yes,Yes,1242,2688,11.579658,4.174387,2.695576,3.260498,1.830773,16
3,Apple,3110,6,Yes,6,iOS,Yes,Yes,Yes,Yes,Yes,828,1792,11.049317,4.174387,2.695576,3.260498,1.830773,16
4,LG,4000,6,Yes,8,Android,Yes,Yes,Yes,No,No,1080,2340,10.819598,4.859812,2.695576,4.870624,2.275481,1


In [7]:
df.dtypes

Brand                     object
Battery_capacity(mAh)      int64
Screen_size(inches)        int64
Touchscreen               object
Processor                  int64
Operating system          object
Wi-Fi                     object
Bluetooth                 object
GPS                       object
3G                        object
4G/ LTE                   object
Resolution_width(px)       int64
Resolution_height(px)      int64
Price                    float64
Internal_storage(GB)     float64
Rear_Camera(MP)          float64
Front_Camera(MP)         float64
RAM(GB)                  float64
Number of SIMs             int64
dtype: object

In [8]:
X = df.drop('Price', axis=1)
y = df['Price']

In [9]:
num_features = X.select_dtypes(exclude='object').columns
categorical_features = ['Brand', 'Touchscreen', 'Operating system', 'Wi-Fi',
       'Bluetooth', 'GPS', '3G', '4G/ LTE']


In [25]:

preprocessor = ColumnTransformer([
    ('OneHotEncoder',OneHotEncoder(), categorical_features),
    ('MinMaxScaler', StandardScaler(), num_features)
])
X_preprocessed = preprocessor.fit_transform(X)
X_preprocessed.shape


(1344, 104)

In [11]:
X_train, X_test, y_train, y_test = train_test_split(X_preprocessed, y, test_size=0.2, random_state=42)
scores = []

## Linear regression

In [12]:
from sklearn.linear_model import LinearRegression
from sklearn.metrics import mean_squared_error, r2_score
from sklearn.model_selection import cross_val_score

linear_model = LinearRegression()
linear_model.fit(X_train, y_train)
y_pred_train = linear_model.predict(X_train)
y_pred_test = linear_model.predict(X_test)

mse = mean_squared_error(y_test, y_pred_test)
r2_train = r2_score(y_train, y_pred_train)
r2_test = r2_score(y_test, y_pred_test)
cv = cross_val_score(linear_model, X_train, y_train, cv=5, scoring='r2')
print('MSE: ', mse)
print('r2_score_train: ', r2_train)
print('r2_score_test: ', r2_test)
print('Cross Validation Score: ', cv.mean())

scores.append({'Linear Regression': {'R2_Score_train': r2_train,'R2_Score_test':r2_test, 'MSE': mse, 'Cross Validation Score': cv.mean()}})



MSE:  0.16228356570221825
r2_score_train:  0.763850825536035
r2_score_test:  0.7294760616417002
Cross Validation Score:  0.7103994251760269


## Lasso Regression

In [13]:
from sklearn.linear_model import Lasso
from sklearn.model_selection import GridSearchCV


estimator = Lasso()
paramgrid = {'alpha': list(range(1,100))}
grid_search_lasso = GridSearchCV(estimator, paramgrid, cv=5, scoring='r2')
grid_search_lasso.fit(X_train, y_train)
lasso_model = grid_search_lasso.best_estimator_

y_pred_train = lasso_model.predict(X_train)
y_pred_test = lasso_model.predict(X_test)
mse = mean_squared_error(y_test, y_pred_test)
r2_train = r2_score(y_train, y_pred_train)
r2_test = r2_score(y_test, y_pred_test)
cv = cross_val_score(lasso_model, X_train, y_train, cv=5, scoring='r2')
print('Best params for Lasso Regression: ', grid_search_lasso.best_params_)
print('MSE: ', mse)
print('r2_score_train: ', r2_train)
print('r2_score_test: ', r2_test)
print('Cross Validation Score: ', cv.mean())

scores.append({'Lasso Regression': {'R2_Score_train': r2_train,'R2_Score_test':r2_test, 'MSE': mse, 'Cross Validation Score': cv.mean()}})


Best params for Lasso Regression:  {'alpha': 1}
MSE:  0.5998863142004618
r2_score_train:  0.0
r2_score_test:  -2.6239585593224035e-07
Cross Validation Score:  -0.00954063546237265


  _data = np.array(data, dtype=dtype, copy=copy,


## Ridge Regression

In [14]:
from sklearn.linear_model import Ridge

estimator = Ridge()
paramgrid = {'alpha': list(range(1,100))}
grid_search_ridge = GridSearchCV(estimator, paramgrid, cv=5, scoring='r2')
grid_search_ridge.fit(X_train, y_train)
ridge_model = grid_search_ridge.best_estimator_ 

y_pred_train = ridge_model.predict(X_train)
y_pred_test = ridge_model.predict(X_test)
mse = mean_squared_error(y_test, y_pred_test)
r2_train = r2_score(y_train, y_pred_train)
r2_test = r2_score(y_test, y_pred_test)
cv = cross_val_score(ridge_model, X_train, y_train, cv=5, scoring='r2')

print('Best params for Ridge Regression: ', grid_search_ridge.best_params_)
print('MSE: ', mse)
print('r2_score_train: ', r2_train)
print('r2_score_test: ', r2_test)
print('Cross Validation Score: ', cv.mean())

scores.append({'Ridge Regression': {'R2_Score_train': float(r2_train),'R2_Score_test':float(r2_test), 'MSE': float(mse), 'Cross Validation Score': float(cv.mean())}})


Best params for Ridge Regression:  {'alpha': 1}
MSE:  0.15992067945079733
r2_score_train:  0.7592119791234878
r2_score_test:  0.7334149527540633
Cross Validation Score:  0.7111423568391279


## KNeighbors Regressor

In [15]:
from sklearn.neighbors import KNeighborsRegressor

estimator = KNeighborsRegressor()
param_grid = {'n_neighbors': list(range(1,100))}
grid_search_knn = GridSearchCV(estimator, param_grid, cv=5, scoring='r2')
grid_search_knn.fit(X_train, y_train)
knn_model = grid_search_knn.best_estimator_

y_pred_train =knn_model.predict(X_train)
y_pred_test =knn_model.predict(X_test)
mse = mean_squared_error(y_test, y_pred_test)
r2_train = r2_score(y_train, y_pred_train)
r2_test = r2_score(y_test, y_pred_test)
cv = cross_val_score(knn_model, X_train, y_train, cv=5, scoring='r2')

print('Best params for KNN Regressor: ', grid_search_knn.best_params_)
print('MSE: ', mse)
print('r2_score_train: ', r2_train)
print('r2_score_test: ', r2_test)
print('Cross Validation Score: ', cv.mean())

scores.append({'KNeighbors Regressor': {'R2_Score_train': r2_train,'R2_Score_test':r2_test, 'MSE': mse, 'Cross Validation Score': cv.mean()}})


Best params for KNN Regressor:  {'n_neighbors': 11}
MSE:  0.18643560619159877
r2_score_train:  0.6843010122599781
r2_score_test:  0.6892150217495565
Cross Validation Score:  0.6226344786965037


## Decision Regression

In [16]:
from sklearn.tree import DecisionTreeRegressor

estimator = DecisionTreeRegressor(random_state=42)
paramgrid = {'max_depth': list(range(1,10)), 'criterion': ['squared_error', 'absolute_error'] }
grid_search_decision = GridSearchCV(estimator, paramgrid, cv=5, scoring='r2')
grid_search_decision.fit(X_train, y_train)
decisiontree_model = grid_search_decision.best_estimator_

y_pred_train = decisiontree_model.predict(X_train)
y_pred_test = decisiontree_model.predict(X_test)
mse = mean_squared_error(y_test, y_pred_test)
r2_train = r2_score(y_train, y_pred_train)
r2_test = r2_score(y_test, y_pred_test)
cv = cross_val_score(decisiontree_model, X_train, y_train, cv=5, scoring='r2')

print('Best params for DecisionTree Regressor: ', grid_search_decision.best_params_)
print('MSE: ', mse)
print('r2_score_train: ', r2_train)
print('r2_score_test: ', r2_test)
print('Cross Validation Score: ', cv.mean())

scores.append({'DecisionTree Regression': {'R2_Score_train': r2_train,'R2_Score_test':r2_test, 'MSE': mse, 'Cross Validation Score': cv.mean()}})



Best params for DecisionTree Regressor:  {'criterion': 'squared_error', 'max_depth': 7}
MSE:  0.2254209650935865
r2_score_train:  0.7853680180359982
r2_score_test:  0.6242270928558216
Cross Validation Score:  0.579222384510659


## Random Forest Regressor

In [17]:
from sklearn.ensemble import RandomForestRegressor

estimator = RandomForestRegressor(random_state=42)
paramgrid = {'n_estimators': list(range(1,10)),'max_depth': list(range(1,10)), 'criterion': ['squared_error', 'absolute_error']}
grid_search_randomforest = GridSearchCV(estimator, paramgrid, cv=5, scoring='r2')
grid_search_randomforest.fit(X_train, y_train)
randomforest_model = grid_search_randomforest.best_estimator_

y_pred_train = randomforest_model.predict(X_train)
y_pred_test = randomforest_model.predict(X_test)
mse = mean_squared_error(y_test, y_pred_test)
r2_train = r2_score(y_train, y_pred_train)
r2_test = r2_score(y_test, y_pred_test)
cv = cross_val_score(randomforest_model, X_train, y_train, cv=5, scoring='r2')

print('Best params for RandomForest Regressor: ', grid_search_randomforest.best_params_)
print('MSE: ', mse)
print('r2_score_train: ', r2_train)
print('r2_score_test: ', r2_test)
print('Cross Validation Score: ', cv.mean())

scores.append({'RandomForest Regression': {'R2_Score_train': r2_train,'R2_Score_test':r2_test, 'MSE': mse, 'Cross Validation Score': cv.mean()}})



Best params for RandomForest Regressor:  {'criterion': 'squared_error', 'max_depth': 9, 'n_estimators': 9}
MSE:  0.17539672242091836
r2_score_train:  0.8402851878334265
r2_score_test:  0.7076166528685304
Cross Validation Score:  0.6573920687624544


## AdaBoost Regressor

In [18]:
from sklearn.ensemble import AdaBoostRegressor

estimator = AdaBoostRegressor(random_state=42)
paramgrid = {'n_estimators': list(range(1,10)), 'learning_rate': [0.1, 0.5, 1.0]}
grid_search_adaboost = GridSearchCV(estimator, paramgrid, cv=5, scoring='r2')
grid_search_adaboost.fit(X_train, y_train)
adaboost_model = grid_search_adaboost.best_estimator_

y_pred_train = adaboost_model.predict(X_train)
y_pred_test = adaboost_model.predict(X_test)
mse = mean_squared_error(y_test, y_pred_test)
r2_train = r2_score(y_train, y_pred_train)
r2_test = r2_score(y_test, y_pred_test)
cv = cross_val_score(adaboost_model, X_train, y_train, cv=5, scoring='r2')

print('Best params for Adaboost Regressor: ', grid_search_adaboost.best_params_)
print('MSE: ', mse)
print('r2_score_train: ', r2_train)
print('r2_score_test: ', r2_test)
print('Cross Validation Score: ', cv.mean())

scores.append({'AdaBoost Regressor': {'R2_Score_train': r2_train,'R2_Score_test':r2_test, 'MSE': mse, 'Cross Validation Score': cv.mean()}})



Best params for Adaboost Regressor:  {'learning_rate': 0.5, 'n_estimators': 9}
MSE:  0.24269603649137636
r2_score_train:  0.6135092681022793
r2_score_test:  0.5954298432407482
Cross Validation Score:  0.583535588957391


## Gradient Boosting Regressor

In [19]:
from sklearn.ensemble import GradientBoostingRegressor

estimator = GradientBoostingRegressor(random_state=42)
paramgrid = {'n_estimators': list(range(1,10)), 'learning_rate': [0.1, 0.5, 1.0]}
grid_search_gradientboost = GridSearchCV(estimator, paramgrid, cv=5, scoring='r2')
grid_search_gradientboost.fit(X_train, y_train)
gradientboost_model = grid_search_gradientboost.best_estimator_

y_pred_train = gradientboost_model.predict(X_train)
y_pred_test = gradientboost_model.predict(X_test)
mse = mean_squared_error(y_test, y_pred_test)
r2_train = r2_score(y_train, y_pred_train)
r2_test = r2_score(y_test, y_pred_test)
cv = cross_val_score(gradientboost_model, X_train, y_train, cv=5, scoring='r2')

print('Best params for Gradientboost Regressor: ', grid_search_gradientboost.best_params_)
print('MSE: ', mse)
print('r2_score_train: ', r2_train)
print('r2_score_test: ', r2_test)
print('Cross Validation Score: ', cv.mean())

scores.append({'GradientBoost Regressor': {'R2_Score_train': r2_train,'R2_Score_test':r2_test, 'MSE': mse, 'Cross Validation Score': cv.mean()}})


Best params for Gradientboost Regressor:  {'learning_rate': 0.5, 'n_estimators': 9}
MSE:  0.1758847642171237
r2_score_train:  0.7506962267098456
r2_score_test:  0.706803095511557
Cross Validation Score:  0.683105591915805


## Xgboost Regressor

In [20]:
from xgboost import XGBRegressor

estimator = XGBRegressor(random_state=42)
param_grid = {'n_estimators': list(range(1,10)), 'learning_rate': [0.1, 0.5, 1.0], 'gamma': [0, 0.1, 0.5,1]}
grid_search_xgboost = GridSearchCV(estimator, param_grid, cv=5, scoring='r2')
grid_search_xgboost.fit(X_train, y_train)
xgboost_model = grid_search_xgboost.best_estimator_

y_pred_train = xgboost_model.predict(X_train)
y_pred_test = xgboost_model.predict(X_test)
mse = mean_squared_error(y_test, y_pred_test)
r2_train = r2_score(y_train, y_pred_train)
r2_test = r2_score(y_test, y_pred_test)
cv = cross_val_score(xgboost_model, X_train, y_train, cv=5, scoring='r2')

print('Best params for Xgboost Regressor: ', grid_search_xgboost.best_params_)
print('MSE: ', mse)
print('r2_score_train: ', r2_train)
print('r2_score_test: ', r2_test)
print('Cross Validation Score: ', cv.mean())

scores.append({'Xgboost Regressor': {'R2_Score_train': r2_train,'R2_Score_test':r2_test, 'MSE': mse, 'Cross Validation Score': cv.mean()}})





Best params for Xgboost Regressor:  {'gamma': 0.1, 'learning_rate': 0.5, 'n_estimators': 9}
MSE:  0.15123584899353595
r2_score_train:  0.8546361225454101
r2_score_test:  0.7478924171177906
Cross Validation Score:  0.6825610465211035


In [21]:
scores

[{'Linear Regression': {'R2_Score_train': 0.763850825536035,
   'R2_Score_test': 0.7294760616417002,
   'MSE': np.float64(0.16228356570221825),
   'Cross Validation Score': np.float64(0.7103994251760269)}},
 {'Lasso Regression': {'R2_Score_train': 0.0,
   'R2_Score_test': -2.6239585593224035e-07,
   'MSE': np.float64(0.5998863142004618),
   'Cross Validation Score': np.float64(-0.00954063546237265)}},
 {'Ridge Regression': {'R2_Score_train': 0.7592119791234878,
   'R2_Score_test': 0.7334149527540633,
   'MSE': 0.15992067945079733,
   'Cross Validation Score': 0.7111423568391279}},
 {'KNeighbors Regressor': {'R2_Score_train': 0.6843010122599781,
   'R2_Score_test': 0.6892150217495565,
   'MSE': np.float64(0.18643560619159877),
   'Cross Validation Score': np.float64(0.6226344786965037)}},
 {'DecisionTree Regression': {'R2_Score_train': 0.7853680180359982,
   'R2_Score_test': 0.6242270928558216,
   'MSE': np.float64(0.2254209650935865),
   'Cross Validation Score': np.float64(0.579222384

In [22]:
model_name = []
r2_train = []
r2_test = []
mse = []
cv = []
#pd.reset_option('display.float_format')
for model in scores:
    for name, metrics in model.items():
        model_name.append(name)
        r2_train.append(float(metrics['R2_Score_train']))
        r2_test .append(float(metrics['R2_Score_test']))
        mse.append(float(metrics['MSE']))
        cv.append(float(metrics['Cross Validation Score']))

scores_df = pd.DataFrame({
    'Model': model_name,
    'R2_Train': r2_train,
    'R2_Test': r2_test,
    'MSE': mse,
    'Cross Validation Score': cv
})

scores_df

Unnamed: 0,Model,R2_Train,R2_Test,MSE,Cross Validation Score
0,Linear Regression,0.763851,0.7294761,0.162284,0.710399
1,Lasso Regression,0.0,-2.623959e-07,0.599886,-0.009541
2,Ridge Regression,0.759212,0.733415,0.159921,0.711142
3,KNeighbors Regressor,0.684301,0.689215,0.186436,0.622634
4,DecisionTree Regression,0.785368,0.6242271,0.225421,0.579222
5,RandomForest Regression,0.840285,0.7076167,0.175397,0.657392
6,AdaBoost Regressor,0.613509,0.5954298,0.242696,0.583536
7,GradientBoost Regressor,0.750696,0.7068031,0.175885,0.683106
8,Xgboost Regressor,0.854636,0.7478924,0.151236,0.682561


##### The best model should be Xgboost Regressor

In [10]:
import pandas as pd
df = pd.DataFrame(columns=['One', 'Two'])
df['One'] = 2
df['Two'] = 3
df

Unnamed: 0,One,Two
