## Model_training

In [1]:
import numpy as np
import pandas as pd
from sklearn.preprocessing import MinMaxScaler, OneHotEncoder, StandardScaler
from sklearn.compose import ColumnTransformer
from sklearn.model_selection import train_test_split
from sklearn.feature_extraction.text import TfidfVectorizer

In [2]:
df = pd.read_csv('../data/skewed_data.csv')
df.head()

Unnamed: 0,Name,Brand,Battery_capacity(mAh),Screen_size(inches),Touchscreen,Processor,Operating system,Wi-Fi,Bluetooth,GPS,3G,4G/ LTE,Resolution_width(px),Resolution_height(px),Price,Internal_storage(GB),Rear_Camera(MP),Front_Camera(MP),RAM(GB),Number of SIMs
0,OnePlus 7T Pro McLaren Edition,OnePlus,4085,7,Yes,8,Android,Yes,Yes,Yes,Yes,Yes,1440,3120,10.985276,5.549076,4.197814,3.696808,3.157549,16
1,Realme X2 Pro,Realme,4000,7,Yes,8,Android,Yes,Yes,Yes,Yes,Yes,1080,2400,10.23996,4.174387,4.527737,3.696808,2.275481,16
2,iPhone 11 Pro Max,Apple,3969,7,Yes,6,iOS,Yes,Yes,Yes,Yes,Yes,1242,2688,11.579658,4.174387,2.695576,3.260498,1.830773,16
3,iPhone 11,Apple,3110,6,Yes,6,iOS,Yes,Yes,Yes,Yes,Yes,828,1792,11.049317,4.174387,2.695576,3.260498,1.830773,16
4,LG G8X ThinQ,LG,4000,6,Yes,8,Android,Yes,Yes,Yes,No,No,1080,2340,10.819598,4.859812,2.695576,4.870624,2.275481,1


In [3]:
tfidf = TfidfVectorizer(max_features=100)
tfidf_matrix = tfidf.fit_transform(df['Name']).toarray()
tfidf_df = pd.DataFrame(tfidf_matrix, columns=tfidf.get_feature_names_out())
tfidf_df


Unnamed: 0,10,2018,4g,a1,alcatel,andi,apple,aqua,asus,bharat,...,xiaomi,xolo,xperia,yu,z1,zen,zenfone,ziox,zopo,zte
0,0.0,0.0,0.0,0.0,0.0,0.000000,0.0,0.000000,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
1,0.0,0.0,0.0,0.0,0.0,0.000000,0.0,0.000000,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
2,0.0,0.0,0.0,0.0,0.0,0.000000,0.0,0.000000,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
3,0.0,0.0,0.0,0.0,0.0,0.000000,0.0,0.000000,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
4,0.0,0.0,0.0,0.0,0.0,0.000000,0.0,0.000000,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
1339,0.0,0.0,0.0,0.0,0.0,0.000000,0.0,0.735657,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
1340,0.0,0.0,0.0,0.0,0.0,0.000000,0.0,0.000000,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
1341,0.0,0.0,0.0,0.0,0.0,0.000000,0.0,0.735657,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
1342,0.0,0.0,0.0,0.0,0.0,0.000000,0.0,0.000000,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0


In [4]:
df = pd.concat([df, tfidf_df], axis=1)
df.drop('Name', axis=1, inplace=True)
df.head()

Unnamed: 0,Brand,Battery_capacity(mAh),Screen_size(inches),Touchscreen,Processor,Operating system,Wi-Fi,Bluetooth,GPS,3G,...,xiaomi,xolo,xperia,yu,z1,zen,zenfone,ziox,zopo,zte
0,OnePlus,4085,7,Yes,8,Android,Yes,Yes,Yes,Yes,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
1,Realme,4000,7,Yes,8,Android,Yes,Yes,Yes,Yes,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
2,Apple,3969,7,Yes,6,iOS,Yes,Yes,Yes,Yes,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
3,Apple,3110,6,Yes,6,iOS,Yes,Yes,Yes,Yes,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
4,LG,4000,6,Yes,8,Android,Yes,Yes,Yes,No,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0


In [5]:
X = df.drop('Price', axis=1)
y = df['Price']

In [6]:
num_features = X.select_dtypes(exclude='object').columns
categorical_features = ['Brand']


In [7]:

preprocessor = ColumnTransformer([
    ('OneHotEncoder',OneHotEncoder(), categorical_features),
    ('MinMaxScaler', StandardScaler(), num_features)
])
X_preprocessed = preprocessor.fit_transform(X)
X_preprocessed.shape

(1344, 185)

In [8]:
X_train, X_test, y_train, y_test = train_test_split(X_preprocessed, y, test_size=0.2, random_state=42)
scores = []

## Linear regression

In [35]:
from sklearn.linear_model import LinearRegression
from sklearn.metrics import mean_squared_error, r2_score
from sklearn.model_selection import cross_val_score

linear_model = LinearRegression()
linear_model.fit(X_train, y_train)
y_pred_train = linear_model.predict(X_train)
y_pred_test = linear_model.predict(X_test)

mse = mean_squared_error(y_test, y_pred_test)
r2_train = r2_score(y_train, y_pred_train)
r2_test = r2_score(y_test, y_pred_test)
cv = cross_val_score(linear_model, X_train, y_train, cv=5, scoring='r2')
print('MSE: ', mse)
print('r2_score_train: ', r2_train)
print('r2_score_test: ', r2_test)
print('Cross Validation Score: ', cv.mean())

scores.append({'Linear Regression': {'R2_Score_train': float(r2_train),'R2_Score_test':float(r2_test), 'MSE': float(mse), 'Cross Validation Score': float(cv.mean())}})

MSE:  2.6782991499627597e+20
r2_score_train:  0.7886913752329004
r2_score_test:  -4.464679038905962e+20
Cross Validation Score:  -5.327585032671364e+22


## Lasso Regression

In [11]:
from sklearn.linear_model import Lasso
from sklearn.model_selection import GridSearchCV


estimator = Lasso()
paramgrid = {'alpha': list(range(1,100))}
grid_search_lasso = GridSearchCV(estimator, paramgrid, cv=5, scoring='r2')
grid_search_lasso.fit(X_train, y_train)
lasso_model = grid_search_lasso.best_estimator_

y_pred_train = lasso_model.predict(X_train)
y_pred_test = lasso_model.predict(X_test)
mse = mean_squared_error(y_test, y_pred_test)
r2_train = r2_score(y_train, y_pred_train)
r2_test = r2_score(y_test, y_pred_test)
cv = cross_val_score(lasso_model, X_train, y_train, cv=5, scoring='r2')
print('Best params for Lasso Regression: ', grid_search_lasso.best_params_)
print('MSE: ', mse)
print('r2_score_train: ', r2_train)
print('r2_score_test: ', r2_test)
print('Cross Validation Score: ', cv.mean())

scores.append({'Lasso Regression': {'R2_Score_train': r2_train,'R2_Score_test':r2_test, 'MSE': mse, 'Cross Validation Score': cv.mean()}})


Best params for Lasso Regression:  {'alpha': 1}
MSE:  0.5998863142004618
r2_score_train:  0.0
r2_score_test:  -2.6239585593224035e-07
Cross Validation Score:  -0.00954063546237265


  _data = np.array(data, dtype=dtype, copy=copy,


## Ridge Regression

In [38]:
from sklearn.linear_model import Ridge

estimator = Ridge()
paramgrid = {'alpha': list(range(1,100))}
grid_search_ridge = GridSearchCV(estimator, paramgrid, cv=5, scoring='r2')
grid_search_ridge.fit(X_train, y_train)
ridge_model = grid_search_ridge.best_estimator_ 

y_pred_train = ridge_model.predict(X_train)
y_pred_test = ridge_model.predict(X_test)
mse = mean_squared_error(y_test, y_pred_test)
r2_train = r2_score(y_train, y_pred_train)
r2_test = r2_score(y_test, y_pred_test)
cv = cross_val_score(ridge_model, X_train, y_train, cv=5, scoring='r2')

print('Best params for Ridge Regression: ', grid_search_ridge.best_params_)
print('MSE: ', mse)
print('r2_score_train: ', r2_train)
print('r2_score_test: ', r2_test)
print('Cross Validation Score: ', cv.mean())

scores.append({'Ridge Regression': {'R2_Score_train': float(r2_train),'R2_Score_test':float(r2_test), 'MSE': float(mse), 'Cross Validation Score': float(cv.mean())}})


Best params for Ridge Regression:  {'alpha': 1}
MSE:  0.16593534695545742
r2_score_train:  0.7784491078815353
r2_score_test:  0.7233886045268991
Cross Validation Score:  0.691757942209234


## KNeighbors Regressor

In [14]:
from sklearn.neighbors import KNeighborsRegressor

estimator = KNeighborsRegressor()
param_grid = {'n_neighbors': list(range(1,100))}
grid_search_knn = GridSearchCV(estimator, param_grid, cv=5, scoring='r2')
grid_search_knn.fit(X_train, y_train)
knn_model = grid_search_knn.best_estimator_

y_pred_train =knn_model.predict(X_train)
y_pred_test =knn_model.predict(X_test)
mse = mean_squared_error(y_test, y_pred_test)
r2_train = r2_score(y_train, y_pred_train)
r2_test = r2_score(y_test, y_pred_test)
cv = cross_val_score(knn_model, X_train, y_train, cv=5, scoring='r2')

print('Best params for KNN Regressor: ', grid_search_knn.best_params_)
print('MSE: ', mse)
print('r2_score_train: ', r2_train)
print('r2_score_test: ', r2_test)
print('Cross Validation Score: ', cv.mean())

scores.append({'KNeighbors Regressor': {'R2_Score_train': r2_train,'R2_Score_test':r2_test, 'MSE': mse, 'Cross Validation Score': cv.mean()}})


Best params for KNN Regressor:  {'n_neighbors': 6}
MSE:  0.226644681654297
r2_score_train:  0.6907852655575827
r2_score_test:  0.6221871782039268
Cross Validation Score:  0.5536882555117693


## Decision Regression

In [15]:
from sklearn.tree import DecisionTreeRegressor

estimator = DecisionTreeRegressor(random_state=42)
paramgrid = {'max_depth': list(range(1,10)), 'criterion': ['squared_error', 'absolute_error'] }
grid_search_decision = GridSearchCV(estimator, paramgrid, cv=5, scoring='r2')
grid_search_decision.fit(X_train, y_train)
decisiontree_model = grid_search_decision.best_estimator_

y_pred_train = decisiontree_model.predict(X_train)
y_pred_test = decisiontree_model.predict(X_test)
mse = mean_squared_error(y_test, y_pred_test)
r2_train = r2_score(y_train, y_pred_train)
r2_test = r2_score(y_test, y_pred_test)
cv = cross_val_score(decisiontree_model, X_train, y_train, cv=5, scoring='r2')

print('Best params for DecisionTree Regressor: ', grid_search_decision.best_params_)
print('MSE: ', mse)
print('r2_score_train: ', r2_train)
print('r2_score_test: ', r2_test)
print('Cross Validation Score: ', cv.mean())

scores.append({'DecisionTree Regression': {'R2_Score_train': r2_train,'R2_Score_test':r2_test, 'MSE': mse, 'Cross Validation Score': cv.mean()}})



Best params for DecisionTree Regressor:  {'criterion': 'squared_error', 'max_depth': 8}
MSE:  0.2077491498325675
r2_score_train:  0.8243461535026796
r2_score_test:  0.6536857077295137
Cross Validation Score:  0.5732130776941906


## Random Forest Regressor

In [16]:
from sklearn.ensemble import RandomForestRegressor

estimator = RandomForestRegressor(random_state=42)
paramgrid = {'n_estimators': list(range(1,10)),'max_depth': list(range(1,10)), 'criterion': ['squared_error', 'absolute_error']}
grid_search_randomforest = GridSearchCV(estimator, paramgrid, cv=5, scoring='r2')
grid_search_randomforest.fit(X_train, y_train)
randomforest_model = grid_search_randomforest.best_estimator_

y_pred_train = randomforest_model.predict(X_train)
y_pred_test = randomforest_model.predict(X_test)
mse = mean_squared_error(y_test, y_pred_test)
r2_train = r2_score(y_train, y_pred_train)
r2_test = r2_score(y_test, y_pred_test)
cv = cross_val_score(randomforest_model, X_train, y_train, cv=5, scoring='r2')

print('Best params for RandomForest Regressor: ', grid_search_randomforest.best_params_)
print('MSE: ', mse)
print('r2_score_train: ', r2_train)
print('r2_score_test: ', r2_test)
print('Cross Validation Score: ', cv.mean())

scores.append({'RandomForest Regression': {'R2_Score_train': r2_train,'R2_Score_test':r2_test, 'MSE': mse, 'Cross Validation Score': cv.mean()}})



Best params for RandomForest Regressor:  {'criterion': 'squared_error', 'max_depth': 8, 'n_estimators': 9}
MSE:  0.17711333922591715
r2_score_train:  0.8207873760493982
r2_score_test:  0.7047550819095065
Cross Validation Score:  0.6471010703506648


## AdaBoost Regressor

In [17]:
from sklearn.ensemble import AdaBoostRegressor

estimator = AdaBoostRegressor(random_state=42)
paramgrid = {'n_estimators': list(range(1,10)), 'learning_rate': [0.1, 0.5, 1.0]}
grid_search_adaboost = GridSearchCV(estimator, paramgrid, cv=5, scoring='r2')
grid_search_adaboost.fit(X_train, y_train)
adaboost_model = grid_search_adaboost.best_estimator_

y_pred_train = adaboost_model.predict(X_train)
y_pred_test = adaboost_model.predict(X_test)
mse = mean_squared_error(y_test, y_pred_test)
r2_train = r2_score(y_train, y_pred_train)
r2_test = r2_score(y_test, y_pred_test)
cv = cross_val_score(adaboost_model, X_train, y_train, cv=5, scoring='r2')

print('Best params for Adaboost Regressor: ', grid_search_adaboost.best_params_)
print('MSE: ', mse)
print('r2_score_train: ', r2_train)
print('r2_score_test: ', r2_test)
print('Cross Validation Score: ', cv.mean())

scores.append({'AdaBoost Regressor': {'R2_Score_train': r2_train,'R2_Score_test':r2_test, 'MSE': mse, 'Cross Validation Score': cv.mean()}})



Best params for Adaboost Regressor:  {'learning_rate': 0.5, 'n_estimators': 9}
MSE:  0.24900319661686088
r2_score_train:  0.6141789880862267
r2_score_test:  0.5849159147993843
Cross Validation Score:  0.5843008122600122


## Gradient Boosting Regressor

In [19]:
from sklearn.ensemble import GradientBoostingRegressor

estimator = GradientBoostingRegressor(random_state=42)
paramgrid = {'n_estimators': list(range(1,10)), 'learning_rate': [0.1, 0.5, 1.0]}
grid_search_gradientboost = GridSearchCV(estimator, paramgrid, cv=5, scoring='r2')
grid_search_gradientboost.fit(X_train, y_train)
gradientboost_model = grid_search_gradientboost.best_estimator_

y_pred_train = gradientboost_model.predict(X_train)
y_pred_test = gradientboost_model.predict(X_test)
mse = mean_squared_error(y_test, y_pred_test)
r2_train = r2_score(y_train, y_pred_train)
r2_test = r2_score(y_test, y_pred_test)
cv = cross_val_score(gradientboost_model, X_train, y_train, cv=5, scoring='r2')

print('Best params for Gradientboost Regressor: ', grid_search_gradientboost.best_params_)
print('MSE: ', mse)
print('r2_score_train: ', r2_train)
print('r2_score_test: ', r2_test)
print('Cross Validation Score: ', cv.mean())

scores.append({'GradientBoost Regressor': {'R2_Score_train': r2_train,'R2_Score_test':r2_test, 'MSE': mse, 'Cross Validation Score': cv.mean()}})


Best params for Gradientboost Regressor:  {'learning_rate': 0.5, 'n_estimators': 9}
MSE:  0.18330011648467656
r2_score_train:  0.7616900026392617
r2_score_test:  0.6944418296553857
Cross Validation Score:  0.6726797491862043


## Xgboost Regressor

In [51]:
from xgboost import XGBRegressor

estimator = XGBRegressor(random_state=42)
param_grid = {'n_estimators': list(range(1,10)), 'learning_rate': [0.1, 0.5, 1.0], 'gamma': [0, 0.1, 0.5,1]}
grid_search_xgboost = GridSearchCV(estimator, param_grid, cv=5, scoring='r2')
grid_search_xgboost.fit(X_train, y_train)
xgboost_model = grid_search_xgboost.best_estimator_

y_pred_train = xgboost_model.predict(X_train)
y_pred_test = xgboost_model.predict(X_test)
mse = mean_squared_error(y_test, y_pred_test)
r2_train = r2_score(y_train, y_pred_train)
r2_test = r2_score(y_test, y_pred_test)
cv = cross_val_score(xgboost_model, X_train, y_train, cv=5, scoring='r2')

print('Best params for Xgboost Regressor: ', grid_search_xgboost.best_params_)
print('MSE: ', mse)
print('r2_score_train: ', r2_train)
print('r2_score_test: ', r2_test)
print('Cross Validation Score: ', cv.mean())

scores.append({'Xgboost Regressor': {'R2_Score_train': r2_train,'R2_Score_test':r2_test, 'MSE': mse, 'Cross Validation Score': cv.mean()}})





Best alpha for Xgboost Regressor:  {'gamma': 0.5, 'learning_rate': 0.5, 'n_estimators': 9}
MSE:  0.14104287333390458
r2_score_train:  0.8327784846339032
r2_score_test:  0.7530970697777191
Cross Validation Score:  0.6796162303265417


In [39]:
scores

[{'Linear Regression': {'R2_Score_train': 0.7886913752329004,
   'R2_Score_test': -4.464679038905962e+20,
   'MSE': np.float64(2.6782991499627597e+20),
   'Cross Validation Score': np.float64(-5.327585032671364e+22)}},
 {'Lasso Regression': {'R2_Score_train': 0.0,
   'R2_Score_test': -2.6239585593224035e-07,
   'MSE': np.float64(0.5998863142004618),
   'Cross Validation Score': np.float64(-0.00954063546237265)}},
 {'Ridge Regression': {'R2_Score_train': 0.7784491078815353,
   'R2_Score_test': 0.7233886045268991,
   'MSE': np.float64(0.16593534695545742),
   'Cross Validation Score': np.float64(0.691757942209234)}},
 {'KNeighbors Regressor': {'R2_Score_train': 0.6907852655575827,
   'R2_Score_test': 0.6221871782039268,
   'MSE': np.float64(0.226644681654297),
   'Cross Validation Score': np.float64(0.5536882555117693)}},
 {'DecisionTree Regression': {'R2_Score_train': 0.8243461535026796,
   'R2_Score_test': 0.6536857077295137,
   'MSE': np.float64(0.2077491498325675),
   'Cross Validati

In [40]:
model_name = []
r2_train = []
r2_test = []
mse = []
cv = []
#pd.reset_option('display.float_format')
for model in scores:
    for name, metrics in model.items():
        model_name.append(name)
        r2_train.append(float(metrics['R2_Score_train']))
        r2_test .append(float(metrics['R2_Score_test']))
        mse.append(float(metrics['MSE']))
        cv.append(float(metrics['Cross Validation Score']))

scores_df = pd.DataFrame({
    'Model': model_name,
    'R2_Train': r2_train,
    'R2_Test': r2_test,
    'MSE': mse,
    'Cross Validation Score': cv
})

scores_df

Unnamed: 0,Model,R2_Train,R2_Test,MSE,Cross Validation Score
0,Linear Regression,0.788691,-4.464679e+20,2.678299e+20,-5.327585e+22
1,Lasso Regression,0.0,-2.623959e-07,0.5998863,-0.009540635
2,Ridge Regression,0.778449,0.7233886,0.1659353,0.6917579
3,KNeighbors Regressor,0.690785,0.6221872,0.2266447,0.5536883
4,DecisionTree Regression,0.824346,0.6536857,0.2077491,0.5732131
5,RandomForest Regression,0.820787,0.7047551,0.1771133,0.6471011
6,AdaBoost Regressor,0.614179,0.5849159,0.2490032,0.5843008
7,GradientBoost Regressor,0.76169,0.6944418,0.1833001,0.6726797
8,Linear Regression,0.788691,-4.464679e+20,2.678299e+20,-5.327585e+22
9,Ridge Regression,0.778449,0.7233886,0.1659353,0.6917579
