# Model Training

In [40]:
import pandas as pd
import numpy as np
import seaborn as sns
import matplotlib.pyplot as plt


# sklearn
from sklearn.tree import DecisionTreeRegressor
from sklearn.ensemble import (RandomForestRegressor,
                              AdaBoostRegressor ,
                                GradientBoostingRegressor,
                                VotingRegressor,
                                StackingRegressor)

from sklearn.metrics import r2_score, mean_absolute_error, mean_squared_error , r2_score
from sklearn.model_selection import GridSearchCV
from catboost import CatBoostRegressor
from xgboost import XGBRegressor
from sklearn.pipeline import Pipeline
from sklearn.impute import SimpleImputer
from sklearn.preprocessing import StandardScaler # for num_cols
from sklearn.preprocessing import OneHotEncoder # for cat_cols
from sklearn.compose import ColumnTransformer # to transform features
from sklearn.model_selection import train_test_split

import pickle
import os

In [41]:
df = pd.read_csv('data/laptop_data_cleaned.csv')
df.drop_duplicates(inplace=True)

In [42]:
df

Unnamed: 0,Company,TypeName,Ram,Weight,Price,TouchScreen,Ips,Ppi,Cpu_brand,HDD,SSD,Gpu_brand,Os
0,Apple,Ultrabook,8,1.37,11.175755,0,1,226.983005,Intel Core i5,0,128,Intel,Mac
1,Apple,Ultrabook,8,1.34,10.776777,0,0,127.677940,Intel Core i5,0,0,Intel,Mac
2,HP,Notebook,8,1.86,10.329931,0,0,141.211998,Intel Core i5,0,256,Intel,Others
3,Apple,Ultrabook,16,1.83,11.814476,0,1,220.534624,Intel Core i7,0,512,AMD,Mac
4,Apple,Ultrabook,8,1.37,11.473101,0,1,226.983005,Intel Core i5,0,256,Intel,Mac
...,...,...,...,...,...,...,...,...,...,...,...,...,...
1268,Asus,Notebook,4,2.20,10.555257,0,0,100.454670,Intel Core i7,500,0,Nvidia,Windows
1269,Lenovo,2 in 1 Convertible,4,1.80,10.433899,1,1,157.350512,Intel Core i7,0,128,Intel,Windows
1270,Lenovo,2 in 1 Convertible,16,1.30,11.288115,1,1,276.053530,Intel Core i7,0,512,Intel,Windows
1271,Lenovo,Notebook,2,1.50,9.409283,0,0,111.935204,Other Intel Processor,0,0,Intel,Windows


In [43]:
df.describe()

Unnamed: 0,Ram,Weight,Price,TouchScreen,Ips,Ppi,HDD,SSD
count,1272.0,1272.0,1272.0,1272.0,1272.0,1272.0,1272.0,1272.0
mean,8.451258,2.041761,10.828035,0.147013,0.279874,146.936126,414.040881,186.298742
std,5.099249,0.669088,0.619774,0.354258,0.449114,42.940459,518.128204,186.597777
min,2.0,0.69,9.134616,0.0,0.0,90.583402,0.0,0.0
25%,4.0,1.5,10.387379,0.0,0.0,127.335675,0.0,0.0
50%,8.0,2.04,10.872255,0.0,0.0,141.211998,0.0,256.0
75%,8.0,2.31,11.287447,0.0,1.0,157.350512,1000.0,256.0
max,64.0,4.7,12.691441,1.0,1.0,352.465147,2000.0,1024.0


In [44]:
X = df.drop(columns=['Price'])
y = df['Price']

In [45]:
cat_cols = [features for features in X.columns if X[features].dtypes == 'O']
num_cols = [features for features in X.columns if X[features].dtypes != 'O']
len(num_cols) , len(cat_cols)

(7, 5)

In [7]:
cat_pipeline = Pipeline(
    steps=[
        ('ohe',OneHotEncoder(sparse=False, # this will return me numpy array by the first time
        drop='first'# This will drop first column
              )),
        ('imputer',SimpleImputer())
    ]
)

In [8]:
num_pipeline = Pipeline(
    steps=[
        ('imputer',SimpleImputer()),
        ('scaler',StandardScaler()),
    ]
)

In [9]:
preprocessor_obj = ColumnTransformer(
    transformers=[
        ('cat_col_pipeline',cat_pipeline,cat_cols),
        ('num_col_pipeline',num_pipeline,num_cols)
    ],remainder='passthrough'
)
X = preprocessor_obj.fit_transform(X)



In [38]:

X_train, X_test, y_train, y_test = train_test_split(
      X, y, test_size=0.2, random_state=1)

X_train.shape , X_test.shape , y_train.shape ,y_test.shape


((1017, 27), (255, 27), (1017,), (255,))

In [11]:
X_train.shape


(1017, 27)

In [12]:
dt = DecisionTreeRegressor()
dt.fit(X_train,y_train)

In [13]:
y_pred = dt.predict(X_test)

In [14]:
r2_score(y_test,y_pred)

0.8230357371560469

In [15]:
models = {
    "Random Forest": RandomForestRegressor(),
    "Gradient Boosting": GradientBoostingRegressor(),
    "XGBRegressor": XGBRegressor(),
    "CatBoosting Regressor": CatBoostRegressor(verbose=False),
    "AdaBoost Regressor": AdaBoostRegressor(),
}


In [16]:
params = {
    "Random Forest": {
        'criterion': ['squared_error', 'friedman_mse', 'absolute_error', 'poisson'],
        # 'max_features': ['sqrt', 'log2', None],
        'n_estimators': [8, 16, 32, 64, 128, 256]
    },
    "Gradient Boosting": {
        'loss': ['squared_error', 'huber', 'absolute_error', 'quantile'],
        'learning_rate': [0.1, 0.01, 0.05, 0.001],
        # 'subsample': [0.6, 0.7, 0.75, 0.8, 0.85, 0.9],
        'criterion': ['squared_error', 'friedman_mse'],
        # 'max_features': ['auto', 'sqrt', 'log2'],
        'n_estimators': [8, 16, 32, 64, 128, 256]
    },
    "XGBRegressor": {
        'learning_rate': [0.1, 0.01, 0.05, 0.001],
        'n_estimators': [8, 16, 32, 64, 128, 256]
    },
    "CatBoosting Regressor": {
        'depth': [6, 8, 10],
        'learning_rate': [0.01, 0.05, 0.1],
        'iterations': [30, 50, 100]
    },
    "AdaBoost Regressor": {
        'learning_rate': [0.1, 0.01, 0.5, 0.001],
        # 'loss': ['linear', 'square', 'exponential'],
        'n_estimators': [8, 16, 32, 64, 128, 256]
    }
}


In [17]:
len(params.keys())

5

In [18]:
def error_Accuracy(true,pred):
    r2_accuracy = r2_score(y_true=true,y_pred=pred)
    mse = mean_squared_error(y_true=true,y_pred=pred)
    mae = mean_absolute_error(y_true=true,y_pred=pred)
    return r2_accuracy,mse,mae

In [19]:

class ModelTraining:
    def __init__(self):
        self.model_list = []
        self.accuracy_list = []
        self.best_params_dict = {}

    def train_model(self, models, params, X_train, y_train, X_test, y_test):
        for i, model_name in enumerate(models.keys()):
            model = models[model_name]
            param = params[model_name]

            grid_search_cv = GridSearchCV(estimator=model, param_grid=param, cv=5)
            grid_search_cv.fit(X_train, y_train)

            best_params = grid_search_cv.best_params_
            model.set_params(**best_params)
            model.fit(X_train, y_train)
            y_pred = model.predict(X_test)

            r2_accuracy, mse, mae = error_Accuracy(true=y_test, pred=y_pred)

            print(model_name)
            self.model_list.append(model_name)

            print("- Mean Absolute Error: {:.4f}".format(mae))
            print("- Mean Squared Error: {:.4f}".format(mse))
            print("- R2 Score: {:.4f}".format(r2_accuracy))
            print('\n')

            self.best_params_dict[model_name] = best_params

            self.accuracy_list.append(r2_accuracy)

    def return_accuracy_list(self):
        return self.accuracy_list
    
    def best_params(self):
        return self.best_params_dict


In [20]:
train_model = ModelTraining()

In [21]:
train_model.train_model(models=models,params=params,X_train=X_train,y_train=y_train,X_test=X_test,y_test=y_test)

Random Forest
- Mean Absolute Error: 0.1592
- Mean Squared Error: 0.0427
- R2 Score: 0.8818


Gradient Boosting
- Mean Absolute Error: 0.1573
- Mean Squared Error: 0.0415
- R2 Score: 0.8854


XGBRegressor
- Mean Absolute Error: 0.1460
- Mean Squared Error: 0.0348
- R2 Score: 0.9038


CatBoosting Regressor
- Mean Absolute Error: 0.1582
- Mean Squared Error: 0.0424
- R2 Score: 0.8829


AdaBoost Regressor
- Mean Absolute Error: 0.2084
- Mean Squared Error: 0.0675
- R2 Score: 0.8133




In [22]:
train_model.best_params_dict

{'Random Forest': {'criterion': 'poisson', 'n_estimators': 32},
 'Gradient Boosting': {'criterion': 'squared_error',
  'learning_rate': 0.1,
  'loss': 'huber',
  'n_estimators': 256},
 'XGBRegressor': {'learning_rate': 0.1, 'n_estimators': 128},
 'CatBoosting Regressor': {'depth': 10,
  'iterations': 100,
  'learning_rate': 0.1},
 'AdaBoost Regressor': {'learning_rate': 0.5, 'n_estimators': 128}}

In [23]:
train_model.accuracy_list

[0.881824027341725,
 0.8853869313418019,
 0.9038188963455114,
 0.8828531633936608,
 0.8133451359350746]

In [24]:
model_accuracy=pd.DataFrame(list(zip(list(models.keys()),train_model.accuracy_list)),columns=['model','accuracy'])
model_accuracy

Unnamed: 0,model,accuracy
0,Random Forest,0.881824
1,Gradient Boosting,0.885387
2,XGBRegressor,0.903819
3,CatBoosting Regressor,0.882853
4,AdaBoost Regressor,0.813345


In [25]:
best_models = model_accuracy.sort_values(by='accuracy',ascending=False).head(4)
best_models

Unnamed: 0,model,accuracy
2,XGBRegressor,0.903819
1,Gradient Boosting,0.885387
3,CatBoosting Regressor,0.882853
0,Random Forest,0.881824


In [26]:
best_models

Unnamed: 0,model,accuracy
2,XGBRegressor,0.903819
1,Gradient Boosting,0.885387
3,CatBoosting Regressor,0.882853
0,Random Forest,0.881824


## Voting Regressor

In [27]:
train_model.best_params_dict

{'Random Forest': {'criterion': 'poisson', 'n_estimators': 32},
 'Gradient Boosting': {'criterion': 'squared_error',
  'learning_rate': 0.1,
  'loss': 'huber',
  'n_estimators': 256},
 'XGBRegressor': {'learning_rate': 0.1, 'n_estimators': 128},
 'CatBoosting Regressor': {'depth': 10,
  'iterations': 100,
  'learning_rate': 0.1},
 'AdaBoost Regressor': {'learning_rate': 0.5, 'n_estimators': 128}}

In [28]:
RandomForest =RandomForestRegressor(criterion='squared_error',n_estimators=256)
GradientBoost = GradientBoostingRegressor(criterion='squared_error',learning_rate=0.1,loss='huber',n_estimators=256)
Xgboost = XGBRegressor(learning_rate = 0.1,n_estimators = 256)
Catboost = CatBoostRegressor(learning_rate=0.5,n_estimators=256)

In [29]:
votingRegressor = VotingRegressor(
    estimators=[
        ('RandomForest',RandomForest),
        ('GradientBoost',GradientBoost),
        ('Xgboost',Xgboost),
        ('Catboost',Catboost)
    ]
)

In [30]:
votingRegressor.fit(X_train,y_train)

0:	learn: 0.4551484	total: 1.29ms	remaining: 329ms
1:	learn: 0.3532305	total: 2.36ms	remaining: 300ms
2:	learn: 0.3026478	total: 4.46ms	remaining: 376ms
3:	learn: 0.2767455	total: 5.46ms	remaining: 344ms
4:	learn: 0.2596069	total: 6.38ms	remaining: 320ms
5:	learn: 0.2473792	total: 7.29ms	remaining: 304ms
6:	learn: 0.2403755	total: 8.18ms	remaining: 291ms
7:	learn: 0.2338417	total: 9.13ms	remaining: 283ms
8:	learn: 0.2267980	total: 10.1ms	remaining: 277ms
9:	learn: 0.2229614	total: 11.1ms	remaining: 273ms
10:	learn: 0.2186184	total: 12.2ms	remaining: 273ms
11:	learn: 0.2126338	total: 13.3ms	remaining: 271ms
12:	learn: 0.2100780	total: 14.3ms	remaining: 267ms
13:	learn: 0.2048778	total: 15.4ms	remaining: 266ms
14:	learn: 0.2006720	total: 16.3ms	remaining: 263ms
15:	learn: 0.1965787	total: 17.4ms	remaining: 261ms
16:	learn: 0.1928599	total: 19.3ms	remaining: 272ms
17:	learn: 0.1899565	total: 20.5ms	remaining: 272ms
18:	learn: 0.1872584	total: 21.5ms	remaining: 268ms
19:	learn: 0.1822854	t

In [31]:
y_pred_voting_regressor = votingRegressor.predict(X_test)

In [32]:
r2_accuracy, mse, mae = error_Accuracy(true=y_test, pred=y_pred_voting_regressor)
print("- Mean Absolute Error: {:.4f}".format(mae))
print("- Mean Squared Error: {:.4f}".format(mse))
print("- R2 Score: {:.4f}".format(r2_accuracy))

- Mean Absolute Error: 0.1429
- Mean Squared Error: 0.0344
- R2 Score: 0.9049


# Finally Voting Regressor is the best Algo For my MODEL

## Pickling Entire Model

In [46]:
X = df.drop(columns=['Price'])
y = df['Price']
X_train, X_test, y_train, y_test = train_test_split(
      X, y, test_size=0.2, random_state=1)

X_train.shape , X_test.shape , y_train.shape ,y_test.shape


((1017, 12), (255, 12), (1017,), (255,))

In [47]:
y_train

208     11.254190
1015    11.073937
754     10.720680
76      10.227465
228     10.767905
          ...    
715     10.729081
905     11.624777
1097    11.352695
235     10.776844
1062    10.301710
Name: Price, Length: 1017, dtype: float64

In [48]:
preprocess_model = Pipeline(
    steps=[
   ( 'preprocessor_obj',preprocessor_obj),
   ('votingRegressor',votingRegressor)
    ]
)
preprocess_model.fit_transform(X_train,y_train)




0:	learn: 0.4551484	total: 1.43ms	remaining: 364ms
1:	learn: 0.3532305	total: 3.08ms	remaining: 391ms
2:	learn: 0.3026478	total: 4.19ms	remaining: 353ms
3:	learn: 0.2767455	total: 5.15ms	remaining: 325ms
4:	learn: 0.2596069	total: 6.15ms	remaining: 309ms
5:	learn: 0.2473792	total: 7.2ms	remaining: 300ms
6:	learn: 0.2403755	total: 8.13ms	remaining: 289ms
7:	learn: 0.2338417	total: 9.01ms	remaining: 279ms
8:	learn: 0.2267980	total: 9.95ms	remaining: 273ms
9:	learn: 0.2229614	total: 10.9ms	remaining: 269ms
10:	learn: 0.2186184	total: 11.8ms	remaining: 264ms
11:	learn: 0.2126338	total: 12.9ms	remaining: 262ms
12:	learn: 0.2100780	total: 13.8ms	remaining: 257ms
13:	learn: 0.2048778	total: 14.7ms	remaining: 254ms
14:	learn: 0.2006720	total: 15.8ms	remaining: 254ms
15:	learn: 0.1965787	total: 16.9ms	remaining: 254ms
16:	learn: 0.1928599	total: 18.4ms	remaining: 258ms
17:	learn: 0.1899565	total: 19.4ms	remaining: 257ms
18:	learn: 0.1872584	total: 20.4ms	remaining: 255ms
19:	learn: 0.1822854	to

array([[11.29203319, 11.26960334, 11.26957226, 11.27423795],
       [11.03156034, 10.90607293, 11.02036953, 11.04060873],
       [10.67028285, 10.56711723, 10.70139408, 10.72026345],
       ...,
       [11.2548588 , 11.16633573, 11.32388115, 11.31827176],
       [10.85094126, 10.79161449, 10.81684208, 10.80220758],
       [10.36521819, 10.47634519, 10.33621311, 10.33025474]])

In [54]:
os.makedirs(name='model',exist_ok=True)
pickle.dump(df,open(file='model/df.pkl',mode='wb'))
pickle.dump(votingRegressor,open(file='model/model.pkl',mode='wb'))
pickle.dump(preprocessor_obj,open(file='model/preprocessor.pkl',mode='wb'))
pickle.dump(preprocess_model,open(file='model/preprocess_model.pkl',mode='wb'))