# Model Training

In [1]:
import pandas as pd
import numpy as np
import seaborn as sns
import matplotlib.pyplot as plt


# sklearn
from sklearn.tree import DecisionTreeRegressor
from sklearn.ensemble import (RandomForestRegressor,
                              AdaBoostRegressor ,
                                GradientBoostingRegressor,
                                VotingRegressor,
                                StackingRegressor)

from sklearn.metrics import r2_score, mean_absolute_error, mean_squared_error , r2_score
from sklearn.model_selection import GridSearchCV
from catboost import CatBoostRegressor
from xgboost import XGBRegressor
from sklearn.pipeline import Pipeline
from sklearn.impute import SimpleImputer
from sklearn.preprocessing import StandardScaler # for num_cols
from sklearn.preprocessing import OneHotEncoder # for cat_cols
from sklearn.compose import ColumnTransformer # to transform features
from sklearn.model_selection import train_test_split

import pickle
import os

In [2]:
df = pd.read_csv('data/laptop_data_cleaned.csv')
df.drop_duplicates(inplace=True)

In [3]:
df

Unnamed: 0,Company,TypeName,Ram,Weight,Price,TouchScreen,Ips,Ppi,Cpu_brand,HDD,SSD,Gpu_brand,Os
0,Apple,Ultrabook,8,1.37,11.175755,0,1,226.983005,Intel Core i5,0,128,Intel,Mac
1,Apple,Ultrabook,8,1.34,10.776777,0,0,127.677940,Intel Core i5,0,0,Intel,Mac
2,HP,Notebook,8,1.86,10.329931,0,0,141.211998,Intel Core i5,0,256,Intel,Others
3,Apple,Ultrabook,16,1.83,11.814476,0,1,220.534624,Intel Core i7,0,512,AMD,Mac
4,Apple,Ultrabook,8,1.37,11.473101,0,1,226.983005,Intel Core i5,0,256,Intel,Mac
...,...,...,...,...,...,...,...,...,...,...,...,...,...
1268,Asus,Notebook,4,2.20,10.555257,0,0,100.454670,Intel Core i7,500,0,Nvidia,Windows
1269,Lenovo,2 in 1 Convertible,4,1.80,10.433899,1,1,157.350512,Intel Core i7,0,128,Intel,Windows
1270,Lenovo,2 in 1 Convertible,16,1.30,11.288115,1,1,276.053530,Intel Core i7,0,512,Intel,Windows
1271,Lenovo,Notebook,2,1.50,9.409283,0,0,111.935204,Other Intel Processor,0,0,Intel,Windows


In [4]:
df.describe()

Unnamed: 0,Ram,Weight,Price,TouchScreen,Ips,Ppi,HDD,SSD
count,1272.0,1272.0,1272.0,1272.0,1272.0,1272.0,1272.0,1272.0
mean,8.451258,2.041761,10.828035,0.147013,0.279874,146.936126,414.040881,186.298742
std,5.099249,0.669088,0.619774,0.354258,0.449114,42.940459,518.128204,186.597777
min,2.0,0.69,9.134616,0.0,0.0,90.583402,0.0,0.0
25%,4.0,1.5,10.387379,0.0,0.0,127.335675,0.0,0.0
50%,8.0,2.04,10.872255,0.0,0.0,141.211998,0.0,256.0
75%,8.0,2.31,11.287447,0.0,1.0,157.350512,1000.0,256.0
max,64.0,4.7,12.691441,1.0,1.0,352.465147,2000.0,1024.0


In [5]:
X = df.drop(columns=['Price'])
y = df['Price']

In [6]:
cat_cols = [features for features in X.columns if X[features].dtypes == 'O']
num_cols = [features for features in X.columns if X[features].dtypes != 'O']
len(num_cols) , len(cat_cols)

(7, 5)

In [7]:
cat_pipeline = Pipeline(
    steps=[
        ('ohe',OneHotEncoder(sparse=False, # this will return me numpy array by the first time
        drop='first'# This will drop first column
              )),
        ('imputer',SimpleImputer())
    ]
)

In [8]:
num_pipeline = Pipeline(
    steps=[
        ('imputer',SimpleImputer()),
        ('scaler',StandardScaler()),
    ]
)

In [9]:
preprocessor_obj = ColumnTransformer(
    transformers=[
        ('cat_col_pipeline',cat_pipeline,cat_cols),
        ('num_col_pipeline',num_pipeline,num_cols)
    ],remainder='passthrough'
)
X = preprocessor_obj.fit_transform(X)



In [10]:
X_train, X_test, y_train, y_test = train_test_split(
      X, y, test_size=0.2, random_state=1)

X_train.shape , X_test.shape , y_train.shape ,y_test.shape


((1017, 38), (255, 38), (1017,), (255,))

In [11]:
X_train.shape


(1017, 38)

In [12]:
dt = DecisionTreeRegressor()
dt.fit(X_train,y_train)

In [13]:
y_pred = dt.predict(X_test)

In [14]:
r2_score(y_test,y_pred)

0.7998901175472479

In [15]:
models = {
    "Random Forest": RandomForestRegressor(),
    "Gradient Boosting": GradientBoostingRegressor(),
    "XGBRegressor": XGBRegressor(),
    "CatBoosting Regressor": CatBoostRegressor(verbose=False),
    "AdaBoost Regressor": AdaBoostRegressor(),
}


In [16]:
params = {
    "Random Forest": {
        'criterion': ['squared_error', 'friedman_mse', 'absolute_error', 'poisson'],
        # 'max_features': ['sqrt', 'log2', None],
        'n_estimators': [8, 16, 32, 64, 128, 256]
    },
    "Gradient Boosting": {
        'loss': ['squared_error', 'huber', 'absolute_error', 'quantile'],
        'learning_rate': [0.1, 0.01, 0.05, 0.001],
        # 'subsample': [0.6, 0.7, 0.75, 0.8, 0.85, 0.9],
        'criterion': ['squared_error', 'friedman_mse'],
        # 'max_features': ['auto', 'sqrt', 'log2'],
        'n_estimators': [8, 16, 32, 64, 128, 256]
    },
    "XGBRegressor": {
        'learning_rate': [0.1, 0.01, 0.05, 0.001],
        'n_estimators': [8, 16, 32, 64, 128, 256]
    },
    "CatBoosting Regressor": {
        'depth': [6, 8, 10],
        'learning_rate': [0.01, 0.05, 0.1],
        'iterations': [30, 50, 100]
    },
    "AdaBoost Regressor": {
        'learning_rate': [0.1, 0.01, 0.5, 0.001],
        # 'loss': ['linear', 'square', 'exponential'],
        'n_estimators': [8, 16, 32, 64, 128, 256]
    }
}


In [17]:
len(params.keys())

5

In [18]:
def error_Accuracy(true,pred):
    r2_accuracy = r2_score(y_true=true,y_pred=pred)
    mse = mean_squared_error(y_true=true,y_pred=pred)
    mae = mean_absolute_error(y_true=true,y_pred=pred)
    return r2_accuracy,mse,mae

In [19]:

class ModelTraining:
    def __init__(self):
        self.model_list = []
        self.accuracy_list = []
        self.best_params_dict = {}

    def train_model(self, models, params, X_train, y_train, X_test, y_test):
        for i, model_name in enumerate(models.keys()):
            model = models[model_name]
            param = params[model_name]

            grid_search_cv = GridSearchCV(estimator=model, param_grid=param, cv=5)
            grid_search_cv.fit(X_train, y_train)

            best_params = grid_search_cv.best_params_
            model.set_params(**best_params)
            model.fit(X_train, y_train)
            y_pred = model.predict(X_test)

            r2_accuracy, mse, mae = error_Accuracy(true=y_test, pred=y_pred)

            print(model_name)
            self.model_list.append(model_name)

            print("- Mean Absolute Error: {:.4f}".format(mae))
            print("- Mean Squared Error: {:.4f}".format(mse))
            print("- R2 Score: {:.4f}".format(r2_accuracy))
            print('\n')

            self.best_params_dict[model_name] = best_params

            self.accuracy_list.append(r2_accuracy)

    def return_accuracy_list(self):
        return self.accuracy_list
    
    def best_params(self):
        return self.best_params_dict


In [20]:
train_model = ModelTraining()

In [21]:
train_model.train_model(models=models,params=params,X_train=X_train,y_train=y_train,X_test=X_test,y_test=y_test)

Random Forest
- Mean Absolute Error: 0.1534
- Mean Squared Error: 0.0401
- R2 Score: 0.8893


Gradient Boosting
- Mean Absolute Error: 0.1604
- Mean Squared Error: 0.0411
- R2 Score: 0.8864


XGBRegressor
- Mean Absolute Error: 0.1396
- Mean Squared Error: 0.0346
- R2 Score: 0.9044


CatBoosting Regressor
- Mean Absolute Error: 0.1583
- Mean Squared Error: 0.0427
- R2 Score: 0.8818


AdaBoost Regressor
- Mean Absolute Error: 0.2066
- Mean Squared Error: 0.0671
- R2 Score: 0.8144




In [22]:
train_model.best_params_dict

{'Random Forest': {'criterion': 'poisson', 'n_estimators': 64},
 'Gradient Boosting': {'criterion': 'friedman_mse',
  'learning_rate': 0.1,
  'loss': 'huber',
  'n_estimators': 256},
 'XGBRegressor': {'learning_rate': 0.1, 'n_estimators': 256},
 'CatBoosting Regressor': {'depth': 10,
  'iterations': 100,
  'learning_rate': 0.1},
 'AdaBoost Regressor': {'learning_rate': 0.5, 'n_estimators': 128}}

In [23]:
train_model.accuracy_list

[0.8892532733084139,
 0.8863883603598792,
 0.90441338344959,
 0.881847048029996,
 0.8144300008146685]

In [24]:
model_accuracy=pd.DataFrame(list(zip(list(models.keys()),train_model.accuracy_list)),columns=['model','accuracy'])
model_accuracy

Unnamed: 0,model,accuracy
0,Random Forest,0.889253
1,Gradient Boosting,0.886388
2,XGBRegressor,0.904413
3,CatBoosting Regressor,0.881847
4,AdaBoost Regressor,0.81443


In [40]:
list(zip(list(models.keys()),train_model.accuracy_list))

[('Random Forest', 0.8892532733084139),
 ('Gradient Boosting', 0.8863883603598792),
 ('XGBRegressor', 0.90441338344959),
 ('CatBoosting Regressor', 0.881847048029996),
 ('AdaBoost Regressor', 0.8144300008146685)]

In [44]:
best_model = max(list(zip(list(models.keys()),train_model.accuracy_list)))[0]

In [45]:
models[best_model]

In [25]:
best_models = model_accuracy.sort_values(by='accuracy',ascending=False).head(4)
best_models

Unnamed: 0,model,accuracy
2,XGBRegressor,0.904413
0,Random Forest,0.889253
1,Gradient Boosting,0.886388
3,CatBoosting Regressor,0.881847


In [26]:
best_models

Unnamed: 0,model,accuracy
2,XGBRegressor,0.904413
0,Random Forest,0.889253
1,Gradient Boosting,0.886388
3,CatBoosting Regressor,0.881847


## Voting Regressor

In [27]:
train_model.best_params_dict

{'Random Forest': {'criterion': 'poisson', 'n_estimators': 64},
 'Gradient Boosting': {'criterion': 'friedman_mse',
  'learning_rate': 0.1,
  'loss': 'huber',
  'n_estimators': 256},
 'XGBRegressor': {'learning_rate': 0.1, 'n_estimators': 256},
 'CatBoosting Regressor': {'depth': 10,
  'iterations': 100,
  'learning_rate': 0.1},
 'AdaBoost Regressor': {'learning_rate': 0.5, 'n_estimators': 128}}

In [28]:
train_model.best_params_dict['Random Forest']

{'criterion': 'poisson', 'n_estimators': 64}

In [29]:
RandomForest =RandomForestRegressor().set_params(**train_model.best_params_dict['Random Forest'])
GradientBoost = GradientBoostingRegressor().set_params(**train_model.best_params_dict['Gradient Boosting'])
Xgboost = XGBRegressor().set_params(**train_model.best_params_dict['XGBRegressor'])
Catboost = CatBoostRegressor().set_params(**train_model.best_params_dict['CatBoosting Regressor'])


In [30]:
votingRegressor = VotingRegressor(
    estimators=[
        ('RandomForest',RandomForest),
        ('GradientBoost',GradientBoost),
        ('Xgboost',Xgboost),
        ('Catboost',Catboost)
    ]
)

In [31]:
votingRegressor.fit(X_train,y_train)

0:	learn: 0.5852870	total: 7.15ms	remaining: 708ms
1:	learn: 0.5483074	total: 15.4ms	remaining: 753ms
2:	learn: 0.5137807	total: 22.5ms	remaining: 727ms
3:	learn: 0.4843408	total: 30.8ms	remaining: 739ms
4:	learn: 0.4572128	total: 38ms	remaining: 722ms
5:	learn: 0.4338705	total: 39.4ms	remaining: 617ms
6:	learn: 0.4091679	total: 47.8ms	remaining: 635ms
7:	learn: 0.3877900	total: 50ms	remaining: 575ms
8:	learn: 0.3700159	total: 56.9ms	remaining: 575ms
9:	learn: 0.3533477	total: 64.3ms	remaining: 579ms
10:	learn: 0.3380532	total: 70.7ms	remaining: 572ms
11:	learn: 0.3258126	total: 77.6ms	remaining: 569ms
12:	learn: 0.3130784	total: 84.7ms	remaining: 567ms
13:	learn: 0.3010350	total: 91ms	remaining: 559ms
14:	learn: 0.2916865	total: 98.7ms	remaining: 559ms
15:	learn: 0.2838042	total: 100ms	remaining: 525ms
16:	learn: 0.2754030	total: 107ms	remaining: 521ms
17:	learn: 0.2687499	total: 114ms	remaining: 518ms
18:	learn: 0.2627074	total: 120ms	remaining: 512ms
19:	learn: 0.2566275	total: 128m

In [32]:
y_pred_voting_regressor = votingRegressor.predict(X_test)

In [33]:
r2_accuracy, mse, mae = error_Accuracy(true=y_test, pred=y_pred_voting_regressor)
print("- Mean Absolute Error: {:.4f}".format(mae))
print("- Mean Squared Error: {:.4f}".format(mse))
print("- R2 Score: {:.4f}".format(r2_accuracy))

- Mean Absolute Error: 0.1449
- Mean Squared Error: 0.0343
- R2 Score: 0.9051


# Finally Voting Regressor is the best Algo For my MODEL

## Pickling Entire Model

In [34]:
X = df.drop(columns=['Price'])
y = df['Price']
X_train, X_test, y_train, y_test = train_test_split(
      X, y, test_size=0.2, random_state=1)

X_train.shape , X_test.shape , y_train.shape ,y_test.shape


((1017, 12), (255, 12), (1017,), (255,))

In [35]:
y_train

208     11.254190
1015    11.073937
754     10.720680
76      10.227465
228     10.767905
          ...    
715     10.729081
905     11.624777
1097    11.352695
235     10.776844
1062    10.301710
Name: Price, Length: 1017, dtype: float64

In [36]:
preprocess_model = Pipeline(
    steps=[
   ( 'preprocessor_obj',preprocessor_obj),
   ('votingRegressor',votingRegressor)
    ]
)
preprocess_model.fit_transform(X_train,y_train)




0:	learn: 0.5852870	total: 7.54ms	remaining: 746ms
1:	learn: 0.5483074	total: 15.1ms	remaining: 740ms
2:	learn: 0.5137807	total: 22.3ms	remaining: 721ms
3:	learn: 0.4843408	total: 30.4ms	remaining: 730ms
4:	learn: 0.4572128	total: 37.5ms	remaining: 712ms
5:	learn: 0.4338705	total: 38.9ms	remaining: 610ms
6:	learn: 0.4091679	total: 46.6ms	remaining: 619ms
7:	learn: 0.3877900	total: 48.6ms	remaining: 559ms
8:	learn: 0.3700159	total: 56.2ms	remaining: 568ms
9:	learn: 0.3533477	total: 64ms	remaining: 576ms
10:	learn: 0.3380532	total: 70.8ms	remaining: 573ms
11:	learn: 0.3258126	total: 78.1ms	remaining: 573ms
12:	learn: 0.3130784	total: 84.9ms	remaining: 568ms
13:	learn: 0.3010350	total: 92.9ms	remaining: 571ms
14:	learn: 0.2916865	total: 99.6ms	remaining: 565ms
15:	learn: 0.2838042	total: 101ms	remaining: 530ms
16:	learn: 0.2754030	total: 109ms	remaining: 531ms
17:	learn: 0.2687499	total: 116ms	remaining: 527ms
18:	learn: 0.2627074	total: 123ms	remaining: 526ms
19:	learn: 0.2566275	total: 

array([[11.30012173, 11.27271795, 11.26846313, 11.27339603],
       [11.07239642, 10.96735978, 11.03379059, 11.02431968],
       [10.67882669, 10.51583158, 10.71079922, 10.55949221],
       ...,
       [11.25509927, 11.1515882 , 11.32951164, 11.16498864],
       [10.8777631 , 10.79864121, 10.80856609, 10.84839161],
       [10.36021396, 10.47695599, 10.33291721, 10.43438214]])

In [37]:
os.makedirs(name='model',exist_ok=True)
pickle.dump(df,open(file='model/df.pkl',mode='wb'))
# pickle.dump(votingRegressor,open(file='model/model.pkl',mode='wb'))
# pickle.dump(preprocessor_obj,open(file='model/preprocessor.pkl',mode='wb'))
pickle.dump(preprocess_model,open(file='model/preprocess_model.pkl',mode='wb'))

In [38]:
df

Unnamed: 0,Company,TypeName,Ram,Weight,Price,TouchScreen,Ips,Ppi,Cpu_brand,HDD,SSD,Gpu_brand,Os
0,Apple,Ultrabook,8,1.37,11.175755,0,1,226.983005,Intel Core i5,0,128,Intel,Mac
1,Apple,Ultrabook,8,1.34,10.776777,0,0,127.677940,Intel Core i5,0,0,Intel,Mac
2,HP,Notebook,8,1.86,10.329931,0,0,141.211998,Intel Core i5,0,256,Intel,Others
3,Apple,Ultrabook,16,1.83,11.814476,0,1,220.534624,Intel Core i7,0,512,AMD,Mac
4,Apple,Ultrabook,8,1.37,11.473101,0,1,226.983005,Intel Core i5,0,256,Intel,Mac
...,...,...,...,...,...,...,...,...,...,...,...,...,...
1268,Asus,Notebook,4,2.20,10.555257,0,0,100.454670,Intel Core i7,500,0,Nvidia,Windows
1269,Lenovo,2 in 1 Convertible,4,1.80,10.433899,1,1,157.350512,Intel Core i7,0,128,Intel,Windows
1270,Lenovo,2 in 1 Convertible,16,1.30,11.288115,1,1,276.053530,Intel Core i7,0,512,Intel,Windows
1271,Lenovo,Notebook,2,1.50,9.409283,0,0,111.935204,Other Intel Processor,0,0,Intel,Windows
