# Model Training

In [40]:
import pandas as pd
import numpy as np
import seaborn as sns
import matplotlib.pyplot as plt


# sklearn
from sklearn.tree import DecisionTreeRegressor
from sklearn.ensemble import (RandomForestRegressor,
                              AdaBoostRegressor ,
                                GradientBoostingRegressor,
                                VotingRegressor,
                                StackingRegressor)

from sklearn.metrics import r2_score, mean_absolute_error, mean_squared_error , r2_score
from sklearn.model_selection import GridSearchCV
from catboost import CatBoostRegressor
from xgboost import XGBRegressor
from sklearn.pipeline import Pipeline
from sklearn.impute import SimpleImputer
from sklearn.preprocessing import StandardScaler # for num_cols
from sklearn.preprocessing import OneHotEncoder # for cat_cols
from sklearn.compose import ColumnTransformer # to transform features
from sklearn.model_selection import train_test_split

import pickle
import os

In [3]:
df = pd.read_csv('data/laptop_data_cleaned.csv')
df.drop_duplicates(inplace=True)

In [4]:
df

Unnamed: 0,Company,TypeName,Ram,Weight,Price,TouchScreen,Ips,Ppi,Cpu_brand,HDD,SSD,Gpu_brand,Os
0,Apple,Ultrabook,8,1.37,11.175755,0,1,226.983005,Intel Core i5,0,128,Intel,Mac
1,Apple,Ultrabook,8,1.34,10.776777,0,0,127.677940,Intel Core i5,0,0,Intel,Mac
2,HP,Notebook,8,1.86,10.329931,0,0,141.211998,Intel Core i5,0,256,Intel,Others
3,Apple,Ultrabook,16,1.83,11.814476,0,1,220.534624,Intel Core i7,0,512,AMD,Mac
4,Apple,Ultrabook,8,1.37,11.473101,0,1,226.983005,Intel Core i5,0,256,Intel,Mac
...,...,...,...,...,...,...,...,...,...,...,...,...,...
1268,Asus,Notebook,4,2.20,10.555257,0,0,100.454670,Intel Core i7,500,0,Nvidia,Windows
1269,Lenovo,2 in 1 Convertible,4,1.80,10.433899,1,1,157.350512,Intel Core i7,0,128,Intel,Windows
1270,Lenovo,2 in 1 Convertible,16,1.30,11.288115,1,1,276.053530,Intel Core i7,0,512,Intel,Windows
1271,Lenovo,Notebook,2,1.50,9.409283,0,0,111.935204,Other Intel Processor,0,0,Intel,Windows


In [5]:
df.describe()

Unnamed: 0,Ram,Weight,Price,TouchScreen,Ips,Ppi,HDD,SSD
count,1272.0,1272.0,1272.0,1272.0,1272.0,1272.0,1272.0,1272.0
mean,8.451258,2.041761,10.828035,0.147013,0.279874,146.936126,414.040881,186.298742
std,5.099249,0.669088,0.619774,0.354258,0.449114,42.940459,518.128204,186.597777
min,2.0,0.69,9.134616,0.0,0.0,90.583402,0.0,0.0
25%,4.0,1.5,10.387379,0.0,0.0,127.335675,0.0,0.0
50%,8.0,2.04,10.872255,0.0,0.0,141.211998,0.0,256.0
75%,8.0,2.31,11.287447,0.0,1.0,157.350512,1000.0,256.0
max,64.0,4.7,12.691441,1.0,1.0,352.465147,2000.0,1024.0


In [6]:
X = df.drop(columns=['Price'])
y = df['Price']

In [7]:
cat_cols = [features for features in X.columns if X[features].dtypes == 'O']
num_cols = [features for features in X.columns if X[features].dtypes != 'O']
len(num_cols) , len(cat_cols)

(7, 5)

In [8]:
cat_pipeline = Pipeline(
    steps=[
        ('ohe',OneHotEncoder(sparse=False, # this will return me numpy array by the first time
        drop='first'# This will drop first column
              )),
        ('imputer',SimpleImputer())
    ]
)

In [9]:
num_pipeline = Pipeline(
    steps=[
        ('imputer',SimpleImputer()),
        ('scaler',StandardScaler()),
    ]
)

In [10]:
preprocessor_obj = ColumnTransformer(
    transformers=[
        ('cat_col_pipeline',cat_pipeline,cat_cols),
        ('num_col_pipeline',num_pipeline,num_cols)
    ],remainder='passthrough'
)
X = preprocessor_obj.fit_transform(X)



In [11]:

X_train, X_test, y_train, y_test = train_test_split(
      X, y, test_size=0.2, random_state=1)

X_train.shape , X_test.shape , y_train.shape ,y_test.shape


((1017, 38), (255, 38), (1017,), (255,))

In [12]:
X_train.shape


(1017, 38)

In [13]:
dt = DecisionTreeRegressor()
dt.fit(X_train,y_train)

In [14]:
y_pred = dt.predict(X_test)

In [15]:
r2_score(y_test,y_pred)

0.8208865376484062

In [16]:
models = {
    "Random Forest": RandomForestRegressor(),
    "Gradient Boosting": GradientBoostingRegressor(),
    "XGBRegressor": XGBRegressor(),
    "CatBoosting Regressor": CatBoostRegressor(verbose=False),
    "AdaBoost Regressor": AdaBoostRegressor(),
}


In [17]:
params = {
    "Random Forest": {
        'criterion': ['squared_error', 'friedman_mse', 'absolute_error', 'poisson'],
        # 'max_features': ['sqrt', 'log2', None],
        'n_estimators': [8, 16, 32, 64, 128, 256]
    },
    "Gradient Boosting": {
        'loss': ['squared_error', 'huber', 'absolute_error', 'quantile'],
        'learning_rate': [0.1, 0.01, 0.05, 0.001],
        # 'subsample': [0.6, 0.7, 0.75, 0.8, 0.85, 0.9],
        'criterion': ['squared_error', 'friedman_mse'],
        # 'max_features': ['auto', 'sqrt', 'log2'],
        'n_estimators': [8, 16, 32, 64, 128, 256]
    },
    "XGBRegressor": {
        'learning_rate': [0.1, 0.01, 0.05, 0.001],
        'n_estimators': [8, 16, 32, 64, 128, 256]
    },
    "CatBoosting Regressor": {
        'depth': [6, 8, 10],
        'learning_rate': [0.01, 0.05, 0.1],
        'iterations': [30, 50, 100]
    },
    "AdaBoost Regressor": {
        'learning_rate': [0.1, 0.01, 0.5, 0.001],
        # 'loss': ['linear', 'square', 'exponential'],
        'n_estimators': [8, 16, 32, 64, 128, 256]
    }
}


In [18]:
len(params.keys())

5

In [19]:
def error_Accuracy(true,pred):
    r2_accuracy = r2_score(y_true=true,y_pred=pred)
    mse = mean_squared_error(y_true=true,y_pred=pred)
    mae = mean_absolute_error(y_true=true,y_pred=pred)
    return r2_accuracy,mse,mae

In [20]:

class ModelTraining:
    def __init__(self):
        self.model_list = []
        self.accuracy_list = []
        self.best_params_dict = {}

    def train_model(self, models, params, X_train, y_train, X_test, y_test):
        for i, model_name in enumerate(models.keys()):
            model = models[model_name]
            param = params[model_name]

            grid_search_cv = GridSearchCV(estimator=model, param_grid=param, cv=5)
            grid_search_cv.fit(X_train, y_train)

            best_params = grid_search_cv.best_params_
            model.set_params(**best_params)
            model.fit(X_train, y_train)
            y_pred = model.predict(X_test)

            r2_accuracy, mse, mae = error_Accuracy(true=y_test, pred=y_pred)

            print(model_name)
            self.model_list.append(model_name)

            print("- Mean Absolute Error: {:.4f}".format(mae))
            print("- Mean Squared Error: {:.4f}".format(mse))
            print("- R2 Score: {:.4f}".format(r2_accuracy))
            print('\n')

            self.best_params_dict[model_name] = best_params

            self.accuracy_list.append(r2_accuracy)

    def return_accuracy_list(self):
        return self.accuracy_list
    
    def best_params(self):
        return self.best_params_dict


In [21]:
train_model = ModelTraining()

In [22]:
train_model.train_model(models=models,params=params,X_train=X_train,y_train=y_train,X_test=X_test,y_test=y_test)

Random Forest
- Mean Absolute Error: 0.1533
- Mean Squared Error: 0.0398
- R2 Score: 0.8898


Gradient Boosting
- Mean Absolute Error: 0.1594
- Mean Squared Error: 0.0406
- R2 Score: 0.8877


XGBRegressor
- Mean Absolute Error: 0.1396
- Mean Squared Error: 0.0346
- R2 Score: 0.9044


CatBoosting Regressor
- Mean Absolute Error: 0.1583
- Mean Squared Error: 0.0427
- R2 Score: 0.8818


AdaBoost Regressor
- Mean Absolute Error: 0.2092
- Mean Squared Error: 0.0675
- R2 Score: 0.8133




In [23]:
train_model.best_params_dict

{'Random Forest': {'criterion': 'squared_error', 'n_estimators': 128},
 'Gradient Boosting': {'criterion': 'squared_error',
  'learning_rate': 0.1,
  'loss': 'huber',
  'n_estimators': 256},
 'XGBRegressor': {'learning_rate': 0.1, 'n_estimators': 256},
 'CatBoosting Regressor': {'depth': 10,
  'iterations': 100,
  'learning_rate': 0.1},
 'AdaBoost Regressor': {'learning_rate': 0.5, 'n_estimators': 128}}

In [24]:
train_model.accuracy_list

[0.8898268396511232,
 0.8877299247686741,
 0.90441338344959,
 0.881847048029996,
 0.8133052210290694]

In [25]:
model_accuracy=pd.DataFrame(list(zip(list(models.keys()),train_model.accuracy_list)),columns=['model','accuracy'])
model_accuracy

Unnamed: 0,model,accuracy
0,Random Forest,0.889827
1,Gradient Boosting,0.88773
2,XGBRegressor,0.904413
3,CatBoosting Regressor,0.881847
4,AdaBoost Regressor,0.813305


In [26]:
best_models = model_accuracy.sort_values(by='accuracy',ascending=False).head(4)
best_models

Unnamed: 0,model,accuracy
2,XGBRegressor,0.904413
0,Random Forest,0.889827
1,Gradient Boosting,0.88773
3,CatBoosting Regressor,0.881847


In [27]:
best_models

Unnamed: 0,model,accuracy
2,XGBRegressor,0.904413
0,Random Forest,0.889827
1,Gradient Boosting,0.88773
3,CatBoosting Regressor,0.881847


## Voting Regressor

In [28]:
train_model.best_params_dict

{'Random Forest': {'criterion': 'squared_error', 'n_estimators': 128},
 'Gradient Boosting': {'criterion': 'squared_error',
  'learning_rate': 0.1,
  'loss': 'huber',
  'n_estimators': 256},
 'XGBRegressor': {'learning_rate': 0.1, 'n_estimators': 256},
 'CatBoosting Regressor': {'depth': 10,
  'iterations': 100,
  'learning_rate': 0.1},
 'AdaBoost Regressor': {'learning_rate': 0.5, 'n_estimators': 128}}

In [29]:
RandomForest =RandomForestRegressor(criterion='squared_error',n_estimators=256)
GradientBoost = GradientBoostingRegressor(criterion='squared_error',learning_rate=0.1,loss='huber',n_estimators=256)
Xgboost = XGBRegressor(learning_rate = 0.1,n_estimators = 256)
Catboost = CatBoostRegressor(learning_rate=0.5,n_estimators=256)

In [30]:
votingRegressor = VotingRegressor(
    estimators=[
        ('RandomForest',RandomForest),
        ('GradientBoost',GradientBoost),
        ('Xgboost',Xgboost),
        ('Catboost',Catboost)
    ]
)

In [31]:
votingRegressor.fit(X_train,y_train)

0:	learn: 0.4550098	total: 1.43ms	remaining: 365ms
1:	learn: 0.3455767	total: 2.75ms	remaining: 349ms
2:	learn: 0.2889211	total: 3.8ms	remaining: 321ms
3:	learn: 0.2644756	total: 4.91ms	remaining: 309ms
4:	learn: 0.2525025	total: 6.02ms	remaining: 302ms
5:	learn: 0.2404295	total: 7.16ms	remaining: 298ms
6:	learn: 0.2343844	total: 8.32ms	remaining: 296ms
7:	learn: 0.2312554	total: 9.41ms	remaining: 292ms
8:	learn: 0.2268876	total: 11.2ms	remaining: 308ms
9:	learn: 0.2209449	total: 12.5ms	remaining: 307ms
10:	learn: 0.2170065	total: 13.4ms	remaining: 299ms
11:	learn: 0.2142584	total: 14.4ms	remaining: 292ms
12:	learn: 0.2086405	total: 15.4ms	remaining: 288ms
13:	learn: 0.2054647	total: 16.7ms	remaining: 288ms
14:	learn: 0.2019314	total: 17.8ms	remaining: 286ms
15:	learn: 0.1986108	total: 18.8ms	remaining: 282ms
16:	learn: 0.1943821	total: 19.8ms	remaining: 279ms
17:	learn: 0.1919346	total: 20.9ms	remaining: 277ms
18:	learn: 0.1898856	total: 21.9ms	remaining: 273ms
19:	learn: 0.1879048	to

In [32]:
y_pred_voting_regressor = votingRegressor.predict(X_test)

In [37]:
r2_accuracy, mse, mae = error_Accuracy(true=y_test, pred=y_pred_voting_regressor)
print("- Mean Absolute Error: {:.4f}".format(mae))
print("- Mean Squared Error: {:.4f}".format(mse))
print("- R2 Score: {:.4f}".format(r2_accuracy))

- Mean Absolute Error: 0.1407
- Mean Squared Error: 0.0328
- R2 Score: 0.9093


# Finally Voting Regressor is the best Algo For my MODEL

## Pickling



In [41]:
os.makedirs(name='model',exist_ok=True)
pickle.dump(df,open(file='model/df.pkl',mode='wb'))
pickle.dump(votingRegressor,open(file='model/model.pkl',mode='wb'))