# Model training
After doing the data preprocessing and some EDA, we now shall begin our model training. The dataset is the subset of the EDA, you can use the data in Github.

In [2]:
import pandas as pd
import numpy as np
from sklearn.externals import joblib # used for save and reload models
from sklearn.model_selection import GridSearchCV
import time
import warnings
warnings.filterwarnings('ignore')

In [3]:
data = pd.read_csv('subset.csv')
print(data.shape) 
data.head()

(100000, 11)


Unnamed: 0,price,year,manufacturer,condition,cylinders,fuel,odometer,transmission,drive,type,paint_color
0,12500,2014.0,dodge,good,6 cylinders,other,112529.0,automatic,rwd,sedan,black
1,11899,2008.0,infiniti,good,8 cylinders,gas,124755.0,automatic,4wd,SUV,blue
2,16500,2012.0,ford,good,8 cylinders,gas,107000.0,automatic,4wd,pickup,white
3,6900,1985.0,chevrolet,excellent,8 cylinders,gas,129000.0,automatic,rwd,coupe,black
4,7600,2014.0,ford,excellent,4 cylinders,gas,42000.0,automatic,fwd,sedan,grey


In [4]:
# data['year'] = (data['year']-data.year.min()).astype(int)
# data['odometer'] = data['odometer'] // 5000 # scaler, integer division
data = data[data['price'] > 1000]
data = data[data['price'] < 40000]
# Rounded ['odometer'] to 5000
data['odometer'] = data['odometer'] // 5000
data = data[data['year'] > 110]

In [5]:
# separate the numerical values and categorical values
numerics = ['int8', 'int16', 'int32', 'int64', 'float16', 'float32', 'float64']
categorical = []
for col in data.columns:
    if data[col].dtype not in numerics: 
        categorical.append(col)

In [6]:
# label enconder
from sklearn.preprocessing import LabelEncoder
for col in categorical:
    if col in data.columns:
        le = LabelEncoder()
        le.fit(list(data[col].astype(str).values))
        data[col] = le.transform(list(data[col].astype(str).values))
data_new = data.copy()

In [7]:
temp = pd.DataFrame(data_new)
X = temp.drop(columns = ['price'], axis = 1)
y = temp['price'].values
X.shape,y.shape

((100000, 10), (100000,))

In [8]:
# train test split
from sklearn.model_selection import train_test_split
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=0)
X_train.shape, X_test.shape, y_train.shape, y_test.shape

((80000, 10), (20000, 10), (80000,), (20000,))

In [9]:
# standard scaler
from sklearn.preprocessing import StandardScaler
scaler = StandardScaler()
X_train_scl = pd.DataFrame(scaler.fit_transform(X_train), columns = X_train.columns)
X_test_scl = pd.DataFrame(scaler.fit_transform(X_test), columns = X_test.columns)

In [74]:
X_train_scl_copy = X_train_scl.copy()
X_test_scl_copy = X_test_scl.copy()
train = pd.DataFrame(X_train_scl_copy)
test = pd.DataFrame(X_test_scl_copy)
train['price'] = y_train
test['price'] = y_test

In [75]:
# Save the variables
joblib.dump(train,'train.pkl')
joblib.dump(test,'test.pkl')

['test.pkl']

# Training Models

In [12]:
# define the evaluation index
from sklearn import metrics
from sklearn.metrics import explained_variance_score, mean_absolute_error, mean_squared_error, r2_score
# to store the scores in the form of (train_score,test_score)
evs = []
mae = []
rmse = []
r2 = []
model_names = []

## Baseline Models


### Linear Regression

In [13]:
# training for Linear Regresion Model
from sklearn.linear_model import LinearRegression
lr = LinearRegression()
lr.fit(X_train_scl,y_train)

LinearRegression(copy_X=True, fit_intercept=True, n_jobs=None,
         normalize=False)

In [14]:
y_train_pred = lr.predict(X_train_scl)
train_evs = explained_variance_score(y_train,y_train_pred) 
train_mae = mean_absolute_error(y_train,y_train_pred)
train_rmse = mean_squared_error(y_train,y_train_pred)**0.5
train_r2 = r2_score(y_train,y_train_pred)
str1 = '''Training Score:
       Explained Variance Score(EVC): %.4f
       Mean Absolute Error(MAE): %.2f
       Root Mean Squard Error(RMSE): %.2f
       R Square(R2): %.4f''' %(train_evs,train_mae,train_rmse,train_r2)
print(str1)

Training Score:
       Explained Variance Score(EVC): 0.3853
       Mean Absolute Error(MAE): 4859.83
       Root Mean Squard Error(RMSE): 6699.26
       R Square(R2): 0.3853


In [15]:
y_test_pred = lr.predict(X_test_scl)
test_evs = explained_variance_score(y_test,y_test_pred) 
test_mae = mean_absolute_error(y_test,y_test_pred)
test_rmse = mean_squared_error(y_test,y_test_pred)**0.5
test_r2 = r2_score(y_test,y_test_pred)
str1 = '''Testing Score:
       Explained Variance Score(EVC): %.4f
       Mean Absolute Error(MAE): %.2f
       Root Mean Squard Error(RMSE): %.2f
       R Square(R2): %.4f''' %(test_evs,test_mae,test_rmse,test_r2)
print(str1)

Testing Score:
       Explained Variance Score(EVC): 0.3469
       Mean Absolute Error(MAE): 4940.68
       Root Mean Squard Error(RMSE): 6846.84
       R Square(R2): 0.3467


In [13]:
# # only store once
# evs.append((train_evs,test_evs))
# mae.append((train_mae,test_mae))
# rmse.append((train_rmse,test_rmse))
# r2.append((train_r2,test_r2))
# model_names.append('Linear Regression')

### Supoort Vector Regression

In [16]:
from sklearn.svm import LinearSVR
# # below codes are tested for selecting the best C
# # C = [0.01, 0.03, 0.1, 0.3, 1, 3, 10, 30, 100,200,300]
# C = [i for i in np.arange(2.5,3.5,0.1)]
# # C = [i for i in np.arange(2,2.3,0.05)]
# a,b,c,d = [],[],[],[]
# for i in C:
#     svr = LinearSVR(C = i)
#     svr.fit(X_train_scl, y_train)
#     y_train_pred = svr.predict(X_train_scl)
#     a.append(explained_variance_score(y_train,y_train_pred))
#     b.append(mean_absolute_error(y_train,y_train_pred))
#     c.append(mean_squared_error(y_train,y_train_pred)**0.5)
#     d.append(r2_score(y_train,y_train_pred))
# np.argmax(np.array(a)), np.argmin(np.array(b)), np.argmin(np.array(c)), np.argmax(np.array(d))

In [17]:
# C is 3.3, perfomance better
svr = LinearSVR(C = 3.3)
svr.fit(X_train_scl,y_train)

LinearSVR(C=3.3, dual=True, epsilon=0.0, fit_intercept=True,
     intercept_scaling=1.0, loss='epsilon_insensitive', max_iter=1000,
     random_state=None, tol=0.0001, verbose=0)

In [18]:
y_train_pred = svr.predict(X_train_scl)
train_evs = explained_variance_score(y_train,y_train_pred) 
train_mae = mean_absolute_error(y_train,y_train_pred)
train_rmse = mean_squared_error(y_train,y_train_pred)**0.5
train_r2 = r2_score(y_train,y_train_pred)
str1 = '''Training Score:
       Explained Variance Score(EVC): %.4f
       Mean Absolute Error(MAE): %.2f
       Root Mean Squard Error(RMSE): %.2f
       R Square(R2): %.4f''' %(train_evs,train_mae,train_rmse,train_r2)
print(str1)

Training Score:
       Explained Variance Score(EVC): 0.2367
       Mean Absolute Error(MAE): 4341.69
       Root Mean Squard Error(RMSE): 7624.04
       R Square(R2): 0.2039


In [19]:
y_test_pred = svr.predict(X_test_scl)
test_evs = explained_variance_score(y_test,y_test_pred) 
test_mae = mean_absolute_error(y_test,y_test_pred)
test_rmse = mean_squared_error(y_test,y_test_pred)**0.5
test_r2 = r2_score(y_test,y_test_pred)
str1 = '''Testing Score:
       Explained Variance Score(EVC): %.4f
       Mean Absolute Error(MAE): %.2f
       Root Mean Squard Error(RMSE): %.2f
       R Square(R2): %.4f''' %(test_evs,test_mae,test_rmse,test_r2)
print(str1)

Testing Score:
       Explained Variance Score(EVC): 0.1713
       Mean Absolute Error(MAE): 4375.88
       Root Mean Squard Error(RMSE): 7839.09
       R Square(R2): 0.1436


In [18]:
# # only store once
# evs.append((train_evs,test_evs))
# mae.append((train_mae,test_mae))
# rmse.append((train_rmse,test_rmse))
# r2.append((train_r2,test_r2))
# model_names.append('Support Vector Regression')

## SGDRegressor

In [21]:
from sklearn.linear_model import SGDRegressor
sgd = SGDRegressor()
sgd.fit(X_train_scl, y_train)

SGDRegressor(alpha=0.0001, average=False, early_stopping=False, epsilon=0.1,
       eta0=0.01, fit_intercept=True, l1_ratio=0.15,
       learning_rate='invscaling', loss='squared_loss', max_iter=None,
       n_iter=None, n_iter_no_change=5, penalty='l2', power_t=0.25,
       random_state=None, shuffle=True, tol=None, validation_fraction=0.1,
       verbose=0, warm_start=False)

In [22]:
y_train_pred = sgd.predict(X_train_scl)
train_evs = explained_variance_score(y_train,y_train_pred) 
train_mae = mean_absolute_error(y_train,y_train_pred)
train_rmse = mean_squared_error(y_train,y_train_pred)**0.5
train_r2 = r2_score(y_train,y_train_pred)
str1 = '''Training Score:
       Explained Variance Score(EVC): %.4f
       Mean Absolute Error(MAE): %.2f
       Root Mean Squard Error(RMSE): %.2f
       R Square(R2): %.4f''' %(train_evs,train_mae,train_rmse,train_r2)
print(str1)

Training Score:
       Explained Variance Score(EVC): 0.3818
       Mean Absolute Error(MAE): 4862.78
       Root Mean Squard Error(RMSE): 6718.69
       R Square(R2): 0.3817


In [23]:
y_test_pred = sgd.predict(X_test_scl)
test_evs = explained_variance_score(y_test,y_test_pred) 
test_mae = mean_absolute_error(y_test,y_test_pred)
test_rmse = mean_squared_error(y_test,y_test_pred)**0.5
test_r2 = r2_score(y_test,y_test_pred)
str1 = '''Testing Score:
       Explained Variance Score(EVC): %.4f
       Mean Absolute Error(MAE): %.2f
       Root Mean Squard Error(RMSE): %.2f
       R Square(R2): %.4f''' %(test_evs,test_mae,test_rmse,test_r2)
print(str1)

Testing Score:
       Explained Variance Score(EVC): 0.3447
       Mean Absolute Error(MAE): 4938.19
       Root Mean Squard Error(RMSE): 6857.39
       R Square(R2): 0.3446


In [22]:
# # only store once
# evs.append((train_evs,test_evs))
# mae.append((train_mae,test_mae))
# rmse.append((train_rmse,test_rmse))
# r2.append((train_r2,test_r2))
# model_names.append('SDGRegressor')

## Ridge Regression

In [24]:
from sklearn.linear_model import RidgeCV
alpha = [0.01, 0.03, 0.1, 0.3, 1, 3, 10]
rcv = RidgeCV(alphas= alpha, cv = 5, scoring='r2')
rcv.fit(X_train_scl,y_train)

RidgeCV(alphas=array([ 0.01,  0.03,  0.1 ,  0.3 ,  1.  ,  3.  , 10.  ]), cv=5,
    fit_intercept=True, gcv_mode=None, normalize=False, scoring='r2',
    store_cv_values=False)

In [25]:
y_train_pred = rcv.predict(X_train_scl)
train_evs = explained_variance_score(y_train,y_train_pred) 
train_mae = mean_absolute_error(y_train,y_train_pred)
train_rmse = mean_squared_error(y_train,y_train_pred)**0.5
train_r2 = r2_score(y_train,y_train_pred)
str1 = '''Training Score:
       Explained Variance Score(EVC): %.4f
       Mean Absolute Error(MAE): %.2f
       Root Mean Squard Error(RMSE): %.2f
       R Square(R2): %.4f''' %(train_evs,train_mae,train_rmse,train_r2)
print(str1)

Training Score:
       Explained Variance Score(EVC): 0.3853
       Mean Absolute Error(MAE): 4859.95
       Root Mean Squard Error(RMSE): 6699.26
       R Square(R2): 0.3853


In [26]:
y_test_pred = rcv.predict(X_test_scl)
test_evs = explained_variance_score(y_test,y_test_pred) 
test_mae = mean_absolute_error(y_test,y_test_pred)
test_rmse = mean_squared_error(y_test,y_test_pred)**0.5
test_r2 = r2_score(y_test,y_test_pred)
str1 = '''Testing Score:
       Explained Variance Score(EVC): %.4f
       Mean Absolute Error(MAE): %.2f
       Root Mean Squard Error(RMSE): %.2f
       R Square(R2): %.4f''' %(test_evs,test_mae,test_rmse,test_r2)
print(str1)

Testing Score:
       Explained Variance Score(EVC): 0.3469
       Mean Absolute Error(MAE): 4940.79
       Root Mean Squard Error(RMSE): 6846.82
       R Square(R2): 0.3467


# Tree models

## Decision Tree

In [27]:
from sklearn.tree import DecisionTreeRegressor
dt = DecisionTreeRegressor(max_depth=10,max_features='auto')
dt.fit(X_train_scl,y_train)

DecisionTreeRegressor(criterion='mse', max_depth=10, max_features='auto',
           max_leaf_nodes=None, min_impurity_decrease=0.0,
           min_impurity_split=None, min_samples_leaf=1,
           min_samples_split=2, min_weight_fraction_leaf=0.0,
           presort=False, random_state=None, splitter='best')

In [28]:
y_train_pred = dt.predict(X_train_scl)
train_evs = explained_variance_score(y_train,y_train_pred) 
train_mae = mean_absolute_error(y_train,y_train_pred)
train_rmse = mean_squared_error(y_train,y_train_pred)**0.5
train_r2 = r2_score(y_train,y_train_pred)
str1 = '''Training Score:
       Explained Variance Score(EVC): %.4f
       Mean Absolute Error(MAE): %.2f
       Root Mean Squard Error(RMSE): %.2f
       R Square(R2): %.4f''' %(train_evs,train_mae,train_rmse,train_r2)
print(str1)

Training Score:
       Explained Variance Score(EVC): 0.8326
       Mean Absolute Error(MAE): 2464.94
       Root Mean Squard Error(RMSE): 3496.42
       R Square(R2): 0.8326


In [29]:
y_test_pred = dt.predict(X_test_scl)
test_evs = explained_variance_score(y_test,y_test_pred) 
test_mae = mean_absolute_error(y_test,y_test_pred)
test_rmse = mean_squared_error(y_test,y_test_pred)**0.5
test_r2 = r2_score(y_test,y_test_pred)
str1 = '''Testing Score:
       Explained Variance Score(EVC): %.4f
       Mean Absolute Error(MAE): %.2f
       Root Mean Squard Error(RMSE): %.2f
       R Square(R2): %.4f''' %(test_evs,test_mae,test_rmse,test_r2)
print(str1)

Testing Score:
       Explained Variance Score(EVC): 0.8013
       Mean Absolute Error(MAE): 2625.76
       Root Mean Squard Error(RMSE): 3775.44
       R Square(R2): 0.8013


## Random Forest

This block takes training time for around 20 minutes
If you want to test for this model, you can
use the pre-trained model in the next sub-section of Random Forest
See in rf = joblib.load('RandomForest.pkl') function

In [30]:
from sklearn.ensemble import RandomForestRegressor
start = time.time()
rf = GridSearchCV(RandomForestRegressor(),param_grid={'n_estimators':[10,50,100,300],'max_depth':[5,10,15,20]},cv = 5)
# rf = RandomForestRegressor(n_estimators=100)
rf.fit(X_train_scl,y_train)
end = time.time()
print('Training time')
print(end-start)

Training time
1072.392615556717


In [34]:
rf.best_params_

{'max_depth': 20, 'n_estimators': 300}

In [68]:
# save the model
joblib.dump(rf,'RandomeForest.pkl')

['RandomeForest.pkl']

Load the pretrained model of Random Forest

In [32]:
y_train_pred = rf.predict(X_train_scl)
train_evs = explained_variance_score(y_train,y_train_pred) 
train_mae = mean_absolute_error(y_train,y_train_pred)
train_rmse = mean_squared_error(y_train,y_train_pred)**0.5
train_r2 = r2_score(y_train,y_train_pred)
str1 = '''Training Score:
       Explained Variance Score(EVC): %.4f
       Mean Absolute Error(MAE): %.2f
       Root Mean Squard Error(RMSE): %.2f
       R Square(R2): %.4f''' %(train_evs,train_mae,train_rmse,train_r2)
print(str1)

Training Score:
       Explained Variance Score(EVC): 0.9746
       Mean Absolute Error(MAE): 889.43
       Root Mean Squard Error(RMSE): 1362.40
       R Square(R2): 0.9746


In [33]:
y_test_pred = rf.predict(X_test_scl)
test_evs = explained_variance_score(y_test,y_test_pred) 
test_mae = mean_absolute_error(y_test,y_test_pred)
test_rmse = mean_squared_error(y_test,y_test_pred)**0.5
test_r2 = r2_score(y_test,y_test_pred)
str1 = '''Testing Score:
       Explained Variance Score(EVC): %.4f
       Mean Absolute Error(MAE): %.2f
       Root Mean Squard Error(RMSE): %.2f
       R Square(R2): %.4f''' %(test_evs,test_mae,test_rmse,test_r2)
print(str1)

Testing Score:
       Explained Variance Score(EVC): 0.8657
       Mean Absolute Error(MAE): 2012.86
       Root Mean Squard Error(RMSE): 3104.29
       R Square(R2): 0.8657


## Gradient Boosting
http://hyperopt.github.io/hyperopt/

In [7]:
train = pd.read_csv('train.csv')
test = pd.read_csv('test.csv')
# used pre-stored training and testing dataset
X_train_scl = train.drop(columns='price',axis = 1)
y_train = train['price']
X_test_scl = test.drop(columns='price',axis = 1)
y_test = test['price']
X_train_scl.shape, y_train.shape, X_test_scl.shape, y_test.shape

((80000, 10), (80000,), (20000, 10), (20000,))

In [8]:
from sklearn.ensemble import GradientBoostingRegressor
from sklearn.model_selection import cross_val_score
# ! pip install hyperopt
from hyperopt import STATUS_OK, Trials, fmin, hp, tpe, space_eval
import time
start = time.time()
def hyperopt_gb_score(params):
    clf = GradientBoostingRegressor(**params)
    current_score = cross_val_score(clf, X_train_scl, y_train, cv=5).mean()
    print(current_score, params)
    return current_score 
 
parameters = {
            'n_estimators': hp.choice('n_estimators', range(100, 1000)),
            'max_depth': hp.choice('max_depth', np.arange(2, 10, dtype=int))            
        }
 
best = fmin(fn=hyperopt_gb_score, space=parameters, algo=tpe.suggest, max_evals=5)
end = time.time()
print('best:')
print(best)
print('training time for 5-fold cross validation')
print(end-start)

0.873837205508892                                                                                                      
{'max_depth': 8, 'n_estimators': 124}                                                                                  
0.7911314320953385                                                                                                     
{'max_depth': 2, 'n_estimators': 113}                                                                                  
0.883081129379687                                                                                                      
{'max_depth': 8, 'n_estimators': 699}                                                                                  
0.882955730119062                                                                                                      
{'max_depth': 9, 'n_estimators': 874}                                                                                  
0.8516579340417476                      

In [59]:
params = space_eval(parameters, best)
params

{'max_depth': 2, 'n_estimators': 338}

In [62]:
parameter = {'max_depth': 2, 'n_estimators': 338}
grd_boost_reg = GradientBoostingRegressor(**parameter)
grd_boost_reg.fit(X_train_scl,y_train)

GradientBoostingRegressor(alpha=0.9, criterion='friedman_mse', init=None,
             learning_rate=0.1, loss='ls', max_depth=2, max_features=None,
             max_leaf_nodes=None, min_impurity_decrease=0.0,
             min_impurity_split=None, min_samples_leaf=1,
             min_samples_split=2, min_weight_fraction_leaf=0.0,
             n_estimators=338, n_iter_no_change=None, presort='auto',
             random_state=None, subsample=1.0, tol=0.0001,
             validation_fraction=0.1, verbose=0, warm_start=False)

In [67]:
joblib.dump(grd_boost_reg,'GradientBoostingRegressor.pkl')

['GradientBoostingRegressor.pkl']

In [63]:
y_train_pred = grd_boost_reg.predict(X_train_scl)
train_evs = explained_variance_score(y_train,y_train_pred) 
train_mae = mean_absolute_error(y_train,y_train_pred)
train_rmse = mean_squared_error(y_train,y_train_pred)**0.5
train_r2 = r2_score(y_train,y_train_pred)
str1 = '''Training Score:
       Explained Variance Score(EVC): %.4f
       Mean Absolute Error(MAE): %.2f
       Root Mean Squard Error(RMSE): %.2f
       R Square(R2): %.4f''' %(train_evs,train_mae,train_rmse,train_r2)
print(str1)

Training Score:
       Explained Variance Score(EVC): 0.8193
       Mean Absolute Error(MAE): 2548.04
       Root Mean Squard Error(RMSE): 3632.09
       R Square(R2): 0.8193


In [64]:
y_test_pred = grd_boost_reg.predict(X_test_scl)
test_evs = explained_variance_score(y_test,y_test_pred) 
test_mae = mean_absolute_error(y_test,y_test_pred)
test_rmse = mean_squared_error(y_test,y_test_pred)**0.5
test_r2 = r2_score(y_test,y_test_pred)
str1 = '''Testing Score:
       Explained Variance Score(EVC): %.4f
       Mean Absolute Error(MAE): %.2f
       Root Mean Squard Error(RMSE): %.2f
       R Square(R2): %.4f''' %(test_evs,test_mae,test_rmse,test_r2)
print(str1)

Testing Score:
       Explained Variance Score(EVC): 0.8046
       Mean Absolute Error(MAE): 2606.67
       Root Mean Squard Error(RMSE): 3744.42
       R Square(R2): 0.8046


## XGBoost
https://blog.csdn.net/weixin_41358871/article/details/81541482

In [70]:
import xgboost as xgb
start = time.time()
para = {}
xgb_cls = xgb.XGBRegressor(objective= 'reg:squarederror',tree_method = 'gpu_hist') 
parameters = {'n_estimators': [60, 100, 120, 140], 
              'learning_rate': [0.01, 0.1],
              'max_depth': [5, 7],
              'reg_lambda': [0.5]}
xgb_reg = GridSearchCV(estimator=xgb_cls, param_grid=parameters, cv=5).fit(X_train_scl, y_train)
print("Best score: %0.3f" % xgb_reg.best_score_)
print("Best parameters set:", xgb_reg.best_params_)
end = time.time()
print(end-start)

Best score: 0.869
Best parameters set: {'learning_rate': 0.1, 'max_depth': 7, 'n_estimators': 140, 'reg_lambda': 0.5}
228.7132592201233


In [73]:
joblib.dump(xgb_reg,'XGBoostRegressor.pkl')

['XGBoostRegressor.pkl']

In [71]:
y_train_pred = xgb_reg.predict(X_train_scl)
train_evs = explained_variance_score(y_train,y_train_pred) 
train_mae = mean_absolute_error(y_train,y_train_pred)
train_rmse = mean_squared_error(y_train,y_train_pred)**0.5
train_r2 = r2_score(y_train,y_train_pred)
str1 = '''Training Score:
       Explained Variance Score(EVC): %.4f
       Mean Absolute Error(MAE): %.2f
       Root Mean Squard Error(RMSE): %.2f
       R Square(R2): %.4f''' %(train_evs,train_mae,train_rmse,train_r2)
print(str1)

Training Score:
       Explained Variance Score(EVC): 0.8898
       Mean Absolute Error(MAE): 1954.43
       Root Mean Squard Error(RMSE): 2836.02
       R Square(R2): 0.8898


In [72]:
y_test_pred = xgb_reg.predict(X_test_scl)
test_evs = explained_variance_score(y_test,y_test_pred) 
test_mae = mean_absolute_error(y_test,y_test_pred)
test_rmse = mean_squared_error(y_test,y_test_pred)**0.5
test_r2 = r2_score(y_test,y_test_pred)
str1 = '''Testing Score:
       Explained Variance Score(EVC): %.4f
       Mean Absolute Error(MAE): %.2f
       Root Mean Squard Error(RMSE): %.2f
       R Square(R2): %.4f''' %(test_evs,test_mae,test_rmse,test_r2)
print(str1)

Testing Score:
       Explained Variance Score(EVC): 0.7188
       Mean Absolute Error(MAE): 3196.60
       Root Mean Squard Error(RMSE): 4602.66
       R Square(R2): 0.7048


# Nueral Network

In [48]:
from sklearn.neural_network import MLPRegressor
start = time.time()
mlp = MLPRegressor()
param_grid = {'hidden_layer_sizes': [i for i in range(2,20)],
              'activation': ['relu'],
              'solver': ['adam'],
              'learning_rate': ['constant'],
              'learning_rate_init': [0.01],
              'power_t': [0.5],
              'alpha': [0.0001],
              'max_iter': [1000],
              'early_stopping': [True],
              'warm_start': [False]}
mlp_GS = GridSearchCV(mlp, param_grid=param_grid, 
                   cv=10, verbose=True, pre_dispatch='2*n_jobs')
mlp_GS.fit(X_train_scl, y_train)
end = time.time()
print('Training Time:')
print(end - start)

Fitting 10 folds for each of 18 candidates, totalling 180 fits


[Parallel(n_jobs=1)]: Using backend SequentialBackend with 1 concurrent workers.
[Parallel(n_jobs=1)]: Done 180 out of 180 | elapsed: 73.5min finished


Training Time:
4465.311811923981


In [69]:
joblib.dump(mlp_GS,'MlPRegressor.pkl')

['MlPRegressor.pkl']

# Ensemble Learning

## Bagging 

In [33]:
from sklearn.ensemble import BaggingRegressor
bagging = BaggingRegressor()
bagging.fit(X_train_scl, y_train)

BaggingRegressor(base_estimator=None, bootstrap=True,
         bootstrap_features=False, max_features=1.0, max_samples=1.0,
         n_estimators=10, n_jobs=None, oob_score=False, random_state=None,
         verbose=0, warm_start=False)

In [34]:
y_train_pred = bagging.predict(X_train_scl)
train_evs = explained_variance_score(y_train,y_train_pred) 
train_mae = mean_absolute_error(y_train,y_train_pred)
train_rmse = mean_squared_error(y_train,y_train_pred)**0.5
train_r2 = r2_score(y_train,y_train_pred)
str1 = '''Training Score:
       Explained Variance Score(EVC): %.4f
       Mean Absolute Error(MAE): %.2f
       Root Mean Squard Error(RMSE): %.2f
       R Square(R2): %.4f''' %(train_evs,train_mae,train_rmse,train_r2)
print(str1)

Training Score:
       Explained Variance Score(EVC): 0.9092
       Mean Absolute Error(MAE): 1625.30
       Root Mean Squard Error(RMSE): 2682.44
       R Square(R2): 0.9091


In [36]:
y_test_pred = bagging.predict(X_test_scl)
test_evs = explained_variance_score(y_test,y_test_pred) 
test_mae = mean_absolute_error(y_test,y_test_pred)
test_rmse = mean_squared_error(y_test,y_test_pred)**0.5
test_r2 = r2_score(y_test,y_test_pred)
str1 = '''Testing Score:
       Explained Variance Score(EVC): %.4f
       Mean Absolute Error(MAE): %.2f
       Root Mean Squard Error(RMSE): %.2f
       R Square(R2): %.4f''' %(test_evs,test_mae,test_rmse,test_r2)
print(str1)

Testing Score:
       Explained Variance Score(EVC): 0.7153
       Mean Absolute Error(MAE): 3227.37
       Root Mean Squard Error(RMSE): 4795.71
       R Square(R2): 0.7153


## AdaBoost

In [37]:
from sklearn.ensemble import AdaBoostRegressor
Ada_Boost = AdaBoostRegressor()
Ada_Boost.fit(X_train_scl, y_train)

AdaBoostRegressor(base_estimator=None, learning_rate=1.0, loss='linear',
         n_estimators=50, random_state=None)

In [39]:
y_train_pred = Ada_Boost.predict(X_train_scl)
train_evs = explained_variance_score(y_train,y_train_pred) 
train_mae = mean_absolute_error(y_train,y_train_pred)
train_rmse = mean_squared_error(y_train,y_train_pred)**0.5
train_r2 = r2_score(y_train,y_train_pred)
str1 = '''Training Score:
       Explained Variance Score(EVC): %.4f
       Mean Absolute Error(MAE): %.2f
       Root Mean Squard Error(RMSE): %.2f
       R Square(R2): %.4f''' %(train_evs,train_mae,train_rmse,train_r2)
print(str1)

Training Score:
       Explained Variance Score(EVC): 0.5148
       Mean Absolute Error(MAE): 5516.57
       Root Mean Squard Error(RMSE): 6597.94
       R Square(R2): 0.4503


In [40]:
y_test_pred = Ada_Boost.predict(X_test_scl)
test_evs = explained_variance_score(y_test,y_test_pred) 
test_mae = mean_absolute_error(y_test,y_test_pred)
test_rmse = mean_squared_error(y_test,y_test_pred)**0.5
test_r2 = r2_score(y_test,y_test_pred)
str1 = '''Testing Score:
       Explained Variance Score(EVC): %.4f
       Mean Absolute Error(MAE): %.2f
       Root Mean Squard Error(RMSE): %.2f
       R Square(R2): %.4f''' %(test_evs,test_mae,test_rmse,test_r2)
print(str1)

Testing Score:
       Explained Variance Score(EVC): 0.5087
       Mean Absolute Error(MAE): 5572.93
       Root Mean Squard Error(RMSE): 6669.31
       R Square(R2): 0.4493
