In [35]:
%matplotlib inline
import os
import pandas as pd
import numpy as np

# from sklearn.cross_validation import cross_val_predict
from sklearn import linear_model, tree, neighbors, svm, ensemble
import xgboost
from sklearn import metrics
from sklearn.model_selection import cross_val_score, cross_val_predict, train_test_split, GridSearchCV
from sklearn.preprocessing import MinMaxScaler
import matplotlib.pyplot as plt
import scikitplot as skplt
from math import sqrt

from tensorflow import keras
from keras.models import Sequential
from keras.layers import Dense
from keras.wrappers.scikit_learn import KerasRegressor

import warnings
warnings.filterwarnings('ignore')
warnings.filterwarnings("ignore", category=FutureWarning)
import time

Using TensorFlow backend.


In [18]:
# setting OS directory
os.chdir('C:\\Users\\rckar\\OneDrive\\Documents\\MSBA\\Fall Semester\\6420 Predictive Analytics\\HW3')

In [19]:
# Reading Data
df = pd.read_excel("HW3.xlsx")
df.head(2)
# df.dtypes

Unnamed: 0,sequence_number,US,source_a,source_c,source_b,source_d,source_e,source_m,source_o,source_h,...,source_x,source_w,Freq,last_update_days_ago,1st_update_days_ago,Web order,Gender=male,Address_is_res,Purchase,Spending
0,1,1,0,0,1,0,0,0,0,0,...,0,0,2,3662,3662,1,0,1,1,127.87
1,2,1,0,0,0,0,1,0,0,0,...,0,0,0,2900,2900,1,1,0,0,0.0


In [20]:
# Data pre-processing

# checking for null values
df.isnull().values.any()

False

In [21]:
df_purchase = df[df['Purchase'] == 1]

# Selecting required columns
X_df = df_purchase.iloc[:,1:23]
y_df = df_purchase.iloc[:,24:]

##### Creating the Train and Test split

In [22]:
X_train, X_test_holdout, y_train, y_test_holdout = train_test_split(X_df, y_df, test_size=0.25, random_state=42)

##### Normalizing data

In [23]:
scaler = MinMaxScaler(feature_range=(0, 1))

X_train_scaled = scaler.fit_transform(X_train)
X_train = pd.DataFrame(X_train_scaled)

x_test_scaled = scaler.fit_transform(X_test_holdout)
X_test_holdout = pd.DataFrame(x_test_scaled)

### Linear Regression

In [24]:
start = time.process_time()
# create linear regression object 
lr = linear_model.LinearRegression()

# cross validation 
scores = cross_val_score(lr, X_train, y_train, cv=5, scoring="neg_mean_absolute_error")

print("Below are the mean absolute errors for each model run")
print(scores)
print(" ")
print("Mean score: %0.2f " % (abs(scores.mean())))

# Model fit on training data and predicting on testing data
lr.fit(X_train,y_train)
y_pred = lr.predict(X_test_holdout)

# Model performance on testing data
print(" ")
print(" ")
print('Model performance on testing data')
print('Mean Absolute Error:', metrics.mean_absolute_error(y_test_holdout, y_pred))  
print('Root Mean Squared Error:', sqrt(metrics.mean_squared_error(y_test_holdout, y_pred)))
# print('Explained Variance:',metrics.explained_variance_score(y_test_holdout, y_pred))
print('r2:',metrics.r2_score(y_test_holdout, y_pred))

print(" ")
print('Time Taken = ', time.process_time() - start)

Below are the R2 scores for each model run
[0.46288714 0.51444955 0.54446809 0.61393709 0.49581015]
 
Mean score: 0.53 
 
 
Model performance on testing data
Mean Absolute Error: 83.30524573400128
Root Mean Squared Error: 129.14949277100777
r2: 0.5173325785488885


## Lasso Regression

In [25]:
start = time.process_time()

lasso = linear_model.Lasso()

#Hyper Parameter tuning
param_set ={'alpha': list(np.arange(0.1,2,0.2))}
grid_lasso = GridSearchCV(lasso, param_grid = param_set, cv=5, scoring='neg_mean_absolute_error', verbose = 0)
grid_lasso.fit(X_train,y_train)

# examine the best model
print("Best score achieved across all parameters: ", abs(grid_lasso.best_score_))

# Dictionary containing the parameters used to generate that score
print(" ")
print("Best parameters")
print (grid_lasso.best_params_)

print(" ")
print("Best estimator")
print (grid_lasso.best_estimator_)

# predicting on test data
lasso = grid_lasso.best_estimator_
lasso.fit(X_train,y_train)
y_pred = lasso.predict(X_test_holdout)
print(" ")
print(" ")
print('Model performance on testing data')
print('Mean Absolute Error:', metrics.mean_absolute_error(y_test_holdout, y_pred))  
print('Root Mean Squared Error:', sqrt(metrics.mean_squared_error(y_test_holdout, y_pred)))
print('r2:',metrics.r2_score(y_test_holdout, y_pred))
print('Time Taken = ', time.process_time() - start)

Lambda value chosen automatically:  0.3305464215472726
R2 value of fit:  0.5450757959237003
 
 
Model performance on testing data
Mean Absolute Error: 83.04858999339456
Root Mean Squared Error: 128.6950932677424
r2: 0.5207230371162469


## Ridge Regression

In [26]:
start = time.process_time()

ridge = linear_model.Ridge()

#Hyper Parameter tuning
param_set ={'alpha': list(np.arange(0.1,2,0.2))}
grid_ridge = GridSearchCV(ridge, param_grid = param_set, cv=5, scoring='neg_mean_absolute_error', verbose = 0)
grid_ridge.fit(X_train,y_train)

# examine the best model
print("Best score achieved across all parameters: ", abs(grid_ridge.best_score_))

# Dictionary containing the parameters used to generate that score
print(" ")
print("Best parameters")
print (grid_ridge.best_params_)

print(" ")
print("Best estimator")
print (grid_ridge.best_estimator_)

# predicting on test data
ridge = grid_ridge.best_estimator_
ridge.fit(X_train,y_train)
y_pred = ridge.predict(X_test_holdout)
print(" ")
print(" ")
print('Model performance on testing data')
print('Mean Absolute Error:', metrics.mean_absolute_error(y_test_holdout, y_pred))  
print('Root Mean Squared Error:', sqrt(metrics.mean_squared_error(y_test_holdout, y_pred)))
print('r2:',metrics.r2_score(y_test_holdout, y_pred))
print('Time Taken = ', time.process_time() - start)

Lambda value chosen automatically:  0.1
R2 value of fit:  0.546304599858776
 
 
Model performance on testing data
Mean Absolute Error: 83.07180538716989
Root Mean Squared Error: 129.07449520085848
r2: 0.5178929891647148


## KNN

In [27]:
start = time.process_time()

knn = neighbors.KNeighborsRegressor()

#Hyper Parameter tuning
param_set ={'n_neighbors': list(range(1,30)), 'weights': ["uniform", "distance"]}
grid_knn = GridSearchCV(knn, param_grid = param_set, cv=5, scoring='neg_mean_absolute_error', verbose = 0)
grid_knn.fit(X_train,y_train)

# examine the best model
print("Best score achieved across all parameters: ", abs(grid_knn.best_score_))

# Dictionary containing the parameters used to generate that score
print(" ")
print("Best parameters")
print (grid_knn.best_params_)

print(" ")
print("Best estimator")
print (grid_knn.best_estimator_)

# predicting on test data
knn = grid_knn.best_estimator_
knn.fit(X_train,y_train)
y_pred = knn.predict(X_test_holdout)
print(" ")
print(" ")
print('Model performance on testing data')
print('Mean Absolute Error:', metrics.mean_absolute_error(y_test_holdout, y_pred))  
print('Root Mean Squared Error:', sqrt(metrics.mean_squared_error(y_test_holdout, y_pred)))
print('r2:',metrics.r2_score(y_test_holdout, y_pred))
print('Time Taken = ', time.process_time() - start)

Best score achieved across all parameters:  0.16641958244962163
 
Best parameters
{'n_neighbors': 7, 'weights': 'distance'}
 
Best estimator
KNeighborsRegressor(algorithm='auto', leaf_size=30, metric='minkowski',
          metric_params=None, n_jobs=None, n_neighbors=7, p=2,
          weights='distance')
Mean Absolute Error: 93.00968436153802
Root Mean Squared Error: 169.2427642650885
r2: 0.17113691712822487


## Decision Tree

In [28]:
start = time.process_time()

DTree = tree.DecisionTreeRegressor()

#Hyper parameter tuning
param_set ={'max_depth': range(1,20), 'min_samples_split' : range(2,30)}
grid_DTree = GridSearchCV(DTree, param_grid = param_set, cv=5, scoring='neg_mean_absolute_error')
grid_DTree.fit(X_train,y_train)

# examine the best model
print("Best score achieved across all parameters: ", abs(grid_DTree.best_score_))

# Dictionary containing the parameters used to generate that score
print(" ")
print("Best parameters")
print (grid_DTree.best_params_)

print(" ")
print("Best estimator")
print (grid_DTree.best_estimator_)

# predicting on test data
DTree = grid_DTree.best_estimator_
DTree.fit(X_train,y_train)
y_pred = DTree.predict(X_test_holdout)

print(" ")
print(" ")
print('Model performance on testing data')
print('Mean Absolute Error:', metrics.mean_absolute_error(y_test_holdout, y_pred))  
print('Root Mean Squared Error:', sqrt(metrics.mean_squared_error(y_test_holdout, y_pred)))
print('r2:',metrics.r2_score(y_test_holdout, y_pred))

print('Time Taken = ', time.process_time() - start)

Best score achieved across all parameters:  0.44244670216858456
 
Best parameters
{'max_depth': 6, 'min_samples_split': 5}
 
Best estimator
DecisionTreeRegressor(criterion='mse', max_depth=6, max_features=None,
           max_leaf_nodes=None, min_impurity_decrease=0.0,
           min_impurity_split=None, min_samples_leaf=1,
           min_samples_split=5, min_weight_fraction_leaf=0.0,
           presort=False, random_state=None, splitter='best')
Mean Absolute Error: 77.21227384438299
Root Mean Squared Error: 145.351923367142
r2: 0.3886299740258399


## Support Vector Regression

In [29]:
start = time.process_time()

SVR = svm.SVR()
# Hyper parameter tuning using GridSearch
param_set = [{'kernel': ['rbf'], 'gamma': [1e-3, 1e-4],
                     'C': [1,2,10,100,1000], 'epsilon':[0.05,0.1,0.2,0.3,0.5]},
                    {'kernel': ['linear'], 'C': [1,2,5,10,100], 'epsilon':[0.05,0.1,0.2,0.3,0.5]}]
grid_SVR = GridSearchCV(SVR, param_grid = param_set, cv=5, scoring='neg_mean_absolute_error')
grid_SVR.fit(X_train,y_train)

# examine the best model
print("Best score achieved across all parameters: ", abs(grid_SVR.best_score_))

# Dictionary containing the parameters used to generate that score
print(" ")
print("Best parameters")
print (grid_SVR.best_params_)

print(" ")
print("Best estimator")
print (grid_SVR.best_estimator_)

# predicting on test data
SVR = grid_SVR.best_estimator_
SVR.fit(X_train,y_train)
y_pred = SVR.predict(X_test_holdout)

print(" ")
print(" ")
print('Model performance on testing data')
print('Mean Absolute Error:', metrics.mean_absolute_error(y_test_holdout, y_pred))  
print('Root Mean Squared Error:', sqrt(metrics.mean_squared_error(y_test_holdout, y_pred)))
print('r2:',metrics.explained_variance_score(y_test_holdout, y_pred))
print('Time Taken = ', time.process_time() - start)

Best score achieved across all parameters:  0.4501830423622665
 
Best parameters
{'C': 100, 'epsilon': 0.5, 'kernel': 'linear'}
 
Best estimator
SVR(C=100, cache_size=200, coef0=0.0, degree=3, epsilon=0.5,
  gamma='auto_deprecated', kernel='linear', max_iter=-1, shrinking=True,
  tol=0.001, verbose=False)
Mean Absolute Error: 71.54832519913802
Root Mean Squared Error: 136.11685262380934
Explained Variance: 0.4678513127886078


## Ensemble model : Random Forest

In [42]:
start = time.process_time()
RF = ensemble.RandomForestRegressor()

# Hyper parameter tuning using GridSearch
param_set ={'max_depth': [3,10,20],
            'min_samples_split' :[4,5,10],
            'n_estimators': [100,250,500],
            'bootstrap':[True, False] ,
            'max_features':['auto','sqrt'],
            'n_jobs':[-1]
           }
grid_RF = GridSearchCV(RF, param_grid = param_set, cv=5, scoring='neg_mean_absolute_error')
grid_RF.fit(X_train,y_train)

# examine the best model
print("Best score achieved across all parameters: ", abs(grid_RF.best_score_))

# Dictionary containing the parameters used to generate that score
print(" ")
print("Best parameters")
print (grid_RF.best_params_)

print(" ")
print("Best estimator")
print (grid_RF.best_estimator_)

# predicting on test data
RF = grid_RF.best_estimator_
RF.fit(X_train,y_train)
y_pred = RF.predict(X_test_holdout)

print(" ")
print(" ")
print('Model performance on testing data')
print('Mean Absolute Error:', metrics.mean_absolute_error(y_test_holdout, y_pred))  
print('Root Mean Squared Error:', sqrt(metrics.mean_squared_error(y_test_holdout, y_pred)))
print('r2:',metrics.explained_variance_score(y_test_holdout, y_pred))
print(" ")
print('Time Taken = ', time.process_time() - start)

Best score achieved across all parameters:  0.5294118843528809
 
Best parameters
{'bootstrap': True, 'max_depth': 5, 'max_features': 'auto', 'min_samples_split': 4, 'n_estimators': 200}
 
Best estimator
RandomForestRegressor(bootstrap=True, criterion='mse', max_depth=5,
           max_features='auto', max_leaf_nodes=None,
           min_impurity_decrease=0.0, min_impurity_split=None,
           min_samples_leaf=1, min_samples_split=4,
           min_weight_fraction_leaf=0.0, n_estimators=200, n_jobs=None,
           oob_score=False, random_state=None, verbose=0, warm_start=False)
Mean Absolute Error: 73.82155270690394
Root Mean Squared Error: 128.3942691461728
r2: 0.5233405337219695
 
Time Taken =  11.25


## Ensemble model : XGBoost

In [40]:
start = time.process_time()
XGB = xgboost.XGBRegressor()

# Hyper parameter tuning using GridSearch
param_set ={'learning_rate': [0.01,0.05,0.10,0.20,0.30],
            'max_depth' : [2,4,5,10,15],
            'n_estimators': [100,250,500],
            'gamma':[0.05, 0.1, 0.2, 0.5] ,
            'colsample_bytree':[0.3, 0.6, 0.8, 1.0],
            'reg_alpha':[0,0.5, 1],
            'reg_lambda':[1,1.5,2],
            'n_jobs':[-1]
           }

grid_XGB = GridSearchCV(XGB, param_grid = param_set, cv=5, scoring='neg_mean_absolute_error')
grid_XGB.fit(X_train,y_train)

# examine the best model
print("Best score achieved across all parameters: ", abs(grid_XGB.best_score_))

# Dictionary containing the parameters used to generate that score
print(" ")
print("Best parameters")
print (grid_XGB.best_params_)

print(" ")
print("Best estimator")
print (grid_XGB.best_estimator_)

# predicting on test data
XGB = grid_XGB.best_estimator_
XGB.fit(X_train,y_train)
y_pred = XGB.predict(X_test_holdout)

# print('Mean Absolute Error:', metrics.mean_absolute_error(y_test_holdout, y_pred))  
# print('Root Mean Squared Error:', sqrt(metrics.mean_squared_error(y_test_holdout, y_pred)))
# print('r2:',metrics.explained_variance_score(y_test_holdout, y_pred))
# print(" ")
print('Time Taken = ', time.process_time() - start)

























Best score achieved across all parameters:  0.5463375340560351
 
Best parameters
{'colsample_bytree': 0.6, 'gamma': 0.05, 'learning_rate': 0.05, 'max_depth': 2, 'n_estimators': 200, 'reg_alpha': 0.5, 'reg_lambda': 1.5}
 
Best estimator
XGBRegressor(base_score=0.5, booster='gbtree', colsample_bylevel=1,
       colsample_bynode=1, colsample_bytree=0.6, gamma=0.05,
       importance_type='gain', learning_rate=0.05, max_delta_step=0,
       max_depth=2, min_child_weight=1, missing=None, n_estimators=200,
       n_jobs=1, nthread=None, objective='reg:linear', random_state=0,
       reg_alpha=0.5, reg_lambda=1.5, scale_pos_weight=1, seed=None,
       silent=None, subsample=1, verbosity=1)
Mean Absolute Error: 78.80273122936248
Root Mean Squared Error: 124.8673882355542
r2: 0.561773464736743
 
Time Taken =  195.078125


In [None]:
# predicting on test data
XGB = grid_XGB.best_estimator_
XGB.fit(X_train,y_train)
y_pred = XGB.predict(X_test_holdout)

print(" ")
print(" ")
print('Model performance on testing data')

print('Mean Absolute Error:', metrics.mean_absolute_error(y_test_holdout, y_pred))  
print('Root Mean Squared Error:', sqrt(metrics.mean_squared_error(y_test_holdout, y_pred)))
print('r2:',metrics.explained_variance_score(y_test_holdout, y_pred))

## Neural Network

In [37]:
def build_model():
    model = Sequential()
    model.add(Dense(units=44, activation = 'relu', input_dim=22))
    model.add(Dense(units=22, activation = 'relu', input_dim=22))
    model.add(Dense(units=1))
    model.compile(optimizer='adam', loss='mean_absolute_error',  metrics=['mae','mse'])
    return model

In [39]:
start = time.process_time()

NN_Regressor = KerasRegressor(build_fn=build_model, batch_size=40,epochs=100)    

scores = cross_val_score(NN_Regressor, X_train, y_train, cv=5, scoring="neg_mean_absolute_error")
print("Below are the scores for each model run")
print(scores)
print(" ")
print("Mean and variance: %0.2f (+/- %0.2f)" % (abs(scores.mean()), scores.std() * 2))

NN = NN_Regressor.fit(X_train,y_train)
y_pred= NN_Regressor.predict(X_test_holdout)
print(" ")
print(" ")
print('Model performance on testing data')

print('Mean Absolute Error:', metrics.mean_absolute_error(y_test_holdout, y_pred))  
print('Root Mean Squared Error:', metrics.mean_squared_error(y_test_holdout, y_pred))

print('Time Taken = ', time.process_time() - start)

Epoch 1/10
Epoch 2/10
Epoch 3/10
Epoch 4/10
Epoch 5/10
Epoch 6/10
Epoch 7/10
Epoch 8/10
Epoch 9/10
Epoch 10/10
Epoch 1/10
Epoch 2/10
Epoch 3/10
Epoch 4/10
Epoch 5/10
Epoch 6/10
Epoch 7/10
Epoch 8/10
Epoch 9/10
Epoch 10/10
Epoch 1/10
Epoch 2/10
Epoch 3/10
Epoch 4/10
Epoch 5/10
Epoch 6/10
Epoch 7/10
Epoch 8/10
Epoch 9/10
Epoch 10/10
Epoch 1/10
Epoch 2/10
Epoch 3/10
Epoch 4/10
Epoch 5/10
Epoch 6/10
Epoch 7/10
Epoch 8/10
Epoch 9/10
Epoch 10/10
Epoch 1/10
Epoch 2/10
Epoch 3/10
Epoch 4/10
Epoch 5/10
Epoch 6/10
Epoch 7/10
Epoch 8/10
Epoch 9/10
Epoch 10/10
Below are the scores for each model run
[ -94.2747879   -95.2595787  -114.11573029  -82.24007543 -105.07243474]
 
Mean and variance: -98.19 (+/- 21.53)
Epoch 1/10
Epoch 2/10
Epoch 3/10
Epoch 4/10
Epoch 5/10
Epoch 6/10
Epoch 7/10
Epoch 8/10
Epoch 9/10
Epoch 10/10
 
 
Mean Absolute Error: 96.0219929617536
Root Mean Squared Error: 194.14429759608623
