In [1]:
# import importants
import pandas as pd
import pickle
import numpy as np
import statsmodels.api as sm
from sklearn.linear_model import LinearRegression
from sklearn.linear_model import RidgeCV
from sklearn.model_selection import train_test_split
from sklearn.model_selection import KFold
from sklearn.model_selection import cross_val_score
from sklearn.metrics import r2_score
from sklearn.model_selection import cross_validate
from sklearn.linear_model import ElasticNetCV

In [2]:
# Cross Validation Starts Here:

In [3]:
with open('pickled_X5', 'rb') as f:
    X = pickle.load(f)

with open('pickled_Y5', 'rb') as f:
    Y = pickle.load(f)

In [4]:
# Splitting the data into training and testing sets from the best model
# Test size is 20% or total dataframe
# Specifying a random_state means we will get the same shuffle each time
# Setting a random_state is helpful for debugging since it ensures the model will train on the same data each time this is run
X_train, X_test, y_train, y_test = train_test_split(X, Y, test_size=.2, random_state=7)

In [5]:
# Specify the type of regression that will be used in cross-validation
# This variable will be referenced later

L_Reg = LinearRegression()

In [6]:
kf = KFold(n_splits=5, # number of folds
     shuffle=True, # randomizes order of rows before split
     random_state = 7) # using same number in future code/ rerunning allows us to replicate the split 

In [7]:
# Need to check for overfitting by comparing training scores to val scores
val_and_train_scores = cross_validate(L_Reg, X_train, y_train, 
                                      cv=kf, 
                                      scoring='r2',
                                      return_train_score=True)
# print('Validation Score:  ', val_and_train_scores['test_score']) # test_score is validation score
# print('Train Score: ', val_and_train_scores['train_score']) # train_score is training score

In [8]:
computer_model = sm.OLS(Y, X)
results = computer_model.fit()

In [9]:
# Printing the mean validation score with standard deviation, rounded to 2 decimals
val_score = np.round(np.mean(val_and_train_scores['test_score']), 3)
std_val = np.round(np.std(val_and_train_scores['test_score']), 3)

# Printing the mean training score with standard deviation, rounded to 2 decimals
train_score = np.round(np.mean(val_and_train_scores['train_score']), 3)
std_train = np.round(np.std(val_and_train_scores['train_score']), 3)

# Resulting Test Score
y_pred = results.predict(X_test)
test_r2 = r2_score(y_test, y_pred)
test_score = np.round(test_r2, 3)

# Print the difference between the training score and testing score to ensure the model isn't overfitting
overfit = np.round(abs(np.mean(val_and_train_scores['test_score'])-np.mean(val_and_train_scores['train_score'])), 4)


print('Validation Score:...', val_score, "+-", std_val)
print('Train Score:........', train_score, "+-", std_train)
print('Test Score:.........', test_score)
print('Overfit Score:......', overfit)
# Since there is only a very small difference between validation and training scores, we can infer the model is not overfitting!

Validation Score:... 0.776 +- 0.027
Train Score:........ 0.787 +- 0.007
Test Score:......... 0.703
Overfit Score:...... 0.0114


In [10]:
# Elastic Net Regularization Starts Here (Cross-Validation of this model at the end):
lambdas = [0.0001, 0.01, 0.1, 0.3, 0.5, 0.7, 1]
L1s = [0.1, 0.5, 0.7, 0.9, 0.95, 0.99, 1]

In [11]:
enet_model = ElasticNetCV(alphas=lambdas,
                         l1_ratio = L1s,
                         cv=kf, 
                          random_state=7)

In [12]:
# Training on 80% of available data, as separated earlier
enet_model.fit(X_train, y_train)

ElasticNetCV(alphas=[0.0001, 0.01, 0.1, 0.3, 0.5, 0.7, 1],
             cv=KFold(n_splits=5, random_state=7, shuffle=True),
             l1_ratio=[0.1, 0.5, 0.7, 0.9, 0.95, 0.99, 1], random_state=7)

In [13]:
# Now to cross-validate our elastic net regularization:
# Need to check for overfitting by comparing training scores to val scores
enet_val_and_train_scores = cross_validate(enet_model, X_train, y_train, 
                                      cv=kf, 
                                      scoring='r2',
                                      return_train_score=True)
# print('Validation Score:  ', enet_val_and_train_scores['test_score']) # test_score is validation score
# print('Train Score: ', enet_val_and_train_scores['train_score']) # train_score is training score

In [14]:
# Printing the mean validation score with standard deviation, rounded to 2 decimals
enet_val_score = np.round(np.mean(enet_val_and_train_scores['test_score']), 3)
enet_std_val = np.round(np.std(enet_val_and_train_scores['test_score']), 3)

# Printing the mean training score with standard deviation, rounded to 2 decimals
enet_train_score = np.round(np.mean(enet_val_and_train_scores['train_score']), 3)
enet_std_train = np.round(np.std(enet_val_and_train_scores['train_score']), 3)

# Final score on testing data (Data that was not included in training the model)
# Test Score from Elastic Net with 5-k-folds
enet_test_score = np.round(enet_model.score(X_test, y_test), 3) 

# Print the difference between the training score and testing score to ensure the model isn't overfitting
enet_overfit = np.round(abs(np.mean(enet_val_and_train_scores['test_score'])-np.mean(enet_val_and_train_scores['train_score'])), 4)

print('E-Net Validation Score:...', enet_val_score, "+-", enet_std_val)
print('E-Net Train Score:........', enet_train_score, "+-", enet_std_train)
print('E-Net Test Score:.........', enet_test_score)
print('Overfit Score:............', enet_overfit)

E-Net Validation Score:... 0.78 +- 0.029
E-Net Train Score:........ 0.785 +- 0.007
E-Net Test Score:......... 0.694
Overfit Score:............ 0.0058


In [15]:
# Conclusions:

# It can be seen that even though both the price_log model and enet model get the same average score for training
# and validation, the scores on each k-fold are not exactly the same. This is a sanity check to ensure we are in fact testing
# different models.

# It can be concluded that the enet model does not significantly improve the R^2 score