In [1]:
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LinearRegression
from sklearn.neighbors import KNeighborsRegressor
from sklearn.metrics import mean_squared_error
from sklearn.preprocessing import normalize
from sklearn.model_selection import KFold 
from sklearn.model_selection import cross_val_score
from sklearn.tree import DecisionTreeRegressor
from sklearn.ensemble import RandomForestRegressor
from xgboost import XGBRegressor
from sklearn.svm import SVR

In [2]:
# Colab-only setup: mount Google Drive, then switch into the project folder
# so that relative file paths used later resolve against it.
import os

from google.colab import drive

# Make the user's Drive visible under /content/drive.
drive.mount('/content/drive')

# Project folder on Drive - this will differ on your system.
PROJECT_DIR = "/content/drive/MyDrive/DSO 530 Project"
os.chdir(PROJECT_DIR)

Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).


In [3]:
# Load the training data; the relative path resolves against the working
# directory selected with os.chdir in the setup cell.
df_train = pd.read_csv("option_train.csv")

In [4]:
# Summary statistics of the raw data. Nothing is deleted here; the output only
# flags what has to be removed later: tau > 50 years, S = 0, and S as large as 40333.
df_train.describe()

Unnamed: 0,Value,S,K,tau,r
count,1678.0,1679.0,1678.0,1679.0,1680.0
mean,15.068709,464.402535,438.241955,0.437519,0.030235
std,14.040023,973.652179,23.408989,7.057555,0.000557
min,0.125,0.0,375.0,0.003968,0.02951
25%,2.255001,433.863864,420.0,0.119048,0.02982
50%,11.190967,442.634081,440.0,0.202381,0.03013
75%,25.747434,447.320414,455.0,0.285714,0.03054
max,60.149367,40333.0,500.0,250.0,0.03188


In [5]:
# Count missing values per column; rows containing NaNs need to be dropped.
df_train.isnull().sum()

Value    2
S        1
K        2
tau      1
r        0
BS       0
dtype: int64

In [6]:
# Remove every row that has at least one missing value; the raw frame
# df_train itself is left untouched.
df_options = df_train.dropna(axis=0, how="any")

# Sanity check: every per-column NaN count should now be zero.
df_options.isna().sum()

Value    0
S        0
K        0
tau      0
r        0
BS       0
dtype: int64

In [7]:
# Discard implausible observations: tau above 50 years, S above 40000, and
# S equal to (or below) zero. All three conditions go into one row mask.
plausible_rows = (
    (df_options['tau'] <= 50)
    & (df_options['S'] <= 40000)
    & (df_options['S'] > 0)
)
df_options = df_options[plausible_rows]

In [8]:
# Re-check the summary statistics after cleaning to confirm the extreme
# tau and S values are gone.
df_options.describe()

Unnamed: 0,Value,S,K,tau,r
count,1673.0,1673.0,1673.0,1673.0,1673.0
mean,15.096361,440.90085,438.21578,0.202023,0.030235
std,14.050476,7.529079,23.420806,0.099814,0.000557
min,0.125,425.472331,375.0,0.003968,0.02951
25%,2.220002,433.863864,420.0,0.119048,0.02982
50%,11.25,442.525366,440.0,0.202381,0.03013
75%,25.819526,447.320414,455.0,0.285714,0.03054
max,60.149367,455.880619,500.0,0.392857,0.03188


In [9]:
# Result DF: one row per model (model name and mean 5-fold CV R^2), filled in
# by the model cells further down.
# The score column label carries no leading whitespace, so it can be selected
# directly as regression_result_df['5-fold Mean R2'].
regression_result_df = pd.DataFrame(columns=['Model', '5-fold Mean R2'])

# We want to first build a regression model to predict the Value

In [10]:
# Designate the predictors (S, K, tau, r) and the response (Value) as NumPy arrays.
feature_columns = ['S', 'K', 'tau', 'r']
X = df_options[feature_columns].to_numpy()
y = df_options['Value'].to_numpy()

In [11]:
# Quick look at the predictor matrix (columns in order: S, K, tau, r).
print(X)

[[4.31623898e+02 4.20000000e+02 3.41269841e-01 3.01300000e-02]
 [4.27015526e+02 4.65000000e+02 1.66666667e-01 3.12600000e-02]
 [4.27762336e+02 4.15000000e+02 2.65873016e-01 3.11600000e-02]
 ...
 [4.28042219e+02 3.90000000e+02 1.70634921e-01 3.18800000e-02]
 [4.39081203e+02 4.80000000e+02 2.93650794e-01 2.96200000e-02]
 [4.32167692e+02 4.65000000e+02 2.18253968e-01 2.99300000e-02]]


In [12]:
# Shuffled 5-fold splitter with a fixed seed; it is passed to every
# cross_val_score call so all models are scored on the same folds.
kfolds = KFold(n_splits=5, shuffle=True, random_state=1)

# (Least squares) linear regression model: mean R-squared of 5-fold CV is 0.91

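- Note that we also tried normalizing the predictors before fitting the linear regression model and testing its robustness with CV, but the mean R-squared of the 5-fold CV is the same as without normalization. This is expected: a least-squares fit with an intercept is unaffected by linearly rescaling the predictors.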

In [13]:
# Flatten the (n_samples, 4) predictor matrix into a single column, so the
# S, K, tau and r values of each row end up stacked one after another.
# NOTE(review): a scaler fit on this column sees one global min/max across all
# four predictors rather than a separate range per predictor.
X_reshape = X.reshape(-1,1)
print(X_reshape)

[[4.31623898e+02]
 [4.20000000e+02]
 [3.41269841e-01]
 ...
 [4.65000000e+02]
 [2.18253968e-01]
 [2.99300000e-02]]


In [14]:
# Min-max scale the predictors before the linear regression and compare with
# the unscaled model: the mean R^2 of 5-fold CV is expected to be the same,
# because a least-squares fit is unaffected by rescaling each predictor.
#
# The scaler works on the (n_samples, 4) matrix X, so every predictor gets its
# own min/max (flattening X into one column would scale all predictors with a
# single global range). It sits inside a pipeline so that cross_val_score
# refits it on the training part of each fold and the held-out rows never
# influence the scaling.
from sklearn.pipeline import make_pipeline
from sklearn.preprocessing import MinMaxScaler

mms = MinMaxScaler()

# Scaling + regression as one estimator; cross_val_score clones it per fold.
linear_regresssion_model_norm = make_pipeline(mms, LinearRegression())

r2_linear_regresssion_model_norm_cv = cross_val_score(linear_regresssion_model_norm, X, y, cv=kfolds, scoring = 'r2')

print("Linear Regression Model (after Normalization) R-squared of 5-folds:", r2_linear_regresssion_model_norm_cv ,"(mean r squared:", np. mean(r2_linear_regresssion_model_norm_cv),")")


Linear Regression Model (after Normalization) R-squared of 5-folds: [0.91970852 0.91645431 0.90703987 0.89906318 0.9096128 ] (mean r squared: 0.9103757373884773 )


In [15]:
# Baseline: ordinary least-squares regression on the raw predictors, scored
# with 5-fold cross-validated R^2 (mean is about 0.91).
linear_regresssion_model = LinearRegression()

r2_linear_regresssion_model_cv = cross_val_score(
    linear_regresssion_model, X, y, scoring='r2', cv=kfolds
)

print(
    "Linear Regression Model R-squared of 5-folds:",
    r2_linear_regresssion_model_cv,
    "(mean r squared:",
    r2_linear_regresssion_model_cv.mean(),
    ")",
)

# Append this model's mean score as the next row of the results table.
regression_result_df.loc[len(regression_result_df)] = [
    'Linear Regression',
    r2_linear_regresssion_model_cv.mean(),
]

Linear Regression Model R-squared of 5-folds: [0.91970852 0.91645431 0.90703987 0.89906318 0.9096128 ] (mean r squared: 0.9103757373884772 )


# KNN (K=5) regression model: mean R-squared of 5-fold CV is 0.971

In [16]:

# K-nearest-neighbours regressor with k = 5, scored with 5-fold CV R^2.
# NOTE(review): the predictors are used unscaled; S and K are in the hundreds
# while tau and r are below 1, so neighbour distances are presumably dominated
# by S and K - worth confirming against a scaled variant.
KNN_model = KNeighborsRegressor(n_neighbors=5)

r2_KNN_regresssion_model_cv = cross_val_score(
    KNN_model, X, y, scoring='r2', cv=kfolds
)

print(
    "KNN Regression Model R-squared of 5-folds:",
    r2_KNN_regresssion_model_cv,
    "(mean r squared:",
    r2_KNN_regresssion_model_cv.mean(),
    ")",
)

# Append this model's mean score as the next row of the results table.
regression_result_df.loc[len(regression_result_df)] = [
    'KNN Regression',
    r2_KNN_regresssion_model_cv.mean(),
]

KNN Regression Model R-squared of 5-folds: [0.96758867 0.97089989 0.97249587 0.97199414 0.97266035] (mean r squared: 0.9711277864853083 )


# Decision_Tree_Model: mean R-squared of 5-fold CV is 0.9904

In [17]:
# Decision tree regressor (depth capped at 25), scored with 5-fold CV R^2.
# random_state is fixed because the tree's split search is randomized (feature
# order / tie-breaking); without a seed the reported score changes slightly
# from one run of the notebook to the next.
Decision_Tree_Model = DecisionTreeRegressor(max_depth=25, random_state=1)

r2_Decision_Tree_Model_cv = cross_val_score(Decision_Tree_Model, X, y, cv=kfolds, scoring = 'r2')

print("Decision Tree Model R-squared of 5-folds:", r2_Decision_Tree_Model_cv ,"(mean r squared:", np. mean(r2_Decision_Tree_Model_cv),")")

# Append this model's mean score as the next row of the results table.
regression_result_df.loc[len(regression_result_df.index)] = ['Decision Tree Regression', np. mean(r2_Decision_Tree_Model_cv)]

Decision Tree Model R-squared of 5-folds: [0.98928062 0.99029455 0.99069626 0.99064689 0.99130713] (mean r squared: 0.990445088763704 )


# Random_Forest_Model: mean R-squared of 5-fold CV is 0.995957

In [18]:
# Random forest regressor, scored with 5-fold CV R^2.
# The parameters are optimal according to mean R^2 of 5-fold CV.
# random_state is fixed because bootstrap sampling and feature sub-sampling
# are random; without a seed the reported score differs between runs.
Random_Forest_Model = RandomForestRegressor(n_estimators=100, max_depth=30, random_state=1)

r2_Random_Forest_Model_cv = cross_val_score(Random_Forest_Model, X, y, cv=kfolds, scoring = 'r2')

print("Random Forest Model R-squared of 5-folds:", r2_Random_Forest_Model_cv ,"(mean r squared:", np. mean(r2_Random_Forest_Model_cv),")")

# Append this model's mean score as the next row of the results table.
regression_result_df.loc[len(regression_result_df.index)] = ['Random Forest Regression', np. mean(r2_Random_Forest_Model_cv)]

Random Forest Model R-squared of 5-folds: [0.99436317 0.9958412  0.99723412 0.99602224 0.99632298] (mean r squared: 0.9959567423805149 )


# XGBoost: mean R-squared of 5-fold CV is 0.995827

In [19]:
# Gradient-boosted trees (XGBoost), scored with 5-fold CV R^2.
# The hyper-parameters below are the ones reported as optimal by mean R^2 of
# 5-fold CV; the search itself is not part of this cell.
Boosting_model = XGBRegressor(
    n_estimators=100,
    max_depth=10,
    eta=0.2,
    subsample=0.7,
    colsample_bytree=0.8,
)

r2_Boosting_Model_cv = cross_val_score(
    Boosting_model, X, y, scoring='r2', cv=kfolds
)

print(
    "Boosting Model R-squared of 5-folds:",
    r2_Boosting_Model_cv,
    "(mean r squared:",
    r2_Boosting_Model_cv.mean(),
    ")",
)

# Append this model's mean score as the next row of the results table.
regression_result_df.loc[len(regression_result_df)] = [
    'Boosting Regression',
    r2_Boosting_Model_cv.mean(),
]

Boosting Model R-squared of 5-folds: [0.99490032 0.9961635  0.99446725 0.99666447 0.99694179] (mean r squared: 0.9958274656810977 )


# SVM Model: mean R-squared of 5-fold CV is 0.894469

In [20]:
# Support vector regression with a linear kernel, scored with 5-fold CV R^2.
# NOTE(review): the predictors are passed in unscaled - consider checking
# whether scaling changes this model's score.
SVM_Model = SVR(C=1.0, epsilon=0.1, kernel='linear')

r2_SVM_Model_cv = cross_val_score(
    SVM_Model, X, y, scoring='r2', cv=kfolds
)

print(
    "SVM Model R-squared of 5-folds:",
    r2_SVM_Model_cv,
    "(mean r squared:",
    r2_SVM_Model_cv.mean(),
    ")",
)

# Append this model's mean score as the next row of the results table.
regression_result_df.loc[len(regression_result_df)] = [
    'SVM Regression',
    r2_SVM_Model_cv.mean(),
]

SVM Model R-squared of 5-folds: [0.90913961 0.90697327 0.8910591  0.87030715 0.89486515] (mean r squared: 0.8944688556306705 )


# Compare results from different models

In [21]:
# Show the mean 5-fold CV R^2 of every model side by side.
print(regression_result_df)

                      Model   5-fold Mean R2
0         Linear Regression         0.910376
1            KNN Regression         0.971128
2  Decision Tree Regression         0.990445
3  Random Forest Regression         0.995957
4       Boosting Regression         0.995827
5            SVM Regression         0.894469
