In [None]:
import numpy as np
import pandas as pd
import xgboost as xgb
from sklearn import svm
from sklearn.discriminant_analysis import LinearDiscriminantAnalysis
from sklearn.ensemble import RandomForestClassifier
from sklearn.linear_model import LogisticRegression
from sklearn.model_selection import StratifiedKFold
from sklearn.model_selection import cross_val_score
from sklearn.model_selection import train_test_split
from sklearn.naive_bayes import GaussianNB
from sklearn.neighbors import KNeighborsClassifier
from sklearn.pipeline import make_pipeline
from sklearn.preprocessing import StandardScaler
from sklearn.tree import DecisionTreeClassifier

In [None]:
# Colab-specific setup: attach Google Drive and work from the project folder.
import os

from google.colab import drive

# Mount Google Drive inside the Colab virtual machine.
drive.mount('/content/drive')

# Make the project folder the working directory so that relative file paths
# resolve against it. This location will differ on your system.
project_dir = "/content/drive/MyDrive/DSO 530 Project"
os.chdir(project_dir)

Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).


In [None]:
# Load the raw option data; it is kept under its own name so this cell can be
# re-run without re-reading an already filtered frame.
df_raw = pd.read_csv("option_train.csv")

# Keep only rows with tau <= 50 and 0 < S <= 40000.
valid_rows = (df_raw['tau'] <= 50) & (df_raw['S'] <= 40000) & (df_raw['S'] > 0)

# Numeric encoding of the BS label used as the classification target.
bs_label_codes = {'Over': 1, 'Under': 0}

# Build the training frame in a single chain. `.assign` works on a fresh copy,
# so no column is ever written onto a filtered slice (which is what triggers
# pandas' SettingWithCopyWarning), and `.map` gives an explicit numeric column
# instead of relying on `replace` to downcast an object column.
# Rows with any missing value -- including labels other than 'Over'/'Under',
# which `.map` turns into NaN -- are dropped before the integer cast.
df_train = (
    df_raw.loc[valid_rows]
    .assign(BS_updated=lambda frame: frame['BS'].map(bs_label_codes))
    .dropna()
    .astype({'BS_updated': int})
)

In [None]:
# Preview the first five rows of the cleaned training data.
df_train.head(5)

Unnamed: 0,Value,S,K,tau,r,BS,BS_updated
0,21.670404,431.623898,420.0,0.34127,0.03013,Under,0
1,0.125,427.015526,465.0,0.166667,0.03126,Over,1
2,20.691244,427.762336,415.0,0.265873,0.03116,Under,0
3,1.035002,451.711658,460.0,0.063492,0.02972,Over,1
4,39.55302,446.718974,410.0,0.166667,0.02962,Under,0


In [None]:
# Summary statistics (count, mean, std, min, quartiles, max) of the numeric columns.
df_train.describe(percentiles=[0.25, 0.5, 0.75])

Unnamed: 0,Value,S,K,tau,r,BS_updated
count,1673.0,1673.0,1673.0,1673.0,1673.0,1673.0
mean,15.096361,440.90085,438.21578,0.202023,0.030235,0.435744
std,14.050476,7.529079,23.420806,0.099814,0.000557,0.496002
min,0.125,425.472331,375.0,0.003968,0.02951,0.0
25%,2.220002,433.863864,420.0,0.119048,0.02982,0.0
50%,11.25,442.525366,440.0,0.202381,0.03013,0.0
75%,25.819526,447.320414,455.0,0.285714,0.03054,1.0
max,60.149367,455.880619,500.0,0.392857,0.03188,1.0


In [None]:
# Empty summary table; each classifier cell appends one row
# (classifier name, mean cross-validated error in percent).
result_columns = ['Classifier', 'Error %']
result_df = pd.DataFrame(columns=result_columns)

In [None]:
# Predictors and target as NumPy arrays for scikit-learn.
feature_columns = ['S', 'K', 'tau', 'r']
X = df_train[feature_columns].values
y = df_train['BS_updated'].values

# Stratified 5-fold splitter reused by every classifier cell; shuffling with a
# fixed seed means all models are evaluated on the same folds.
kfold = StratifiedKFold(n_splits=5, shuffle=True, random_state=42)

In [None]:
# Flatten the feature matrix into a single column, row by row, so the values
# of S, K, tau and r all end up stacked in the same column.
X_reshape = np.reshape(X, (-1, 1))
print(X_reshape)

[[4.31623898e+02]
 [4.20000000e+02]
 [3.41269841e-01]
 ...
 [4.65000000e+02]
 [2.18253968e-01]
 [2.99300000e-02]]


# Create standardized data based on the predictors X, i.e. ['S','K','tau','r'] 

In [None]:
from sklearn.preprocessing import StandardScaler

# Standardize each predictor on its own scale. StandardScaler works column by
# column, so it must be fitted on the 2-D matrix X (one column per predictor).
# Fitting it on X flattened to a single column would apply one global mean and
# standard deviation to S, K, tau and r together, which merely shifts and
# rescales the whole matrix and leaves the predictors on very different scales.
stdsc = StandardScaler()
X_standardized = stdsc.fit_transform(X)

# Same shape as X (n_samples, 4), so it can be passed to cross_val_score().
X_correct_dimensions = X_standardized
print(X_correct_dimensions)

[[ 0.96237366  0.90955384 -0.99740515 -0.998819  ]
 [ 0.94143289  1.11403703 -0.99819856 -0.99881386]
 [ 0.94482645  0.88683349 -0.99774776 -0.99881432]
 ...
 [ 0.94609825  0.77323172 -0.99818053 -0.99881104]
 [ 0.99626018  1.18219809 -0.99762154 -0.99882131]
 [ 0.9648447   1.11403703 -0.99796415 -0.99881991]]


In [None]:
# KNN on the raw (unstandardized) predictors

knn = KNeighborsClassifier()

# One accuracy value per fold of the shared stratified splitter.
accuracy = cross_val_score(knn, X, y, scoring='accuracy', cv=kfold)

# Misclassification rate per fold is 1 - accuracy.
error_list = list(1 - accuracy)
mean_error = round(np.mean(error_list), 4)

print(
    'KNN Classifier Error List for KFolds : ', error_list,
    '\n\nMean Classification Error : ', mean_error
)

# Append this model to the summary table, with the error expressed in percent.
result_df.loc[len(result_df)] = ['KNN', mean_error * 100]

KNN Classifier Error List for KFolds :  [0.07462686567164178, 0.09253731343283578, 0.11641791044776117, 0.08383233532934131, 0.09880239520958078] 

Mean Classification Error :  0.0932


# Now we want to run KNN using standardized data

- Note that the mean classification error of 5-fold CV does not change when we use standardized data.

In [None]:
# KNN with standardized predictors.
# The scaler is part of the pipeline, so within every CV fold it is fitted on
# the training rows only (no information from the held-out rows leaks into the
# scaling) and each predictor is standardized with its own mean and variance.
knn_1 = make_pipeline(StandardScaler(), KNeighborsClassifier())

# The pipeline receives the raw predictors and scales them itself.
accuracy = cross_val_score(knn_1, X, y, cv=kfold)

# Misclassification rate per fold is 1 - accuracy.
error_list = [1 - x for x in accuracy]

print('KNN (after standardization) Classifier Error List for KFolds : ', error_list, 
      '\n\nMean Classification Error : ', round(np.mean(error_list),4)
)

KNN (after standardization) Classifier Error List for KFolds :  [0.07462686567164178, 0.09253731343283578, 0.11641791044776117, 0.08383233532934131, 0.09880239520958078] 

Mean Classification Error :  0.0932


In [None]:
# Logistic regression on the raw (unstandardized) predictors

logreg = LogisticRegression()

# Default scoring for a classifier is accuracy; one value per fold.
accuracy = cross_val_score(logreg, X, y, cv=kfold)

# Misclassification rate per fold is 1 - accuracy.
error_list = list(1 - accuracy)
mean_error = round(np.mean(error_list), 4)

print(
    'Logistic Classifier Error List for KFolds : ', error_list,
    '\n\nMean Classification Error : ', mean_error
)

# Append this model to the summary table, with the error expressed in percent.
result_df.loc[len(result_df)] = ['Logistic', mean_error * 100]

Logistic Classifier Error List for KFolds :  [0.07462686567164178, 0.09253731343283578, 0.09253731343283578, 0.06886227544910184, 0.11976047904191611] 

Mean Classification Error :  0.0897


# Now we want to run Logistic model using standardized data

In [None]:
# Logistic regression with standardized predictors.
# The scaler is part of the pipeline, so within every CV fold it is fitted on
# the training rows only (no information from the held-out rows leaks into the
# scaling) and each predictor is standardized with its own mean and variance.
logreg_1 = make_pipeline(StandardScaler(), LogisticRegression())

# The pipeline receives the raw predictors and scales them itself.
accuracy = cross_val_score(logreg_1, X, y, cv=kfold)

# Misclassification rate per fold is 1 - accuracy.
error_list = [1 - x for x in accuracy]

print('Logistic Classifier (after standardization) Error List for KFolds : ', error_list, 
      '\n\nMean Classification Error : ', round(np.mean(error_list),4)
)


Logistic Classifier (after standardization) Error List for KFolds :  [0.11044776119402988, 0.11940298507462688, 0.13432835820895528, 0.09281437125748504, 0.11976047904191611] 

Mean Classification Error :  0.1154


In [None]:
# Linear discriminant analysis on the raw predictors

lda = LinearDiscriminantAnalysis()

# Default scoring for a classifier is accuracy; one value per fold.
accuracy = cross_val_score(lda, X, y, cv=kfold)

# Misclassification rate per fold is 1 - accuracy.
error_list = list(1 - accuracy)
mean_error = round(np.mean(error_list), 4)

print(
    'LDA Classifier Error List for KFolds : ', error_list,
    '\n\nMean Classification Error : ', mean_error
)

# Append this model to the summary table, with the error expressed in percent.
result_df.loc[len(result_df)] = ['LDA', mean_error * 100]

LDA Classifier Error List for KFolds :  [0.06865671641791049, 0.09850746268656718, 0.09253731343283578, 0.06886227544910184, 0.11976047904191611] 

Mean Classification Error :  0.0897


In [None]:
# Gaussian naive Bayes on the raw predictors

nb = GaussianNB()

# Default scoring for a classifier is accuracy; one value per fold.
accuracy = cross_val_score(nb, X, y, cv=kfold)

# Misclassification rate per fold is 1 - accuracy.
error_list = list(1 - accuracy)
mean_error = round(np.mean(error_list), 4)

print(
    'Naive Bayes Classifier Error List for KFolds : ', error_list,
    '\n\nMean Classification Error : ', mean_error
)

# Append this model to the summary table, with the error expressed in percent.
result_df.loc[len(result_df)] = ['Naive Bayes', mean_error * 100]

Naive Bayes Classifier Error List for KFolds :  [0.10447761194029848, 0.13432835820895528, 0.14328358208955227, 0.08682634730538918, 0.12574850299401197] 

Mean Classification Error :  0.1189


In [None]:
# Random forest on the raw predictors.
# random_state is fixed (same seed as the CV splitter and the other seeded
# models) so the bootstrap samples and feature draws -- and therefore the
# reported error -- are reproducible across runs.
rf = RandomForestClassifier(n_estimators=100, max_depth=20, random_state=42)

# Default scoring for a classifier is accuracy; one value per fold.
accuracy = cross_val_score(rf, X, y, cv=kfold)

# Misclassification rate per fold is 1 - accuracy.
error_list = [1 - x for x in accuracy]

print('Random Forest Classifier Error List for KFolds : ', error_list, 
      '\n\nMean Classification Error : ', round(np.mean(error_list),4)
)

# Append this model to the summary table, with the error expressed in percent.
result_df.loc[len(result_df.index)] = ['Random Forest', ((round(np.mean(error_list),4))) * 100]

Random Forest Classifier Error List for KFolds :  [0.05970149253731338, 0.05373134328358209, 0.08955223880597019, 0.04790419161676651, 0.07485029940119758] 

Mean Classification Error :  0.0651


In [None]:
# Decision tree on the raw predictors.
# random_state is fixed (same seed as the CV splitter and the other seeded
# models) so that ties between equally good splits are always broken the same
# way and the reported error is reproducible across runs.
dt = DecisionTreeClassifier(max_depth = 25, random_state = 42)

# Default scoring for a classifier is accuracy; one value per fold.
accuracy = cross_val_score(dt, X, y, cv=kfold)

# Misclassification rate per fold is 1 - accuracy.
error_list = [1 - x for x in accuracy]

print('Decision Tree Classifier Error List for KFolds : ', error_list, 
      '\n\nMean Classification Error : ', round(np.mean(error_list),4)
)

# Append this model to the summary table, with the error expressed in percent.
result_df.loc[len(result_df.index)] = ['Decision Tree', ((round(np.mean(error_list),4))) * 100]

Decision Tree Classifier Error List for KFolds :  [0.07164179104477608, 0.08656716417910448, 0.10447761194029848, 0.053892215568862256, 0.08682634730538918] 

Mean Classification Error :  0.0807


In [None]:
# Linear-kernel support vector classifier on the raw predictors
clf = svm.SVC(C=1, kernel='linear', random_state=42)

# Default scoring for a classifier is accuracy; one value per fold.
accuracy = cross_val_score(clf, X, y, cv=kfold)

# Misclassification rate per fold is 1 - accuracy.
error_list = list(1 - accuracy)
mean_error = round(np.mean(error_list), 4)

print(
    'SVM Classifier Error List for KFolds : ', error_list,
    '\n\nMean Classification Error : ', mean_error
)

# Append this model to the summary table, with the error expressed in percent.
result_df.loc[len(result_df)] = ['SVM', mean_error * 100]

SVM Classifier Error List for KFolds :  [0.07462686567164178, 0.09552238805970148, 0.08955223880597019, 0.06886227544910184, 0.11676646706586824] 

Mean Classification Error :  0.0891


# Now we want to run SVM model using standardized data

In [None]:
# Linear-kernel SVM with standardized predictors.
# The scaler is part of the pipeline, so within every CV fold it is fitted on
# the training rows only (no information from the held-out rows leaks into the
# scaling) and each predictor is standardized with its own mean and variance.
clf_1 = make_pipeline(
    StandardScaler(),
    svm.SVC(kernel='linear', C=1, random_state=42),
)

# The pipeline receives the raw predictors and scales them itself.
accuracy = cross_val_score(clf_1, X, y, cv=kfold)

# Misclassification rate per fold is 1 - accuracy.
error_list = [1 - x for x in accuracy]

print('SVM Classifier (after standardization) Error List for KFolds : ', error_list, 
      '\n\nMean Classification Error : ', round(np.mean(error_list),4)
)

SVM Classifier (after standardization) Error List for KFolds :  [0.09253731343283578, 0.11044776119402988, 0.11641791044776117, 0.06586826347305386, 0.10479041916167664] 

Mean Classification Error :  0.098


In [None]:
# Gradient-boosted trees (XGBoost) on the raw predictors

xgb_clf = xgb.XGBClassifier(random_state=42, objective='binary:logistic')

# Default scoring for a classifier is accuracy; one value per fold.
accuracy = cross_val_score(xgb_clf, X, y, cv=kfold)

# Misclassification rate per fold is 1 - accuracy.
error_list = list(1 - accuracy)
mean_error = round(np.mean(error_list), 4)

print(
    'XGBoost Classifier Error List for KFolds : ', error_list,
    '\n\nMean Classification Error : ', mean_error
)

# Append this model to the summary table, with the error expressed in percent.
result_df.loc[len(result_df)] = ['XGB', mean_error * 100]

XGBoost Classifier Error List for KFolds :  [0.04477611940298509, 0.06567164179104479, 0.07462686567164178, 0.05988023952095811, 0.09580838323353291] 

Mean Classification Error :  0.0682


In [None]:
# Summary table: one row per classifier with its mean 5-fold CV error in percent.
result_df

Unnamed: 0,Classifier,Error %
0,KNN,9.32
1,Logistic,8.97
2,LDA,8.97
3,Naive Bayes,11.89
4,Random Forest,6.51
5,Decision Tree,8.07
6,SVM,8.91
7,XGB,6.82
