In [1]:
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.ensemble import RandomForestRegressor
from sklearn.ensemble import RandomForestClassifier

In [2]:
# Colab-specific setup: pull in the Drive helper and the standard `os` module.
from google.colab import drive
import os

# Mount Google Drive so its files are reachable under /content/drive.
drive.mount('/content/drive')

# Make the project folder the working directory so the relative CSV paths
# used below resolve against it. This path will differ on your system.
os.chdir("/content/drive/MyDrive/DSO 530 Project")

Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).


In [3]:
# Training data
# Read the labelled training set and discard rows with any missing value.
df_train = pd.read_csv("option_train.csv").dropna()

# Keep only rows with 'tau' <= 50 and 0 < 'S' <= 40000.
# NOTE(review): these thresholds look like outlier filters — confirm intent.
# The explicit .copy() makes df_train an independent frame, so the later
# column assignment (df_train['BS'] = ...) does not operate on a slice of the
# unfiltered data and cannot raise SettingWithCopyWarning.
df_train = df_train[
    (df_train['tau'] <= 50)
    & (df_train['S'] <= 40000)
    & (df_train['S'] > 0)
].copy()

# Summary statistics of the cleaned training set.
df_train.describe()

Unnamed: 0,Value,S,K,tau,r
count,1673.0,1673.0,1673.0,1673.0,1673.0
mean,15.096361,440.90085,438.21578,0.202023,0.030235
std,14.050476,7.529079,23.420806,0.099814,0.000557
min,0.125,425.472331,375.0,0.003968,0.02951
25%,2.220002,433.863864,420.0,0.119048,0.02982
50%,11.25,442.525366,440.0,0.202381,0.03013
75%,25.819526,447.320414,455.0,0.285714,0.03054
max,60.149367,455.880619,500.0,0.392857,0.03188


In [4]:
# Test data: the unlabelled set that the fitted models will score.
df_test = pd.read_csv(filepath_or_buffer="option_test_wolabel.csv")

In [5]:
# Replacing 'Over' and 'Under' in the training data BS variable with 1 and 0

BS_mapping = {'Under': 0, 'Over': 1}

# Series.map turns every value that is not a key of the dict into NaN, so
# mapping an already-encoded column (e.g. when this cell is re-run) would wipe
# out all labels. Only encode while the column is not yet made of 0/1 codes.
if not df_train['BS'].isin(list(BS_mapping.values())).all():
    BS_encoded = df_train['BS'].map(BS_mapping)

    # Fail loudly on labels outside the mapping instead of silently passing
    # NaN targets on to the classifier.
    if BS_encoded.isna().any():
        unexpected_labels = df_train.loc[BS_encoded.isna(), 'BS'].unique()
        raise ValueError(f"Unexpected 'BS' labels: {unexpected_labels!r}")

    df_train['BS'] = BS_encoded.astype(int)

df_train.head()

Unnamed: 0,Value,S,K,tau,r,BS
0,21.670404,431.623898,420.0,0.34127,0.03013,0
1,0.125,427.015526,465.0,0.166667,0.03126,1
2,20.691244,427.762336,415.0,0.265873,0.03116,0
3,1.035002,451.711658,460.0,0.063492,0.02972,1
4,39.55302,446.718974,410.0,0.166667,0.02962,0


In [6]:
# Regression inputs from the training set: the feature matrix is built from
# the 'S', 'K', 'tau' and 'r' columns; the target is the 'Value' column.
X_train_regression = df_train[['S','K','tau','r']].values
y_train_regression = df_train['Value'].values

In [7]:
# Classification inputs from the training set: same four feature columns
# ('S', 'K', 'tau', 'r'); the target is the encoded 'BS' column.
X_train_classification = df_train[['S','K','tau','r']].values
y_train_classification = df_train['BS'].values

# Fitting the model

In [8]:
# Regression Model:
# these parameters are determined as optimal according to mean R^2 of 5-fold CV

# A random forest draws bootstrap samples and feature subsets at random, so
# without a fixed seed every run fits a different model and produces different
# predictions. Pinning random_state makes Restart & Run All reproducible.
Random_Forest_Regression_Model = RandomForestRegressor(
    n_estimators=100,
    max_depth=30,
    random_state=42,
)

# Fit on the training features against the 'Value' target.
Random_Forest_Regression_Model.fit(X_train_regression, y_train_regression)

In [9]:
# Classification Model:
# these parameters are determined as optimal according to mean classification error of 5-fold CV

# A random forest draws bootstrap samples and feature subsets at random, so
# without a fixed seed every run fits a different model and produces different
# predictions. Pinning random_state makes Restart & Run All reproducible.
Random_Forest_Classification_Model = RandomForestClassifier(
    n_estimators=100,
    max_depth=20,
    random_state=42,
)

# Fit on the training features against the encoded 'BS' target.
Random_Forest_Classification_Model.fit(X_train_classification, y_train_classification)

# Making predictions

In [10]:
# Feature matrix for the test set, using the same four columns as in training.
X_test = df_test.loc[:, ['S','K','tau','r']].values

In [11]:
# Score the test features with the fitted regression forest.
value_pred = Random_Forest_Regression_Model.predict(X=X_test)

In [12]:
# Print the regression predictions (long arrays are abbreviated with "...").
print(value_pred)

[ 1.34525045 17.73468521 10.23571228 ...  0.39185348 25.76593061
 15.83637994]


In [13]:
# Score the test features with the fitted classification forest.
BS_predict = Random_Forest_Classification_Model.predict(X=X_test)

In [14]:
# Print the predicted class codes (long arrays are abbreviated with "...").
print(BS_predict)

[1 0 0 ... 1 0 0]


# Saving the prediction result and exporting it to CSV

In [15]:
# Combine both prediction arrays into one frame: regression output under
# 'Value', classification output under 'BS'.
df_prediction = pd.DataFrame(
    data={'Value': value_pred, 'BS': BS_predict},
)

In [16]:
# Preview the first five prediction rows.
df_prediction.head(n=5)

Unnamed: 0,Value,BS
0,1.34525,1
1,17.734685,0
2,10.235712,0
3,22.198572,0
4,15.479365,0


In [17]:
# Write the predictions to a CSV in the current working directory,
# leaving out the row index.
df_prediction.to_csv(path_or_buf='group_12_prediction.csv', index=False)

In [18]:
# Move the exported CSV into a Drive folder using a shell command.
# NOTE(review): the destination folder ("Project") differs from the working
# directory set earlier ("DSO 530 Project") — confirm that it exists and is
# the intended location.
!mv group_12_prediction.csv /content/drive/MyDrive/Project/