In [90]:
import pandas as pd
import numpy as np
import tensorflow as tf
from sklearn.svm import SVR
import matplotlib.pyplot as plt

from preprocess import *

from sklearn.model_selection import GridSearchCV, RandomizedSearchCV

# 0 Support Vector Regression (SVR) Algorithm

Given a data set $D=\left\{(x_i, y_i)\right\}_{i=1}^n$ of $n$ points, the method of $\varepsilon$-Support Vector Regression (denoted SVR) fits a function $f$ to the data $D$ of the following form:
$$
f(x)=w^T \phi(x)+b
$$


We aim to minimize
$$
\frac{1}{2}|| w||^2+C \sum_{i=1}^n\left|\xi_i\right|
$$
with constraints
$$
\left|y_i-w^T \phi(x_i)-b\right| \leq \varepsilon+\left|\xi_i\right|
$$
where:

*   $w, b$: coefficients to be estimated
*   $\phi$: mapping from lower dimensional $x$-space to higher dimensional feature space
* $C$: a hyperparameter that can be tuned; as $C$ increases, the slack terms are penalized more heavily, so the tolerance for points outside the margin decreases
* $\xi$: slack variable; for any data point that falls outside the $\varepsilon$-margin, its deviation from the margin is denoted as $\xi$
* $\varepsilon$: distance from the margins to the fitted hyperplane; data points with absolute error less than or equal to $\varepsilon$ incur no penalty, only larger errors contribute to the objective


# 1 Data

In [13]:
# Colab-only step: attach Google Drive to the runtime's filesystem.
from google.colab import drive

DRIVE_MOUNT_POINT = '/content/drive'
drive.mount(DRIVE_MOUNT_POINT)

Mounted at /content/drive


In [14]:
# Read the option data CSV from the mounted Drive into a DataFrame.
csv_path = '/content/drive/MyDrive/data (2).csv'
df = pd.read_csv(csv_path)

# Preview the first rows as the cell output.
df.head()

Unnamed: 0.1,Unnamed: 0,optionid,securityid,strike,callput,date_traded,contract_price,market_price,underlyings_price,contract_volume,days_to_maturity,moneyness,rate,volatility
0,0,150034236.0,504569.0,0.42,C,2006-10-18,0.0715,0.07025,0.4885,5.0,2.0,1.163095,0.053646,0.022956
1,1,150247468.0,504880.0,40.0,C,2006-10-18,0.124,0.1225,39.913799,56137.0,2.0,0.997845,0.053646,0.114784
2,2,150255000.0,506496.0,62.0,C,2006-10-18,0.172,0.174,61.827798,27369.0,2.0,0.997223,0.053646,0.106823
3,3,150255496.0,506497.0,53.5,C,2006-10-18,0.296,0.2655,53.6129,1224.0,2.0,1.00211,0.053646,0.110336
4,4,150255498.0,506497.0,54.0,C,2006-10-18,0.075,0.0645,53.6129,963.0,2.0,0.992831,0.053646,0.110336


In [17]:
# Columns pulled out of `df`, in the order they appear in the resulting array.
# contract_price is placed last.
# NOTE(review): presumably the last column is the regression target -- confirm
# against `propocessed`, which consumes this array.
bs_columns = [
    'strike',
    'underlyings_price',
    'days_to_maturity',
    'volatility',
    'rate',
    'contract_price',
]

# Stack one row per column, then transpose so each row is one option record.
dataframe_BS = np.vstack([df[name].values for name in bs_columns]).T

Get train and test data in tuples of features and targets. Print out their dimensions to check they are in shapes we want.

In [112]:
# `propocessed` comes from the local `preprocess` module (star import); it is
# unpacked here as a (train, test) pair, each a (features, targets) pair.
train_split, test_split = propocessed(dataframe_BS)
x_train, y_train = train_split
x_test, y_test = test_split

# Print the four shapes on one line as a sanity check.
print(*(np.shape(part) for part in (x_train, y_train, x_test, y_test)))

(85999, 5) (85999,) (21500, 5) (21500,)


# 2 Model

Fit the model:

In [113]:
# Baseline model: SVR with an RBF kernel, all other hyperparameters left at
# their defaults, fitted on the training split.
regressor = SVR(kernel="rbf")
regressor.fit(X=x_train, y=y_train)

SVR()

Evaluate the model:

In [114]:
# Root-mean-squared error of the baseline regressor on the test split.
y_pred = regressor.predict(x_test)
residuals = y_test - y_pred
rmse = np.sqrt(np.mean(np.square(residuals)))

# Show the value as the cell output.
rmse

0.09538321117538016

# 3 Tuning hyperparameters

## Set up ranges of hyperparameters for searching

The ranges of the hyperparameters are chosen based on the experiments in [Practical Option Pricing with Support Vector Regression and MART](http://cs229.stanford.edu/proj2009/Choo.pdf) by Ian I-En Choo, Stanford University.

In [115]:
# Candidate values for each hyperparameter, spaced one decade apart (powers of 10).
C_range = np.logspace(start=1, stop=3, num=3)
epsilon_range = np.logspace(start=-1, stop=-3, num=3)
gamma_range = np.logspace(start=-5, stop=-2, num=4)

# Report every range, one line per hyperparameter.
for label, values in (('C', C_range),
                      ('epsilon', epsilon_range),
                      ('gamma', gamma_range)):
    print(f'The list of values for {label} are {values}')

The list of values for C are [  10.  100. 1000.]
The list of values for epsilon are [0.1   0.01  0.001]
The list of values for gamma are [1.e-05 1.e-04 1.e-03 1.e-02]


In [116]:
# Search space shared by the hyperparameter searches in this notebook.
param_grid = dict(
    C=C_range,                # regularization parameter
    kernel=['rbf', 'poly'],   # kernel type
    epsilon=epsilon_range,    # margin parameter
    gamma=gamma_range,        # kernel coefficient for 'rbf', 'poly' and 'sigmoid'
)

# Metric names used for scoring.
# NOTE(review): 'accuracy' is a classification metric, while the estimator
# being tuned in this notebook is an SVR regressor -- confirm this is intended.
scoring = ['accuracy']

## Hyperparameter Tuning Using Grid Search

In [117]:
# Define grid search.
# The metric must be passed through `scoring`: a metric name given only to
# `refit` (with `scoring` left unset) is not used for ranking, and candidates
# would be compared with the estimator's default score instead.
# With a single metric, refit=True retrains the best candidate on the whole
# training set so that best_estimator_ / predict / score are available.
grid_search = GridSearchCV(estimator=regressor,
                           param_grid=param_grid,
                           scoring='neg_root_mean_squared_error',
                           refit=True,
                           # use all cores, as the random-search cell does
                           n_jobs=-1,
                           verbose=0)

In [None]:
# Run the exhaustive search over the training split and keep a handle on the
# object returned by fit().
grid_result = grid_search.fit(X=x_train, y=y_train)

# Show the search summary as the cell output.
grid_result

In [None]:
# SVR is a regressor, so the numbers below are values of the search's scoring
# metric, not an "accuracy"; the messages are worded accordingly.

# Print the best mean cross-validated score found on the training dataset
print(f'The best cross-validation score for the training dataset is {grid_result.best_score_:.4f}')
# Print the hyperparameters for the best score
print(f'The best hyperparameters are {grid_result.best_params_}')
# Print the score of the refitted best model on the testing dataset
print(f'The score for the testing dataset is {grid_search.score(x_test, y_test):.4f}')

In [None]:
# Root-mean-squared error of the best estimator found by the grid search,
# evaluated on the test split.
best_regressor = grid_result.best_estimator_
y_pred = best_regressor.predict(x_test)
residuals = y_test - y_pred
rmse = np.sqrt(np.mean(np.square(residuals)))

# Show the value as the cell output.
rmse

## Hyperparameter Tuning Using Random Search

In [None]:
# Define random search.
# - scoring: SVR is a regressor, so a regression metric is used; 'accuracy'
#   is a classification metric and is not valid here.
# - refit=True: with a single metric, retrain the best candidate on the whole
#   training set.
# - n_iter: param_grid holds 3 C x 2 kernels x 3 epsilon x 4 gamma = 72
#   combinations, so n_iter must stay below 72 for the search to sample a
#   subset rather than cover the whole grid.
# - cv=5: plain 5-fold cross-validation; the previous `kfold` name is not
#   defined in any cell of this notebook.
# - random_state: makes the sampled candidates reproducible across runs.
random_search = RandomizedSearchCV(estimator=regressor,
                           param_distributions=param_grid,
                           n_iter=20,
                           scoring='neg_root_mean_squared_error',
                           refit=True,
                           n_jobs=-1,
                           cv=5,
                           random_state=0,
                           verbose=0)
# Fit random search
random_result = random_search.fit(x_train, y_train)
# Print random search summary
random_result