In [5]:

import numpy as np
import pandas as pd
import datetime
from scipy.stats import uniform, randint
import math
from sklearn.datasets import load_breast_cancer, load_diabetes, load_wine
from sklearn.metrics import auc, accuracy_score, confusion_matrix, mean_squared_error
from sklearn.model_selection import cross_val_score, GridSearchCV, KFold, RandomizedSearchCV, train_test_split,StratifiedKFold

import xgboost as xgb
import lightgbm as lgb

In [6]:
#-----------------READ DATA----------------
datafilename = 'DATA/MockDATA_10percent.csv'

Data = pd.read_csv(datafilename)[['date','exdate','delta','impl_volatility',"cp_flag"]] #try using more features later
Data['date'] = pd.to_datetime(Data['date'])
Data['exdate'] = pd.to_datetime(Data['exdate'])


#sdate = datetime.datetime(2019,4,13)
#edate = datetime.datetime(2019,4,20)


#Data = Data[Data['date'] < edate] #change this line if we need more complicated set of data
#Data = Data[Data['date'] >= sdate]

Data['time_to_maturity'] = (Data['exdate']-Data['date']).astype('int64')/10**9/3600/24
Data = Data.dropna()

Data
Data = Data.dropna()

Data = Data.sort_values(by='date')
Data['impl_volatility'] = np.log(Data['impl_volatility'])

Data = Data[Data["cp_flag"] == "C"]
print(Data)

              date     exdate     delta  impl_volatility cp_flag   
0       2018-02-28 2018-09-21  0.776262        -1.510561       C  \
780     2018-02-28 2018-06-29  0.169746        -2.120272       C   
785     2018-02-28 2018-06-29  0.662646        -1.669820       C   
788     2018-02-28 2018-12-21  0.973789        -0.814296       C   
791     2018-02-28 2018-04-20  0.951787        -0.809045       C   
...            ...        ...       ...              ...     ...   
2133073 2023-02-28 2023-03-27  0.459562        -1.640929       C   
2113668 2023-02-28 2023-04-21  0.300511        -1.777419       C   
2133100 2023-02-28 2023-03-15  0.830740        -1.381148       C   
2114552 2023-02-28 2024-01-19  0.905867        -1.186254       C   
2118538 2023-02-28 2023-05-31  0.932871        -1.091079       C   

         time_to_maturity  
0                   205.0  
780                 121.0  
785                 121.0  
788                 296.0  
791                  51.0  
...            

In [7]:
#Train Test Split

rate = 0.2
rows, columns = Data.shape
num = round(rows*(1-rate))

idx = Data.index
train = Data.loc[idx[0:num]]
test = Data.loc[idx[num:]]



X_train = train[['delta','time_to_maturity']]
X_test = test[['delta','time_to_maturity']]

y_train = train['impl_volatility']
y_test = test['impl_volatility']



In [14]:
LGBMR = lgb.LGBMRegressor()

#Hyper parameter tuning
#brute force grid search 

pgrid = {
    'n_estimators': [5,10,50,75,100],
    'num_leaves': [5,10, 15, 20],
    'subsample': [0.2,0.5,0.8,1],
    'colsample_bytree': [0.2,0.5,0.8,1],
    'max_depth':[2,5,10,15]
        }

grid_search = GridSearchCV(estimator=LGBMR, param_grid=pgrid, scoring='neg_mean_squared_error',n_jobs = 4,cv=5, verbose=3 )
grid_search.fit(X_train,y_train)

bestlgbr = grid_search.best_estimator_
print(grid_search.best_params_)

Fitting 5 folds for each of 1280 candidates, totalling 6400 fits
[CV 4/5] END colsample_bytree=0.8, max_depth=10, n_estimators=10, num_leaves=20, subsample=0.5;, score=nan total time=   0.0s
[CV 5/5] END colsample_bytree=0.8, max_depth=10, n_estimators=10, num_leaves=20, subsample=0.5;, score=nan total time=   0.0s
[CV 1/5] END colsample_bytree=0.8, max_depth=10, n_estimators=10, num_leaves=20, subsample=0.8;, score=nan total time=   0.0s
[CV 2/5] END colsample_bytree=0.8, max_depth=10, n_estimators=10, num_leaves=20, subsample=0.8;, score=nan total time=   0.0s
[CV 3/5] END colsample_bytree=0.8, max_depth=10, n_estimators=10, num_leaves=20, subsample=0.8;, score=nan total time=   0.0s
[CV 4/5] END colsample_bytree=0.8, max_depth=10, n_estimators=10, num_leaves=20, subsample=0.8;, score=nan total time=   0.0s
[CV 5/5] END colsample_bytree=0.8, max_depth=10, n_estimators=10, num_leaves=20, subsample=0.8;, score=nan total time=   0.0s
[CV 1/5] END colsample_bytree=0.8, max_depth=10, n_es



[CV 3/5] END colsample_bytree=1, max_depth=15, n_estimators=75, num_leaves=5, subsample=0.2;, score=nan total time=   0.0s
[CV 4/5] END colsample_bytree=1, max_depth=15, n_estimators=75, num_leaves=5, subsample=0.2;, score=nan total time=   0.0s
[CV 4/5] END colsample_bytree=1, max_depth=15, n_estimators=75, num_leaves=10, subsample=0.5;, score=nan total time=   0.0s
[CV 5/5] END colsample_bytree=1, max_depth=15, n_estimators=75, num_leaves=10, subsample=0.5;, score=nan total time=   0.0s
[CV 1/5] END colsample_bytree=1, max_depth=15, n_estimators=75, num_leaves=10, subsample=0.8;, score=nan total time=   0.0s
[CV 2/5] END colsample_bytree=1, max_depth=15, n_estimators=75, num_leaves=10, subsample=0.8;, score=nan total time=   0.0s
[CV 3/5] END colsample_bytree=1, max_depth=15, n_estimators=75, num_leaves=10, subsample=0.8;, score=nan total time=   0.0s
[CV 4/5] END colsample_bytree=1, max_depth=15, n_estimators=75, num_leaves=10, subsample=0.8;, score=nan total time=   0.0s
[CV 5/5] E

In [15]:
#fitting on the training set

y_p_lgbr = bestlgbr.predict(X_test)

goodness = mean_squared_error(y_p_lgbr,y_test)

print(goodness)
print(grid_search.best_params_)




0.08622938826013876
{'colsample_bytree': 0.8, 'max_depth': 15, 'n_estimators': 50, 'num_leaves': 20, 'subsample': 0.2}


In [16]:
from matplotlib import interactive

import matplotlib.pyplot as plt
from matplotlib import cm
from mpl_toolkits.mplot3d import Axes3D
#Plot the fitting result
interactive(True)
#delta = 0.1, 0., 15, 0.2, 0.25, 0.3, 0.35, . . . , 0.9
#time to expiry of 1, 2, 3, 4, 8, 12, 18, 52, 104 weeks. 
D = np.linspace(-1, 1, 41, endpoint=True)
T = np.array([1,2,3,4,8,12,18,52,104,156])
T_days = T*7 #over time_to_maturity is in days

dd,tt = np.meshgrid(D,T_days)


X_plot = np.array([dd.flatten(), tt.flatten()]).T

y_plot = bestlgbr.predict(X_plot)

z = y_plot.reshape(dd.shape)


In [10]:

xd_test = np.array(X_test['delta'])
xt_test = np.array(X_test['time_to_maturity'])

test_z = np.array(y_test)


xd_train = np.array(X_train['delta'])
xt_train = np.array(X_train['time_to_maturity'])

train_z = np.array(y_train)

In [11]:
%matplotlib qt 
fig = plt.figure()
ax = Axes3D(fig)

ax.plot_surface(dd,tt, z,cmap =cm.coolwarm)


#ax.scatter(xd_train, xt_train, train_z, color='green')
ax.scatter(xd_test, xt_test, test_z, color='red')

ax.set_xlabel('delta')
ax.set_ylabel('time to maturity')
ax.set_zlabel('Volatility');

plt.show()