In [1]:
import sys
from tqdm import tqdm
import pandas as pd
import os
import datetime
import OptionTools
import matplotlib.pyplot as plt
import statsmodels.api as sm
import numpy as np

In [2]:
def OLS_evaluate(df, add=False):
    """Regress ``imp_change`` on weekday dummies and print the fit.

    Rows of ``df`` containing any missing value are dropped first.  The
    design matrix holds one 0/1 column per ``week_day`` code 0..4 (Monday to
    Friday) and no separate intercept column.  When ``add`` is true,
    ``to_maturity / 252`` is appended as a sixth regressor.

    Prints every coefficient rounded to 4 decimals followed by its t-value
    in parentheses, then the adjusted R-squared and the statsmodels summary.

    Returns:
        The fitted statsmodels results object.
    """
    clean = df.dropna()
    weekday = clean['week_day'].to_numpy()
    # One indicator column per weekday code, in Monday..Friday order.
    regressors = [(weekday == code).astype(np.int64) for code in range(5)]
    if add:
        regressors.append(clean['to_maturity'].values / 252)
    X = np.column_stack(regressors)
    y = clean.imp_change.values

    model = sm.OLS(y, X).fit()
    for coefficient, t_value in zip(model.params, model.tvalues):
        print(round(coefficient, 4))
        print("({})".format(round(t_value, 2)))

    print(round(model.rsquared_adj, 4))
    print(model.summary())
    return model


def quadratic_OLS(sel):
    """Fit ``imp_change`` on an intercept, log(to_maturity) and its square.

    Prints the statsmodels summary and returns the fitted results object,
    matching what ``OLS_evaluate`` and ``abs_OLS`` return.

    Args:
        sel: DataFrame with ``to_maturity`` and ``imp_change`` columns.
            Unlike ``OLS_evaluate``, no rows are dropped here, so missing
            values are passed to statsmodels as they are.

    Returns:
        The fitted statsmodels results object.
    """
    # Take the log once and reuse it for the squared term.
    log_maturity = np.log(sel['to_maturity'].values)
    X = sm.add_constant(np.column_stack([log_maturity, log_maturity**2]))
    y = sel.imp_change.values
    model = sm.OLS(y, X).fit()
    print(model.summary())
    return model


def abs_OLS(df, add=False):
    """Regress ``|imp_change|`` on weekday dummies and print the fit.

    Rows of ``df`` containing any missing value are dropped first.  The
    design matrix holds one 0/1 column per ``week_day`` code 0..4.  When
    ``add`` is true an intercept column is added with ``sm.add_constant``.

    Prints the p-values, the coefficients and the statsmodels summary.

    Returns:
        The fitted statsmodels results object.
    """
    clean = df.dropna()
    weekday = clean['week_day'].to_numpy()
    X = np.column_stack(
        [(weekday == code).astype(np.int64) for code in range(5)])
    if add:
        # NOTE(review): if every row has week_day in 0..4 the five dummies
        # already sum to one, so an extra intercept makes the design matrix
        # rank-deficient -- confirm this is intended.
        X = sm.add_constant(X)
    y = np.abs(clean.imp_change.values)

    model = sm.OLS(y, X).fit()
    print(model.pvalues)
    print(model.params)
    print(model.summary())
    return model


def cp(df, days=6):
    """Split ``df`` into call and put rows with enough time to maturity.

    Rows are kept when ``type`` is ``'C'`` (calls) or ``'P'`` (puts) and
    ``to_maturity`` is at least ``days``; rows with any missing value are
    then dropped.

    Returns:
        A ``(calls, puts)`` tuple of DataFrames.
    """
    long_enough = df['to_maturity'] >= days

    def _select(option_type):
        return df[(df['type'] == option_type) & long_enough].dropna()

    return _select('C'), _select('P')

In [3]:
def weekly_OLS_evaluate(df):
    """Run ``OLS_evaluate`` on five ``to_maturity`` buckets of ``df``.

    Buckets (inclusive bounds): 6-29, 30-51, 52-106, 107-175, and 176 or
    more.  The observed minimum and maximum ``to_maturity`` of each bucket
    are printed before its regression.
    """
    maturity = df['to_maturity']
    bounded = ((6, 29), (30, 51), (52, 106), (107, 175))
    buckets = [df[maturity.between(low, high)] for low, high in bounded]
    buckets.append(df[maturity >= 176])
    for bucket in buckets:
        print(bucket.to_maturity.min(), bucket.to_maturity.max())
        OLS_evaluate(bucket)

In [4]:
def td_OLS_evaluate(df):
    """Run ``OLS_evaluate`` on five ``to_maturity`` buckets of ``df``.

    Buckets (inclusive bounds): 6-22, 23-37, 38-73, 74-119, and 120 or more.
    The observed minimum and maximum ``to_maturity`` of each bucket are
    printed before its regression.
    """
    maturity = df['to_maturity']
    bounded = ((6, 22), (23, 37), (38, 73), (74, 119))
    buckets = [df[maturity.between(low, high)] for low, high in bounded]
    buckets.append(df[maturity >= 120])
    for bucket in buckets:
        print(bucket.to_maturity.min(), bucket.to_maturity.max())
        OLS_evaluate(bucket)

In [22]:
# Read every file under cd_vol/call and cd_vol/put, tag each frame with its
# option type ('C' or 'P') and concatenate once at the end.  Collecting the
# frames in a list avoids re-copying the accumulated frame on every iteration
# and avoids seeding the concat with an empty DataFrame.  File names are
# sorted so the row order does not depend on the directory listing order.
frames = []
for folder, option_type in (('cd_vol/call', 'C'), ('cd_vol/put', 'P')):
    for fn in sorted(os.listdir(folder)):
        path = os.path.join(folder, fn)
        tmp = pd.read_csv(path)
        tmp['type'] = option_type
        frames.append(tmp)
# As before, the per-file row labels are kept (no ignore_index).
df = pd.concat(frames)

# Keep only rows with to_maturity >= 8 and no missing values, split by type.
calls, puts = cp(df, 8)

In [23]:
# Calendar year: integer-divide the date column by 10000
# (e.g. 20150210 -> 2015).
calls['year'] = calls['date'] // 10000
calls

Unnamed: 0,date,imp_vol,to_maturity,imp_change,week_day,type,year
1,20150210,0.311825,44.0,-0.083403,1,C,2015
2,20150211,0.269398,43.0,-0.136061,2,C,2015
3,20150212,0.250500,42.0,-0.070149,3,C,2015
4,20150213,0.215214,41.0,-0.140862,4,C,2015
5,20150216,0.216541,38.0,0.006168,0,C,2015
...,...,...,...,...,...,...,...
44,20210408,0.203579,168.0,-0.021597,3,C,2021
45,20210409,0.199528,167.0,-0.019900,4,C,2021
46,20210412,0.202541,164.0,0.015104,0,C,2021
47,20210413,0.202785,163.0,0.001205,1,C,2021


In [24]:
def describe(df, col_name):
    """Print six summary statistics of ``df[col_name]`` on one line.

    The values, each rounded to 3 decimals and followed by a tab, are: mean,
    standard deviation, range (max - min), 25th percentile, median and 75th
    percentile.  No trailing newline is written.

    The percentiles come from ``Series.quantile`` so that missing values are
    skipped in the same way as by ``mean``/``std``/``median``;
    ``np.percentile`` would turn both quartiles into NaN as soon as the
    column holds a single NaN.

    Args:
        df: DataFrame holding the column.
        col_name: Name of the column to summarise.
    """
    target = df[col_name]
    stats = [
        target.mean(),
        target.std(),
        target.max() - target.min(),
        target.quantile(0.25),
        target.median(),
        target.quantile(0.75),
    ]
    for value in stats:
        print(round(value, 3), end='\t')

# 使用通过implied futures price计算的vol

In [25]:
# Weekday-dummy regression on the whole `calls` frame; the fitted result is
# kept in `a`.
a = OLS_evaluate(calls)

0.0178
(9.7)
-0.0093
(-5.18)
-0.0029
(-1.6)
-0.0026
(-1.41)
-0.0018
(-1.0)
0.0218
                            OLS Regression Results                            
Dep. Variable:                      y   R-squared:                       0.022
Model:                            OLS   Adj. R-squared:                  0.022
Method:                 Least Squares   F-statistic:                     31.59
Date:                Tue, 07 Mar 2023   Prob (F-statistic):           4.64e-26
Time:                        21:13:00   Log-Likelihood:                 7637.6
No. Observations:                5505   AIC:                        -1.527e+04
Df Residuals:                    5500   BIC:                        -1.523e+04
Df Model:                           4                                         
Covariance Type:            nonrobust                                         
                 coef    std err          t      P>|t|      [0.025      0.975]
-------------------------------------------------

In [26]:
# Collapse calendar years into four groups: everything up to 2017 is
# labelled 2017, 2018 and 2019 keep their own label, and 2020 onwards
# (including 2021 dates) is labelled 2020.
calls['year'] = (calls['date'] // 10000).clip(lower=2017, upper=2020)

In [27]:
# One weekday regression per `year` group.  The final call is left as a bare
# trailing expression so the notebook still echoes its result.
for year_mask in (calls['year'] <= 2017,
                  calls['year'] == 2018,
                  calls['year'] == 2019):
    OLS_evaluate(calls[year_mask])
OLS_evaluate(calls[calls['year'] >= 2020])

0.0194
(7.5)
-0.0068
(-2.69)
-0.0057
(-2.29)
-0.0057
(-2.19)
-0.0072
(-2.82)
0.0283
                            OLS Regression Results                            
Dep. Variable:                      y   R-squared:                       0.030
Model:                            OLS   Adj. R-squared:                  0.028
Method:                 Least Squares   F-statistic:                     20.03
Date:                Tue, 07 Mar 2023   Prob (F-statistic):           2.90e-16
Time:                        21:13:09   Log-Likelihood:                 3722.1
No. Observations:                2614   AIC:                            -7434.
Df Residuals:                    2609   BIC:                            -7405.
Df Model:                           4                                         
Covariance Type:            nonrobust                                         
                 coef    std err          t      P>|t|      [0.025      0.975]
-----------------------------------------------

<statsmodels.regression.linear_model.RegressionResultsWrapper at 0x2b3d9fc16d8>

In [28]:
# Split `calls` into five to_maturity buckets (inclusive bounds) and fit the
# weekday regression on each.  The final call is left as a bare trailing
# expression so the notebook still echoes its result.
# NOTE(review): the third/fourth boundary here is 111/112, whereas
# weekly_basics, wd_weekly_basics and bymat use 108/109 -- confirm which one
# is intended.
maturity = calls['to_maturity']
a1 = calls[maturity.between(8, 31)]
a2 = calls[maturity.between(32, 55)]
a3 = calls[maturity.between(56, 111)]
a4 = calls[maturity.between(112, 177)]
a5 = calls[maturity >= 178]
for bucket in (a1, a2, a3, a4):
    OLS_evaluate(bucket)
OLS_evaluate(a5)

0.0428
(7.99)
-0.0176
(-3.32)
-0.0052
(-0.98)
-0.011
(-1.79)
-0.0128
(-2.04)
0.0643
                            OLS Regression Results                            
Dep. Variable:                      y   R-squared:                       0.068
Model:                            OLS   Adj. R-squared:                  0.064
Method:                 Least Squares   F-statistic:                     20.78
Date:                Tue, 07 Mar 2023   Prob (F-statistic):           1.49e-16
Time:                        21:13:11   Log-Likelihood:                 1202.4
No. Observations:                1153   AIC:                            -2395.
Df Residuals:                    1148   BIC:                            -2369.
Df Model:                           4                                         
Covariance Type:            nonrobust                                         
                 coef    std err          t      P>|t|      [0.025      0.975]
-----------------------------------------------

<statsmodels.regression.linear_model.RegressionResultsWrapper at 0x2b3d9fbc588>

In [29]:
def weekly_basics(df):
    """Print summary statistics of ``imp_change`` per ``to_maturity`` bucket.

    Buckets (inclusive bounds): 8-31, 32-55, 56-108, 109-177, and 178 or
    more.  One line is printed per bucket with six tab-terminated values,
    each rounded to 4 decimals: mean, standard deviation, range (max - min),
    25th percentile, median and 75th percentile.

    The percentiles come from ``Series.quantile`` so that missing values are
    skipped in the same way as by ``mean``/``std``/``median``, and an empty
    bucket yields NaN like the other statistics.

    Args:
        df: DataFrame with ``to_maturity`` and ``imp_change`` columns.
    """
    maturity = df['to_maturity']
    bounded = ((8, 31), (32, 55), (56, 108), (109, 177))
    masks = [maturity.between(low, high) for low, high in bounded]
    masks.append(maturity >= 178)
    for in_bucket in masks:
        change = df.loc[in_bucket, 'imp_change']
        stats = (
            change.mean(),
            change.std(),
            change.max() - change.min(),
            change.quantile(0.25),
            change.median(),
            change.quantile(0.75),
        )
        for value in stats:
            print(round(value, 4), end='\t')
        print()

In [30]:
def wd_weekly_basics(df, ab=False):
    """Print the mean ``imp_change`` per weekday for each maturity bucket.

    Buckets (inclusive ``to_maturity`` bounds): 8-31, 32-55, 56-108, 109-177,
    and 178 or more.  One line is printed per bucket with five tab-terminated
    means, for ``week_day`` 0..4 in that order, each rounded to 4 decimals.

    Args:
        df: DataFrame with ``to_maturity``, ``week_day`` and ``imp_change``
            columns.  It is never modified.
        ab: When true, the means are taken over ``|imp_change|``.
    """
    # `df.loc[:]` is not guaranteed to be an independent copy, so assigning a
    # column on it can warn or write through to the caller's frame.
    # `assign` always builds a new DataFrame and leaves `df` untouched.
    tdf = df.assign(imp_change=df['imp_change'].abs()) if ab else df

    maturity = tdf['to_maturity']
    bounded = ((8, 31), (32, 55), (56, 108), (109, 177))
    masks = [maturity.between(low, high) for low, high in bounded]
    masks.append(maturity >= 178)
    for in_bucket in masks:
        bucket = tdf[in_bucket]
        for day in range(5):
            day_mean = bucket.loc[bucket['week_day'] == day,
                                  'imp_change'].mean()
            print(round(day_mean, 4), end='\t')
        print()

In [31]:
def yearlys(df):
    """Print the mean ``imp_change`` per weekday for each calendar period.

    Periods are cut on the integer ``date`` column: before 20160101, one per
    year for 2016 to 2019, and 20200101 onwards.  One line is printed per
    period with five tab-terminated means, for ``week_day`` 0..4 in that
    order, each rounded to 4 decimals.

    The statistics are computed from the ``df`` argument; the previous body
    ignored it and always read the notebook-level ``calls`` frame.

    Args:
        df: DataFrame with ``date``, ``week_day`` and ``imp_change`` columns.
    """
    # (inclusive start, exclusive end); None leaves that side open.
    periods = (
        (None, 20160101),
        (20160101, 20170101),
        (20170101, 20180101),
        (20180101, 20190101),
        (20190101, 20200101),
        (20200101, None),
    )
    for start, end in periods:
        period = df
        if start is not None:
            period = period[period['date'] >= start]
        if end is not None:
            period = period[period['date'] < end]
        for day in range(5):
            day_mean = period.loc[period['week_day'] == day,
                                  'imp_change'].mean()
            print(round(day_mean, 4), end='\t')
        print()

In [32]:
def bymat(df):
    """Run ``OLS_evaluate`` on five ``to_maturity`` buckets of ``df``.

    Buckets (inclusive bounds): 8-31, 32-55, 56-108, 109-177, and 178 or
    more.
    """
    maturity = df['to_maturity']
    bounded = ((8, 31), (32, 55), (56, 108), (109, 177))
    buckets = [df[maturity.between(low, high)] for low, high in bounded]
    buckets.append(df[maturity >= 178])
    for bucket in buckets:
        OLS_evaluate(bucket)

# 按照交易日期划分

In [33]:
# Weekday regression for each calendar period of `calls`: dates before
# 20160101 form the first slice, 2016-2019 get one slice per year, and
# 20200101 onwards forms the last.
trade_date = calls['date']
c1 = calls[trade_date < 20160101]
c2 = calls[(trade_date >= 20160101) & (trade_date < 20170101)]
c3 = calls[(trade_date >= 20170101) & (trade_date < 20180101)]
c4 = calls[(trade_date >= 20180101) & (trade_date < 20190101)]
c5 = calls[(trade_date >= 20190101) & (trade_date < 20200101)]
c6 = calls[trade_date >= 20200101]
for k in (c1, c2, c3, c4, c5, c6):
    OLS_evaluate(k)

0.0275
(5.27)
-0.0045
(-0.88)
-0.0099
(-1.96)
-0.0056
(-1.06)
-0.0092
(-1.76)
0.0386
                            OLS Regression Results                            
Dep. Variable:                      y   R-squared:                       0.043
Model:                            OLS   Adj. R-squared:                  0.039
Method:                 Least Squares   F-statistic:                     9.151
Date:                Tue, 07 Mar 2023   Prob (F-statistic):           3.12e-07
Time:                        21:13:16   Log-Likelihood:                 1060.7
No. Observations:                 813   AIC:                            -2111.
Df Residuals:                     808   BIC:                            -2088.
Df Model:                           4                                         
Covariance Type:            nonrobust                                         
                 coef    std err          t      P>|t|      [0.025      0.975]
----------------------------------------------

# 按照到期日区间划分

In [34]:
# Weekday regression per to_maturity bucket (inclusive bounds): 6-22, 23-37,
# 38-73, 74-119, and 120 or more.
# NOTE(review): `calls` is built with cp(df, 8), which keeps only
# to_maturity >= 8, so the lower bound of 6 is effectively 8 here -- confirm.
maturity = calls['to_maturity']
a1 = calls[maturity.between(6, 22)]
a2 = calls[maturity.between(23, 37)]
a3 = calls[maturity.between(38, 73)]
a4 = calls[maturity.between(74, 119)]
a5 = calls[maturity >= 120]

for k in (a1, a2, a3, a4, a5):
    OLS_evaluate(k)

0.0519
(6.78)
-0.029
(-3.76)
-0.0104
(-1.63)
-0.0088
(-1.12)
-0.026
(-3.29)
0.091
                            OLS Regression Results                            
Dep. Variable:                      y   R-squared:                       0.096
Model:                            OLS   Adj. R-squared:                  0.091
Method:                 Least Squares   F-statistic:                     18.24
Date:                Tue, 07 Mar 2023   Prob (F-statistic):           2.98e-14
Time:                        21:13:19   Log-Likelihood:                 705.81
No. Observations:                 690   AIC:                            -1402.
Df Residuals:                     685   BIC:                            -1379.
Df Model:                           4                                         
Covariance Type:            nonrobust                                         
                 coef    std err          t      P>|t|      [0.025      0.975]
-------------------------------------------------

# 到期日加权method?
# 可以思考一下：具体如何加权？

In [35]:
# Maturity-weighted implied vol per date.  Each row's raw weight is
# 1 / log(to_maturity); an earlier alternative, 1 / to_maturity, is left out.
calls['x'] = 1 / np.log(calls['to_maturity'])
tmp1 = calls.groupby('date')['x'].sum().rename('sum_mat').reset_index()

# Normalise the weights so they sum to one within each date, then add up the
# weighted implied vols per date.
tmp_call = calls.merge(tmp1, on='date')
tmp_call['w'] = tmp_call['x'].div(tmp_call['sum_mat'])
tmp_call['weighted_iv'] = tmp_call['imp_vol'].mul(tmp_call['w'])
weighted_ivs = tmp_call.groupby('date')['weighted_iv'].sum().reset_index()

# Attach the weekday of each date; the merge repeats a date once per matching
# row of `calls`, so duplicates are dropped afterwards.
calendar = calls[['date', 'week_day']]
weighted_ivs = weighted_ivs.merge(calendar, on='date').drop_duplicates()

# Relative change of the weighted vol versus the previous row.
previous_iv = weighted_ivs['weighted_iv'].shift(1)
weighted_ivs['imp_change'] = weighted_ivs['weighted_iv'] / previous_iv - 1
weighted_ivs = weighted_ivs.dropna()
OLS_evaluate(weighted_ivs[weighted_ivs['date'] >= 20190101])

0.0218
(3.59)
-0.0172
(-2.87)
-0.0057
(-0.95)
0.0077
(1.27)
0.0016
(0.25)
0.0339
                            OLS Regression Results                            
Dep. Variable:                      y   R-squared:                       0.041
Model:                            OLS   Adj. R-squared:                  0.034
Method:                 Least Squares   F-statistic:                     5.853
Date:                Tue, 07 Mar 2023   Prob (F-statistic):           0.000129
Time:                        21:13:20   Log-Likelihood:                 740.79
No. Observations:                 554   AIC:                            -1472.
Df Residuals:                     549   BIC:                            -1450.
Df Model:                           4                                         
Covariance Type:            nonrobust                                         
                 coef    std err          t      P>|t|      [0.025      0.975]
--------------------------------------------------

<statsmodels.regression.linear_model.RegressionResultsWrapper at 0x2b3d9f9a198>