# straddle 分析 交易日版本

In [2]:
import sys
from tqdm import tqdm
import pandas as pd
import os
import datetime
import OptionTools

import matplotlib.pyplot as plt
import statsmodels.api as sm
import numpy as np
from scipy.stats import norm

In [3]:
def delta_OLS(df, add=False):
    """Regress ``neutral_pnl`` on five weekday indicator columns.

    Rows containing any NaN are dropped first. One 0/1 column is built for
    each weekday code 0..4 (in that order) and ``neutral_pnl`` is regressed
    on them with ordinary least squares.

    Prints, in order: every coefficient (4 decimals) followed by its t-value
    in parentheses (2 decimals), the adjusted R-squared (4 decimals), and the
    full statsmodels summary.

    Parameters
    ----------
    df : pandas.DataFrame
        Must contain ``weekday`` and ``neutral_pnl`` columns.
    add : bool, default False
        When True, a constant column is prepended with ``sm.add_constant``.

    Returns
    -------
    The fitted statsmodels OLS results object.
    """
    clean = df.dropna()
    weekday_codes = clean['weekday']
    # One indicator vector per weekday code, stacked side by side.
    indicators = [(weekday_codes == code).astype(int).to_numpy()
                  for code in range(5)]
    design = np.column_stack(indicators)
    if add:
        design = sm.add_constant(design)
    fitted = sm.OLS(clean.neutral_pnl.values, design).fit()
    for coef, t_stat in zip(fitted.params, fitted.tvalues):
        print(round(coef, 4))
        print("({})".format(round(t_stat, 2)))
    print(round(fitted.rsquared_adj, 4))
    print(fitted.summary())
    return fitted

In [4]:
# Load the straddle table and keep rows dated 2016-11-01 or later (string
# comparison on `date`) with dtm >= 6. The explicit .copy() makes the filtered
# table an independent frame, so adding columns below no longer triggers
# pandas' SettingWithCopyWarning.
straddles = pd.read_csv('all_straddles_td.csv')
straddles = straddles[(straddles['date'] >= '2016-11-01')
                      & (straddles['dtm'] >= 6)].copy()
# straddles = straddles.drop_duplicates(subset='date').reset_index(drop=True)

# Normalisation base: call price / call delta minus put price / put delta.
straddles['principal'] = (
    straddles['call_close'] / straddles['call_delta'] -
    straddles['put_close'] / straddles['put_delta'])

# Three holding windows, each on its own copy of the relevant columns.
close_to_close = straddles[[
    'date', 'exe_date', 'weekday', 'dtm', 'call_delta', 'call_close',
    'call_next_close', 'put_delta', 'put_close', 'put_next_close', 'principal'
]].copy()
close_to_open = straddles[[
    'date', 'exe_date', 'weekday', 'dtm', 'call_delta', 'call_close',
    'call_next_open', 'put_delta', 'put_close', 'put_next_open', 'principal'
]].copy()
open_to_close = straddles[[
    'date', 'exe_date', 'weekday', 'dtm', 'call_delta', 'call_next_close',
    'call_next_open', 'put_delta', 'put_next_close', 'put_next_open',
    'principal'
]].copy()

# (frame, call start price, call end price, put start price, put end price)
_windows = (
    (close_to_close, 'call_close', 'call_next_close', 'put_close',
     'put_next_close'),
    (close_to_open, 'call_close', 'call_next_open', 'put_close',
     'put_next_open'),
    (open_to_close, 'call_next_open', 'call_next_close', 'put_next_open',
     'put_next_close'),
)
for _frame, _call_from, _call_to, _put_from, _put_to in _windows:
    # Price change of each leg over the window.
    _frame['call_ret'] = _frame[_call_to] - _frame[_call_from]
    _frame['put_ret'] = _frame[_put_to] - _frame[_put_from]
    # Delta-scaled combination of the two legs.
    _frame['ret'] = (_frame['call_ret'] / _frame['call_delta'] -
                     _frame['put_ret'] / _frame['put_delta'])
    # Every frame carries the same `principal` column copied from `straddles`,
    # so each one is normalised by its own copy.
    _frame['neutral_pnl'] = _frame['ret'] / _frame['principal']
    # Advance the weekday code by one, wrapping 4 -> 0.
    # NOTE(review): presumably re-labels each row with the weekday on which
    # the position is closed; this ignores holidays - confirm.
    _frame['weekday'] = (_frame['weekday'] + 1) % 5

# close_to_close['neutral_pnl'] = close_to_close['ret']
# close_to_open['neutral_pnl'] = close_to_open['ret']
# open_to_close['neutral_pnl'] = open_to_close['ret']

delta_OLS(close_to_close)
delta_OLS(close_to_open)
delta_OLS(open_to_close)

-0.0014
(-0.52)
-0.0095
(-3.37)
-0.001
(-0.36)
-0.0021
(-0.77)
0.0083
(3.03)
0.0042
                            OLS Regression Results                            
Dep. Variable:                      y   R-squared:                       0.005
Model:                            OLS   Adj. R-squared:                  0.004
Method:                 Least Squares   F-statistic:                     5.205
Date:                Tue, 07 Mar 2023   Prob (F-statistic):           0.000351
Time:                        22:13:40   Log-Likelihood:                 4508.0
No. Observations:                3989   AIC:                            -9006.
Df Residuals:                    3984   BIC:                            -8975.
Df Model:                           4                                         
Covariance Type:            nonrobust                                         
                 coef    std err          t      P>|t|      [0.025      0.975]
-----------------------------------------------

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  self._set_item(key, value)


<statsmodels.regression.linear_model.RegressionResultsWrapper at 0x2d7ff408860>

# 获取realized vol

In [6]:
# Build daily realized-volatility measures from the 5-minute ETF bar file.
# The raw `date` field is split on whitespace: the last token becomes `time`,
# the first token becomes `date`.
etfs = pd.read_csv('50etf_5m.csv')
etfs['time'] = etfs.date.apply(lambda x: x.split()[-1])
etfs['date'] = etfs.date.apply(lambda x: x.split()[0])
# Inclusive date window, compared as strings.
etfs = etfs[(etfs['date'] >= '2015-02-09') & (etfs['date'] <= '2021-04-14')]
etfs = etfs.reset_index(drop=True)

# One synthetic row per date stamped 09:30:00: the first `open` of the day is
# stored under `sectional_close`, paired with the first `yclose` of the day.
# NOTE(review): presumably `yclose` on the first bar is the previous session's
# close, making this row the overnight move - confirm against the data source.
opens = etfs.groupby('date').apply(lambda x: x['open'].iloc[0]).reset_index()
opens['yclose'] = etfs.groupby('date').apply(
    lambda x: x['yclose'].iloc[0]).values
opens.columns = ['date', 'sectional_close', 'yclose']
opens['time'] = "09:30:00"

selected = etfs[['date', 'sectional_close', 'yclose', 'time']][:]
# For the 09:35:00 bar, replace the reference price with that day's open.
# The assignment is positional: it assumes exactly one 09:35:00 row per date,
# in the same date order as `opens` - TODO confirm.
selected.loc[selected['time'] == '09:35:00',
             'yclose'] = opens['sectional_close'].values
# Append the synthetic opening rows and order everything by date and time.
selected = pd.concat(
    [selected, opens]).sort_values(by=['date', 'time']).reset_index(drop=True)
# Log change of each row's price relative to its reference price.
selected.loc[:, 'cont_change'] = np.log(selected['sectional_close'].values /
                                        selected['yclose'].values)

# rv: sqrt of the day's summed squared log changes (opening row included),
# scaled by sqrt(252).
rvs = selected.groupby('date').apply(lambda x: np.sqrt(
    (x['cont_change']**2).sum()) * np.sqrt(252)).reset_index()
rvs.columns = ['date', 'rv']
# on_rv: absolute log change of the 09:30:00 row only, scaled by sqrt(252).
overn = selected[selected['time'] == "09:30:00"].reset_index(drop=True)
overn['on_rv'] = np.abs(overn['cont_change']) * np.sqrt(252)
overn = overn[['date', 'on_rv']]
# id_rv: same formula as rv but using only rows after 09:30:00.
cm = selected[selected['time'] > "09:30:00"].groupby('date').apply(
    lambda x: np.sqrt(
        (x['cont_change']**2).sum()) * np.sqrt(252)).reset_index()
cm.columns = ['date', 'id_rv']

# Combine the three measures into one table keyed by date (inner joins).
rvs = pd.merge(rvs, overn, on='date')
rvs = pd.merge(rvs, cm, on='date')

In [7]:
# straddle_rv = pd.merge(close_to_close[['date', 'dtm', 'weekday', 'ret', 'neutral_pnl']], rvs[['date', 'rv']])
# Row-over-row changes of the volatility measures (first row becomes NaN).
rvs['rvc'] = rvs.rv - rvs.rv.shift(1)
rvs['pct_rvc'] = rvs.rv / rvs.rv.shift(1) - 1
rvs['rvc_id'] = rvs.id_rv - rvs.id_rv.shift(1)
rvs['rvc_on'] = rvs.on_rv - rvs.on_rv.shift(1)
rvs['abs_rvc'] = np.abs(rvs['pct_rvc'].values)
# Attach volatility columns to the close-to-close straddle rows. No `on=` is
# given, so pandas joins on the shared column name, which is `date` here.
# In the right-hand table every date (all but the last) is paired with:
#   * the NEXT row's value for rv, abs_rvc, rvc, rvc_id, id_rv   ([1:])
#   * the SAME row's value for rvc_on, on_rv, pct_rvc            ([:-1])
# NOTE(review): abs_rvc is |pct_rvc| yet the two are taken from different
# rows, and the overnight columns are not shifted like the others - confirm
# this mixed alignment is intended.
straddle_rv = pd.merge(
    close_to_close[[
        'date', 'exe_date', 'weekday', 'neutral_pnl', 'ret', 'dtm'
    ]],
    pd.DataFrame({
        'date': rvs.date[:-1].values,
        'rv': rvs.rv[1:].values,
        'abs_rvc': rvs.abs_rvc[1:].values,
        'rvc': rvs.rvc[1:].values,
        'rvc_id': rvs.rvc_id[1:].values,
        'rvc_on': rvs.rvc_on[:-1].values,
        'id_rv': rvs.id_rv[1:].values,
        'on_rv': rvs.on_rv[:-1].values,
        'pct_rvc': rvs.pct_rvc[:-1].values
    }))

In [9]:
def categorize(df, seperators, target_column, column_name):
    """Label each row with the index of the interval its value falls into.

    With separators ``s0 <= s1 <= ... <= s(n-1)`` the buckets are::

        0: value <= s0
        k: s(k-1) < value <= s(k)      for 1 <= k <= n-1
        n: value > s(n-1)

    Rows whose ``target_column`` is NaN match no bucket and are dropped.
    The result is ordered bucket by bucket (original row order within a
    bucket) and carries a fresh 0..m-1 index. ``df`` itself is not modified.

    Parameters
    ----------
    df : pandas.DataFrame
        Input table.
    seperators : sequence of numbers
        Upper bucket edges in ascending order. An empty sequence puts every
        non-NaN row into bucket 0.
    target_column : str
        Column whose values are bucketed.
    column_name : str
        Name of the integer bucket column added to the result.

    Returns
    -------
    pandas.DataFrame
    """
    edges = list(seperators)
    values = df[target_column]

    if not edges:
        # No edges: a single bucket (the original code raised IndexError here).
        only = df[values.notna()].copy()
        only[column_name] = 0
        return only.reset_index(drop=True)

    pieces = []
    last = len(edges)
    for level in range(last + 1):
        if level == 0:
            mask = values <= edges[0]
        elif level == last:
            mask = values > edges[-1]
        else:
            mask = (values > edges[level - 1]) & (values <= edges[level])
        # .copy() gives an independent frame, so adding the label column
        # does not raise SettingWithCopyWarning.
        piece = df[mask].copy()
        piece[column_name] = level
        pieces.append(piece)
    return pd.concat(pieces).reset_index(drop=True)


def RV_OLS(df, method='neutral_pnl', add=False):
    """Regress a return column on ten ``rv_level`` indicator columns.

    Rows containing any NaN are dropped first. One 0/1 column is built for
    each level 0..9 of ``rv_level`` and ``df[method]`` is regressed on them
    with ordinary least squares.

    Prints, in order: every coefficient (4 decimals) followed by its t-value
    in parentheses (2 decimals), the adjusted R-squared (4 decimals), and the
    full statsmodels summary.

    Parameters
    ----------
    df : pandas.DataFrame
        Must contain ``rv_level`` and the column named by ``method``.
    method : str, default 'neutral_pnl'
        Name of the dependent-variable column.
    add : bool, default False
        When True, a constant column is prepended with ``sm.add_constant``,
        matching ``delta_OLS``. Previously this argument was accepted but
        silently ignored. If every row falls in one of the ten levels the
        constant is collinear with the indicators.

    Returns
    -------
    The fitted statsmodels OLS results object.
    """
    df = df.dropna()
    y = df[method].values
    levels = df['rv_level']
    X = np.column_stack(
        [(levels == i).astype(int).to_numpy() for i in range(10)])
    if add:
        X = sm.add_constant(X)
    model = sm.OLS(y, X).fit()
    for coef, t_stat in zip(model.params, model.tvalues):
        print(round(coef, 4))
        print("({})".format(round(t_stat, 2)))
    print(round(model.rsquared_adj, 4))
    print(model.summary())
    return model

## 到期日与实现波动率的关系？

In [12]:
# Re-read the raw straddle CSV under a separate name.
# NOTE(review): `new_straddles` is not referenced anywhere else in the visible
# part of this notebook - confirm it is still needed.
new_straddles = pd.read_csv('all_straddles_td.csv')

In [13]:
# Join the close-to-close straddle rows with the volatility table. The shared
# column `date` is the join key. Each date is paired with the next row's
# rv / abs_rvc / rvc / rvc_id / id_rv and the same row's rvc_on / on_rv /
# pct_rvc.
straddle_rv = pd.merge(
    close_to_close[[
        'date', 'exe_date', 'weekday', 'neutral_pnl', 'ret', 'dtm'
    ]],
    pd.DataFrame({
        'date': rvs.date[:-1].values,
        'rv': rvs.rv[1:].values,
        'abs_rvc': rvs.abs_rvc[1:].values,
        'rvc': rvs.rvc[1:].values,
        'rvc_id': rvs.rvc_id[1:].values,
        'rvc_on': rvs.rvc_on[:-1].values,
        'id_rv': rvs.id_rv[1:].values,
        'on_rv': rvs.on_rv[:-1].values,
        'pct_rvc': rvs.pct_rvc[:-1].values
    }),
    on='date')
# Decile edges (10th..90th percentile) of rv, then bucket rows into levels 0-9.
rv_division = [
    np.percentile(straddle_rv.rv.values, 10 * j) for j in range(1, 10)
]
rv_df = categorize(straddle_rv, rv_division, 'rv', 'rv_level')
# sort_values returns a new frame; the result was previously discarded, so
# rv_df stayed in bucket order. Row order does not affect the OLS fit.
rv_df = rv_df.sort_values(by='date').reset_index(drop=True)
RV_OLS(rv_df)

-0.0248
(-6.54)
-0.0177
(-4.66)
-0.0173
(-4.52)
-0.0144
(-3.77)
0.0001
(0.02)
-0.0028
(-0.74)
0.0006
(0.15)
0.0119
(3.13)
0.0088
(2.31)
0.0454
(11.87)
0.0581
                            OLS Regression Results                            
Dep. Variable:                      y   R-squared:                       0.060
Model:                            OLS   Adj. R-squared:                  0.058
Method:                 Least Squares   F-statistic:                     28.33
Date:                Tue, 07 Mar 2023   Prob (F-statistic):           3.57e-48
Time:                        22:14:05   Log-Likelihood:                 4621.5
No. Observations:                3989   AIC:                            -9223.
Df Residuals:                    3979   BIC:                            -9160.
Df Model:                           9                                         
Covariance Type:            nonrobust                                         
                 coef    std err          t      P>|

<statsmodels.regression.linear_model.RegressionResultsWrapper at 0x2d7ff3fa630>

In [14]:
# Volatility table keyed by date: each date is paired with the next row's
# rv / abs_rvc / rvc / rvc_id / id_rv and the same row's rvc_on / on_rv /
# pct_rvc. Built once and reused for all three holding windows.
_rv_table = pd.DataFrame({
    'date': rvs.date[:-1].values,
    'rv': rvs.rv[1:].values,
    'abs_rvc': rvs.abs_rvc[1:].values,
    'rvc': rvs.rvc[1:].values,
    'rvc_id': rvs.rvc_id[1:].values,
    'rvc_on': rvs.rvc_on[:-1].values,
    'id_rv': rvs.id_rv[1:].values,
    'on_rv': rvs.on_rv[:-1].values,
    'pct_rvc': rvs.pct_rvc[:-1].values
})
_leg_cols = ['date', 'exe_date', 'weekday', 'neutral_pnl', 'ret', 'dtm']

# (holding-window frame, column used for the decile buckets)
# The third pass previously took its percentile edges from `id_rv` but then
# bucketed on `rv`; edges and bucketed column now always match.
# NOTE(review): `id_rv` is assumed to be the intended column for the
# open-to-close window - switch to 'rv' if that was the intent.
_passes = (
    (close_to_close, 'rv'),
    (close_to_open, 'rv'),
    (open_to_close, 'id_rv'),
)
for _leg, _level_col in _passes:
    straddle_rv = pd.merge(_leg[_leg_cols], _rv_table, on='date')
    rv_division = [
        np.percentile(straddle_rv[_level_col].values, 10 * j)
        for j in range(1, 10)
    ]
    rv_df = categorize(straddle_rv, rv_division, _level_col, 'rv_level')
    # The sorted frame was previously discarded; keep it.
    rv_df = rv_df.sort_values(by='date').reset_index(drop=True)
    _last_fit = RV_OLS(rv_df)

# As before, straddle_rv / rv_division / rv_df hold the open-to-close pass,
# and the last fit is shown as the cell's output value.
_last_fit

-0.0248
(-6.54)
-0.0177
(-4.66)
-0.0173
(-4.52)
-0.0144
(-3.77)
0.0001
(0.02)
-0.0028
(-0.74)
0.0006
(0.15)
0.0119
(3.13)
0.0088
(2.31)
0.0454
(11.87)
0.0581
                            OLS Regression Results                            
Dep. Variable:                      y   R-squared:                       0.060
Model:                            OLS   Adj. R-squared:                  0.058
Method:                 Least Squares   F-statistic:                     28.33
Date:                Tue, 07 Mar 2023   Prob (F-statistic):           3.57e-48
Time:                        22:14:05   Log-Likelihood:                 4621.5
No. Observations:                3989   AIC:                            -9223.
Df Residuals:                    3979   BIC:                            -9160.
Df Model:                           9                                         
Covariance Type:            nonrobust                                         
                 coef    std err          t      P>|

<statsmodels.regression.linear_model.RegressionResultsWrapper at 0x2d78c49eb38>

In [15]:
# Result Analysis: A basic case.
# Simple regression of neutral_pnl on the rv value itself, with an intercept,
# using whatever `rv_df` the previously executed cell left behind. Unlike the
# helper functions, no dropna() is applied here.
model = sm.OLS(rv_df.neutral_pnl.values, sm.add_constant(rv_df.rv.values))
model = model.fit()
print(model.summary())


def delta_OLS_effect(df, test_col, add=False):
    """Regress ``neutral_pnl`` on weekday indicators multiplied by a column.

    Rows containing any NaN are dropped first. For each weekday code 0..4 a
    regressor is formed as (0/1 weekday indicator) * ``df[test_col]``, so each
    coefficient is a per-weekday slope on ``test_col``.

    Prints, in order: every coefficient (4 decimals) followed by its t-value
    in parentheses (2 decimals), the adjusted R-squared (4 decimals), and the
    full statsmodels summary.

    Parameters
    ----------
    df : pandas.DataFrame
        Must contain ``weekday``, ``neutral_pnl`` and ``test_col``.
    test_col : str
        Column interacted with the weekday indicators.
    add : bool, default False
        When True, a constant column is prepended with ``sm.add_constant``.

    Returns
    -------
    The fitted statsmodels OLS results object.
    """
    clean = df.dropna()
    weekday_codes = clean['weekday']
    signal = clean[test_col]
    # Indicator-times-signal column for every weekday code.
    interactions = [
        ((weekday_codes == code).astype(int) * signal).values
        for code in range(5)
    ]
    design = np.column_stack(interactions)
    if add:
        design = sm.add_constant(design)
    fitted = sm.OLS(clean.neutral_pnl.values, design).fit()
    for coef, t_stat in zip(fitted.params, fitted.tvalues):
        print(round(coef, 4))
        print("({})".format(round(t_stat, 2)))
    print(round(fitted.rsquared_adj, 4))
    print(fitted.summary())
    return fitted

# Per-weekday slope on `rvc`, restricted to rows with dtm >= 63.
delta_OLS_effect(rv_df[rv_df['dtm'] >= 63], 'rvc')

                            OLS Regression Results                            
Dep. Variable:                      y   R-squared:                       0.083
Model:                            OLS   Adj. R-squared:                  0.083
Method:                 Least Squares   F-statistic:                     362.6
Date:                Tue, 07 Mar 2023   Prob (F-statistic):           1.93e-77
Time:                        22:14:06   Log-Likelihood:                 4951.5
No. Observations:                3989   AIC:                            -9899.
Df Residuals:                    3987   BIC:                            -9886.
Df Model:                           1                                         
Covariance Type:            nonrobust                                         
                 coef    std err          t      P>|t|      [0.025      0.975]
------------------------------------------------------------------------------
const         -0.0402      0.002    -17.351      0.0

<statsmodels.regression.linear_model.RegressionResultsWrapper at 0x2d78c481cc0>

In [16]:
# Size of the subset with weekday code 0 and rv at or below the second entry
# of `rv_division` (the 20th-percentile edge).
# NOTE(review): `rv_division` is whatever the last executed cell left behind;
# confirm it was computed from `rv`, since it is compared against `rv` here.
rv_df[(rv_df['rv'] <= rv_division[1]) & (rv_df['weekday'] == 0)].shape

(98, 15)

## 其他delta-neutral

In [17]:
daily = pd.read_csv('daily_underlying.csv')
daily.columns = ['date', 'underlying']


def _load_hedged_leg(folder, daily, delta_col, close_col, spot_col):
    """Build the delta-hedged single-option P&L table from one folder of CSVs.

    Every file in ``folder`` is read, restricted to ``dtm >= 6`` and
    inner-joined with ``daily`` on ``date``. Each row's ``neutral_pnl``
    combines the previous row's option price change with the previous row's
    delta applied to the row-over-row change of ``spot_col``, divided by the
    previous row's option price.

    Parameters
    ----------
    folder : str
        Directory containing one CSV per file to load.
    daily : pandas.DataFrame
        Table with ``date`` and ``underlying`` columns.
    delta_col, close_col : str
        Names of the option delta and option close columns in the files.
    spot_col : str
        Column whose change is the hedge P&L ('imp_und' or 'underlying').

    Returns
    -------
    pandas.DataFrame with columns date, dtm, weekday, neutral_pnl, und_diff,
    restricted to dates >= '2016-11-01'. An empty DataFrame if the folder
    holds no files.
    """
    frames = []
    # sorted() makes the row order reproducible; os.listdir order is arbitrary.
    for fn in sorted(os.listdir(folder)):
        tmp = pd.read_csv(os.path.join(folder, fn))
        tmp = pd.merge(tmp[tmp['dtm'] >= 6], daily, on='date')
        tmp['imp_und'] = tmp.stock_price
        tmp['und_diff'] = tmp['underlying'] - tmp['imp_und']
        tmp['stock_pnl'] = tmp[spot_col] - tmp[spot_col].shift(1)
        tmp['last_delta'] = tmp[delta_col].shift(1)
        tmp['option_ret'] = (tmp.next_close - tmp[close_col]).shift(1)
        tmp['cost'] = tmp[close_col].shift(1)
        tmp['neutral_pnl'] = (-tmp.last_delta * tmp.stock_pnl +
                              tmp.option_ret) / tmp.cost
        frames.append(tmp[['date', 'dtm', 'weekday', 'neutral_pnl',
                           'und_diff']].dropna(how='any'))
    if not frames:
        return pd.DataFrame()
    # Concatenate once and filter once, instead of re-concatenating and
    # re-filtering the growing table on every file.
    combined = pd.concat(frames)
    return combined[combined['date'] >= '2016-11-01']


# NOTE(review): the call leg hedges with the change in `imp_und` while the put
# leg uses `underlying`. The difference is preserved from the original code -
# confirm it is intentional.
long_call = _load_hedged_leg('new_td/call', daily, 'call_delta', 'call_close',
                             'imp_und')
long_put = _load_hedged_leg('new_td/put', daily, 'put_delta', 'put_close',
                            'underlying')

In [18]:
# Join the delta-hedged long-call rows with the volatility table on `date`.
# Each date is paired with the next row's rv / rvc / rvc_id / id_rv and the
# same row's rvc_on / on_rv / pct_rvc.
# NOTE(review): long_call's neutral_pnl is built from shift(1) values, and the
# long-put cell keys its table on rvs.date[1:] instead of [:-1] - confirm
# which date alignment is intended here.
lc_rv = pd.merge(
    long_call[['date', 'weekday', 'neutral_pnl']],
    pd.DataFrame({
        'date': rvs.date[:-1].values,
        'rv': rvs.rv[1:].values,
        'rvc': rvs.rvc[1:].values,
        'rvc_id': rvs.rvc_id[1:].values,
        'rvc_on': rvs.rvc_on[:-1].values,
        'id_rv': rvs.id_rv[1:].values,
        'on_rv': rvs.on_rv[:-1].values,
        'pct_rvc': rvs.pct_rvc[:-1].values
    }),
    on='date')

# One decile-bucket regression per volatility measure.
for _level_col in ('rvc', 'id_rv', 'on_rv'):
    rv_division = [
        np.percentile(lc_rv[_level_col].values, 10 * j) for j in range(1, 10)
    ]
    rv_df = categorize(lc_rv, rv_division, _level_col, 'rv_level')
    # The sorted frame was previously discarded; keep it.
    rv_df = rv_df.sort_values(by='date').reset_index(drop=True)
    _last_fit = RV_OLS(rv_df)

# rv_division / rv_df hold the `on_rv` pass, as before; show the last fit.
_last_fit

0.0248
(5.86)
-0.0034
(-0.81)
0.0122
(2.88)
-0.01
(-2.36)
-0.0072
(-1.7)
-0.0137
(-3.24)
-0.0076
(-1.79)
-0.0101
(-2.4)
0.0014
(0.34)
0.0011
(0.26)
0.0154
                            OLS Regression Results                            
Dep. Variable:                      y   R-squared:                       0.018
Model:                            OLS   Adj. R-squared:                  0.015
Method:                 Least Squares   F-statistic:                     7.836
Date:                Tue, 07 Mar 2023   Prob (F-statistic):           1.55e-11
Time:                        22:14:13   Log-Likelihood:                 4171.9
No. Observations:                3936   AIC:                            -8324.
Df Residuals:                    3926   BIC:                            -8261.
Df Model:                           9                                         
Covariance Type:            nonrobust                                         
                 coef    std err          t      P>|t| 

<statsmodels.regression.linear_model.RegressionResultsWrapper at 0x2d7ff561240>

In [19]:
# Join the delta-hedged long-put rows with the volatility table on `date`.
# Here each date (all but the first) is paired with its OWN rv and id_rv, and
# with the PREVIOUS row's on_rv.
# NOTE(review): on_rv is offset by one row relative to rv / id_rv - confirm
# that the lag is intended.
lp_rv = pd.merge(
    long_put[['date', 'weekday', 'neutral_pnl']],
    pd.DataFrame({
        'date': rvs.date[1:].values,
        'rv': rvs.rv[1:].values,
        'id_rv': rvs.id_rv[1:].values,
        'on_rv': rvs.on_rv[:-1].values
    }),
    on='date')

# One decile-bucket regression per volatility measure.
for _level_col in ('rv', 'id_rv', 'on_rv'):
    rv_division = [
        np.percentile(lp_rv[_level_col].values, 10 * j) for j in range(1, 10)
    ]
    rv_df = categorize(lp_rv, rv_division, _level_col, 'rv_level')
    # The sorted frame was previously discarded; keep it.
    rv_df = rv_df.sort_values(by='date').reset_index(drop=True)
    _last_fit = RV_OLS(rv_df)

# rv_division / rv_df hold the `on_rv` pass, as before; show the last fit.
_last_fit

-0.0307
(-6.2)
-0.0169
(-3.41)
-0.0135
(-2.71)
-0.014
(-2.82)
0.0013
(0.27)
-0.0016
(-0.33)
0.0068
(1.36)
0.0137
(2.76)
0.01
(2.02)
0.045
(9.05)
0.0372
                            OLS Regression Results                            
Dep. Variable:                      y   R-squared:                       0.039
Model:                            OLS   Adj. R-squared:                  0.037
Method:                 Least Squares   F-statistic:                     17.88
Date:                Tue, 07 Mar 2023   Prob (F-statistic):           2.05e-29
Time:                        22:14:13   Log-Likelihood:                 3544.8
No. Observations:                3936   AIC:                            -7070.
Df Residuals:                    3926   BIC:                            -7007.
Df Model:                           9                                         
Covariance Type:            nonrobust                                         
                 coef    std err          t      P>|t|    

<statsmodels.regression.linear_model.RegressionResultsWrapper at 0x2d78c4812e8>

# 实现波动率变化与straddle return

In [20]:
# Decile buckets of abs_rvc on whatever `straddle_rv` the previously executed
# cell left behind, then regress the raw `ret` column on the buckets.
rv_division = [
    np.percentile(straddle_rv.abs_rvc.values, 10 * j) for j in range(1, 10)
]
rv_df = categorize(straddle_rv, rv_division, 'abs_rvc', 'rv_level')
# sort_values returns a new frame; the result was previously discarded.
rv_df = rv_df.sort_values(by='date').reset_index(drop=True)
RV_OLS(rv_df, 'ret')

-0.0019
(-1.37)
-0.0023
(-1.64)
-0.0032
(-2.24)
-0.0026
(-1.86)
0.0001
(0.08)
-0.0007
(-0.49)
-0.0031
(-2.19)
0.0028
(1.99)
-0.0017
(-1.21)
0.0109
(7.61)
0.0178
                            OLS Regression Results                            
Dep. Variable:                      y   R-squared:                       0.020
Model:                            OLS   Adj. R-squared:                  0.018
Method:                 Least Squares   F-statistic:                     9.029
Date:                Tue, 07 Mar 2023   Prob (F-statistic):           1.28e-13
Time:                        22:14:14   Log-Likelihood:                 8549.7
No. Observations:                3989   AIC:                        -1.708e+04
Df Residuals:                    3979   BIC:                        -1.702e+04
Df Model:                           9                                         
Covariance Type:            nonrobust                                         
                 coef    std err          t      

<statsmodels.regression.linear_model.RegressionResultsWrapper at 0x2d7ff561208>

In [21]:
def leveled_analysis(df, target_col):
    """Bucket ``df`` into deciles of ``target_col`` and run ``RV_OLS`` on it.

    The 10th..90th percentiles of ``target_col`` are used as bucket edges,
    rows are labelled with ``rv_level`` 0..9 via ``categorize``, the edges are
    printed, and the bucket regression is run and printed by ``RV_OLS``.

    Parameters
    ----------
    df : pandas.DataFrame
        Must contain ``date``, ``target_col`` and the columns ``RV_OLS`` needs.
    target_col : str
        Column whose deciles define the buckets.

    Returns
    -------
    None
    """
    rv_division = [
        np.percentile(df[target_col].values, 10 * j) for j in range(1, 10)
    ]
    rv_df = categorize(df, rv_division, target_col, 'rv_level')
    # sort_values returns a new frame; the result was previously discarded.
    rv_df = rv_df.sort_values(by='date').reset_index(drop=True)
    print('Percentiles: ', rv_division)
    RV_OLS(rv_df)


# Decile analysis of `rvc` on whatever `straddle_rv` the previously executed
# cell left behind.
leveled_analysis(straddle_rv, 'rvc')

Percentiles:  [-0.0716538020686379, -0.04029479547403833, -0.021582989192098542, -0.010086572710607145, -0.0015489531344572982, 0.010400800387845924, 0.02247688754458008, 0.040311917795589645, 0.06784614059471956]
-0.0184
(-5.18)
-0.0073
(-2.06)
-0.0123
(-3.43)
-0.013
(-3.66)
-0.0158
(-4.45)
-0.0034
(-0.97)
-0.0023
(-0.65)
0.0031
(0.87)
0.0104
(2.92)
0.0446
(12.55)
0.0553
                            OLS Regression Results                            
Dep. Variable:                      y   R-squared:                       0.057
Model:                            OLS   Adj. R-squared:                  0.055
Method:                 Least Squares   F-statistic:                     26.95
Date:                Tue, 07 Mar 2023   Prob (F-statistic):           1.05e-45
Time:                        22:14:14   Log-Likelihood:                 4895.9
No. Observations:                3989   AIC:                            -9772.
Df Residuals:                    3979   BIC:                            

In [22]:
# Volatility table keyed by date: each date is paired with the next row's
# rv / abs_rvc / rvc / rvc_id / id_rv and the same row's rvc_on / on_rv /
# pct_rvc. Built once and reused for all three holding windows.
_rv_table = pd.DataFrame({
    'date': rvs.date[:-1].values,
    'rv': rvs.rv[1:].values,
    'abs_rvc': rvs.abs_rvc[1:].values,
    'rvc': rvs.rvc[1:].values,
    'rvc_id': rvs.rvc_id[1:].values,
    'rvc_on': rvs.rvc_on[:-1].values,
    'id_rv': rvs.id_rv[1:].values,
    'on_rv': rvs.on_rv[:-1].values,
    'pct_rvc': rvs.pct_rvc[:-1].values
})
_leg_cols = ['date', 'exe_date', 'weekday', 'neutral_pnl', 'ret', 'dtm']

# Same decile-bucket regression on `rvc` for each holding window.
for _leg in (close_to_close, close_to_open, open_to_close):
    straddle_rv = pd.merge(_leg[_leg_cols], _rv_table, on='date')
    rv_division = [
        np.percentile(straddle_rv.rvc.values, 10 * j) for j in range(1, 10)
    ]
    rv_df = categorize(straddle_rv, rv_division, 'rvc', 'rv_level')
    # The sorted frame was previously discarded; keep it.
    rv_df = rv_df.sort_values(by='date').reset_index(drop=True)
    _last_fit = RV_OLS(rv_df)

# As before, straddle_rv / rv_division / rv_df hold the open-to-close pass,
# and the last fit is shown as the cell's output value.
_last_fit

-0.0241
(-6.4)
-0.0104
(-2.77)
-0.0121
(-3.19)
-0.0141
(-3.75)
-0.0164
(-4.34)
-0.0039
(-1.03)
-0.0033
(-0.86)
0.0032
(0.84)
0.0115
(3.05)
0.0589
(15.64)
0.0782
                            OLS Regression Results                            
Dep. Variable:                      y   R-squared:                       0.080
Model:                            OLS   Adj. R-squared:                  0.078
Method:                 Least Squares   F-statistic:                     38.58
Date:                Tue, 07 Mar 2023   Prob (F-statistic):           2.34e-66
Time:                        22:14:14   Log-Likelihood:                 4664.5
No. Observations:                3989   AIC:                            -9309.
Df Residuals:                    3979   BIC:                            -9246.
Df Model:                           9                                         
Covariance Type:            nonrobust                                         
                 coef    std err          t      

<statsmodels.regression.linear_model.RegressionResultsWrapper at 0x2d78c9d3a58>