In [57]:
import numpy as np
import pandas as pd
import statsmodels.api as sm
import matplotlib.pyplot as plt

bitcoin_data = pd.read_csv('bitcoin_bitstamp.csv')
stock_data = pd.read_csv('composite_data.csv')

bitcoin_data['log_price'] = np.log(bitcoin_data['close'])

stock_data['log_return'] = np.log(stock_data['Close']) - np.log(stock_data['Close'].shift(1))

stock_data['realized_volatility'] = stock_data['log_return'].rolling(window=20).apply(lambda x: np.sqrt(np.sum(x**2))*252/20, raw=False)
# stock_data['realized_volatility'] = stock_data['log_return'].apply(lambda x: x**2).rolling(20).sum()*np.sqrt(253/20)


merged_data = pd.merge(bitcoin_data[['Date', 'log_price']], stock_data[['Date', 'realized_volatility']], on='Date')
merged_data = merged_data.dropna()
merged_data

Unnamed: 0,Date,log_price,realized_volatility
20,2017-12-21,9.655027,0.244999
21,2017-12-22,9.547512,0.243696
22,2017-12-26,9.665512,0.244013
23,2017-12-27,9.639843,0.210697
24,2017-12-28,9.579838,0.211908
...,...,...,...
1372,2021-12-22,10.791769,0.726402
1373,2021-12-23,10.836635,0.730026
1374,2021-12-27,10.834038,0.692126
1375,2021-12-28,10.769396,0.672234


In [58]:
merged_data.describe()

Unnamed: 0,log_price,realized_volatility
count,1357.0,1357.0
mean,9.332817,0.626569
std,0.749908,0.5587
min,8.069577,0.190911
25%,8.893835,0.319871
50%,9.133153,0.464249
75%,9.380869,0.72527
max,11.120757,3.413446


In [59]:
import numpy as np
import pandas as pd
import statsmodels.api as sm
import matplotlib.pyplot as plt
from scipy.optimize import minimize

break_dates = ['2019-02-01', '2020-02-27', '2020-10-08', '2021-05-21']
for i, date in enumerate(break_dates):
    merged_data[f'break_{i+1}'] = (merged_data['Date'] >= date).astype(int)

# 독립 변수와 종속 변수 설정
merged_data['btc_diff'] = merged_data['log_price'] - merged_data['log_price'].shift(1)
merged_data['log_price_lag'] = merged_data['log_price'].shift(1)

# 필요한 행만 선택 (NaN 값 제거)
filtered_data = merged_data.dropna(subset=['realized_volatility', 'log_price_lag', 'btc_diff'])

def nls_model(params, y, X):
    alpha, beta, gamma, rho = params[0], params[1], params[2], params[3]
    breaks = params[4:]
    y_pred = alpha + beta * X['log_price_lag'] + gamma * (X['log_price'] - rho * X['log_price_lag'])
    for i in range(len(break_dates)):
        y_pred += breaks[i] * X[f'break_{i+1}']
    residuals = y - y_pred
    return np.sum(residuals**2)

# 초기 추정값 설정
initial_params = np.zeros(4 + len(break_dates))
initial_params[0] = 0  # alpha
initial_params[1] = 0  # beta
initial_params[2] = 0  # gamma
initial_params[3] = 0.5  # 초기 rho 값

# 최적화 수행
result = minimize(nls_model, initial_params, args=(filtered_data['realized_volatility'], filtered_data),
                  method='L-BFGS-B')

# 최적화 결과
params = result.x
alpha, beta, gamma, rho = params[0], params[1], params[2], params[3]
breaks = params[4:]

print(f"alpha: {alpha}, beta: {beta}, gamma: {gamma}, rho: {rho}")
for i in range(len(break_dates)):
    print(f"break_{i+1}: {breaks[i]}")

# 예측값 계산
filtered_data['RV_t_predicted'] = alpha + beta * filtered_data['log_price_lag'] + \
                                  gamma * (filtered_data['log_price'] - rho * filtered_data['log_price_lag'])
for i in range(len(break_dates)):
    filtered_data['RV_t_predicted'] += breaks[i] * filtered_data[f'break_{i+1}']

# 예측값 출력
print(filtered_data[['Date', 'RV_t_predicted']])

alpha: 3.786215082080097, beta: -0.2637578378002439, gamma: -0.20999532240652927, rho: 0.5058095478312354
break_1: -0.08419117579851286
break_2: 0.9331763737045542
break_3: -0.2847945841012282
break_4: -0.0011577642543791247
            Date  RV_t_predicted
21    2017-12-22        0.260227
22    2017-12-26        0.252386
23    2017-12-27        0.239186
24    2017-12-28        0.255831
25    2017-12-29        0.267181
...          ...             ...
1372  2021-12-22        0.381885
1373  2021-12-23        0.373468
1374  2021-12-27        0.366945
1375  2021-12-28        0.380929
1376  2021-12-29        0.395847

[1356 rows x 2 columns]


A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  filtered_data['RV_t_predicted'] = alpha + beta * filtered_data['log_price_lag'] + \
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  filtered_data['RV_t_predicted'] += breaks[i] * filtered_data[f'break_{i+1}']


In [60]:
filtered_data['RV_t_predicted'] = alpha + beta * filtered_data['log_price_lag'] + \
                                  gamma * (filtered_data['log_price'] - rho * filtered_data['log_price_lag'])
for i in range(len(break_dates)):
    filtered_data['RV_t_predicted'] += breaks[i] * filtered_data[f'break_{i+1}']

# OLS 회귀 모델 설정
X_ols = filtered_data[['log_price_lag', 'btc_diff']]
for i in range(len(break_dates)):
    X_ols[f'break_{i+1}'] = filtered_data[f'break_{i+1}']
X_ols['const'] = 1  # 상수항 추가
y_ols = filtered_data['realized_volatility']

# OLS 회귀 분석 수행
ols_model = sm.OLS(y_ols, X_ols).fit()

# 회귀 결과 요약
print(ols_model.summary())

                             OLS Regression Results                            
Dep. Variable:     realized_volatility   R-squared:                       0.420
Model:                             OLS   Adj. R-squared:                  0.417
Method:                  Least Squares   F-statistic:                     162.8
Date:                 Sun, 02 Jun 2024   Prob (F-statistic):          1.19e-155
Time:                         23:11:57   Log-Likelihood:                -765.22
No. Observations:                 1356   AIC:                             1544.
Df Residuals:                     1349   BIC:                             1581.
Df Model:                            6                                         
Covariance Type:             nonrobust                                         
                    coef    std err          t      P>|t|      [0.025      0.975]
---------------------------------------------------------------------------------
log_price_lag    -0.3675      0.035 

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  filtered_data['RV_t_predicted'] = alpha + beta * filtered_data['log_price_lag'] + \
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  filtered_data['RV_t_predicted'] += breaks[i] * filtered_data[f'break_{i+1}']
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  X_ols[f'break_{i+1}'] = filtered_data[f'brea

In [61]:
import numpy as np
from scipy.stats import t

# CW test를 위한 함수 정의
def clark_west_test(actual, pred1, pred2):
    f = (actual - pred1) ** 2 - (actual - pred2) ** 2 + (pred1 - pred2) ** 2
    mean_f = np.mean(f)
    std_f = np.std(f, ddof=1)
    t_statistic = mean_f / (std_f / np.sqrt(len(f)))
    
    # 자유도 계산 (표본 크기 - 1)
    df = len(f) - 1
    
    # p-value 계산 (단측검정)
    p_value = 1 - t.cdf(t_statistic, df=df)
    
    print(f"Mean of f: {mean_f}")
    print(f"Standard deviation of f: {std_f}")
    print(f"t-statistic: {t_statistic}")
    print(f"p-value: {p_value}")
    
    return t_statistic

cw_stat = clark_west_test(filtered_data['realized_volatility'], filtered_data['realized_volatility'].mean(), filtered_data['RV_t_predicted'])
print(cw_stat)

Mean of f: 0.26206567925665386
Standard deviation of f: 0.7483909813818523
t-statistic: 12.894706096025837
p-value: 0.0
12.894706096025837


In [62]:
import numpy as np
import pandas as pd
import statsmodels.api as sm
import matplotlib.pyplot as plt

filtered_data['RV_t_predicted'] = alpha + beta * filtered_data['log_price_lag'] + \
                                  gamma * (filtered_data['log_price'] - rho * filtered_data['log_price_lag'])
for i in range(len(break_dates)):
    filtered_data['RV_t_predicted'] += breaks[i] * filtered_data[f'break_{i+1}']

gamma_risk_aversion = 3  
theta_leverage = 6  
rf = 0.01  

filtered_data['sigma_sq'] = filtered_data['realized_volatility']**2
filtered_data['wt'] = (1 / gamma_risk_aversion) * ((theta_leverage * filtered_data['RV_t_predicted'] + (theta_leverage - 1) * rf) / (theta_leverage**2 * filtered_data['sigma_sq']))

filtered_data['Rp'] = filtered_data['wt'] * (filtered_data['RV_t_predicted'] - rf) + (1 - filtered_data['wt']) * rf
filtered_data['Var_Rp'] = filtered_data['wt']**2 * theta_leverage**2 * filtered_data['sigma_sq']
filtered_data['CER'] = filtered_data['Rp'] - 0.5 * (1 / gamma_risk_aversion) * filtered_data['Var_Rp']

print(filtered_data[['Date', 'wt', 'Rp', 'Var_Rp', 'CER']])


            Date        wt        Rp    Var_Rp       CER
21    2017-12-22  0.251230  0.070352  0.134941  0.047862
22    2017-12-26  0.243261  0.066531  0.126846  0.045390
23    2017-12-27  0.309755  0.077894  0.153341  0.052337
24    2017-12-28  0.326821  0.087074  0.172669  0.058296
25    2017-12-29  0.396258  0.107947  0.218349  0.071556
...          ...       ...       ...       ...       ...
1372  2021-12-22  0.041085  0.024868  0.032064  0.019524
1373  2021-12-23  0.039800  0.024068  0.030392  0.019003
1374  2021-12-27  0.043522  0.025100  0.032666  0.019655
1375  2021-12-28  0.047855  0.027272  0.037256  0.021063
1376  2021-12-29  0.056987  0.031418  0.046066  0.023741

[1356 rows x 5 columns]


A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  filtered_data['RV_t_predicted'] = alpha + beta * filtered_data['log_price_lag'] + \
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  filtered_data['RV_t_predicted'] += breaks[i] * filtered_data[f'break_{i+1}']
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  filtered_data['sigma_sq'] = filtered_data['r