In [39]:
import numpy as np
import pandas as pd
import statsmodels.api as sm
import matplotlib.pyplot as plt

bitcoin_data = pd.read_csv('bitcoin_bitstamp.csv')
stock_data = pd.read_csv('composite_data.csv')

bitcoin_data['log_price'] = np.log(bitcoin_data['close'])

stock_data['log_return'] = np.log(stock_data['Close']) - np.log(stock_data['Close'].shift(1))

stock_data['realized_volatility'] = stock_data['log_return'].rolling(window=20).apply(lambda x: np.sqrt(np.sum(252 * x**2)), raw=False)

merged_data = pd.merge(bitcoin_data[['Date', 'log_price']], stock_data[['Date', 'realized_volatility']], on='Date')
merged_data = merged_data.dropna()

In [40]:
merged_data.describe()

Unnamed: 0,log_price,realized_volatility
count,1357.0,1357.0
mean,9.332817,0.789402
std,0.749908,0.703896
min,8.069577,0.240525
25%,8.893835,0.403
50%,9.133153,0.584899
75%,9.380869,0.913755
max,11.120757,4.300537


In [46]:
import numpy as np
import pandas as pd
import statsmodels.api as sm
import matplotlib.pyplot as plt
from scipy.optimize import minimize

break_dates = ['2019-02-01', '2020-02-27', '2020-10-08', '2021-05-21']
for i, date in enumerate(break_dates):
    merged_data[f'break_{i+1}'] = (merged_data['Date'] >= date).astype(int)

# 독립 변수와 종속 변수 설정
merged_data['btc_diff'] = merged_data['log_price'] - merged_data['log_price'].shift(1)
merged_data['log_price_lag'] = merged_data['log_price'].shift(1)

# 필요한 행만 선택 (NaN 값 제거)
filtered_data = merged_data.dropna(subset=['realized_volatility', 'log_price_lag', 'btc_diff'])

def nls_model(params, y, X):
    alpha, beta, gamma, rho = params[0], params[1], params[2], params[3]
    breaks = params[4:]
    y_pred = alpha + beta * X['log_price_lag'] + gamma * (X['log_price'] - rho * X['log_price_lag'])
    for i in range(len(break_dates)):
        y_pred += breaks[i] * X[f'break_{i+1}']
    residuals = y - y_pred
    return np.sum(residuals**2)

# 초기 추정값 설정
initial_params = np.zeros(4 + len(break_dates))
initial_params[0] = 0  # alpha
initial_params[1] = 0  # beta
initial_params[2] = 0  # gamma
initial_params[3] = 0.5  # 초기 rho 값

# 최적화 수행
result = minimize(nls_model, initial_params, args=(filtered_data['realized_volatility'], filtered_data),
                  method='L-BFGS-B')

# 최적화 결과
params = result.x
alpha, beta, gamma, rho = params[0], params[1], params[2], params[3]
breaks = params[4:]

print(f"alpha: {alpha}, beta: {beta}, gamma: {gamma}, rho: {rho}")
for i in range(len(break_dates)):
    print(f"break_{i+1}: {breaks[i]}")

# 예측값 계산
filtered_data['RV_t_predicted'] = alpha + beta * filtered_data['log_price_lag'] + \
                                  gamma * (filtered_data['log_price'] - rho * filtered_data['log_price_lag'])
for i in range(len(break_dates)):
    filtered_data['RV_t_predicted'] += breaks[i] * filtered_data[f'break_{i+1}']

# 예측값 출력
print(filtered_data[['Date', 'RV_t_predicted']])

alpha: 4.770255702047686, beta: -0.33139962693888164, gamma: -0.26477938995919287, rho: 0.5027337018686939
break_1: -0.10592940941691166
break_2: 1.175377562635887
break_3: -0.3586180063195051
break_4: -0.0011768624180347373
            Date  RV_t_predicted
21    2017-12-22        0.327814
22    2017-12-26        0.317888
23    2017-12-27        0.301287
24    2017-12-28        0.322265
25    2017-12-29        0.336554
...          ...             ...
1372  2021-12-22        0.481349
1373  2021-12-23        0.470733
1374  2021-12-27        0.462525
1375  2021-12-28        0.480156
1376  2021-12-29        0.498943

[1356 rows x 2 columns]


A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  filtered_data['RV_t_predicted'] = alpha + beta * filtered_data['log_price_lag'] + \
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  filtered_data['RV_t_predicted'] += breaks[i] * filtered_data[f'break_{i+1}']


In [50]:
col_list = filtered_data.columns
col_list = list(col_list )
y = filtered_data['realized_volatility']

if 'realized_volatility' in col_list:
    col_list.remove('realized_volatility')
    
train_size = int(len(filtered_data) * 0.75)
X_train, X_test = filtered_data[col_list][:train_size], filtered_data[col_list][train_size:]
y_train, y_test = y[:train_size], y[train_size:]

# OLS 회귀 모델 설정
X_ols = X_train[['log_price_lag', 'btc_diff']]  #   filtered_data[['log_price_lag', 'btc_diff']] 
for i in range(len(break_dates)):
    X_ols[f'break_{i+1}'] = filtered_data[f'break_{i+1}'] # filtered_data[f'break_{i+1}'] # 
X_ols['const'] = 1  # 상수항 추가
y_ols = y_train # y_train #  filtered_data['realized_volatility'] 

# OLS 회귀 분석 수행
ols_model = sm.OLS(y_ols, X_ols).fit()

# 회귀 결과 요약
print(ols_model.summary())

                             OLS Regression Results                            
Dep. Variable:     realized_volatility   R-squared:                       0.448
Model:                             OLS   Adj. R-squared:                  0.446
Method:                  Least Squares   F-statistic:                     205.4
Date:                 Wed, 05 Jun 2024   Prob (F-statistic):          5.70e-129
Time:                         23:21:50   Log-Likelihood:                -908.74
No. Observations:                 1017   AIC:                             1827.
Df Residuals:                     1012   BIC:                             1852.
Df Model:                            4                                         
Covariance Type:             nonrobust                                         
                    coef    std err          t      P>|t|      [0.025      0.975]
---------------------------------------------------------------------------------
log_price_lag    -0.6284      0.062 

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  X_ols[f'break_{i+1}'] = filtered_data[f'break_{i+1}'] # filtered_data[f'break_{i+1}'] #
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  X_ols[f'break_{i+1}'] = filtered_data[f'break_{i+1}'] # filtered_data[f'break_{i+1}'] #
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  X_ols[f'break_{i+1}'] = filte

In [56]:
filtered_data.head()

Unnamed: 0,Date,log_price,realized_volatility,break_1,break_2,break_3,break_4,btc_diff,log_price_lag,RV_t_predicted
21,2017-12-22,9.547512,0.307029,0,0,0,0,-0.107515,9.655027,0.327814
22,2017-12-26,9.665512,0.307428,0,0,0,0,0.118,9.547512,0.317888
23,2017-12-27,9.639843,0.265454,0,0,0,0,-0.025669,9.665512,0.301287
24,2017-12-28,9.579838,0.266978,0,0,0,0,-0.060005,9.639843,0.322265
25,2017-12-29,9.570808,0.247615,0,0,0,0,-0.00903,9.579838,0.336554


In [55]:
import numpy as np
from scipy.stats import t

def predict(params, X):
    alpha, beta, gamma, rho = params[0], params[1], params[2], params[3]
    breaks = params[4:]
    y_pred = alpha + beta * X['log_price_lag'] + gamma * (X['log_price'] - rho * X['log_price_lag'])
    for i in range(len(break_dates)):
        y_pred += breaks[i] * X[f'break_{i+1}']
    return y_pred

# 함수 정의: RMSE 계산
def calculate_mse(actual, predicted):
    mse = np.mean((actual - predicted) ** 2)
    sqrt_mse = np.sqrt(mse)
    return sqrt_mse

# 함수 정의: Clark and West 테스트
def clark_west_test(actual, pred1, pred2):
    f = (actual - pred1) ** 2 - (actual - pred2) ** 2 + (pred1 - pred2) ** 2
    mean_f = np.mean(f)
    std_f = np.std(f, ddof=1)
    t_statistic = mean_f / (std_f / np.sqrt(len(f)))
    
    df = len(f) - 1
    p_value = 1 - t.cdf(t_statistic, df=df)
    
    return t_statistic, p_value

# Out-of-sample 예측 수행 함수
def out_of_sample_evaluation(h, params):
    test_data = filtered_data[:h+1]
    print(len(test_data))
#     test_data = filtered_data.iloc[:h+1]

    # 예측 수행 (test_data 사용)
    test_data['RV_t_predicted'] = predict(params, test_data)
    
    # Relative RMSE 계산
    mse_model = calculate_mse(test_data['realized_volatility'], test_data['RV_t_predicted'])
    mse_benchmark = calculate_mse(test_data['realized_volatility'], test_data['realized_volatility'].mean())
    
    rmse = mse_model/mse_benchmark
    
    # Clark and West 테스트 수행
    cw_stat, p_value = clark_west_test(test_data['realized_volatility'], test_data['RV_t_predicted'], test_data['realized_volatility'].mean())
    
    return rmse, cw_stat, p_value

# 모델 최적화 (전체 데이터를 사용하여 최적화 수행)
result = minimize(nls_model, initial_params, args=(filtered_data['realized_volatility'], filtered_data), method='L-BFGS-B')
# result = minimize(nls_model, initial_params, args=(filtered_data['realized_volatility'], filtered_data), method='L-BFGS-B')
params = result.x

# h 값에 따른 평가 수행
h_values = [30, 60, 120]
for h in h_values:
    rmse, cw_stat, p_value = out_of_sample_evaluation(h, params)
    print(f"Out-of-sample evaluation for h={h}:")
    print(f"Relative RMSE: {rmse}")
    print(f"Clark and West test statistic: {cw_stat}, p-value: {p_value}")
    print(f"------------------------------------------------------------")

31
Out-of-sample evaluation for h=30:
Relative RMSE: 0.5956039461721295
Clark and West test statistic: -0.9800360298938584, p-value: 0.8325469557813037
------------------------------------------------------------
61
Out-of-sample evaluation for h=60:
Relative RMSE: 1.0383453658162753
Clark and West test statistic: 4.080460552999244, p-value: 6.734538966868264e-05
------------------------------------------------------------
121
Out-of-sample evaluation for h=120:
Relative RMSE: 1.043056434558777
Clark and West test statistic: 4.624019216735917, p-value: 4.784035422367161e-06
------------------------------------------------------------


A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  test_data['RV_t_predicted'] = predict(params, test_data)


In [18]:
import numpy as np
import pandas as pd
import statsmodels.api as sm
import matplotlib.pyplot as plt

filtered_data['RV_t_predicted'] = alpha + beta * filtered_data['log_price_lag'] + \
                                  gamma * (filtered_data['log_price'] - rho * filtered_data['log_price_lag'])
for i in range(len(break_dates)):
    filtered_data['RV_t_predicted'] += breaks[i] * filtered_data[f'break_{i+1}']

gamma_risk_aversion = 3  
theta_leverage = 6  
rf = 0.01  

filtered_data['sigma_sq'] = filtered_data['realized_volatility']**2
filtered_data['wt'] = (1 / gamma_risk_aversion) * ((theta_leverage * filtered_data['RV_t_predicted'] + (theta_leverage - 1) * rf) / (theta_leverage**2 * filtered_data['sigma_sq']))

filtered_data['Rp'] = filtered_data['wt'] * theta_leverage * (filtered_data['RV_t_predicted'] - rf) + (1 - filtered_data['wt']) * rf
filtered_data['Var_Rp'] = filtered_data['wt']**2 * theta_leverage**2 * filtered_data['sigma_sq']
filtered_data['CER'] = filtered_data['Rp'] - 0.5 * (1 / gamma_risk_aversion) * filtered_data['Var_Rp']

print(filtered_data[['Date', 'wt', 'Rp', 'Var_Rp', 'CER']])
print(filtered_data['Rp']/filtered_data['Var_Rp'])


            Date        wt          Rp     Var_Rp        CER
21    2017-12-22 -2.568838   67.500571  22.394103  63.768220
22    2017-12-26 -2.556493   67.028128  22.237116  63.321942
23    2017-12-27 -3.454487   91.241896  30.272452  86.196487
24    2017-12-28 -3.398874   89.347409  29.643181  84.406879
25    2017-12-29 -3.932073  102.862936  34.127029  97.175098
...          ...       ...         ...        ...        ...
1372  2021-12-22 -0.327106    9.727913   3.226220   9.190209
1373  2021-12-23 -0.324353    9.660528   3.203868   9.126550
1374  2021-12-27 -0.361860   10.806565   3.584381  10.209168
1375  2021-12-28 -0.382474   11.388438   3.777514  10.758852
1376  2021-12-29 -0.436381   12.925390   4.287675  12.210778

[1356 rows x 5 columns]
21      3.014212
22      3.014246
23      3.014024
24      3.014096
25      3.014119
          ...   
1372    3.015266
1373    3.015270
1374    3.014904
1375    3.014797
1376    3.014545
Length: 1356, dtype: float64


A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  filtered_data['RV_t_predicted'] = alpha + beta * filtered_data['log_price_lag'] + \
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  filtered_data['RV_t_predicted'] += breaks[i] * filtered_data[f'break_{i+1}']
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  filtered_data['sigma_sq'] = filtered_data['r

# PCA

In [10]:
import pandas as pd
from sklearn.decomposition import PCA
from sklearn.preprocessing import StandardScaler
import matplotlib.pyplot as plt
from sklearn.linear_model import LinearRegression
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import MinMaxScaler

# Load the dataset
file_path = 'bitcoin_full_data.csv'
df = pd.read_csv(file_path)

# Display the column names to identify potential features for PCA
print(df.columns)

# Assuming the columns 'price', 'volume', and other indicators are in the dataset
# Replace these with actual column names from your dataset that seem meaningful
features = ['high', 'low', 'volumefrom', 'volumeto', 'close']  # Example features

# Standardize the features
x = df[features].values
x = StandardScaler().fit_transform(x)

# Apply PCA
pca = PCA(n_components=1)
principal_components = pca.fit_transform(x)

scaler = MinMaxScaler()
principal_components = scaler.fit_transform(principal_components)

# Create a DataFrame with the principal component
pca_df = pd.DataFrame(data=principal_components, columns=['PC1'])
pca_df['Date'] = df['Date']  # Ensure you have a date column for merging later

pca_df.describe()

Index(['Unnamed: 0', 'Date', 'high', 'low', 'open', 'volumefrom', 'volumeto',
       'close', 'conversionType', 'conversionSymbol', 'top_tier_volume_quote',
       'top_tier_volume_base', 'top_tier_volume_total', 'cccagg_volume_quote',
       'cccagg_volume_base', 'cccagg_volume_total', 'total_volume_quote',
       'total_volume_base', 'total_volume_total', 'volume'],
      dtype='object')


Unnamed: 0,PC1
count,4008.0
mean,0.181687
std,0.195989
min,0.0
25%,0.026905
50%,0.104242
75%,0.285144
max,1.0


In [11]:
stock_data = pd.read_csv('composite_data.csv')

stock_data['log_return'] = np.log(stock_data['Close']) - np.log(stock_data['Close'].shift(1))

stock_data['realized_volatility'] = stock_data['log_return'].rolling(window=20).apply(lambda x: np.sqrt(np.sum(252 * x**2)), raw=False)

merged_data_2 = pd.merge(pca_df[['Date', 'PC1']], stock_data[['Date', 'realized_volatility']], on='Date')
merged_data_2 = merged_data_2.dropna()

In [12]:
merged_data_2.head()

Unnamed: 0,Date,PC1,realized_volatility
20,2017-12-21,0.283567,0.308669
21,2017-12-22,0.414604,0.307029
22,2017-12-26,0.257651,0.307428
23,2017-12-27,0.258907,0.265454
24,2017-12-28,0.259951,0.266978


In [13]:
import numpy as np
import pandas as pd
import statsmodels.api as sm
import matplotlib.pyplot as plt
from scipy.optimize import minimize

break_dates = ['2019-02-01', '2020-02-27', '2020-10-08', '2021-05-21']
for i, date in enumerate(break_dates):
    merged_data_2[f'break_{i+1}'] = (merged_data_2['Date'] >= date).astype(int)

# 독립 변수와 종속 변수 설정
merged_data_2['btc_diff'] = merged_data_2['PC1'] - merged_data_2['PC1'].shift(1)
merged_data_2['log_price_lag'] = merged_data_2['PC1'].shift(1)

# 필요한 행만 선택 (NaN 값 제거)
filtered_data_2 = merged_data_2.dropna(subset=['realized_volatility', 'log_price_lag', 'btc_diff'])

def nls_model(params, y, X):
    alpha, beta, gamma, rho = params[0], params[1], params[2], params[3]
    breaks = params[4:]
    y_pred = alpha + beta * X['PC1'] + gamma * (X['PC1'] - rho * X['PC1'])
    for i in range(len(break_dates)):
        y_pred += breaks[i] * X[f'break_{i+1}']
    residuals = y - y_pred
    return np.sum(residuals**2)

# 초기 추정값 설정
initial_params = np.zeros(4 + len(break_dates))
initial_params[0] = 0  # alpha
initial_params[1] = 0  # beta
initial_params[2] = 0  # gamma
initial_params[3] = 0.5  # 초기 rho 값

# 최적화 수행
result = minimize(nls_model, initial_params, args=(filtered_data_2['realized_volatility'], filtered_data_2),
                  method='L-BFGS-B')

# 최적화 결과
params = result.x
alpha, beta, gamma, rho = params[0], params[1], params[2], params[3]
breaks = params[4:]

print(f"alpha: {alpha}, beta: {beta}, gamma: {gamma}, rho: {rho}")
for i in range(len(break_dates)):
    print(f"break_{i+1}: {breaks[i]}")

# 예측값 계산
filtered_data_2['RV_t_predicted'] = alpha + beta * filtered_data_2['log_price_lag'] + \
                                  gamma * (filtered_data_2['PC1'] - rho * filtered_data_2['log_price_lag'])
for i in range(len(break_dates)):
    filtered_data_2['RV_t_predicted'] += breaks[i] * filtered_data_2[f'break_{i+1}']

# 예측값 출력
print(filtered_data_2[['Date', 'RV_t_predicted']])

alpha: 0.7429945928817034, beta: -0.42343255371575256, gamma: -0.21063467997909427, rho: 0.48638926801362753
break_1: -0.17881468194862532
break_2: 1.034813871750256
break_3: -0.6714483428709339
break_4: -0.12820270628931793
            Date  RV_t_predicted
21    2017-12-22        0.564645
22    2017-12-26        0.555644
23    2017-12-27        0.605759
24    2017-12-28        0.605135
25    2017-12-29        0.611030
...          ...             ...
1034  2021-12-22        0.517994
1035  2021-12-23        0.516851
1036  2021-12-27        0.507901
1037  2021-12-28        0.507432
1038  2021-12-29        0.508892

[1018 rows x 2 columns]


A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  filtered_data_2['RV_t_predicted'] = alpha + beta * filtered_data_2['log_price_lag'] + \
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  filtered_data_2['RV_t_predicted'] += breaks[i] * filtered_data_2[f'break_{i+1}']


In [14]:
filtered_data_2['RV_t_predicted'] = alpha + beta * filtered_data_2['log_price_lag'] + \
                                  gamma * (filtered_data_2['PC1'] - rho * filtered_data_2['log_price_lag'])
for i in range(len(break_dates)):
    filtered_data_2['RV_t_predicted'] += breaks[i] * filtered_data_2[f'break_{i+1}']

# OLS 회귀 모델 설정
X_ols = filtered_data_2[['log_price_lag', 'btc_diff']]
for i in range(len(break_dates)):
    X_ols[f'break_{i+1}'] = filtered_data_2[f'break_{i+1}']
X_ols['const'] = 1  # 상수항 추가
y_ols = filtered_data_2['realized_volatility']

# OLS 회귀 분석 수행
ols_model_2 = sm.OLS(y_ols, X_ols).fit()

# 회귀 결과 요약
print(ols_model_2.summary())

                             OLS Regression Results                            
Dep. Variable:     realized_volatility   R-squared:                       0.342
Model:                             OLS   Adj. R-squared:                  0.338
Method:                  Least Squares   F-statistic:                     87.60
Date:                 Wed, 05 Jun 2024   Prob (F-statistic):           1.87e-88
Time:                         14:04:36   Log-Likelihood:                -709.73
No. Observations:                 1018   AIC:                             1433.
Df Residuals:                     1011   BIC:                             1468.
Df Model:                            6                                         
Covariance Type:             nonrobust                                         
                    coef    std err          t      P>|t|      [0.025      0.975]
---------------------------------------------------------------------------------
log_price_lag    -0.5433      0.156 

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  filtered_data_2['RV_t_predicted'] = alpha + beta * filtered_data_2['log_price_lag'] + \
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  filtered_data_2['RV_t_predicted'] += breaks[i] * filtered_data_2[f'break_{i+1}']
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  X_ols[f'break_{i+1}'] = filtered_dat

In [15]:
import numpy as np
from scipy.stats import t

def predict(params, X):
    alpha, beta, gamma, rho = params[0], params[1], params[2], params[3]
    breaks = params[4:]
    y_pred = alpha + beta * X['log_price_lag'] + gamma * (X['log_price'] - rho * X['log_price_lag'])
    for i in range(len(break_dates)):
        y_pred += breaks[i] * X[f'break_{i+1}']
    return y_pred

# 함수 정의: RMSE 계산
def calculate_relative_rmse(actual, predicted):
    mse = np.mean((actual - predicted) ** 2)
    variance = np.sum(predicted ** 2)
    relative_mse = mse / variance
    return np.sqrt(relative_mse)

# 함수 정의: Clark and West 테스트
def clark_west_test(actual, pred1, pred2):
    f = (actual - pred1) ** 2 - (actual - pred2) ** 2 + (pred1 - pred2) ** 2
    mean_f = np.mean(f)
    std_f = np.std(f, ddof=1)
    t_statistic = mean_f / (std_f / np.sqrt(len(f)))
    
    df = len(f) - 1
    p_value = 1 - t.cdf(t_statistic, df=df)
    
    return t_statistic, p_value

# Out-of-sample 예측 수행 함수
def out_of_sample_evaluation(h, params):
    test_data = filtered_data.iloc[-h:]

    # 예측 수행 (test_data 사용)
    test_data['RV_t_predicted'] = predict(params, test_data)
    
    # Relative RMSE 계산
    rmse_model = calculate_relative_rmse(test_data['realized_volatility'], test_data['RV_t_predicted'])
    rmse_benchmark = calculate_relative_rmse(test_data['realized_volatility'], test_data['realized_volatility'].mean())
    
    # Clark and West 테스트 수행
    cw_stat, p_value = clark_west_test(test_data['realized_volatility'], test_data['RV_t_predicted'], test_data['realized_volatility'].mean())
    
    return rmse_model, rmse_benchmark, cw_stat, p_value

# 모델 최적화 (전체 데이터를 사용하여 최적화 수행)
result_2 = minimize(nls_model, initial_params, args=(filtered_data_2['realized_volatility'], filtered_data_2), method='L-BFGS-B')
params = result_2.x

# h 값에 따른 평가 수행
h_values = [30, 60, 120]
for h in h_values:
    rmse_model, rmse_benchmark, cw_stat, p_value = out_of_sample_evaluation(h, params)
    print(f"Out-of-sample evaluation for h={h}:")
    print(f"Relative RMSE: {rmse_model}")
    print(f"Relative RMSE benchmark: {rmse_benchmark}")
    print(f"Clark and West test statistic: {cw_stat}, p-value: {p_value}")
    print(f"------------------------------------------------------------")

Out-of-sample evaluation for h=30:
Relative RMSE: 0.20643995412388016
Relative RMSE benchmark: 0.32888396151715266
Clark and West test statistic: 220.46997853575763, p-value: 0.0
------------------------------------------------------------
Out-of-sample evaluation for h=60:
Relative RMSE: 0.14462469628708496
Relative RMSE benchmark: 0.300420926985187
Clark and West test statistic: 354.97298275229457, p-value: 0.0
------------------------------------------------------------
Out-of-sample evaluation for h=120:
Relative RMSE: 0.10103208554129804
Relative RMSE benchmark: 0.30266391763567063
Clark and West test statistic: 227.35661197070397, p-value: 0.0
------------------------------------------------------------


A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  test_data['RV_t_predicted'] = predict(params, test_data)


In [17]:
import numpy as np
import pandas as pd
import statsmodels.api as sm
import matplotlib.pyplot as plt

filtered_data_2['RV_t_predicted'] = alpha + beta * filtered_data_2['log_price_lag'] + \
                                  gamma * (filtered_data_2['PC1'] - rho * filtered_data_2['log_price_lag'])
for i in range(len(break_dates)):
    filtered_data_2['RV_t_predicted'] += breaks[i] * filtered_data_2[f'break_{i+1}']

gamma_risk_aversion = 3  
theta_leverage = 6  
rf = 0.01  

filtered_data_2['sigma_sq'] = filtered_data_2['realized_volatility']**2
filtered_data_2['wt'] = (1 / gamma_risk_aversion) * ((theta_leverage * filtered_data_2['RV_t_predicted'] + (theta_leverage - 1) * rf) / (theta_leverage**2 * filtered_data_2['sigma_sq']))

filtered_data_2['Rp'] = filtered_data_2['wt'] * theta_leverage * (filtered_data_2['RV_t_predicted'] - rf) + (1 - filtered_data_2['wt']) * rf
filtered_data_2['Var_Rp'] = filtered_data_2['wt']**2 * theta_leverage**2 * filtered_data_2['sigma_sq']
filtered_data_2['CER'] = filtered_data_2['Rp'] - 0.5 * (1 / gamma_risk_aversion) * filtered_data_2['Var_Rp']

print(filtered_data_2[['Date', 'wt', 'Rp', 'Var_Rp', 'CER']])

print(filtered_data_2['Rp']/filtered_data_2['Var_Rp'])


            Date        wt        Rp    Var_Rp       CER
21    2017-12-22  0.337682  1.130384  0.386969  1.065889
22    2017-12-26  0.331514  1.092018  0.373933  1.029695
23    2017-12-27  0.484153  1.735789  0.594629  1.636684
24    2017-12-28  0.478154  1.712617  0.586665  1.614840
25    2017-12-29  0.561203  2.028187  0.695177  1.912324
...          ...       ...       ...       ...       ...
1034  2021-12-22  0.034912  0.116060  0.036750  0.109935
1035  2021-12-23  0.034491  0.114545  0.036228  0.108507
1036  2021-12-27  0.037718  0.122301  0.038942  0.115810
1037  2021-12-28  0.039947  0.128825  0.041206  0.121957
1038  2021-12-29  0.045943  0.147064  0.047526  0.139143

[1018 rows x 5 columns]
21      2.921126
22      2.920355
23      2.919112
24      2.919241
25      2.917511
          ...   
1034    3.158113
1035    3.161784
1036    3.140564
1037    3.126350
1038    3.094408
Length: 1018, dtype: float64


A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  filtered_data_2['RV_t_predicted'] = alpha + beta * filtered_data_2['log_price_lag'] + \
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  filtered_data_2['RV_t_predicted'] += breaks[i] * filtered_data_2[f'break_{i+1}']
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  filtered_data_2['sigma_sq'] = filter