ETF 대신 전부 자산군 데이터 활용하는 걸로 수정

- yield를 price로 수정

In [1]:
import os
import json
import numpy as np
import pandas as pd
from sklearn.cluster import KMeans
from sklearn.metrics import silhouette_score
from tools.portfolio import black_litterman, vectorize_corr
from tqdm import tqdm
from mvgarch.ugarch import UGARCH
from mvgarch.mgarch import DCCGARCH
from arch import arch_model

import warnings
warnings.filterwarnings('ignore')

Process

- In the Time t
    - Feature 만들고, 해당 Feature들의 correlation matrix 계산

    - 과거 correlation matrix 기반으로, 미래 1-day의 corr matrix prediction (Using DCC-GARCH)

    - prediction한 corr matrix를 포함해서, corr matrix를 clustering

        - 현재 포지션이 있을 경우

            - 예측된 Cluster가 과거와 동일한 경우: Holding

                - 만약 진입 후 한 달이 지났다면 새롭게 리밸런싱

            - 예측된 Cluster가 달라진 경우: 포지션 change

    - 예측한 cluster에 해당하는 과거 cluster에서 자산군들의 mean-return vector, covariance matrix 추정

    - 추정한 것들을 Black-Litterman에 넣고 optimize --> weight vector 생성

- weight dataframe을 가지고, Back-testing

In [2]:
# Backtest configuration read by the cells below.
start_date, end_date = "2007-01-03", "2025-04-16"  # bounds used to slice the dates that are iterated
window_size = 60  # window length passed to the rolling corr / cov computations
range_n_clusters = [k for k in range(2, 10)]  # candidate cluster counts tried by KMeans: 2..9

In [3]:
# Load the input frame and derive every return / covariance series used later.
# NOTE(review): the notebook header says this file holds asset-class price levels
# (one column per asset) - confirm against the data source.
HOLDING_PERIOD = 20       # forward horizon, in rows, of the holding-period statistics
SAMPLE_START = "1997-05"  # first month kept in the aligned series

# NOTE: read_pickle executes arbitrary code on load; only use with trusted files.
df = pd.read_pickle("data/asset_cc2.pkl")
log_price = np.log(df)
n_assets = df.shape[1]

# Daily log returns in percent, computed on the full history so the rolling
# windows below are already warmed up at SAMPLE_START.
rtn = log_price.diff().dropna() * 100

# Forward HOLDING_PERIOD-row log return (percent), stamped on the entry date.
holding_rtn = (
    (log_price - log_price.shift(HOLDING_PERIOD)).shift(-HOLDING_PERIOD).dropna()[SAMPLE_START:] * 100
)

# rolling().cov() yields n_assets rows per date, so shifting by
# HOLDING_PERIOD * n_assets rows moves the matrices back by HOLDING_PERIOD dates.
# Deriving n_assets from the data keeps this aligned if the asset universe changes.
rolling_cov_full = rtn.rolling(window=window_size).cov()
holding_cov = rolling_cov_full.shift(-HOLDING_PERIOD * n_assets).dropna()
rolling_corr_matrix = rtn.rolling(window=window_size).corr().dropna()[SAMPLE_START:]
rolling_cov_matrix = rolling_cov_full.dropna()[SAMPLE_START:]

# Align the return index with the other series.
rtn = rtn[SAMPLE_START:]

In [10]:
# Display the daily log-return frame (log differences scaled by 100, sliced from 1997-05).
rtn

Unnamed: 0,brent,dxy,gold,silver,snp,t10
1997-05-01,-0.054810,-0.381463,0.146434,1.585489,-0.350044,0.281149
1997-05-02,-3.005235,0.237297,-0.029270,-0.842465,1.799614,0.093734
1997-05-05,0.112931,0.123584,0.787291,0.842465,2.093553,0.093743
1997-05-06,0.562748,-0.671247,-0.816569,-0.736768,-0.289506,0.000000
1997-05-07,0.112171,0.000000,0.263197,0.526816,-1.484754,-0.843368
...,...,...,...,...,...,...
2025-05-09,3.226086,-0.298537,0.238533,0.865780,-0.071177,0.000000
2025-05-12,1.083434,1.434745,-2.668052,-0.447305,3.204001,-0.766210
2025-05-13,2.538323,-0.779135,0.516778,1.154000,0.722208,-0.382885
2025-05-14,-1.086475,0.039596,-2.099024,-2.246941,0.102384,-0.382738


In [4]:
# Dates of the return frame between start_date and end_date (label-based slice);
# the main loop iterates over these.
rtn_in_range = rtn.loc[start_date:end_date]
days_lst = rtn_in_range.index

In [5]:
# Empty result containers filled by the main loop (the dicts are keyed by date).
best_k_dict = dict()                           # silhouette-optimal number of clusters
predicted_corr_matrix_dict = dict()            # forecast correlation matrix
original_black_litterman_weight_dict = dict()  # [mu_bl, w_bl] as returned by black_litterman
result_weight_dict = dict()                    # clipped and re-normalised weights
estimated_regime_lst = list()                  # one {date: {date_str: label}} dict per iteration

In [None]:
# Walk-forward loop: for every date in days_lst
#   1. fit per-asset GARCH(1,1) models and a DCC-GARCH model on the returns up to that date,
#   2. forecast the one-step-ahead correlation / covariance matrix,
#   3. cluster all historical rolling correlation matrices plus the forecast with KMeans
#      (K chosen by silhouette score),
#   4. build Black-Litterman views from past dates that share the forecast's cluster,
#   5. store the raw and the long-only, re-normalised weights.
i = 0  # run id embedded in every output file name; the loop never changes it

HOLDING_DAYS = 20      # forward horizon (rows) used when holding_rtn / holding_cov were built
KMEANS_SEED = 42       # fixed seed so cluster labels are reproducible across runs
CHECKPOINT_EVERY = 20  # iterations between JSON checkpoints

os.makedirs("results/res2", exist_ok=True)


def _save_checkpoint():
    """Write every result container collected so far to the results/res2 JSON files."""
    serializable_pred_corr = {str(k): v.tolist() for k, v in predicted_corr_matrix_dict.items()}
    serializable_bl_orig = {
        str(k): {"mu_bl": mu.tolist(), "w_bl": w.tolist()}
        for k, (mu, w) in original_black_litterman_weight_dict.items()
    }
    serializable_result_w = {str(k): v.tolist() for k, v in result_weight_dict.items()}
    serializable_best_k = {str(k): v for k, v in best_k_dict.items()}
    serializable_estimated_regime_lst = [
        {str(key): value for key, value in inner_dict.items()}
        for inner_dict in estimated_regime_lst
    ]

    payload_by_path = {
        f"results/res2/predicted_corr_matrix{i}.json": serializable_pred_corr,
        f"results/res2/original_black_litterman{i}.json": serializable_bl_orig,
        f"results/res2/result_weights{i}.json": serializable_result_w,
        f"results/res2/best_k{i}.json": serializable_best_k,
        f"results/res2/estimated_regime{i}.json": serializable_estimated_regime_lst,
    }
    for path, payload in payload_by_path.items():
        with open(path, "w", encoding="utf-8") as f:
            json.dump(payload, f, ensure_ascii=False, indent=4)


# ---- loop-invariant work, hoisted out of the loop ---------------------------
asset_cols = list(rtn.columns)

# Market-cap prior, in the column order ['brent', 'dxy', 'gold', 'silver', 'snp', 't10'].
w_mkt = np.array([0.05, 0.05, 0.05, 0.05, 0.6, 0.2])

# Dates for which a forward-shifted covariance matrix exists.
holding_cov_level0 = holding_cov.index.get_level_values(0)
holding_cov_dates = holding_cov_level0.unique()

# Vectorise each historical rolling correlation matrix once instead of once per iteration.
# Only the dates the loop can reach are touched, exactly as the per-iteration version did.
history_dates = rtn.loc[:days_lst[-1]].index if len(days_lst) else rtn.index[:0]
historical_corr_vectors = [
    vectorize_corr(rolling_corr_matrix.loc[date].values) for date in history_dates
]

try:
    for step, today in enumerate(tqdm(days_lst[:]), start=1):
        historical_rtn = rtn.loc[:today]
        index_dates_lst = historical_rtn.index
        n_hist = len(index_dates_lst)

        # Univariate GARCH(1,1) per asset, required as input by the DCC-GARCH model.
        garch_specs = []
        for col in historical_rtn.columns:
            ug = UGARCH(order=(1, 1))
            ug.spec(returns=historical_rtn[col])
            ug.fit()
            garch_specs.append(ug)

        # DCC-GARCH: fit, then forecast one step ahead (forecast() returns None;
        # the results are read from the fc_cor / fc_cov attributes).
        dcc = DCCGARCH()
        dcc.spec(ugarch_objs=garch_specs, returns=historical_rtn)
        dcc.fit()
        dcc.forecast(n_ahead=1)

        predicted_corr_matrix = dcc.fc_cor[:, :, 0]
        predicted_cov_matrix = dcc.fc_cov[:, :, 0]
        predicted_corr_matrix_dict[today] = predicted_corr_matrix

        # Stack the historical correlation vectors with the forecast as the last row.
        stacked_corr_matrix = np.vstack(
            historical_corr_vectors[:n_hist] + [vectorize_corr(predicted_corr_matrix)]
        )

        # KMeans over all candidate K; keep the labelling with the best silhouette score.
        silhouette_avg_scores = []
        best_score = -1
        best_k = None
        best_labels = None
        best_model = None

        for n_clusters in range_n_clusters:
            kmeans = KMeans(n_clusters=n_clusters, random_state=KMEANS_SEED)
            labels = kmeans.fit_predict(stacked_corr_matrix)

            score = silhouette_score(stacked_corr_matrix, labels)
            silhouette_avg_scores.append(score)

            if score > best_score:
                best_score = score
                best_k = n_clusters
                best_labels = labels
                best_model = kmeans

        best_k_dict[today] = best_k

        # The forecast row is stamped with the next date in rtn; when `today` is the
        # last available date, fall back to the next business day instead of failing.
        later_dates = rtn.index[n_hist:]
        next_day = later_dates[0] if len(later_dates) else today + pd.offsets.BDay(1)

        labels_series = pd.Series(
            best_labels,
            index=index_dates_lst.tolist() + [next_day],
        )

        # Keep a JSON-friendly copy of the full labelling for this date.
        labels_series_tmp = labels_series.copy()
        labels_series_tmp.index = labels_series_tmp.index.astype(str)
        estimated_regime_lst.append({today: labels_series_tmp.to_dict()})

        # Explicit positional access: the last row is the forecast.
        predicted_state = labels_series.iloc[-1]
        same_regime_date_lst = labels_series[labels_series == predicted_state].index

        # ---- Black-Litterman views from past dates in the predicted regime ----
        # NOTE(review): the original cell used past_holding_rtn_vector and
        # past_holding_cov_matrix without defining them (NameError on a fresh kernel).
        # The reconstruction below follows the "Process" notes (mean return vector and
        # covariance matrix from the matching past cluster) - confirm it matches the
        # intended estimator.
        # Only dates whose HOLDING_DAYS-row forward window has ended by `today` are
        # used, so no information from after `today` enters the views.
        known_dates = index_dates_lst[: max(n_hist - HOLDING_DAYS, 0)]
        known_dates = known_dates.intersection(holding_rtn.index).intersection(holding_cov_dates)
        usable_dates = same_regime_date_lst.intersection(known_dates)
        if len(usable_dates) == 0:
            # No matured observation in this regime yet: use the unconditional history.
            usable_dates = known_dates
        if len(usable_dates) == 0:
            raise ValueError(f"No matured holding-period observations available on {today}")

        past_holding_rtn_vector = holding_rtn.loc[usable_dates, asset_cols].mean().values
        past_holding_cov_matrix = (
            holding_cov.loc[holding_cov_level0.isin(usable_dates)]
            .groupby(level=1)
            .mean()
            .reindex(index=asset_cols, columns=asset_cols)
            .values
        )

        mu_bl, w_bl = black_litterman(
            sigma=rolling_cov_matrix.loc[today].values,
            w_mkt=w_mkt,
            p=np.identity(len(w_mkt)),
            q=past_holding_rtn_vector,
            omega=past_holding_cov_matrix,
            tau=0.5,
        )

        original_black_litterman_weight_dict[today] = [mu_bl, w_bl]

        # Long-only projection: drop negative weights and re-normalise to sum to 1.
        # If nothing positive remains, fall back to the market weights rather than
        # dividing by zero.
        long_only = np.where(w_bl >= 0, w_bl, 0)
        total_weight = np.sum(long_only)
        if total_weight > 0:
            w_bl = long_only / total_weight
        else:
            w_bl = w_mkt.reshape(np.shape(long_only)).copy()

        result_weight_dict[today] = w_bl

        # Periodic checkpoint; rewriting the cumulative results on every iteration
        # makes the I/O cost grow with each step.
        if step % CHECKPOINT_EVERY == 0:
            _save_checkpoint()
finally:
    # Always persist what has been computed, including after an interrupt or error.
    _save_checkpoint()

  0%|          | 13/4568 [04:50<28:13:56, 22.31s/it]


KeyboardInterrupt: 

SyntaxError: leading zeros in decimal integer literals are not permitted; use an 0o prefix for octal integers (2531806412.py, line 2)