In [1]:
import json
import os
import numpy as np
import pandas as pd
from tqdm import tqdm
from sklearn.cluster import KMeans
from sklearn.metrics import silhouette_score

from tools.portfolio import black_litterman, vectorize_corr

In [2]:
# Load the asset frame and derive the return / covariance / correlation inputs
# used by the regime-clustering backtest.
# NOTE(review): pickle files can execute arbitrary code when loaded; only read
# this file if it comes from a trusted source.
df = pd.read_pickle("data/asset_cc2.pkl")

window_size = 60   # rolling window length, in rows of `rtn`
holding_days = 20  # forward holding horizon, in rows
# The pairwise rolling covariance stacks one row per column of `df` for every
# date, so shifting it by whole dates means shifting by a multiple of n_assets.
n_assets = df.shape[1]

log_price = np.log(df)  # computed once and reused below

# Daily log differences, scaled by 100.
rtn = log_price.diff().dropna() * 100

# Forward-looking holding-period log difference: the value stored at row t is
# log(df[t + holding_days]) - log(df[t]), scaled by 100.
holding_rtn = (
    (log_price - log_price.shift(holding_days))
    .shift(-holding_days)
    .dropna()["1997-05":]
    * 100
)

# Rolling covariance of daily returns, computed once and reused twice.
rolling_cov_full = rtn.rolling(window=window_size).cov()

# Covariance block shifted `holding_days` dates forward (the original hard-coded
# 120 rows, i.e. 20 dates x 6 assets); like holding_rtn, the value stored at
# date t comes from later rows.
holding_cov = rolling_cov_full.shift(-holding_days * n_assets).dropna()
rolling_corr_matrix = rtn.rolling(window=window_size).corr().dropna()["1997-05":]
rolling_cov_matrix = rolling_cov_full.dropna()["1997-05":]

# Rebalancing dates: rows in the given date range, skipping the first 40.
# NOTE(review): the reason for the 40-row offset is not visible here — confirm.
days_lst = rtn.loc["2007-01-03":"2025-04-16"].index[40:]
range_n_clusters = list(range(2, 5))  # candidate cluster counts: 2, 3, 4

# Trim the daily returns to the same start as the rolling frames above.
rtn = rtn["1997-05":]

In [3]:
# Map every date in `rtn.index` to the vectorised form of that date's rolling
# correlation block. `vectorize_corr` is a project helper (tools.portfolio);
# presumably it flattens the matrix into a 1-D feature vector — confirm.
corr_vector_dict = dict(
    zip(
        rtn.index,
        (vectorize_corr(rolling_corr_matrix.loc[ts].values) for ts in rtn.index),
    )
)

In [4]:
# Market (prior) weights, one entry per asset.
# Asset labels follow the column listing from the original comment:
# ['brent', 'dxy', 'gold', 'silver', 'snp', 't10'] — presumably the column
# order of `df`; verify before changing any value.
w_mkt = np.array(
    [
        0.05,  # brent
        0.05,  # dxy
        0.05,  # gold
        0.05,  # silver
        0.6,   # snp
        0.2,   # t10
    ]
)

In [5]:
# Speed-optimisation ideas for this cell:
# 1. Instead of rebuilding stacked_corr_matrix with np.vstack on every iteration,
#    build one numpy array up front and only slice it.
# 2. Lower the KMeans n_init parameter to 1 to reduce the number of restarts
#    (keep the default if the result is sensitive to accuracy).
# 3. Compute silhouette_score inside the for loop, together with the best_score update.
# 4. Convert holding_rtn, holding_cov, rolling_cov_matrix, etc. to numpy arrays
#    ahead of time and use an index mapping instead of .loc slicing.
# 5. Minimise unnecessary list / dict conversions.
# 6. Use tqdm's `disable` parameter to suppress output when it is not needed.
# 7. Make the exception handling explicit with `except Exception as e`.

# Idea 1: stack the per-date correlation vectors into a single 2-D array,
# one row per date, in the order of rtn.index.
corr_dates = rtn.index.tolist()
corr_matrix_np = np.stack([corr_vector_dict[ts] for ts in corr_dates], axis=0)

# Date -> row position in corr_matrix_np.
date_to_idx = dict(zip(corr_dates, range(len(corr_dates))))

# --- Settings for the regime-clustering backtest -----------------------------
OUTPUT_DIR = "results/withoutPrediction"
CHECKPOINT_EVERY = 1000  # write intermediate results every N successfully processed days
KMEANS_SEED = 42         # fixed seed so the cluster assignments are reproducible


def _save_backtest_results(weights_by_day, regime_history, best_k_by_day, out_dir=OUTPUT_DIR):
    """Serialise the three result containers to JSON files under ``out_dir``.

    Parameters
    ----------
    weights_by_day : dict
        Maps a date to a numpy array of portfolio weights.
    regime_history : list of dict
        One dict per processed day, mapping each historical date to its cluster label.
    best_k_by_day : dict
        Maps a date to the selected number of clusters.
    out_dir : str
        Target directory; created if it does not exist.

    Returns
    -------
    tuple
        The three JSON-serialisable payloads (weights, regimes, best k), with
        every date key converted via ``str``.
    """
    os.makedirs(out_dir, exist_ok=True)

    serializable_weights = {str(day): w.tolist() for day, w in weights_by_day.items()}
    serializable_regimes = [
        {str(day): label for day, label in regimes.items()}
        for regimes in regime_history
    ]
    serializable_best_k = {str(day): k for day, k in best_k_by_day.items()}

    payload_by_file = {
        "result_weightsfull.json": serializable_weights,
        "estimated_regimefull.json": serializable_regimes,
        "best_k_dict.json": serializable_best_k,
    }
    for filename, payload in payload_by_file.items():
        with open(f"{out_dir}/{filename}", "w", encoding="utf-8") as f:
            json.dump(payload, f, ensure_ascii=False, indent=4)

    return serializable_weights, serializable_regimes, serializable_best_k


# Create the output directory before the long loop so a bad path fails
# immediately instead of after the whole computation.
os.makedirs(OUTPUT_DIR, exist_ok=True)

result_weight_dict = {}
estimated_regime_lst = []
best_k_dict = {}
failed_days = {}  # date -> repr of the exception that made the day fail

n_completed = 0  # number of days processed without an exception
for today in tqdm(days_lst):
    try:
        # History available up to and including `today`.
        hist_idx = date_to_idx[today]
        hist_dates = corr_dates[:hist_idx + 1]
        stacked_corr_matrix = corr_matrix_np[:hist_idx + 1]

        # KMeans clustering: keep the cluster count with the highest silhouette score.
        best_score = -np.inf
        best_k = None
        best_labels = None
        best_model = None

        for n_clusters in range_n_clusters:
            kmeans = KMeans(n_clusters=n_clusters, random_state=KMEANS_SEED)
            labels = kmeans.fit_predict(stacked_corr_matrix)

            # Silhouette score for this labelling.
            score = silhouette_score(stacked_corr_matrix, labels)
            if score > best_score:
                best_score = score
                best_k = n_clusters
                best_labels = labels
                best_model = kmeans

        best_k_dict[today] = best_k
        labels_series = pd.Series(best_labels, index=hist_dates)
        estimated_regime_lst.append(labels_series.to_dict())
        current_state = labels_series.iloc[-1]
        same_regime_date_lst = labels_series[labels_series == current_state].index

        # Black-Litterman views: averages over the dates sharing today's regime.
        # holding_rtn / holding_cov / rolling_cov_matrix are pandas objects, so .loc is used.
        # NOTE(review): holding_rtn and holding_cov are shifted forward in the
        # data-prep cell, and same_regime_date_lst always contains `today`, so
        # these averages include rows that depend on data after `today` —
        # confirm this look-ahead is intended.
        past_holding_rtn_vector = holding_rtn.loc[same_regime_date_lst].mean().values
        past_holding_cov_matrix = (
            holding_cov.loc[same_regime_date_lst].groupby(level=1).mean().values
        )

        mu_bl, w_bl = black_litterman(
            sigma=rolling_cov_matrix.loc[today].values,
            w_mkt=w_mkt,
            p=np.identity(len(w_mkt)),
            q=past_holding_rtn_vector,
            omega=past_holding_cov_matrix,
            tau=0.15,
        )

        # Long-only: clip negative weights to zero, then renormalise to sum to 1.
        w_bl = np.where(w_bl >= 0, w_bl, 0)
        weight_sum = np.sum(w_bl)
        if not (np.isfinite(weight_sum) and weight_sum > 0):
            # Dividing by this sum would store NaN weights; report the day instead.
            raise ValueError("no positive Black-Litterman weights to normalize")
        w_bl = w_bl / weight_sum
        result_weight_dict[today] = w_bl

        # Checkpoint on the 1st, 1001st, 2001st, ... successfully processed day.
        if n_completed % CHECKPOINT_EVERY == 0:
            _save_backtest_results(result_weight_dict, estimated_regime_lst, best_k_dict)
    except Exception as e:
        # Best effort: log the failing day and move on to the next one.
        print(f"error in {today}: {e}")
        failed_days[today] = repr(e)
        continue
    n_completed += 1

# Final save
(
    serializable_result_w,
    serializable_estimated_regime_lst,
    serializable_best_k_dict,
) = _save_backtest_results(result_weight_dict, estimated_regime_lst, best_k_dict)

100%|██████████| 4528/4528 [1:35:09<00:00,  1.26s/it]  
