# Ridge Reg.

In [1]:
import numpy as np
import pandas as pd

from sklearn.linear_model import Ridge
from sklearn.preprocessing import StandardScaler
from sklearn.pipeline import Pipeline
from sklearn.metrics import r2_score, mean_squared_error

import warnings
warnings.filterwarnings("ignore")

In [2]:
# =========================
# 실험 설정
# =========================

TRAIN_END_DATE = "2020-12-31"
RIDGE_ALPHA = 10.0

EMBEDDING_PATH = "data/kospidaq_embeddings_KR_FinBERT.xlsx"
RETURN_PATH    = "data/report_return_mapping.xlsx"

RETURN_COLS = [f"log_return_{i}" for i in range(11)]

In [3]:
# =========================
# 데이터 로드
# =========================

df_embed = pd.read_excel(EMBEDDING_PATH)
df_ret = pd.read_excel(RETURN_PATH)

print(df_embed.shape)
print(df_ret.shape)

(62450, 770)
(62450, 14)


In [28]:
# =========================
# 컬럼 선택
# =========================

embedding_cols = [c for c in df_embed.columns if c.startswith("embedding_")]

df = pd.concat(
    [
        df_embed[["date", "ticker"] + embedding_cols],
        df_ret[["ticker_code"] + RETURN_COLS],
    ],
    axis=1
)

df["date"] = pd.to_datetime(df["date"])
df.head()

Unnamed: 0,date,ticker,embedding_1,embedding_2,embedding_3,embedding_4,embedding_5,embedding_6,embedding_7,embedding_8,...,log_return_1,log_return_2,log_return_3,log_return_4,log_return_5,log_return_6,log_return_7,log_return_8,log_return_9,log_return_10
0,2025-12-30,제이투케이바이오,0.110109,0.213751,-0.42421,0.861657,-0.214212,-0.246262,0.626879,0.500181,...,0.049897,0.026393,0.009259,-0.027108,-0.037919,-0.040338,,,,
1,2025-12-30,대한항공,-0.346401,-0.031269,-0.079151,1.11824,0.426655,-0.103879,0.771418,0.755649,...,-0.015643,0.002215,-0.00222,-0.020157,-0.04073,-0.020157,,,,
2,2025-12-30,LG이노텍,0.00079,0.258992,-0.582935,1.163655,0.046262,-0.039124,0.673256,0.462595,...,-0.012999,0.007353,-0.007407,0.014652,0.043328,-0.035685,,,,
3,2025-12-30,삼양식품,0.125795,0.064767,-0.031858,0.96235,0.412019,0.03687,0.608777,0.956877,...,0.035903,0.034335,0.024871,-0.061132,-0.04316,-0.020518,,,,
4,2025-12-30,KT,0.220859,0.229314,-0.851023,0.338585,-0.164694,-0.113018,0.677291,0.34219,...,-0.025025,-0.019194,0.003795,-0.00381,-0.021134,-0.011472,,,,


In [29]:
# =========================
# (date, ticker) 기준 aggregation
# =========================

group_cols = ["date", "ticker_code"]

agg_dict = {c: "mean" for c in embedding_cols}
for c in RETURN_COLS:
    agg_dict[c] = "mean"

df_agg = (
    df
    .groupby(group_cols, as_index=False)
    .agg(agg_dict)
)

df_agg.head()


Unnamed: 0,date,ticker_code,embedding_1,embedding_2,embedding_3,embedding_4,embedding_5,embedding_6,embedding_7,embedding_8,...,log_return_1,log_return_2,log_return_3,log_return_4,log_return_5,log_return_6,log_return_7,log_return_8,log_return_9,log_return_10
0,2013-08-16,1040,0.150668,0.293588,0.174383,0.794043,-0.252831,0.999297,0.65633,1.204585,...,,,,,,,,,,
1,2013-08-16,3550,0.225724,0.500187,-0.165939,0.428608,-0.013055,0.300799,0.537054,0.922616,...,,,,,,,,,,
2,2013-08-16,4170,-0.480561,0.852025,-0.089869,1.137384,0.301496,0.51102,0.587942,1.060067,...,,,,,,,,,,
3,2013-08-16,4370,0.036889,0.559823,-0.007186,0.865696,0.127294,0.388543,0.498597,0.776847,...,,,,,,,,,,
4,2013-08-16,5180,-0.395822,0.556432,-0.179259,1.056218,0.058105,0.780324,0.507971,0.816543,...,,,,,,,,,,


In [30]:
def run_ridge_full_eval(
    df,
    embedding_cols,
    target_return,
    train_end_date,
    alpha=10.0
):
    """
    Ridge regression with time-based holdout.
    Returns train/test R2 and MSE.
    """

    df_tmp = df.dropna(subset=[target_return]).copy()

    train_mask = df_tmp["date"] <= train_end_date
    test_mask  = df_tmp["date"] > train_end_date

    X_train = df_tmp.loc[train_mask, embedding_cols].values
    y_train = df_tmp.loc[train_mask, target_return].values

    X_test  = df_tmp.loc[test_mask, embedding_cols].values
    y_test  = df_tmp.loc[test_mask, target_return].values

    model = Pipeline([
        ("scaler", StandardScaler()),
        ("ridge", Ridge(alpha=alpha))
    ])

    model.fit(X_train, y_train)

    y_pred_train = model.predict(X_train)
    y_pred_test  = model.predict(X_test)

    return {
        "target": target_return,
        "n_train": len(y_train),
        "n_test": len(y_test),
        "train_r2": r2_score(y_train, y_pred_train),
        "test_r2": r2_score(y_test, y_pred_test),
        "train_mse": mean_squared_error(y_train, y_pred_train),
        "test_mse": mean_squared_error(y_test, y_pred_test),
    }


In [31]:
results = []

for ret in RETURN_COLS:
    out = run_ridge_full_eval(
        df=df_agg,
        embedding_cols=embedding_cols,
        target_return=ret,
        train_end_date=TRAIN_END_DATE,
        alpha=RIDGE_ALPHA
    )
    results.append(out)

df_eval = (
    pd.DataFrame(results)
    .sort_values("test_r2", ascending=False)
    .reset_index(drop=True)
)

df_eval[
    ["target", "train_r2", "test_r2", "train_mse", "test_mse", "n_train", "n_test"]
]


Unnamed: 0,target,train_r2,test_r2,train_mse,test_mse,n_train,n_test
0,log_return_0,0.100399,-0.005161,0.001158,0.001597,15482,21897
1,log_return_9,0.063119,-0.047872,0.005612,0.007703,15500,21865
2,log_return_8,0.061467,-0.047965,0.005134,0.006947,15500,21879
3,log_return_7,0.062733,-0.050184,0.004659,0.006153,15500,21889
4,log_return_6,0.061822,-0.052376,0.004183,0.005341,15500,21902
5,log_return_10,0.064174,-0.052706,0.006184,0.008442,15500,21852
6,log_return_5,0.064223,-0.055971,0.003535,0.004522,15500,21902
7,log_return_3,0.063193,-0.057891,0.002254,0.002839,15500,21902
8,log_return_4,0.063059,-0.060567,0.002984,0.003666,15500,21902
9,log_return_2,0.066544,-0.064394,0.001603,0.001959,15500,21902


# Implied Returns

In [32]:
BEST_RETURN = "log_return_0"

def run_ridge_and_predict(
    df,
    embedding_cols,
    target_return,
    train_end_date,
    alpha=10.0
):
    df_tmp = df.dropna(subset=[target_return]).copy()

    train_mask = df_tmp["date"] <= train_end_date
    test_mask  = df_tmp["date"] > train_end_date

    X_train = df_tmp.loc[train_mask, embedding_cols].values
    y_train = df_tmp.loc[train_mask, target_return].values
    X_test  = df_tmp.loc[test_mask, embedding_cols].values

    model = Pipeline([
        ("scaler", StandardScaler()),
        ("ridge", Ridge(alpha=alpha))
    ])

    model.fit(X_train, y_train)

    df_tmp.loc[train_mask, "pred_return"] = model.predict(X_train)
    df_tmp.loc[test_mask,  "pred_return"] = model.predict(X_test)

    return df_tmp, model

In [33]:
df_pred, ridge_model = run_ridge_and_predict(
    df=df_agg,
    embedding_cols=embedding_cols,
    target_return=BEST_RETURN,
    train_end_date=TRAIN_END_DATE,
    alpha=RIDGE_ALPHA
)

df_pred[["date", "ticker_code", BEST_RETURN, "pred_return"]].head()


Unnamed: 0,date,ticker_code,log_return_0,pred_return
571,2014-01-03,4370,0.031375,0.006866
572,2014-01-03,5930,-0.009981,-0.00527
573,2014-01-03,11070,0.005999,-0.004042
574,2014-01-03,15760,-0.008721,0.015348
575,2014-01-03,86280,0.002224,0.00866


In [35]:
df_view = (
    df_pred
    .groupby(["date", "ticker_code"], as_index=False)
    .agg({"pred_return": "mean"})
)

df_view.head()

Unnamed: 0,date,ticker_code,pred_return
0,2014-01-03,4370,0.006866
1,2014-01-03,5930,-0.00527
2,2014-01-03,11070,-0.004042
3,2014-01-03,15760,0.015348
4,2014-01-03,86280,0.00866


In [36]:
def make_bl_view_Q(df_view, view_date):
    tmp = (
        df_view[df_view["date"] == view_date]
        .sort_values("ticker_code")
        .copy()
    )
    Q = tmp["pred_return"].values
    tickers = tmp["ticker_code"].values
    return Q, tickers

In [39]:
VIEW_DATE = df_view["date"]
Q, tickers = make_bl_view_Q(df_view, VIEW_DATE)

Q.shape, tickers.shape

((37379,), (37379,))

In [41]:
from pathlib import Path
import numpy as np
import pandas as pd

ADJ_CLOSE_PATH = Path("data/adj_close_wide_2014_2026.xlsx")
MCAP_PATH      = Path("data/market_cap_2014_2025.xlsx")

px = pd.read_excel(ADJ_CLOSE_PATH, index_col=0)
mc = pd.read_excel(MCAP_PATH, index_col=0)

px.index = pd.to_datetime(px.index)
mc.index = pd.to_datetime(mc.index)

# ticker_code를 문자열로 통일(005930 같은)
px.columns = px.columns.astype(str).str.zfill(6)
mc.columns = mc.columns.astype(str).str.zfill(6)

px = px.sort_index()
mc = mc.sort_index()

(px.shape, mc.shape, px.index.min(), px.index.max(), mc.index.max())

((2951, 2761),
 (2945, 2761),
 Timestamp('2014-01-02 00:00:00'),
 Timestamp('2026-01-09 00:00:00'),
 Timestamp('2025-12-30 00:00:00'))

In [42]:
def compute_log_returns(price_wide: pd.DataFrame) -> pd.DataFrame:
    r = np.log(price_wide / price_wide.shift(1))
    return r


In [43]:
from sklearn.covariance import LedoitWolf

def compute_prior_at_date(
    view_date: pd.Timestamp,
    px_wide: pd.DataFrame,
    mc_wide: pd.DataFrame,
    window: int = 252,
    min_coverage: float = 0.90,
    risk_aversion: float = 2.5
):
    """
    Returns:
      - tickers_univ: prior에 포함되는 종목 리스트 (정렬된 순서)
      - Sigma: (N,N) 공분산 (Ledoit-Wolf)
      - Pi: (N,) 내재 기대수익률
      - w_mkt: (N,) 시총가중치
    """
    view_date = pd.to_datetime(view_date)

    # 시총 데이터 끝나는 날짜 체크
    if view_date not in mc_wide.index:
        raise ValueError(f"market cap에 {view_date.date()}가 없습니다. (mc max={mc_wide.index.max().date()})")

    # 수익률 계산
    ret = compute_log_returns(px_wide)

    # view_date 기준, 직전 window일 수익률 구간 뽑기
    if view_date not in ret.index:
        raise ValueError(f"가격/수익률 데이터에 {view_date.date()}가 없습니다.")

    end_loc = ret.index.get_loc(view_date)
    start_loc = end_loc - window + 1
    if start_loc < 0:
        raise ValueError("윈도우가 너무 깁니다. 과거 데이터가 부족합니다.")

    ret_win = ret.iloc[start_loc:end_loc+1]  # (window, #tickers)

    # 커버리지 조건 (유효 관측치 비율)
    min_obs = int(np.ceil(window * min_coverage))
    valid_obs = ret_win.notna().sum(axis=0)
    tickers_cov = valid_obs[valid_obs >= min_obs].index

    # 당일 시총 존재 종목
    mcap_t = mc_wide.loc[view_date]
    tickers_mcap = mcap_t.dropna().index

    # 최종 prior 유니버스
    tickers_univ = sorted(list(set(tickers_cov).intersection(set(tickers_mcap))))
    if len(tickers_univ) < 2:
        raise ValueError("prior 유니버스가 너무 작습니다. (커버리지/시총 결측 확인)")

    # 공분산: 수익률 윈도우에서 유니버스만 뽑고, 결측은 행 단위로 제거
    X = ret_win[tickers_univ]
    X = X.dropna(axis=0, how="any")  # LedoitWolf는 NaN 허용 안 함

    if len(X) < min_obs:
        # dropna로 너무 줄어들면(공통 거래일 부족) 기준을 더 느슨하게 하거나 다른 처리 필요
        raise ValueError(f"공통 관측치가 부족합니다: {len(X)}행만 남음 (min_obs={min_obs})")

    lw = LedoitWolf().fit(X.values)
    Sigma = lw.covariance_

    # 시총가중치
    mcap_vec = mc_wide.loc[view_date, tickers_univ].values.astype(float)
    w_mkt = mcap_vec / np.nansum(mcap_vec)

    # 내재 기대수익률
    Pi = risk_aversion * (Sigma @ w_mkt)

    return tickers_univ, Sigma, Pi, w_mkt


In [44]:
def align_view_to_prior(
    df_view: pd.DataFrame,      # columns: date, ticker_code, pred_return
    view_date: pd.Timestamp,
    tickers_univ: list
):
    tmp = df_view[df_view["date"] == pd.to_datetime(view_date)].copy()
    tmp["ticker_code"] = tmp["ticker_code"].astype(str).str.zfill(6)

    view_tickers = set(tmp["ticker_code"].unique())
    prior_tickers = set(tickers_univ)

    common = sorted(list(view_tickers.intersection(prior_tickers)))
    missing_in_prior = sorted(list(view_tickers - prior_tickers))
    missing_in_view  = sorted(list(prior_tickers - view_tickers))

    if len(common) < 2:
        raise ValueError("교집합 종목이 너무 적습니다. (view/prior 정합성 문제)")

    # common 순서로 Q 만들기
    tmp_c = tmp[tmp["ticker_code"].isin(common)].sort_values("ticker_code")
    Q = tmp_c["pred_return"].values
    tickers_common = tmp_c["ticker_code"].values

    return {
        "tickers_common": tickers_common,
        "Q": Q,
        "missing_in_prior": missing_in_prior,  # view는 있는데 prior에 없는 종목
        "missing_in_view": missing_in_view,    # prior는 있는데 view가 없는 종목
        "n_view": len(view_tickers),
        "n_prior": len(prior_tickers),
        "n_common": len(common),
    }


In [45]:
# 예: view_date는 df_view에서 선택 (시총 데이터 마지막 날짜까지만!)
VIEW_DATE = min(df_view["date"].max(), mc.index.max())
VIEW_DATE = pd.to_datetime(VIEW_DATE)

tickers_univ, Sigma, Pi, w_mkt = compute_prior_at_date(
    view_date=VIEW_DATE,
    px_wide=px,
    mc_wide=mc,
    window=252,
    min_coverage=0.90,
    risk_aversion=2.5
)

aligned = align_view_to_prior(
    df_view=df_view,
    view_date=VIEW_DATE,
    tickers_univ=tickers_univ
)

print("VIEW_DATE:", VIEW_DATE.date())
print("n_view:", aligned["n_view"], "n_prior:", aligned["n_prior"], "n_common:", aligned["n_common"])
print("missing_in_prior (view-only) sample:", aligned["missing_in_prior"][:10])
print("missing_in_view (prior-only) sample:", aligned["missing_in_view"][:10])

Q = aligned["Q"]
tickers_common = aligned["tickers_common"]


VIEW_DATE: 2025-12-30
n_view: 13 n_prior: 2673 n_common: 13
missing_in_prior (view-only) sample: []
missing_in_view (prior-only) sample: ['000020', '000040', '000050', '000070', '000080', '000087', '000100', '000105', '000120', '000140']


# Black-Litterman Portfolio

In [46]:
def slice_prior_to_common(tickers_univ, Sigma, Pi, w_mkt, tickers_common):
    # tickers_common: array-like of ticker codes (sorted)
    idx_map = {t:i for i,t in enumerate(tickers_univ)}
    idx = np.array([idx_map[t] for t in tickers_common], dtype=int)

    Sigma_c = Sigma[np.ix_(idx, idx)]
    Pi_c = Pi[idx]
    w_c = w_mkt[idx]

    return Sigma_c, Pi_c, w_c


In [47]:
Sigma_c, Pi_c, w_mkt_c = slice_prior_to_common(
    tickers_univ=tickers_univ,
    Sigma=Sigma,
    Pi=Pi,
    w_mkt=w_mkt,
    tickers_common=tickers_common
)

Sigma_c.shape, Pi_c.shape, w_mkt_c.shape


((13, 13), (13,), (13,))

In [48]:
Q = aligned["Q"]
N = len(Q)

P = np.eye(N)   # absolute views

P.shape, Q.shape


((13, 13), (13,))

In [49]:
def compute_oos_mse_for_omega(df_pred, target_return, train_end_date):
    mask = df_pred["date"] > pd.to_datetime(train_end_date)
    y_true = df_pred.loc[mask, target_return].values
    y_pred = df_pred.loc[mask, "pred_return"].values
    return mean_squared_error(y_true, y_pred)

oos_mse = compute_oos_mse_for_omega(
    df_pred=df_pred,
    target_return=BEST_RETURN,   # "log_return_0"
    train_end_date=TRAIN_END_DATE
)

Omega = np.eye(N) * oos_mse

oos_mse, Omega.shape


(0.001597393904599764, (13, 13))

In [50]:
def black_litterman_posterior(Pi, Sigma, P, Q, Omega, tau=0.05):
    Sigma_t = tau * Sigma

    A = np.linalg.inv(Sigma_t) + P.T @ np.linalg.inv(Omega) @ P
    b = np.linalg.inv(Sigma_t) @ Pi + P.T @ np.linalg.inv(Omega) @ Q

    mu_bl = np.linalg.solve(A, b)  # inv(A)@b 보다 안정적
    return mu_bl

TAU = 0.05

mu_bl = black_litterman_posterior(
    Pi=Pi_c,
    Sigma=Sigma_c,
    P=P,
    Q=Q,
    Omega=Omega,
    tau=TAU
)

mu_bl.shape


(13,)

In [51]:
df_bl_out = pd.DataFrame({
    "ticker_code": tickers_common,
    "Pi_prior": Pi_c,
    "Q_view": Q,
    "mu_bl": mu_bl,
    "w_mkt": w_mkt_c
}).sort_values("mu_bl", ascending=False).reset_index(drop=True)

df_bl_out.head(20)


Unnamed: 0,ticker_code,Pi_prior,Q_view,mu_bl,w_mkt
0,348340,0.000387,0.017769,0.001366,9.6e-05
1,60720,0.000353,0.019014,0.001066,6.8e-05
2,11070,0.000411,0.011183,0.000984,0.001637
3,420570,0.000318,0.003713,0.000796,1.3e-05
4,78340,0.000209,0.01185,0.000652,9.4e-05
5,1530,0.00011,0.008238,0.000575,9.9e-05
6,1450,0.000208,0.008973,0.00056,0.000703
7,1120,0.000179,0.013676,0.000551,0.000322
8,145720,0.000276,-0.005911,0.000467,0.000131
9,3230,0.000191,0.002585,0.00043,0.002367


In [52]:
df_bl_out[["Pi_prior", "Q_view", "mu_bl"]].describe()

Unnamed: 0,Pi_prior,Q_view,mu_bl
count,13.0,13.0,13.0
mean,0.000239,0.007112,0.000649
std,0.000104,0.00756,0.000326
min,7.2e-05,-0.005911,0.000162
25%,0.000179,0.002585,0.00043
50%,0.000209,0.008238,0.00056
75%,0.000318,0.01185,0.000796
max,0.000411,0.019014,0.001366


In [53]:
df_bl_out[["Pi_prior", "Q_view", "mu_bl"]].corr()

Unnamed: 0,Pi_prior,Q_view,mu_bl
Pi_prior,1.0,0.426456,0.853023
Q_view,0.426456,1.0,0.761358
mu_bl,0.853023,0.761358,1.0
