In [87]:
# library import
import pandas as pd
import numpy as np
from sklearn.linear_model import LinearRegression, Ridge, Lasso
import xgboost as xgb
from xgboost import XGBRegressor
import lightgbm as lgb
from lightgbm import LGBMRegressor
from sklearn.model_selection import train_test_split
from sklearn.neural_network import MLPRegressor
from catboost import CatBoostRegressor
from sklearn.ensemble import HistGradientBoostingRegressor
from sklearn.preprocessing import OneHotEncoder, StandardScaler, MinMaxScaler
from sklearn.compose import ColumnTransformer
from sklearn.pipeline import Pipeline
import optuna

from tqdm import tqdm
tqdm.pandas()

import warnings
warnings.filterwarnings("ignore")

In [88]:
# 평가 함수
def _validate_input(answer_df, submission_df):
    # ① 컬럼 개수·이름 일치 여부
    if len(answer_df.columns) != len(submission_df.columns) or not all(answer_df.columns == submission_df.columns):
        raise ValueError("The columns of the answer and submission dataframes do not match.")


    # ② 필수 컬럼에 NaN 존재 여부
    if submission_df.isnull().values.any():
        raise ValueError("The submission dataframe contains missing values.")


    # ③ pair 중복 여부
    pairs = list(zip(submission_df["leading_item_id"], submission_df["following_item_id"]))
    if len(pairs) != len(set(pairs)):
        raise ValueError("The submission dataframe contains duplicate (leading_item_id, following_item_id) pairs.")
        
def comovement_f1(answer_df, submission_df):
    """공행성쌍 F1 계산"""
    ans = answer_df[["leading_item_id", "following_item_id"]].copy()
    sub = submission_df[["leading_item_id", "following_item_id"]].copy()


    ans["pair"] = list(zip(ans["leading_item_id"], ans["following_item_id"]))
    sub["pair"] = list(zip(sub["leading_item_id"], sub["following_item_id"]))


    G = set(ans["pair"])
    P = set(sub["pair"])


    tp = len(G & P)
    fp = len(P - G)
    fn = len(G - P)


    precision = tp / (tp + fp) if (tp + fp) > 0 else 0.0
    recall = tp / (tp + fn) if (tp + fn) > 0 else 0.0
    f1 = 2 * precision * recall / (precision + recall) if (precision + recall) > 0 else 0.0


    return f1


def comovement_nmae(answer_df, submission_df, eps=1e-6):
    """
    전체 U = G ∪ P에 대한 clipped NMAE 계산
    """
    ans = answer_df[["leading_item_id", "following_item_id", "value"]].copy()
    sub = submission_df[["leading_item_id", "following_item_id", "value"]].copy()


    ans["pair"] = list(zip(ans["leading_item_id"], ans["following_item_id"]))
    sub["pair"] = list(zip(sub["leading_item_id"], sub["following_item_id"]))


    G = set(ans["pair"])
    P = set(sub["pair"])
    U = G | P


    ans_val = dict(zip(ans["pair"], ans["value"]))
    sub_val = dict(zip(sub["pair"], sub["value"]))


    errors = []
    for pair in U:
        if pair in G and pair in P:
            # 정수 변환(반올림)
            y_true = int(round(float(ans_val[pair])))
            y_pred = int(round(float(sub_val[pair])))
            rel_err = abs(y_true - y_pred) / (abs(y_true) + eps)
            rel_err = min(rel_err, 1.0) # 오차 100% 이상은 100%로 간주
        else:
            rel_err = 1.0  # FN, FP는 오차 100%
        errors.append(rel_err)


    return np.mean(errors) if errors else 1.0


def comovement_score(answer_df, submission_df):
    _validate_input(answer_df, submission_df)
    S1 = comovement_f1(answer_df, submission_df)
    nmae_full = comovement_nmae(answer_df, submission_df, 1e-6)
    S2 = 1 - nmae_full
    score = 0.6 * S1 + 0.4 * S2
    return score

In [89]:
# Data Load 및 value, weight pivot 생성
train = pd.read_csv('./train.csv')

# year, month, item_id 기준으로 value 합산 (seq만 다르다면 value 합산)
monthly = (
    train
    .groupby(["item_id", "year", "month"], as_index=False)[["value", 'weight']]
    .sum()
)

# year, month를 하나의 키(ym)로 묶기
monthly["ym"] = pd.to_datetime(
    monthly["year"].astype(str) + "-" + monthly["month"].astype(str).str.zfill(2)
)

# item_id × ym 피벗 (월별 총 무역량 매트릭스 생성)

# value에 대한 pivot
pivot_val = (
    monthly
    .pivot(index="item_id", columns="ym", values="value")
    .fillna(0.0)
)

# weight에 대한 pivot
pivot_wgt = (monthly
             .pivot(index='item_id', columns='ym', values='weight')
             .fillna(0.0))

In [90]:
# value pivot
print('pivot_val')
pivot_val.head() # 2022-01 - 2025-07

pivot_val


ym,2022-01-01,2022-02-01,2022-03-01,2022-04-01,2022-05-01,2022-06-01,2022-07-01,2022-08-01,2022-09-01,2022-10-01,...,2024-10-01,2024-11-01,2024-12-01,2025-01-01,2025-02-01,2025-03-01,2025-04-01,2025-05-01,2025-06-01,2025-07-01
item_id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
AANGBULD,14276.0,52347.0,53549.0,0.0,26997.0,84489.0,0.0,0.0,0.0,0.0,...,428725.0,144248.0,26507.0,25691.0,25805.0,0.0,38441.0,0.0,441275.0,533478.0
AHMDUILJ,242705.0,120847.0,197317.0,126142.0,71730.0,149138.0,186617.0,169995.0,140547.0,89292.0,...,123085.0,143451.0,78649.0,125098.0,80404.0,157401.0,115509.0,127473.0,89479.0,101317.0
ANWUJOKX,0.0,0.0,0.0,63580.0,81670.0,26424.0,8470.0,0.0,0.0,80475.0,...,0.0,0.0,0.0,27980.0,0.0,0.0,0.0,0.0,0.0,0.0
APQGTRMF,383999.0,512813.0,217064.0,470398.0,539873.0,582317.0,759980.0,216019.0,537693.0,205326.0,...,683581.0,2147.0,0.0,25013.0,77.0,20741.0,2403.0,3543.0,32430.0,40608.0
ATLDMDBO,143097177.0,103568323.0,118403737.0,121873741.0,115024617.0,65716075.0,146216818.0,97552978.0,72341427.0,87454167.0,...,60276050.0,30160198.0,42613728.0,64451013.0,38667429.0,29354408.0,42450439.0,37136720.0,32181798.0,57090235.0


In [91]:
# weight pivot
print('pivot_wgt')
pivot_wgt.head()

pivot_wgt


ym,2022-01-01,2022-02-01,2022-03-01,2022-04-01,2022-05-01,2022-06-01,2022-07-01,2022-08-01,2022-09-01,2022-10-01,...,2024-10-01,2024-11-01,2024-12-01,2025-01-01,2025-02-01,2025-03-01,2025-04-01,2025-05-01,2025-06-01,2025-07-01
item_id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
AANGBULD,17625.0,67983.0,69544.0,0.0,34173.0,103666.0,0.0,0.0,0.0,0.0,...,786651.0,249144.0,33133.0,32937.0,33083.0,0.0,49050.0,0.0,865246.0,1046036.0
AHMDUILJ,100990.0,43444.0,64113.0,42637.0,21468.0,59424.0,61587.0,63625.0,61245.0,20382.0,...,42986.0,43763.0,24379.0,62351.0,23521.0,43332.0,44913.0,44035.0,25574.0,34463.0
ANWUJOKX,0.0,0.0,0.0,89967.0,118992.0,41649.0,13888.0,0.0,0.0,119940.0,...,0.0,0.0,0.0,37211.0,0.0,0.0,0.0,0.0,0.0,0.0
APQGTRMF,50193.0,81429.0,43310.0,62505.0,84680.0,37425.0,114600.0,39305.0,104865.0,43123.0,...,118952.0,698.0,0.0,1907.0,11.0,2777.0,347.0,335.0,4974.0,6314.0
ATLDMDBO,163308448.0,113468029.0,131798388.0,118641599.0,106301802.0,63769133.0,148292927.0,101468186.0,77986006.0,94320028.0,...,143545801.0,70368609.0,99495350.0,153804927.0,93762902.0,76888377.0,119375444.0,112349280.0,95457203.0,165713328.0


In [92]:
# 선후행 관계인 공행성쌍 추출
def safe_corr(x, y):
    if np.std(x) == 0 or np.std(y) == 0:
        return 0.0
    return float(np.corrcoef(x, y)[0, 1])

def find_comovement_pairs(pivot, max_lag=12, min_nonzero=36, corr_threshold=0.35):
    items = pivot.index.to_list()
    months = pivot.columns.to_list()
    n_months = len(months)

    results = []

    for i, leader in tqdm(enumerate(items)):
        x = pivot.loc[leader].values.astype(float)
        if np.count_nonzero(x) < min_nonzero:
            continue

        for follower in items:
            if follower == leader:
                continue

            y = pivot.loc[follower].values.astype(float)
            if np.count_nonzero(y) < min_nonzero:
                continue

            best_lag = None
            best_corr = 0.0

            # lag = 1 ~ max_lag 탐색
            for lag in range(1, max_lag + 1):
                if n_months <= lag:
                    continue
                corr = safe_corr(x[:-lag], y[lag:])
                if abs(corr) > abs(best_corr):
                    best_corr = corr
                    best_lag = lag

            # 임계값 이상이면 공행성쌍으로 채택
            if best_lag is not None and abs(best_corr) >= corr_threshold:
                results.append({
                    "leading_item_id": leader,
                    "following_item_id": follower,
                    "best_lag": best_lag,
                    "max_corr": best_corr,
                })

    pairs = pd.DataFrame(results)
    return pairs

pairs = find_comovement_pairs(pivot_val)
print("탐색된 공행성쌍 수:", len(pairs))
pairs.head()

100it [00:05, 19.86it/s]

탐색된 공행성쌍 수: 3001





Unnamed: 0,leading_item_id,following_item_id,best_lag,max_corr
0,AHMDUILJ,APQGTRMF,6,0.419733
1,AHMDUILJ,ATLDMDBO,4,0.483281
2,AHMDUILJ,AXULOHBQ,9,0.391992
3,AHMDUILJ,BJALXPFS,10,0.57447
4,AHMDUILJ,BSRMSVTC,12,0.479705


In [93]:
# 학습 데이터 생성
def build_training_data(pivot_val, pivot_wgt, pairs, df):
    """
    공행성쌍 + 시계열을 이용해 (X, y) 학습 데이터를 만드는 함수
    input X:
      - b_t, b_t_1, a_t_lag, max_corr, best_lag
    target y:
      - b_t_plus_1
    """
    months = pivot_val.columns.to_list()
    n_months = len(months)

    rows = []

    for row in tqdm(pairs.itertuples(index=False), desc='Create Train data'):
        leader = row.leading_item_id
        follower = row.following_item_id
        lag = int(row.best_lag)
        corr = float(row.max_corr)

        if leader not in pivot_val.index or follower not in pivot_val.index:
            continue

        a_series_val = pivot_val.loc[leader].values.astype(float) # value pivot
        a_series_wgt = pivot_wgt.loc[leader].values.astype(float) # weight pivot
        
        a_hs4 = df.loc[df['item_id'] == leader, 'hs4'].unique()[0].astype(str) # 각 품목의 hs4
        b_hs4 = df.loc[df['item_id'] == follower, 'hs4'].unique()[0].astype(str) # 각 품목의 hs4

        a_hs2 = a_hs4[:2]
        b_hs2 = b_hs4[:2]

        b_series_val = pivot_val.loc[follower].values.astype(float)
        b_series_wgt = pivot_val.loc[follower].values.astype(float)

        # t+1이 존재하고, t-lag >= 0인 구간만 학습에 사용
        for t in range(max(lag, 1), n_months - 1):
            b_t = b_series_val[t] # t 시간인 경우의 value 값
            b_t_1 = b_series_val[t - 1] # t-1 시간인 경우의 value값

            b_w_t = b_series_wgt[t] # t 시간인 경우의 weight값
            b_w_t_1 = b_series_wgt[t-1] # t-1 시간인 경우의 weight값
            
            a_t_lag = a_series_val[t - lag] # 
            a_w_t_lag = a_series_wgt[t - lag] # 

            b_t_plus_1 = b_series_val[t + 1]

            rows.append({
                'time': t, # 현재 시간
                "b_t": b_t, 
                "b_t_1": b_t_1,
                "b_w_t": b_w_t,
                "b_w_t_1": b_w_t_1,
                "a_t_lag": a_t_lag,
                "a_w_t_lag": a_w_t_lag,
                "max_corr": corr,
                "best_lag": float(lag),
                "a_hs2": a_hs2,
                "b_hs2": b_hs2,
                "a_hs4": a_hs4,
                "b_hs4": b_hs4,
                "target": b_t_plus_1,
            })

    df_train = pd.DataFrame(rows)
    return df_train

df_train_model = build_training_data(pivot_val, pivot_wgt, pairs, train) # 데이터 생성
df_train_model[['a_hs2', 'b_hs2', 'a_hs4', 'b_hs4']] = df_train_model[['a_hs2', 'b_hs2', 'a_hs4', 'b_hs4']].astype('category') # hs2, hs4
print('생성된 학습 데이터의 shape :', df_train_model.shape)
df_train_model.head()

Create Train data: 3001it [00:03, 940.39it/s]


생성된 학습 데이터의 shape : (105015, 14)


Unnamed: 0,time,b_t,b_t_1,b_w_t,b_w_t_1,a_t_lag,a_w_t_lag,max_corr,best_lag,a_hs2,b_hs2,a_hs4,b_hs4,target
0,6,759980.0,582317.0,759980.0,582317.0,242705.0,100990.0,0.419733,6.0,21,81,2102,8105,216019.0
1,7,216019.0,759980.0,216019.0,759980.0,120847.0,43444.0,0.419733,6.0,21,81,2102,8105,537693.0
2,8,537693.0,216019.0,537693.0,216019.0,197317.0,64113.0,0.419733,6.0,21,81,2102,8105,205326.0
3,9,205326.0,537693.0,205326.0,537693.0,126142.0,42637.0,0.419733,6.0,21,81,2102,8105,169440.0
4,10,169440.0,205326.0,169440.0,205326.0,71730.0,21468.0,0.419733,6.0,21,81,2102,8105,698033.0


In [94]:
# 각 feature DataType 확인
df_train_model.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 105015 entries, 0 to 105014
Data columns (total 14 columns):
 #   Column     Non-Null Count   Dtype   
---  ------     --------------   -----   
 0   time       105015 non-null  int64   
 1   b_t        105015 non-null  float64 
 2   b_t_1      105015 non-null  float64 
 3   b_w_t      105015 non-null  float64 
 4   b_w_t_1    105015 non-null  float64 
 5   a_t_lag    105015 non-null  float64 
 6   a_w_t_lag  105015 non-null  float64 
 7   max_corr   105015 non-null  float64 
 8   best_lag   105015 non-null  float64 
 9   a_hs2      105015 non-null  category
 10  b_hs2      105015 non-null  category
 11  a_hs4      105015 non-null  category
 12  b_hs4      105015 non-null  category
 13  target     105015 non-null  float64 
dtypes: category(4), float64(9), int64(1)
memory usage: 8.4 MB


In [95]:
# 훈련 데이터에 대한 정보

# 사용할 feature
feature_cols = ["time", "b_t", "b_t_1", "b_w_t", "b_w_t_1", "a_t_lag", "a_w_t_lag", "max_corr", "best_lag", "a_hs2", "b_hs2", "a_hs4", "b_hs4"]

start, end = df_train_model['time'].min(), df_train_model['time'].max()
print(f'기간의 범위: {start} - {end}')
print(f'마지막 달 부터 11개월 전: {end-11}\n')
print('2025-07: 42')
print('2025-06: 41')
print('2024-07: 30')


기간의 범위: 1 - 41
마지막 달 부터 11개월 전: 30

2025-07: 42
2025-06: 41
2024-07: 30


In [96]:
# 평가 지표의 S2 계산 함수(상대 오차)
def calculate_custom_error(y_true, y_pred):
    """
    상대 오차(Relative Error) 계산 로직
    """
    # 음수 예측 보정
    y_pred = np.maximum(y_pred, 0)
    
    eps = 1e-6
    diff = np.abs(y_true - y_pred)
    denom = np.abs(y_true) + eps
    
    rel_err = diff / denom
    bounded_err = np.minimum(rel_err, 1.0)
    
    return np.mean(bounded_err)

In [97]:
# 2024-07 부터 2024-06 까지 훈련과 검증을 수행하는 함수
def train_validation_with_fold(df, feats, base_model, model_type):
    end = df['time'].max() # 마지막 달 6월
    start = end-11 # 시작 달 7월
    fold_scores = [] # 각 fold의 점수
    for vtime in range(start, end+1):
        # 훈련 데이터
        x_tr, y_tr = df.loc[df['time'] < vtime, feats], df.loc[df['time'] < vtime, 'target']

        # 검증 데이터
        x_val, y_val = df.loc[df['time'] == vtime, feats], df.loc[df['time'] == vtime, 'target']
        
        # 최근 데이터일수록 큰 가중치 부여
        min_time = x_tr['time'].min()
        max_time = x_tr['time'].max()
        norm_time = (x_tr['time'] - min_time) / (max_time - min_time)
        sample_weight = np.exp(norm_time*3)

        fit_params = {}
        # 모델 Type 별 fit 입력 값
        if model_type == 'lgb': # LightGBM
            fit_params = {
                'eval_set': [(x_val, y_val)],
                'sample_weight': sample_weight,
                'callbacks': [lgb.early_stopping(stopping_rounds=50, verbose=False)]
            }

        elif model_type == 'xgb': # XGBoost
            fit_params = {
                'eval_set': [(x_val, y_val)],
                'sample_weight': sample_weight,
                'verbose': False
            }

        elif model_type == 'hgb': # Histogram Gradient Boost
            fit_params = {
                'sample_weight': sample_weight
            }
        
        base_model.fit(x_tr, y_tr, **fit_params)

        preds = base_model.predict(x_val) # 추론
        
        score = calculate_custom_error(y_val, preds) # 검증에 대한 점수
        fold_scores.append(score)


    return np.mean(fold_scores) # 각 fold의 점수에 대한 점수 반환

In [98]:
# optuna를 통한 하이퍼 파라미터 최적화 함수
def objective(trial, df, feats, seed, model_type):
    cat_cols = ['a_hs2', 'b_hs2', 'a_hs4', 'b_hs4']
    model = None
    if model_type == 'lgb':
        params = {
            'objective': 'regression',
            'verbosity': -1,
            'n_estimators': trial.suggest_int('n_estimators', 400, 2000),
            'learning_rate': trial.suggest_float('learning_rate', 1e-3, 0.1, log=True),
            'num_leaves': trial.suggest_int('num_leaves', 10, 200),
            'max_depth': trial.suggest_int('max_depth', 3, 15),
            'subsample': trial.suggest_float('subsample', 0.5, 1.0),
            'colsample_bytree': trial.suggest_float('colsample_bytree', 0.5, 1.0),
            'reg_alpha': trial.suggest_float('reg_alpha', 1e-3, 10.0, log=True),
            'reg_lambda': trial.suggest_float('reg_lambda', 1e-3, 10.0, log=True),
            'random_state': seed,
            'n_jobs': -1
        }
        model = LGBMRegressor(**params)

    elif model_type == 'xgb':
        params = {
            'objective': 'reg:squarederror',
            'n_estimators': trial.suggest_int('n_estimators', 400, 2000),
            'learning_rate': trial.suggest_float('learning_rate', 1e-3, 0.1, log=True),
            'max_depth': trial.suggest_int('max_depth', 3, 15),
            'subsample': trial.suggest_float('subsample', 0.5, 1.0),
            'colsample_bytree': trial.suggest_float('colsample_bytree', 0.5, 1.0),
            'reg_alpha': trial.suggest_float('reg_alpha', 1e-3, 10.0, log=True),
            'reg_lambda': trial.suggest_float('reg_lambda', 1e-3, 10.0, log=True),
            'early_stopping_rounds': 50, 
            'random_state': seed,
            'n_jobs': -1,
            'enable_categorical': True, 
            'tree_method': 'hist'
        }
        model = XGBRegressor(**params)

    elif model_type == 'hgb':
        is_categorical = [f in cat_cols for f in feats]
        params = {
            'loss': 'squared_error',
            'max_iter': trial.suggest_int('max_iter', 400, 2000),
            'learning_rate': trial.suggest_float('learning_rate', 1e-3, 0.1, log=True),
            'max_depth': trial.suggest_int('max_depth', 3, 15),
            'max_leaf_nodes': trial.suggest_int('max_leaf_nodes', 10, 200),
            'l2_regularization': trial.suggest_float('l2_regularization', 1e-3, 10.0, log=True),
            'categorical_features': is_categorical, # Boolean Mask 전달
            'random_state': seed
        }
        model = HistGradientBoostingRegressor(**params)

    score = train_validation_with_fold(df, feats, model, model_type) # fold 점수 반환

    return score

In [99]:
# 각 SEED 별 최적의 하이퍼 파라미터 탐색 -- ## 수정 각 모델마다 1개의 SEED 사용 ##
def training_with_seed(df, feats, model_type):
    optuna.logging.set_verbosity(optuna.logging.INFO)
    SEED = [1]
    trials = 20
    
    # objective 함수 내에 있던 cat_cols 정보 (HGB 마스크 생성용)
    cat_cols = ['a_hs2', 'b_hs2', 'a_hs4', 'b_hs4']

    Scores = []
    Params = []
    
    print(f'{model_type} 모델 (SEED {len(SEED)}개: {SEED}), Trial = {trials}')
    
    for seed in SEED:
        # 1. 최적화 수행
        study = optuna.create_study(direction='minimize', study_name='Optimization')
        study.optimize(lambda trial: objective(trial, df, feats, seed, model_type), n_trials=trials)

        print(f'SEED: {seed}')
        print(f'Best_Score: {study.best_value}\n')
        
        # 2. [핵심] 고정 파라미터와 최적 파라미터 병합 (Merge)
        best_params = study.best_params # Optuna가 찾은 값들
        final_params = {}

        if model_type == 'lgb':
            final_params = {
                'objective': 'regression',
                'verbosity': -1,
                'random_state': seed,
                'n_jobs': -1
            }
            
        elif model_type == 'xgb':
            final_params = {
                'objective': 'reg:squarederror',
                'random_state': seed,
                'n_jobs': -1,
                'enable_categorical': True, 
                'tree_method': 'hist'
            }
            
        elif model_type == 'hgb':
            # HGB는 categorical_features 마스크가 필요함
            is_categorical = [f in cat_cols for f in feats]
            final_params = {
                'loss': 'squared_error',
                'categorical_features': is_categorical,
                'random_state': seed
            }

        # 고정 파라미터 딕셔너리에 Optuna 최적 파라미터를 덮어씌움 (Update)
        final_params.update(best_params)
    
        Scores.append(study.best_value)
        Params.append(final_params) # 이제 모든 파라미터가 포함됨
    
    return Scores, Params

In [100]:
# LGB optuna
_, lgb_params = training_with_seed(df_train_model, feature_cols, 'lgb')

[I 2025-11-28 05:08:01,955] A new study created in memory with name: Optimization


lgb 모델 (SEED 1개: [1]), Trial = 20


[I 2025-11-28 05:08:42,724] Trial 0 finished with value: 0.6657517773089668 and parameters: {'n_estimators': 1598, 'learning_rate': 0.0013033619075800964, 'num_leaves': 96, 'max_depth': 12, 'subsample': 0.8501322796399804, 'colsample_bytree': 0.9872881027988333, 'reg_alpha': 0.0024526949421631505, 'reg_lambda': 0.15557166348394752}. Best is trial 0 with value: 0.6657517773089668.
[I 2025-11-28 05:08:47,690] Trial 1 finished with value: 0.6018682263710703 and parameters: {'n_estimators': 1555, 'learning_rate': 0.005000071144937592, 'num_leaves': 50, 'max_depth': 4, 'subsample': 0.5778792408349296, 'colsample_bytree': 0.9297809955270023, 'reg_alpha': 0.0026863445187137007, 'reg_lambda': 0.06178734024043109}. Best is trial 1 with value: 0.6018682263710703.
[I 2025-11-28 05:08:48,925] Trial 2 finished with value: 0.5885474846918524 and parameters: {'n_estimators': 1891, 'learning_rate': 0.07213024683411125, 'num_leaves': 115, 'max_depth': 3, 'subsample': 0.6492088924237205, 'colsample_bytr

SEED: 1
Best_Score: 0.5574234491335498



In [101]:
# XGB optuna
_, xgb_params = training_with_seed(df_train_model, feature_cols, 'xgb')

[I 2025-11-28 05:11:31,635] A new study created in memory with name: Optimization


xgb 모델 (SEED 1개: [1]), Trial = 20


[I 2025-11-28 05:11:35,406] Trial 0 finished with value: 0.5529871227782798 and parameters: {'n_estimators': 1305, 'learning_rate': 0.059238529418810096, 'max_depth': 7, 'subsample': 0.7039388445537336, 'colsample_bytree': 0.7861058574315025, 'reg_alpha': 0.056243531054134766, 'reg_lambda': 0.0102149648426016}. Best is trial 0 with value: 0.5529871227782798.
[I 2025-11-28 05:11:39,070] Trial 1 finished with value: 0.5823067686363269 and parameters: {'n_estimators': 479, 'learning_rate': 0.017957939276075707, 'max_depth': 3, 'subsample': 0.8520084718363244, 'colsample_bytree': 0.9730037462792338, 'reg_alpha': 0.2367834693885073, 'reg_lambda': 1.3196764779689176}. Best is trial 0 with value: 0.5529871227782798.
[I 2025-11-28 05:12:31,161] Trial 2 finished with value: 0.5632991420388778 and parameters: {'n_estimators': 1581, 'learning_rate': 0.09712937575670637, 'max_depth': 15, 'subsample': 0.572188195655627, 'colsample_bytree': 0.6282691595706034, 'reg_alpha': 4.196530458924404, 'reg_la

SEED: 1
Best_Score: 0.521296992885136



In [102]:
# HGB optuna
_, hgb_params = training_with_seed(df_train_model, feature_cols, 'hgb')

[I 2025-11-28 05:25:35,691] A new study created in memory with name: Optimization


hgb 모델 (SEED 1개: [1]), Trial = 20


[I 2025-11-28 05:27:20,618] Trial 0 finished with value: 0.6567858860725041 and parameters: {'max_iter': 926, 'learning_rate': 0.0023954644383620055, 'max_depth': 12, 'max_leaf_nodes': 144, 'l2_regularization': 0.42233371806503495}. Best is trial 0 with value: 0.6567858860725041.
[I 2025-11-28 05:28:14,524] Trial 1 finished with value: 0.4713051315611811 and parameters: {'max_iter': 985, 'learning_rate': 0.0932455122647868, 'max_depth': 12, 'max_leaf_nodes': 84, 'l2_regularization': 0.1299384387562076}. Best is trial 1 with value: 0.4713051315611811.
[I 2025-11-28 05:29:11,381] Trial 2 finished with value: 0.45643860534984243 and parameters: {'max_iter': 709, 'learning_rate': 0.01897622000118871, 'max_depth': 15, 'max_leaf_nodes': 92, 'l2_regularization': 0.27641080971811927}. Best is trial 2 with value: 0.45643860534984243.
[I 2025-11-28 05:29:35,951] Trial 3 finished with value: 0.49086689386229243 and parameters: {'max_iter': 959, 'learning_rate': 0.01136662167790142, 'max_depth': 4

SEED: 1
Best_Score: 0.4545077238185123



In [103]:
# 추가 모델 정의 (Linear, Ridge, Lasso, MLP)
X = df_train_model[feature_cols].copy()
y = df_train_model['target'].copy()

numeric_features = ["time", "b_t", "b_t_1", "b_w_t", "b_w_t_1", "a_t_lag", "a_w_t_lag", "max_corr", "best_lag"]
categorical_features = ["a_hs2", "b_hs2", "a_hs4", "b_hs4"]

# 데이터 전처리
preprocessor = ColumnTransformer(
    transformers=[
        ('num', StandardScaler(), numeric_features),
        ('cat', OneHotEncoder(handle_unknown='ignore'), categorical_features)
    ])

linear_model = Pipeline([('preprocessor', preprocessor), ('regressor', LinearRegression())])
ridge_model = Pipeline([('preprocessor', preprocessor), ('regressor', Ridge(alpha=1.0))])
lasso_model = Pipeline([('preprocessor', preprocessor), ('regressor', Lasso(alpha=0.01))])
mlp_model = Pipeline([('preprocessor', preprocessor), ('regressor', MLPRegressor(hidden_layer_sizes=(64, 32), max_iter=500, random_state=42))])

additional_models = {
    'linear': linear_model,
    'ridge': ridge_model,
    'lasso': lasso_model,
    'mlp': mlp_model
}

In [104]:
# 모든 모델 앙상블 및 잔차 학습 모델 추가
class FinalHybridModel:
    def __init__(self, lgb_params_list, xgb_params_list, hgb_params_list, additional_models, weights=None):
        self.lgb_params = lgb_params_list
        self.xgb_params = xgb_params_list
        self.hgb_params = hgb_params_list
        self.add_models = additional_models
        
        self.trained_lgb = []
        self.trained_xgb = []
        self.trained_hgb = []
        self.trained_add = {}
        
        self.residual_model = None
        
        if weights is None:
            self.weights = {
                'lgb': 3.0, 'xgb': 3.0, 'hgb': 2.0,
                'linear': 0.5, 'ridge': 0.5, 'lasso': 0.5, 'mlp': 1.0
            }
        else:
            self.weights = weights

    def fit(self, X, y, use_residual=True): # 학습
        print("===== 1. Start Training Base Models with Time Weights =====")
        
        # 시간 가중치 계산
        times = X['time']
        min_time = times.min()
        max_time = times.max()
        denom = max_time - min_time
        
        norm_time = (times - min_time) / denom
        sample_weight = np.exp(norm_time * 3).values
        
        # -------------------------------------------------------
        
        # 2) LGBM
        print(f"Training {len(self.lgb_params)} LGBM Seed Models...")
        for params in self.lgb_params:
            model = LGBMRegressor(**params)
            model.fit(X, y, sample_weight=sample_weight) 
            self.trained_lgb.append(model)
            
        # 3) XGB
        print(f"Training {len(self.xgb_params)} XGB Seed Models...")
        for params in self.xgb_params:
            params['enable_categorical'] = True
            params['tree_method'] = 'hist'
            model = XGBRegressor(**params)
            model.fit(X, y, sample_weight=sample_weight, verbose=False)
            self.trained_xgb.append(model)
            
        # 4) HGB
        print(f"Training {len(self.hgb_params)} HGB Seed Models...")
        for params in self.hgb_params:
            model = HistGradientBoostingRegressor(**params)
            # [수정] sample_weight 전달
            model.fit(X, y, sample_weight=sample_weight)
            self.trained_hgb.append(model)
            
        # 5) Additional Models Training
        print("Training Additional Models...")
        for name, model in self.add_models.items():
            print(f" - Fitting {name}...")
            
            if name == 'mlp':
                model.fit(X, y)
            else:
                try:
                    model.fit(X, y, regressor__sample_weight=sample_weight)
                except:
                    print(f"   (Warning: {name} fit without weights)")
                    model.fit(X, y)
                    
            self.trained_add[name] = model
            
        # 6) Residual Learning
        if use_residual:
            print("===== 2. Start Residual Learning =====")
            base_preds = self._predict_voting(X)
            residuals = y - base_preds
            
            print("Fitting Residual Model (LGBM)...")
            res_params = {
                'n_estimators': 300, 'learning_rate': 0.01, 'max_depth': 3,
                'random_state': 42, 'n_jobs': -1, 'verbosity': -1, 'reg_alpha': 1.0
            }
            self.residual_model = LGBMRegressor(**res_params)
            self.residual_model.fit(X, residuals, sample_weight=sample_weight)
            
        print("===== Training Completed =====")

    def _predict_voting(self, X):
        # (기존 코드와 동일)
        pred_lgb = np.mean([m.predict(X) for m in self.trained_lgb], axis=0)
        pred_xgb = np.mean([m.predict(X) for m in self.trained_xgb], axis=0)
        pred_hgb = np.mean([m.predict(X) for m in self.trained_hgb], axis=0)
        
        pred_add = {}
        for name, model in self.trained_add.items():
            pred_add[name] = model.predict(X)
            
        final_pred = (
            pred_lgb * self.weights['lgb'] +
            pred_xgb * self.weights['xgb'] +
            pred_hgb * self.weights['hgb'] +
            pred_add['linear'] * self.weights['linear'] +
            pred_add['ridge'] * self.weights['ridge'] +
            pred_add['lasso'] * self.weights['lasso'] +
            pred_add['mlp'] * self.weights['mlp']
        )
        return final_pred / sum(self.weights.values())

    def predict(self, X):
        base_pred = self._predict_voting(X)
        if self.residual_model is not None:
            res_pred = self.residual_model.predict(X)
            return base_pred + res_pred
        return base_pred

In [105]:
# 앙상블 모델 학습
voting_weights = {
    'lgb': 0.5, 'xgb': 0.5, 'hgb': 4.0,
    'linear': 0.5, 'ridge': 0.5, 'lasso': 0.5, 'mlp': 3.5
}

final_model = FinalHybridModel(
    lgb_params_list=lgb_params, 
    xgb_params_list=xgb_params, 
    hgb_params_list=hgb_params, 
    additional_models=additional_models,
    weights=voting_weights
)

final_model.fit(X, y, use_residual=True)

===== 1. Start Training Base Models with Time Weights =====
Training 1 LGBM Seed Models...
Training 1 XGB Seed Models...
Training 1 HGB Seed Models...
Training Additional Models...
 - Fitting linear...
 - Fitting ridge...
 - Fitting lasso...
 - Fitting mlp...
===== 2. Start Residual Learning =====
Fitting Residual Model (LGBM)...
===== Training Completed =====


In [106]:
def predict_final_submission(pivot_val, pivot_wgt, pairs, final_model, df):
    months = pivot_val.columns.to_list()
    n_months = len(months)

    t_last = n_months-1 # 2025-07
    t_prev = n_months-2 # 2025-06

    cat_cols = ['a_hs2', 'b_hs2', 'a_hs4', 'b_hs4']
    feature_cols = ["time", "b_t", "b_t_1", "b_w_t", "b_w_t_1", "a_t_lag", "a_w_t_lag", "max_corr", "best_lag", "a_hs2", "b_hs2", "a_hs4", "b_hs4"]

    preds = []

    print("Generating Final Predictions...")
    for row in tqdm(pairs.itertuples(index=False), total=len(pairs)):
        leader = row.leading_item_id
        follower = row.following_item_id
        lag = int(row.best_lag)
        corr = float(row.max_corr)

        # 데이터 유효성 검사
        if leader not in pivot_val.index or follower not in pivot_val.index:
            continue

        a_series_val = pivot_val.loc[leader].values.astype(float)
        a_series_wgt = pivot_wgt.loc[leader].values.astype(float)

        b_series_val = pivot_val.loc[follower].values.astype(float)
        b_series_wgt = pivot_wgt.loc[follower].values.astype(float)

        # HS 코드 추출 (df 원본 참조)
        a_hs4 = df.loc[df['item_id'] == leader, 'hs4'].iloc[0]
        b_hs4 = df.loc[df['item_id'] == follower, 'hs4'].iloc[0]
            
        a_hs2 = str(a_hs4)[:2]
        b_hs2 = str(b_hs4)[:2]
        a_hs4 = str(a_hs4)
        b_hs4 = str(b_hs4)
        
        # Lag 범위 체크
        if t_last - lag < 0:
            continue
            
        b_t = b_series_val[t_last]
        b_t_1 = b_series_val[t_prev]
        b_w_t = b_series_wgt[t_last]
        b_w_t_1 = b_series_wgt[t_prev]
        
        a_t_lag = a_series_val[t_last - lag]
        a_w_t_lag = a_series_wgt[t_last - lag]

        # DataFrame 형태로 입력 생성 (컬럼 순서 및 이름 일치 필수)
        input_row = pd.DataFrame([{
            "time": t_last, # 2025-07 시점
            "b_t": b_t, 
            "b_t_1": b_t_1,
            "b_w_t": b_w_t, 
            "b_w_t_1": b_w_t_1, 
            "a_t_lag": a_t_lag, 
            "a_w_t_lag": a_w_t_lag, 
            "max_corr": corr, 
            "best_lag": float(lag), 
            "b_hs2": b_hs2,
            "a_hs2": a_hs2,
            "a_hs4": a_hs4, 
            "b_hs4": b_hs4,
        }])
        
        # 학습 때 사용한 컬럼 순서로 정렬 (안전장치)
        input_row = input_row[feature_cols] # feature_cols에 hs2가 없다면 위 dict에서 제외 필요
        for c in cat_cols:
            input_row[c] = input_row[c].astype('category')

        # 예측 수행
        y_pred = final_model.predict(input_row)[0]

        # 후처리 (음수 제거 및 반올림)
        y_pred = max(0.0, float(y_pred))
        y_pred = int(round(y_pred))

        preds.append({
            "leading_item_id": leader,
            "following_item_id": follower,
            "value": y_pred,
        })

    submission_df = pd.DataFrame(preds)
    return submission_df


final_submission = predict_final_submission(pivot_val, pivot_wgt, pairs, final_model, train)

print("최종 예측 완료")
print(final_submission.head())

# 저장
final_submission.to_csv("final_submission.csv", index=False)

Generating Final Predictions...


100%|██████████| 3001/3001 [01:33<00:00, 32.04it/s]


최종 예측 완료
  leading_item_id following_item_id     value
0        AHMDUILJ          APQGTRMF    105191
1        AHMDUILJ          ATLDMDBO  70630515
2        AHMDUILJ          AXULOHBQ     79749
3        AHMDUILJ          BJALXPFS    163597
4        AHMDUILJ          BSRMSVTC    244025
