## CatBoost + TFT-mini ensemble (fixed)

In [None]:

import os
import pandas as pd
import numpy as np
from datetime import datetime, timedelta
from catboost import CatBoostRegressor
import pytorch_lightning as pl
from pytorch_forecasting import TimeSeriesDataSet, TemporalFusionTransformer
from pytorch_forecasting.data import GroupNormalizer
import optuna

# ---- paths & global settings ----
# Data root; can be overridden with the BASE_DIR environment variable.
BASE_DIR = os.getenv('BASE_DIR', '/open')
TRAIN_PATH = os.path.join(BASE_DIR, 'train', 'train.csv')
TEST_DIR = os.path.join(BASE_DIR, 'test')
SAMPLE_PATH = os.path.join(BASE_DIR, 'sample_submission.csv')

# Window sizes (in rows) used for the lagged rolling-mean features.
ROLL_WINS = [3, 7, 14]
# Encoder (history) length and prediction (horizon) length, in time steps.
ENC_LEN = 28
PRED_LEN = 7


In [None]:

# ---- calendar & holiday features ----
# Holiday dates as ISO strings; only dates in 2025 are listed.
# NOTE(review): rows dated in any other year can never match this set, so
# their holiday flags are always 0 - confirm this covers the data's range.
KR_HOLIDAYS_2025 = {
    '2025-01-01','2025-01-28','2025-01-29','2025-01-30',
    '2025-03-01','2025-03-03','2025-05-05','2025-06-06',
    '2025-08-15','2025-10-03','2025-10-05','2025-10-06','2025-10-07',
    '2025-10-09','2025-12-25'
}
# Names of every column produced by add_calendar_features.
KNOWN_FUTURE_COLS = [
    'dow','month','is_weekend','dow_sin','dow_cos','month_sin','month_cos',
    'is_spring','is_summer','is_fall','is_winter',
    'is_peak_summer','is_peak_winter',
    'is_holiday','before_holiday','after_holiday','is_holiday_run'
]
# Month number -> name of the season indicator column that is 1 for it.
SEASON_MAP = {1:'is_winter',2:'is_winter',3:'is_spring',4:'is_spring',5:'is_spring',
              6:'is_summer',7:'is_summer',8:'is_summer',9:'is_fall',10:'is_fall',11:'is_fall',12:'is_winter'}

def add_calendar_features(df: pd.DataFrame) -> pd.DataFrame:
    """Return a copy of ``df`` with calendar, season and holiday columns added.

    Every feature is derived from the datetime column '영업일자' alone, so
    the result does not depend on row order or on how many rows share a date.

    Args:
        df: Frame with a datetime64 column named '영업일자'.

    Returns:
        A new frame (the input is not modified) containing all columns listed
        in ``KNOWN_FUTURE_COLS``.
    """
    df = df.copy()
    dates = df['영업일자']

    df['dow'] = dates.dt.weekday
    df['month'] = dates.dt.month
    df['is_weekend'] = df['dow'].isin([5,6]).astype(int)

    # Cyclical encodings so that day 6 / day 0 and month 12 / month 1 are close.
    df['dow_sin'] = np.sin(2*np.pi*df['dow']/7)
    df['dow_cos'] = np.cos(2*np.pi*df['dow']/7)
    df['month_sin'] = np.sin(2*np.pi*df['month']/12)
    df['month_cos'] = np.cos(2*np.pi*df['month']/12)

    # One-hot season flags; the mapping is computed once and reused.
    season = df['month'].map(SEASON_MAP)
    for s in ('is_spring','is_summer','is_fall','is_winter'):
        df[s] = (season == s).astype(int)

    df['is_peak_summer'] = df['month'].between(7,8).astype(int)
    df['is_peak_winter'] = df['month'].isin([1,2,12]).astype(int)

    # Holiday flags use calendar arithmetic (date +/- 1 day).  Shifting rows
    # instead would look at the neighbouring ROW, which is a different item
    # on the same day (or an arbitrary date) in a multi-series frame.
    fmt = '%Y-%m-%d'
    one_day = pd.Timedelta(days=1)
    df['is_holiday'] = dates.dt.strftime(fmt).isin(KR_HOLIDAYS_2025).astype(int)
    df['before_holiday'] = (dates + one_day).dt.strftime(fmt).isin(KR_HOLIDAYS_2025).astype(int)
    df['after_holiday'] = (dates - one_day).dt.strftime(fmt).isin(KR_HOLIDAYS_2025).astype(int)
    df['is_holiday_run'] = df[['is_holiday','before_holiday','after_holiday']].any(axis=1).astype(int)
    return df


def add_rolling_means(df: pd.DataFrame) -> pd.DataFrame:
    """Return a sorted copy of ``df`` with lagged rolling means of '매출수량'.

    Rows are ordered by ('영업장명_메뉴명', '영업일자').  For each window size in
    ``ROLL_WINS`` a column ``roll_mean_<w>`` is added holding, per
    '영업장명_메뉴명' group, the mean of up to ``w`` preceding rows.  The
    current row is excluded by the one-row shift, so the first row of every
    group is NaN.
    """
    def lagged_mean(window):
        # Shift first so a row never contributes to its own feature.
        return lambda series: series.shift(1).rolling(window, min_periods=1).mean()

    ordered = df.sort_values(['영업장명_메뉴명', '영업일자']).copy()
    sales_by_item = ordered.groupby('영업장명_메뉴명')['매출수량']
    for window in ROLL_WINS:
        ordered[f'roll_mean_{window}'] = sales_by_item.transform(lagged_mean(window))
    return ordered


In [None]:

# ---- load & preprocess ----
train = pd.read_csv(TRAIN_PATH, parse_dates=['영업일자'])
train = add_calendar_features(train)
train = add_rolling_means(train)

# Split the combined key on the FIRST underscore only.  An unbounded split
# keeps just the second fragment, so an item name that itself contains '_'
# would be truncated and distinct items could end up with the same pair_id.
key_parts = train['영업장명_메뉴명'].str.split('_', n=1)
train['store_id'] = key_parts.str[0]
train['item_id'] = key_parts.str[1]
train['pair_id'] = train['store_id']+'_'+train['item_id']

# Fail loudly if a calendar feature is missing; a raise (unlike assert) is
# still active when Python runs with -O.
missing = set(KNOWN_FUTURE_COLS) - set(train.columns)
if missing:
    raise ValueError(f"Missing features: {missing}")


In [None]:

# ---- CatBoost training with time-based CV ----
# One direct model per horizon h: features of day t -> sales of day t+h.
from sklearn.model_selection import TimeSeriesSplit
X_cols = KNOWN_FUTURE_COLS + [f'roll_mean_{w}' for w in ROLL_WINS]
cat_models = {}
for h in range(1, PRED_LEN + 1):
    target_col = f'y_H{h}'
    temp = train.copy()
    # Target = sales h rows ahead within the same item (train is ordered by
    # item then date by add_rolling_means).
    temp[target_col] = temp.groupby('영업장명_메뉴명')['매출수량'].shift(-h)
    # Drop only rows lacking a model input or the target, rather than rows
    # with a NaN in any unrelated column.
    temp = temp.dropna(subset=X_cols + [target_col])
    # TimeSeriesSplit cuts folds by ROW POSITION.  Rows must therefore be in
    # chronological order; in the (item, date) order the folds would separate
    # items instead of time periods and validate on "past" data.
    temp = temp.sort_values('영업일자', kind='mergesort')
    X = temp[X_cols]
    y = temp[target_col]
    tscv = TimeSeriesSplit(n_splits=3)
    best_rmse = float('inf')
    best_model = None
    # NOTE(review): folds differ in both training size and validation period,
    # so their RMSEs are not strictly comparable; the lowest-RMSE fold model
    # is kept, as before.
    for tr_idx, val_idx in tscv.split(X):
        model = CatBoostRegressor(iterations=500, depth=6, learning_rate=0.1,
                                   subsample=0.8, rsm=0.8, l2_leaf_reg=3, verbose=False)
        model.fit(X.iloc[tr_idx], y.iloc[tr_idx],
                  eval_set=(X.iloc[val_idx], y.iloc[val_idx]), use_best_model=True)
        pred = model.predict(X.iloc[val_idx])
        rmse = np.sqrt(((pred - y.iloc[val_idx])**2).mean())
        if rmse < best_rmse:
            best_rmse = rmse
            best_model = model
    cat_models[h] = best_model


In [None]:

# ---- TFT-mini with dropout tuning ----
pl.seed_everything(42)
train_tft = train.rename(columns={'매출수량':'sales'})
# Integer day offset from the earliest date, used as the TFT time index.
train_tft['time_idx'] = (train_tft['영업일자'] - train_tft['영업일자'].min()).dt.days
# The first row of every series has no history, so its lagged rolling means
# are NaN.  TimeSeriesDataSet refuses NaN in real-valued columns, so fill
# them here; 0.0 stands for "no prior sales observed".
roll_cols = [f'roll_mean_{w}' for w in ROLL_WINS]
train_tft[roll_cols] = train_tft[roll_cols].fillna(0.0)
max_idx = train_tft['time_idx'].max()
# Rows after the cutoff (the last PRED_LEN days) are held out for validation.
training_cutoff = max_idx - PRED_LEN

def make_ds(df):
    """Build the training ``TimeSeriesDataSet`` from rows up to ``training_cutoff``.

    Each sample uses exactly ``ENC_LEN`` encoder steps and ``PRED_LEN``
    prediction steps, is grouped by 'pair_id', and has its 'sales' target
    normalised per 'pair_id'.
    """
    history = df[df.time_idx <= training_cutoff]
    rolling_cols = [f'roll_mean_{w}' for w in ROLL_WINS]
    spec = dict(
        time_idx='time_idx',
        target='sales',
        group_ids=['pair_id'],
        # Fixed-length windows: min == max for both encoder and decoder.
        min_encoder_length=ENC_LEN,
        max_encoder_length=ENC_LEN,
        min_prediction_length=PRED_LEN,
        max_prediction_length=PRED_LEN,
        static_categoricals=['store_id','item_id','pair_id'],
        # Calendar features are known for future dates.
        time_varying_known_reals=['time_idx']+KNOWN_FUTURE_COLS,
        # Sales and everything derived from sales is unknown in the future.
        time_varying_unknown_reals=['sales']+rolling_cols,
        target_normalizer=GroupNormalizer(groups=['pair_id']),
    )
    return TimeSeriesDataSet(history, **spec)

def tft_objective(trial):
    """Optuna objective: briefly train a small TFT and return its 'val_loss'.

    Only the dropout rate is tuned (sampled from [0.1, 0.5]).  The model is
    fitted for at most 5 epochs with early stopping on 'val_loss', validating
    on the prediction-mode dataset derived from the full ``train_tft`` frame.
    """
    sampled_dropout = trial.suggest_float('dropout',0.1,0.5)

    fit_ds = make_ds(train_tft)
    fit_loader = fit_ds.to_dataloader(train=True, batch_size=256, num_workers=2)
    holdout_ds = TimeSeriesDataSet.from_dataset(
        fit_ds, train_tft, predict=True, stop_randomization=True)
    holdout_loader = holdout_ds.to_dataloader(train=False, batch_size=256, num_workers=2)

    net = TemporalFusionTransformer.from_dataset(
        fit_ds,
        dropout=sampled_dropout,
        hidden_size=16,
        learning_rate=1e-3,
        attention_head_size=1,
        weight_decay=1e-2,
    )
    stopper = pl.callbacks.EarlyStopping(monitor='val_loss', patience=2)
    runner = pl.Trainer(max_epochs=5, logger=False, enable_checkpointing=False,
                        callbacks=[stopper])
    runner.fit(net, fit_loader, holdout_loader)
    return runner.callback_metrics['val_loss'].item()

# Tune dropout over a handful of short trials, then keep the best value.
study = optuna.create_study(direction='minimize')
study.optimize(tft_objective, n_trials=3)
BEST_DROPOUT = study.best_trial.params['dropout']

# Final fit: same data split and architecture as the tuning runs, but with a
# longer epoch budget and more patient early stopping.
full_ds = make_ds(train_tft)
train_loader = full_ds.to_dataloader(train=True, batch_size=256, num_workers=2)
val_ds = TimeSeriesDataSet.from_dataset(
    full_ds, train_tft, predict=True, stop_randomization=True)
val_loader = val_ds.to_dataloader(train=False, batch_size=256, num_workers=2)
tft_model = TemporalFusionTransformer.from_dataset(
    full_ds,
    dropout=BEST_DROPOUT,
    hidden_size=16,
    learning_rate=1e-3,
    attention_head_size=1,
    weight_decay=1e-2,
)
trainer = pl.Trainer(
    max_epochs=30,
    precision=16,
    callbacks=[pl.callbacks.EarlyStopping(monitor='val_loss', patience=5)],
    logger=False,
)
trainer.fit(tft_model, train_loader, val_loader)


In [None]:

# ---- Ensemble weight optimization ----
# Both models are compared on the same hold-out window: the final PRED_LEN
# steps of every series, which the TFT predicts in `predict=True` mode.
# The dataset is built from `train_tft` because it carries the `time_idx`
# column the TFT dataset requires (plain `train` does not).
# NOTE(review): the CatBoost models were fitted on rows that include this
# window, so their error here is in-sample and the tuned weight is likely
# biased towards CatBoost - confirm, or refit CatBoost on pre-cutoff rows.
val_ds = TimeSeriesDataSet.from_dataset(full_ds, train_tft, predict=True, stop_randomization=True)
val_loader = val_ds.to_dataloader(train=False, batch_size=256)

# return_index=True also yields which series (and first predicted time_idx)
# each prediction row belongs to, so rows can be aligned by key, not position.
tft_out = tft_model.predict(val_loader, mode='prediction', return_index=True)
if hasattr(tft_out, 'output'):
    # Newer pytorch-forecasting: result object with named fields.
    tft_matrix, tft_index = tft_out.output, tft_out.index
else:
    # Older pytorch-forecasting: plain (output, index) tuple.
    tft_matrix, tft_index = tft_out
tft_matrix = tft_matrix.detach().cpu().numpy()   # one row per series, one column per horizon
tft_index = tft_index.reset_index(drop=True)

feature_rows = train_tft[['pair_id', 'time_idx'] + X_cols]
actual_rows = train_tft[['pair_id', 'time_idx', 'sales']].rename(columns={'sales': 'y'})

val_preds = []
for h in range(1, PRED_LEN + 1):
    first_pred_idx = tft_index['time_idx'].to_numpy()
    frame = pd.DataFrame({
        'pair_id': tft_index['pair_id'].to_numpy(),
        # CatBoost forecasts from the last observed day before the window.
        'anchor_idx': first_pred_idx - 1,
        # Day whose actual sales horizon h is scored against.
        'target_idx': first_pred_idx + h - 1,
        'tft': tft_matrix[:, h - 1],
    })
    frame = frame.merge(
        feature_rows, how='inner',
        left_on=['pair_id', 'anchor_idx'], right_on=['pair_id', 'time_idx'],
    ).drop(columns='time_idx')
    frame = frame.merge(
        actual_rows, how='inner',
        left_on=['pair_id', 'target_idx'], right_on=['pair_id', 'time_idx'],
    )
    frame['cat'] = cat_models[h].predict(frame[X_cols])
    val_preds.append(frame[['y', 'cat', 'tft']])
val_preds = pd.concat(val_preds, ignore_index=True)

def weight_objective(trial):
    """Optuna objective: sMAPE-style error (x200) of the blended validation forecast.

    A single weight ``w`` in [0, 1] is sampled; the blend is
    ``w * cat + (1 - w) * tft`` over the rows of ``val_preds``.
    """
    w = trial.suggest_float('w',0,1)
    blended = w*val_preds['cat'] + (1-w)*val_preds['tft']
    actual = val_preds['y']
    abs_error = np.abs(blended-actual)
    scale = np.abs(blended)+np.abs(actual)
    return (abs_error/scale).mean()*200

# Search the CatBoost weight; the TFT weight is its complement so both sum to 1.
study_w = optuna.create_study(direction='minimize')
study_w.optimize(weight_objective, n_trials=20)
W_CAT = study_w.best_trial.params['w']
W_TFT = 1 - W_CAT
print('ensemble weights', W_CAT, W_TFT)


In [None]:

# ---- Submission generation ----
sub = pd.read_csv(SAMPLE_PATH)
all_preds = []
for file in sorted(os.listdir(TEST_DIR)):
    # Skip anything that is not a CSV (hidden files, checkpoints, ...) and
    # derive the id prefix from the real stem instead of slicing 4 characters.
    stem, ext = os.path.splitext(file)
    if ext.lower() != '.csv':
        continue
    df = pd.read_csv(os.path.join(TEST_DIR,file), parse_dates=['영업일자'])
    if df.empty:
        # Nothing to forecast from; the merge below fills these ids with 0.
        continue
    df = add_calendar_features(df)
    df = add_rolling_means(df)
    # Feature row of the most recent date; it is identical for every horizon,
    # so build it once per file rather than once per horizon.
    # NOTE(review): if a test file holds several '영업장명_메뉴명' series this
    # uses a single row for the whole file - confirm the test-file layout.
    X = df.sort_values('영업일자')[X_cols].tail(1)
    # CatBoost prediction per horizon
    for h in range(1,PRED_LEN+1):
        y_cat = cat_models[h].predict(X)[0]
        # NOTE(review): placeholder - the TFT is not run on test data yet, so
        # the blend currently equals the CatBoost forecast whatever the weights.
        y_tft = y_cat
        y_hat = W_CAT*y_cat + W_TFT*y_tft
        all_preds.append({'id':f'{stem}_{h}','매출수량':max(0.0,y_hat)})
sub_pred = pd.DataFrame(all_preds)
sub = sub[['id']].merge(sub_pred,on='id',how='left').fillna(0)
sub.to_csv('submission_dropout.csv', index=False)
print('saved submission_dropout.csv')
