In [None]:
# @title 🔗 0. Google Drive 마운트 & 경로 설정
from google.colab import drive
import os, gc, warnings

# Mount Google Drive; an interactive authorization is needed the first time.
drive.mount('/content/drive')

# All project data lives under one Drive folder; create it when it is missing.
ROOT_DIR = '/content/drive/MyDrive/instaCart_data'
if not os.path.isdir(ROOT_DIR):
    os.makedirs(ROOT_DIR, exist_ok=True)

gc.enable()

In [None]:
# ───────────────────────────────────────────────────────────────
# @title 1. 📦 PyG (CPU) & 기타 라이브러리 설치  ― Torch 2.6.0 + CPU LightGBM · XGBoost · CatBoost (NumPy-2.x 호환판)
# ───────────────────────────────────────────────────────────────
!pip install -q --upgrade pip setuptools wheel

!pip -q install lightgbm==4.3.0 xgboost==2.0.3  category_encoders pytorch_tabnet optuna
# 1) PyG core 3종 + pyg-lib  ▶  torch-2.6.0+cpu wheel index
WHEEL_URL="https://data.pyg.org/whl/torch-2.6.0+cpu.html"
!pip -q install pyg-lib torch-scatter torch-sparse -f $WHEEL_URL
!pip -q install torch-geometric          # 이건 공식 PyPI (CPU 빌드)
# 2) 선택 라이브러리
!pip -q install torch-geometric-temporal faiss-cpu neo4j ipywidgets neomodel tqdm jedi>=0.18.0 catboost
# 3) LightGBM · XGBoost · CatBoost (NumPy-2.x 호환판)
!pip -q install lightgbm==4.3.0 xgboost==2.0.3  category_encoders pytorch_tabnet optuna
# 4) LightGBM update
!pip install --upgrade lightgbm
import lightgbm, numpy
print(lightgbm.__version__, numpy.__version__)  # 4.3.x  2.0.x

In [None]:
# ===============================================================
#  @title 2. Stacking Feature Engineering – 마지막 주문 예측 (user×product 단일 row)
# ===============================================================
import gc, warnings
import numpy as np, pandas as pd
from pathlib import Path
warnings.filterwarnings('ignore')

# Input data folder and the folder that receives the engineered features.
ROOT_DIR = '/content/drive/MyDrive/instaCart_data'
OUT_DIR = ROOT_DIR + '/feature2'

# ── util ──────────────────────────────────────────────────────
# Compact dtypes handed to pd.read_csv for the raw order tables.
DTYPES = dict(
    order_id=np.int32,
    user_id=np.int32,
    product_id=np.uint16,
    aisle_id=np.uint16,
    department_id=np.uint8,
    order_number=np.uint8,
    order_dow=np.uint8,
    order_hour_of_day=np.uint8,
    days_since_prior_order=np.float32,
    add_to_cart_order=np.uint8,
    reordered=np.uint8,
)
def mem_crunch(df, to_float16=False):
    """Downcast 64-bit columns of ``df`` in place and return the same frame.

    float64 columns become float32 (or float16 when ``to_float16`` is true);
    int64 columns become int32. Columns of any other dtype are left alone.
    """
    float_target = 'float16' if to_float16 else 'float32'
    for name in df.columns:
        current = df[name].dtype
        if current == 'float64':
            df[name] = df[name].astype(float_target)
            continue
        if current == 'int64':
            df[name] = df[name].astype('int32')
    return df

def max_streak(arr):
    """Return the length of the longest run of consecutive truthy values in ``arr``."""
    best = 0
    run = 0
    for flag in arr:
        if flag:
            run += 1
            if run > best:
                best = run
        else:
            run = 0
    return best

#───────────────────────────────────────────────────────────────
# 1. CSV 로드
#───────────────────────────────────────────────────────────────
# Large order tables are read with the compact DTYPES mapping.
orders = pd.read_csv(f'{ROOT_DIR}/orders.csv', dtype=DTYPES)
prior = pd.read_csv(f'{ROOT_DIR}/order_products__prior.csv', dtype=DTYPES)
train_lbl = pd.read_csv(f'{ROOT_DIR}/order_products__train.csv', dtype=DTYPES)

# Small lookup tables are read with default dtypes.
products = pd.read_csv(f'{ROOT_DIR}/products.csv')
aisles = pd.read_csv(f'{ROOT_DIR}/aisles.csv')
depart = pd.read_csv(f'{ROOT_DIR}/departments.csv')

#───────────────────────────────────────────────────────────────
# 2. Product / category metadata
#───────────────────────────────────────────────────────────────
# Joining through aisles and departments keeps only products whose aisle and
# department both exist; afterwards only the three id columns are retained.
prods = products.merge(aisles, on='aisle_id')
prods = prods.merge(depart, on='department_id')
prods = prods[['product_id', 'aisle_id', 'department_id']]
prods = prods.astype({
    'product_id': np.uint16,
    'aisle_id': np.uint16,
    'department_id': np.uint8,
})

# The raw lookup tables are no longer needed.
del aisles, depart, products
gc.collect()

#───────────────────────────────────────────────────────────────
# 3. “입력 vs 예측” 주문 구분:
#     입력 = 1~(max_ord−2), 타겟 = 실제 마지막 주문(eval_set=='train')
#───────────────────────────────────────────────────────────────
# Highest order_number of each user, broadcast onto every order row.
orders['max_ord'] = orders.groupby('user_id')['order_number'].transform('max')

# Input orders are those numbered 1 .. max_ord-2.
input_cutoff = orders['max_ord'] - 2
orders['is_input'] = orders['order_number'] <= input_cutoff

# Target orders are the rows flagged eval_set == 'train'.
orders['is_target'] = orders['eval_set'] == 'train'

# Keep only the target orders; their order_id is exposed as train_order_id.
target_rows = orders.loc[orders['is_target'], ['user_id', 'order_id']]
orders_target = target_rows.rename(columns={'order_id': 'train_order_id'})

#───────────────────────────────────────────────────────────────
# 4. Prepare prior_orders and orders_meta
#───────────────────────────────────────────────────────────────
input_cols = [
    'order_id', 'user_id', 'order_number',
    'order_dow', 'order_hour_of_day', 'days_since_prior_order',
]
orders_prior = orders.loc[orders['is_input'], input_cols]

# Order lines of the input orders, enriched with the order-level columns.
prior_orders = mem_crunch(prior.merge(orders_prior, on='order_id', how='inner'))

# Order-level context (day of week, hour, gap) for every order id.
meta_cols = ['order_id', 'order_dow', 'order_hour_of_day', 'days_since_prior_order']
orders_meta = mem_crunch(orders[meta_cols])

#───────────────────────────────────────────────────────────────
# 5. User & Product 특성 집계
#───────────────────────────────────────────────────────────────
print('🧮  aggregating …')

# — User features —
# `prior_orders` holds one row per purchased item, so 'size' counts items.
# The distinct order count is aggregated alongside so that u_avg_basket can be
# turned into items per order (a plain 'size' would be the total item count).
user_agg = prior_orders.groupby('user_id').agg(
    u_total_orders      = ('order_number','max'),
    u_avg_days_between  = ('days_since_prior_order','mean'),
    u_reorder_ratio     = ('reordered','mean'),
    u_distinct_products = ('product_id','nunique'),
    u_avg_basket        = ('product_id','size'),
    u_avg_order_hour    = ('order_hour_of_day','mean'),
    u_avg_order_dow     = ('order_dow','mean'),
    _u_n_orders         = ('order_id','nunique'),
)
# pop() removes the helper column so the exported schema is unchanged.
user_agg['u_avg_basket'] = user_agg['u_avg_basket'] / user_agg.pop('_u_n_orders')
user_feats = mem_crunch(user_agg.reset_index(), to_float16=True)
del user_agg

# — Product features —
prod_feats = mem_crunch(
    prior_orders.groupby('product_id').agg(
        p_total_orders   = ('order_id','size'),
        p_total_reorders = ('reordered','sum'),
        p_reorder_rate   = ('reordered','mean'),
        p_avg_cart_pos   = ('add_to_cart_order','mean')
    ).reset_index()
      .merge(prods, on='product_id', how='left'),
    to_float16=True
)

# Department / aisle reorder rates. The join key is spelled out so the merge
# cannot silently pick up additional shared columns.
dept_rate = (
    prior_orders.merge(prods[['product_id','department_id']], on='product_id')
                .groupby('department_id').reordered.mean()
                .rename('dept_reorder_rate')
)
aisle_rate = (
    prior_orders.merge(prods[['product_id','aisle_id']], on='product_id')
                .groupby('aisle_id').reordered.mean()
                .rename('aisle_reorder_rate')
)

# Popularity rank (1 = most purchased), log-compressed. The index is named
# explicitly because older pandas versions leave the value_counts() index
# unnamed, which would break the merge on 'product_id' below.
pop_rank = (
    np.log1p(prior_orders.product_id.value_counts()
                         .rank(ascending=False, method='min'))
      .rename('p_pop_rank')
      .rename_axis('product_id')
)

prod_feats = mem_crunch(
    prod_feats.merge(dept_rate , on='department_id', how='left')
              .merge(aisle_rate, on='aisle_id',      how='left')
              .merge(pop_rank.reset_index(), on='product_id', how='left'),
    to_float16=True
)

#───────────────────────────────────────────────────────────────
# 6. User×Product 집계 (up_feats)
#───────────────────────────────────────────────────────────────
# Purchase statistics per (user, product) pair over the input orders.
pair_groups = prior_orders.groupby(['user_id', 'product_id'])
up_base = pair_groups.agg(
    up_orders       = ('order_id', 'size'),
    up_last_order   = ('order_number', 'max'),
    up_first_order  = ('order_number', 'min'),
    up_avg_cart_pos = ('add_to_cart_order', 'mean'),
).reset_index()
del pair_groups

# Attach the user's order count and derive recency / frequency features.
orders_per_user = user_feats.set_index('user_id')['u_total_orders']
up_base['u_total_orders'] = up_base['user_id'].map(orders_per_user)
order_gap = up_base['u_total_orders'] - up_base['up_last_order'] + 1
# NOTE(review): despite its name this column is a gap counted in orders, not
# days; it carries the same values as up_orders_since_last.
up_base['up_days_since_last'] = order_gap.astype(np.uint8)
up_base['up_orders_since_last'] = order_gap
up_base['up_order_rate'] = up_base['up_orders'] / up_base['u_total_orders']

# How often the product appears in the user's five most recent input orders.
user_last = prior_orders.groupby('user_id')['order_number'].transform('max')
prior_orders['rev_order'] = user_last - prior_orders['order_number'] + 1
recent_rows = prior_orders[prior_orders['rev_order'] <= 5]
last5 = (
    recent_rows.groupby(['user_id', 'product_id'])
               .size()
               .rename('up_last5_cnt')
               .reset_index()
)
del recent_rows
up_base = up_base.merge(last5, on=['user_id', 'product_id'], how='left')
up_base = up_base.fillna({'up_last5_cnt': 0})

# Longest run of consecutive `reordered` flags, computed only for pairs that
# were bought at least three times.
valid_pairs = up_base.loc[up_base['up_orders'] >= 3, ['user_id', 'product_id']]
sub = (prior_orders.merge(valid_pairs, on=['user_id', 'product_id'], how='inner')
       .sort_values(['user_id', 'product_id', 'order_number'])
       [['user_id', 'product_id', 'reordered']].to_numpy(np.int32))

# Pack (user, product) into one integer so that pair boundaries in the sorted
# array can be found with a single comparison of neighbouring keys.
keys = (sub[:, 0].astype(np.int64) << 20) + sub[:, 1]
segments = np.where(np.r_[True, keys[1:] != keys[:-1]])[0]
segment_ends = np.r_[segments[1:], len(sub)]

user_list, prod_list, streaks = [], [], []
for lo, hi in zip(segments, segment_ends):
    user_list.append(sub[hi - 1, 0])
    prod_list.append(sub[hi - 1, 1])
    streaks.append(max_streak(sub[lo:hi, 2]))

up_streak = pd.DataFrame({
    'user_id': user_list,
    'product_id': prod_list,
    'up_max_reorder_streak': streaks,
})

# Pairs without a computed streak get 0.
up_feats = up_base.merge(up_streak, on=['user_id', 'product_id'], how='left')
up_feats = mem_crunch(up_feats.fillna({'up_max_reorder_streak': 0}), to_float16=True)

#───────────────────────────────────────────────────────────────
# 7. 후보 & train_order_id(마지막 주문) 부착
#───────────────────────────────────────────────────────────────
# Candidate (user, product) pairs: bought at least three times overall or
# present in the five most recent input orders.
# The join is INNER on purpose: `orders_target` only holds users that have an
# eval_set == 'train' order. With a left join the remaining users would keep a
# NaN train_order_id, which (a) turns the id column into float and (b) makes
# every one of their candidates look like a negative example once labels are
# attached, although no ground truth exists for them.
cand_pool = (
    up_feats.query('up_orders>=3 or up_last5_cnt>0')[['user_id','product_id']]
            .merge(orders_target, on='user_id', how='inner')
)
print('# cand_pool rows :', len(cand_pool))

#───────────────────────────────────────────────────────────────
# 8. 라벨 병합 (마지막 주문 GT)
#───────────────────────────────────────────────────────────────
# Attach the ground-truth `reordered` flag of the target order to every
# candidate; candidates missing from the label table become 0.
label_rows = train_lbl[['order_id', 'product_id', 'reordered']]
train_df = cand_pool.merge(
    label_rows,
    left_on=['train_order_id', 'product_id'],
    right_on=['order_id', 'product_id'],
    how='left',
)
train_df = train_df.fillna({'reordered': 0})
train_df = train_df.astype({'reordered': 'uint8'})

# Drop the right-hand order_id brought in by the merge, then expose
# train_order_id under the name order_id.
train_df = train_df.drop(columns=['order_id'])
train_df = train_df.rename(columns={'train_order_id': 'order_id'})

#───────────────────────────────────────────────────────────────
# 9. 최종 피처 빌드
#───────────────────────────────────────────────────────────────
def build_features(df):
    """Join all feature tables onto ``df`` and return a memory-compact frame.

    ``df`` must contain ``user_id``, ``product_id`` and ``order_id``. The
    user×product, user, product and order-level tables are left-joined in that
    order, missing values are filled with 0, and 64-bit columns are downcast
    via ``mem_crunch(..., to_float16=True)``.

    Identifier columns are converted to int32 before the downcast when they
    arrive as floats (which happens whenever an id column contained NaN
    upstream). Without this, ids would be squeezed into float16, whose largest
    finite value is 65504, and larger ids would be written out as ``inf``.
    """
    merged = (
        df.merge(up_feats   , on=['user_id','product_id'], how='left')
          .merge(user_feats , on='user_id'             , how='left')
          .merge(prod_feats , on='product_id'          , how='left')
          .merge(orders_meta, on='order_id'            , how='left')
          .fillna(0)
    )
    for id_col in ('order_id', 'user_id', 'product_id'):
        if id_col in merged.columns and pd.api.types.is_float_dtype(merged[id_col]):
            merged[id_col] = merged[id_col].astype('int32')
    return mem_crunch(merged, to_float16=True)

# Labels first: they only depend on train_df.
y_train = train_df['reordered'].values.astype('uint8')

# Training matrix: every candidate row without its label column.
train_input = train_df.drop(columns=['reordered'])
X_train = build_features(train_input)
del train_input

# Test matrix: the raw candidate pool with its order id column renamed.
test_input = cand_pool.rename(columns={'train_order_id': 'order_id'})
X_test = build_features(test_input)
del test_input

shape_report = f'X_train {X_train.shape}, y_train {y_train.shape}, X_test {X_test.shape}'
print('✅ Shapes →', shape_report)
print('y_train 분포:', np.unique(y_train, return_counts=True))

#───────────────────────────────────────────────────────────────
# 10. Export CSV
#───────────────────────────────────────────────────────────────
# Make sure the output folder exists, then write every table as CSV.
out_root = Path(OUT_DIR)
out_root.mkdir(parents=True, exist_ok=True)

# File name → table, in the order the files are written.
exports = {
    'user_feats.csv': user_feats,
    'product_feats.csv': prod_feats,
    'up_feats.csv': up_feats,
    'X_train.csv': X_train,
    'y_train.csv': pd.DataFrame({'reordered': y_train}),
    'X_test.csv': X_test,
}
for file_name, table in exports.items():
    table.to_csv(f'{OUT_DIR}/{file_name}', index=False)

print('📦 feature tables saved to', OUT_DIR)

In [None]:
# ===============================================================
#  @title 3. RAM-safe 3-fold OOF Stacking
#         (no down-sample, Optuna HP-search, AUC-based, +CatBoost+MLP)
# ===============================================================
import gc, warnings, joblib, os, numpy as np, pandas as pd, torch
from pathlib import Path
from sklearn.model_selection import StratifiedKFold
from sklearn.metrics import (
    accuracy_score, precision_score, recall_score,
    f1_score, roc_auc_score
)
import lightgbm as lgb, xgboost as xgb, catboost as cb, optuna
from sklearn.neural_network import MLPClassifier

warnings.filterwarnings('ignore')

#───────────────────────────────────────────────────────────────
# 0. Paths & Constants
#───────────────────────────────────────────────────────────────
# Data root, engineered-feature folder and stacking-artifact folder.
ROOT = '/content/drive/MyDrive/instaCart_data'
FEATURE_DIR = ROOT + '/feature2'
STACK_DIR = ROOT + '/stack2'
os.makedirs(STACK_DIR, exist_ok=True)

# Number of CV folds and the shared random seed.
NFOLDS = 3
RS = 2025
# Decision thresholds scanned when maximising F1 on the blended OOF scores.
THR_SCAN = np.arange(.15, .35, .01)

#───────────────────────────────────────────────────────────────
# 1. 피처 불러오기 (ID 컬럼 안전 처리)
#───────────────────────────────────────────────────────────────
id_cols = ['order_id', 'user_id', 'product_id']
cat_cols = ['order_dow', 'order_hour_of_day']

# (1) Read the header only and build the dtype map: every column that is
#     neither an ID nor categorical is float32, categorical ones are uint8.
sample_cols = pd.read_csv(f'{FEATURE_DIR}/X_train.csv', nrows=0).columns
excluded = id_cols + cat_cols
dtypes_noid = {}
for name in sample_cols:
    if name not in excluded:
        dtypes_noid[name] = 'float32'
for name in cat_cols:
    dtypes_noid[name] = 'uint8'

# (2) Load the CSVs (ID columns are cleaned afterwards).
csv_options = dict(
    dtype=dtypes_noid,
    engine='pyarrow',
    na_values=['', 'NA', 'nan'],
)
X_full = pd.read_csv(f'{FEATURE_DIR}/X_train.csv', **csv_options)
y_full = pd.read_csv(f'{FEATURE_DIR}/y_train.csv')['reordered'].astype('uint8').values
X_test = pd.read_csv(f'{FEATURE_DIR}/X_test.csv', **csv_options)

# (3) Force ID columns to int32; unparsable or infinite values become -1.
for frame in (X_full, X_test):
    for col in id_cols:
        numeric = pd.to_numeric(frame[col], errors='coerce')
        numeric = numeric.replace([np.inf, -np.inf], np.nan)
        frame[col] = numeric.fillna(-1).astype('int32')

# (4) Model inputs exclude the ID columns.
X_full_enc = X_full.drop(columns=id_cols)
X_test_enc = X_test.drop(columns=id_cols)
del X_full, X_test
gc.collect()

print(f'Loaded: {X_full_enc.shape} rows, RAM ≈ {X_full_enc.memory_usage().sum()/1e9:.2f} GB')

#───────────────────────────────────────────────────────────────
# 2. Optuna HP Search (AUC objective) for LGB, XGB
#───────────────────────────────────────────────────────────────
def lgb_obj(trial):
    """Optuna objective: mean validation AUC of LightGBM over NFOLDS stratified folds.

    Hyper-parameters are sampled from ``trial``. Each fold trains for at most
    500 rounds with early stopping (patience 50) on its validation split.
    A fold that raises, or whose AUC is NaN, contributes 0.0 to the mean; the
    failure is printed so a systematic problem (for example a missing GPU
    backend) is visible instead of silently producing all-zero trials.

    Reads the module-level ``X_full_enc``, ``y_full``, ``NFOLDS`` and ``RS``.
    Returns the mean AUC as a Python float.
    """
    params = dict(
        boosting_type='gbdt', objective='binary', device_type='gpu', metric='auc',
        learning_rate=trial.suggest_float('lr', 1e-2, 1e-1, log=True),
        num_leaves=trial.suggest_int('num_leaves', 31, 255),
        max_depth=trial.suggest_int('max_depth', 4, 12),
        feature_fraction=trial.suggest_float('feature_fraction', .5, .9),
        bagging_fraction=trial.suggest_float('bagging_fraction', .5, .9),
        bagging_freq=trial.suggest_int('bagging_freq', 1, 10),
        lambda_l1=trial.suggest_float('l1', 1e-3, 1, log=True),
        lambda_l2=trial.suggest_float('l2', 1e-3, 1, log=True),
        max_bin=127, min_data_in_leaf=20, force_row_wise=True, verbosity=-1
    )
    skf = StratifiedKFold(NFOLDS, shuffle=True, random_state=RS)
    aucs = []
    for fold_no, (tr, va) in enumerate(skf.split(X_full_enc, y_full), 1):
        dtr = lgb.Dataset(X_full_enc.iloc[tr], label=y_full[tr])
        dvl = lgb.Dataset(X_full_enc.iloc[va], label=y_full[va])
        try:
            bst = lgb.train(
                params, dtr,
                num_boost_round=500,
                valid_sets=[dvl],
                callbacks=[lgb.early_stopping(50)]
            )
            preds = bst.predict(X_full_enc.iloc[va], num_iteration=bst.best_iteration)
            auc = roc_auc_score(y_full[va], preds)
        except Exception as err:
            # Best effort: keep the study running, but report why the fold
            # failed (print is used because warnings are globally silenced).
            print(f'[lgb_obj] trial {trial.number} fold {fold_no} failed: '
                  f'{type(err).__name__}: {err}')
            auc = float('nan')
        aucs.append(0.0 if np.isnan(auc) else auc)
    return float(np.mean(aucs))


def xgb_obj(trial):
    """Optuna objective: mean validation AUC of XGBoost over NFOLDS stratified folds.

    Hyper-parameters are sampled from ``trial`` (the trial parameter names are
    kept as ``lr``, ``min_child``, ``col_bt``, ``l1``, ``l2``). L1/L2
    regularisation is passed as ``reg_alpha`` / ``reg_lambda``, the names
    XGBoost understands; ``lambda_l1`` / ``lambda_l2`` are LightGBM names that
    XGBoost ignores. Early stopping (patience 50) is configured on the
    estimator rather than in ``fit()``, where it is deprecated.

    A fold that raises, or whose AUC is NaN, contributes 0.0 to the mean and
    the failure is printed. Reads the module-level ``X_full_enc``, ``y_full``,
    ``NFOLDS`` and ``RS``. Returns the mean AUC as a Python float.
    """
    params = dict(
        objective='binary:logistic', tree_method='gpu_hist', eval_metric='auc',
        learning_rate=trial.suggest_float('lr', 1e-2, 1e-1, log=True),
        max_depth=trial.suggest_int('max_depth', 4, 10),
        min_child_weight=trial.suggest_int('min_child', 1, 10),
        subsample=trial.suggest_float('subsample', .5, .9),
        colsample_bytree=trial.suggest_float('col_bt', .5, .9),
        reg_alpha=trial.suggest_float('l1', 1e-3, 1, log=True),
        reg_lambda=trial.suggest_float('l2', 1e-3, 1, log=True),
        n_estimators=500, random_state=RS,
        early_stopping_rounds=50
    )
    skf = StratifiedKFold(NFOLDS, shuffle=True, random_state=RS)
    aucs = []
    for fold_no, (tr, va) in enumerate(skf.split(X_full_enc, y_full), 1):
        try:
            m = xgb.XGBClassifier(**params)
            m.fit(
                X_full_enc.iloc[tr], y_full[tr],
                eval_set=[(X_full_enc.iloc[va], y_full[va])],
                verbose=False
            )
            preds = m.predict_proba(X_full_enc.iloc[va])[:, 1]
            auc = roc_auc_score(y_full[va], preds)
        except Exception as err:
            # Best effort: keep the study running, but report why the fold
            # failed (print is used because warnings are globally silenced).
            print(f'[xgb_obj] trial {trial.number} fold {fold_no} failed: '
                  f'{type(err).__name__}: {err}')
            auc = float('nan')
        aucs.append(0.0 if np.isnan(auc) else auc)
    return float(np.mean(aucs))

def _to_model_params(trial_params, name_map):
    """Rename Optuna trial keys to the parameter names the model expects."""
    return {name_map.get(key, key): value for key, value in trial_params.items()}


# `best_trial.params` is keyed by the names given to trial.suggest_*
# ('lr', 'l1', ...), not by the model's own parameter names. Without the
# renaming below the tuned values would reach the final models as unknown
# parameters and be ignored.
LGB_PARAM_NAMES = {
    'lr': 'learning_rate',
    'l1': 'lambda_l1',
    'l2': 'lambda_l2',
}
XGB_PARAM_NAMES = {
    'lr': 'learning_rate',
    'min_child': 'min_child_weight',
    'col_bt': 'colsample_bytree',
    'l1': 'reg_alpha',
    'l2': 'reg_lambda',
}

print('Optuna LGB...')
study_lgb = optuna.create_study(direction='maximize',
                                sampler=optuna.samplers.TPESampler(seed=RS))
study_lgb.optimize(lgb_obj, n_trials=20)
BEST_LGB = _to_model_params(study_lgb.best_trial.params, LGB_PARAM_NAMES) | {
    'boosting_type':'gbdt','objective':'binary','device_type':'gpu','metric':'auc',
    'max_bin':127,'min_data_in_leaf':20,'force_row_wise':True,'verbosity':-1
}
print('LGB params', BEST_LGB)

print('Optuna XGB...')
study_xgb = optuna.create_study(direction='maximize',
                                sampler=optuna.samplers.TPESampler(seed=RS))
study_xgb.optimize(xgb_obj, n_trials=20)
BEST_XGB = _to_model_params(study_xgb.best_trial.params, XGB_PARAM_NAMES) | {
    'objective':'binary:logistic','tree_method':'gpu_hist',
    'eval_metric':'auc','n_estimators':800,'random_state':RS
}
print('XGB params', BEST_XGB)

#───────────────────────────────────────────────────────────────
# 3. OOF training: LGB, XGB, CatBoost, MLP
#───────────────────────────────────────────────────────────────
# Out-of-fold (OOF) training: every row is scored exactly once, by the models
# of the fold in which that row belongs to the validation split.
kf = StratifiedKFold(NFOLDS, shuffle=True, random_state=RS)
# One OOF probability vector per base model, aligned with y_full.
oof_lgb = np.zeros_like(y_full, dtype=float)
oof_xgb = np.zeros_like(y_full, dtype=float)
oof_cat = np.zeros_like(y_full, dtype=float)
oof_mlp = np.zeros_like(y_full, dtype=float)

for fold, (tr, va) in enumerate(kf.split(X_full_enc, y_full), 1):
    print(f'Fold {fold}/{NFOLDS}')
    # LightGBM: up to 1000 rounds, early stopping (patience 100) on the
    # validation split; predictions use the best iteration.
    dtr = lgb.Dataset(X_full_enc.iloc[tr], label=y_full[tr])
    dvl = lgb.Dataset(X_full_enc.iloc[va], label=y_full[va])
    m1 = lgb.train(
        BEST_LGB, dtr, num_boost_round=1000,
        valid_sets=[dvl], callbacks=[lgb.early_stopping(100)]
    )
    oof_lgb[va] = m1.predict(X_full_enc.iloc[va], num_iteration=m1.best_iteration)
    joblib.dump(m1, f'{STACK_DIR}/lgb_fold{fold}.pkl')

    # XGBoost: early stopping (patience 100) on the validation split.
    # NOTE(review): passing early_stopping_rounds to fit() is deprecated in
    # recent XGBoost releases — confirm against the installed version.
    m2 = xgb.XGBClassifier(**BEST_XGB)
    m2.fit(
        X_full_enc.iloc[tr], y_full[tr],
        eval_set=[(X_full_enc.iloc[va], y_full[va])],
        early_stopping_rounds=100, verbose=False
    )
    oof_xgb[va] = m2.predict_proba(X_full_enc.iloc[va])[:,1]
    joblib.dump(m2, f'{STACK_DIR}/xgb_fold{fold}.pkl')

    # CatBoost: fixed (untuned) hyper-parameters, trained on GPU with AUC as
    # the evaluation metric and early stopping on the validation split.
    m3 = cb.CatBoostClassifier(
        iterations=1000, learning_rate=0.05, depth=6,
        task_type='GPU', eval_metric='AUC',
        early_stopping_rounds=100, verbose=False
    )
    m3.fit(X_full_enc.iloc[tr], y_full[tr],
           eval_set=(X_full_enc.iloc[va], y_full[va]))
    oof_cat[va] = m3.predict_proba(X_full_enc.iloc[va])[:,1]
    joblib.dump(m3, f'{STACK_DIR}/cat_fold{fold}.pkl')

    # MLP: two hidden layers (128, 64), trained on the features as they are
    # (no scaling step is applied in this cell).
    m4 = MLPClassifier(
        hidden_layer_sizes=(128,64),
        learning_rate_init=1e-3,
        max_iter=200,
        random_state=RS,
        verbose=False
    )
    m4.fit(X_full_enc.iloc[tr], y_full[tr])
    oof_mlp[va] = m4.predict_proba(X_full_enc.iloc[va])[:,1]
    joblib.dump(m4, f'{STACK_DIR}/mlp_fold{fold}.pkl')

    # Release Python garbage and cached CUDA memory before the next fold.
    gc.collect()
    torch.cuda.empty_cache()

#───────────────────────────────────────────────────────────────
# 4. Blend & threshold 최적화
#───────────────────────────────────────────────────────────────
def _f1_at(scores, thr):
    """F1 of ``scores`` binarised at ``thr`` against the OOF labels."""
    return f1_score(y_full, (scores >= thr).astype(int))


# Equal-weight average of the four OOF probability vectors.
blend = (oof_lgb + oof_xgb + oof_cat + oof_mlp) / 4

# Choose the scanned threshold with the highest blended F1 (first one on ties).
thr_scores = [_f1_at(blend, t) for t in THR_SCAN]
best_thr = THR_SCAN[int(np.argmax(thr_scores))]

# Persist the OOF vectors together with the chosen threshold.
np.savez(
    f'{STACK_DIR}/oof_meta.npz',
    oof_lgb=oof_lgb,
    oof_xgb=oof_xgb,
    oof_cat=oof_cat,
    oof_mlp=oof_mlp,
    best_thr=best_thr
)

# F1 of each base model and of the blend, all at the blend's best threshold.
f1_sources = [
    ('LGB', oof_lgb),
    ('XGB', oof_xgb),
    ('CAT', oof_cat),
    ('MLP', oof_mlp),
    ('blend', blend),
]
report_parts = [f"best_thr = {best_thr:.4f}"]
report_parts += [f"F1({tag}) = {_f1_at(vec, best_thr):.4f}" for tag, vec in f1_sources]
print("\n📝 " + " | ".join(report_parts))

#───────────────────────────────────────────────────────────────
# 5. OOF performance report (Acc, Prec, Recall, F1, AUC)
#───────────────────────────────────────────────────────────────
oof_pred = (blend >= best_thr).astype(int)
oof_metrics = [
    ('Acc', accuracy_score(y_full, oof_pred)),
    ('Prec', precision_score(y_full, oof_pred, zero_division=0)),
    ('Recall', recall_score(y_full, oof_pred)),
    ('F1', f1_score(y_full, oof_pred)),
    ('AUC', roc_auc_score(y_full, blend)),
]
print("\n✅ OOF Performance → " + ", ".join(f"{tag}={val:.4f}" for tag, val in oof_metrics))

print('✅ artifacts saved →', STACK_DIR)

In [None]:
# ===============================================================
#  @title 4. full-inference using 3-fold models (LGB, XGB, CatBoost, MLP)
# ===============================================================
import gc, joblib, numpy as np, pandas as pd, warnings
from pathlib import Path
from sklearn.metrics import (
    accuracy_score, precision_score, recall_score,
    f1_score, roc_auc_score
)
warnings.filterwarnings('ignore')

#──────────────────── ❶ 경로 & 메타 로드 ─────────────────────────
# Artifact folder (models + OOF meta) and engineered-feature folder.
STACK_DIR = '/content/drive/MyDrive/instaCart_data/stack2'
FEATURE_DIR = '/content/drive/MyDrive/instaCart_data/feature2'
OUT_DIR = STACK_DIR
Path(OUT_DIR).mkdir(parents=True, exist_ok=True)

# — OOF meta: per-model OOF probabilities and the tuned threshold.
meta = np.load(f'{STACK_DIR}/oof_meta.npz', allow_pickle=True)
oof_lgb, oof_xgb, oof_cat, oof_mlp = (
    meta[key] for key in ('oof_lgb', 'oof_xgb', 'oof_cat', 'oof_mlp')
)
best_thr = float(meta['best_thr'])
print('ℹ️ best_thr =', best_thr)

#──────────────────── ❷ OOF performance check ───────────────────
# Reload the training labels and score the equal-weight blend.
y_tr = pd.read_csv(f'{FEATURE_DIR}/y_train.csv')['reordered'].astype('uint8').values
blend_oof = (oof_lgb + oof_xgb + oof_cat + oof_mlp) / 4
pred_oof = (blend_oof >= best_thr).astype('uint8')
oof_report = [
    ('Acc', accuracy_score(y_tr, pred_oof)),
    ('Prec', precision_score(y_tr, pred_oof, zero_division=0)),
    ('Recall', recall_score(y_tr, pred_oof)),
    ('F1', f1_score(y_tr, pred_oof)),
    ('AUC', roc_auc_score(y_tr, blend_oof)),
]
print("OOF → " + " | ".join(f"{tag}={val:.4f}" for tag, val in oof_report))

#──────────────────── ❸ 테스트 피처 불러오기 ───────────────────────
id_cols = ['order_id', 'user_id', 'product_id']
cat_cols = ['order_dow', 'order_hour_of_day']

# (1) dtype map from the training header: float32 for every column that is
#     neither an ID nor categorical, uint8 for the categorical ones.
sample = pd.read_csv(f'{FEATURE_DIR}/X_train.csv', nrows=0)
excluded = id_cols + cat_cols
dtypes = {}
for name in sample.columns:
    if name not in excluded:
        dtypes[name] = 'float32'
for name in cat_cols:
    dtypes[name] = 'uint8'

# (2) Load the test feature CSV.
X_test = pd.read_csv(
    f'{FEATURE_DIR}/X_test.csv',
    dtype=dtypes,
    engine='pyarrow',
    na_values=['', 'NA', 'nan'],
)

# (3) Force ID columns to int32; unparsable or infinite values become -1.
for col in id_cols:
    numeric = pd.to_numeric(X_test[col], errors='coerce')
    numeric = numeric.replace([np.inf, -np.inf], np.nan)
    X_test[col] = numeric.fillna(-1).astype('int32')

# (4) Model inputs drop the IDs; the IDs are kept aside for the submission.
X_test_enc = X_test.drop(columns=id_cols)
cand = X_test[id_cols].copy()

print('✅ feature shapes →', X_test_enc.shape)

# ──────────────────── ❹ Fold별 모델 불러와 예측 ──────────────────
# Number of fold models to average. Follows NFOLDS when the training cell ran
# in the same kernel; otherwise falls back to 3, the value the models were
# saved with. Keeping it in one place avoids the loop and the divisor drifting
# apart.
N_FOLDS = globals().get('NFOLDS', 3)

n = X_test_enc.shape[0]
preds_lgb = np.zeros(n)
preds_xgb = np.zeros(n)
preds_cat = np.zeros(n)
preds_mlp = np.zeros(n)

for fold in range(1, N_FOLDS + 1):
    # LightGBM: saved object is a Booster, so predict() is used (no predict_proba).
    m_lgb = joblib.load(f'{STACK_DIR}/lgb_fold{fold}.pkl')
    preds_lgb += m_lgb.predict(
        X_test_enc,
        num_iteration=m_lgb.best_iteration
    )
    # XGBoost: XGBClassifier, positive-class probability.
    m_xgb = joblib.load(f'{STACK_DIR}/xgb_fold{fold}.pkl')
    preds_xgb += m_xgb.predict_proba(X_test_enc)[:,1]
    # CatBoostClassifier
    m_cat = joblib.load(f'{STACK_DIR}/cat_fold{fold}.pkl')
    preds_cat += m_cat.predict_proba(X_test_enc)[:,1]
    # MLPClassifier
    m_mlp = joblib.load(f'{STACK_DIR}/mlp_fold{fold}.pkl')
    preds_mlp += m_mlp.predict_proba(X_test_enc)[:,1]

# Average over folds.
preds_lgb /= N_FOLDS
preds_xgb /= N_FOLDS
preds_cat /= N_FOLDS
preds_mlp /= N_FOLDS

#──────────────────── ❺ Blend the models ─────────────────
# Equal-weight blend; the threshold is applied when the submission is built.
pred_test = (preds_lgb + preds_xgb + preds_cat + preds_mlp) / 4
cand['pred'] = pred_test

#──────────────────── ❻ 제출파일 생성 ──────────────────────────
# One submission row per order: space-separated product ids whose blended
# score reaches best_thr, or the literal 'None' when nothing qualifies.
# The per-order list is named `picked` rather than `prods`, so it does not
# overwrite the `prods` product-metadata table when cells share one kernel.
rows = []
for oid, grp in cand.groupby('order_id'):
    picked = grp.loc[grp.pred >= best_thr, 'product_id'].astype(str).tolist()
    rows.append({
        'order_id': oid,
        'products': ' '.join(picked) if picked else 'None'
    })
# Explicit columns keep the CSV header intact even when there are no rows.
submission = pd.DataFrame(rows, columns=['order_id', 'products'])
submission.to_csv(f'{OUT_DIR}/submission_blend.csv', index=False)
print('📤 submission saved →', f'{OUT_DIR}/submission_blend.csv')

In [None]:
import pandas as pd
import numpy as np
from sklearn.preprocessing import MultiLabelBinarizer
from sklearn.metrics import f1_score

# ──────────────────────────────────────────
# 1. 제출 CSV 로드
# ──────────────────────────────────────────
# Read everything as text and keep empty cells as '' instead of NaN.
sub = pd.read_csv(
    '/content/drive/MyDrive/instaCart_data/stack2/submission_blend.csv',
    dtype=str,
    keep_default_na=False,
)

# Normalise header names, then make sure both required columns exist.
sub.columns = sub.columns.str.strip().str.lower()
missing_cols = {'order_id', 'products'} - set(sub.columns)
if missing_cols:
    raise ValueError("제출 파일에 'order_id'와 'products' 컬럼이 필요합니다.")

sub['order_id'] = sub['order_id'].str.strip()
def split_prod(cell: str) -> list[str]:
    """Split a submission cell into product-id tokens.

    Empty or missing cells and the literal 'none' (any case, surrounding
    whitespace ignored) yield an empty list; anything else is split on
    whitespace.
    """
    text = cell.strip() if cell else ''
    if text.lower() in ('none', ''):
        return []
    return text.split()
# Parse each products cell into a list of ids, then keep the first row per order.
sub['products_pred'] = sub['products'].map(split_prod)
sub = sub.drop_duplicates(subset=['order_id'])
sub = sub[['order_id', 'products_pred']]
print(f"제출 주문 수: {len(sub):,}")

# ──────────────────────────────────────────
# 2. GT 생성: order_products__train.csv 에서 마지막 주문 GT
# ──────────────────────────────────────────
train_lbl = pd.read_csv(
    '/content/drive/MyDrive/instaCart_data/order_products__train.csv',
    dtype={'order_id':str,'product_id':str}
)

# Ground truth = reordered products of every train order. Orders without any
# reordered product are kept with an empty list (they are scored as 'None'
# below) instead of being dropped from the evaluation.
reordered_lists = (
    train_lbl[train_lbl['reordered']==1]
      .groupby('order_id')['product_id']
      .apply(list)
)
all_orders = pd.Index(train_lbl['order_id'].unique(), name='order_id')
gt = (
    reordered_lists.reindex(all_orders)
      .apply(lambda items: items if isinstance(items, list) else [])
      .rename('products_gt')
      .reset_index()
)
print(f"GT 주문 수: {len(gt):,}")

# ──────────────────────────────────────────
# 3. Evaluate only order_ids present on both sides
# ──────────────────────────────────────────
merged = pd.merge(gt, sub, on='order_id', how='inner')
print(f"공통 주문 수: {len(merged):,}")
if merged.empty:
    raise ValueError("GT와 제출 파일의 order_id가 하나도 일치하지 않습니다.")

# ──────────────────────────────────────────
# 4. Product vocabulary (reported for information only)
# ──────────────────────────────────────────
vocab = sorted({
    pid for lst in merged['products_gt']   for pid in lst
} | {
    pid for lst in merged['products_pred'] for pid in lst
})
print(f"상품 vocabulary 크기: {len(vocab):,}")

# ──────────────────────────────────────────
# 5–6. Per-order F1 → mean F1
# ──────────────────────────────────────────
def _order_f1(gt_items, pred_items):
    """F1 between two product lists; an empty list counts as the single label 'None'."""
    truth = set(gt_items) or {'None'}
    guess = set(pred_items) or {'None'}
    hits = len(truth & guess)
    # Both sets are non-empty, so the denominator is never zero.
    return 2.0 * hits / (len(truth) + len(guess))

# Set arithmetic per order replaces a dense orders × products indicator
# matrix, which would need memory proportional to both dimensions.
kaggle_f1 = float(np.mean([
    _order_f1(gt_items, pred_items)
    for gt_items, pred_items in zip(merged['products_gt'], merged['products_pred'])
]))
print(f"\n🎯 Kaggle-style macro F1-score: {kaggle_f1:.6f}")