In [1]:
import os
import gc
import math
import random
import warnings
from pathlib import Path
warnings.filterwarnings("ignore")

import numpy as np
import pandas as pd
from tqdm import tqdm

from sklearn.model_selection import StratifiedShuffleSplit
from sklearn.preprocessing import LabelEncoder
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import average_precision_score

import torch
import torch.nn as nn
from torch.utils.data import Dataset, DataLoader


In [2]:

# ----------------------
# Config
# ----------------------
CFG = dict(
    # --- Input / output locations ---
    TRAIN_PATH='./train.parquet',
    TEST_PATH='./test.parquet',
    SUBMIT_PATH='./toss_v5_nn_submit.csv',
    # Submissions from the earlier v4 models; picked up only when the files exist.
    V4_XGB_SUB='./toss_xgb_v3_submit.csv',
    V4_LGB_SUB='./toss_lgbm_v2_submit.csv',

    # --- Column roles ---
    # Columns routed through embeddings. Any column that is not listed here and
    # is not the target / sequence / ID column is treated as numeric.
    CAT_COLS=['gender', 'age_group', 'inventory_id', 'l_feat_14', 'hour', 'day_of_week'],
    SEQ_COL='seq',
    TARGET_COL='clicked',
    ID_COL='ID',

    # --- Optimisation ---
    MAX_SEQ_LEN=180,           # sequences longer than this are cut when batching
    BATCH_SIZE=1024,
    EPOCHS=7,
    LR=1e-3,
    WEIGHT_DECAY=1e-5,
    SEED=42,

    # --- Architecture ---
    EMB_DIM=24,                # width of every categorical embedding
    RNN_TYPE='gru',            # 'gru' or 'lstm'
    RNN_HIDDEN=96,
    RNN_LAYERS=2,
    RNN_BIDIR=True,
    CROSS_LAYERS=2,
    MLP_UNITS=[512, 256, 128],
    DROPOUTS=[0.10, 0.15, 0.20],

    # --- Validation / loading ---
    VAL_SIZE=0.10,             # fraction of train held out for validation
    NUM_WORKERS=2,             # DataLoader worker processes
    PIN_MEMORY=True,
)

In [3]:

# ----------------------
# Utils
# ----------------------

def seed_everything(seed: int = 42):
    """Seed the Python, NumPy and torch (CPU + CUDA) RNGs with ``seed``.

    Also exports ``PYTHONHASHSEED`` and sets the cuDNN flags
    (``deterministic`` on, ``benchmark`` off).
    """
    os.environ['PYTHONHASHSEED'] = str(seed)
    # Each of these takes the seed as its only argument and is independent of the others.
    for apply_seed in (random.seed, np.random.seed, torch.manual_seed, torch.cuda.manual_seed_all):
        apply_seed(seed)
    cudnn = torch.backends.cudnn
    cudnn.deterministic = True
    cudnn.benchmark = False

# Fix every RNG before anything stochastic runs.
seed_everything(CFG['SEED'])

# Use the GPU when torch can see one; otherwise everything stays on the CPU.
if torch.cuda.is_available():
    device = torch.device('cuda')
else:
    device = torch.device('cpu')
print(f"[info] Device: {device}")

# Competition metrics

def weighted_logloss(y_true, y_pred, eps=1e-15):
    """Class-balanced binary log loss.

    The mean negative log-likelihood is computed separately over the
    negatives (``y_true == 0``) and the positives (``y_true == 1``) and the
    two means are averaged with equal weight. A class with no samples
    contributes 0.

    Args:
        y_true: array-like of 0/1 labels (list, ndarray, Series, ...).
        y_pred: array-like of predicted probabilities, same length as ``y_true``.
        eps: predictions are clipped into ``[eps, 1 - eps]`` before the log.

    Returns:
        The balanced log loss as a float.
    """
    # Coerce up front so plain lists work, and flatten so (N, 1) and (N,) inputs line up.
    y_true = np.asarray(y_true).ravel()
    # Clip in float64: in float32 the bound 1 - 1e-15 rounds to exactly 1.0, so a
    # saturated prediction on a negative would hit log(0) and make the loss infinite.
    y_pred = np.clip(np.asarray(y_pred, dtype=np.float64).ravel(), eps, 1 - eps)
    mask0 = (y_true == 0)
    mask1 = (y_true == 1)
    ll0 = -np.mean(np.log(1 - y_pred[mask0])) if mask0.any() else 0.0
    ll1 = -np.mean(np.log(y_pred[mask1])) if mask1.any() else 0.0
    return 0.5 * ll0 + 0.5 * ll1


def comp_score(y_true, y_pred):
    """Blend average precision with the balanced log loss into one score.

    Returns:
        A ``(score, ap, wll)`` tuple where ``ap`` is the average precision,
        ``wll`` is ``weighted_logloss`` and
        ``score = 0.5 * ap + 0.5 * (1 / (1 + wll))``.
    """
    avg_precision = average_precision_score(y_true, y_pred)
    balanced_ll = weighted_logloss(y_true, y_pred)
    return 0.5 * avg_precision + 0.5 * (1.0 / (1.0 + balanced_ll)), avg_precision, balanced_ll


[info] Device: cpu


In [4]:
# ----------------------
# Data loading
# ----------------------
# Read the train and test tables (in that order) from the configured parquet paths.
train, _test = (pd.read_parquet(CFG[path_key]) for path_key in ('TRAIN_PATH', 'TEST_PATH'))
print(f"[info] Train shape={train.shape}, Test shape={_test.shape}")
# Stop immediately if the label column is absent from the training table.
assert CFG['TARGET_COL'] in train.columns, f"Missing target {CFG['TARGET_COL']}"


[info] Train shape=(10704179, 119), Test shape=(1527298, 119)


In [5]:
# Remember the test IDs for the submission; synthesise them when the column is absent.
if CFG['ID_COL'] not in _test.columns:
    test_id = pd.Series([f"TEST_{i:07d}" for i in range(len(_test))], name='ID')
else:
    test_id = _test[CFG['ID_COL']].astype(str).copy()

# Column roles: anything that is not target / sequence / ID is a feature, and
# any feature not declared categorical in the config is treated as numeric.
EXCLUDE = {CFG['TARGET_COL'], CFG['SEQ_COL'], CFG['ID_COL']}
cat_cols = CFG['CAT_COLS']
feature_cols = [col for col in train.columns if col not in EXCLUDE]
num_cols = [col for col in feature_cols if col not in cat_cols]
print(f"[info] Numeric={len(num_cols)} | Categorical={len(cat_cols)}")


[info] Numeric=111 | Categorical=6


In [6]:
# Optional: stack v4 meta predictions if present.
# NOTE(review): these columns are attached after `num_cols` has already been
# computed, and the loaders are later built from `num_cols`, so as written the
# model never reads them. Train also only ever receives a constant placeholder
# for them. Confirm the intent before wiring them into the feature list.
meta_cols = []
for path, name in [(CFG['V4_XGB_SUB'], 'pred_xgb'), (CFG['V4_LGB_SUB'], 'pred_lgb')]:
    if not Path(path).exists():
        continue
    sub = pd.read_csv(path)
    # array_equal is False on a length mismatch, whereas an element-wise `==`
    # on arrays of different lengths does not give a per-row answer.
    assert np.array_equal(sub['ID'].astype(str).values, test_id.astype(str).values), \
        "ID mismatch between v5 test and v4 submission"
    _test[name] = sub['clicked'].astype('float32').values
    # No out-of-fold predictions exist for train; leave NaN here and fill after the split.
    train[name] = np.nan
    meta_cols.append(name)
    print(f"[info] meta feature added: {name}")

[info] meta feature added: pred_xgb
[info] meta feature added: pred_lgb


In [7]:
def _as_category_strings(series):
    """Return ``series`` as strings, with missing entries replaced by 'UNK'.

    The replacement has to happen before the string cast: once NaN has been
    stringified to 'nan' there is nothing left for ``fillna`` to fill.
    """
    return series.astype(object).where(series.notna(), 'UNK').astype(str)


# Label encode categoricals on train+test jointly (labels only, no target involved).
encoders = {}
for col in cat_cols:
    # Stringify each frame once and reuse it for both fit and transform.
    train_str = _as_category_strings(train[col])
    test_str = _as_category_strings(_test[col])
    le = LabelEncoder()
    le.fit(pd.concat([train_str, test_str], axis=0))
    train[col] = le.transform(train_str)
    _test[col] = le.transform(test_str)
    encoders[col] = le
    print(f"[enc] {col}: {len(le.classes_)} classes")

# Basic fills / types.
# Fill first: a column that is integer in train but float-with-NaN in test
# cannot be cast to an integer dtype until its gaps are filled.
train[num_cols] = train[num_cols].fillna(0)
_test[num_cols] = _test[num_cols].fillna(0)

_INT32 = np.iinfo(np.int32)
for c in num_cols:
    src_dtype = train[c].dtype
    if src_dtype == 'float64':
        dst_dtype = 'float32'
    elif str(src_dtype).startswith('int'):
        lo = min(train[c].min(), _test[c].min())
        hi = max(train[c].max(), _test[c].max())
        # Values outside the int32 range would wrap silently; keep the wider dtype instead.
        if lo < _INT32.min or hi > _INT32.max:
            continue
        dst_dtype = 'int32'
    else:
        continue
    train[c] = train[c].astype(dst_dtype)
    _test[c] = _test[c].astype(dst_dtype)


[enc] gender: 3 classes
[enc] age_group: 9 classes
[enc] inventory_id: 18 classes
[enc] l_feat_14: 3286 classes
[enc] hour: 24 classes
[enc] day_of_week: 7 classes


In [8]:
# ----------------------
# Validation split
# ----------------------
# One stratified hold-out split, seeded from the config.
sss = StratifiedShuffleSplit(n_splits=1, test_size=CFG['VAL_SIZE'], random_state=CFG['SEED'])
idx_train, idx_val = next(sss.split(train, train[CFG['TARGET_COL']]))
tr_df, va_df = (train.iloc[rows].reset_index(drop=True) for rows in (idx_train, idx_val))
print(f"[split] train={len(tr_df):,} | valid={len(va_df):,}")

# Placeholder back-fill for the meta columns: both frames receive the mean of
# the validation column, or 0.5 when that column is entirely missing.
# Real out-of-fold stacking would be better.
for m in meta_cols:
    # Computed once, before either frame is modified.
    meta_fill = 0.5 if va_df[m].isna().all() else va_df[m].mean()
    tr_df[m] = tr_df[m].fillna(meta_fill).astype('float32')
    va_df[m] = va_df[m].fillna(meta_fill).astype('float32')


[split] train=9,633,761 | valid=1,070,418


In [9]:
# ----------------------
# Dataset / Dataloader
# ----------------------
class ClickDataset(Dataset):
    """Row-wise dataset yielding numeric, categorical and sequence tensors.

    Each item is ``(num_x, cat_x, seq)`` plus ``y`` when ``has_target`` is
    true. ``seq`` is a variable-length 1-D float32 tensor parsed from a
    comma-separated string; padding and truncation are left to the collate
    function (``max_seq_len`` is stored as ``self.max_len`` but not applied here).

    Args:
        df: source frame; its index is reset internally.
        num_cols: names of the numeric feature columns (may be empty).
        cat_cols: names of the integer-encoded categorical columns (may be empty).
        seq_col: name of the sequence column; if absent every row gets ``'0'``.
        target_col: name of the label column, required when ``has_target``.
        has_target: whether items carry a label.
        max_seq_len: stored for reference only.
    """

    def __init__(self, df, num_cols, cat_cols, seq_col, target_col=None, has_target=True, max_seq_len=180):
        self.df = df.reset_index(drop=True)
        self.num_cols = num_cols
        self.cat_cols = cat_cols
        self.seq_col  = seq_col
        self.target_col = target_col
        self.has_target = has_target
        self.max_len = max_seq_len

        self.num_X = self.df[self.num_cols].astype(np.float32).values if len(self.num_cols) else None
        self.cat_X = self.df[self.cat_cols].astype(np.int64).values if len(self.cat_cols) else None
        if self.seq_col in self.df.columns:
            seq_series = self.df[self.seq_col]
            # Record missing rows before stringifying: afterwards they read 'nan'/'None'
            # and can no longer be told apart from real text.
            self.seq_missing = seq_series.isna().values
            self.seq_strings = seq_series.astype(str).values
        else:
            self.seq_missing = np.zeros(len(self.df), dtype=bool)
            self.seq_strings = np.array(['0']*len(self.df))
        if self.has_target:
            self.y = self.df[self.target_col].astype(np.float32).values

    def __len__(self):
        return len(self.df)

    def _parse_seq(self, idx):
        """Parse row ``idx``'s sequence into a non-empty, finite float32 array."""
        if self.seq_missing[idx]:
            return np.zeros(1, dtype=np.float32)
        arr = np.fromstring(self.seq_strings[idx], sep=',', dtype=np.float32)
        if arr.size == 0:
            return np.zeros(1, dtype=np.float32)
        # A literal 'nan'/'inf' token would otherwise flow into the RNN and the loss.
        arr[~np.isfinite(arr)] = 0.0
        return arr

    def __getitem__(self, idx):
        num_x = torch.tensor(self.num_X[idx], dtype=torch.float32) if self.num_X is not None else torch.empty(0)
        cat_x = torch.tensor(self.cat_X[idx], dtype=torch.long) if self.cat_X is not None else torch.empty(0, dtype=torch.long)

        # seq parse (comma-separated floats); truncate/pad is handled in collate
        seq = torch.from_numpy(self._parse_seq(idx))

        if self.has_target:
            y = torch.tensor(self.y[idx], dtype=torch.float32)
            return num_x, cat_x, seq, y
        else:
            return num_x, cat_x, seq


def _collate_features(num_x, cat_x, seqs):
    """Batch the per-sample feature tensors shared by training and inference.

    Sequences are cut to ``CFG['MAX_SEQ_LEN']`` and zero-padded to the longest
    sequence in the batch.

    Returns:
        ``(num_x, cat_x, seqs_padded, lens)`` where ``seqs_padded`` has shape
        (B, T, 1) and ``lens`` holds each sequence's length (at least 1).
    """
    batch_size = len(seqs)
    num_x = torch.stack(num_x) if num_x[0].numel() else torch.zeros((batch_size, 0))
    cat_x = torch.stack(cat_x) if cat_x[0].numel() else torch.zeros((batch_size, 0), dtype=torch.long)
    max_len = CFG['MAX_SEQ_LEN']
    # Give empty sequences one zero step so the padded tensor always has T >= 1,
    # matching the minimum length of 1 reported in `lens`.
    seqs = [s[:max_len] if s.numel() else s.new_zeros(1) for s in seqs]
    lens = torch.tensor([len(s) for s in seqs], dtype=torch.long).clamp(min=1)
    seqs_padded = nn.utils.rnn.pad_sequence(seqs, batch_first=True, padding_value=0.0)
    return num_x, cat_x, seqs_padded.unsqueeze(-1), lens  # (B, T, 1)


def collate_fn_train(batch):
    """Collate ``(num_x, cat_x, seq, y)`` samples into a training batch.

    Returns:
        ``(num_x, cat_x, seqs_padded, lens, ys)``.
    """
    num_x, cat_x, seqs, ys = zip(*batch)
    num_x, cat_x, seqs_padded, lens = _collate_features(num_x, cat_x, seqs)
    return num_x, cat_x, seqs_padded, lens, torch.stack(ys)


def collate_fn_infer(batch):
    """Collate ``(num_x, cat_x, seq)`` samples into an inference batch.

    Returns:
        ``(num_x, cat_x, seqs_padded, lens)``.
    """
    num_x, cat_x, seqs = zip(*batch)
    return _collate_features(num_x, cat_x, seqs)


In [10]:
# ----------------------
# Model
# ----------------------
class CrossNetwork(nn.Module):
    """Stack of cross layers: each step computes ``x <- x + x0 * linear(x)``.

    Every layer is an ``nn.Linear(in_dim, 1)``, so ``linear(x)`` is one scalar
    per row that scales the original input ``x0`` before it is added back.
    """

    def __init__(self, in_dim: int, num_layers: int = 2):
        super().__init__()
        projections = []
        for _ in range(num_layers):
            projections.append(nn.Linear(in_dim, 1))
        self.layers = nn.ModuleList(projections)

    def forward(self, x0):
        """Apply every cross layer in order and return the final tensor."""
        crossed = x0
        for projection in self.layers:
            gate = projection(crossed)        # one scalar per row
            crossed = crossed + x0 * gate
        return crossed


class WideDeepCTR(nn.Module):
    """CTR model fusing numeric, categorical and sequence inputs.

    Numeric features go through batch norm, each categorical column through its
    own embedding table, and the sequence through a GRU/LSTM whose final hidden
    state is kept. The three pieces are concatenated, passed through a
    ``CrossNetwork`` and then an MLP that emits one logit per row.
    """

    def __init__(self, num_features: int, cat_cardinalities: list, emb_dim: int = 24,
                 rnn_type: str = 'gru', rnn_hidden: int = 96, rnn_layers: int = 2, rnn_bidir: bool = True,
                 cross_layers: int = 2, mlp_units=(512,256,128), dropouts=(0.1,0.15,0.2)):
        super().__init__()
        self.num_features = num_features
        self.cat_card = cat_cardinalities

        # One embedding table per categorical column, all of width emb_dim.
        tables = []
        for cardinality in self.cat_card:
            tables.append(nn.Embedding(cardinality, emb_dim))
        self.emb_layers = nn.ModuleList(tables)

        # Batch norm over the numeric features (skipped when there are none).
        if num_features > 0:
            self.bn_num = nn.BatchNorm1d(num_features)
        else:
            self.bn_num = None

        # Sequence encoder: anything other than 'gru' selects an LSTM.
        if rnn_type.lower() == 'gru':
            recurrent_cls = nn.GRU
        else:
            recurrent_cls = nn.LSTM
        self.rnn = recurrent_cls(input_size=1, hidden_size=rnn_hidden, num_layers=rnn_layers,
                                 batch_first=True, bidirectional=rnn_bidir)

        # Width of the concatenated feature vector.
        directions = 2 if rnn_bidir else 1
        fused_in = max(num_features, 0) + emb_dim * len(self.cat_card) + rnn_hidden * directions

        self.cross = CrossNetwork(fused_in, num_layers=cross_layers)

        # MLP head: Linear -> ReLU -> Dropout per hidden width, then a 1-unit output.
        # When there are more hidden layers than dropout rates the last rate is reused.
        widths = [fused_in, *mlp_units]
        stack = []
        for depth, (w_in, w_out) in enumerate(zip(widths[:-1], widths[1:])):
            drop_p = dropouts[min(depth, len(dropouts) - 1)]
            stack.extend([nn.Linear(w_in, w_out), nn.ReLU(), nn.Dropout(drop_p)])
        stack.append(nn.Linear(widths[-1], 1))
        self.mlp = nn.Sequential(*stack)

    def forward(self, num_x, cat_x, seqs, lens):
        """Return one logit per row.

        Args:
            num_x: numeric features, one row per sample.
            cat_x: integer category codes, column ``i`` feeding embedding ``i``.
            seqs: padded sequences, batch first, with a trailing feature dim of 1.
            lens: true length of each sequence (moved to CPU for packing).
        """
        features = []
        if self.num_features > 0:
            features.append(num_x if self.bn_num is None else self.bn_num(num_x))
        if len(self.cat_card) > 0:
            looked_up = []
            for col_idx, table in enumerate(self.emb_layers):
                looked_up.append(table(cat_x[:, col_idx]))
            features.append(torch.cat(looked_up, dim=1))

        # Pack so the recurrent net stops at each sequence's true length.
        packed = nn.utils.rnn.pack_padded_sequence(seqs, lens.cpu(), batch_first=True, enforce_sorted=False)
        _, state = self.rnn(packed)
        # An LSTM returns (h_n, c_n); a GRU returns h_n directly.
        h_n = state[0] if isinstance(self.rnn, nn.LSTM) else state
        if self.rnn.bidirectional:
            # The last two entries are the top layer's forward and backward states.
            seq_repr = torch.cat((h_n[-2], h_n[-1]), dim=1)
        else:
            seq_repr = h_n[-1]
        features.append(seq_repr)

        fused = torch.cat(features, dim=1)
        return self.mlp(self.cross(fused)).squeeze(1)


In [11]:

# ----------------------
# Build loaders
# ----------------------
ALL_NUM = num_cols.copy()
ALL_CAT = cat_cols.copy()

train_ds = ClickDataset(tr_df, ALL_NUM, ALL_CAT, CFG['SEQ_COL'], CFG['TARGET_COL'], True, CFG['MAX_SEQ_LEN'])
valid_ds = ClickDataset(va_df, ALL_NUM, ALL_CAT, CFG['SEQ_COL'], CFG['TARGET_COL'], True, CFG['MAX_SEQ_LEN'])

# Page-locked host memory only matters for host-to-GPU copies, so it is
# requested only when the run is actually on CUDA.
_loader_kwargs = dict(batch_size=CFG['BATCH_SIZE'], num_workers=CFG['NUM_WORKERS'],
                      pin_memory=bool(CFG['PIN_MEMORY']) and device.type == 'cuda',
                      collate_fn=collate_fn_train)
train_loader = DataLoader(train_ds, shuffle=True, **_loader_kwargs)
valid_loader = DataLoader(valid_ds, shuffle=False, **_loader_kwargs)

# ----------------------
# Model / Optimizer / Loss
# ----------------------
# Size each embedding from its encoder, which was fit on train+test together.
# Sizing from the train max alone would leave any test-only category pointing
# past the end of the table at inference time.
cat_cardinalities = [len(encoders[c].classes_) for c in ALL_CAT]
model = WideDeepCTR(num_features=len(ALL_NUM), cat_cardinalities=cat_cardinalities,
                    emb_dim=CFG['EMB_DIM'], rnn_type=CFG['RNN_TYPE'], rnn_hidden=CFG['RNN_HIDDEN'],
                    rnn_layers=CFG['RNN_LAYERS'], rnn_bidir=CFG['RNN_BIDIR'], cross_layers=CFG['CROSS_LAYERS'],
                    mlp_units=CFG['MLP_UNITS'], dropouts=CFG['DROPOUTS']).to(device)

# pos_weight = negatives / positives, so both classes carry equal total weight in the loss.
pos_ratio = tr_df[CFG['TARGET_COL']].mean()
pos = tr_df[CFG['TARGET_COL']].sum()
neg = len(tr_df) - pos
pos_weight_val = (neg / max(pos, 1)) if pos > 0 else 1.0
criterion = nn.BCEWithLogitsLoss(pos_weight=torch.tensor([pos_weight_val], dtype=torch.float, device=device))
optimizer = torch.optim.AdamW(model.parameters(), lr=CFG['LR'], weight_decay=CFG['WEIGHT_DECAY'])
scheduler = torch.optim.lr_scheduler.CosineAnnealingWarmRestarts(optimizer, T_0=2, T_mult=2)

print(f"[info] pos_ratio={pos_ratio:.6f}, pos_weight={pos_weight_val:.2f}")


[info] pos_ratio=0.019075, pos_weight=51.43


In [None]:
# ----------------------
# Train
# ----------------------
BEST = {'score': -1, 'epoch': -1}
logits_val_cache = None
best_state = None                      # CPU copy of the best epoch's weights
steps_per_epoch = len(train_loader)

for epoch in range(1, CFG['EPOCHS']+1):
    model.train()
    running_loss = 0.0
    for step, (num_x, cat_x, seqs, lens, ys) in enumerate(tqdm(train_loader, desc=f"[Train {epoch}]")):
        num_x, cat_x, seqs, lens, ys = num_x.to(device), cat_x.to(device), seqs.to(device), lens.to(device), ys.to(device)
        optimizer.zero_grad()
        logits = model(num_x, cat_x, seqs, lens)
        loss = criterion(logits, ys)
        loss.backward()
        optimizer.step()
        # The warm-restart scheduler takes a fractional epoch: completed epochs
        # plus progress through the current one. `epoch` is 1-based here.
        scheduler.step(epoch - 1 + step / steps_per_epoch)
        running_loss += loss.item() * ys.size(0)
    train_loss = running_loss / len(train_ds)

    # validate
    model.eval()
    val_logit_chunks, val_target_chunks = [], []
    with torch.no_grad():
        for num_x, cat_x, seqs, lens, ys in tqdm(valid_loader, desc=f"[Valid {epoch}]"):
            num_x, cat_x, seqs, lens = num_x.to(device), cat_x.to(device), seqs.to(device), lens.to(device)
            logits = model(num_x, cat_x, seqs, lens)
            val_logit_chunks.append(logits.detach().cpu())
            val_target_chunks.append(ys)
    val_logits = torch.cat(val_logit_chunks).numpy()
    val_targets = torch.cat(val_target_chunks).numpy()
    # Sigmoid in float64: avoids exp() overflow and keeps saturated predictions
    # strictly inside (0, 1) for the log-loss metric.
    val_probs = torch.sigmoid(torch.from_numpy(val_logits).double()).numpy()

    score, ap, wll = comp_score(val_targets, val_probs)
    print(f"[Epoch {epoch}] train_loss={train_loss:.5f} | score={score:.6f} | AP={ap:.6f} | WLL={wll:.6f}")

    if score > BEST['score']:
        BEST.update({'score': score, 'epoch': epoch})
        logits_val_cache = val_logits.copy()
        # Snapshot the weights that produced the cached logits, so calibration
        # and inference both refer to the same model.
        best_state = {k: v.detach().cpu().clone() for k, v in model.state_dict().items()}
    gc.collect()

# Roll back to the best epoch; otherwise inference would run on last-epoch
# weights while the calibrator is fit on best-epoch logits.
if best_state is not None:
    model.load_state_dict(best_state)

print(f"[best] epoch={BEST['epoch']} | score={BEST['score']:.6f}")


[Train 1]:   4%|████▊                                                                                                             | 392/9408 [5:55:58<138:05:02, 55.14s/it]

In [None]:
# ----------------------
# Platt scaling (calibration) on validation
# ----------------------
# Fit a one-feature logistic regression (Platt scaling) from the cached
# validation logits to the validation labels. A failure is reported and the
# run continues uncalibrated.
calibrated = False
if logits_val_cache is not None:
    try:
        lr_cal = LogisticRegression(max_iter=1000)
        lr_cal.fit(logits_val_cache.reshape(-1, 1), (val_targets > 0.5).astype(int))
    except Exception as err:
        print("[calib] Failed:", err)
    else:
        calibrated = True
        print("[calib] Platt scaling trained.")

# ----------------------
# Inference on test
# ----------------------
# The model is used as-is; no refit on the full training set happens here.
test_ds = ClickDataset(_test, ALL_NUM, ALL_CAT, CFG['SEQ_COL'], has_target=False, max_seq_len=CFG['MAX_SEQ_LEN'])
test_loader = DataLoader(test_ds, batch_size=CFG['BATCH_SIZE'], shuffle=False,
                         num_workers=CFG['NUM_WORKERS'], pin_memory=CFG['PIN_MEMORY'], collate_fn=collate_fn_infer)

model.eval()
all_outs = []
with torch.no_grad():
    for batch in tqdm(test_loader, desc='[Inference]'):
        num_x, cat_x, seqs, lens = (part.to(device) for part in batch)
        all_outs.append(model(num_x, cat_x, seqs, lens).detach().cpu())

logits_test = torch.cat(all_outs).numpy()

# Calibrated probabilities when the calibrator was fit, a plain sigmoid otherwise.
if calibrated:
    probs_test = lr_cal.predict_proba(logits_test.reshape(-1, 1))[:, 1]
else:
    probs_test = 1 / (1 + np.exp(-logits_test))

# Keep everything inside [0, 1] regardless of which path produced it.
probs_test = np.clip(probs_test, 0.0, 1.0)

In [None]:
# Build submission
# Two-column submission: the saved test IDs next to the float32 predictions.
submit = pd.DataFrame(dict(
    ID=test_id.values.astype(str),
    clicked=probs_test.astype(np.float32),
))

submit.to_csv(CFG['SUBMIT_PATH'], index=False)
print(f"[save] submission -> {CFG['SUBMIT_PATH']}")
# Quick range check on what was just written.
print("[check] clicked range: %.6f ~ %.6f" % (submit['clicked'].min(), submit['clicked'].max()))

print("[done] v5 Neural Hybrid pipeline complete.")