In [1]:
# =============================
# BTK Datathon 2025 — Baseline v1
# =============================

import os, sys, gc, math, json, warnings
from pathlib import Path

import numpy as np
import pandas as pd

from sklearn.metrics import mean_squared_error
from sklearn.model_selection import KFold

import lightgbm as lgb

warnings.filterwarnings("ignore")

CFG = {
    "seed": 42,
    "seeds": [42, 2024, 77],        # start with 3; you can go to 5 later
    "cv_type": "time",              # "time" (as now) or "group" (see §3)
    "use_log_target": False,
    "add_user_history": True,
    "add_sequence_extras": True,
    "lgb_params": {
        "objective": "regression",
        "metric": "mse",
        "learning_rate": 0.03,      # ↓ a bit
        "num_leaves": 127,          # ↑ capacity
        "min_data_in_leaf": 128,    # ↑ regularization
        "feature_fraction": 0.85,
        "bagging_fraction": 0.85,
        "bagging_freq": 1,
        "lambda_l2": 5.0,           # new
        "max_depth": -1,
        "verbosity": -1,
        "force_row_wise": True,
        "seed": 42, "bagging_seed": 42, "feature_fraction_seed": 42,
        "extra_trees": True         # small but often positive
    },
    "n_splits": 3,                  # time CV folds as you already do
    "n_splits_group": 5,            # for GroupKFold (see §3)
    "early_stopping_rounds": 400,   # ↑ a little
    "num_boost_round": 12000,       # allow more rounds with small LR
    "clip": {"floor": 5.48, "cap": 300.0}  # post-process; try 281.0 next
}


def set_seed(seed=42):
    np.random.seed(seed)

set_seed(CFG["seed"])


In [2]:
# Try common Kaggle paths first, then fallback to cwd.
CANDIDATE_DIRS = [
    Path("/kaggle/input/datathon-2025"),]

def find_csv(filename: str) -> Path:
    for d in CANDIDATE_DIRS:
        p = d / filename
        if p.exists():
            return p
    raise FileNotFoundError(f"Could not find {filename} in {CANDIDATE_DIRS}")

train_path = find_csv("train.csv")
test_path  = find_csv("test.csv")
sub_path   = find_csv("sample_submission.csv")

train_path, test_path, sub_path


(PosixPath('/kaggle/input/datathon-2025/train.csv'),
 PosixPath('/kaggle/input/datathon-2025/test.csv'),
 PosixPath('/kaggle/input/datathon-2025/sample_submission.csv'))

In [3]:
# Auto-detect the time column name (common variants)
def detect_time_col(cols):
    cand = [c for c in cols if c.lower() in ("event_time", "event_timestamp", "timestamp", "time", "event_datetime")]
    if cand:
        return cand[0]
    # Best-effort: look for a column with "time" in its name
    cand = [c for c in cols if "time" in c.lower() or "date" in c.lower()]
    return cand[0] if cand else None

def read_df(path):
    df = pd.read_csv(path)
    tcol = detect_time_col(df.columns)
    if tcol is None:
        raise ValueError("Couldn't detect a time column. Please update detect_time_col().")
    df[tcol] = pd.to_datetime(df[tcol], errors="coerce", utc=True)
    # Make a normalized name for downstream code
    if tcol != "event_time":
        df = df.rename(columns={tcol: "event_time"})
    return df

train = read_df(train_path)
test  = read_df(test_path)
sub   = pd.read_csv(sub_path)

print("Shapes:", train.shape, test.shape, sub.shape)
print("\nTrain columns:\n", list(train.columns))
print("\nTest columns:\n", list(test.columns))
print("\nSubmission columns:\n", list(sub.columns))

# Sanity: expected target column must exist only in train
TARGET_COL = "session_value"
assert TARGET_COL in train.columns and TARGET_COL not in test.columns, "Target column check failed."

# Light touch peek (head only)
display(train.head(3))
display(test.head(3))
display(sub.head(3))


Shapes: (141219, 7) (62951, 6) (30789, 2)

Train columns:
 ['event_time', 'event_type', 'product_id', 'category_id', 'user_id', 'user_session', 'session_value']

Test columns:
 ['event_time', 'event_type', 'product_id', 'category_id', 'user_id', 'user_session']

Submission columns:
 ['user_session', 'session_value']


Unnamed: 0,event_time,event_type,product_id,category_id,user_id,user_session,session_value
0,2025-06-19 10:23:07+00:00,ADD_CART,PROD_011223,CAT_00054,USER_097562,SESSION_158779,90.29
1,2025-06-07 21:34:45+00:00,ADD_CART,PROD_005519,CAT_00144,USER_006535,SESSION_029987,16.39
2,2025-06-21 21:29:09+00:00,ADD_CART,PROD_000577,CAT_00273,USER_047199,SESSION_022134,64.27


Unnamed: 0,event_time,event_type,product_id,category_id,user_id,user_session
0,2025-06-28 10:09:58+00:00,ADD_CART,PROD_015000,CAT_00019,USER_109759,SESSION_164059
1,2025-06-25 11:57:50+00:00,ADD_CART,PROD_023887,CAT_00010,USER_010614,SESSION_109583
2,2025-06-30 14:34:20+00:00,ADD_CART,PROD_022673,CAT_00090,USER_041338,SESSION_171382


Unnamed: 0,user_session,session_value
0,SESSION_164059,0.0
1,SESSION_109583,0.0
2,SESSION_171382,0.0


In [4]:
before = len(train)
train = train.drop_duplicates().reset_index(drop=True)
after = len(train)
print(f"Dropped {before - after} duplicate rows from train.")


Dropped 670 duplicate rows from train.


In [5]:
# Normalize frequent columns; adapt these names if needed
ID_USER = "user_id"
ID_SESSION = "user_session"
PRODUCT_COL = "product_id"
CATEGORY_COL = "category_id"
EVENT_COL = "event_type"  # expected values like VIEW/ADD_CART/REMOVE_CART/BUY

expected_cols = [ID_USER, ID_SESSION, PRODUCT_COL, CATEGORY_COL, EVENT_COL, "event_time"]
for c in expected_cols:
    if c not in train.columns:
        print(f"Warning: expected column '{c}' not found in train.")

# Coerce to appropriate dtypes (safe casts)
for df in (train, test):
    for c in [ID_USER, ID_SESSION]:
        if c in df.columns:
            df[c] = df[c].astype(str)
    for c in [PRODUCT_COL, CATEGORY_COL, EVENT_COL]:
        if c in df.columns:
            df[c] = df[c].astype("category")

# Event type ordering (helps some models and features)
if EVENT_COL in train.columns:
    all_types = sorted(list(set(train[EVENT_COL].dropna().unique()).union(set(test[EVENT_COL].dropna().unique()))))
    train[EVENT_COL] = train[EVENT_COL].cat.set_categories(all_types)
    test[EVENT_COL]  = test[EVENT_COL].cat.set_categories(all_types)

print("Event types:", train[EVENT_COL].cat.categories.tolist() if EVENT_COL in train.columns else "N/A")


Event types: ['ADD_CART', 'BUY', 'REMOVE_CART', 'VIEW']


In [6]:
# Cell 6 — Session-level feature builder (updated to include `user_id`)
def build_session_table(events: pd.DataFrame, is_train: bool) -> pd.DataFrame:
    df = events.copy()
    df = df.sort_values(["user_session", "event_time"]).reset_index(drop=True)

    # Basic per-event helpers
    df["is_buy"] = (df["event_type"] == "BUY").astype(int)
    df["is_add"] = (df["event_type"] == "ADD_CART").astype(int)
    df["is_rem"] = (df["event_type"] == "REMOVE_CART").astype(int)
    df["is_view"] = (df["event_type"] == "VIEW").astype(int)

    # Rank events within a session
    df["ev_idx"] = df.groupby("user_session").cumcount()

    # First/last event type
    first_event = df.groupby("user_session")["event_type"].first().rename("first_event_type")
    last_event  = df.groupby("user_session")["event_type"].last().rename("last_event_type")

    # Session start/end time
    t_start = df.groupby("user_session")["event_time"].min().rename("session_start")
    t_end   = df.groupby("user_session")["event_time"].max().rename("session_end")
    duration = (t_end - t_start).dt.total_seconds().rename("duration_sec")

    # Counts & uniques
    agg_counts = df.groupby("user_session").agg(
        n_events = ("event_type", "size"),
        n_products = ("product_id", pd.Series.nunique),
        n_categories = ("category_id", pd.Series.nunique),
        n_event_types = ("event_type", pd.Series.nunique),
        cnt_buy = ("is_buy", "sum"),
        cnt_add = ("is_add", "sum"),
        cnt_rem = ("is_rem", "sum"),
        cnt_view = ("is_view", "sum"),
    )

    # Has buy flag
    has_buy = (agg_counts["cnt_buy"] > 0).astype(int).rename("has_buy")

    # Index of first BUY (or -1)
    first_buy_idx = (
        df[df["is_buy"] == 1]
        .groupby("user_session")["ev_idx"]
        .min()
        .reindex(agg_counts.index)
        .fillna(-1)
        .astype(int)
        .rename("idx_first_buy")
    )

    # Events after first BUY (0 if none)
    events_after_buy = (agg_counts["n_events"] - (first_buy_idx + 1)).clip(lower=0).rename("events_after_first_buy")

    # Adds/removes before first BUY
    tmp = df.merge(first_buy_idx.rename("fb"), left_on="user_session", right_index=True, how="left")
    before_fb = tmp["ev_idx"] <= tmp["fb"]
    cnt_add_before_buy = tmp.loc[before_fb, "is_add"].groupby(tmp["user_session"]).sum().reindex(agg_counts.index).fillna(0).astype(int).rename("cnt_add_before_buy")
    cnt_rem_before_buy = tmp.loc[before_fb, "is_rem"].groupby(tmp["user_session"]).sum().reindex(agg_counts.index).fillna(0).astype(int).rename("cnt_rem_before_buy")

    # Transitions count
    def count_transitions(g):
        x = g["event_type"].astype(str).values
        if len(x) <= 1:
            return 0
        return int((x[1:] != x[:-1]).sum())
    n_transitions = df.groupby("user_session").apply(count_transitions).rename("n_transitions")

    # Time-of-day features
    start_hour = t_start.dt.hour.rename("start_hour")
    start_dow  = t_start.dt.dayofweek.rename("start_dow")
    start_day  = t_start.dt.day.rename("start_day")

    # >>> NEW: carry user_id per session
    user_map = df.groupby("user_session")["user_id"].first().rename("user_id")

    # Assemble
    sess = pd.concat(
        [
            t_start, t_end, duration, agg_counts,
            has_buy, first_buy_idx, events_after_buy,
            cnt_add_before_buy, cnt_rem_before_buy,
            n_transitions, first_event, last_event,
            start_hour, start_dow, start_day,
            user_map,
        ],
        axis=1
    ).reset_index()

    if is_train:
        t = df.groupby("user_session")["session_value"].first().reset_index()
        sess = sess.merge(t, on="user_session", how="left")
        chk = df.groupby("user_session")["session_value"].nunique().max()
        if chk != 1:
            print("WARNING: session_value is not constant within sessions.")

    # Cast categoricals
    for c in ["first_event_type", "last_event_type"]:
        if c in sess.columns:
            sess[c] = sess[c].astype("category")
    sess["user_id"] = sess["user_id"].astype(str)

    return sess

# Rebuild with updated function
train_sess = build_session_table(train, is_train=True)
test_sess  = build_session_table(test,  is_train=False)
print("Session tables rebuilt; `user_id` included.")


Session tables rebuilt; `user_id` included.


In [7]:
# Cell 7 — (Fixed) User expanding history — index-safe & fast
def add_user_history(train_sess: pd.DataFrame, test_sess: pd.DataFrame):
    # Combine (train first, then test), keep chronological order within each user
    comb = pd.concat(
        [
            train_sess.assign(_is_train=1),
            test_sess.assign(_is_train=0, **{TARGET_COL: np.nan}),
        ],
        axis=0, ignore_index=True
    ).sort_values(["user_id", "session_start"]).reset_index(drop=True)

    # --- 1) Prior # of sessions (strictly before current) ---
    comb["user_prev_n_sessions"] = comb.groupby("user_id").cumcount()

    # --- 2) Prior buy-rate (expanding mean of has_buy, shifted) ---
    g = comb.groupby("user_id", sort=False)
    # cumulative count position (1-based) and previous count
    pos = g.cumcount() + 1
    prev_cnt = pos - 1

    # cumulative sum of has_buy including current, then subtract current for "previous" sum
    cum_sum_buy = g["has_buy"].cumsum()
    prev_sum_buy = cum_sum_buy - comb["has_buy"]
    # safe divide
    comb["user_prev_buy_rate"] = np.divide(
        prev_sum_buy.astype(float),
        prev_cnt,
        out=np.zeros_like(prev_sum_buy, dtype=float),
        where=prev_cnt > 0
    )

    # --- 3) Prior mean(session_value) (labels exist only in train) ---
    # create helper columns to count only non-NaN labels
    comb["sv_notna"]  = comb[TARGET_COL].notna().astype(int)
    comb["sv_filled"] = comb[TARGET_COL].fillna(0.0)

    comb["cum_sum_sv"]  = g["sv_filled"].cumsum()
    comb["cum_cnt_sv"]  = g["sv_notna"].cumsum()

    prev_sum_sv = comb["cum_sum_sv"] - comb["sv_filled"]
    prev_cnt_sv = comb["cum_cnt_sv"] - comb["sv_notna"]

    prev_mean_sv = np.divide(
        prev_sum_sv,
        prev_cnt_sv,
        out=np.full(len(prev_sum_sv), np.nan, dtype=float),
        where=prev_cnt_sv > 0
    )

    global_mean_sv = float(train_sess[TARGET_COL].mean())
    comb["user_prev_mean_sv"] = np.where(np.isnan(prev_mean_sv), global_mean_sv, prev_mean_sv)

    # cleanup helpers
    comb = comb.drop(columns=["sv_notna","sv_filled","cum_sum_sv","cum_cnt_sv"])

    # --- Split back ---
    train_hist = comb[comb["_is_train"] == 1].drop(columns=["_is_train"])
    test_hist  = comb[comb["_is_train"] == 0].drop(columns=["_is_train"])

    # types
    for df in (train_hist, test_hist):
        df["user_prev_n_sessions"] = df["user_prev_n_sessions"].astype(int)
        df["user_prev_buy_rate"]   = df["user_prev_buy_rate"].astype(float)
        df["user_prev_mean_sv"]    = df["user_prev_mean_sv"].astype(float)

    return train_hist, test_hist

# Run it
if CFG["add_user_history"]:
    train_sess, test_sess = add_user_history(train_sess, test_sess)
    print("Added user history features.")
    display(train_sess.filter(like="user_prev").head(3))
# Feature columns
categorical_cols = ["first_event_type", "last_event_type"]
numeric_cols = [
    "n_events", "n_products", "n_categories", "n_event_types",
    "cnt_buy", "cnt_add", "cnt_rem", "cnt_view",
    "duration_sec", "has_buy", "idx_first_buy", "events_after_first_buy",
    "cnt_add_before_buy", "cnt_rem_before_buy", "n_transitions",
    "start_hour", "start_dow", "start_day",
]

if CFG["add_user_history"]:
    numeric_cols += ["user_prev_n_sessions", "user_prev_buy_rate", "user_prev_mean_sv"]

# Ensure columns exist (robustness)
categorical_cols = [c for c in categorical_cols if c in train_sess.columns]
numeric_cols = [c for c in numeric_cols if c in train_sess.columns]

FEATS = categorical_cols + numeric_cols
print("Num features:", len(FEATS))
print("Categorical:", categorical_cols)
print("Numeric:", [c for c in FEATS if c not in categorical_cols])

# Target
if CFG["use_log_target"]:
    train_sess["target"] = np.log1p(train_sess[TARGET_COL].clip(lower=0))
else:
    train_sess["target"] = train_sess[TARGET_COL].astype(float)

# LightGBM requires category dtype for cat features
for c in categorical_cols:
    train_sess[c] = train_sess[c].astype("category")
    test_sess[c]  = test_sess[c].astype("category")

X = train_sess[FEATS].copy()
y = train_sess["target"].values
X_test = test_sess[FEATS].copy()

X.shape, X_test.shape


Added user history features.


Unnamed: 0,user_prev_n_sessions,user_prev_buy_rate,user_prev_mean_sv
0,0,0.0,42.19813
1,0,0.0,42.19813
2,0,0.0,42.19813


Num features: 23
Categorical: ['first_event_type', 'last_event_type']
Numeric: ['n_events', 'n_products', 'n_categories', 'n_event_types', 'cnt_buy', 'cnt_add', 'cnt_rem', 'cnt_view', 'duration_sec', 'has_buy', 'idx_first_buy', 'events_after_first_buy', 'cnt_add_before_buy', 'cnt_rem_before_buy', 'n_transitions', 'start_hour', 'start_dow', 'start_day', 'user_prev_n_sessions', 'user_prev_buy_rate', 'user_prev_mean_sv']


((70736, 23), (30789, 23))

In [8]:
# Feature columns
categorical_cols = ["first_event_type", "last_event_type"]
numeric_cols = [
    "n_events", "n_products", "n_categories", "n_event_types",
    "cnt_buy", "cnt_add", "cnt_rem", "cnt_view",
    "duration_sec", "has_buy", "idx_first_buy", "events_after_first_buy",
    "cnt_add_before_buy", "cnt_rem_before_buy", "n_transitions",
    "start_hour", "start_dow", "start_day",
]

if CFG["add_user_history"]:
    numeric_cols += ["user_prev_n_sessions", "user_prev_buy_rate", "user_prev_mean_sv"]

# Ensure columns exist (robustness)
categorical_cols = [c for c in categorical_cols if c in train_sess.columns]
numeric_cols = [c for c in numeric_cols if c in train_sess.columns]

FEATS = categorical_cols + numeric_cols
print("Num features:", len(FEATS))
print("Categorical:", categorical_cols)
print("Numeric:", [c for c in FEATS if c not in categorical_cols])

# Target
if CFG["use_log_target"]:
    train_sess["target"] = np.log1p(train_sess[TARGET_COL].clip(lower=0))
else:
    train_sess["target"] = train_sess[TARGET_COL].astype(float)

# LightGBM requires category dtype for cat features
for c in categorical_cols:
    train_sess[c] = train_sess[c].astype("category")
    test_sess[c]  = test_sess[c].astype("category")

X = train_sess[FEATS].copy()
y = train_sess["target"].values
X_test = test_sess[FEATS].copy()

X.shape, X_test.shape


Num features: 23
Categorical: ['first_event_type', 'last_event_type']
Numeric: ['n_events', 'n_products', 'n_categories', 'n_event_types', 'cnt_buy', 'cnt_add', 'cnt_rem', 'cnt_view', 'duration_sec', 'has_buy', 'idx_first_buy', 'events_after_first_buy', 'cnt_add_before_buy', 'cnt_rem_before_buy', 'n_transitions', 'start_hour', 'start_dow', 'start_day', 'user_prev_n_sessions', 'user_prev_buy_rate', 'user_prev_mean_sv']


((70736, 23), (30789, 23))

In [9]:
# We create 3 chronological folds that mimic the competition's temporal split.
# You can inspect date distribution and adjust cut points if desired.

def make_time_folds(df: pd.DataFrame, n_splits=3, date_col="session_start"):
    # Sort sessions by start time and split by contiguous chunks
    df_sorted = df.sort_values(date_col).reset_index()
    n = len(df_sorted)
    fold_sizes = [n // n_splits] * n_splits
    for i in range(n % n_splits):
        fold_sizes[i] += 1
    idxs = []
    start = 0
    for fs in fold_sizes:
        end = start + fs
        idxs.append(df_sorted.loc[start:end-1, "index"].values)
        start = end
    folds = []
    for i in range(n_splits):
        val_idx = idxs[i]
        tr_idx = np.concatenate([idxs[j] for j in range(n_splits) if j != i])
        folds.append((tr_idx, val_idx))
    return folds

folds = make_time_folds(train_sess, n_splits=CFG["n_splits"], date_col="session_start")
for i, (_, va) in enumerate(folds):
    d1 = train_sess.loc[va, "session_start"].min()
    d2 = train_sess.loc[va, "session_start"].max()
    print(f"Fold {i}: val window {d1} → {d2}, size={len(va)}")


Fold 0: val window 2025-06-01 00:00:24+00:00 → 2025-06-07 03:06:40+00:00, size=23579
Fold 1: val window 2025-06-07 03:06:51+00:00 → 2025-06-14 09:07:02+00:00, size=23579
Fold 2: val window 2025-06-14 09:07:35+00:00 → 2025-06-21 23:58:05+00:00, size=23578


In [10]:
# Cell 10 — Train LightGBM with time-based CV (index-safe)

oof_s = pd.Series(0.0, index=X.index)  # OOF predictions aligned by label index
models = []
fi_frames = []

# Align y to X's index for label-based indexing
y_s = pd.Series(train_sess["target"].values, index=X.index)

# Optional guard: ensure fold indices line up with X
bad = [i for i, (tr, va) in enumerate(folds)
       if not set(tr).issubset(set(X.index)) or not set(va).issubset(set(X.index))]
if bad:
    raise ValueError(f"Fold indices not aligned with X.index on folds {bad}. "
                     f"Rebuild folds after any reindexing.")

for i, (tr_idx, va_idx) in enumerate(folds):
    # Use .loc (label-based), not .iloc
    X_tr, y_tr = X.loc[tr_idx], y_s.loc[tr_idx].values
    X_va, y_va = X.loc[va_idx], y_s.loc[va_idx].values

    lgb_train = lgb.Dataset(
        X_tr, label=y_tr, categorical_feature=categorical_cols, free_raw_data=False
    )
    lgb_valid = lgb.Dataset(
        X_va, label=y_va, categorical_feature=categorical_cols, free_raw_data=False
    )

    model = lgb.train(
        CFG["lgb_params"],
        lgb_train,
        num_boost_round=CFG["num_boost_round"],
        valid_sets=[lgb_train, lgb_valid],
        valid_names=["train", "valid"],
        callbacks=[lgb.early_stopping(stopping_rounds=CFG["early_stopping_rounds"], verbose=False)],
    )

    pred = model.predict(X_va, num_iteration=model.best_iteration)
    oof_s.loc[va_idx] = pred
    models.append(model)

    fi = pd.DataFrame({
        "feature": FEATS,
        "gain": model.feature_importance(importance_type="gain"),
        "split": model.feature_importance(importance_type="split"),
    })
    fi["fold"] = i
    fi_frames.append(fi)
    print(f"Fold {i}: best_iter={model.best_iteration}, val_size={len(va_idx)}")

# OOF score on the competition (raw) scale
if CFG["use_log_target"]:
    y_raw   = np.expm1(y_s.values)
    oof_raw = np.expm1(oof_s.values).clip(min=0)
    oof_mse = mean_squared_error(y_raw, oof_raw)
else:
    oof_mse = mean_squared_error(y_s.values, oof_s.values)

print(f"OOF MSE: {oof_mse:,.4f}")

# Aggregate feature importance across folds
feat_importance = (
    pd.concat(fi_frames, ignore_index=True)
      .groupby("feature")[["gain", "split"]]
      .mean()
      .sort_values("gain", ascending=False)
      .reset_index()
)
display(feat_importance.head(30))

# Keep 'oof' for Cell 13 compatibility
oof = oof_s.values



Fold 0: best_iter=11999, val_size=23579
Fold 1: best_iter=2085, val_size=23579
Fold 2: best_iter=11839, val_size=23578
OOF MSE: 396.3284


Unnamed: 0,feature,gain,split
0,has_buy,503422800.0,18842.666667
1,cnt_buy,386558800.0,1364.333333
2,last_event_type,99893140.0,84848.0
3,first_event_type,71362140.0,99453.333333
4,duration_sec,49273610.0,37233.0
5,n_products,44801740.0,1450.333333
6,n_events,42811350.0,1623.666667
7,events_after_first_buy,39423150.0,1933.333333
8,n_categories,37904870.0,1352.666667
9,start_hour,24365110.0,56311.0


In [11]:
full_train = lgb.Dataset(X, label=y, categorical_feature=categorical_cols, free_raw_data=False)
full_model = lgb.train(
    CFG["lgb_params"],
    full_train,
    num_boost_round=int(np.mean([m.best_iteration for m in models]))  # a common heuristic
)

test_pred = full_model.predict(X_test)

# Back-transform if using log target
if CFG["use_log_target"]:
    test_pred = np.expm1(test_pred).clip(min=0)

# Non-negative clamp (session_value cannot be negative)
test_pred = np.clip(test_pred, 0, None)

# Optionally cap extreme outliers if helpful (tune later)
# cap = np.percentile(test_pred, 99.8)
# test_pred = np.clip(test_pred, 0, cap)

print("Pred summary:", pd.Series(test_pred).describe())


Pred summary: count    30789.000000
mean        41.329168
std         45.832326
min          0.000000
25%         21.333997
50%         26.359077
75%         39.836563
max        976.479399
dtype: float64


In [12]:
# Expect submission format: ["user_session", "session_value"]
sub_out = sub.copy()
# The submission's user_session order must match sample_submission (common in Kaggle)
# Ensure we have predictions per session present in test_sess with the same key
key = "user_session"

# Map predictions by user_session
pred_map = dict(zip(test_sess[key], test_pred))

sub_out[TARGET_COL] = sub_out[key].map(pred_map).fillna(0.0)  # fill if any missing
print(sub_out.head(3))

save_name = "submission.csv"
sub_out.to_csv(save_name, index=False)
print("Saved:", save_name)


     user_session  session_value
0  SESSION_164059     224.995630
1  SESSION_109583      51.176259
2  SESSION_171382      40.236241
Saved: submission.csv


In [13]:
# OOF vs target (raw scale) distribution check
if CFG["use_log_target"]:
    oof_raw = np.expm1(oof).clip(min=0)
    y_raw   = np.expm1(y)
    print("OOF MSE (raw):", mean_squared_error(y_raw, oof_raw))
    display(pd.DataFrame({"y": y_raw, "oof": oof_raw}).describe())
else:
    print("OOF MSE (raw):", mean_squared_error(y, oof))
    display(pd.DataFrame({"y": y, "oof": oof}).describe())

# Per-day stability (helps spot drift)
tmp = train_sess.assign(y=y, oof=oof)
day_mse = tmp.groupby(tmp["session_start"].dt.date).apply(lambda d: mean_squared_error(d["y"], d["oof"]))
display(day_mse.to_frame("mse").reset_index().rename(columns={"session_start":"date"}).head(20))


OOF MSE (raw): 396.3283569505139


Unnamed: 0,y,oof
count,70736.0,70736.0
mean,42.19813,40.618121
std,47.552369,43.065841
min,5.38,-19.417726
25%,18.53,21.979818
50%,30.75,26.970475
75%,46.62,38.921137
max,2328.66,848.58122


Unnamed: 0,date,mse
0,2025-06-01,1615.941627
1,2025-06-02,549.943216
2,2025-06-03,315.956575
3,2025-06-04,258.473903
4,2025-06-05,395.694055
5,2025-06-06,243.428099
6,2025-06-07,422.970303
7,2025-06-08,263.466604
8,2025-06-09,230.01767
9,2025-06-10,240.898103
