In [23]:
import pandas as pd
import os
from sklearn.model_selection import train_test_split
from sklearn.metrics import roc_auc_score
import lightgbm as lgb
import numpy as np

In [24]:
for dirname, _, filenames in os.walk('/kaggle/input'):
    for filename in filenames:
        print(os.path.join(dirname, filename))

In [25]:
data = pd.read_csv("../input/data.csv")
train_flag = pd.read_csv("../input/train_flag.csv")
sub = pd.read_csv("../input/sample_submission.csv")

In [26]:
print("data shape:", data.shape)
print("train_flag shape:", train_flag.shape)
print("sub shape:", sub.shape)

print("\n[data columns]")
print(data.columns.tolist())

print("\n[train_flag columns]")
print(train_flag.columns.tolist())

print("\n[data head]")
display(data.head(3))

print("\n[train_flag head]")
display(train_flag.head(3))

# 欠損率（上位だけ）
na_rate = (data.isna().mean().sort_values(ascending=False).head(15))
print("\n[data NA rate top15]")
print(na_rate)

# dateの範囲（文字のままだと min/max が怪しいので一旦文字列で確認）
print("\n[date min/max raw]")
print(data["date"].min(), data["date"].max())

# user_id数
print("\n[nunique user_id]")
print("data users:", data["user_id"].nunique())
print("train_flag users:", train_flag["user_id"].nunique())
print("sub users:", sub["user_id"].nunique())

# train_flagにしかいない/ dataにしかいない
train_only = set(train_flag["user_id"]) - set(data["user_id"])
data_only = set(data["user_id"]) - set(train_flag["user_id"])
print("\n[user_id mismatch]")
print("train_flag only:", len(train_only))
print("data only:", len(data_only))

data shape: (2757288, 24)
train_flag shape: (29965, 2)
sub shape: (10000, 2)

[data columns]
['jan_cd', 'item_name', 'item_spec', 'item_category_cd_1', 'item_category_cd_2', 'item_category_cd_3', 'item_category_name', 'average_unit_price', 'amount', 'total_price', 'user_id', 'date', 'store_deli', 'user_flag_ec', 'membership_start_ym', 'age_category', 'sex', 'user_stage', 'user_flag_1', 'user_flag_2', 'user_flag_3', 'user_flag_4', 'user_flag_5', 'user_flag_6']

[train_flag columns]
['user_id', 'churn']

[data head]


Unnamed: 0,jan_cd,item_name,item_spec,item_category_cd_1,item_category_cd_2,item_category_cd_3,item_category_name,average_unit_price,amount,total_price,...,membership_start_ym,age_category,sex,user_stage,user_flag_1,user_flag_2,user_flag_3,user_flag_4,user_flag_5,user_flag_6
0,4904230041160,ブラックニッカディープブレンド,７００ｍｌ,25,12,1,国産洋酒,1375.0,4,5500,...,201610.0,80代～,女性,メンバー,0,1,0,1,0,0
1,4901777284364,ＳＵ　角瓶ジャンボ,１９２０ｍｌ,25,12,1,国産洋酒,4895.0,1,4895,...,201702.0,50代,女性,ゴールド,0,0,0,0,0,0
2,280743000000,寿司セット（景福）,１パック,18,34,3,セット,1078.0,4,4312,...,202402.0,60代,女性,メンバー,0,1,0,0,0,0



[train_flag head]


Unnamed: 0,user_id,churn
0,$1$c92,1
1,$1cd7f,1
2,$1d3a9,1



[data NA rate top15]
item_spec              0.001962
sex                    0.000020
user_stage             0.000013
user_id                0.000013
age_category           0.000013
membership_start_ym    0.000013
jan_cd                 0.000000
user_flag_ec           0.000000
user_flag_5            0.000000
user_flag_4            0.000000
user_flag_3            0.000000
user_flag_2            0.000000
user_flag_1            0.000000
store_deli             0.000000
item_name              0.000000
dtype: float64

[date min/max raw]
20240203 20250202

[nunique user_id]
data users: 40496
train_flag users: 29965
sub users: 10000

[user_id mismatch]
train_flag only: 0
data only: 10532


In [27]:
# 1) 型変換
data["date"] = pd.to_datetime(data["date"].astype(str), format="%Y%m%d")

ms = pd.to_numeric(data["membership_start_ym"], errors="coerce").astype("Int64")
data["membership_start_ym"] = pd.to_datetime(ms.astype(str), format="%Y%m", errors="coerce")

# 2) 基準日
REF_DATE = pd.Timestamp("2025-02-03")
hist = data[data["date"] < REF_DATE].copy()

# 3) 直近窓集計
def agg_window(df, days):
    start = REF_DATE - pd.Timedelta(days=days)
    w = df[df["date"] >= start]
    g = w.groupby("user_id").agg(
        txn_count=("total_price", "size"),
        spend_sum=("total_price", "sum"),
        amount_sum=("amount", "sum"),
        avg_unit_price=("average_unit_price", "mean"),
    )
    g.columns = [f"{c}_{days}d" for c in g.columns]
    return g

# Recency
last_date = hist.groupby("user_id")["date"].max()
recency = (REF_DATE - last_date).dt.days.to_frame("days_since_last_purchase")

# 全期間集計
base = hist.groupby("user_id").agg(
    txn_count_all=("total_price", "size"),
    spend_sum_all=("total_price", "sum"),
    amount_sum_all=("amount", "sum"),
    avg_unit_price_all=("average_unit_price", "mean"),
    unique_items_all=("jan_cd", "nunique"),
    unique_cat1_all=("item_category_cd_1", "nunique"),
)
# 直近窓
w7  = agg_window(hist, 7)
w14 = agg_window(hist, 14)
w30 = agg_window(hist, 30)
w60 = agg_window(hist, 60)
w90 = agg_window(hist, 90)

# プロフィール（最新）
hist_sorted = hist.sort_values(["user_id", "date"])
profile_last = hist_sorted.groupby("user_id").tail(1).set_index("user_id")

profile = pd.DataFrame(index=profile_last.index)
profile["age_category"] = profile_last["age_category"].fillna("unknown").astype("category")
profile["sex"] = profile_last["sex"].fillna("unknown").astype("category")
profile["user_stage"] = profile_last["user_stage"].fillna("unknown").astype("category")
profile["user_flag_ec"] = pd.to_numeric(profile_last["user_flag_ec"], errors="coerce").fillna(0).astype(int)

for k in range(1, 7):
    col = f"user_flag_{k}"
    profile[col] = pd.to_numeric(profile_last[col], errors="coerce").fillna(0).astype(int)

# 会員歴（月数）
ms_last = profile_last["membership_start_ym"]
m = (ms_last.dt.year * 12 + ms_last.dt.month)
profile["membership_months"] = ((REF_DATE.year * 12 + REF_DATE.month) - m).astype("float")
profile["membership_months"] = profile["membership_months"].fillna(-1.0)
# 特徴量テーブル
X = (
    profile
    .join(recency, how="left")
    .join(base, how="left")
    .join(w7, how="left")
    .join(w14, how="left")
    .join(w30, how="left")
    .join(w60, how="left")
    .join(w90, how="left")
)

# トレンド
X["spend_prev30d"] = (X["spend_sum_60d"] - X["spend_sum_30d"]).clip(lower=0)
X["txn_prev30d"]   = (X["txn_count_60d"] - X["txn_count_30d"]).clip(lower=0)
X["spend_trend_30d"] = X["spend_sum_30d"] - X["spend_prev30d"]
X["txn_trend_30d"]   = X["txn_count_30d"] - X["txn_prev30d"]
X["spend_ratio_30d_prev30d"] = X["spend_sum_30d"] / (X["spend_prev30d"] + 1)
X["txn_ratio_30d_prev30d"]   = X["txn_count_30d"] / (X["txn_prev30d"] + 1)
X["activity_decay_30d_over_90d"] = X["txn_count_30d"] / (X["txn_count_90d"] + 1)

# 購入間隔（安全版）
hist_sorted2 = hist.sort_values(["user_id", "date"]).reset_index(drop=True)
interval_days = hist_sorted2.groupby("user_id")["date"].diff().dt.days
interval_feat = interval_days.groupby(hist_sorted2["user_id"]).agg(
    mean_purchase_interval="mean",
    std_purchase_interval="std",
    last_purchase_interval="last",
)
X = X.join(interval_feat, how="left")
# 数値欠損は0埋め
num_cols = X.select_dtypes(include=[np.number]).columns
X[num_cols] = X[num_cols].fillna(0)

print("X shape:", X.shape)
display(X.head())

X shape: (40496, 48)


Unnamed: 0_level_0,age_category,sex,user_stage,user_flag_ec,user_flag_1,user_flag_2,user_flag_3,user_flag_4,user_flag_5,user_flag_6,...,spend_prev30d,txn_prev30d,spend_trend_30d,txn_trend_30d,spend_ratio_30d_prev30d,txn_ratio_30d_prev30d,activity_decay_30d_over_90d,mean_purchase_interval,std_purchase_interval,last_purchase_interval
user_id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
$1$c92,50代,女性,メンバー,0,0,1,0,0,0,0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
$1cd7f,70代,男性,メンバー,0,0,1,0,0,0,0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
$1d062,20代,男性,メンバー,0,0,0,0,0,0,0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
$1d3a9,30代,女性,メンバー,0,1,0,1,0,0,1,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
$1f87d,20代,男性,メンバー,0,0,0,0,0,0,0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0


In [28]:
# train_flagと結合（学習用）
train_df = (
    train_flag[["user_id", "churn"]]
    .merge(X, on="user_id", how="left")   # ★ train_flag 側を基準にする
)

# test用（sample_submissionに合わせる）
test_df = (
    sub[["user_id"]]
    .merge(X, on="user_id", how="left")   # ★ sub 側を基準にする（落とさない）
)

print("train_df:", train_df.shape)
print("test_df:", test_df.shape)
print("positive rate:", train_df["churn"].mean())

# 念のため：欠損チェック（特徴量が作れなかった user_id がいるか）
print("train_df feature NA users:", train_df.drop(columns=["user_id", "churn"]).isna().any(axis=1).sum())
print("test_df feature NA users:", test_df.drop(columns=["user_id"]).isna().any(axis=1).sum())

train_df: (29965, 50)
test_df: (10000, 49)
positive rate: 0.7349240780911063
train_df feature NA users: 0
test_df feature NA users: 0


In [29]:
# 目的変数（そのまま churn=1 を学習）
y = train_df["churn"].astype(int)

X_train_full = train_df.drop(columns=["churn", "user_id"])
X_test_full  = test_df.drop(columns=["user_id"])

# カテゴリ型
cat_cols = ["age_category", "sex", "user_stage"]
for col in cat_cols:
    X_train_full[col] = X_train_full[col].astype("category")
    X_test_full[col]  = X_test_full[col].astype("category")

for col in cat_cols:
    if "unknown" not in X_train_full[col].cat.categories:
        X_train_full[col] = X_train_full[col].cat.add_categories(["unknown"])
    if "unknown" not in X_test_full[col].cat.categories:
        X_test_full[col]  = X_test_full[col].cat.add_categories(["unknown"])

    X_train_full[col] = X_train_full[col].fillna("unknown")
    X_test_full[col]  = X_test_full[col].fillna("unknown")

num_cols = X_train_full.select_dtypes(include=[np.number]).columns
X_train_full[num_cols] = X_train_full[num_cols].fillna(0)
X_test_full[num_cols]  = X_test_full[num_cols].fillna(0)
# split
X_tr, X_val, y_tr, y_val = train_test_split(
    X_train_full, y, test_size=0.2, random_state=42, stratify=y
)

model = lgb.LGBMClassifier(
    n_estimators=5000,
    learning_rate=0.03,
    num_leaves=63,
    subsample=0.8,
    colsample_bytree=0.8,
    random_state=42
)

model.fit(
    X_tr, y_tr,
    eval_set=[(X_val, y_val)],
    eval_metric="auc",
    callbacks=[lgb.early_stopping(200)]
)

val_pred = model.predict_proba(X_val)[:, 1]
auc = roc_auc_score(y_val, val_pred)
print("Validation AUC (churn=1):", auc)


[LightGBM] [Info] Number of positive: 17618, number of negative: 6354
[LightGBM] [Info] Auto-choosing row-wise multi-threading, the overhead of testing was 0.008768 seconds.
You can set `force_row_wise=true` to remove the overhead.
And if memory is not enough, you can set `force_col_wise=true`.
[LightGBM] [Info] Total Bins 8164
[LightGBM] [Info] Number of data points in the train set: 23972, number of used features: 48
[LightGBM] [Info] [binary:BoostFromScore]: pavg=0.734941 -> initscore=1.019837
[LightGBM] [Info] Start training from score 1.019837
Training until validation scores don't improve for 200 rounds
Early stopping, best iteration is:
[94]	valid_0's auc: 0.900135	valid_0's binary_logloss: 0.333781
Validation AUC (churn=1): 0.9001349108225316


In [32]:
final_model = lgb.LGBMClassifier(
    n_estimators=1000,
    learning_rate=0.03,
    num_leaves=63,
    subsample=0.8,
    colsample_bytree=0.8,
    random_state=42
)

final_model.fit(X_train_full, y)

test_pred = final_model.predict_proba(X_test_full)[:, 1]

# subの順番・user_idに厳密に合わせて提出を作る
submission = sub[["user_id"]].copy()
submission["pred"] = test_pred
submission.to_csv("../submission/submission.csv", index=False)

submission.head()

[LightGBM] [Info] Number of positive: 22022, number of negative: 7943
[LightGBM] [Info] Auto-choosing col-wise multi-threading, the overhead of testing was 0.013436 seconds.
You can set `force_col_wise=true` to remove the overhead.
[LightGBM] [Info] Total Bins 8275
[LightGBM] [Info] Number of data points in the train set: 29965, number of used features: 48
[LightGBM] [Info] [binary:BoostFromScore]: pavg=0.734924 -> initscore=1.019751
[LightGBM] [Info] Start training from score 1.019751


Unnamed: 0,user_id,pred
0,$1d062,0.99331
1,$5$ab$4,0.894093
2,$5$f5af,0.989261
3,$623182,0.968072
4,$65b$2,0.814034
