In [1]:
import pandas as pd

tx = pd.read_csv("transactions_train.csv",
    parse_dates=["t_dat"],
    dtype={"customer_id": "string", "article_id": "int32"})

print(tx.shape)
tx.head()

(31788324, 5)


Unnamed: 0,t_dat,customer_id,article_id,price,sales_channel_id
0,2018-09-20,000058a12d5b43e67d225668fa1f8d618c13dc232df0ca...,663713001,0.050831,2
1,2018-09-20,000058a12d5b43e67d225668fa1f8d618c13dc232df0ca...,541518023,0.030492,2
2,2018-09-20,00007d2de826758b65a93dd24ce629ed66842531df6699...,505221004,0.015237,2
3,2018-09-20,00007d2de826758b65a93dd24ce629ed66842531df6699...,685687003,0.016932,2
4,2018-09-20,00007d2de826758b65a93dd24ce629ed66842531df6699...,685687004,0.016932,2


In [2]:
"""
Cell 2 — Create the label (article-only “exchange-like”)
Why we need it
Supervised ML requires a target column (y).
The H&M dataset doesn’t include returns, so we define a proxy label.
Article-only repeat within 14 days is a clean, low-noise behavioral signal (your EDA proved product-group was noisy).

Output: data["label"]: 1 if the customer buys the same article_id again within 14 days (not same-day), else 0.
"""

WINDOW_DAYS = 14

data = tx.sort_values(["customer_id", "article_id", "t_dat"]).reset_index(drop=True)

g = data.groupby(["customer_id", "article_id"], sort=False)
next_date = g["t_dat"].shift(-1)

days_to_next = (next_date - data["t_dat"]).dt.days

data["label"] = days_to_next.between(1, WINDOW_DAYS).astype("int8")

print("label rate:", data["label"].mean())
data[["customer_id", "t_dat", "article_id", "label"]].head(10)

label rate: 0.02461630251409291


Unnamed: 0,customer_id,t_dat,article_id,label
0,00000dbacae5abe5e23885899a1fa44253a17956c6d1c3...,2018-12-27,176209023,0
1,00000dbacae5abe5e23885899a1fa44253a17956c6d1c3...,2019-05-25,568601006,0
2,00000dbacae5abe5e23885899a1fa44253a17956c6d1c3...,2019-05-25,568601006,0
3,00000dbacae5abe5e23885899a1fa44253a17956c6d1c3...,2020-09-05,568601043,0
4,00000dbacae5abe5e23885899a1fa44253a17956c6d1c3...,2019-07-25,607642008,0
5,00000dbacae5abe5e23885899a1fa44253a17956c6d1c3...,2018-12-27,625548001,0
6,00000dbacae5abe5e23885899a1fa44253a17956c6d1c3...,2018-12-27,627759010,0
7,00000dbacae5abe5e23885899a1fa44253a17956c6d1c3...,2019-09-18,656719005,0
8,00000dbacae5abe5e23885899a1fa44253a17956c6d1c3...,2019-11-28,694736004,0
9,00000dbacae5abe5e23885899a1fa44253a17956c6d1c3...,2019-05-02,697138006,0


In [None]:
"""
Same-day repeats are usually bulk shopping, not exchanges
Re-buying the exact same SKU shortly after strongly suggests:
size swap
color swap
delivery issue

Product-group similarity was too noisy in H&M data
Article-only is high-precision, even if recall is lower
That’s why final rate is ~2.46% — rare but meaningful.
"""

"""
We label a transaction as exchange-like if the same customer purchases the same article again within 14 days, 
indicating a likely replacement rather than normal shopping behavior.
"""

In [3]:
"""
Cell 3 — Customer history features (past-only)
Different customers behave differently.
Some customers frequently re-buy the same item, others almost never do.

These features summarize each customer’s past behavior up to that purchase, without using any future information.

What is created
c_orders → how many purchases the customer made before
c_label_cnt → how many exchange-like events they had before
c_label_rate → customer’s historical exchange rate
c_days_since → days since their last purchase
All features are time-safe (no leakage).
"""

# Ensure chronological order per customer
data = data.sort_values(
    ["customer_id", "t_dat", "article_id"]
).reset_index(drop=True)

g = data.groupby("customer_id", sort=False)

# Number of previous purchases by this customer
data["c_orders"] = g.cumcount().astype("int32")

# Number of previous exchange-like events
data["c_label_cnt"] = (
    g["label"].cumsum().shift(1, fill_value=0).astype("int32")
)

# Historical exchange rate (smoothed)
data["c_label_rate"] = (
    data["c_label_cnt"] / (data["c_orders"] + 1)
).astype("float32")

# Days since last purchase
prev_date = g["t_dat"].shift(1)
data["c_days_since"] = (
    (data["t_dat"] - prev_date)
    .dt.days
    .fillna(-1)
    .astype("int32")
)

data[
    [
        "customer_id",
        "t_dat",
        "article_id",
        "label",
        "c_orders",
        "c_label_cnt",
        "c_label_rate",
        "c_days_since",
    ]
].head(5)


Unnamed: 0,customer_id,t_dat,article_id,label,c_orders,c_label_cnt,c_label_rate,c_days_since
0,00000dbacae5abe5e23885899a1fa44253a17956c6d1c3...,2018-12-27,176209023,0,0,0,0.0,-1
1,00000dbacae5abe5e23885899a1fa44253a17956c6d1c3...,2018-12-27,625548001,0,1,0,0.0,0
2,00000dbacae5abe5e23885899a1fa44253a17956c6d1c3...,2018-12-27,627759010,0,2,0,0.0,0
3,00000dbacae5abe5e23885899a1fa44253a17956c6d1c3...,2019-05-02,697138006,0,3,0,0.0,126
4,00000dbacae5abe5e23885899a1fa44253a17956c6d1c3...,2019-05-25,568601006,0,4,0,0.0,23


In [4]:
"""
Cell 4 — Article history features (past-only)

Some items are more prone to repeat purchases than others
(e.g., sizing issues, popular basics, fulfillment problems).
These features summarize an article’s history up to that purchase, without using future information.

What is created
a_orders → how many times this article was purchased before
a_label_cnt → how many exchange-like events this article had before
a_label_rate → historical exchange rate for the article

"""
# Sort chronologically per article
data = data.sort_values(["article_id", "t_dat", "customer_id"]).reset_index(drop=True)

g = data.groupby("article_id", sort=False)

# Number of previous purchases of this article
data["a_orders"] = g.cumcount().astype("int32")

# Number of previous exchange-like events for this article
data["a_label_cnt"] = (g["label"].cumsum().shift(1, fill_value=0).astype("int32"))

# Historical exchange rate for the article (smoothed)
data["a_label_rate"] = (
    data["a_label_cnt"] / (data["a_orders"] + 1)
).astype("float32")

# Quick sanity check
data[
    [
        "article_id",
        "t_dat",
        "customer_id",
        "label",
        "a_orders",
        "a_label_cnt",
        "a_label_rate",
    ]
].head(10)


Unnamed: 0,article_id,t_dat,customer_id,label,a_orders,a_label_cnt,a_label_rate
0,108775015,2018-09-20,05ed96931b707698bc94aa53766d44686ae5ccbbc99dfb...,0,0,0,0.0
1,108775015,2018-09-20,05ed96931b707698bc94aa53766d44686ae5ccbbc99dfb...,0,1,0,0.0
2,108775015,2018-09-20,0aaa9683d5f45b85d0f9a81e2f4d4ef774fed43339fb75...,0,2,0,0.0
3,108775015,2018-09-20,20ccafd82d923baecf1fb8705d459e243c8ca9c74ee24e...,0,3,0,0.0
4,108775015,2018-09-20,2135790e6b63d10b59f9d8b84a58e24328b0f5ed3a48f1...,0,4,0,0.0
5,108775015,2018-09-20,3329634c9451f438049c5ebb4e77d0e0b7c730228aa330...,0,5,0,0.0
6,108775015,2018-09-20,3e6b4fc03a858ef3d12e58d5b525e2f4bd88dea2701615...,0,6,0,0.0
7,108775015,2018-09-20,3e6b4fc03a858ef3d12e58d5b525e2f4bd88dea2701615...,0,7,0,0.0
8,108775015,2018-09-20,3e6b4fc03a858ef3d12e58d5b525e2f4bd88dea2701615...,0,8,0,0.0
9,108775015,2018-09-20,4d83b15cd0ece8ec05277dcbb38b1f86ba6fe07615efcb...,0,9,0,0.0


In [5]:
"""
Cell 5 — Time-based train / validation split

Retail behavior changes over time (seasonality, promos, trends).
A random split would leak future information.
A time-based split answers the real question:

Can we predict future exchange behavior using only past data?
"""
# Choose a cutoff date (80–85% of time range is typical)
cutoff_date = data["t_dat"].quantile(0.85)

print("Train / validation cutoff:", cutoff_date)

train = data[data["t_dat"] <= cutoff_date]
valid = data[data["t_dat"] > cutoff_date]

# Sanity checks
print("Train size:", train.shape)
print("Validation size:", valid.shape)

print("Train label rate:", train["label"].mean())
print("Validation label rate:", valid["label"].mean())

Train / validation cutoff: 2020-06-09 00:00:00
Train size: (27035600, 13)
Validation size: (4752724, 13)
Train label rate: 0.024709346195386825
Validation label rate: 0.024087028828099422


In [None]:
"""
Cell 6 — Baseline model (Logistic Regression)

Establish a baseline before complex models
Validate that your features have real signal
Get interpretable coefficients (what matters most)

Because the dataset is large and the label is imbalanced (~2.5%):
Sample the training set for speed
Use class weights
Evaluate with AUC + precision/recall
"""

In [6]:
# Cell 6: Baseline Logistic Regression

from sklearn.linear_model import LogisticRegression
from sklearn.metrics import roc_auc_score, classification_report
from sklearn.preprocessing import StandardScaler
from sklearn.pipeline import Pipeline

# -----------------------
# Feature selection
# -----------------------
features = [
    "c_orders",
    "c_label_rate",
    "c_days_since",
    "a_orders",
    "a_label_rate",
]

X_train = train[features]
y_train = train["label"]

X_valid = valid[features]
y_valid = valid["label"]

# -----------------------
# Downsample train for speed (keep all positives)
# -----------------------
pos = train[train["label"] == 1]
neg = train[train["label"] == 0].sample(
    n=len(pos) * 5,  # 1:5 ratio
    random_state=42
)

train_small = pd.concat([pos, neg]).sample(frac=1, random_state=42)

X_train_small = train_small[features]
y_train_small = train_small["label"]

# -----------------------
# Model pipeline
# -----------------------
pipe = Pipeline([
    ("scaler", StandardScaler()),
    ("clf", LogisticRegression(
        max_iter=1000,
        class_weight="balanced",
        n_jobs=-1
    ))
])

# -----------------------
# Train
# -----------------------
pipe.fit(X_train_small, y_train_small)

# -----------------------
# Evaluate
# -----------------------
valid_probs = pipe.predict_proba(X_valid)[:, 1]
valid_preds = (valid_probs >= 0.5).astype(int)

auc = roc_auc_score(y_valid, valid_probs)
print("Validation ROC-AUC:", auc)

print("\nClassification report (threshold = 0.5):")
print(classification_report(y_valid, valid_preds))

Validation ROC-AUC: 0.5996348593313242

Classification report (threshold = 0.5):
              precision    recall  f1-score   support

           0       0.98      0.55      0.71   4638245
           1       0.03      0.59      0.06    114479

    accuracy                           0.56   4752724
   macro avg       0.51      0.57      0.38   4752724
weighted avg       0.96      0.56      0.69   4752724

