In [1]:
import pandas as pd

tx = pd.read_csv("transactions_train.csv",parse_dates=["t_dat"],dtype={"customer_id": "string", "article_id": "int32"})

articles = pd.read_csv("articles.csv",dtype={"article_id": "int32"})

print("tx rows, cols:", tx.shape)
print("articles rows, cols:", articles.shape)

display(tx.head())
display(articles[["article_id", "product_group_name", "garment_group_name"]].head())

tx rows, cols: (31788324, 5)
articles rows, cols: (105542, 25)


Unnamed: 0,t_dat,customer_id,article_id,price,sales_channel_id
0,2018-09-20,000058a12d5b43e67d225668fa1f8d618c13dc232df0ca...,663713001,0.050831,2
1,2018-09-20,000058a12d5b43e67d225668fa1f8d618c13dc232df0ca...,541518023,0.030492,2
2,2018-09-20,00007d2de826758b65a93dd24ce629ed66842531df6699...,505221004,0.015237,2
3,2018-09-20,00007d2de826758b65a93dd24ce629ed66842531df6699...,685687003,0.016932,2
4,2018-09-20,00007d2de826758b65a93dd24ce629ed66842531df6699...,685687004,0.016932,2


Unnamed: 0,article_id,product_group_name,garment_group_name
0,108775015,Garment Upper body,Jersey Basic
1,108775044,Garment Upper body,Jersey Basic
2,108775051,Garment Upper body,Jersey Basic
3,110065001,Underwear,"Under-, Nightwear"
4,110065002,Underwear,"Under-, Nightwear"


In [2]:
"""
Dataset scale
transactions_train: ~31.8 million rows
articles: ~105k rows

That means:
Any groupby + apply on transactions will be slow
We need vectorized or sampled logic, not per-group Python loops

Before defining “returns”, we need to answer:
Does every transaction map to exactly one article?
Are product / garment groups evenly distributed?
Are there customers who buy many items on the same day?
(this explains over-labeling)
"""

'\nDataset scale\ntransactions_train: ~31.8 million rows\narticles: ~105k rows\n\nThat means:\nAny groupby + apply on transactions will be slow\nWe need vectorized or sampled logic, not per-group Python loops\n\nBefore defining “returns”, we need to answer:\nDoes every transaction map to exactly one article?\nAre product / garment groups evenly distributed?\nAre there customers who buy many items on the same day?\n(this explains over-labeling)\n'

In [3]:
# EDA Cell 2: Join coverage check (sampled for speed)

# Sample transactions to keep this fast
tx_sample = tx.sample(n=1_000_000, random_state=42)

tx_joined = tx_sample.merge(
    articles[["article_id", "product_group_name", "garment_group_name"]],
    on="article_id",
    how="left"
)

# Missing join diagnostics
print("Missing product_group_name:",
      tx_joined["product_group_name"].isna().mean())

print("Missing garment_group_name:",
      tx_joined["garment_group_name"].isna().mean())

# Show a few missing examples (if any)
display(
    tx_joined[tx_joined["product_group_name"].isna()].head()
)

Missing product_group_name: 0.0
Missing garment_group_name: 0.0


Unnamed: 0,t_dat,customer_id,article_id,price,sales_channel_id,product_group_name,garment_group_name


In [4]:
"""
We need to understand:
How often do customers buy multiple items in the same category within a short time window?
"""

'\nWe need to understand:\nHow often do customers buy multiple items in the same category within a short time window?\n'

In [5]:
# EDA Cell 3: Purchase clustering behavior (sampled)

# Use a manageable sample
tx_s = tx.sample(n=1_000_000, random_state=42)

# Join product group
tx_s = tx_s.merge(
    articles[["article_id", "product_group_name"]],
    on="article_id",
    how="left"
)

# ---------------------------
# Same-day purchases per customer
# ---------------------------
same_day_counts = (
    tx_s.groupby(["customer_id", "t_dat"])
        .size()
        .reset_index(name="items_same_day")
)

print("Same-day purchase distribution:")
display(same_day_counts["items_same_day"].value_counts().head(10))

# ---------------------------
# Same-day purchases per customer + product group
# ---------------------------
same_day_group_counts = (
    tx_s.groupby(["customer_id", "t_dat", "product_group_name"])
        .size()
        .reset_index(name="items_same_group_day")
)

print("Same-day SAME product group distribution:")
display(same_day_group_counts["items_same_group_day"].value_counts().head(10))

Same-day purchase distribution:


items_same_day
1     838180
2      67514
3       7242
4        949
5        161
6         44
7         14
8          6
9          2
10         1
Name: count, dtype: int64

Same-day SAME product group distribution:


items_same_group_day
1     918287
2      36801
3       2316
4        212
5         37
6         13
7          3
8          2
15         1
Name: count, dtype: int64

In [7]:
"""
What these distributions tell us (plain + technical)
1. Same-day purchases are VERY common
From 1M sampled rows:
~84% of customer-days → 1 item
~6.7% → 2 items

Thousands of cases with 3–5 items same day
Long tail up to 10+ items

So if we ever let same-day purchases count as “returns”, we will massively overlabel.


2. Same-day, same-product-group shopping is also common
This is the key insight:
~92% → 1 item
~3.7% → 2 items

Thousands of customers buy 2–4 items in the same product group on the same day

Examples of normal behavior:
buying 2 tops in different colors
buying multiple basics
sale shopping

This explains why:
product_group_name logic inflated return rates
Even with a 14-day window

Category similarity alone is not enough to infer returns in H&M data.

H&M customers:
buy many similar items
often in short bursts
often same or next day
So a “return” proxy must be exchange-shaped, not just “similar purchase soon”.
"""

'\nWhat these distributions tell us (plain + technical)\n1. Same-day purchases are VERY common\nFrom 1M sampled rows:\n~84% of customer-days → 1 item\n~6.7% → 2 items\n\nThousands of cases with 3–5 items same day\nLong tail up to 10+ items\n\nSo if we ever let same-day purchases count as “returns”, we will massively overlabel.\n\n\n2. Same-day, same-product-group shopping is also common\nThis is the key insight:\n~92% → 1 item\n~3.7% → 2 items\n\nThousands of customers buy 2–4 items in the same product group on the same day\n\nExamples of normal behavior:\nbuying 2 tops in different colors\nbuying multiple basics\nsale shopping\n\nThis explains why:\nproduct_group_name logic inflated return rates\nEven with a 14-day window\n\nCategory similarity alone is not enough to infer returns in H&M data.\n\nH&M customers:\nbuy many similar items\noften in short bursts\noften same or next day\nSo a “return” proxy must be exchange-shaped, not just “similar purchase soon”.\n'

In [8]:
# how many times customers buy in the same product group (overall)

tx_s = tx.sample(n=1_000_000, random_state=42)

tx_s = tx_s.merge(
    articles[["article_id", "product_group_name"]],
    on="article_id",
    how="left"
)

group_counts = (
    tx_s.groupby(["customer_id", "product_group_name"])
        .size()
)

print("Distribution of purchases per customer per product group:")
display(group_counts.value_counts().head(10))

Distribution of purchases per customer per product group:


1     601363
2     109992
3      30103
4      10322
5       4023
6       1765
7        871
8        459
9        261
10       144
Name: count, dtype: int64

In [None]:
"""
Interpretation
1 purchase: very common (one-off buys)
2 purchases: also very common
3+ purchases: still very common (tens of thousands)
This confirms something important:

Buying multiple items from the same product group is normal shopping behavior at H&M, not a reliable signal of returns.
Category similarity alone cannot define “returns” in this dataset.

If we keep trying to infer returns from:
same product group
short time window
we will always over-label.

So the fix is not tweaking thresholds anymore.
We need to change the target definition.
"""

In [None]:
"""
Project becomes:
“Predicting exchange-prone purchases based on repeat-article behavior in fashion retail.”

✅ Included (YES)
Case 1
Customer A buys article XYZ, then buys article XYZ again within 14 days.
This is your only positive signal.

❌ Excluded (NO)
Case 2
Customer A buys XYZ, then buys something similar (same product group) one more time.
Even if it could be an exchange in real life, your EDA showed that in this dataset:
This pattern happens very frequently
Often represents normal shopping
Is not reliably distinguishable from browsing or bulk buying
So including it would corrupt the label.
"""

In [9]:
# Article-only exchange-like label:
# exchange_like = 1 if same customer buys same article again within 14 days (not same-day)

WINDOW_DAYS = 14

tx_lab = tx.sort_values(["customer_id", "article_id", "t_dat"]).reset_index(drop=True)

g = tx_lab.groupby(["customer_id", "article_id"], sort=False)

next_date = g["t_dat"].shift(-1)

days_to_next = (next_date - tx_lab["t_dat"]).dt.days

tx_lab["exchange_like"] = days_to_next.between(1, WINDOW_DAYS).astype("int8")

print("Exchange-like rate (article-only):", tx_lab["exchange_like"].mean())

display(tx_lab[["customer_id", "t_dat", "article_id", "exchange_like"]].head(12))

Exchange-like rate (article-only): 0.02461630251409291


Unnamed: 0,customer_id,t_dat,article_id,exchange_like
0,00000dbacae5abe5e23885899a1fa44253a17956c6d1c3...,2018-12-27,176209023,0
1,00000dbacae5abe5e23885899a1fa44253a17956c6d1c3...,2019-05-25,568601006,0
2,00000dbacae5abe5e23885899a1fa44253a17956c6d1c3...,2019-05-25,568601006,0
3,00000dbacae5abe5e23885899a1fa44253a17956c6d1c3...,2020-09-05,568601043,0
4,00000dbacae5abe5e23885899a1fa44253a17956c6d1c3...,2019-07-25,607642008,0
5,00000dbacae5abe5e23885899a1fa44253a17956c6d1c3...,2018-12-27,625548001,0
6,00000dbacae5abe5e23885899a1fa44253a17956c6d1c3...,2018-12-27,627759010,0
7,00000dbacae5abe5e23885899a1fa44253a17956c6d1c3...,2019-09-18,656719005,0
8,00000dbacae5abe5e23885899a1fa44253a17956c6d1c3...,2019-11-28,694736004,0
9,00000dbacae5abe5e23885899a1fa44253a17956c6d1c3...,2019-05-02,697138006,0


In [None]:
"""
With this target, the model learns:
“Which purchases are likely to trigger a same-item replacement shortly after?”
"""