# 🛍️ H&M Strong Vanilla Baseline

This notebook strengthens the baseline model using a **transactions-only** approach: time-decayed trending items combined with time-decayed co-visitation.

This notebook:
- Uses only `transactions_train.csv` and `sample_submission.csv`.
- Builds a time-decayed co-visitation matrix (behavior-based, not content).
- Uses time-decayed trending items as fallback.

In [1]:
# Import the libraries
# Standard library
import gc
import math
from collections import Counter, defaultdict
from itertools import combinations

# Third-party
import numpy as np
import pandas as pd

print('Successfully loaded')

Successfully loaded


## 1) Parameters (tweak for speed / quality)

In [None]:
# Tunable knobs: they control how much history is used and how relevance is scored.

# History window: number of most recent weeks of transactions to keep.
LAST_WEEKS = 12
# Decay rate in the 1 / (1 + ALPHA * days) recency weighting.
ALPHA = 1.0
# Two purchases by the same customer are paired only if at most this many days apart.
PAIR_DAYS = 14
# Number of strongest co-visited neighbours stored per article.
TOPK_COVISIT = 20
# Length of the trending-items list used as fallback.
TREND_TOPK = 100

print('Params set')

Params set


## 2) Load data

In [3]:

# Read both input files; the purchase date column `t_dat` is parsed to datetimes on load.
print('Loading transactions (this may take a while)...')
df = pd.read_csv(
    'transactions_train/transactions_train.csv',
    parse_dates=['t_dat'],
)
sample_sub = pd.read_csv('sample_submission/sample_submission.csv')

# Report the size of each table as a quick load check.
print('Transactions rows:', df.shape[0])
print('Sample submission rows:', sample_sub.shape[0])


Loading transactions (this may take a while)...
Transactions rows: 31788324
Sample submission rows: 1371980


## 3) Filter recent history for speed & relevance
Keeps only the most recent LAST_WEEKS weeks to improve relevance and reduce computation.

In [None]:
# Restrict the history to the trailing LAST_WEEKS weeks, counted back from the
# newest purchase date in the data (the cutoff day itself is kept).
max_date = df['t_dat'].max()
cutoff = max_date - pd.Timedelta(days=7 * LAST_WEEKS)
df = df.loc[df['t_dat'].ge(cutoff)].copy().reset_index(drop=True)

print('Using transactions from', cutoff.date(), 'to', max_date.date())
print('Filtered transactions rows:', df.shape[0])


Using transactions from 2020-06-30 to 2020-09-22
Filtered transactions rows: 3448116


## 4) Build time-decayed trending list (fallback)
Builds a list of most trending items (recently purchased, weighted by recency) for fallback recommendations.

In [None]:
# Age of every purchase in whole days, measured back from the newest transaction date.
df['days_diff'] = (max_date - df['t_dat']).dt.days
# Hyperbolic decay: weight is 1 on the newest day and shrinks as the purchase ages.
df['time_weight'] = 1.0 / (df['days_diff'] * ALPHA + 1.0)

# Total decayed weight per article, ranked from most to least trending.
trend_scores = (
    df.groupby('article_id')['time_weight']
    .sum()
    .sort_values(ascending=False)
)
# Fallback list: the TREND_TOPK best articles as 10-character zero-padded ids.
trending_items = [
    str(article).zfill(10)
    for article in trend_scores.head(TREND_TOPK).index
]

print('Top trending (sample):', trending_items[:10])


Top trending (sample): ['0751471001', '0448509014', '0918292001', '0924243001', '0918522001', '0915529003', '0866731001', '0714790020', '0706016001', '0924243002']


## 5) Build co-visitation
Creates a time-decayed item-to-item similarity map from customers’ recent purchases — core of co-visitation recommendations.

In [None]:
# Cap on how many of a customer's newest purchases feed pair generation; it bounds
# the quadratic pair loop for very active customers.
MAX_PURCHASES_PER_CUSTOMER = 50

# Collect each customer's purchases as a list of (article_id, purchase_date) tuples.
# The two needed columns are selected explicitly so the grouping column is not part
# of the frame handed to the lambda; operating on the grouping column is what makes
# recent pandas versions emit a deprecation warning for DataFrameGroupBy.apply.
cust_groups = (
    df.groupby('customer_id')[['article_id', 't_dat']]
    .apply(lambda g: list(zip(g['article_id'], g['t_dat'])))
)
print('Unique customers in filtered data:', len(cust_groups))

# article -> neighbour article -> accumulated time-decayed weight
co_vis = defaultdict(lambda: defaultdict(float))

for items in cust_groups:
    # Keep only the newest purchases of very active customers.
    if len(items) > MAX_PURCHASES_PER_CUSTOMER:
        items = sorted(items, key=lambda rec: rec[1], reverse=True)[:MAX_PURCHASES_PER_CUSTOMER]

    # Collapse repeat purchases: remember only the latest date per article.
    art_date = {}
    for a, d in items:
        if a not in art_date or d > art_date[a]:
            art_date[a] = d

    # A single distinct article cannot form a pair.
    if len(art_date) < 2:
        continue

    # Every unordered pair of distinct articles, in the same order as a nested i < j loop.
    for (a, da), (b, db) in combinations(art_date.items(), 2):
        days = abs((da - db).days)
        if days > PAIR_DAYS:
            continue
        # Pairs bought closer together in time contribute more.
        w = 1.0 / (1.0 + ALPHA * days)
        co_vis[a][b] += w
        co_vis[b][a] += w

# Keep only the TOPK_COVISIT heaviest neighbours per article. Neighbour ids are
# stored as 10-character zero-padded strings; the dict keys stay as the raw ids.
co_vis_topk = {
    a: [
        str(int(nb)).zfill(10)
        for nb, _weight in sorted(nbrs.items(), key=lambda kv: -kv[1])[:TOPK_COVISIT]
    ]
    for a, nbrs in co_vis.items()
}

print('Built co-visitation for', len(co_vis_topk), 'articles')

# Free memory. The trailing semicolon stops the notebook from echoing the
# integer returned by gc.collect() as cell output.
del co_vis
gc.collect();


  cust_groups = df.groupby('customer_id').apply(lambda x: list(zip(x['article_id'], x['t_dat'])))


Unique customers in filtered data: 494132
Built co-visitation for 40408 articles


0

## 6) Prepare recent items per customer
Stores each customer’s most recent purchases to start their recommendation list.

In [None]:
# Recent unique purchases per customer, most recent first.
# After the descending date sort, drop_duplicates keeps the first (i.e. newest) row
# of every (customer, article) pair, and groupby preserves that row order per customer.
#
# Article ids are converted to 10-character zero-padded strings for EVERY customer.
# Converting only a sample of customers leaves integer ids in the lists, which then
# never match the string keys of the co-visitation map and cannot be joined into the
# space-separated prediction string.
cust_recent = (
    df.sort_values('t_dat', ascending=False)
    .drop_duplicates(subset=['customer_id', 'article_id'])
    .assign(article_id=lambda d: d['article_id'].astype(int).astype(str).str.zfill(10))
    .groupby('customer_id')['article_id']
    .apply(list)
    .to_dict()
)

print('Sample sizes (first 3 customers):', [len(cust_recent[k]) for k in list(cust_recent.keys())[:3]])


Sample sizes (first 3 customers): [1, 1, 1]


## 7) Generate Recommendations
Defines the recommendation function and applies it to every customer. Each customer's 12 slots are filled in priority order: their own recent purchases, then items co-visited with those purchases, then trending items as a fallback.

In [None]:
# Re-key the neighbour map by 10-character zero-padded string ids.
co_vis_topk = {
    f'{int(article):010d}': neighbours
    for article, neighbours in co_vis_topk.items()
}

def make_recs_for_customer(cust_id, topk=12, recent=None, covisit=None, trending=None):
    """Return up to `topk` distinct article ids recommended for one customer.

    Candidates are taken in priority order and de-duplicated, keeping the first
    occurrence:
      1. the customer's own recent purchases,
      2. the co-visited neighbours of each of those purchases,
      3. the trending items (fallback).

    Parameters
    ----------
    cust_id : hashable
        Key looked up in `recent`.
    topk : int, default 12
        Maximum number of recommendations. A value <= 0 yields an empty list.
    recent : mapping of customer id -> list of article ids, optional
        Defaults to the notebook-level `cust_recent`.
    covisit : mapping of article id -> list of neighbour article ids, optional
        Defaults to the notebook-level `co_vis_topk`.
    trending : iterable of article ids, optional
        Defaults to the notebook-level `trending_items`.

    Returns
    -------
    list
        At most `topk` ids; shorter only when all three sources together hold
        fewer than `topk` distinct ids.
    """
    # Fall back to the notebook globals only when a table is not supplied.
    if recent is None:
        recent = cust_recent
    if covisit is None:
        covisit = co_vis_topk
    if trending is None:
        trending = trending_items

    # An equality test against topk would never fire for topk <= 0.
    if topk <= 0:
        return []

    purchased = recent.get(cust_id, [])

    def candidates():
        # Lazily yield candidates in priority order so the scan stops as soon as
        # enough recommendations have been collected.
        yield from purchased
        for article in purchased:
            yield from covisit.get(article, [])
        yield from trending

    recs = []
    seen = set()
    for item in candidates():
        if item in seen:
            continue
        seen.add(item)
        recs.append(item)
        if len(recs) >= topk:
            break
    return recs

# Build one space-separated prediction string per customer listed in the sample submission.
cust_list = sample_sub['customer_id'].tolist()
preds = []
for cust in cust_list:
    recs = make_recs_for_customer(cust, topk=12)
    # Safety net: top up from the trending list if fewer than 12 ids came back.
    for t in trending_items:
        if len(recs) >= 12:
            break
        if t not in recs:
            recs.append(t)
    preds.append(' '.join(recs[:12]))

submission = pd.DataFrame(dict(customer_id=cust_list, prediction=preds))


## 8) Sanity checks & save submission

In [None]:
# Basic shape and header report for the submission frame.
n_rows, n_cols = submission.shape
print('rows:', n_rows)
print('cols:', n_cols)
print('header:', list(submission.columns))

# Number of whitespace-separated ids in each prediction; both extremes should be 12.
lengths = submission['prediction'].map(lambda pred: len(pred.split()))
print('min tokens:', lengths.min(), 'max tokens:', lengths.max())

# Validate the id format of one prediction string.
def check_padding(pred):
    """Return True when every whitespace-separated token of `pred` is exactly
    10 characters long and consists only of digits (True for an empty string)."""
    for token in pred.split():
        if len(token) != 10 or not token.isdigit():
            return False
    return True

# Run the id-format check over every prediction row and report the overall verdict.
padding_ok = submission['prediction'].map(check_padding).all()
print('All padded correctly?', padding_ok)

print(submission.head())

# Write the submission CSV without the index column.
submission.to_csv('strong_vanilla_submission.csv', index=False)
print("Submission file saved: strong_vanilla_submission.csv")


rows: 1371980
cols: 2
header: ['customer_id', 'prediction']
min tokens: 12 max tokens: 12
All padded correctly? True
                                         customer_id  \
0  00000dbacae5abe5e23885899a1fa44253a17956c6d1c3...   
1  0000423b00ade91418cceaf3b26c6af3dd342b51fd051e...   
2  000058a12d5b43e67d225668fa1f8d618c13dc232df0ca...   
3  00005ca1c9ed5f5146b52ac8639a40ca9d57aeff4d1bd2...   
4  00006413d8573cd20ed7128e53b7b13819fe5cfc2d801f...   

                                          prediction  
0  0568601043 0858856005 0779781015 0762846031 05...  
1  0826211002 0824194002 0873217004 0874113004 05...  
2  0794321007 0805000001 0794321011 0805000007 07...  
3  0751471001 0448509014 0918292001 0924243001 09...  
4  0896152002 0791587015 0927530004 0730683050 07...  
✅ Submission file saved: strong_vanilla_submission.csv
