Radek posted about this [here](https://www.kaggle.com/competitions/h-and-m-personalized-fashion-recommendations/discussion/309220), and linked to a GitHub repo with the code.

I just transferred that code here to Kaggle notebooks, that's all.

In [19]:
from helper_functions import customer_hex_id_to_int, mapk
from helper_functions import recall, recall12, mean_recall, calculate_recall_per_customer_batch, calculate_recall_per_week
from helper_functions import read_parquet_datasets
import pandas as pd
import numpy as np

# Load the three tables through the project helper (implementation lives in
# helper_functions and is not shown in this notebook).
transactions, customers, articles = read_parquet_datasets()

# Most recent week number present in the full transaction history.
latest_week = transactions['week'].max()

# Hold out the purchases of the most recent week as the validation set.
validation = transactions.loc[transactions['week'] == latest_week]

# Candidates are generated for the week immediately after the last observed one.
test_week = latest_week + 1

# Restrict the working set to the 10 most recent weeks.
transactions = transactions.loc[transactions['week'] > latest_week - 10]

# Generating candidates

### Last purchase candidates

In [20]:
# Distinct purchase weeks per customer, in the order they first appear in `transactions`.
c2weeks = transactions.groupby('customer_id')['week'].unique()

# For every customer, map each purchase week to the following entry in that
# customer's week list; the final entry maps to `test_week`.
# NOTE(review): this is "the next purchase week" only if `transactions` is
# sorted chronologically — confirm against read_parquet_datasets().
c2weeks2shifted_weeks = {
    customer: {
        **dict(zip(seen_weeks[:-1], seen_weeks[1:])),
        seen_weeks[-1]: test_week,
    }
    for customer, seen_weeks in c2weeks.items()
}

# Re-label every transaction with its shifted week so that a past purchase
# becomes a candidate for the customer's subsequent purchase week.
candidates_last_purchase = transactions.copy()
weeks = [
    c2weeks2shifted_weeks[customer][purchase_week]
    for customer, purchase_week in zip(transactions['customer_id'], transactions['week'])
]
candidates_last_purchase['week'] = weeks

In [21]:
### Bestsellers candidates
# Average price of each article within each week.
mean_price = transactions.groupby(['week', 'article_id'])['price'].mean()

# How many transaction rows each article has within each week.
weekly_article_counts = transactions.groupby('week')['article_id'].value_counts()

# Dense rank inside each week; the highest count receives rank 1.
weekly_article_ranks = weekly_article_counts.groupby('week').rank(method='dense', ascending=False)

# Keep the first 12 rows of every week and store the rank compactly.
sales = (
    weekly_article_ranks
    .groupby('week').head(12)
    .rename('bestseller_rank')
    .astype('int8')
)

# Attach the weekly mean price to each retained (week, article) pair.
bestsellers_previous_week = pd.merge(sales, mean_price, on=['week', 'article_id']).reset_index()

# Shift by one week: the bestsellers observed in week w become the candidates for week w + 1.
bestsellers_previous_week['week'] += 1

In [22]:
# Show the bestseller candidates table. To inspect a single target week
# (e.g. week 96) instead, swap the last line for the commented expression:
# bestsellers_previous_week.pipe(lambda df: df[df['week']==96])
bestsellers_previous_week

Unnamed: 0,week,article_id,bestseller_rank,price
0,96,760084003,1,0.025094
1,96,866731001,2,0.024919
2,96,600886001,3,0.022980
3,96,706016001,4,0.033197
4,96,372860002,5,0.013193
...,...,...,...,...
115,105,915529003,8,0.033439
116,105,915529005,9,0.033417
117,105,448509014,10,0.041630
118,105,762846027,11,0.025005


In [23]:
# Bestseller baseline for the validation week: every customer in `validation`
# is assigned the same list of bestseller candidates for that week.
# (Per the original author, this is meant to be analogous to the
# calculate_recall_per_customer_batch(...) helper.)

# Ground truth: the articles each customer bought in the latest validation week.
validation_week = validation['week'].max()
actual_purchases_last_week = (
    validation[validation['week'] == validation_week]
    .groupby('customer_id')['article_id']
    .apply(list)
)

# Step 1: Extract the bestselling articles listed for the validation week.
# `bestsellers_previous_week.week` was shifted by +1 when it was built, so the
# rows labelled `validation_week` hold the bestsellers of the week before it.
bestsellers_last_week = bestsellers_previous_week[bestsellers_previous_week['week'] == validation_week]['article_id']

# Step 2: Pair every validation customer with every bestseller article.
# The frame is deliberately NOT named `customers`, so the customers table
# loaded by read_parquet_datasets() is not overwritten. The repeat count is
# derived from the actual number of bestsellers instead of a hard-coded 12, so
# the two columns always have matching lengths.
validation_customer_ids = validation['customer_id'].unique()
bestseller_articles = bestsellers_last_week.tolist()
customer_bestsellers = pd.DataFrame(
    {'customer_id': np.repeat(validation_customer_ids, len(bestseller_articles))}
)
customer_bestsellers['article_id'] = np.tile(bestseller_articles, len(validation_customer_ids))

# Step 3: Collapse to one list of predicted articles per customer.
predicted_bestsellers_last_week = customer_bestsellers.groupby('customer_id')['article_id'].apply(list)

In [24]:
# Display the per-customer lists of articles actually bought (the ground truth for recall).
actual_purchases_last_week

customer_id
1402273113592184                                   [885951001, 611415001]
1827730561464445                                   [918603001, 921380001]
1951136007097426                                              [778745010]
2639747769247776                                              [819547001]
3177658828628418                                   [869331006, 866731001]
                                              ...                        
18444954504588539615                                          [903062001]
18445164350380731040                                          [730683050]
18445340048433064259                                          [714790028]
18445641720816255142    [898713001, 909014001, 919365008, 827635001, 8...
18446737527580148316         [547780001, 763988001, 763988003, 547780040]
Name: article_id, Length: 68984, dtype: object

In [25]:
# Display the per-customer predicted lists; every customer receives the same bestseller articles.
predicted_bestsellers_last_week

customer_id
1402273113592184        [909370001, 865799006, 918522001, 924243001, 4...
1827730561464445        [909370001, 865799006, 918522001, 924243001, 4...
1951136007097426        [909370001, 865799006, 918522001, 924243001, 4...
2639747769247776        [909370001, 865799006, 918522001, 924243001, 4...
3177658828628418        [909370001, 865799006, 918522001, 924243001, 4...
                                              ...                        
18444954504588539615    [909370001, 865799006, 918522001, 924243001, 4...
18445164350380731040    [909370001, 865799006, 918522001, 924243001, 4...
18445340048433064259    [909370001, 865799006, 918522001, 924243001, 4...
18445641720816255142    [909370001, 865799006, 918522001, 924243001, 4...
18446737527580148316    [909370001, 865799006, 918522001, 924243001, 4...
Name: article_id, Length: 68984, dtype: object

In [26]:
# Calculate recall between actual purchases and predicted bestsellers for the last week.
# Both arguments are Series indexed by customer_id whose values are lists of article ids.
# NOTE(review): mean_recall comes from helper_functions; how it aligns customers
# and averages is not visible in this notebook.
recall_last_week = mean_recall(actual_purchases_last_week, predicted_bestsellers_last_week)
print("Recall Score for Bestsellers Candidates for last week:", recall_last_week)

Recall Score for Bestsellers Candidates for last week: 0.024726918923118894


In [27]:
# Raise pandas' limit on how many items of a long sequence are pretty-printed
# before the repr is elided with "..."; this only affects display, not data.
pd.options.display.max_seq_items = 2000

In [28]:
# Recall of the "previous week's bestsellers" baseline for each of the last
# `num_weeks` weeks, stored as {week number: mean recall}.
recall_per_week = {}
num_weeks = 5
# `transactions` holds the 10 most recent weeks; only the last `num_weeks` of them are scored below.
validation = transactions

# Loop-invariant values, computed once. The last week is taken from the data
# (as a plain int so dict keys and printed labels stay plain integers) rather
# than being hard-coded, so labels always match the rows actually evaluated.
last_week = int(validation['week'].max())
all_customer_ids = validation['customer_id'].unique()

for week in range(last_week - num_weeks + 1, last_week + 1):
    # Get the purchases for the current week in the validation set
    actual_purchases_current_week = (
        validation[validation['week'] == week]
        .groupby('customer_id')['article_id']
        .apply(list)
    )

    # Extract the bestselling articles listed for the current week
    # (these are the previous week's bestsellers, because the week column of
    # `bestsellers_previous_week` was shifted by +1 when it was built).
    bestsellers_current_week = bestsellers_previous_week[bestsellers_previous_week['week'] == week]['article_id']
    bestseller_articles = bestsellers_current_week.tolist()

    # Pair every customer with every bestseller article. The frame is not named
    # `customers`, so the loaded customers table is left intact, and the repeat
    # count follows the actual number of bestsellers instead of a fixed 12.
    # NOTE(review): predictions are built for every customer in the whole
    # window, not only those who purchased in `week`; whether that affects the
    # score depends on how mean_recall aligns its inputs — confirm.
    customer_bestsellers = pd.DataFrame(
        {'customer_id': np.repeat(all_customer_ids, len(bestseller_articles))}
    )
    customer_bestsellers['article_id'] = np.tile(bestseller_articles, len(all_customer_ids))

    # Create a list of articles for each customer
    predicted_bestsellers_current_week = customer_bestsellers.groupby('customer_id')['article_id'].apply(list)

    print("Week:", week)
    print("predicted_bestsellers_current_week: ", predicted_bestsellers_current_week)

    # Calculate recall between actual purchases and predicted bestsellers for the current week
    recall_current_week = mean_recall(actual_purchases_current_week, predicted_bestsellers_current_week)
    recall_per_week[week] = recall_current_week

print(recall_per_week)

Week: 100
predicted_bestsellers_current_week:  customer_id
28847241659200          [916468003, 812668001, 866731001, 610776002, 8...
41318098387474          [916468003, 812668001, 866731001, 610776002, 8...
116809474287335         [916468003, 812668001, 866731001, 610776002, 8...
200292573348128         [916468003, 812668001, 866731001, 610776002, 8...
248294615847351         [916468003, 812668001, 866731001, 610776002, 8...
                                              ...                        
18446624797007271432    [916468003, 812668001, 866731001, 610776002, 8...
18446630855572834764    [916468003, 812668001, 866731001, 610776002, 8...
18446662237889060501    [916468003, 812668001, 866731001, 610776002, 8...
18446705133201055310    [916468003, 812668001, 866731001, 610776002, 8...
18446737527580148316    [916468003, 812668001, 866731001, 610776002, 8...
Name: article_id, Length: 437365, dtype: object
Week: 101
predicted_bestsellers_current_week:  customer_id
28847241659200      