Radek posted about this [here](https://www.kaggle.com/competitions/h-and-m-personalized-fashion-recommendations/discussion/309220), and linked to a GitHub repo with the code.

In [13]:
from helper_functions import mean_recall
from helper_functions import read_parquet_datasets
import pandas as pd
import numpy as np
from typing import Dict, List

In [14]:
# read datasets
transactions, customers, articles = read_parquet_datasets()

validation: pd.DataFrame = transactions[transactions.week == transactions.week.max()]

test_week: int = transactions.week.max() + 1
transactions = transactions[transactions.week > transactions.week.max() - 10]

# Generating candidates

### Last purchase candidates

In [15]:
c2weeks = transactions.groupby('customer_id')['week'].unique()
c2weeks2shifted_weeks = {}

for c_id, weeks in c2weeks.items():
    c2weeks2shifted_weeks[c_id] = {}
    for i in range(weeks.shape[0]-1):
        c2weeks2shifted_weeks[c_id][weeks[i]] = weeks[i+1]
    c2weeks2shifted_weeks[c_id][weeks[-1]] = test_week
    
candidates_last_purchase = transactions.copy()
weeks = []
for i, (c_id, week) in enumerate(zip(transactions['customer_id'], transactions['week'])):
    weeks.append(c2weeks2shifted_weeks[c_id][week])
candidates_last_purchase.week=weeks

### Bestsellers candidates

In [16]:
mean_price = transactions.groupby(['week', 'article_id'])['price'].mean()
sales = transactions \
    .groupby('week')['article_id'].value_counts() \
    .groupby('week').rank(method='dense', ascending=False) \
    .groupby('week').head(12).rename('bestseller_rank').astype('int8')
bestsellers_previous_week = pd.merge(sales, mean_price, on=['week', 'article_id']).reset_index()
bestsellers_previous_week.week += 1

In [17]:
bestsellers_previous_week

Unnamed: 0,week,article_id,bestseller_rank,price
0,96,760084003,1,0.025094
1,96,866731001,2,0.024919
2,96,600886001,3,0.022980
3,96,706016001,4,0.033197
4,96,372860002,5,0.013193
...,...,...,...,...
115,105,915529003,8,0.033439
116,105,915529005,9,0.033417
117,105,448509014,10,0.041630
118,105,762846027,11,0.025005


Above we see 120 articles (because 12 per week) that were the bestsellers in the previous week. This is correct.

### Recall calculation
###### We calculate recall between actual purchases and predicted bestsellers for the last week

In [18]:
# analogous to calculate_recall_per_customer_batch(...) function
# Get the purchases for the last week in the validation set
actual_purchases_last_week_map = validation[validation['week'] == validation['week'].max()].groupby('customer_id')['article_id'].apply(list)
actual_purchases_last_week = actual_purchases_last_week_map.tolist()

# To get bestsellers for the last week, we need to do the following:
# Step 1: Extract the 12 bestselling articles for the last week
bestsellers_last_week = bestsellers_previous_week[bestsellers_previous_week['week'] == validation['week'].max()]['article_id']

# Step 2: Create a DataFrame with 'customer_id' and 'article_id' for each customer and the 12 articles
customers = pd.DataFrame({'customer_id': np.repeat(validation['customer_id'].unique(), 12)})
customers['article_id'] = np.tile(bestsellers_last_week.tolist(), len(validation['customer_id'].unique()))

# Step 3: create a list of articles for each customer
predicted_bestsellers_last_week_map = customers.groupby('customer_id')['article_id'].apply(list)
predicted_bestsellers_last_week = predicted_bestsellers_last_week_map.tolist()

In [19]:
actual_purchases_last_week_map

customer_id
1402273113592184                                   [885951001, 611415001]
1827730561464445                                   [918603001, 921380001]
1951136007097426                                              [778745010]
2639747769247776                                              [819547001]
3177658828628418                                   [869331006, 866731001]
                                              ...                        
18444954504588539615                                          [903062001]
18445164350380731040                                          [730683050]
18445340048433064259                                          [714790028]
18445641720816255142    [898713001, 909014001, 919365008, 827635001, 8...
18446737527580148316         [547780001, 763988001, 763988003, 547780040]
Name: article_id, Length: 68984, dtype: object

In [20]:
predicted_bestsellers_last_week_map

customer_id
1402273113592184        [909370001, 865799006, 918522001, 924243001, 4...
1827730561464445        [909370001, 865799006, 918522001, 924243001, 4...
1951136007097426        [909370001, 865799006, 918522001, 924243001, 4...
2639747769247776        [909370001, 865799006, 918522001, 924243001, 4...
3177658828628418        [909370001, 865799006, 918522001, 924243001, 4...
                                              ...                        
18444954504588539615    [909370001, 865799006, 918522001, 924243001, 4...
18445164350380731040    [909370001, 865799006, 918522001, 924243001, 4...
18445340048433064259    [909370001, 865799006, 918522001, 924243001, 4...
18445641720816255142    [909370001, 865799006, 918522001, 924243001, 4...
18446737527580148316    [909370001, 865799006, 918522001, 924243001, 4...
Name: article_id, Length: 68984, dtype: object

In [21]:
actual_purchases_last_week

[[885951001, 611415001],
 [918603001, 921380001],
 [778745010],
 [819547001],
 [869331006, 866731001],
 [908728002,
  929001001,
  893432005,
  898889001,
  921906005,
  896169005,
  906633001,
  906633001,
  871519008,
  781613006,
  908728003,
  805947006,
  714790020],
 [863583002],
 [788575004,
  788328005,
  797988005,
  889550002,
  888570001,
  888570001,
  927936001,
  933032002,
  812167002],
 [911870009],
 [921266001,
  711053003,
  776179006,
  921298001,
  921298001,
  912579002,
  776179001,
  751664001,
  751664001,
  921298003],
 [879248008, 918522001],
 [717816001,
  717816001,
  768912001,
  889828001,
  768931002,
  889828005,
  540701003,
  787946002,
  237347059,
  785060003,
  596877005],
 [783517006, 852442002, 925124001],
 [863123002, 873276004, 918576001, 751471043],
 [882810001, 851010007, 509669011, 904414003, 906382002],
 [910722001],
 [840260001, 781758001, 781758001],
 [685813001, 685813003, 685813037],
 [934793001, 911034001],
 [917899001, 909915002],
 [88

In [22]:
predicted_bestsellers_last_week

[[909370001,
  865799006,
  918522001,
  924243001,
  448509014,
  751471001,
  809238001,
  918292001,
  762846027,
  809238005,
  673677002,
  923758001],
 [909370001,
  865799006,
  918522001,
  924243001,
  448509014,
  751471001,
  809238001,
  918292001,
  762846027,
  809238005,
  673677002,
  923758001],
 [909370001,
  865799006,
  918522001,
  924243001,
  448509014,
  751471001,
  809238001,
  918292001,
  762846027,
  809238005,
  673677002,
  923758001],
 [909370001,
  865799006,
  918522001,
  924243001,
  448509014,
  751471001,
  809238001,
  918292001,
  762846027,
  809238005,
  673677002,
  923758001],
 [909370001,
  865799006,
  918522001,
  924243001,
  448509014,
  751471001,
  809238001,
  918292001,
  762846027,
  809238005,
  673677002,
  923758001],
 [909370001,
  865799006,
  918522001,
  924243001,
  448509014,
  751471001,
  809238001,
  918292001,
  762846027,
  809238005,
  673677002,
  923758001],
 [909370001,
  865799006,
  918522001,
  924243001,
  4485

A little sanity check: We can see above here (first two prints) that the lengths of the two lists are the same and the customer_ids are at the same place in both lists. In other words, the lists are aligned. This is in order to ensure that the recall calculation is correct.

In [23]:
# Calculate recall between actual purchases and predicted bestsellers for the last week
recall_last_week = mean_recall(actual_purchases_last_week, predicted_bestsellers_last_week)
print("Recall Score for Bestsellers Candidates for last week:", recall_last_week)

Recall Score for Bestsellers Candidates for last week: 0.024726918923118894


###### We calculate recall between actual purchases and predicted bestsellers for the last five weeks

In [24]:
recall_per_week: Dict[int, float] = {}
num_weeks: int = 5 # number of weeks to loop over
validation: pd.DataFrame = transactions # we use all (actually the last 5 weeks) as validation

for i in range(num_weeks, 0, -1):
    # Get the purchases (per customer) for the current week in the validation set
    actual_purchases_current_week: List[List[int]] = validation[validation['week'] == validation['week'].max() - (i - 1)].groupby('customer_id')['article_id'].apply(list).tolist()

    # Extract the 12 bestselling articles for the current week
    bestsellers_current_week = bestsellers_previous_week[bestsellers_previous_week['week'] == validation['week'].max() - (i - 1)]['article_id']

    # Create a DataFrame with 'customer_id' and 'article_id' for each customer and the 12 articles
    customers = pd.DataFrame({'customer_id': np.repeat(validation['customer_id'].unique(), 12)})
    customers['article_id'] = np.tile(bestsellers_current_week.tolist(), len(validation['customer_id'].unique()))

    # Create a list of candidate articles for each customer
    predicted_bestsellers_current_week: List[List[int]] = customers.groupby('customer_id')['article_id'].apply(list).to_list()
    
    print("Week:", 104-(i-1))
    print("predicted_bestsellers_current_week: ", predicted_bestsellers_current_week[0])

    # Calculate recall between actual purchases and predicted bestsellers for the current week
    recall_current_week: float = mean_recall(actual_purchases_current_week, predicted_bestsellers_current_week)
    recall_per_week[104-(i-1)] = recall_current_week # we know that the last week is 104 in the original transactions dataset


Week: 100
predicted_bestsellers_current_week:  [916468003, 812668001, 866731001, 610776002, 896152002, 923460001, 896152001, 751471001, 938182001, 894668003, 706016003, 870328003]
Week: 101
predicted_bestsellers_current_week:  [916468003, 896152003, 896152002, 751471001, 706016001, 918292001, 921906003, 751471043, 706016003, 918292004, 915526002, 920610001]
Week: 102
predicted_bestsellers_current_week:  [898694001, 933706001, 751471001, 915526001, 915529003, 706016001, 918292001, 751471043, 915526002, 915529001, 862970001, 863595006]
Week: 103
predicted_bestsellers_current_week:  [915526001, 751471043, 751471001, 706016001, 919365008, 915529003, 918292001, 863595006, 896152002, 448509014, 909916001, 762846031]
Week: 104
predicted_bestsellers_current_week:  [909370001, 865799006, 918522001, 924243001, 448509014, 751471001, 809238001, 918292001, 762846027, 809238005, 673677002, 923758001]


In [25]:
print(recall_per_week)

{100: 0.01999729628144646, 101: 0.022689908101592778, 102: 0.024923172680216706, 103: 0.02021864403496489, 104: 0.024726918923118894}


We can see that for the bestsellers candidates, the recall is around 0.025 for the last week and around the same for the previous weeks. Some weeks have some overlapping bestsellers.