Radek posted about this [here](https://www.kaggle.com/competitions/h-and-m-personalized-fashion-recommendations/discussion/309220), and linked to a GitHub repo with the code.

I just transferred that code here to Kaggle notebooks, that's all.

In [6]:
from helper_functions import customer_hex_id_to_int, mapk
from helper_functions import recall, recall12, mean_recall, calculate_recall_per_customer_batch
from helper_functions import read_parquet_datasets
import pandas as pd
import numpy as np

transactions, customers, articles = read_parquet_datasets()

validation = transactions[transactions.week == transactions.week.max()]

test_week = transactions.week.max() + 1
transactions = transactions[transactions.week > transactions.week.max() - 10]

# Generating candidates

### Last purchase candidates

In [7]:
c2weeks = transactions.groupby('customer_id')['week'].unique()
c2weeks2shifted_weeks = {}

for c_id, weeks in c2weeks.items():
    c2weeks2shifted_weeks[c_id] = {}
    for i in range(weeks.shape[0]-1):
        c2weeks2shifted_weeks[c_id][weeks[i]] = weeks[i+1]
    c2weeks2shifted_weeks[c_id][weeks[-1]] = test_week
    
candidates_last_purchase = transactions.copy()
weeks = []
for i, (c_id, week) in enumerate(zip(transactions['customer_id'], transactions['week'])):
    weeks.append(c2weeks2shifted_weeks[c_id][week])
candidates_last_purchase.week=weeks

In [8]:
### Bestsellers candidates
mean_price = transactions.groupby(['week', 'article_id'])['price'].mean()
sales = transactions \
    .groupby('week')['article_id'].value_counts() \
    .groupby('week').rank(method='dense', ascending=False) \
    .groupby('week').head(12).rename('bestseller_rank').astype('int8')
bestsellers_previous_week = pd.merge(sales, mean_price, on=['week', 'article_id']).reset_index()
bestsellers_previous_week.week += 1

In [9]:
# bestsellers_previous_week.pipe(lambda df: df[df['week']==96])
bestsellers_previous_week

Unnamed: 0,week,article_id,bestseller_rank,price
0,96,760084003,1,0.025094
1,96,866731001,2,0.024919
2,96,600886001,3,0.022980
3,96,706016001,4,0.033197
4,96,372860002,5,0.013193
...,...,...,...,...
115,105,915529003,8,0.033439
116,105,915529005,9,0.033417
117,105,448509014,10,0.041630
118,105,762846027,11,0.025005


In [10]:
# analogous to calculate_recall_per_customer_batch(...) function
# Get the purchases for the last week in the validation set
actual_purchases_last_week = validation[validation['week'] == validation['week'].max()].groupby('customer_id')['article_id'].apply(list)

# Step 1: Extract the 12 bestselling articles for the last week
bestsellers_last_week = bestsellers_previous_week[bestsellers_previous_week['week'] == validation['week'].max()]['article_id']

# Step 2: Create a DataFrame with 'customer_id' and 'article_id' for each customer and the 12 articles
customers = pd.DataFrame({'customer_id': np.repeat(validation['customer_id'].unique(), 12)})
customers['article_id'] = np.tile(bestsellers_last_week.tolist(), len(validation['customer_id'].unique()))

# Step 3: create a list of articles for each customer
predicted_bestsellers_last_week = customers.groupby('customer_id')['article_id'].apply(list)

In [11]:
actual_purchases_last_week

customer_id
1402273113592184                                   [885951001, 611415001]
1827730561464445                                   [918603001, 921380001]
1951136007097426                                              [778745010]
2639747769247776                                              [819547001]
3177658828628418                                   [869331006, 866731001]
                                              ...                        
18444954504588539615                                          [903062001]
18445164350380731040                                          [730683050]
18445340048433064259                                          [714790028]
18445641720816255142    [898713001, 909014001, 919365008, 827635001, 8...
18446737527580148316         [547780001, 763988001, 763988003, 547780040]
Name: article_id, Length: 68984, dtype: object

In [12]:
predicted_bestsellers_last_week

customer_id
1402273113592184        [909370001, 865799006, 918522001, 924243001, 4...
1827730561464445        [909370001, 865799006, 918522001, 924243001, 4...
1951136007097426        [909370001, 865799006, 918522001, 924243001, 4...
2639747769247776        [909370001, 865799006, 918522001, 924243001, 4...
3177658828628418        [909370001, 865799006, 918522001, 924243001, 4...
                                              ...                        
18444954504588539615    [909370001, 865799006, 918522001, 924243001, 4...
18445164350380731040    [909370001, 865799006, 918522001, 924243001, 4...
18445340048433064259    [909370001, 865799006, 918522001, 924243001, 4...
18445641720816255142    [909370001, 865799006, 918522001, 924243001, 4...
18446737527580148316    [909370001, 865799006, 918522001, 924243001, 4...
Name: article_id, Length: 68984, dtype: object

In [13]:
# Calculate recall between actual purchases and predicted bestsellers for the last week
recall_last_week = mean_recall(actual_purchases_last_week, predicted_bestsellers_last_week)

print("Recall Score for Bestsellers Candidates:", recall_last_week)

Recall Score for Bestsellers Candidates: 0.024726918923118894


In [14]:
unique_transactions = transactions.groupby(['week', 'customer_id']).head(1).drop(columns=['article_id', 'price']).copy()
candidates_bestsellers = pd.merge(unique_transactions, bestsellers_previous_week, on='week',)
test_set_transactions = unique_transactions.drop_duplicates('customer_id').reset_index(drop=True)
test_set_transactions.week = test_week

In [15]:
# candidates_bestsellers

In [16]:
# test_set_transactions
# unique_transactions

In [17]:
candidates_bestsellers_test_week = pd.merge(test_set_transactions, bestsellers_previous_week, on='week')
candidates_bestsellers = pd.concat([candidates_bestsellers, candidates_bestsellers_test_week])
candidates_bestsellers.drop(columns='bestseller_rank', inplace=True)

In [18]:
candidates_bestsellers

Unnamed: 0,t_dat,customer_id,sales_channel_id,week,article_id,price
0,2020-07-22,200292573348128,2,96,760084003,0.025094
1,2020-07-22,200292573348128,2,96,866731001,0.024919
2,2020-07-22,200292573348128,2,96,600886001,0.022980
3,2020-07-22,200292573348128,2,96,706016001,0.033197
4,2020-07-22,200292573348128,2,96,372860002,0.013193
...,...,...,...,...,...,...
5248375,2020-09-22,18438270306572912089,1,105,915529003,0.033439
5248376,2020-09-22,18438270306572912089,1,105,915529005,0.033417
5248377,2020-09-22,18438270306572912089,1,105,448509014,0.041630
5248378,2020-09-22,18438270306572912089,1,105,762846027,0.025005


# Combining transactions and candidates / negative examples

In [19]:
transactions['purchased'] = 1
data = pd.concat([transactions, candidates_last_purchase, candidates_bestsellers])
data.purchased.fillna(0, inplace=True)
data.drop_duplicates(['customer_id', 'article_id', 'week'], inplace=True)

In [20]:
data

Unnamed: 0,t_dat,customer_id,article_id,price,sales_channel_id,week,purchased
29030503,2020-07-15,272412481300040,778064028,0.008458,1,95,1.0
29030504,2020-07-15,272412481300040,816592008,0.016932,1,95,1.0
29030505,2020-07-15,272412481300040,621381021,0.033881,1,95,1.0
29030506,2020-07-15,272412481300040,817477003,0.025407,1,95,1.0
29030507,2020-07-15,272412481300040,899088002,0.025407,1,95,1.0
...,...,...,...,...,...,...,...
5248375,2020-09-22,18438270306572912089,915529003,0.033439,1,105,0.0
5248376,2020-09-22,18438270306572912089,915529005,0.033417,1,105,0.0
5248377,2020-09-22,18438270306572912089,448509014,0.041630,1,105,0.0
5248378,2020-09-22,18438270306572912089,762846027,0.025005,1,105,0.0


### Add bestseller information

In [21]:
data = pd.merge(
    data,
    bestsellers_previous_week[['week', 'article_id', 'bestseller_rank']],
    on=['week', 'article_id'],
    how='left'
)
data = data[data.week != data.week.min()]
data.bestseller_rank.fillna(999, inplace=True)
data = pd.merge(data, articles, on='article_id', how='left')
data = pd.merge(data, customers, on='customer_id', how='left')
data.sort_values(['week', 'customer_id'], inplace=True)
data.reset_index(drop=True, inplace=True)

In [22]:
train = data[data.week != test_week]
test = data[data.week==test_week].drop_duplicates(['customer_id', 'article_id', 'sales_channel_id']).copy()

KeyError: Index(['article_id'], dtype='object')

In [None]:
test

In [None]:
train_baskets = train.groupby(['week', 'customer_id'])['article_id'].count().values

In [None]:
train_baskets

In [None]:
extra_columns = []
columns_to_use = ['article_id', 'product_type_no', 'graphical_appearance_no', 'colour_group_code', 'perceived_colour_value_id',
'perceived_colour_master_id', 'department_no', 'index_code',
'index_group_no', 'section_no', 'garment_group_no', 'FN', 'Active',
'club_member_status', 'fashion_news_frequency', 'age', 'postal_code', 'bestseller_rank']
columns_to_use.extend(extra_columns)

In [None]:
train_X = train[columns_to_use]
train_y = train['purchased']

test_X = test[columns_to_use]

In [None]:
test_X

# Model training

In [None]:
from lightgbm.sklearn import LGBMRanker
ranker = LGBMRanker(
    objective="lambdarank",
    metric="ndcg",
    boosting_type="dart",
    n_estimators=1,
    importance_type='gain',
    verbose=10
)
ranker = ranker.fit(
    train_X,
    train_y,
    group=train_baskets,
)

In [None]:
for i in ranker.feature_importances_.argsort()[::-1]:
    print(columns_to_use[i], ranker.feature_importances_[i]/ranker.feature_importances_.sum())

# Calculate predictions

In [None]:
test['preds'] = ranker.predict(test_X)

c_id2predicted_article_ids = test \
    .sort_values(['customer_id', 'preds'], ascending=False) \
    .groupby('customer_id')['article_id'].apply(list).to_dict()

bestsellers_last_week = \
    bestsellers_previous_week[bestsellers_previous_week.week == bestsellers_previous_week.week.max()]['article_id'].tolist()

# Create submission

In [None]:
sub = pd.read_csv('../input/sample_submission.csv')
preds = []
for c_id in customer_hex_id_to_int(sub.customer_id):
    pred = c_id2predicted_article_ids.get(c_id, [])
    pred = pred + bestsellers_last_week
    preds.append(pred[:12])

In [None]:
from tqdm import tqdm
positive_items_val = validation.groupby(['customer_id'])['article_id'].apply(list)
# creating validation set for metrics use case
val_users = positive_items_val.keys()
val_items = []
for i, user in tqdm(enumerate(val_users)):
    val_items.append(positive_items_val[user])

print("Total users in validation:", len(val_users))
print("mAP12 Score on Validation set:", mapk(val_items, preds))
print("recall Score on Validation set:", recall12(val_items, preds))

In [None]:
preds = [' '.join(['0' + str(p) for p in ps]) for ps in preds]
# sub.prediction = preds
# sub_name = 'basic_model_submission'
# sub.to_csv(f'{sub_name}.csv.gz', index=False)