Radek posted about this [here](https://www.kaggle.com/competitions/h-and-m-personalized-fashion-recommendations/discussion/309220), and linked to a GitHub repo with the code.

I just transferred that code here to Kaggle notebooks, that's all.

In [35]:
import numpy as np

def apk(actual, predicted, k=10):
    """
    Computes the average precision at k for one list of predictions.

    Parameters
    ----------
    actual : list
             The relevant elements (order doesn't matter)
    predicted : list
                The ranked predictions, best first (order does matter);
                only the first ``k`` entries are scored
    k : int, optional
        The maximum number of predicted elements taken into account

    Returns
    -------
    score : double
            Sum of the precision at every rank that holds a first-time hit,
            divided by ``min(len(actual), k)``; 0.0 when ``actual`` is empty

    """
    top_k = predicted[:k] if len(predicted) > k else predicted

    hits = 0.0
    precision_sum = 0.0

    for rank, item in enumerate(top_k, start=1):
        if item not in actual:
            continue
        # A prediction repeated from an earlier rank is not counted again.
        if item in top_k[:rank - 1]:
            continue
        hits += 1.0
        precision_sum += hits / rank

    if not actual:
        return 0.0

    return precision_sum / min(len(actual), k)

def mapk(actual, predicted, k=10):
    """
    Computes the mean average precision at k over several queries.

    Parameters
    ----------
    actual : list
             A list of lists of relevant elements
             (order doesn't matter in the inner lists)
    predicted : list
                A list of lists of ranked predictions, paired with
                ``actual`` by position (order matters in the inner lists)
    k : int, optional
        The maximum number of predicted elements scored per query

    Returns
    -------
    score : double
            The mean of ``apk`` over all (actual, predicted) pairs

    """
    per_query_scores = []
    for relevant, ranked in zip(actual, predicted):
        per_query_scores.append(apk(relevant, ranked, k))
    return np.mean(per_query_scores)

In [36]:
from sklearn.base import BaseEstimator, TransformerMixin
import numpy as np

# https://www.kaggle.com/c/h-and-m-personalized-fashion-recommendations/discussion/308635
def customer_hex_id_to_int(series):
    """Convert a Series of hexadecimal customer-id strings to integers.

    Only the last 16 characters of every id are parsed.
    """
    last_16_chars = series.str[-16:]
    return last_16_chars.apply(hex_id_to_int)

def hex_id_to_int(str):
    """Parse the last 16 characters of ``str`` as a base-16 integer.

    The parameter name shadows the built-in ``str``; it is kept so that
    existing keyword callers continue to work.
    """
    hex_tail = str[-16:]
    return int(hex_tail, base=16)

def article_id_str_to_int(series):
    """Cast a Series of article ids to the ``int32`` dtype."""
    as_int32 = series.astype(dtype='int32')
    return as_int32

def article_id_int_to_str(series):
    """Render article ids as strings with a single ``'0'`` prepended."""
    id_text = series.astype('str')
    return '0' + id_text

class Categorize(BaseEstimator, TransformerMixin):
    """Encode every column of a DataFrame as integer category codes.

    ``fit`` records, for each column (by position), the values that occur
    more than ``min_examples`` times, in ``value_counts`` order.
    ``transform`` replaces each value by its position in that list; values
    that were not kept (and missing values) get the code ``-1``.

    Parameters
    ----------
    min_examples : int, optional
        A value is kept as a category only if its count in the fitted
        column is strictly greater than this threshold.
    """
    def __init__(self, min_examples=0):
        self.min_examples = min_examples
        # One list of kept values per fitted column; filled in by ``fit``.
        self.categories = []

    def fit(self, X, y=None):
        """Learn the categories of every column of ``X``.

        Parameters
        ----------
        X : pandas.DataFrame
            Data whose columns are to be encoded.
        y : ignored
            Accepted only for compatibility with the scikit-learn estimator
            API (``fit_transform(X, y)``, pipelines).

        Returns
        -------
        self
        """
        # Rebuild the list from scratch: appending to the previous state would
        # leave stale entries at the front after a second ``fit``, and
        # ``transform`` looks categories up by column position.
        categories = []
        for i in range(X.shape[1]):
            vc = X.iloc[:, i].value_counts()
            categories.append(vc[vc > self.min_examples].index.tolist())
        self.categories = categories
        return self

    def transform(self, X):
        """Return a new DataFrame holding the category codes of ``X``.

        Columns are matched to the fitted categories by position, so ``X``
        must have its columns in the same order as the frame given to ``fit``.
        The result keeps the column names of ``X`` but not its index.
        """
        data = {
            X.columns[i]: pd.Categorical(X.iloc[:, i], categories=self.categories[i]).codes
            for i in range(X.shape[1])
        }
        return pd.DataFrame(data=data)


def calculate_apk(list_of_preds, list_of_gts):
    """Mean of ``apk(..., k=12)`` over positionally paired predictions and ground truths."""
    # for fast validation this can be changed to operate on dicts of {'cust_id_int': [art_id_int, ...]}
    # using 'data/val_week_purchases_by_cust.pkl'
    scores = [apk(gt, preds, k=12) for preds, gt in zip(list_of_preds, list_of_gts)]
    return np.mean(scores)

def eval_sub(sub_csv, skip_cust_with_no_purchases=True):
    """Score a submission CSV against the stored validation ground truth.

    Parameters
    ----------
    sub_csv : path or buffer accepted by ``pd.read_csv``
        Submission with a whitespace-separated ``prediction`` column.
    skip_cust_with_no_purchases : bool, optional
        When true, rows whose ground truth splits to an empty list are left
        out of the mean.

    Returns
    -------
    Mean of ``apk(..., k=12)`` over the scored rows.  Rows of the two files
    are paired by position.
    """
    predicted_lists = pd.read_csv(sub_csv).prediction.str.split()
    truth_lists = pd.read_parquet('data/validation_ground_truth.parquet').prediction.str.split()

    scores = []
    for pred, gt in zip(predicted_lists, truth_lists):
        # An empty list is what splitting an empty prediction string yields.
        if skip_cust_with_no_purchases and gt == []:
            continue
        scores.append(apk(gt, pred, k=12))
    return np.mean(scores)

In [37]:
import pandas as pd

transactions = pd.read_parquet('../input/transactions_train.parquet')
customers = pd.read_parquet('../input/customers.parquet')
articles = pd.read_parquet('../input/articles.parquet')

# The week to predict is the one right after the last week present in the data.
last_observed_week = transactions.week.max()
test_week = last_observed_week + 1

# Keep only the 10 most recent weeks of transactions.
transactions = transactions[transactions.week > last_observed_week - 10]

# Generating candidates

### Last purchase candidates

In [38]:
# For every customer, the array of distinct weeks in which they have a transaction.
# NOTE(review): `unique()` keeps order of appearance; the week-shifting code that
# consumes this treats consecutive entries as consecutive purchase weeks, which
# presumably relies on `transactions` being ordered by date — confirm.
c2weeks = transactions.groupby('customer_id')['week'].unique()

In [39]:
# Sanity check: earliest and latest transaction date found in each week.
transactions.groupby('week')['t_dat'].agg(['min', 'max'])

Unnamed: 0_level_0,min,max
week,Unnamed: 1_level_1,Unnamed: 2_level_1
95,2020-07-15,2020-07-21
96,2020-07-22,2020-07-28
97,2020-07-29,2020-08-04
98,2020-08-05,2020-08-11
99,2020-08-12,2020-08-18
100,2020-08-19,2020-08-25
101,2020-08-26,2020-09-01
102,2020-09-02,2020-09-08
103,2020-09-09,2020-09-15
104,2020-09-16,2020-09-22


In [40]:
# c2weeks

In [41]:
# customer id -> {week with purchases: the next week in that customer's list}
c2weeks2shifted_weeks = {}

for c_id, weeks in c2weeks.items():
    # Pair every week with its successor in the customer's week array ...
    shifted = dict(zip(weeks[:-1], weeks[1:]))
    # ... and send the last listed week to the week being predicted.
    shifted[weeks[-1]] = test_week
    c2weeks2shifted_weeks[c_id] = shifted

In [42]:
# c2weeks2shifted_weeks[28847241659200]
# Summary statistics of the price column, rendered with 8 decimals (hence dtype object).
transactions.price.describe().apply('{:.8f}'.format)

count    2762872.00000000
mean           0.02804788
std            0.01924847
min            0.00016949
25%            0.01693220
50%            0.02540678
75%            0.03388136
max            0.50677967
Name: price, dtype: object

In [43]:
# transactions.price.hist(bins=100)

In [44]:
# customer_ids = []
# average_prices_without_outliers = []

# # Step 1: Group by 'customer_id'
# grouped = transactions.groupby('customer_id')

# # Step 2: Calculate the 25th and 75th percentiles for each customer's prices
# for customer_id, group in grouped:
#     Q1 = group['price'].quantile(0.25)
#     Q3 = group['price'].quantile(0.75)
    
#     # Step 3: Identify and remove outliers for each customer
#     IQR = Q3 - Q1
#     price_threshold_low = Q1 - 1.5 * IQR
#     price_threshold_high = Q3 + 1.5 * IQR
#     filtered_group = group[(group['price'] >= price_threshold_low) & (group['price'] <= price_threshold_high)]
    
#     if not filtered_group.empty:
#         # Add the customer ID and average price without outliers to the lists
#         customer_ids.append(customer_id)
#         average_price = filtered_group['price'].mean()
#         average_prices_without_outliers.append(average_price)

# # Step 4: Create a DataFrame with 'customer_id' and 'price' columns
# clipped_prices = pd.DataFrame({'customer_id': customer_ids, 'price': average_prices_without_outliers})

# # Step 5: Merge the clipped_prices DataFrame with the customers DataFrame
# customers = pd.merge(customers, clipped_prices, on='customer_id', how='left')

In [45]:
# transactions['clipped_avg_price_p_c'] = average_prices_without_outliers
# clipped_prices

In [46]:
# customers.price.describe().apply('{:.8f}'.format)

In [47]:
# customers

In [48]:
# transactions = transactions.groupby('customer_id').apply(lambda df: df[(df.price < df.price.quantile(0.75) + 0.5) & (df.price > df.price.quantile(0.2) - 0.5)])

# transactions.price = transactions.groupby('customer_id')['price'].transform(lambda x: x.clip(x.quantile(0.2) - 0.05, x.quantile(0.75) + 0.05))

In [49]:
# transactions.price.describe().apply('{:.8f}'.format)

In [50]:
# transactions

In [51]:
# Independent copy of the transactions; its `week` column is replaced with the
# shifted weeks in a following cell, turning past purchases into candidates.
candidates_last_purchase = transactions.copy()

In [52]:
# Look up, row by row, the shifted week of each (customer, week) transaction.
weeks = [
    c2weeks2shifted_weeks[c_id][week]
    for c_id, week in zip(transactions['customer_id'], transactions['week'])
]

candidates_last_purchase['week'] = weeks

In [53]:
# candidates_last_purchase[candidates_last_purchase['customer_id']==272412481300040]

In [54]:
# transactions[transactions['customer_id']==272412481300040]

In [55]:
# mean_price_per_c = transactions \
#     .groupby(['week', 'customer_id'])['price'].mean() \
#     .groupby('customer_id').mean() \
#     .rename('mean_price_per_c')

# Per-customer mean and median transaction price, one row per customer.
price_by_customer = transactions.groupby('customer_id')['price']
mean_price_per_c = price_by_customer.mean().rename('mean_price_per_c').reset_index()
median_price_per_c = price_by_customer.median().rename('median_price_per_c').reset_index()

In [56]:
# 25th and 75th percentile of the prices paid by each customer,
# one row per customer with columns Q1_price / Q3_price.
percentiles = (
    transactions
    .groupby('customer_id')['price']
    .quantile([0.25, 0.75])
    .unstack()
    .reset_index()
    .rename(columns={0.25: 'Q1_price', 0.75: 'Q3_price'})
)

In [57]:
# Highest and lowest price seen per customer.
max_price_per_customer = transactions.groupby('customer_id')['price'].max().rename('max_price_per_customer').reset_index()
min_price_per_customer = transactions.groupby('customer_id')['price'].min().rename('min_price_per_customer').reset_index()

# Highest and lowest price seen per article.
max_price_per_article = (
    transactions.groupby('article_id')['price'].max()
    .reset_index()
    .rename(columns={'price': 'max_price_per_article'})
)
min_price_per_article = (
    transactions.groupby('article_id')['price'].min()
    .reset_index()
    .rename(columns={'price': 'min_price_per_article'})
)

In [58]:
# transactions_copy = transactions.copy()
# transactions_copy['t_dat'] = pd.to_datetime(transactions_copy['t_dat'])

# season_mapping = {
#     1: 0, 2: 0, 3: 1,
#     4: 1, 5: 1, 6: 2,
#     7: 2, 8: 2, 9: 3,
#     10: 3, 11: 3, 12: 0
# }

# # Extract the month from 't_dat' and map to seasons
# transactions_copy['month'] = transactions_copy['t_dat'].dt.month
# transactions['season'] = transactions_copy['month'].map(season_mapping)

In [59]:
# Display the per-article maximum price table.
max_price_per_article

Unnamed: 0,article_id,max_price_per_article
0,108775015,0.005068
1,108775044,0.008458
2,110065001,0.006763
3,110065002,0.006085
4,110065011,0.011847
...,...,...
38326,952267001,0.016932
38327,952938001,0.050831
38328,953450001,0.016932
38329,953763001,0.022017


### Bestsellers candidates

In [60]:
# Average selling price of every article within each week.
mean_price = transactions.groupby(['week', 'article_id'])['price'].mean()

In [61]:
# mean_price

In [62]:
# Per week: count the sales of each article, rank the counts (1 = most sold,
# ties share a rank) and keep the first 12 rows of every week.
sales = (
    transactions
    .groupby('week')['article_id'].value_counts()
    .groupby('week').rank(method='dense', ascending=False)
    .groupby('week').head(12)
    .rename('bestseller_rank')
    .astype('int8')
)

In [63]:
# sales

In [64]:
# sales.loc[95]

In [65]:
# Attach the weekly mean price to each bestseller, then move every row one week
# forward so that week W carries the bestsellers observed in week W - 1.
bestsellers_previous_week = (
    pd.merge(sales, mean_price, on=['week', 'article_id'])
    .reset_index()
    .assign(week=lambda df: df['week'] + 1)
)

In [66]:
# Inspect the bestseller rows labelled week 96 (after the +1 shift these are
# the bestsellers computed from week 95).
bestsellers_previous_week.pipe(lambda df: df[df['week']==96])

Unnamed: 0,week,article_id,bestseller_rank,price
0,96,760084003,1,0.025094
1,96,866731001,2,0.024919
2,96,600886001,3,0.02298
3,96,706016001,4,0.033197
4,96,372860002,5,0.013193
5,96,610776002,6,0.008318
6,96,877278002,7,0.025036
7,96,547780003,8,0.024814
8,96,817354001,9,0.021913
9,96,827968001,10,0.016436


In [67]:
# One row per (week, customer): the first transaction of each pair, without the
# article-specific columns.
unique_transactions = (
    transactions
    .groupby(['week', 'customer_id'])
    .head(1)
    .drop(columns=['article_id', 'price'])
    .copy()
)

In [68]:
# unique_transactions

In [69]:
# Displayed only (result is not stored): the first transaction row of every
# (week, customer) pair, with all columns kept.
transactions.drop_duplicates(['week', 'customer_id'])

Unnamed: 0,t_dat,customer_id,article_id,price,sales_channel_id,week
29030503,2020-07-15,272412481300040,778064028,0.008458,1,95
29064059,2020-07-15,1456826891333599,888294001,0.013542,1,95
29067103,2020-07-15,2133687643102426,843642001,0.042356,2,95
29027487,2020-07-15,6010692573790711,857812010,0.039661,1,95
29046403,2020-07-15,6171059100114610,815447007,0.006763,2,95
...,...,...,...,...,...,...
31760188,2020-09-22,18435221511488011015,573085055,0.033881,1,104
31782234,2020-09-22,18436859303155335645,801447001,0.030492,1,104
31787251,2020-09-22,18437941771381362708,907188001,0.050831,2,104
31776022,2020-09-22,18438270306572912089,751471043,0.033881,1,104


In [70]:
# Give every (week, customer) row all bestseller rows stored under the same week.
candidates_bestsellers = unique_transactions.merge(bestsellers_previous_week, on='week')

In [71]:
# One row per customer, relabelled as belonging to the week to predict.
test_set_transactions = (
    unique_transactions
    .drop_duplicates('customer_id')
    .reset_index(drop=True)
    .assign(week=test_week)
)

In [72]:
# test_set_transactions

In [73]:
# Bestseller candidates for the week to predict, for every known customer.
candidates_bestsellers_test_week = test_set_transactions.merge(bestsellers_previous_week, on='week')

In [74]:
# Stack the training-week and test-week bestseller candidates; the rank column
# is dropped here and merged back onto the combined data later.
candidates_bestsellers = pd.concat(
    [candidates_bestsellers, candidates_bestsellers_test_week]
).drop(columns='bestseller_rank')

In [75]:
# candidates_bestsellers

# Combining transactions and candidates / negative examples

In [76]:
# Label every real transaction as a positive example.
# NOTE(review): `transactions` is the result of a boolean filter, so older pandas
# versions may emit SettingWithCopyWarning here; the assignment still takes effect.
transactions['purchased'] = 1

In [77]:
# Real transactions go first, then the generated candidates. In run order the
# candidate frames were built before `purchased` was added to `transactions`,
# so their rows come out of the concat with NaN in that column.
data = pd.concat([transactions, candidates_last_purchase, candidates_bestsellers])

# Mark the candidate rows as negatives. The filled column is assigned back
# instead of calling `data.purchased.fillna(0, inplace=True)`: that chained
# in-place call raises a FutureWarning in pandas 2.x and does not modify `data`
# at all once Copy-on-Write is active, which would leave NaN labels.
data['purchased'] = data['purchased'].fillna(0)

In [78]:
# data

In [79]:
# Keep the first row of each (customer, article, week) triple. Real transactions
# were concatenated ahead of the candidates, so a purchased row wins over a
# candidate row for the same triple.
data = data.drop_duplicates(['customer_id', 'article_id', 'week'])

In [80]:
# Fraction of rows that are real purchases (positives).
data.purchased.mean()

0.13607582749165664

### Add bestseller information

In [81]:
# Left-join the bestseller rank on (week, article); rows whose article was not a
# bestseller for that week get NaN.
bestseller_ranks = bestsellers_previous_week[['week', 'article_id', 'bestseller_rank']]
data = data.merge(bestseller_ranks, on=['week', 'article_id'], how='left')

In [82]:
# data

In [83]:
# Disabled experiments: per-customer price statistics and season as features.
# data = pd.merge(data, mean_price_per_c, on='customer_id', how='left')

# data = pd.merge(data, percentiles[['customer_id', 'Q1_price', 'Q3_price']], on='customer_id', how='left')

# data = pd.merge(data, max_price_per_customer, on='customer_id', how='left')
# data = pd.merge(data, min_price_per_customer, on='customer_id', how='left')

# data = pd.merge(data, transactions_copy[['week', 'customer_id', 'season']], on=['week', 'customer_id'], how='left')

# data = pd.merge(data, median_price_per_c, on='customer_id', how='left')

# Active: attach the highest and lowest price observed for each article.
data = data.merge(max_price_per_article, on='article_id', how='left')
data = data.merge(min_price_per_article, on='article_id', how='left')

In [84]:
# Drop the earliest week (presumably because no previous-week bestsellers exist
# for it). The explicit copy makes `data` an independent frame so the column
# assignment below does not trigger SettingWithCopyWarning.
data = data[data.week != data.week.min()].copy()

# Articles that were not bestsellers get the sentinel rank 999. The filled
# column is assigned back rather than using
# `data.bestseller_rank.fillna(999, inplace=True)`, a chained in-place call
# that is a silent no-op under pandas Copy-on-Write.
data['bestseller_rank'] = data['bestseller_rank'].fillna(999)

In [85]:
# Display the combined transactions + candidates table.
data

Unnamed: 0,t_dat,customer_id,article_id,price,sales_channel_id,week,purchased,bestseller_rank,max_price_per_article,min_price_per_article
261987,2020-07-22,200292573348128,880777001,0.025407,2,96,1.0,999.0,0.025407,0.020085
261988,2020-07-22,200292573348128,784332002,0.025407,2,96,1.0,999.0,0.033881,0.011847
261989,2020-07-22,200292573348128,827968001,0.016932,2,96,1.0,10.0,0.016932,0.003373
261990,2020-07-22,200292573348128,599580086,0.011847,2,96,1.0,999.0,0.015237,0.006763
261991,2020-07-22,248294615847351,720504008,0.031458,1,96,1.0,999.0,0.033881,0.014390
...,...,...,...,...,...,...,...,...,...,...
18253744,2020-09-22,18438270306572912089,915529003,0.033439,1,105,0.0,8.0,0.033881,0.022508
18253745,2020-09-22,18438270306572912089,915529005,0.033417,1,105,0.0,9.0,0.033881,0.014814
18253746,2020-09-22,18438270306572912089,448509014,0.041630,1,105,0.0,10.0,0.042356,0.014390
18253747,2020-09-22,18438270306572912089,762846027,0.025005,1,105,0.0,11.0,0.025407,0.012695


In [86]:
# replace every -1 value in age column of customers dataset with the mean age value
# customers.age.replace(-1, customers.age.mean(), inplace=True)

# customers['age_group'] = pd.cut(customers.age, bins=[0, 20, 40, 60, 80, 100], labels=['1', '2', '3', '4', '5'])
# # One-hot-encode the 'age_group' column
# customers = pd.get_dummies(customers, columns=['age_group'], prefix='age_group')


In [87]:
# Display the customer metadata table.
customers

Unnamed: 0,customer_id,FN,Active,club_member_status,fashion_news_frequency,age,postal_code
0,6883939031699146327,-1,-1,0,0,49,6305
1,11246327431398957306,-1,-1,0,0,25,33726
2,18439897732908966680,-1,-1,0,0,24,3247
3,18352672461570950206,-1,-1,0,0,54,168643
4,18162778555210377306,1,1,0,1,52,168645
...,...,...,...,...,...,...,...
1371975,7551062398649767985,-1,-1,0,0,24,50351
1371976,9305341941720086711,-1,-1,0,0,21,80169
1371977,10160427316885688932,1,1,0,1,21,106737
1371978,2551401172826382186,1,1,0,1,18,111894


In [88]:
# Attach article and customer metadata; left joins keep every row of `data`.
data = data.merge(articles, on='article_id', how='left')
data = data.merge(customers, on='customer_id', how='left')

In [89]:
# correlation = data['article_id'].corr(data['mean_price_per_c'])
# The two results used to be written to the same name, so the max-price
# correlation was discarded immediately. It now has its own name;
# `correlation` still holds the min-price value shown in the next cell.
correlation_max_price = data['article_id'].corr(data['max_price_per_article'])
correlation = data['article_id'].corr(data['min_price_per_article'])

In [90]:
# Correlation between article_id and min_price_per_article (the last value assigned).
correlation

0.2687475812553158

In [91]:
# Order rows by week, then customer, so that each (week, customer) group is a
# contiguous run, and renumber the index.
data = data.sort_values(['week', 'customer_id']).reset_index(drop=True)

In [92]:
# Rows of the week to predict form the test set; everything else is training data.
is_test_week = data.week == test_week
train = data[~is_test_week]
test = (
    data[is_test_week]
    .drop_duplicates(['customer_id', 'article_id', 'sales_channel_id'])
    .copy()
)

In [93]:
# Number of rows (non-null article ids) in every (week, customer) group, in sorted
# key order. Passed to the ranker as `group`, so it relies on `train` having been
# sorted by week and customer_id beforehand.
train_baskets = train.groupby(['week', 'customer_id'])['article_id'].count().values

In [94]:
# Optional additional feature columns, appended after the defaults.
extra_columns = []

# Feature columns fed to the ranker: article attributes, customer attributes
# and the bestseller rank.
columns_to_use = [
    'article_id',
    'product_type_no',
    'graphical_appearance_no',
    'colour_group_code',
    'perceived_colour_value_id',
    'perceived_colour_master_id',
    'department_no',
    'index_code',
    'index_group_no',
    'section_no',
    'garment_group_no',
    'FN',
    'Active',
    'club_member_status',
    'fashion_news_frequency',
    'age',
    'postal_code',
    'bestseller_rank',
]
columns_to_use.extend(extra_columns)

In [95]:
# Feature matrix and binary target for training; features only for the test week.
train_X, train_y = train[columns_to_use], train['purchased']
test_X = test[columns_to_use]

In [96]:
# Display the test-week feature matrix.
test_X

Unnamed: 0,article_id,product_type_no,graphical_appearance_no,colour_group_code,perceived_colour_value_id,perceived_colour_master_id,department_no,index_code,index_group_no,section_no,garment_group_no,FN,Active,club_member_status,fashion_news_frequency,age,postal_code,bestseller_rank
11381612,925246001,262,1010016,9,4,5,1201,0,1,19,1007,1,1,0,1,21,57896,999.0
11381613,924243001,252,1010016,13,1,1,1626,0,1,15,1003,1,1,0,1,21,57896,1.0
11381614,924243002,252,1010016,9,4,5,1626,0,1,15,1003,1,1,0,1,21,57896,2.0
11381615,918522001,252,1010016,11,3,9,1626,0,1,15,1003,1,1,0,1,21,57896,3.0
11381616,923758001,-1,1010016,10,3,9,1522,0,1,15,1010,1,1,0,1,21,57896,4.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
17991757,915529003,252,1010016,9,4,5,1626,0,1,15,1003,1,1,0,1,60,96323,8.0
17991758,915529005,252,1010016,13,1,1,1626,0,1,15,1003,1,1,0,1,60,96323,9.0
17991759,448509014,272,1010016,72,3,2,1747,1,2,53,1009,1,1,0,1,60,96323,10.0
17991760,762846027,259,1010016,13,1,1,1515,0,1,11,1010,1,1,0,1,60,96323,11.0


# Model training

In [97]:
from lightgbm.sklearn import LGBMRanker

In [98]:
# LambdaRank objective evaluated with NDCG, DART boosting, a single tree,
# gain-based feature importances and debug-level logging.
ranker_params = {
    'objective': "lambdarank",
    'metric': "ndcg",
    'boosting_type': "dart",
    'n_estimators': 1,
    'importance_type': 'gain',
    'verbose': 10,
}
ranker = LGBMRanker(**ranker_params)

In [99]:
# Fit the ranker; `group` gives the size of each consecutive (week, customer)
# query group in `train_X`.
ranker = ranker.fit(
    train_X,
    train_y,
    group=train_baskets,
)

[LightGBM] [Debug] Dataset::GetMultiBinFromSparseFeatures: sparse rate 0.848850
[LightGBM] [Debug] Dataset::GetMultiBinFromAllFeatures: sparse rate 0.153113
[LightGBM] [Debug] init for col-wise cost 0.101089 seconds, init for row-wise cost 0.098679 seconds
[LightGBM] [Info] Auto-choosing row-wise multi-threading, the overhead of testing was 0.148236 seconds.
You can set `force_row_wise=true` to remove the overhead.
And if memory is not enough, you can set `force_col_wise=true`.
[LightGBM] [Debug] Using Dense Multi-Val Bin
[LightGBM] [Info] Total Bins 1079
[LightGBM] [Info] Number of data points in the train set: 11381612, number of used features: 18
[LightGBM] [Debug] Trained a tree with leaves = 31 and depth = 12


In [100]:
# Print every feature with its share of the total importance, largest first.
importances = ranker.feature_importances_
total_importance = importances.sum()
for feature_idx in importances.argsort()[::-1]:
    print(columns_to_use[feature_idx], importances[feature_idx] / total_importance)

bestseller_rank 0.9989805711820897
age 0.00024136039423249762
article_id 0.0001716082873112681
garment_group_no 0.000144767354190598
department_no 9.631753928857237e-05
product_type_no 9.014783466245737e-05
section_no 7.07609526662605e-05
postal_code 6.79219757232404e-05
club_member_status 6.519780365736126e-05
colour_group_code 5.3587542243445946e-05
perceived_colour_value_id 1.7759133934558557e-05
fashion_news_frequency 0.0
Active 0.0
FN 0.0
index_code 0.0
perceived_colour_master_id 0.0
graphical_appearance_no 0.0
index_group_no 0.0


# Calculate predictions

In [101]:
# Score every candidate row of the test week.
test['preds'] = ranker.predict(test_X)

# customer id -> article ids ordered from highest to lowest score.
c_id2predicted_article_ids = (
    test
    .sort_values(['customer_id', 'preds'], ascending=False)
    .groupby('customer_id')['article_id']
    .apply(list)
    .to_dict()
)

# Articles stored under the latest week of the bestseller table; used as fallback.
latest_bestseller_week = bestsellers_previous_week.week.max()
bestsellers_last_week = bestsellers_previous_week.loc[
    bestsellers_previous_week.week == latest_bestseller_week, 'article_id'
].tolist()

# Create submission

In [102]:
# Load the sample submission; its customer_id column drives the prediction loop below.
sub = pd.read_csv('../input/sample_submission.csv')

In [103]:
# Build up to 12 article ids per customer of the sample submission: the model's
# ranking first, topped up with last week's bestsellers.
preds = []
for c_id in customer_hex_id_to_int(sub.customer_id):
    pred = c_id2predicted_article_ids.get(c_id, [])
    # De-duplicate while preserving order (dict keys keep insertion order). The
    # same article can occur more than once in the ranked list and again among
    # the bestsellers; repeated predictions earn nothing under `apk` and would
    # only take slots away from other articles.
    pred = list(dict.fromkeys(pred + bestsellers_last_week))
    preds.append(pred[:12])

In [104]:
# Turn each list of integer article ids into one space-separated string, putting
# back the leading '0' that the integer form lacks.
formatted_preds = []
for article_ids in preds:
    id_strings = ['0' + str(article_id) for article_id in article_ids]
    formatted_preds.append(' '.join(id_strings))
preds = formatted_preds
sub.prediction = preds

In [105]:
# Write the submission without the index; the `.gz` suffix makes pandas
# gzip-compress the CSV (compression is inferred from the file name).
sub_name = 'basic_model_submission'
sub.to_csv(f'{sub_name}.csv.gz', index=False)