Radek posted about this [here](https://www.kaggle.com/competitions/h-and-m-personalized-fashion-recommendations/discussion/309220), and linked to a GitHub repo with the code.

I just transferred that code here to Kaggle notebooks, that's all.

In [63]:
import numpy as np

def apk(actual, predicted, k=10):
    """
    Computes the average precision at k.

    This function computes the average prescision at k between two lists of
    items.

    Parameters
    ----------
    actual : list
             A list of elements that are to be predicted (order doesn't matter)
    predicted : list
                A list of predicted elements (order does matter)
    k : int, optional
        The maximum number of predicted elements

    Returns
    -------
    score : double
            The average precision at k over the input lists

    """
    if len(predicted)>k:
        predicted = predicted[:k]

    score = 0.0
    num_hits = 0.0

    for i,p in enumerate(predicted):
        if p in actual and p not in predicted[:i]:
            num_hits += 1.0
            score += num_hits / (i+1.0)

    if not actual:
        return 0.0

    return score / min(len(actual), k)

def mapk(actual, predicted, k=10):
    """
    Computes the mean average precision at k.

    This function computes the mean average prescision at k between two lists
    of lists of items.

    Parameters
    ----------
    actual : list
             A list of lists of elements that are to be predicted 
             (order doesn't matter in the lists)
    predicted : list
                A list of lists of predicted elements
                (order matters in the lists)
    k : int, optional
        The maximum number of predicted elements

    Returns
    -------
    score : double
            The mean average precision at k over the input lists

    """
    return np.mean([apk(a,p,k) for a,p in zip(actual, predicted)])

In [64]:
from sklearn.base import BaseEstimator, TransformerMixin
import numpy as np

# https://www.kaggle.com/c/h-and-m-personalized-fashion-recommendations/discussion/308635
def customer_hex_id_to_int(series):
    return series.str[-16:].apply(hex_id_to_int)

def hex_id_to_int(str):
    return int(str[-16:], 16)

def article_id_str_to_int(series):
    return series.astype('int32')

def article_id_int_to_str(series):
    return '0' + series.astype('str')

class Categorize(BaseEstimator, TransformerMixin):
    def __init__(self, min_examples=0):
        self.min_examples = min_examples
        self.categories = []
        
    def fit(self, X):
        for i in range(X.shape[1]):
            vc = X.iloc[:, i].value_counts()
            self.categories.append(vc[vc > self.min_examples].index.tolist())
        return self

    def transform(self, X):
        data = {X.columns[i]: pd.Categorical(X.iloc[:, i], categories=self.categories[i]).codes for i in range(X.shape[1])}
        return pd.DataFrame(data=data)


def calculate_apk(list_of_preds, list_of_gts):
    # for fast validation this can be changed to operate on dicts of {'cust_id_int': [art_id_int, ...]}
    # using 'data/val_week_purchases_by_cust.pkl'
    apks = []
    for preds, gt in zip(list_of_preds, list_of_gts):
        apks.append(apk(gt, preds, k=12))
    return np.mean(apks)

def eval_sub(sub_csv, skip_cust_with_no_purchases=True):
    sub=pd.read_csv(sub_csv)
    validation_set=pd.read_parquet('data/validation_ground_truth.parquet')

    apks = []

    no_purchases_pattern = []
    for pred, gt in zip(sub.prediction.str.split(), validation_set.prediction.str.split()):
        if skip_cust_with_no_purchases and (gt == no_purchases_pattern): continue
        apks.append(apk(gt, pred, k=12))
    return np.mean(apks)

In [65]:
import pandas as pd
import time

transactions = pd.read_parquet('../input/transactions_train.parquet')
customers = pd.read_parquet('../input/customers.parquet')
articles = pd.read_parquet('../input/articles.parquet')

test_week = transactions.week.max() + 1
transactions = transactions[transactions.week > transactions.week.max() - 10]

# Generating candidates

### Last purchase candidates

In [66]:
c2weeks = transactions.groupby('customer_id')['week'].unique()
c2weeks2shifted_weeks = {}

for c_id, weeks in c2weeks.items():
    c2weeks2shifted_weeks[c_id] = {}
    for i in range(weeks.shape[0]-1):
        c2weeks2shifted_weeks[c_id][weeks[i]] = weeks[i+1]
    c2weeks2shifted_weeks[c_id][weeks[-1]] = test_week
    
candidates_last_purchase = transactions.copy()
weeks = []
for i, (c_id, week) in enumerate(zip(transactions['customer_id'], transactions['week'])):
    weeks.append(c2weeks2shifted_weeks[c_id][week])
candidates_last_purchase.week=weeks

In [67]:
transactions

Unnamed: 0,t_dat,customer_id,article_id,price,sales_channel_id,week
29030503,2020-07-15,272412481300040,778064028,0.008458,1,95
29030504,2020-07-15,272412481300040,816592008,0.016932,1,95
29030505,2020-07-15,272412481300040,621381021,0.033881,1,95
29030506,2020-07-15,272412481300040,817477003,0.025407,1,95
29030507,2020-07-15,272412481300040,899088002,0.025407,1,95
...,...,...,...,...,...,...
31774722,2020-09-22,18439937050817258297,891591003,0.084729,2,104
31774723,2020-09-22,18439937050817258297,869706005,0.084729,2,104
31779097,2020-09-22,18440902715633436014,918894002,0.016932,1,104
31779098,2020-09-22,18440902715633436014,761269001,0.016932,1,104


In [68]:
candidates_last_purchase

Unnamed: 0,t_dat,customer_id,article_id,price,sales_channel_id,week
29030503,2020-07-15,272412481300040,778064028,0.008458,1,96
29030504,2020-07-15,272412481300040,816592008,0.016932,1,96
29030505,2020-07-15,272412481300040,621381021,0.033881,1,96
29030506,2020-07-15,272412481300040,817477003,0.025407,1,96
29030507,2020-07-15,272412481300040,899088002,0.025407,1,96
...,...,...,...,...,...,...
31774722,2020-09-22,18439937050817258297,891591003,0.084729,2,105
31774723,2020-09-22,18439937050817258297,869706005,0.084729,2,105
31779097,2020-09-22,18440902715633436014,918894002,0.016932,1,105
31779098,2020-09-22,18440902715633436014,761269001,0.016932,1,105


In [69]:
# generate personalized candidates for each customer based on three features: age, index_group_name and mean purchase price of the customer.

# first create mean price per customer
mean_price_per_c = transactions.groupby('customer_id')['price'].mean().rename('mean_price_per_c').reset_index()
# figure out the customers preferred index group name by looking at past transactions articles' index_group_name
ttransactions = pd.merge(transactions, articles[['article_id', 'index_group_name']], on='article_id', how='left')
# per customer count the number of purchases of each index_group_name
c2index_group_name = ttransactions.groupby('customer_id')['index_group_name'].value_counts().rename('count').reset_index()
c2index_group_name = c2index_group_name.sort_values('count', ascending=False).groupby('customer_id').head(1).rename(columns={'index_group_name': 'highest_count_ign_per_c'})
# merge the mean_price_per_c and c2index_group_name dataframes
t = pd.merge(c2index_group_name[['customer_id', 'highest_count_ign_per_c']], mean_price_per_c, on='customer_id', how='left')

# per article find the average age of the customers who bought it
transactions_with_age = pd.merge(transactions, customers[['customer_id', 'age']], on='customer_id', how='left')
mean_age_per_a = transactions_with_age.groupby('article_id')['age'].mean().rename('mean_age_per_a').astype('int8').reset_index()

# merge to get the mean_price_per_c, highest_count_ign_per_c and mean_age_per_a columns
transactions_with_age_feat = pd.merge(transactions_with_age, t, on='customer_id', how='left')
transactions_with_3feat = pd.merge(transactions_with_age_feat, mean_age_per_a, on='article_id', how='left')
transactions_with_3feat = pd.merge(transactions_with_3feat, articles[['article_id', 'index_group_name']], on='article_id', how='left')

In [70]:
transactions_with_3feat

Unnamed: 0,t_dat,customer_id,article_id,price,sales_channel_id,week,age,highest_count_ign_per_c,mean_price_per_c,mean_age_per_a,index_group_name
0,2020-07-15,272412481300040,778064028,0.008458,1,95,48,0,0.030597,32,0
1,2020-07-15,272412481300040,816592008,0.016932,1,95,48,0,0.030597,37,2
2,2020-07-15,272412481300040,621381021,0.033881,1,95,48,0,0.030597,35,2
3,2020-07-15,272412481300040,817477003,0.025407,1,95,48,0,0.030597,33,0
4,2020-07-15,272412481300040,899088002,0.025407,1,95,48,0,0.030597,35,2
...,...,...,...,...,...,...,...,...,...,...,...
2762867,2020-09-22,18439937050817258297,891591003,0.084729,2,104,49,0,0.040379,41,0
2762868,2020-09-22,18439937050817258297,869706005,0.084729,2,104,49,0,0.040379,39,0
2762869,2020-09-22,18440902715633436014,918894002,0.016932,1,104,18,0,0.028232,30,0
2762870,2020-09-22,18440902715633436014,761269001,0.016932,1,104,18,0,0.028232,33,0


In [71]:
# print price of transactions_with_3feat grouped by week and then customer_id
transactions_with_3feat.groupby(['week', 'customer_id'])['price'].mean()

week  customer_id         
95    28847241659200          0.029644
      200292573348128         0.034075
      272412481300040         0.022017
      857913002275398         0.019757
      1456826891333599        0.014390
                                ...   
104   18444954504588539615    0.016932
      18445164350380731040    0.042356
      18445340048433064259    0.050831
      18445641720816255142    0.056884
      18446737527580148316    0.023712
Name: price, Length: 755710, dtype: float32

In [72]:
# merge t with customers columns customer_id, age on customer_id
t_merged = pd.merge(t, customers[['customer_id', 'age']], on='customer_id', how='left')

In [73]:
# Create a DataFrame with unique articles
unique_articles = transactions_with_3feat[['t_dat', 'article_id', 'price', 'index_group_name', 'week']].drop_duplicates()
# we only keep latest purchase of each article
unique_articles = unique_articles.sort_values('t_dat', ascending=False).drop_duplicates('article_id').drop(columns=['t_dat'])

chunk_size = 1000  # Define the number of customers to process at once

unique_customers = mean_price_per_c['customer_id'].unique()[:1000]

# Process customers in chunks
chunks = [unique_customers[i:i + chunk_size] for i in range(0, len(unique_customers), chunk_size)]

result_candidates_3feat = pd.DataFrame()  # DataFrame to store final candidates

result_candidates_3feat_chunks = []

for customer_chunk in chunks:
    start = time.time()
    # Cartesian product of unique articles and customers, since we want to choose candidates out of all unique articles for each customer
    candidate_articles = pd.merge(
        unique_articles,
        pd.DataFrame({'customer_id': customer_chunk}),
        how='cross'
    )
    # get the necessary columns to filter out the candidates
    candidate_articles = pd.merge(candidate_articles, t_merged, on='customer_id', how='left')
    candidate_articles = pd.merge(candidate_articles, mean_age_per_a, on='article_id', how='left')
    
    # Select all candidates per customer_id where highest_count_ign_per_c is equal to index_group_name
    candidate_articles = candidate_articles[candidate_articles['highest_count_ign_per_c']==candidate_articles['index_group_name']]
    
    # Calculate price difference for each combination
    candidate_articles['price_difference'] = abs(candidate_articles['mean_price_per_c'] - candidate_articles['price'])
    # Rank articles within each customer group based on price difference
    candidate_articles['price_rank'] = (
        candidate_articles
        .groupby(['week', 'customer_id'])['price_difference']
        .rank(ascending=True, method='min')
    )
    # Select the top 100 candidates for each customer
    top_candidates = (
        candidate_articles
        .sort_values(by=['customer_id', 'week', 'price_rank'])
        .groupby(['week', 'customer_id'])
        .head(50)
    )
    
    # Calculate age difference for each combination
    top_candidates['age_difference'] = abs(top_candidates['age'] - top_candidates['mean_age_per_a'])
    # Rank articles within each customer group based on age difference
    top_candidates['age_rank'] = (
        top_candidates
        .groupby(['week', 'customer_id'])['age_difference']
        .rank(ascending=True, method='min')
    )
    # Select the top 50 candidates for each customer based on age difference
    top_candidates = (
        top_candidates
        .sort_values(by=['customer_id', 'week', 'age_rank'])
        .groupby(['week', 'customer_id'])
        .head(12)
    )
    
    # Append current chunk's candidates to result
    result_candidates_3feat_chunks.append(top_candidates)
    
    print(f'Chunk processed in {time.time() - start:.2f} seconds')
    
# Concatenate all chunks into the final result    
result_candidates_3feat = pd.concat(result_candidates_3feat_chunks, ignore_index=True)

top_candidates_3feat_price = result_candidates_3feat.drop(columns=['price_difference', 'age_difference'])

Chunk processed in 18.00 seconds


In [74]:
top_candidates_3feat_price

Unnamed: 0,article_id,price,index_group_name,week,customer_id,highest_count_ign_per_c,mean_price_per_c,age,mean_age_per_a,price_rank,age_rank
0,754852002,0.050831,0,95,28847241659200,0,0.046085,21,21,21.0,1.0
1,747764002,0.047441,0,95,28847241659200,0,0.046085,21,22,2.0,2.0
2,685944002,0.042356,0,95,28847241659200,0,0.046085,21,20,7.0,2.0
3,746344001,0.033881,0,95,28847241659200,0,0.046085,21,20,36.0,2.0
4,796042001,0.033881,0,95,28847241659200,0,0.046085,21,22,36.0,2.0
...,...,...,...,...,...,...,...,...,...,...,...
119995,761406001,0.022017,2,104,42984229297455520,2,0.021452,18,29,40.0,4.0
119996,874110016,0.021339,2,104,42984229297455520,2,0.021452,18,30,3.0,9.0
119997,855686002,0.021322,2,104,42984229297455520,2,0.021452,18,30,4.0,9.0
119998,894481002,0.021169,2,104,42984229297455520,2,0.021452,18,30,13.0,9.0


In [75]:
top_candidates_3feat_price[top_candidates_3feat_price['customer_id']==272412481300040]

Unnamed: 0,article_id,price,index_group_name,week,customer_id,highest_count_ign_per_c,mean_price_per_c,age,mean_age_per_a,price_rank,age_rank
600,578317003,0.027102,0,95,272412481300040,0,0.030597,48,48,26.0,1.0
601,690803001,0.033881,0,95,272412481300040,0,0.030597,48,47,11.0,2.0
602,559626001,0.025424,0,95,272412481300040,0,0.030597,48,47,45.0,2.0
603,594834018,0.030492,0,95,272412481300040,0,0.030597,48,46,1.0,4.0
604,757333004,0.030492,0,95,272412481300040,0,0.030597,48,50,1.0,4.0
...,...,...,...,...,...,...,...,...,...,...,...
715,874816001,0.030492,0,104,272412481300040,0,0.030597,48,38,3.0,8.0
716,646490020,0.030492,0,104,272412481300040,0,0.030597,48,38,3.0,8.0
717,887761002,0.030492,0,104,272412481300040,0,0.030597,48,38,3.0,8.0
718,859736001,0.030492,0,104,272412481300040,0,0.030597,48,37,3.0,11.0


In [76]:
top_candidates_3feat_prev_week = top_candidates_3feat_price.copy()
top_candidates_3feat_prev_week.week += 1

In [77]:
top_candidates_3feat_prev_week

Unnamed: 0,article_id,price,index_group_name,week,customer_id,highest_count_ign_per_c,mean_price_per_c,age,mean_age_per_a,price_rank,age_rank
0,754852002,0.050831,0,96,28847241659200,0,0.046085,21,21,21.0,1.0
1,747764002,0.047441,0,96,28847241659200,0,0.046085,21,22,2.0,2.0
2,685944002,0.042356,0,96,28847241659200,0,0.046085,21,20,7.0,2.0
3,746344001,0.033881,0,96,28847241659200,0,0.046085,21,20,36.0,2.0
4,796042001,0.033881,0,96,28847241659200,0,0.046085,21,22,36.0,2.0
...,...,...,...,...,...,...,...,...,...,...,...
119995,761406001,0.022017,2,105,42984229297455520,2,0.021452,18,29,40.0,4.0
119996,874110016,0.021339,2,105,42984229297455520,2,0.021452,18,30,3.0,9.0
119997,855686002,0.021322,2,105,42984229297455520,2,0.021452,18,30,4.0,9.0
119998,894481002,0.021169,2,105,42984229297455520,2,0.021452,18,30,13.0,9.0


In [78]:
unique_transactions_ = transactions.groupby(['week', 'customer_id']).head(1).drop(columns=['article_id', 'price']).copy()
# candidates_ = pd.merge(unique_transactions_, top_candidates_3feat_prev_week, on='week',)
candidates_ = top_candidates_3feat_prev_week.copy()
test_set_transactions_ = unique_transactions_.drop_duplicates('customer_id').reset_index(drop=True)
test_set_transactions_.week = test_week

In [79]:
test_set_transactions_

Unnamed: 0,t_dat,customer_id,sales_channel_id,week
0,2020-07-15,272412481300040,1,105
1,2020-07-15,1456826891333599,1,105
2,2020-07-15,2133687643102426,2,105
3,2020-07-15,6010692573790711,1,105
4,2020-07-15,6171059100114610,2,105
...,...,...,...,...
437360,2020-09-22,18410229429441241008,2,105
437361,2020-09-22,18417769707947924979,2,105
437362,2020-09-22,18418054986721795659,2,105
437363,2020-09-22,18421175435799911749,2,105


In [80]:
top_candidates_3feat_prev_week

Unnamed: 0,article_id,price,index_group_name,week,customer_id,highest_count_ign_per_c,mean_price_per_c,age,mean_age_per_a,price_rank,age_rank
0,754852002,0.050831,0,96,28847241659200,0,0.046085,21,21,21.0,1.0
1,747764002,0.047441,0,96,28847241659200,0,0.046085,21,22,2.0,2.0
2,685944002,0.042356,0,96,28847241659200,0,0.046085,21,20,7.0,2.0
3,746344001,0.033881,0,96,28847241659200,0,0.046085,21,20,36.0,2.0
4,796042001,0.033881,0,96,28847241659200,0,0.046085,21,22,36.0,2.0
...,...,...,...,...,...,...,...,...,...,...,...
119995,761406001,0.022017,2,105,42984229297455520,2,0.021452,18,29,40.0,4.0
119996,874110016,0.021339,2,105,42984229297455520,2,0.021452,18,30,3.0,9.0
119997,855686002,0.021322,2,105,42984229297455520,2,0.021452,18,30,4.0,9.0
119998,894481002,0.021169,2,105,42984229297455520,2,0.021452,18,30,13.0,9.0


In [81]:
# candidates__test_week = pd.merge(test_set_transactions_, top_candidates_3feat_prev_week, on='week')
# candidates_ = pd.concat([candidates_, candidates__test_week])
candidates_.drop(columns=['price_rank', 'age_rank', 'highest_count_ign_per_c', 'mean_price_per_c', 'index_group_name', 'mean_age_per_a', 'age'], inplace=True)

In [82]:
candidates_

Unnamed: 0,article_id,price,week,customer_id
0,754852002,0.050831,96,28847241659200
1,747764002,0.047441,96,28847241659200
2,685944002,0.042356,96,28847241659200
3,746344001,0.033881,96,28847241659200
4,796042001,0.033881,96,28847241659200
...,...,...,...,...
119995,761406001,0.022017,105,42984229297455520
119996,874110016,0.021339,105,42984229297455520
119997,855686002,0.021322,105,42984229297455520
119998,894481002,0.021169,105,42984229297455520


In [83]:
transactions['purchased'] = 1
data = pd.concat([transactions, candidates_last_purchase, candidates_])
data.purchased.fillna(0, inplace=True)
data.drop_duplicates(['customer_id', 'article_id', 'week'], inplace=True)

In [84]:
data

Unnamed: 0,t_dat,customer_id,article_id,price,sales_channel_id,week,purchased
29030503,2020-07-15,272412481300040,778064028,0.008458,1.0,95,1.0
29030504,2020-07-15,272412481300040,816592008,0.016932,1.0,95,1.0
29030505,2020-07-15,272412481300040,621381021,0.033881,1.0,95,1.0
29030506,2020-07-15,272412481300040,817477003,0.025407,1.0,95,1.0
29030507,2020-07-15,272412481300040,899088002,0.025407,1.0,95,1.0
...,...,...,...,...,...,...,...
119995,NaT,42984229297455520,761406001,0.022017,,105,0.0
119996,NaT,42984229297455520,874110016,0.021339,,105,0.0
119997,NaT,42984229297455520,855686002,0.021322,,105,0.0
119998,NaT,42984229297455520,894481002,0.021169,,105,0.0


In [85]:
data = pd.merge(
    data,
    top_candidates_3feat_prev_week[['week', 'article_id', 'price_rank', 'age_rank']],
    on=['week', 'article_id'],
    how='left'
)
data = data[data.week != data.week.min()]
data.price_rank.fillna(999, inplace=True)
data.age_rank.fillna(999, inplace=True)

In [86]:
data

Unnamed: 0,t_dat,customer_id,article_id,price,sales_channel_id,week,purchased,price_rank,age_rank
261987,2020-07-22,200292573348128,880777001,0.025407,2.0,96,1.0,999.0,999.0
261988,2020-07-22,200292573348128,784332002,0.025407,2.0,96,1.0,999.0,999.0
261989,2020-07-22,200292573348128,827968001,0.016932,2.0,96,1.0,999.0,999.0
261990,2020-07-22,200292573348128,599580086,0.011847,2.0,96,1.0,999.0,999.0
261991,2020-07-22,248294615847351,720504008,0.031458,1.0,96,1.0,999.0,999.0
...,...,...,...,...,...,...,...,...,...
8690925,NaT,42984229297455520,894481002,0.021169,,105,0.0,13.0,9.0
8690926,NaT,42984229297455520,865482002,0.021169,,105,0.0,1.0,6.0
8690927,NaT,42984229297455520,865482002,0.021169,,105,0.0,1.0,9.0
8690928,NaT,42984229297455520,865482002,0.021169,,105,0.0,1.0,7.0


In [87]:
data = pd.merge(data, articles, on='article_id', how='left')
data = pd.merge(data, customers, on='customer_id', how='left')
data.sort_values(['week', 'customer_id'], inplace=True)
data.reset_index(drop=True, inplace=True)

In [88]:
train = data[data.week != test_week]
test = data[data.week==test_week].drop_duplicates(['customer_id', 'article_id', 'sales_channel_id']).copy()

In [89]:
train

Unnamed: 0,t_dat,customer_id,article_id,price,sales_channel_id,week,purchased,price_rank,age_rank,product_code,...,section_name,garment_group_no,garment_group_name,detail_desc,FN,Active,club_member_status,fashion_news_frequency,age,postal_code
0,2020-07-26,28847241659200,887770001,0.016932,1.0,96,1.0,999.0,999.0,887770,...,10,1010,6,3692,1,1,0,1,21,57896
1,2020-07-18,28847241659200,762846001,0.025407,1.0,96,0.0,999.0,999.0,762846,...,7,1010,6,492,1,1,0,1,21,57896
2,2020-07-18,28847241659200,829308001,0.033881,1.0,96,0.0,999.0,999.0,829308,...,21,1005,0,9082,1,1,0,1,21,57896
3,NaT,28847241659200,754852002,0.050831,,96,0.0,21.0,1.0,754852,...,45,1001,10,17015,1,1,0,1,21,57896
4,NaT,28847241659200,754852002,0.050831,,96,0.0,5.0,1.0,754852,...,45,1001,10,17015,1,1,0,1,21,57896
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
5861233,2020-09-20,18445641720816255142,923037003,0.033881,2.0,104,1.0,999.0,999.0,923037,...,31,1017,4,10043,1,1,0,1,31,147613
5861234,2020-09-21,18446737527580148316,547780001,0.023712,2.0,104,1.0,999.0,999.0,547780,...,27,1002,2,271,1,1,0,1,60,96323
5861235,2020-09-21,18446737527580148316,763988001,0.023712,2.0,104,1.0,999.0,999.0,763988,...,16,1002,2,1107,1,1,0,1,60,96323
5861236,2020-09-21,18446737527580148316,763988003,0.023712,2.0,104,1.0,999.0,999.0,763988,...,16,1002,2,1107,1,1,0,1,60,96323


In [90]:
test

Unnamed: 0,t_dat,customer_id,article_id,price,sales_channel_id,week,purchased,price_rank,age_rank,product_code,...,section_name,garment_group_no,garment_group_name,detail_desc,FN,Active,club_member_status,fashion_news_frequency,age,postal_code
5861238,2020-09-03,28847241659200,925246001,0.128797,2.0,105,0.0,999.0,999.0,925246,...,40,1007,9,27855,1,1,0,1,21,57896
5861239,NaT,28847241659200,779068002,0.045746,,105,0.0,7.0,1.0,779068,...,10,1003,3,15430,1,1,0,1,21,57896
5861245,NaT,28847241659200,754323004,0.045746,,105,0.0,7.0,2.0,754323,...,0,1003,3,4874,1,1,0,1,21,57896
5861252,NaT,28847241659200,721481003,0.045746,,105,0.0,7.0,3.0,721481,...,0,1010,6,11900,1,1,0,1,21,57896
5861259,NaT,28847241659200,904225001,0.045746,,105,0.0,7.0,3.0,904225,...,20,1020,7,26543,1,1,0,1,21,57896
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
8428938,2020-09-06,18446705133201055310,875784002,0.050831,2.0,105,0.0,999.0,999.0,875784,...,15,1003,3,11763,1,1,0,1,60,177867
8428939,2020-09-21,18446737527580148316,547780001,0.023712,2.0,105,0.0,999.0,999.0,547780,...,27,1002,2,271,1,1,0,1,60,96323
8428940,2020-09-21,18446737527580148316,763988001,0.023712,2.0,105,0.0,999.0,999.0,763988,...,16,1002,2,1107,1,1,0,1,60,96323
8428941,2020-09-21,18446737527580148316,763988003,0.023712,2.0,105,0.0,999.0,999.0,763988,...,16,1002,2,1107,1,1,0,1,60,96323


In [91]:
train_baskets = train.groupby(['week', 'customer_id'])['article_id'].count().values

In [92]:
train_baskets

array([511, 480, 999, ...,   2,   7,   4], dtype=int64)

In [93]:
extra_columns = ['price_rank', 'age_rank']
columns_to_use = ['article_id', 'product_type_no', 'graphical_appearance_no', 'colour_group_code', 'perceived_colour_value_id',
'perceived_colour_master_id', 'department_no', 'index_code',
'index_group_no', 'section_no', 'garment_group_no', 'FN', 'Active',
'club_member_status', 'fashion_news_frequency', 'age', 'postal_code']
columns_to_use.extend(extra_columns)

train_X = train[columns_to_use]
train_y = train['purchased']

test_X = test[columns_to_use]

In [94]:
test_X

Unnamed: 0,article_id,product_type_no,graphical_appearance_no,colour_group_code,perceived_colour_value_id,perceived_colour_master_id,department_no,index_code,index_group_no,section_no,garment_group_no,FN,Active,club_member_status,fashion_news_frequency,age,postal_code,price_rank,age_rank
5861238,925246001,262,1010016,9,4,5,1201,0,1,19,1007,1,1,0,1,21,57896,999.0,999.0
5861239,779068002,252,1010016,13,2,11,1610,0,1,6,1003,1,1,0,1,21,57896,7.0,1.0
5861245,754323004,252,1010010,14,4,11,1626,0,1,15,1003,1,1,0,1,21,57896,7.0,2.0
5861252,721481003,258,1010017,73,2,6,1522,0,1,15,1010,1,1,0,1,21,57896,7.0,3.0
5861259,904225001,87,1010016,9,4,5,3529,6,1,64,1020,1,1,0,1,21,57896,7.0,3.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
8428938,875784002,252,1010016,9,4,12,1949,0,1,2,1003,1,1,0,1,60,177867,999.0,999.0
8428939,547780001,272,1010016,9,4,5,1643,1,2,51,1002,1,1,0,1,60,96323,999.0,999.0
8428940,763988001,272,1010016,9,4,5,5833,2,3,26,1002,1,1,0,1,60,96323,999.0,999.0
8428941,763988003,272,1010010,9,4,12,5833,2,3,26,1002,1,1,0,1,60,96323,999.0,999.0


In [95]:
from lightgbm.sklearn import LGBMRanker
ranker = LGBMRanker(
    objective="lambdarank",
    metric="ndcg",
    boosting_type="dart",
    n_estimators=1,
    importance_type='gain',
    verbose=10
)
ranker = ranker.fit(
    train_X,
    train_y,
    group=train_baskets,
)

[LightGBM] [Debug] Dataset::GetMultiBinFromSparseFeatures: sparse rate 0.857265
[LightGBM] [Debug] Dataset::GetMultiBinFromAllFeatures: sparse rate 0.147489
[LightGBM] [Debug] init for col-wise cost 0.042139 seconds, init for row-wise cost 0.055072 seconds
[LightGBM] [Info] Auto-choosing row-wise multi-threading, the overhead of testing was 0.063597 seconds.
You can set `force_row_wise=true` to remove the overhead.
And if memory is not enough, you can set `force_col_wise=true`.
[LightGBM] [Debug] Using Dense Multi-Val Bin
[LightGBM] [Info] Total Bins 1160
[LightGBM] [Info] Number of data points in the train set: 5861238, number of used features: 19
[LightGBM] [Debug] Trained a tree with leaves = 31 and depth = 13


In [96]:
for i in ranker.feature_importances_.argsort()[::-1]:
    print(columns_to_use[i], ranker.feature_importances_[i]/ranker.feature_importances_.sum())

price_rank 0.5767226339082321
article_id 0.1545668302125849
product_type_no 0.10616275726044201
garment_group_no 0.079561831816993
department_no 0.05759610958053573
section_no 0.01456013861489466
graphical_appearance_no 0.00680894920940695
colour_group_code 0.004020749396910688
Active 0.0
FN 0.0
club_member_status 0.0
fashion_news_frequency 0.0
index_group_no 0.0
index_code 0.0
age 0.0
perceived_colour_master_id 0.0
perceived_colour_value_id 0.0
postal_code 0.0
age_rank 0.0


In [97]:
test['preds'] = ranker.predict(test_X)

c_id2predicted_article_ids = test \
    .sort_values(['customer_id', 'preds'], ascending=False) \
    .groupby('customer_id')['article_id'].apply(list).to_dict()

best_last_week = \
    top_candidates_3feat_prev_week[top_candidates_3feat_prev_week.week == top_candidates_3feat_prev_week.week.max()]['article_id'].tolist()

In [98]:
sub = pd.read_csv('../input/sample_submission.csv')
preds = []
for c_id in customer_hex_id_to_int(sub.customer_id):
    pred = c_id2predicted_article_ids.get(c_id, [])
    pred = pred + best_last_week
    preds.append(pred[:12])
preds = [' '.join(['0' + str(p) for p in ps]) for ps in preds]
sub.prediction = preds
sub_name = 'basic_model_submission'
sub.to_csv(f'{sub_name}.csv.gz', index=False)

In [48]:
### Bestsellers candidates
mean_price = transactions.groupby(['week', 'article_id'])['price'].mean()
sales = transactions \
    .groupby('week')['article_id'].value_counts() \
    .groupby('week').rank(method='dense', ascending=False) \
    .groupby('week').head(12).rename('bestseller_rank').astype('int8')
bestsellers_previous_week = pd.merge(sales, mean_price, on=['week', 'article_id']).reset_index()
bestsellers_previous_week.week += 1

In [49]:
# bestsellers_previous_week.pipe(lambda df: df[df['week']==96])
bestsellers_previous_week

Unnamed: 0,week,article_id,bestseller_rank,price
0,96,760084003,1,0.025094
1,96,866731001,2,0.024919
2,96,600886001,3,0.022980
3,96,706016001,4,0.033197
4,96,372860002,5,0.013193
...,...,...,...,...
115,105,915529003,8,0.033439
116,105,915529005,9,0.033417
117,105,448509014,10,0.041630
118,105,762846027,11,0.025005


In [50]:
unique_transactions = transactions.groupby(['week', 'customer_id']).head(1).drop(columns=['article_id', 'price']).copy()
candidates_bestsellers = pd.merge(unique_transactions, bestsellers_previous_week, on='week',)
test_set_transactions = unique_transactions.drop_duplicates('customer_id').reset_index(drop=True)
test_set_transactions.week = test_week

In [51]:
# candidates_bestsellers

In [52]:
# test_set_transactions
# unique_transactions

In [53]:
candidates_bestsellers_test_week = pd.merge(test_set_transactions, bestsellers_previous_week, on='week')
candidates_bestsellers = pd.concat([candidates_bestsellers, candidates_bestsellers_test_week])
candidates_bestsellers.drop(columns='bestseller_rank', inplace=True)

In [54]:
candidates_bestsellers

Unnamed: 0,t_dat,customer_id,sales_channel_id,week,article_id,price
0,2020-07-22,200292573348128,2,96,760084003,0.025094
1,2020-07-22,200292573348128,2,96,866731001,0.024919
2,2020-07-22,200292573348128,2,96,600886001,0.022980
3,2020-07-22,200292573348128,2,96,706016001,0.033197
4,2020-07-22,200292573348128,2,96,372860002,0.013193
...,...,...,...,...,...,...
5248375,2020-09-22,18438270306572912089,1,105,915529003,0.033439
5248376,2020-09-22,18438270306572912089,1,105,915529005,0.033417
5248377,2020-09-22,18438270306572912089,1,105,448509014,0.041630
5248378,2020-09-22,18438270306572912089,1,105,762846027,0.025005


# Combining transactions and candidates / negative examples

In [55]:
transactions['purchased'] = 1
data = pd.concat([transactions, candidates_last_purchase, candidates_bestsellers])
data.purchased.fillna(0, inplace=True)
data.drop_duplicates(['customer_id', 'article_id', 'week'], inplace=True)

In [56]:
data

Unnamed: 0,t_dat,customer_id,article_id,price,sales_channel_id,week,purchased
29030503,2020-07-15,272412481300040,778064028,0.008458,1,95,1.0
29030504,2020-07-15,272412481300040,816592008,0.016932,1,95,1.0
29030505,2020-07-15,272412481300040,621381021,0.033881,1,95,1.0
29030506,2020-07-15,272412481300040,817477003,0.025407,1,95,1.0
29030507,2020-07-15,272412481300040,899088002,0.025407,1,95,1.0
...,...,...,...,...,...,...,...
5248375,2020-09-22,18438270306572912089,915529003,0.033439,1,105,0.0
5248376,2020-09-22,18438270306572912089,915529005,0.033417,1,105,0.0
5248377,2020-09-22,18438270306572912089,448509014,0.041630,1,105,0.0
5248378,2020-09-22,18438270306572912089,762846027,0.025005,1,105,0.0


### Add bestseller information

In [57]:
data = pd.merge(
    data,
    bestsellers_previous_week[['week', 'article_id', 'bestseller_rank']],
    on=['week', 'article_id'],
    how='left'
)
data = data[data.week != data.week.min()]
data.bestseller_rank.fillna(999, inplace=True)
data = pd.merge(data, articles, on='article_id', how='left')
data = pd.merge(data, customers, on='customer_id', how='left')
data.sort_values(['week', 'customer_id'], inplace=True)
data.reset_index(drop=True, inplace=True)

In [58]:
train = data[data.week != test_week]
test = data[data.week==test_week].drop_duplicates(['customer_id', 'article_id', 'sales_channel_id']).copy()

In [59]:
test

Unnamed: 0,t_dat,customer_id,article_id,price,sales_channel_id,week,purchased,bestseller_rank,product_code,prod_name,...,section_name,garment_group_no,garment_group_name,detail_desc,FN,Active,club_member_status,fashion_news_frequency,age,postal_code
11381612,2020-09-03,28847241659200,925246001,0.128797,2,105,0.0,999.0,925246,25454,...,40,1007,9,27855,1,1,0,1,21,57896
11381613,2020-07-18,28847241659200,924243001,0.041535,1,105,0.0,1.0,924243,19190,...,0,1003,3,13007,1,1,0,1,21,57896
11381614,2020-07-18,28847241659200,924243002,0.041877,1,105,0.0,2.0,924243,19190,...,0,1003,3,13007,1,1,0,1,21,57896
11381615,2020-07-18,28847241659200,918522001,0.041435,1,105,0.0,3.0,918522,26372,...,0,1003,3,28633,1,1,0,1,21,57896
11381616,2020-07-18,28847241659200,923758001,0.033462,1,105,0.0,4.0,923758,19359,...,0,1010,6,27869,1,1,0,1,21,57896
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
17991757,2020-09-21,18446737527580148316,915529003,0.033439,2,105,0.0,8.0,915529,7046,...,0,1003,3,10909,1,1,0,1,60,96323
17991758,2020-09-21,18446737527580148316,915529005,0.033417,2,105,0.0,9.0,915529,7046,...,0,1003,3,10909,1,1,0,1,60,96323
17991759,2020-09-21,18446737527580148316,448509014,0.041630,2,105,0.0,10.0,448509,259,...,1,1009,5,255,1,1,0,1,60,96323
17991760,2020-09-21,18446737527580148316,762846027,0.025005,2,105,0.0,11.0,762846,472,...,7,1010,6,492,1,1,0,1,60,96323


In [60]:
train_baskets = train.groupby(['week', 'customer_id'])['article_id'].count().values

In [61]:
train_baskets

array([15, 23, 16, ..., 14, 19, 16], dtype=int64)

In [62]:
extra_columns = []
columns_to_use = ['article_id', 'product_type_no', 'graphical_appearance_no', 'colour_group_code', 'perceived_colour_value_id',
'perceived_colour_master_id', 'department_no', 'index_code',
'index_group_no', 'section_no', 'garment_group_no', 'FN', 'Active',
'club_member_status', 'fashion_news_frequency', 'age', 'postal_code', 'bestseller_rank']
columns_to_use.extend(extra_columns)

In [63]:
train_X = train[columns_to_use]
train_y = train['purchased']

test_X = test[columns_to_use]

In [64]:
test_X

Unnamed: 0,article_id,product_type_no,graphical_appearance_no,colour_group_code,perceived_colour_value_id,perceived_colour_master_id,department_no,index_code,index_group_no,section_no,garment_group_no,FN,Active,club_member_status,fashion_news_frequency,age,postal_code,bestseller_rank
11381612,925246001,262,1010016,9,4,5,1201,0,1,19,1007,1,1,0,1,21,57896,999.0
11381613,924243001,252,1010016,13,1,1,1626,0,1,15,1003,1,1,0,1,21,57896,1.0
11381614,924243002,252,1010016,9,4,5,1626,0,1,15,1003,1,1,0,1,21,57896,2.0
11381615,918522001,252,1010016,11,3,9,1626,0,1,15,1003,1,1,0,1,21,57896,3.0
11381616,923758001,-1,1010016,10,3,9,1522,0,1,15,1010,1,1,0,1,21,57896,4.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
17991757,915529003,252,1010016,9,4,5,1626,0,1,15,1003,1,1,0,1,60,96323,8.0
17991758,915529005,252,1010016,13,1,1,1626,0,1,15,1003,1,1,0,1,60,96323,9.0
17991759,448509014,272,1010016,72,3,2,1747,1,2,53,1009,1,1,0,1,60,96323,10.0
17991760,762846027,259,1010016,13,1,1,1515,0,1,11,1010,1,1,0,1,60,96323,11.0


# Model training

In [65]:
from lightgbm.sklearn import LGBMRanker
ranker = LGBMRanker(
    objective="lambdarank",
    metric="ndcg",
    boosting_type="dart",
    n_estimators=1,
    importance_type='gain',
    verbose=10
)
ranker = ranker.fit(
    train_X,
    train_y,
    group=train_baskets,
)

[LightGBM] [Debug] Dataset::GetMultiBinFromSparseFeatures: sparse rate 0.848850
[LightGBM] [Debug] Dataset::GetMultiBinFromAllFeatures: sparse rate 0.153113
[LightGBM] [Debug] init for col-wise cost 0.085042 seconds, init for row-wise cost 0.098664 seconds
[LightGBM] [Info] Auto-choosing row-wise multi-threading, the overhead of testing was 0.125895 seconds.
You can set `force_row_wise=true` to remove the overhead.
And if memory is not enough, you can set `force_col_wise=true`.
[LightGBM] [Debug] Using Dense Multi-Val Bin
[LightGBM] [Info] Total Bins 1079
[LightGBM] [Info] Number of data points in the train set: 11381612, number of used features: 18
[LightGBM] [Debug] Trained a tree with leaves = 31 and depth = 12


In [66]:
for i in ranker.feature_importances_.argsort()[::-1]:
    print(columns_to_use[i], ranker.feature_importances_[i]/ranker.feature_importances_.sum())

bestseller_rank 0.9989805711820897
age 0.00024136039423249762
article_id 0.0001716082873112681
garment_group_no 0.000144767354190598
department_no 9.631753928857237e-05
product_type_no 9.014783466245737e-05
section_no 7.07609526662605e-05
postal_code 6.79219757232404e-05
club_member_status 6.519780365736126e-05
colour_group_code 5.3587542243445946e-05
perceived_colour_value_id 1.7759133934558557e-05
fashion_news_frequency 0.0
Active 0.0
FN 0.0
index_code 0.0
perceived_colour_master_id 0.0
graphical_appearance_no 0.0
index_group_no 0.0


# Calculate predictions

In [67]:
test['preds'] = ranker.predict(test_X)

c_id2predicted_article_ids = test \
    .sort_values(['customer_id', 'preds'], ascending=False) \
    .groupby('customer_id')['article_id'].apply(list).to_dict()

bestsellers_last_week = \
    bestsellers_previous_week[bestsellers_previous_week.week == bestsellers_previous_week.week.max()]['article_id'].tolist()

# Create submission

In [68]:
sub = pd.read_csv('../input/sample_submission.csv')
preds = []
for c_id in customer_hex_id_to_int(sub.customer_id):
    pred = c_id2predicted_article_ids.get(c_id, [])
    pred = pred + bestsellers_last_week
    preds.append(pred[:12])
preds = [' '.join(['0' + str(p) for p in ps]) for ps in preds]
sub.prediction = preds
sub_name = 'basic_model_submission'
sub.to_csv(f'{sub_name}.csv.gz', index=False)