Radek posted about this [here](https://www.kaggle.com/competitions/h-and-m-personalized-fashion-recommendations/discussion/309220), and linked to a GitHub repo with the code.

I just transferred that code here to Kaggle notebooks, that's all.

In [1]:
import numpy as np
from lightgbm.sklearn import LGBMRanker

def apk(actual, predicted, k=10):
    """
    Average precision at k for a single ranked prediction list.

    Parameters
    ----------
    actual : list
             The relevant elements (order is irrelevant).
    predicted : list
                The ranked predictions (order matters); only the first
                ``k`` entries are scored.
    k : int, optional
        Cut-off applied to ``predicted``.

    Returns
    -------
    score : double
            Sum of precision-at-rank over the ranks holding a first-time hit,
            divided by ``min(len(actual), k)``; 0.0 when ``actual`` is empty.

    """
    if len(predicted) > k:
        predicted = predicted[:k]

    precision_sum = 0.0
    hit_count = 0.0
    already_ranked = []  # predictions at earlier ranks, used to ignore repeats

    for rank, candidate in enumerate(predicted, start=1):
        # a repeated prediction is only credited at its first position
        if candidate in actual and candidate not in already_ranked:
            hit_count += 1.0
            precision_sum += hit_count / rank
        already_ranked.append(candidate)

    if not actual:
        return 0.0

    return precision_sum / min(len(actual), k)

def mapk(actual, predicted, k=10):
    """
    Mean average precision at k over paired lists.

    Parameters
    ----------
    actual : list
             One list of relevant elements per case
             (order inside each list is irrelevant).
    predicted : list
                One ranked prediction list per case
                (order inside each list matters).
    k : int, optional
        Cut-off passed to ``apk`` for every case.

    Returns
    -------
    score : double
            ``np.mean`` of the per-case ``apk`` values; the two inputs are
            paired with ``zip``, so extra entries in the longer one are ignored.

    """
    per_case_scores = []
    for relevant, ranking in zip(actual, predicted):
        per_case_scores.append(apk(relevant, ranking, k))
    return np.mean(per_case_scores)

In [2]:
from sklearn.base import BaseEstimator, TransformerMixin
import numpy as np

# https://www.kaggle.com/c/h-and-m-personalized-fashion-recommendations/discussion/308635
def customer_hex_id_to_int(series):
    """Convert a Series of hexadecimal customer ids to integers.

    Only the last 16 characters of each id are kept and parsed
    (via ``hex_id_to_int``).
    """
    trailing_hex = series.str[-16:]
    return trailing_hex.apply(hex_id_to_int)

def hex_id_to_int(str):
    """Parse the last 16 characters of a hexadecimal string as a base-16 integer."""
    # NOTE: the parameter name shadows the builtin `str`; kept because it is part of the signature.
    trailing_hex = str[-16:]
    return int(trailing_hex, base=16)

def article_id_str_to_int(series):
    """Cast a Series of article ids to the ``int32`` dtype."""
    target_dtype = 'int32'
    return series.astype(target_dtype)

def article_id_int_to_str(series):
    """Render article ids as strings with a single ``'0'`` prepended to each."""
    as_text = series.astype('str')
    return '0' + as_text

class Categorize(BaseEstimator, TransformerMixin):
    """Encode every column of a DataFrame as integer category codes.

    ``fit`` records, per column, the values that occur more than
    ``min_examples`` times; ``transform`` maps each value to the position of
    that value in the recorded list and every other value to ``-1``.

    Parameters
    ----------
    min_examples : int, optional
        A value is kept as a category only if its count in the fitted column
        is strictly greater than this threshold.
    """

    def __init__(self, min_examples=0):
        self.min_examples = min_examples
        # One list of kept values per fitted column, in column order.
        self.categories = []

    def fit(self, X, y=None):
        """Learn the kept categories of every column of ``X``.

        The category lists are rebuilt from scratch on every call. The
        previous implementation appended to the existing list, so fitting a
        second time left the first fit's categories in the positions that
        ``transform`` reads.

        ``y`` is accepted and ignored so the transformer can be used where a
        target is passed along (e.g. ``fit_transform(X, y)``).
        """
        learned = []
        for i in range(X.shape[1]):
            vc = X.iloc[:, i].value_counts()
            learned.append(vc[vc > self.min_examples].index.tolist())
        self.categories = learned
        return self

    def transform(self, X):
        """Return a DataFrame of category codes with the same column names as ``X``.

        Columns are matched to the fitted categories by position. The result
        is built from plain code arrays, so it carries a default index rather
        than the index of ``X``.
        """
        data = {
            X.columns[i]: pd.Categorical(X.iloc[:, i], categories=self.categories[i]).codes
            for i in range(X.shape[1])
        }
        return pd.DataFrame(data=data)


def calculate_apk(list_of_preds, list_of_gts):
    """Mean of ``apk(..., k=12)`` over paired prediction / ground-truth lists."""
    # for fast validation this can be changed to operate on dicts of {'cust_id_int': [art_id_int, ...]}
    # using 'data/val_week_purchases_by_cust.pkl'
    per_customer_scores = [
        apk(bought, ranked, k=12)
        for ranked, bought in zip(list_of_preds, list_of_gts)
    ]
    return np.mean(per_customer_scores)

def eval_sub(sub_csv, skip_cust_with_no_purchases=True):
    """Score a submission CSV against the stored validation ground truth.

    Both files are expected to have a ``prediction`` column of
    whitespace-separated ids; rows are paired by position. Returns the mean
    ``apk`` at k=12. When ``skip_cust_with_no_purchases`` is true, rows whose
    ground truth splits to an empty list are left out of the mean.
    """
    submission = pd.read_csv(sub_csv)
    ground_truth = pd.read_parquet('data/validation_ground_truth.parquet')

    empty_basket = []
    scores = []
    row_pairs = zip(submission.prediction.str.split(), ground_truth.prediction.str.split())
    for predicted_ids, bought_ids in row_pairs:
        if skip_cust_with_no_purchases and (bought_ids == empty_basket):
            continue
        scores.append(apk(bought_ids, predicted_ids, k=12))
    return np.mean(scores)

In [3]:
import pandas as pd

In [4]:
%%time

# Load the transactions, customers and articles tables from the parquet files
# of the `warmup` input dataset (path is relative to the notebook's working directory).
transactions = pd.read_parquet('../input/warmup/transactions_train.parquet')
customers = pd.read_parquet('../input/warmup/customers.parquet')
articles = pd.read_parquet('../input/warmup/articles.parquet')
# The sample submission comes from the competition's own input directory (absolute Kaggle path).
sample_submission = pd.read_csv('/kaggle/input/h-and-m-personalized-fashion-recommendations/sample_submission.csv')
# Disabled alternative: load a sampled subset of the three tables from a local `data/` directory.
# sample = 0.05
# transactions = pd.read_parquet(f'data/transactions_train_sample_{sample}.parquet')
# customers = pd.read_parquet(f'data/customers_sample_{sample}.parquet')
# articles = pd.read_parquet(f'data/articles_train_sample_{sample}.parquet')

CPU times: user 5.57 s, sys: 3.27 s, total: 8.85 s
Wall time: 12.7 s


# Feature engineering
This section derives additional features from the raw data and recodes some existing values, starting with an age-group bucket for each customer.

In [5]:
# define age groups
def get_age_group(age):
    """Map an age to a bucket index from 0 to 6.

    Buckets: <18 -> 0, 18-24 -> 1, 25-34 -> 2, 35-44 -> 3, 45-54 -> 4,
    55-64 -> 5, everything else -> 6. A NaN age fails every comparison and
    therefore also lands in bucket 6.
    """
    # exclusive upper bound of buckets 0..5, in ascending order
    upper_bounds = (18, 25, 35, 45, 55, 65)
    for group, bound in enumerate(upper_bounds):
        if age < bound:
            return group
    return 6

In [6]:
# Show the loaded transactions frame as this cell's output.
transactions

Unnamed: 0,t_dat,customer_id,article_id,price,sales_channel_id,week
25784,2018-09-20,1728846800780188,519773001,0.028458,2,0
25785,2018-09-20,1728846800780188,578472001,0.032525,2,0
5389,2018-09-20,2076973761519164,661795002,0.167797,2,0
5390,2018-09-20,2076973761519164,684080003,0.101678,2,0
47429,2018-09-20,2918879973994241,662980001,0.033881,1,0
...,...,...,...,...,...,...
31774722,2020-09-22,18439937050817258297,891591003,0.084729,2,104
31774723,2020-09-22,18439937050817258297,869706005,0.084729,2,104
31779097,2020-09-22,18440902715633436014,918894002,0.016932,1,104
31779098,2020-09-22,18440902715633436014,761269001,0.016932,1,104


# Making a recall evaluation function

In [7]:
# return the average recall of generated candidates versus the actual bought items
def average_recall(purchases, candidates):
    """Average per-customer recall of ``candidates`` against ``purchases``.

    The two frames are inner-joined on the columns they share. For each
    customer in ``purchases`` the number of distinct matched rows is divided
    by the number of purchase rows (customers without any match count as 0),
    and the mean over customers of the first resulting column is returned.
    """
    matched = pd.merge(purchases, candidates, how='inner').drop_duplicates()
    hits_per_customer = matched.groupby('customer_id').count()
    bought_per_customer = purchases.groupby('customer_id').count()
    # fill_value=0 turns "customer has no matched rows" into a recall of 0
    recall_per_customer = hits_per_customer.div(bought_per_customer, fill_value=0)
    column_means = recall_per_customer.mean()
    return column_means.values[0]

# Generating candidates
TODO: make sure last week's bestsellers can be saved and used as predictions for users who have no candidates.

In [8]:
def candidate_generation(data, test_week):
    """
    Build the labelled candidate set (positives plus generated negatives).

    Candidates come from two sources that end up in the result:
    each customer's purchases re-dated to that customer's next purchase week
    (or to ``test_week`` for the last one), and the previous week's top-12
    articles of the customer's age group. Overall (not age-specific)
    bestsellers are also computed, but only their rank is merged in as a
    feature and their article list is returned as ``filler``.

    Parameters
    ----------
    data : pandas.DataFrame
        Transactions to generate candidates from. The code reads the columns
        ``customer_id``, ``article_id``, ``week``, ``price`` and ``age_group``.
        The frame is not modified.
    test_week : int
        Week for which unlabelled candidates are generated.

    Returns
    -------
    result : pandas.DataFrame
        Actual purchases (``purchased == 1``) and candidates
        (``purchased == 0``), de-duplicated on customer/article/week, with
        ``bestseller_rank`` and ``age_group_bestseller_rank`` columns (999
        where the article was not a bestseller), without the earliest week,
        sorted by week and customer.
    filler : numpy.ndarray
        Up to 12 article ids taken from the most recent week of the overall
        bestseller table.

    Notes
    -----
    Reads the module-level names ``absolute_max_week`` and
    ``test_week_transactions`` and calls ``average_recall`` to print candidate
    recall when ``test_week`` is not beyond ``absolute_max_week``.

    Changes compared with the previous version: every computation now uses the
    ``data`` argument instead of the global ``transactions`` frame, the input
    is no longer mutated (a ``purchased`` column used to be written into it,
    which made a second call label the repurchase candidates as positives),
    and missing values are filled by assignment rather than by
    ``column.fillna(..., inplace=True)``, which has no effect on the frame
    under pandas Copy-on-Write.
    """
    # A label column left over from an earlier run must not leak into the candidates.
    if 'purchased' in data.columns:
        data = data.drop(columns='purchased')

    ################
    ## Repurchase ##
    ################
    # unique() keeps values in order of appearance, so the "next week" mapping
    # presumes each customer's rows are ordered by week -- TODO confirm for the caller's frame.
    c2weeks = data.groupby('customer_id')['week'].unique()
    c2weeks2shifted_weeks = {}
    for c_id, weeks in c2weeks.items():
        # week -> the customer's following purchase week; the last one maps to test_week
        shifted = dict(zip(weeks[:-1], weeks[1:]))
        shifted[weeks[-1]] = test_week
        c2weeks2shifted_weeks[c_id] = shifted
    candidates_last_purchase = data.copy()
    candidates_last_purchase.week = [
        c2weeks2shifted_weeks[c_id][week]
        for c_id, week in zip(data['customer_id'], data['week'])
    ]

    ################
    ## bestseller ##
    ################
    mean_price = data \
        .groupby(['week', 'article_id'])['price'].mean()
    sales = data \
        .groupby('week')['article_id'].value_counts() \
        .groupby('week').rank(method='dense', ascending=False) \
        .groupby('week').head(12).rename('bestseller_rank').astype('int8')
    bestsellers_previous_week = pd.merge(sales, mean_price, on=['week', 'article_id']).reset_index()
    # a week's bestsellers become candidates for the following week
    bestsellers_previous_week.week += 1

    latest_week = bestsellers_previous_week.week.max()
    filler = bestsellers_previous_week[bestsellers_previous_week.week == latest_week].article_id.value_counts().head(12).index.values
    print("=========================================")
    print("The content supposed to be in filler")
    print(filler)
    print("=========================================")
    print("The content in filler")
    print(filler)

    # one row per (week, customer): the customers who were active in each week
    unique_transactions = data \
        .groupby(['week', 'customer_id']) \
        .head(1) \
        .drop(columns=['article_id', 'price']) \
        .copy()
    candidates_bestsellers = pd.merge(
        unique_transactions,
        bestsellers_previous_week,
        on='week',
    )
    # one row per customer, re-dated to the test week
    test_set_transactions = unique_transactions.drop_duplicates('customer_id').reset_index(drop=True)
    test_set_transactions.week = test_week
    candidates_bestsellers_test_week = pd.merge(
        test_set_transactions,
        bestsellers_previous_week,
        on='week'
    )
    candidates_bestsellers = pd.concat([candidates_bestsellers, candidates_bestsellers_test_week])
    candidates_bestsellers.drop(columns='bestseller_rank', inplace=True)

    def _print_recall(label, candidates):
        # Recall can only be measured when the purchases of test_week are known.
        if test_week > absolute_max_week:
            return
        t_purchases = test_week_transactions[test_week_transactions.week == test_week][['customer_id', 'article_id']].drop_duplicates()
        t_candidates = candidates[candidates.week == test_week][['customer_id', 'article_id']].drop_duplicates()
        print(f"Average recall of {label} : {average_recall(t_purchases, t_candidates)}")

    _print_recall("bestsellers", candidates_bestsellers)

    ###################################
    ## Bestseller based on age group ##
    ###################################
    # Mean price per week/age_group/article instead of per week/article
    mean_price_age_group = data \
        .groupby(['week', 'age_group', 'article_id'])['price'].mean()

    # rank the articles by sales within each (week, age_group) and keep the first 12 rows of each
    sales_age_group = data \
        .groupby(['week', 'age_group'])['article_id'].value_counts() \
        .groupby(['week', 'age_group']).rank(method='dense', ascending=False) \
        .groupby(['week', 'age_group']).head(12).rename('age_group_bestseller_rank').astype('int8')

    # now calculate the bestsellers for these week - age_group combos
    bestsellers_previous_week_age_group = pd.merge(sales_age_group, mean_price_age_group, on=['week', 'age_group', 'article_id']).reset_index()
    bestsellers_previous_week_age_group.week += 1

    # unique_transactions / test_set_transactions are reused here: the age-group
    # variant computed exactly the same frames a second time.
    age_group_candidates_bestsellers = pd.merge(
        unique_transactions,
        bestsellers_previous_week_age_group,
        on=['week', 'age_group'],
    )
    age_group_candidates_bestsellers_test_week = pd.merge(
        test_set_transactions,
        bestsellers_previous_week_age_group,
        on=['week', 'age_group'],
    )
    age_group_candidates_bestsellers = pd.concat([age_group_candidates_bestsellers, age_group_candidates_bestsellers_test_week])
    age_group_candidates_bestsellers.drop(columns='age_group_bestseller_rank', inplace=True)

    _print_recall("age group bestsellers", age_group_candidates_bestsellers)

    ###################################################
    # Combine the transactions and negative examples ##
    ###################################################
    # real purchases are the positives; assign() leaves the caller's frame untouched
    positives = data.assign(purchased=1)
    # the index is discarded by the merges below, so it can be renumbered here
    result = pd.concat(
        [positives, candidates_last_purchase, age_group_candidates_bestsellers],
        ignore_index=True,
    )
    # candidate rows carry no label yet -> negatives
    result['purchased'] = result['purchased'].fillna(0)
    # positives come first in the concat, so a purchased row wins over an identical candidate
    result = result.drop_duplicates(['customer_id', 'article_id', 'week'])
    result = pd.merge(
        result,
        bestsellers_previous_week[['week', 'article_id', 'bestseller_rank']],
        on=['week', 'article_id'],
        how='left'
    )
    # merge the data with the bestsellers information from the age_group popularity study
    result = pd.merge(
        result,
        bestsellers_previous_week_age_group[['week', 'age_group', 'article_id', 'age_group_bestseller_rank']],
        on=['week', 'age_group', 'article_id'],
        how='left'
    )
    # 999 marks "not among the bestsellers"
    result['bestseller_rank'] = result['bestseller_rank'].fillna(999)
    result['age_group_bestseller_rank'] = result['age_group_bestseller_rank'].fillna(999)
    # the earliest week has no preceding week to take bestsellers from
    result = result[result.week != result.week.min()]

    result = result.sort_values(['week', 'customer_id']).reset_index(drop=True)
    return result, filler

In [9]:
def add_features(data):
    """Join customer and article attributes onto ``data`` and keep the model columns.

    Uses the module-level ``customers`` and ``articles`` frames. The customer
    join is keyed on ``customer_id`` and ``age_group``, so ``customers`` must
    already contain an ``age_group`` column when this is called.
    """
    with_customers = data.merge(customers, how='left', on=['customer_id', 'age_group'])
    with_articles = with_customers.merge(articles, how='left', on='article_id')

    # features from assignment 2 can go here

    columns_to_use = [
        'article_id', 'product_type_no', 'graphical_appearance_no', 'colour_group_code',
        'perceived_colour_value_id', 'perceived_colour_master_id', 'department_no', 'index_code',
        'index_group_no', 'section_no', 'garment_group_no', 'FN', 'Active',
        'club_member_status', 'fashion_news_frequency', 'age', 'postal_code',
        'bestseller_rank', 'age_group_bestseller_rank', 'age_group',
    ]
    return with_articles[columns_to_use]
    

# use the generation for training and testing

In [10]:
# define the test week and limit the data to a set of previous weeks
test_week = 105
num_training_weeks = 10
# last week present in the loaded transactions; taken before the frame is filtered below
absolute_max_week = transactions.week.max()
print(test_week)
# rows belonging to the test week itself (no rows when test_week is not in the loaded data)
test_week_transactions = transactions[transactions.week == test_week]
# keep only the num_training_weeks weeks directly before test_week; this rebinds `transactions`
transactions = transactions[(transactions.week > test_week - num_training_weeks - 1) & (transactions.week < test_week)].reset_index(drop=True)

# add an `age_group` bucket column to the customers table
customers["age_group"] = customers["age"].apply(get_age_group)
# first take the age groups and the customer ids
age_groups_customers = customers[['customer_id', 'age_group']].drop_duplicates()

# now join them into the transactions to create a new transactions set to work with
# (no `on=` is given, so pandas joins on the columns both frames share, with its default inner join)
transactions = pd.merge(transactions, age_groups_customers)
# now the age_group is included, we will have to change some values and names to ensure this is used

# assemble training data by using positive and negative samples
examples, filler = candidate_generation(transactions, test_week)
print(examples)
# everything except the test week is used for training
train_examples = examples[examples.week != test_week]
train_x = add_features(train_examples)
train_y = train_examples['purchased']

# configure the LightGBM ranker
# NOTE(review): n_estimators=1 trains a single tree -- presumably a quick smoke-test setting; confirm.
ranker_params = dict(
    objective="lambdarank",
    metric="ndcg",
    boosting_type="dart",
    n_estimators=1,
    importance_type='gain',
    verbose=10,
)
ranker = LGBMRanker(**ranker_params)

# one query group per (week, customer): its size is the number of candidate rows.
# This relies on train_examples already being ordered by week and customer_id.
rows_per_query = train_examples.groupby(['week', 'customer_id'])['article_id'].count()
train_groups = rows_per_query.values
ranker.fit(train_x, train_y, group=train_groups)

# print every feature with its share of the total importance, largest first
importances = ranker.feature_importances_
total_importance = importances.sum()
for i in importances.argsort()[::-1]:
    print(train_x.columns[i], importances[i] / total_importance)
    

# testing
# candidates generated for the test week
test_examples_not_copy = examples[examples.week == test_week]
print(test_examples_not_copy)
# work on a copy so the score column is not written into a slice of `examples`
test_examples = test_examples_not_copy.copy()
test_x = add_features(test_examples)

test_examples["score"] = ranker.predict(test_x)

test_examples = test_examples[['customer_id', 'article_id', 'score']]
print(test_examples)

# make the predictions of the customers: the 12 highest-scored articles each, best first
ranked_candidates = test_examples.sort_values(["customer_id", "score"], ascending=False)
top_candidates = ranked_candidates.groupby("customer_id").head(12)
articles_per_customer = top_candidates.groupby("customer_id").article_id.apply(list).reset_index()
predictions = articles_per_customer.rename(columns={"article_id": "prediction"})[["customer_id", "prediction"]]
print(predictions)


### evaluate
# If the purchases of the test week are known (it is not beyond the last loaded
# week) no submission is written. The comparison is `<=` so that it agrees with
# the `not test_week > absolute_max_week` check in candidate_generation: the last
# loaded week itself has known purchases and used to be treated as the target week.
if test_week <= absolute_max_week:
    print("In a previous week right now")

# if the week is our target week
else:
    # customers that received no candidates get the bestseller filler instead
    missing_customers = pd.Series(
        list(set(age_groups_customers.customer_id) - set(predictions.customer_id)),
        name="customer_id",
    )
    # cross join with a one-row Series: every missing customer gets the same filler array
    missing_predictions = pd.merge(
        missing_customers, pd.Series([filler], name="prediction"), how="cross"
    )
    print("====================")
    print(missing_customers)
    print("====================")
    print(filler)
    print("====================")
    print(missing_predictions)
    predictions = pd.concat((predictions, missing_predictions))

    # create a submission
    # from here on `predictions` is a dict: customer_id -> sequence of article ids
    predictions = predictions.set_index("customer_id").prediction.to_dict()
    preds = []
    sub = sample_submission.copy()
    # sample_submission holds hexadecimal customer ids; convert them to look up the dict
    for customer_id in customer_hex_id_to_int(sub.customer_id):
        # each article id is written with a "0" in front, separated by spaces
        preds.append(" ".join(f"0{x}" for x in predictions[customer_id]))
    sub.prediction = preds

    # to csv
    sub_name = 'testing_submission'
    sub.to_csv(f'{sub_name}.csv.gz', index=False)


105
The content supposed to be in filler
[924243001 924243002 918522001 923758001 866731001 909370001 751471001
 915529003 915529005 448509014 762846027 714790020]
The content in filler
[924243001 924243002 918522001 923758001 866731001 909370001 751471001
 915529003 915529005 448509014 762846027 714790020]
              t_dat           customer_id  article_id     price  \
0        2020-07-26        28847241659200   887770001  0.016932   
1        2020-07-18        28847241659200   762846001  0.025407   
2        2020-07-18        28847241659200   829308001  0.033881   
3        2020-07-26        28847241659200   760084003  0.025180   
4        2020-07-26        28847241659200   706016001  0.033148   
...             ...                   ...         ...       ...   
17982673 2020-09-21  18446737527580148316   918522001  0.041050   
17982674 2020-09-21  18446737527580148316   923758001  0.033550   
17982675 2020-09-21  18446737527580148316   910601002  0.040806   
17982676 2020-09-21  

# Calculate predictions

# Create submission