# Based on the template by Noah Daniels

Additions are in the last few blocks of code. The rest is copied from the template.
I will inject another text block from where my code starts.

In [1]:
import pickle

import numpy as np
import pandas as pd
from lightgbm.sklearn import LGBMRanker

# make external scripts auto reload
%load_ext autoreload
%autoreload 2

from baseline import *

In [2]:
BASE_PATH = '../data/'

# make sure the same data preprocessing as in the radek notebook have been performed
# (see 02 FE/DataProcessingRadek.ipynb)
transactions = pd.read_parquet(BASE_PATH + 'parquet/transactions_train.parquet')
customers = pd.read_parquet(BASE_PATH + 'parquet/customers.parquet')
articles = pd.read_parquet(BASE_PATH + 'parquet/articles.parquet')
sample_submission = pd.read_csv(BASE_PATH + 'csv/sample_submission.csv')

In [3]:
# Candidate generation of Radek notebook
def get_data(data, test_week):
    ### repurchase
    # each week is seen as a basket
    # the items bought in one basket, will be example for the next basket
    # the items bought in the last basket, will be candidates for the test basket
    c2weeks = data.groupby('customer_id')['week'].unique()
    c2weeks2shifted_weeks = {}
    for c_id, weeks in c2weeks.items():
        c2weeks2shifted_weeks[c_id] = {}
        for i in range(weeks.shape[0]-1):
            c2weeks2shifted_weeks[c_id][weeks[i]] = weeks[i+1]
        c2weeks2shifted_weeks[c_id][weeks[-1]] = test_week
    candidates_last_purchase = data.copy()
    weeks = []
    for i, (c_id, week) in enumerate(zip(data['customer_id'], data['week'])):
        weeks.append(c2weeks2shifted_weeks[c_id][week])
    candidates_last_purchase.week=weeks

    ### bestseller
    # if a user bought an item in a given week, the 12 most popular items in the previous week are example for that week
    # the best selling items in the last week are candidates for all users
    mean_price = data \
        .groupby(['week', 'article_id'])['price'].mean()
    sales = data \
        .groupby('week')['article_id'].value_counts() \
        .groupby('week').rank(method='dense', ascending=False) \
        .groupby('week').head(12).rename('bestseller_rank').astype('int8')
    bestsellers_previous_week = pd.merge(sales, mean_price, on=['week', 'article_id']).reset_index()
    bestsellers_previous_week.week += 1
    unique_transactions = data \
        .groupby(['week', 'customer_id']) \
        .head(1) \
        .drop(columns=['article_id', 'price']) \
        .copy()
    candidates_bestsellers = pd.merge(
        unique_transactions,
        bestsellers_previous_week,
        on='week',
    )
    test_set_transactions = unique_transactions.drop_duplicates('customer_id').reset_index(drop=True)
    test_set_transactions.week = test_week
    candidates_bestsellers_test_week = pd.merge(
        test_set_transactions,
        bestsellers_previous_week,
        on='week'
    )
    candidates_bestsellers = pd.concat([candidates_bestsellers, candidates_bestsellers_test_week])
    candidates_bestsellers.drop(columns='bestseller_rank', inplace=True)

    ### combine
    d = data.copy()
    d['purchased'] = True
    
    result = pd.concat([
        d, candidates_last_purchase, candidates_bestsellers
    ])
    result.purchased.fillna(False, inplace=True)
    result.drop_duplicates(['customer_id', 'article_id', 'week'], inplace=True)

    result = pd.merge(
        result,
        bestsellers_previous_week[['week', 'article_id', 'bestseller_rank']],
        on=['week', 'article_id'],
        how='left'
    )
    result = result[result.week != result.week.min()]
    result.bestseller_rank.fillna(999, inplace=True)

    result.sort_values(['week', 'customer_id'], inplace=True)
    result.reset_index(drop=True, inplace=True)
    return result

def get_examples(data, test_week):
    data = get_data(data, test_week)
    return data[data.week != test_week]

def get_candidates(data, test_week):
    data = get_data(data, test_week)
    return data[data.week == test_week]

def add_features(data):
    columns_to_use = [
        'article_id', 
        'product_type_no', 
        'graphical_appearance_no', 
        'colour_group_code', 
        'perceived_colour_value_id',
        'perceived_colour_master_id', 
        'department_no', 
        'index_code',
        'index_group_no', 
        'section_no', 
        'garment_group_no', 
        'FN', 
        'Active',
        'club_member_status', 
        'fashion_news_frequency', 
        'age', 
        'postal_code',
        'bestseller_rank'
    ]

    result = data
    result = pd.merge(result, customers, how='left', on='customer_id')
    result = pd.merge(result, articles, how='left', on='article_id')

    # features from assignment 2 could go here
    customer_avg_price = transactions.groupby('customer_id')['price'].mean().to_frame('preferred_price')
    result = pd.merge(result, customer_avg_price, how="left", on="customer_id")
    
    return result[columns_to_use]

In [4]:
### split into training and testing
# one week is used for testing
# a number of weeks leading up to the test week are used to train the ranker
test_week = 104
num_training_weeks = 10
testing_weeks = np.arange(test_week-num_training_weeks, test_week)
train_data = transactions[transactions.week.isin(testing_weeks)].reset_index(drop=True)

### assemble training data (positive + negative examples)
# each example has at least a customer_id, article_id and whether it was purchased or not (positive/negative)
# add_features extracts and adds features to the examples
train_examples = get_examples(train_data, test_week)
X_train = add_features(train_examples)
Y_train = train_examples['purchased']

### fit ranker
# training_groups tells LGBM that each (week, customer_id) combination is a seperate basket
# !!! it is important that the training_examples are sorted according to week, customer_id for this to work
ranker = LGBMRanker(
    force_row_wise=True,
    objective="lambdarank",
    metric="ndcg",
    boosting_type="dart",
    n_estimators=1,
    importance_type='gain',
    verbose=10
)
train_groups = train_examples.groupby(['week', 'customer_id'])['article_id'].count().values
ranker.fit(X_train, Y_train, group=train_groups)
print_importance(ranker, X_train.columns)

### test
# candidates are generated similarly to the examples, only we don't know whether they are purchased
# the same features are extracted and added
# each candidate is scored by the ranker and predictions are generated using the highest scoring candidates
test_candidates = get_candidates(train_data, test_week)
X_test = add_features(test_candidates)
predictions = get_predictions(test_candidates, X_test, ranker, 12)
print(predictions.head(20))

### evaluate
if test_week < transactions.week.max() + 1:
    # get ground truth data for test week
    purchases = get_purchases(transactions[transactions.week == test_week])
    
    # fill missing prediction for customers in test set with popular items in last week
    # only for customers in test set because only those are evaluated
    popular = transactions[transactions.week == test_week-1].article_id.value_counts().head(12).index.values
    predictions = fill_missing_predictions(predictions, purchases.customer_id, popular)
    
    # calculate score
    score = mean_average_precision(predictions, purchases, 12)
    print(score)

### submit
else:
    # fill missing predictions for all customers with popular items in last week
    # all customers because we don't know which ones will be evaluated
    popular = transactions[transactions.week == test_week-1].article_id.value_counts().head(12).index.values
    predictions = fill_missing_predictions(predictions, customers.customer_id, popular)

    # write submission
    sub = create_submission(predictions)
    sub.to_csv(BASE_PATH + 'sub1.csv.gz', index=False)

[LightGBM] [Debug] Dataset::GetMultiBinFromAllFeatures: sparse rate 0.151109
[LightGBM] [Info] Total Bins 1149
[LightGBM] [Info] Number of data points in the train set: 11557594, number of used features: 18
[LightGBM] [Debug] Trained a tree with leaves = 31 and depth = 10
               bestseller_rank 0.99907
                    article_id 0.00028
                           age 0.00024
              garment_group_no 0.00007
            club_member_status 0.00007
                   postal_code 0.00007
               product_type_no 0.00006
             colour_group_code 0.00005
                 department_no 0.00004
                        Active 0.00002
       graphical_appearance_no 0.00001
     perceived_colour_value_id 0.00001
                            FN 0.00000
        fashion_news_frequency 0.00000
                index_group_no 0.00000
                    section_no 0.00000
    perceived_colour_master_id 0.00000
                    index_code 0.00000
        customer_id      

Scores from using various weeks as the test week:

+ 105: 0.02087 (kaggle)
+ 104: 0.025080605661718477
+ 103: 0.023774082148643252
+ 102: 0.022159069556621
+ 101: 0.01881722188115503
+ 100: 0.019754936922870146

I am pretty sure that my implementation of MAP@12 is correct and these deviations are due to noise in the dataset. The submission generated by this code for week 105 has the same score as the submission from the Radek notebook.

# Start code additions

In [5]:
import pickle
predictions = pickle.load(open('../data/LightGCN/predictions.pkl', 'rb'))
# print(predictions.head(20))

# Change the test week to switch between MAP@12 and generating the submission
test_week = 105
### evaluate
if test_week < transactions.week.max() + 1:
    # get ground truth data for test week
    purchases = get_purchases(transactions[transactions.week == test_week])
    
    # fill missing prediction for customers in test set with popular items in last week
    # only for customers in test set because only those are evaluated
    popular = transactions[transactions.week == test_week-1].article_id.value_counts().head(12).index.values
    predictions = fill_missing_predictions(predictions, purchases.customer_id, popular)
    
    # calculate score
    score = mean_average_precision(predictions, purchases, 12)
    print(score)

### submit
else:
    # fill missing predictions for all customers with popular items in last week
    # all customers because we don't know which ones will be evaluated
    popular = transactions[transactions.week == test_week-1].article_id.value_counts().head(12).index.values
    predictions = fill_missing_predictions(predictions, customers.customer_id, popular)

    sample_submission = pd.read_csv("../data/csv/sample_submission.csv")
    # write submission
    sub = create_submission(predictions, sample_submission)
    # Change the submission file depending on the model
    sub.to_csv(BASE_PATH + 'LightGCN_4layer_15sample.csv.gz', index=False)

In [38]:
popular_items = transactions[transactions.week == 104] \
        .groupby('week')['article_id'].value_counts() \
        .groupby('week').rank(method='dense', ascending=False) \
        .groupby('week').head(12).rename('bestseller_rank').astype('int8')

popular_items = popular_items.reset_index()
print(popular_items.head(20))

    week  article_id  bestseller_rank
0    104   924243001                1
1    104   924243002                2
2    104   918522001                3
3    104   923758001                4
4    104   866731001                5
5    104   909370001                6
6    104   751471001                7
7    104   915529003                8
8    104   915529005                9
9    104   448509014               10
10   104   762846027               11
11   104   714790020               12


In [39]:
popular_items = set(popular_items["article_id"].unique())
print(popular_items)

{714790020, 751471001, 915529003, 762846027, 915529005, 923758001, 918522001, 909370001, 448509014, 924243001, 924243002, 866731001}


In [40]:
total_predictions = len(predictions) * 12
popular_predictions = 0

for (i, row) in predictions.iterrows():
    for prediction in row["prediction"]:
        if prediction in popular_items:
            popular_predictions += 1
            
print(f"{popular_predictions} / {total_predictions} = {popular_predictions/total_predictions * 100}%")

13994201 / 16463760 = 85.00003036973328%


# Important note

During my presentation and in my report, I said the predictions aren't really similar to popular items. While cleaning my code and refactoring a bit to make it cleaner I found a bug in the code that was used for calculating the similarity. 

If we take the top 12 popular items of the test week (the last week of the dataset, week 104) we get that +-85% of the items recommended by LightGCN are indeed popular items. If we take the top 12 popular items of all weeks in the dataset we also get +-85% (slightly higher but not much). 