In [1]:
# Enable IPython's autoreload extension in mode 2 so that edits to the imported
# project modules are picked up on the next cell run, without restarting the kernel.
%load_ext autoreload
%autoreload 2

In [2]:
import numpy as np
import pandas as pd
from lightgbm.sklearn import LGBMRanker

from template.experiment_template import *
from candidate_generation import *

## Load data

In [3]:
BASE_PATH = '../../data/'
# the 5% sample keeps iteration fast; swap in the commented line below for the full dataset
DATA_PATH = BASE_PATH + 'sample_0.05/'
# DATA_PATH = BASE_PATH + 'parquet/'

# The parquet files must have been produced with the same preprocessing as in the
# radek notebook (see radek_preprocessing.ipynb).
transactions, customers, articles = (
    pd.read_parquet(DATA_PATH + file_name)
    for file_name in (
        'transactions_train.parquet',
        'customers.parquet',
        'articles.parquet',
    )
)

# the sample submission always comes from the original (unsampled) competition data
sample_submission = pd.read_csv(BASE_PATH + 'original/sample_submission.csv')

## Settings

In [4]:
# week that is held out: scored against its purchases, or predicted for a submission
test_week = 104
# how many weeks directly before test_week are used to train the ranker
num_training_weeks = 10
# when True, customers without transactions in the training weeks skip the ranker
# and are served candidates directly
handle_cold_customers = True
# number of boosted trees (passed to the ranker as n_estimators)
num_trees = 100

# a test week right after the last recorded week has no ground truth,
# so in that case the notebook writes a submission instead of a validation score
last_recorded_week = transactions.week.max()
making_submission = (last_recorded_week + 1) == test_week

## Split into training and testing

In [5]:
# The test week is held out; the ranker is trained on the `num_training_weeks`
# weeks that immediately precede it.
# NOTE(review): this rebinds `transactions`; confirm add_relative_week is safe to re-run.
transactions = add_relative_week(transactions)

first_training_week = test_week - num_training_weeks
training_weeks = np.arange(first_training_week, test_week)  # upper bound is exclusive

in_training_window = transactions.week.isin(training_weeks)
train_data = transactions[in_training_window].reset_index(drop=True)

## Generate training examples and testing candidates

In [6]:
# Optimisation: when validating, candidates are only needed for customers that have
# ground-truth purchases in the test week. For a submission week there is no ground
# truth, so no restriction is passed (None).
if making_submission:
    test_customers = None
else:
    test_week_purchases = get_purchases(transactions[transactions.week == test_week])
    test_customers = test_week_purchases.customer_id.values

# training examples are (customer, week, article, purchased) rows,
# testing candidates are (customer, article) pairs
train_examples, test_candidates = get_examples_candidates(
    train_data,
    test_week,
    test_customers,
    customers,
    articles,
)

# attach the features and split off the label for the ranker
X_train = add_features(train_examples, transactions, customers, articles)
X_test = add_features(test_candidates, transactions, customers, articles)
Y_train = train_examples['purchased']

## Fit ranker

In [7]:
# train_groups tells LGBM that each (week, customer_id) combination is a separate basket
# (one query group). LightGBM never sees the keys themselves: `group` is only a list of
# sizes that is applied to consecutive rows of X_train, so the rows of every basket must
# sit next to each other and the sizes must be listed in row order.
ranker = LGBMRanker(
    force_row_wise=True,
    objective="lambdarank",
    metric="ndcg",
    boosting_type="dart",
    n_estimators=num_trees,
    importance_type='gain'
)

# sort=False lists the baskets in order of first appearance (i.e. row order) rather than
# key order; size() counts rows, whereas count() would skip rows with a missing article_id
basket_grouper = train_examples.groupby(['week', 'customer_id'], sort=False)
train_groups = basket_grouper.size().values

# Fail loudly instead of silently training on misaligned groups: with sort=False the group
# numbers follow first appearance, so they are non-decreasing exactly when every basket
# forms one contiguous run of rows.
assert basket_grouper.ngroup().is_monotonic_increasing, (
    "train_examples must keep the rows of each (week, customer_id) basket contiguous"
)
assert train_groups.sum() == len(X_train), (
    "group sizes do not add up to the number of training rows"
)

ranker.fit(X_train, Y_train, group=train_groups)
print_importance(ranker, X_train.columns)

[LightGBM] [Info] Total Bins 1247
[LightGBM] [Info] Number of data points in the train set: 911325, number of used features: 30
        c_af_colour_group_name 0.13052
             c_af_product_code 0.11839
                      c_cf_age 0.11769
          c_af_department_name 0.10837
                 c_popularity1 0.10038
              c_cf_postal_code 0.08797
c_af_graphical_appearance_name 0.08551
               c_af_index_name 0.08218
                       c_cf_FN 0.07274
                  c_repurchase 0.05182
                 c_popularity2 0.02347
                 article_price 0.00391
             colour_group_code 0.00376
                 department_no 0.00184
               preferred_price 0.00181
               product_type_no 0.00168
                    section_no 0.00159
       graphical_appearance_no 0.00127
                   postal_code 0.00126
                           age 0.00096
              garment_group_no 0.00091
    perceived_colour_master_id 0.00082
     perceived

## Evaluate / Submit

In [26]:
# rank the test candidates and keep the 12 best articles per customer
predictions = get_predictions(test_candidates, X_test, ranker, 12)

# customers without any prediction fall back to the 12 most purchased articles
# of the week before the test week
last_week_sales = transactions[transactions.week == test_week - 1]
popular = last_week_sales.article_id.value_counts().head(12).index.values
predictions = fill_missing_predictions(predictions, customers.customer_id, popular)

# cold users (no transactions in the training weeks) are served special candidates
# directly, without going through the ranker
if handle_cold_customers:
    active_users = train_data.customer_id.unique()
    cold_users = list(set(customers.customer_id) - set(active_users))

    cold_baskets = baskets(None, test_week, cold_users)
    cold_candidates = pd.concat([
        candidates_article_feature(cold_baskets, train_data, articles, 'prod_name', 6, 1, 2, 6, True),
        candidates_popularity(cold_baskets, train_data, 12, 1),
    ])

    # first 12 distinct articles per customer, in candidate order
    cold_top = (
        cold_candidates
        .drop(columns='week')
        .drop_duplicates(['customer_id', 'article_id'])
        .groupby('customer_id')
        .head(12)
    )
    # collapse to one row per customer holding the list of recommended articles
    cold_predictions = (
        cold_top
        .groupby('customer_id', as_index=False)
        .article_id.apply(list)
        .rename(columns={'article_id': 'prediction'})
    )

    # NOTE(review): only active users are kept from the ranker output, so a cold customer
    # for whom no candidate was generated ends up without any prediction; confirm that
    # the candidate generators cover every cold basket.
    ranked_for_active = predictions[predictions.customer_id.isin(active_users)]
    predictions = pd.concat([ranked_for_active, cold_predictions])

if making_submission:
    # no ground truth for this week: write the submission file
    sub = create_submission(predictions, sample_submission)
    sub.to_csv(BASE_PATH + 'sub19-06b.csv.gz', index=False)
else:
    # ground truth available: report the validation score (MAP@12)
    purchases = get_purchases(transactions[transactions.week == test_week])
    score = mean_average_precision(predictions, purchases, 12)
    print(score)

0.022398265847538644
