In [1]:
# Re-import edited project modules automatically before each cell runs, so changes to the
# external .py scripts are picked up without restarting the kernel.
%load_ext autoreload
%autoreload 2

In [2]:
import numpy as np
import pandas as pd
from lightgbm.sklearn import LGBMRanker

from experiment_template import *
from candidate_generation import *

In [40]:
BASE_PATH = '../../../data/'
# alternative input directory (presumably a 5% sample, judging by the folder name):
# DATA_PATH = BASE_PATH + 'sample_0.05/'
DATA_PATH = BASE_PATH + 'parquet/'

# The parquet files must come from the same preprocessing as in the radek notebook
# (see 02 FE/DataProcessingRadek.ipynb).
transactions, customers, articles = (
    pd.read_parquet(DATA_PATH + file_name)
    for file_name in (
        'transactions_train.parquet',
        'customers.parquet',
        'articles.parquet',
    )
)
# The sample submission is read from the original (unprocessed) competition files.
sample_submission = pd.read_csv(BASE_PATH + 'original/sample_submission.csv')

In [41]:
### split into training and testing
# one week is used for testing
# a number of weeks leading up to the test week are used to train the ranker
test_week = 105
num_training_weeks = 10
# NOTE(review): this rebinds `transactions` to the helper's output; the `week` column used
# below is presumably added here -- confirm in add_relative_week. Re-running this cell
# applies the helper to its own previous output.
transactions = add_relative_week(transactions, test_week)
# the num_training_weeks week numbers immediately before test_week (test_week itself excluded)
training_weeks = np.arange(test_week-num_training_weeks, test_week)
train_data = transactions[transactions.week.isin(training_weeks)].reset_index(drop=True)

# active users: customers with at least one transaction in the training weeks
# cold users: every other customer listed in `customers`
active_users = train_data.customer_id.unique()
cold_users = list(set(customers.customer_id) - set(active_users))

### generate training examples and testing candidates
# optimisation: only generate testing candidates for customers with ground truth data
# not possible for submission week
# (test_customers stays None when test_week lies beyond the last week present in transactions)
test_customers = None
if test_week < transactions.week.max() + 1:
    p = get_purchases(transactions[transactions.week == test_week])
    test_customers = p.customer_id.values

# get the examples and candidates
# examples are (customer, week, article, purchased)
# candidates are (customer, article)
train_examples, test_candidates = get_examples_candidates(train_data, test_week, test_customers, customers, articles)

In [42]:
# add features and prepare data for ranker
X_train = add_features(train_examples, transactions, customers, articles)
X_test = add_features(test_candidates, transactions, customers, articles)
Y_train = train_examples['purchased']

### fit ranker
# train_groups tells LGBM that each (week, customer_id) combination is a separate basket
# !!! it is important that train_examples is sorted by week, customer_id for this to work:
# groupby returns the group sizes in sorted key order and the ranker applies them to
# consecutive rows of X_train
# NOTE(review): this also assumes add_features keeps the row order of train_examples -- confirm
ranker = LGBMRanker(
    force_row_wise=True,
    objective="lambdarank",
    metric="ndcg",
    boosting_type="dart",
    n_estimators=100,
    importance_type='gain'
)
# train_groups = train_examples.groupby(['customer_id'])['article_id'].count().values
train_groups = train_examples.groupby(['week', 'customer_id'])['article_id'].count().values
ranker.fit(X_train, Y_train, group=train_groups)
print_importance(ranker, X_train.columns)

### evaluate / submit
# generate recommendations
predictions = get_predictions(test_candidates, X_test, ranker, 12)

# cold users
# baskets are built for test_week; this was hard-coded to 105, which silently went stale
# whenever test_week was changed
bask = baskets(None, test_week, cold_users, True)
# cold-user recommendations: article-feature candidates first, then popularity candidates,
# de-duplicated per customer and cut to 12 articles collected into a list
c = (
    pd.concat([
        candidates_article_feature(bask, transactions, articles, 'prod_name', 6, 1, 2, 6, True),
        candidates_popularity(bask, transactions, 12, 1)
    ])
    .drop(columns='week')
    .drop_duplicates(['customer_id', 'article_id'])
    .groupby('customer_id').head(12)
    .groupby('customer_id', as_index=False).article_id.apply(list)
    .rename(columns={'article_id':'prediction'})
)
# ranker output is kept only for active users; everyone else gets the cold-user list
predictions = pd.concat([predictions[predictions.customer_id.isin(active_users)], c])

if test_week < transactions.week.max() + 1:
    # ground truth exists for test_week: calculate score
    purchases = get_purchases(transactions[transactions.week == test_week])
    score = mean_average_precision(predictions, purchases, 12)
    print(score)
else:
    # test_week is the submission week: write submission
    sub = create_submission(predictions, sample_submission)
    sub.to_csv(BASE_PATH + 'sub05-12f.csv.gz', index=False)

[LightGBM] [Info] Total Bins 1153
[LightGBM] [Info] Number of data points in the train set: 13598824, number of used features: 28
        c_af_colour_group_name 0.1676949708
                 c_popularity1 0.1610734732
          c_af_department_name 0.1594326588
                      c_cf_age 0.1522262954
                c_af_prod_code 0.1277999610
              c_cf_postal_code 0.1041752131
                       c_cf_FN 0.0543506745
                  c_repurchase 0.0437788219
                 c_popularity2 0.0232170143
                   postal_code 0.0011987464
              garment_group_no 0.0009644397
             colour_group_code 0.0007661071
               preferred_price 0.0006843258
                    section_no 0.0004807633
                 article_price 0.0003824342
       graphical_appearance_no 0.0003703204
                 department_no 0.0003680577
               product_type_no 0.0002923639
                           age 0.0002413341
     perceived_colour_value_id 0.0

In [88]:
# score every test candidate and keep the 12 highest-scoring candidates per customer
scored_candidates = test_candidates.copy()
scored_candidates["score"] = ranker.predict(X_test)
a = scored_candidates.sort_values(["customer_id", "score"], ascending=False).groupby("customer_id").head(12)

# for every column prefixed 'c_' (boolean flags, judging by the True/False counts printed),
# print: column name, number of True, number of False, share of True among the kept rows
# the loop variable is `col` so the cold-user frame `c` from the ranking cell is not overwritten
for col in a.columns:
    if not col.startswith('c_'):
        continue
    vc = a[col].value_counts()
    # .get avoids a KeyError when a flag is all-True or all-False in the top 12
    vct = vc.get(True, 0)
    vcf = vc.get(False, 0)
    total = vct + vcf
    print(col, vct, vcf, vct / total if total else float('nan'))

c_af_colour_group_name 6116 35020 0.1486775573706729
c_af_department_name 5804 35332 0.1410929599377674
c_af_prod_code 10909 30227 0.2651935044729677
c_cf_FN 14370 26766 0.34932905484247373
c_cf_age 2859 38277 0.06950116686114352
c_cf_postal_code 4039 37097 0.09818650330610658
c_popularity1 19348 21788 0.47034227926876704
c_popularity2 9147 31989 0.22235997666277713
c_repurchase 8388 32748 0.20390898483080513


In [91]:
def recall(predictions, test_data):
    """Return the mean per-customer recall of `predictions` against `test_data`.

    Both frames must contain `customer_id` and `article_id` columns; any other
    columns are ignored. Each (customer_id, article_id) pair is counted once,
    so repeated purchases or repeated recommendations do not distort the result.

    For every customer in `test_data`, recall is
    (#distinct purchased articles that were predicted) / (#distinct purchased articles).
    Customers without a single hit contribute 0. Customers that appear only in
    `predictions` are not part of the average.

    Returns NaN when `test_data` is empty.
    """
    keys = ['customer_id', 'article_id']
    # De-duplicate on the key pair up front: duplicate ground-truth rows would otherwise
    # inflate the denominator, and extra prediction columns (e.g. a score) would otherwise
    # stop duplicate pairs from being collapsed after the join.
    relevant = test_data[keys].drop_duplicates()
    selected = predictions[keys].drop_duplicates()

    hits = pd.merge(relevant, selected, how='inner', on=keys)
    relevant_selected = hits.groupby('customer_id')['article_id'].count()
    relevant_total = relevant.groupby('customer_id')['article_id'].count()

    # fill_value=0 gives customers with no hit a numerator of 0 instead of NaN
    per_customer = relevant_selected.divide(relevant_total, fill_value=0)
    return per_customer.mean()

In [103]:
# Reduce both sides to (customer_id, article_id) pairs for the recall computation.
pair_columns = ['customer_id', 'article_id']
# top-ranked candidates per customer, re-indexed from 0
pred = a[pair_columns].reset_index(drop=True)
# distinct purchases made during the test week
is_test_week = transactions.week == test_week
actual = transactions.loc[is_test_week, pair_columns].drop_duplicates()

In [104]:
# mean per-customer recall of the ranked top-12 candidates against the test-week purchases
recall(pred, actual)

0.05676208646027177

+ 0.021324595079570654 no rank
+ 0.021802492909964423 combined rank
+ 0.02138256058599553 separate rank per candidate generation scheme

Not using rank

+ 0.02234637343056455 customer_id and article_id

A candidate can have multiple sources

+ 0.025760478853858997 with multiple bools

0.05 sample, 10 weeks training data

+ 0.026162023628432167
+ 0.025892707105825957 article_id, customer_id
+ 0.025796035625746828 article_id, customer_id, rank

0.05 sample, 20 weeks training data

+ 0.027860987873826185

0.05 sample, 50 weeks training data

+ 0.02644852466659573