In [42]:
import numpy as np 
import pandas as pd
import random
from sklearn.model_selection import train_test_split
from sklearn.metrics import classification_report
from sklearn import preprocessing
from sklearn.base import BaseEstimator, TransformerMixin

In [43]:
# https://www.kaggle.com/c/h-and-m-personalized-fashion-recommendations/discussion/308635
def customer_hex_id_to_int(series):
    return series.str[-16:].apply(hex_id_to_int)

def hex_id_to_int(str):
    return int(str[-16:], 16)

def article_id_str_to_int(series):
    return series.astype('int32')

def article_id_int_to_str(series):
    return '0' + series.astype('str')

class Categorize(BaseEstimator, TransformerMixin):
    def __init__(self, min_examples=0):
        self.min_examples = min_examples
        self.categories = []
        
    def fit(self, X):
        for i in range(X.shape[1]):
            vc = X.iloc[:, i].value_counts()
            self.categories.append(vc[vc > self.min_examples].index.tolist())
        return self

    def transform(self, X):
        data = {X.columns[i]: pd.Categorical(X.iloc[:, i], categories=self.categories[i]).codes for i in range(X.shape[1])}
        return pd.DataFrame(data=data)


In [44]:
# load preprocessed data with parquet
transactions = pd.read_parquet('transactions_train.parquet')
customers = pd.read_parquet('customers.parquet')
articles = pd.read_parquet('articles.parquet')

In [45]:
customers["postal_code"].unique().size

352899

In [46]:
articles.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 105542 entries, 0 to 105541
Data columns (total 25 columns):
 #   Column                        Non-Null Count   Dtype
---  ------                        --------------   -----
 0   article_id                    105542 non-null  int32
 1   product_code                  105542 non-null  int32
 2   prod_name                     105542 non-null  int32
 3   product_type_no               105542 non-null  int32
 4   product_type_name             105542 non-null  int16
 5   product_group_name            105542 non-null  int8 
 6   graphical_appearance_no       105542 non-null  int32
 7   graphical_appearance_name     105542 non-null  int8 
 8   colour_group_code             105542 non-null  int32
 9   colour_group_name             105542 non-null  int8 
 10  perceived_colour_value_id     105542 non-null  int32
 11  perceived_colour_value_name   105542 non-null  int8 
 12  perceived_colour_master_id    105542 non-null  int32
 13  perceived_colo

In [47]:
postal_articles = transactions.merge(customers, on="customer_id", how="inner")[["article_id", "postal_code"]]

In [48]:
postal_articles.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 31788324 entries, 0 to 31788323
Data columns (total 2 columns):
 #   Column       Dtype
---  ------       -----
 0   article_id   int32
 1   postal_code  int32
dtypes: int32(2)
memory usage: 485.1 MB


In [49]:
from recpack.matrix import InteractionMatrix
from recpack.scenarios import WeakGeneralization
from recpack.algorithms import MultVAE
from recpack.preprocessing.preprocessors import DataFramePreprocessor
from recpack.preprocessing.filters import MinItemsPerUser, MinUsersPerItem, MaxItemsPerUser, Deduplicate

In [50]:
# define preprocess pipeline
preprocess = DataFramePreprocessor("article_id", "postal_code")

preprocess.add_filter(
    Deduplicate("article_id", "postal_code")
)
preprocess.add_filter(
    MinItemsPerUser(400, "article_id", "postal_code")
)
preprocess.add_filter(
    MinUsersPerItem(100, "article_id", "postal_code")
)
preprocess.add_filter(
    MaxItemsPerUser(5000, "article_id", "postal_code")
)
interaction_matrix = preprocess.process(postal_articles)

  0%|          | 0/1429465 [00:00<?, ?it/s]

  0%|          | 0/1429465 [00:00<?, ?it/s]

In [51]:
scenario = WeakGeneralization(0.8, validation=True)
scenario.split(interaction_matrix)

0it [00:00, ?it/s]

0it [00:00, ?it/s]

In [52]:
interaction_matrix.values.shape

(5109, 7582)

In [53]:
algorithm = MultVAE()

Instead of fitting algorithm every time, it is saved the first time and loaded from disk.

In [None]:
# fit = algorithm.fit(scenario.full_training_data, (scenario.validation_data_in, scenario.validation_data_out))
# fit.save()

In [54]:
algorithm.load("MultVAE_loss_0.05672078799998033.trch")

In [55]:
result = algorithm.predict(interaction_matrix)

In [56]:
result.min()

-4.902098655700684

In [57]:
result.max()

4.029760837554932

In [58]:
result.mean()

-0.1441667068197327

The result now contains scores for every item per postal code. This score tells us how interesting this item is for that postal code. In the next sections, I will explore how this affects the candidate generation.

## Baseline generation

As a baseline for candidate generation I will use features about popularity and repurchases. With this, we will see whether the score improves and by how much when adding features/candidates taking the postal code results into account. Baseline inspired by https://github.com/radekosmulski/personalized_fashion_recs/blob/main/03c_Basic_Model_Submission.ipynb

In [59]:
test_week = transactions.week.max() + 1
transactions = transactions[transactions.week > transactions.week.max() - 10]

### Repurchase samples

In [60]:
# Get all weeks a customer purchased articles in
purch_weeks = transactions.groupby('customer_id')['week'].unique()

In [61]:
# create a mapping from every week a purchase was made to the next week a purchase was made
# the last purchase week is mapped to the test week
purch_weeks_next = {}

for c_id, weeks in purch_weeks.items():
    purch_weeks_next[c_id] = {}
    for i in range(weeks.shape[0]-1):
        purch_weeks_next[c_id][weeks[i]] = weeks[i+1]
    purch_weeks_next[c_id][weeks[-1]] = test_week

In [62]:
# the repurchase samples will be based on the original transactions
repurchase_samples = transactions.copy()

In [63]:
# use week mappings to set weeks in repurchase samples
weeks = []
for i, (c_id, week) in enumerate(zip(transactions['customer_id'], transactions['week'])):
    weeks.append(purch_weeks_next[c_id][week])
repurchase_samples.week=weeks

### Popularity samples

In [64]:
# get the mean price of each article on a weekly basis
mean_price = transactions \
    .groupby(['week', 'article_id'])['price'].mean()

In [65]:
# for each week get the top 12 items sold, assign them a rank
sales = transactions \
    .groupby('week')['article_id'].value_counts() \
    .groupby('week').rank(method='dense', ascending=False) \
    .groupby('week').head(12).rename('bestseller_rank').astype('int8')

In [66]:
# create dataframe stating for each item the bestseller rank of the item in the previous week
bestsellers_previous_week = pd.merge(sales, mean_price, on=['week', 'article_id']).reset_index()
bestsellers_previous_week.week += 1

In [67]:
# get unique transaction for every user and week it was active in
unique_transactions = transactions \
    .groupby(['week', 'customer_id']) \
    .head(1) \
    .drop(columns=['article_id', 'price']) \
    .copy()

In [68]:
# create samples of of items that were sold the most in the week before the sample
bestseller_samples = pd.merge(
    unique_transactions,
    bestsellers_previous_week,
    on='week',
)

In [69]:
# create one transactions for each user in the test week
test_set_transactions = unique_transactions.drop_duplicates('customer_id').reset_index(drop=True)
test_set_transactions.week = test_week

In [70]:
# generate bestseller candidates
bestseller_candidates = pd.merge(
    test_set_transactions,
    bestsellers_previous_week,
    on='week'
)

In [71]:
# concatenate all bestseller samples, drop ranking column
bestseller_samples = pd.concat([bestseller_samples, bestseller_candidates])
bestseller_samples.drop(columns='bestseller_rank', inplace=True)

### Combining everything

In [163]:
transactions['purchased'] = 1

In [164]:
# combine transactions, negative samples and candidates
data = pd.concat([transactions, repurchase_samples, bestseller_samples])
data.purchased.fillna(0, inplace=True)
data = pd.merge(
    data,
    bestsellers_previous_week[['week', 'article_id', 'bestseller_rank']],
    on=['week', 'article_id'],
    how='left'
)
data.fillna(999, inplace=True)
data = data[data.week != data.week.min()]

In [165]:
data.drop_duplicates(['customer_id', 'article_id', 'week'], inplace=True)

In [166]:
# add article and customer info
data = pd.merge(data, articles, on='article_id', how='left')
data = pd.merge(data, customers, on='customer_id', how='left')

In [167]:
data.sort_values(['week', 'customer_id'], inplace=True)
data.reset_index(drop=True, inplace=True)

In [168]:
# train test split
train = data[data.week != test_week]
test = data[data.week==test_week].drop_duplicates(['customer_id', 'article_id', 'sales_channel_id']).copy()

In [169]:
train_baskets = train.groupby(['week', 'customer_id'])['article_id'].count().values

In [181]:
columns_to_use = ['article_id', 'product_type_no', 'graphical_appearance_no', 'colour_group_code', 'perceived_colour_value_id',
'perceived_colour_master_id', 'department_no', 'index_code',
'index_group_no', 'section_no', 'garment_group_no', 'FN', 'Active',
'club_member_status', 'fashion_news_frequency', 'age', 'postal_code', 'bestseller_rank']

In [182]:
train_X = train[columns_to_use]
train_y = train['purchased']

test_X = test[columns_to_use]

### Ranker model

In [183]:
!pip install lightgbm



In [184]:
from lightgbm.sklearn import LGBMRanker

In [185]:
ranker = LGBMRanker(
    objective="lambdarank",
    metric="ndcg",
    boosting_type="dart",
    n_estimators=1,
    importance_type='gain',
    verbose=10
)

In [186]:
ranker = ranker.fit(
    train_X,
    train_y,
    group=train_baskets,
)

[LightGBM] [Debug] Dataset::GetMultiBinFromSparseFeatures: sparse rate 0.848850
[LightGBM] [Debug] Dataset::GetMultiBinFromAllFeatures: sparse rate 0.153099
[LightGBM] [Debug] init for col-wise cost 0.203516 seconds, init for row-wise cost 0.424966 seconds
You can set `force_row_wise=true` to remove the overhead.
And if memory is not enough, you can set `force_col_wise=true`.
[LightGBM] [Debug] Using Dense Multi-Val Bin
[LightGBM] [Info] Total Bins 1079
[LightGBM] [Info] Number of data points in the train set: 11381612, number of used features: 18
[LightGBM] [Debug] Trained a tree with leaves = 31 and depth = 12


In [187]:
for i in ranker.feature_importances_.argsort()[::-1]:
    print(columns_to_use[i], ranker.feature_importances_[i]/ranker.feature_importances_.sum())

bestseller_rank 0.9989805519216203
age 0.00024136038957903926
article_id 0.00017160828400263902
garment_group_no 0.0001448188543340445
department_no 9.637421875769266e-05
product_type_no 9.014783292439592e-05
section_no 7.067204716548531e-05
postal_code 6.792197441369627e-05
club_member_status 6.519780240033951e-05
colour_group_code 5.358754121027148e-05
perceived_colour_value_id 1.775913359216025e-05
fashion_news_frequency 0.0
Active 0.0
FN 0.0
index_code 0.0
perceived_colour_master_id 0.0
graphical_appearance_no 0.0
index_group_no 0.0


In [188]:
# create predictions
test['preds'] = ranker.predict(test_X)

# order predicted articles per user
user_predicted_articles = test \
    .sort_values(['customer_id', 'preds'], ascending=False) \
    .groupby('customer_id')['article_id'].apply(list).to_dict()

# get bestsellers from last week before test week
bestsellers_last_week = \
    bestsellers_previous_week[bestsellers_previous_week.week == bestsellers_previous_week.week.max()]['article_id'].tolist()

In [189]:
sub = pd.read_csv('sample_submission.csv')

In [190]:
# reform predictions to correct format for submission
preds = []
for c_id in customer_hex_id_to_int(sub.customer_id):
    pred = user_predicted_articles.get(c_id, [])
    pred = pred + bestsellers_last_week
    preds.append(pred[:12])

preds = [' '.join(['0' + str(p) for p in ps]) for ps in preds]
sub.prediction = preds

In [191]:
# baseline submission
sub_name = 'baseline_submission'
sub.to_csv(f'{sub_name}.csv.gz', index=False)

## Candidates based on postal codes

Based on the interaction matrix we can generate samples/candidates for postal codes using a threshold score. 

In [90]:
sel_postal = list(preprocess.user_id_mapping.keys())
len(sel_postal)

5109

In [91]:
sel_articles = list(preprocess.item_id_mapping.keys())
len(sel_articles)

7582

In [92]:
# create dataframe with postal codes and items

# first get all entries for dataframe
scores = result.toarray().flatten()

# select only the entries passing threshold
threshold = 1.5
idx = np.nonzero(scores > threshold)

postal_codes_entries = np.repeat(sel_postal, len(articles))[idx]
articles_entries = np.tile(sel_articles, len(sel_postal))[idx]
score_entries = scores[idx]

In [93]:
# amount of candidates per postal code
postal_codes_entries.size / len(sel_postal)

163.75611665688

In [94]:
# dataframe that contains most interesting articles per postal code, by score
postal_scores = pd.DataFrame({"postal_code": postal_codes_entries,
                             "article_id": articles_entries,
                             "score": score_entries})

In [95]:
# get customer postal code pairs
customer_postal = customers[["customer_id", "postal_code"]]

In [96]:
# get a unique transaction per user with postal code information
unique_transactions_postal = pd.merge(test_set_transactions, customer_postal, on="customer_id")

In [97]:
# generate candidates: for each postal code, assign article candidates with score
postal_candidates = pd.merge(unique_transactions_postal, postal_scores, on="postal_code")

In [98]:
# compute mean price and add info to candidates
overall_mean_price = transactions \
    .groupby('article_id')['price'].mean()

In [99]:
# add price information, temporarily drop score and postal code for concatenation
postal_candidates = pd.merge(postal_candidates, overall_mean_price, on="article_id")
postal_candidates.drop(columns=["score", "postal_code"], inplace=True)

In [100]:
postal_candidates

Unnamed: 0,t_dat,customer_id,sales_channel_id,week,article_id,price
0,2020-07-15,48202911737860740,2,105,372860024,0.012065
1,2020-07-15,48202911737860740,2,105,372860024,0.012065
2,2020-07-15,48202911737860740,2,105,372860024,0.012065
3,2020-07-15,48202911737860740,2,105,372860024,0.012065
4,2020-07-16,263518441120604598,2,105,372860024,0.012065
...,...,...,...,...,...,...
6638560,2020-08-23,2113435388202881281,2,105,638899001,0.030492
6638561,2020-08-29,4137185423853919351,2,105,638899001,0.030492
6638562,2020-08-14,14811029570079169435,1,105,560047002,0.012792
6638563,2020-08-19,3279808540341119399,2,105,560047002,0.012792


### Combine with baseline

In [101]:
transactions['purchased'] = 1

In [102]:
# combine transactions, negative samples and candidates
data = pd.concat([transactions, repurchase_samples, bestseller_samples, postal_candidates])
data.purchased.fillna(0, inplace=True)
data = pd.merge(
    data,
    bestsellers_previous_week[['week', 'article_id', 'bestseller_rank']],
    on=['week', 'article_id'],
    how='left'
)
data.fillna(999, inplace=True)
data = data[data.week != data.week.min()]

In [103]:
data.drop_duplicates(['customer_id', 'article_id', 'week'], inplace=True)

In [104]:
# add article and customer info
data = pd.merge(data, articles, on='article_id', how='left')
data = pd.merge(data, customers, on='customer_id', how='left')
data = pd.merge(data, postal_scores, on=["postal_code", "article_id"], how="left").fillna(0)

In [105]:
data.sort_values(['week', 'customer_id'], inplace=True)
data.reset_index(drop=True, inplace=True)

In [106]:
# train test split
train = data[data.week != test_week]
test = data[data.week==test_week].drop_duplicates(['customer_id', 'article_id', 'sales_channel_id']).copy()

In [107]:
train_baskets = train.groupby(['week', 'customer_id'])['article_id'].count().values

In [126]:
columns_to_use = ['article_id', 'product_type_no', 'graphical_appearance_no', 'colour_group_code', 'perceived_colour_value_id',
'perceived_colour_master_id', 'department_no', 'index_code',
'index_group_no', 'section_no', 'garment_group_no', 'FN', 'Active',
'club_member_status', 'fashion_news_frequency', 'age', 'postal_code', 'score', 'bestseller_rank']

In [127]:
train_X = train[columns_to_use]
train_y = train['purchased']

test_X = test[columns_to_use]

### Baseline and postal scores with ranker

In [128]:
from lightgbm.sklearn import LGBMRanker

In [129]:
ranker = LGBMRanker(
    objective="lambdarank",
    metric="ndcg",
    boosting_type="dart",
    n_estimators=1,
    importance_type='gain',
    verbose=10
)

In [130]:
ranker = ranker.fit(
    train_X,
    train_y,
    group=train_baskets,
)

[LightGBM] [Debug] Dataset::GetMultiBinFromSparseFeatures: sparse rate 0.893460
[LightGBM] [Debug] Dataset::GetMultiBinFromAllFeatures: sparse rate 0.196473
[LightGBM] [Debug] init for col-wise cost 0.236309 seconds, init for row-wise cost 0.387230 seconds
You can set `force_row_wise=true` to remove the overhead.
And if memory is not enough, you can set `force_col_wise=true`.
[LightGBM] [Debug] Using Dense Multi-Val Bin
[LightGBM] [Info] Total Bins 1338
[LightGBM] [Info] Number of data points in the train set: 11638535, number of used features: 19
[LightGBM] [Debug] Trained a tree with leaves = 31 and depth = 12


In [131]:
for i in ranker.feature_importances_.argsort()[::-1]:
    print(columns_to_use[i], ranker.feature_importances_[i]/ranker.feature_importances_.sum())

bestseller_rank 0.9990015743165809
age 0.0002330120343692021
article_id 0.00016838111819779592
garment_group_no 0.00014286846237975996
department_no 9.392537128593108e-05
product_type_no 8.760091539015257e-05
section_no 7.22913140521762e-05
postal_code 6.847171178016363e-05
club_member_status 6.311055935375977e-05
colour_group_code 5.1134378800232604e-05
perceived_colour_value_id 1.7629817809927905e-05
fashion_news_frequency 0.0
Active 0.0
FN 0.0
score 0.0
index_group_no 0.0
index_code 0.0
perceived_colour_master_id 0.0
graphical_appearance_no 0.0


In [132]:
# create predictions
test['preds'] = ranker.predict(test_X)

# order predicted articles per user
user_predicted_articles = test \
    .sort_values(['customer_id', 'preds'], ascending=False) \
    .groupby('customer_id')['article_id'].apply(list).to_dict()

# get bestsellers from last week before test week
bestsellers_last_week = \
    bestsellers_previous_week[bestsellers_previous_week.week == bestsellers_previous_week.week.max()]['article_id'].tolist()

In [133]:
sub = pd.read_csv('sample_submission.csv')

In [134]:
# reform predictions to correct format for submission
preds = []
for c_id in customer_hex_id_to_int(sub.customer_id):
    pred = user_predicted_articles.get(c_id, [])
    pred = pred + bestsellers_last_week
    preds.append(pred[:12])

preds = [' '.join(['0' + str(p) for p in ps]) for ps in preds]
sub.prediction = preds

In [135]:
# baseline submission
sub_name = 'postal_scores_submission'
sub.to_csv(f'{sub_name}.csv.gz', index=False)

## Postal score rank

The previous section only made a small improvement on the baseline, so small it can be dismissed. The problem here is that only for 372 postal codes candidates extra candidates are being generated because of the threshold. Besides that, all the postal codes/item pairs that have been filtered during preprocessing receive a placeholder score of 0. What might be more interesting is to instead include a rank per postal code of the 12 best items based on their score from the interaction matrix.

In [136]:
# get top 12 articles per postal code
scores = result.toarray()
idx = np.argpartition(scores, -12)[:, -12:]

In [137]:
# sort every row so that best score comes first
sorted_idx = []
for i, row in enumerate(idx):
    idx_row = np.argsort(scores[i, row])
    sorted_idx.append(idx_row[::-1])
sorted_idx = np.array(sorted_idx)
rows = [[_ for i in range(12)] for _ in range(len(scores))]

In [138]:
# create flattened list of scores
scores = scores[rows, idx[rows, sorted_idx]].flatten()
postal_codes_entries = np.repeat(sel_postal, 12)
articles_entries = np.array(sel_articles)[idx.flatten()]
ranks = np.array([_ for _ in range(1, 13)] * len(sel_postal))

In [139]:
# combine into dataframe
postal_ranks = pd.DataFrame({"postal_code": postal_codes_entries,
                             "article_id": articles_entries,
                             "score": scores,
                             "postal_rank": ranks})

In [140]:
postal_ranks

Unnamed: 0,postal_code,article_id,score,postal_rank
0,5499,751471001,2.416172,1
1,5499,759871002,2.408906,2
2,5499,685814001,2.366963,3
3,5499,372860001,2.223141,4
4,5499,484398001,2.212671,5
...,...,...,...,...
61303,298429,760084003,2.436683,8
61304,298429,610776002,2.407167,9
61305,298429,706016001,2.403442,10
61306,298429,759871002,2.346993,11


In [141]:
# generate candidates: for each postal code, assign article candidates with score
postal_candidates = pd.merge(unique_transactions_postal, postal_ranks, on="postal_code")

In [142]:
# compute mean price and add info to candidates
overall_mean_price = transactions \
    .groupby('article_id')['price'].mean()

In [143]:
# add price information, temporarily drop score and postal code for concatenation
postal_candidates = pd.merge(postal_candidates, overall_mean_price, on="article_id")
postal_candidates.drop(columns=["score", "postal_code", "postal_rank"], inplace=True)

In [144]:
postal_candidates

Unnamed: 0,t_dat,customer_id,sales_channel_id,week,article_id,price
0,2020-07-15,1456826891333599,1,105,711053003,0.013172
1,2020-07-16,3729806434627100156,1,105,711053003,0.013172
2,2020-07-21,6764689423798104216,2,105,711053003,0.013172
3,2020-07-21,13035302529127383758,2,105,711053003,0.013172
4,2020-07-25,938841704694046555,1,105,711053003,0.013172
...,...,...,...,...,...,...
396480,2020-08-31,9320718993689417505,2,105,550309001,0.014995
396481,2020-09-05,17875119368269053830,2,105,777148006,0.020181
396482,2020-09-05,17875119368269053830,2,105,790167001,0.038328
396483,2020-09-06,3267368325910590416,2,105,707135001,0.022017


### Combine again with baseline

In [145]:
transactions['purchased'] = 1

In [146]:
# combine transactions, negative samples and candidates
data = pd.concat([transactions, repurchase_samples, bestseller_samples, postal_candidates])
data.purchased.fillna(0, inplace=True)
data = pd.merge(
    data,
    bestsellers_previous_week[['week', 'article_id', 'bestseller_rank']],
    on=['week', 'article_id'],
    how='left'
)
data.fillna(999, inplace=True)
data = data[data.week != data.week.min()]

In [147]:
data.drop_duplicates(['customer_id', 'article_id', 'week'], inplace=True)

In [148]:
# add article and customer info
data = pd.merge(data, articles, on='article_id', how='left')
data = pd.merge(data, customers, on='customer_id', how='left')
data = pd.merge(data, postal_ranks, on=["postal_code", "article_id"], how="left")

In [149]:
data.score.fillna(0, inplace=True)
data["postal_rank"].fillna(999, inplace=True)

In [150]:
data.sort_values(['week', 'customer_id'], inplace=True)
data.reset_index(drop=True, inplace=True)

In [151]:
# train test split
train = data[data.week != test_week]
test = data[data.week==test_week].drop_duplicates(['customer_id', 'article_id', 'sales_channel_id']).copy()

In [152]:
train_baskets = train.groupby(['week', 'customer_id'])['article_id'].count().values

In [153]:
columns_to_use = ['article_id', 'product_type_no', 'graphical_appearance_no', 'colour_group_code', 'perceived_colour_value_id',
'perceived_colour_master_id', 'department_no', 'index_code',
'index_group_no', 'section_no', 'garment_group_no', 'FN', 'Active',
'club_member_status', 'fashion_news_frequency', 'age', 'postal_code', 'score', 'postal_rank', 'bestseller_rank']

In [154]:
train_X = train[columns_to_use]
train_y = train['purchased']

test_X = test[columns_to_use]

### Baseline and postal scores with ranker

In [155]:
from lightgbm.sklearn import LGBMRanker

In [156]:
ranker = LGBMRanker(
    objective="lambdarank",
    metric="ndcg",
    boosting_type="dart",
    n_estimators=1,
    importance_type='gain',
    verbose=10
)

In [157]:
ranker = ranker.fit(
    train_X,
    train_y,
    group=train_baskets,
)

[LightGBM] [Debug] Dataset::GetMultiBinFromSparseFeatures: sparse rate 0.920645
[LightGBM] [Debug] Dataset::GetMultiBinFromAllFeatures: sparse rate 0.237033
[LightGBM] [Debug] init for col-wise cost 0.225499 seconds, init for row-wise cost 0.392378 seconds
You can set `force_row_wise=true` to remove the overhead.
And if memory is not enough, you can set `force_col_wise=true`.
[LightGBM] [Debug] Using Dense Multi-Val Bin
[LightGBM] [Info] Total Bins 1348
[LightGBM] [Info] Number of data points in the train set: 11381612, number of used features: 20
[LightGBM] [Debug] Trained a tree with leaves = 31 and depth = 12


In [158]:
for i in ranker.feature_importances_.argsort()[::-1]:
    print(columns_to_use[i], ranker.feature_importances_[i]/ranker.feature_importances_.sum())

bestseller_rank 0.9989805519216203
age 0.00024136038957903926
article_id 0.00017160828400263902
garment_group_no 0.0001448188543340445
department_no 9.637421875769266e-05
product_type_no 9.014783292439592e-05
section_no 7.067204716548531e-05
postal_code 6.792197441369627e-05
club_member_status 6.519780240033951e-05
colour_group_code 5.358754121027148e-05
perceived_colour_value_id 1.775913359216025e-05
FN 0.0
Active 0.0
postal_rank 0.0
score 0.0
index_code 0.0
perceived_colour_master_id 0.0
graphical_appearance_no 0.0
fashion_news_frequency 0.0
index_group_no 0.0


In [159]:
# create predictions
test['preds'] = ranker.predict(test_X)

# order predicted articles per user
user_predicted_articles = test \
    .sort_values(['customer_id', 'preds'], ascending=False) \
    .groupby('customer_id')['article_id'].apply(list).to_dict()

# get bestsellers from last week before test week
bestsellers_last_week = \
    bestsellers_previous_week[bestsellers_previous_week.week == bestsellers_previous_week.week.max()]['article_id'].tolist()

In [160]:
sub = pd.read_csv('sample_submission.csv')

In [161]:
# reform predictions to correct format for submission
preds = []
for c_id in customer_hex_id_to_int(sub.customer_id):
    pred = user_predicted_articles.get(c_id, [])
    pred = pred + bestsellers_last_week
    preds.append(pred[:12])

preds = [' '.join(['0' + str(p) for p in ps]) for ps in preds]
sub.prediction = preds

In [162]:
# baseline submission
sub_name = 'postal_ranks_submission'
sub.to_csv(f'{sub_name}.csv.gz', index=False)