In [110]:
import numpy as np 
import pandas as pd
import random
from sklearn.model_selection import train_test_split
from sklearn.metrics import classification_report
from sklearn import preprocessing
from sklearn.base import BaseEstimator, TransformerMixin

In [111]:
# https://www.kaggle.com/c/h-and-m-personalized-fashion-recommendations/discussion/308635
def customer_hex_id_to_int(series):
    """Convert a Series of hexadecimal customer-id strings to integers.

    Only the last 16 characters of every string are kept; each of those
    tails is then parsed with ``hex_id_to_int``.
    """
    hex_tails = series.str[-16:]
    return hex_tails.apply(hex_id_to_int)

def hex_id_to_int(str):
    """Parse the last 16 characters of a hexadecimal string as an integer.

    Strings shorter than 16 characters are parsed in full.
    NOTE: the parameter name shadows the builtin ``str`` inside this
    function; it is kept so keyword callers keep working.
    """
    hex_tail = str[-16:]
    return int(hex_tail, base=16)

def article_id_str_to_int(series):
    """Cast a Series of article ids to ``int32``.

    Any leading zeros of string ids disappear in the integer form.
    """
    as_int32 = series.astype('int32')
    return as_int32

def article_id_int_to_str(series):
    """Turn integer article ids back into strings by prefixing a single '0'.

    Exactly one '0' is prepended regardless of the number of digits, so the
    result only matches the original id when one leading zero was dropped.
    """
    digits = series.astype('str')
    return '0' + digits

class Categorize(BaseEstimator, TransformerMixin):
    """Encode every column of a DataFrame as integer category codes.

    Parameters
    ----------
    min_examples : int, default 0
        A value becomes a category only if it occurs strictly more than
        ``min_examples`` times in the column seen during ``fit``.

    Attributes
    ----------
    categories : list of list
        One list of kept values per fitted column, ordered by descending
        frequency (the order returned by ``value_counts``).
    """

    def __init__(self, min_examples=0):
        self.min_examples = min_examples
        self.categories = []

    def fit(self, X, y=None):
        """Learn the kept categories of every column of ``X``.

        ``y`` is accepted and ignored so the transformer can be used inside
        scikit-learn pipelines and with ``fit_transform(X, y)``.
        Returns ``self``.
        """
        # Rebuild the list from scratch: appending to the existing list would
        # leave the categories of a previous fit in front of the new ones, and
        # transform() would keep using the stale entries.
        fitted = []
        for i in range(X.shape[1]):
            vc = X.iloc[:, i].value_counts()
            fitted.append(vc[vc > self.min_examples].index.tolist())
        self.categories = fitted
        return self

    def transform(self, X):
        """Replace every value by the position of its category.

        Values that were not kept during ``fit`` are encoded as -1 (the code
        ``pd.Categorical`` assigns to values outside ``categories``).
        The returned DataFrame keeps the column names of ``X`` but gets a
        fresh default index.
        """
        data = {
            X.columns[i]: pd.Categorical(X.iloc[:, i], categories=self.categories[i]).codes
            for i in range(X.shape[1])
        }
        return pd.DataFrame(data=data)


In [112]:
# Load the three preprocessed tables from parquet files in the working directory.
# The reads are independent of each other.
articles = pd.read_parquet('articles.parquet')
customers = pd.read_parquet('customers.parquet')
transactions = pd.read_parquet('transactions_train.parquet')

In [113]:
customers["postal_code"].unique().size

352899

In [114]:
articles.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 105542 entries, 0 to 105541
Data columns (total 25 columns):
 #   Column                        Non-Null Count   Dtype
---  ------                        --------------   -----
 0   article_id                    105542 non-null  int32
 1   product_code                  105542 non-null  int32
 2   prod_name                     105542 non-null  int32
 3   product_type_no               105542 non-null  int32
 4   product_type_name             105542 non-null  int16
 5   product_group_name            105542 non-null  int8 
 6   graphical_appearance_no       105542 non-null  int32
 7   graphical_appearance_name     105542 non-null  int8 
 8   colour_group_code             105542 non-null  int32
 9   colour_group_name             105542 non-null  int8 
 10  perceived_colour_value_id     105542 non-null  int32
 11  perceived_colour_value_name   105542 non-null  int8 
 12  perceived_colour_master_id    105542 non-null  int32
 13  perceived_colo

In [115]:
# One row per transaction: the purchased article together with the buyer's postal code.
postal_articles = pd.merge(transactions, customers, on="customer_id", how="inner")
postal_articles = postal_articles[["article_id", "postal_code"]]

In [116]:
postal_articles.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 31788324 entries, 0 to 31788323
Data columns (total 2 columns):
 #   Column       Dtype
---  ------       -----
 0   article_id   int32
 1   postal_code  int32
dtypes: int32(2)
memory usage: 485.1 MB


In [117]:
from recpack.matrix import InteractionMatrix
from recpack.scenarios import WeakGeneralization
from recpack.algorithms import MultVAE
from recpack.preprocessing.preprocessors import DataFramePreprocessor
from recpack.preprocessing.filters import MinItemsPerUser, MinUsersPerItem, MaxItemsPerUser, Deduplicate

In [118]:
# Define the preprocessing pipeline: articles act as items, postal codes as users.
preprocess = DataFramePreprocessor("article_id", "postal_code")

# Filters are registered in this order: drop repeated (article, postal code)
# pairs first, then apply the minimum/maximum interaction-count filters.
for interaction_filter in (
    Deduplicate("article_id", "postal_code"),
    MinItemsPerUser(400, "article_id", "postal_code"),
    MinUsersPerItem(100, "article_id", "postal_code"),
    MaxItemsPerUser(5000, "article_id", "postal_code"),
):
    preprocess.add_filter(interaction_filter)

interaction_matrix = preprocess.process(postal_articles)

  0%|          | 0/1429465 [00:00<?, ?it/s]

  0%|          | 0/1429465 [00:00<?, ?it/s]

In [10]:
# Split the interactions with recpack's WeakGeneralization scenario, including a validation split.
# NOTE(review): 0.8 is presumably the fraction of each user's interactions kept for training — confirm against the recpack docs.
scenario = WeakGeneralization(0.8, validation=True)
# split() must run before scenario.full_training_data / validation_data_in / validation_data_out are read below
scenario.split(interaction_matrix)

0it [00:00, ?it/s]

0it [00:00, ?it/s]

In [11]:
# Shape of the filtered interaction matrix.
# NOTE(review): presumably (number of postal codes, number of articles), since postal codes were passed as the user column — confirm.
interaction_matrix.values.shape

(5109, 7582)

In [12]:
# MultVAE recommender from recpack, created with its default hyperparameters.
algorithm = MultVAE()

In [21]:
# Train on the full training data; the (validation in, validation out) pair is passed as second argument.
# NOTE(review): the log below reports a StoppingCriterion value after every epoch, presumably computed on this validation pair.
# `fit` keeps whatever algorithm.fit returns; it is used by the `fit.save()` cell.
fit = algorithm.fit(scenario.full_training_data, (scenario.validation_data_in, scenario.validation_data_out))

2022-11-29 22:23:28,190 - base - recpack - INFO - Processed epoch 0 in 1.14 s.Batch Training Loss = 2005.4914
2022-11-29 22:23:54,608 - stopping_criterion - recpack - INFO - StoppingCriterion has value 0.011561434392827651, which is better than previous iterations.
2022-11-29 22:23:54,610 - base - recpack - INFO - Model improved. Storing better model.
2022-11-29 22:23:54,664 - base - recpack - INFO - Evaluation at end of 0 took 26.47 s.
2022-11-29 22:23:55,750 - base - recpack - INFO - Processed epoch 1 in 1.08 s.Batch Training Loss = 2006.5076
2022-11-29 22:24:17,095 - stopping_criterion - recpack - INFO - StoppingCriterion has value 0.016065811222353378, which is worse than previous iterations.
2022-11-29 22:24:17,097 - base - recpack - INFO - Evaluation at end of 1 took 21.35 s.
2022-11-29 22:24:18,155 - base - recpack - INFO - Processed epoch 2 in 1.06 s.Batch Training Loss = 2010.0359
2022-11-29 22:24:39,415 - stopping_criterion - recpack - INFO - StoppingCriterion has value 0.022

2022-11-29 22:32:00,583 - base - recpack - INFO - Evaluation at end of 22 took 21.32 s.
2022-11-29 22:32:01,662 - base - recpack - INFO - Processed epoch 23 in 1.08 s.Batch Training Loss = 1971.4098
2022-11-29 22:32:22,770 - stopping_criterion - recpack - INFO - StoppingCriterion has value 0.043283590875467036, which is worse than previous iterations.
2022-11-29 22:32:22,772 - base - recpack - INFO - Evaluation at end of 23 took 21.11 s.
2022-11-29 22:32:23,880 - base - recpack - INFO - Processed epoch 24 in 1.11 s.Batch Training Loss = 1970.8116
2022-11-29 22:32:45,013 - stopping_criterion - recpack - INFO - StoppingCriterion has value 0.04325145218707472, which is worse than previous iterations.
2022-11-29 22:32:45,016 - base - recpack - INFO - Evaluation at end of 24 took 21.13 s.
2022-11-29 22:32:46,091 - base - recpack - INFO - Processed epoch 25 in 1.07 s.Batch Training Loss = 1973.0999
2022-11-29 22:33:06,866 - stopping_criterion - recpack - INFO - StoppingCriterion has value 0.

2022-11-29 22:40:28,525 - base - recpack - INFO - Processed epoch 46 in 1.07 s.Batch Training Loss = 1961.9926
2022-11-29 22:40:49,390 - stopping_criterion - recpack - INFO - StoppingCriterion has value 0.04454320905058891, which is worse than previous iterations.
2022-11-29 22:40:49,393 - base - recpack - INFO - Evaluation at end of 46 took 20.87 s.
2022-11-29 22:40:50,470 - base - recpack - INFO - Processed epoch 47 in 1.08 s.Batch Training Loss = 1968.2176
2022-11-29 22:41:12,187 - stopping_criterion - recpack - INFO - StoppingCriterion has value 0.04484744554171584, which is worse than previous iterations.
2022-11-29 22:41:12,190 - base - recpack - INFO - Evaluation at end of 47 took 21.72 s.
2022-11-29 22:41:13,255 - base - recpack - INFO - Processed epoch 48 in 1.06 s.Batch Training Loss = 1968.2877
2022-11-29 22:41:34,326 - stopping_criterion - recpack - INFO - StoppingCriterion has value 0.04490782730311098, which is worse than previous iterations.
2022-11-29 22:41:34,329 - bas

KeyboardInterrupt: 

In [19]:
# Persist the trained model. This needs `fit` from the training cell, which the recorded run interrupted (KeyboardInterrupt above).
# NOTE(review): the target file name is chosen by recpack — presumably the "MultVAE_loss_….trch" file loaded later; confirm.
fit.save()

In [13]:
# Restore a previously saved model from the working directory instead of retraining.
# NOTE(review): the file name is hard-coded and presumably comes from an earlier save() — a fresh run must produce or supply this file.
algorithm.load("MultVAE_loss_0.05672078799998033.trch")

In [14]:
# Score the complete interaction matrix (not only a split); `result` is converted with .toarray() in the candidate cell further down.
result = algorithm.predict(interaction_matrix)

In [15]:
result.min()

-4.902098655700684

In [16]:
result.max()

4.029760837554932

In [17]:
result.mean()

-0.1441667068197327

The result now contains scores for every item per postal code. This score tells us how interesting this item is for that postal code. In the next sections, I will explore how this affects the candidate generation.

## Baseline generation

As a baseline for candidate generation, I will use candidates based on popularity (the bestsellers of the previous week) and on repurchases (articles a customer bought before). This lets us measure whether, and by how much, the score improves when features/candidates derived from the postal code results are added. The baseline is inspired by https://github.com/radekosmulski/personalized_fashion_recs/blob/main/03c_Basic_Model_Submission.ipynb

In [18]:
# The week to predict is the one right after the last week present in the data.
max_week = transactions.week.max()
test_week = max_week + 1
# Keep only the 10 most recent weeks of transactions.
transactions = transactions[transactions.week > max_week - 10]

### Repurchase samples

In [19]:
# Get all weeks a customer purchased articles in.
# The result is a Series indexed by customer_id whose values are arrays of distinct weeks.
# unique() keeps order of first appearance, so the arrays are chronological only if
# `transactions` is ordered by week — TODO confirm the parquet file is sorted that way.
purch_weeks = transactions.groupby('customer_id')['week'].unique()

In [20]:
# For every customer, map each purchase week to the following entry in that
# customer's week array; the final entry is mapped to the test week.
purch_weeks_next = {}

for c_id, weeks in purch_weeks.items():
    # pair every week with its successor in the array
    next_week_of = dict(zip(weeks[:-1], weeks[1:]))
    next_week_of[weeks[-1]] = test_week
    purch_weeks_next[c_id] = next_week_of

In [21]:
# the repurchase samples will be based on the original transactions
# (an independent copy, so changing its `week` column leaves `transactions` untouched)
repurchase_samples = transactions.copy()

In [22]:
# Move every repurchase sample to the customer's next purchase week
# (or to the test week for the customer's last purchase week).
weeks = [
    purch_weeks_next[c_id][week]
    for c_id, week in zip(transactions['customer_id'], transactions['week'])
]
repurchase_samples.week = weeks

### Popularity samples

In [23]:
# Mean price of every article within every week, indexed by (week, article_id).
mean_price = (
    transactions
    .groupby(['week', 'article_id'])['price']
    .mean()
)

In [24]:
# Per week: count sales per article, rank the counts (dense, most sold = 1)
# and keep the first 12 rows of each week. value_counts already orders every
# week by descending count, so these are the best-selling articles; articles
# with equal counts share a rank.
sales = (
    transactions
    .groupby('week')['article_id'].value_counts()
    .groupby('week').rank(method='dense', ascending=False)
    .groupby('week').head(12)
    .rename('bestseller_rank')
    .astype('int8')
)

In [25]:
# Bestseller rank and mean price per (week, article). The week is shifted by
# one so that each row describes the bestsellers of the week *before* `week`.
bestsellers_previous_week = (
    pd.merge(sales, mean_price, on=['week', 'article_id'])
    .reset_index()
    .assign(week=lambda frame: frame['week'] + 1)
)

In [26]:
# One row per (week, customer) pair in which the customer bought something:
# the first transaction of each pair, without its article and price columns.
unique_transactions = (
    transactions
    .groupby(['week', 'customer_id'])
    .head(1)
    .drop(columns=['article_id', 'price'])
    .copy()
)

In [27]:
# Create samples from the items that sold best in the week before the sample:
# every (week, customer) row is paired with that week's bestseller rows.
bestseller_samples = unique_transactions.merge(bestsellers_previous_week, on='week')

In [28]:
# Create one transaction per customer, placed in the test week.
test_set_transactions = (
    unique_transactions
    .drop_duplicates('customer_id')
    .reset_index(drop=True)
    .assign(week=test_week)
)

In [29]:
# Generate bestseller candidates: pair every test-week customer row with the
# bestseller rows whose (shifted) week equals the test week.
bestseller_candidates = test_set_transactions.merge(bestsellers_previous_week, on='week')

In [30]:
# Stack the historical bestseller samples and the test-week candidates,
# then remove the ranking column.
bestseller_samples = (
    pd.concat([bestseller_samples, bestseller_candidates])
    .drop(columns='bestseller_rank')
)

### Combining everything

In [33]:
# Mark every real transaction as a positive example.
# repurchase_samples and the bestseller frames were derived from `transactions` in earlier cells,
# before this column existed, so they carry no `purchased` value; the combining cell fills those with 0.
transactions['purchased'] = 1

In [34]:
# combine transactions, negative samples and candidates
data = pd.concat([transactions, repurchase_samples, bestseller_samples])
# Rows without a `purchased` value (NaN after the concat) become negatives.
# Assign the filled column back instead of calling fillna(inplace=True) on
# `data.purchased`: that is a chained in-place update on an intermediate
# Series, which pandas warns about and which does not modify `data` at all
# when Copy-on-Write is enabled.
data['purchased'] = data['purchased'].fillna(0)

In [35]:
# Keep the first row of every (customer, article, week) combination. Real
# transactions come first in the concatenation, so they win over generated
# samples that share the same key.
data = data.drop_duplicates(['customer_id', 'article_id', 'week'])

In [36]:
# Attach article and customer attributes; left joins keep every sample row.
data = (
    data
    .merge(articles, on='article_id', how='left')
    .merge(customers, on='customer_id', how='left')
)

In [37]:
# Order rows by week, then customer, so that every (week, customer) basket is
# a contiguous run of rows, and renumber the index.
data = (
    data
    .sort_values(['week', 'customer_id'])
    .reset_index(drop=True)
)

In [38]:
# Train/test split: everything before the test week is training data; the
# test week holds the candidates to score.
train = data.loc[data.week != test_week]
test = (
    data.loc[data.week == test_week]
    .drop_duplicates(['customer_id', 'article_id', 'sales_channel_id'])
    .copy()
)

In [39]:
# Number of rows in every (week, customer) group, in sorted key order. This
# matches the row order of `train` because the data was sorted by the same
# keys; the ranker uses these sizes as query groups.
train_baskets = (
    train
    .groupby(['week', 'customer_id'])['article_id']
    .count()
    .to_numpy()
)

In [40]:
# Feature columns fed to the ranker, in the order they are passed to the model.
columns_to_use = [
    'article_id',
    'product_type_no',
    'graphical_appearance_no',
    'colour_group_code',
    'perceived_colour_value_id',
    'perceived_colour_master_id',
    'department_no',
    'index_code',
    'index_group_no',
    'section_no',
    'garment_group_no',
    'FN',
    'Active',
    'club_member_status',
    'fashion_news_frequency',
    'age',
    'postal_code',
]

In [41]:
# Feature matrix and label for training; feature matrix only for the test week.
train_X, train_y = train[columns_to_use], train['purchased']
test_X = test[columns_to_use]

### Ranker model

In [42]:
from lightgbm.sklearn import LGBMRanker

In [43]:
# LightGBM learning-to-rank model (LambdaRank objective, NDCG metric, DART boosting).
ranker = LGBMRanker(
    objective="lambdarank",
    metric="ndcg",
    boosting_type="dart",
    n_estimators=1,  # a single boosting round: the training log below shows one tree being trained
    importance_type='gain',  # selects gain as the basis for feature_importances_
    verbose=10
)

In [44]:
# Fit the ranker; `group` holds the size of every (week, customer) query in row order.
ranker = ranker.fit(train_X, train_y, group=train_baskets)

[LightGBM] [Debug] Dataset::GetMultiBinFromSparseFeatures: sparse rate 0.847855
[LightGBM] [Debug] Dataset::GetMultiBinFromAllFeatures: sparse rate 0.161885
[LightGBM] [Debug] init for col-wise cost 0.083216 seconds, init for row-wise cost 0.173942 seconds
You can set `force_row_wise=true` to remove the overhead.
And if memory is not enough, you can set `force_col_wise=true`.
[LightGBM] [Debug] Using Dense Multi-Val Bin
[LightGBM] [Info] Total Bins 1067
[LightGBM] [Info] Number of data points in the train set: 11643599, number of used features: 17
[LightGBM] [Debug] Trained a tree with leaves = 31 and depth = 9


In [45]:
# Print every feature with its share of the total importance, largest first.
importances = ranker.feature_importances_
total_importance = importances.sum()
for i in importances.argsort()[::-1]:
    print(columns_to_use[i], importances[i] / total_importance)

article_id 0.2895738969603472
graphical_appearance_no 0.2031760352858636
department_no 0.1518033461810415
product_type_no 0.07518838393666324
section_no 0.05930797516241852
garment_group_no 0.05875708751234863
perceived_colour_master_id 0.0514193307202976
perceived_colour_value_id 0.04928437615963122
colour_group_code 0.04590556194744773
index_group_no 0.01558400613394081
index_code 0.0
age 0.0
FN 0.0
Active 0.0
club_member_status 0.0
fashion_news_frequency 0.0
postal_code 0.0


In [46]:
# Score every test-week candidate.
test['preds'] = ranker.predict(test_X)

# Per customer: the candidate article ids ordered from highest to lowest score.
user_predicted_articles = (
    test
    .sort_values(['customer_id', 'preds'], ascending=False)
    .groupby('customer_id')['article_id']
    .apply(list)
    .to_dict()
)

# Bestsellers attached to the largest (shifted) week, i.e. the articles that
# sold best in the last week before the test week.
is_latest_week = bestsellers_previous_week.week == bestsellers_previous_week.week.max()
bestsellers_last_week = bestsellers_previous_week.loc[is_latest_week, 'article_id'].tolist()

In [48]:
# Load the sample submission; its customer_id column drives the prediction loop below.
# Note the relative '../data/' path, whereas the parquet files are read from the working directory.
sub = pd.read_csv('../data/sample_submission.csv')

In [51]:
# Reshape the predictions into the submission format: for every customer take
# the ranked candidates, pad them with last week's bestsellers and keep the
# first 12. Customers without candidates get the bestsellers only.
preds = [
    (user_predicted_articles.get(c_id, []) + bestsellers_last_week)[:12]
    for c_id in customer_hex_id_to_int(sub.customer_id)
]

# Article ids are written as strings with a leading '0', separated by spaces.
preds = [' '.join('0' + str(p) for p in ps) for ps in preds]
sub.prediction = preds

In [52]:
# baseline submission
# written to the working directory without the index column; the '.csv.gz' suffix lets pandas infer gzip compression
sub_name = 'baseline_submission'
sub.to_csv(f'{sub_name}.csv.gz', index=False)

## Candidates based on postal codes

Based on the scores the model predicted from the interaction matrix, we can generate samples/candidates for each postal code by keeping only the articles whose score exceeds a threshold.

In [54]:
# create dataframe with postal codes and items

# TODO: this is wrong. I thought the code below would get me the postal codes and article ids, but instead it
# returns indices for the interaction matrix
# (the values still have to be mapped back to the original postal_code / article_id values
# before they can be joined with `customers`, `transactions` or the article table)

# first get all entries for dataframe
# flatten() walks the dense score matrix row by row
scores = result.toarray().flatten()
# NOTE(review): presumably indices[0] / indices[1] are the row / column indices of the stored interactions — confirm
postal_codes = np.unique(interaction_matrix.indices[0])
# NOTE(review): this rebinds `articles`, which until now held the articles DataFrame used by the
# "add article and customer info" merge; re-running that cell after this one will fail
articles = np.unique(interaction_matrix.indices[1])

# select only the entries passing threshold
threshold = 2
idx = np.nonzero(scores > threshold)

# repeat/tile rebuild the (row, column) label of every flattened score; this only lines up with
# `scores` when len(postal_codes) and len(articles) equal the two dimensions of `result`,
# i.e. when every row and column index occurs at least once
postal_codes_entries = np.repeat(postal_codes, len(articles))[idx]
articles_entries = np.tile(articles, len(postal_codes))[idx]
score_entries = scores[idx]

In [55]:
# average number of candidates (scores above the threshold) per postal code
postal_codes_entries.size / postal_codes.size

39.050107653161085

In [93]:
# One row per (postal code, article) pair whose score passed the threshold.
# The two id columns currently hold interaction-matrix indices, not original ids.
postal_scores = pd.DataFrame(
    dict(
        postal_code=postal_codes_entries,
        article_id=articles_entries,
        score=score_entries,
    )
)

In [120]:
postal_scores

Unnamed: 0,postal_code,article_id,score
0,0,37,2.067054
1,0,67,2.366963
2,0,204,2.002607
3,0,212,2.212671
4,0,362,2.223141
...,...,...,...
199502,5108,2939,2.053555
199503,5108,3034,2.028477
199504,5108,3402,2.195340
199505,5108,3422,2.310904
