based on https://github.com/radekosmulski/personalized_fashion_recs/blob/main/03c_Basic_Model_Submission.ipynb

implemented features: [Here](#Features)

In [1]:
import numpy as np
import pandas as pd
from sklearn.base import BaseEstimator, TransformerMixin

In [2]:
def apk(actual, predicted, k=10):
    """
    Compute the average precision at k for a single query.

    Only the first ``k`` entries of ``predicted`` are scored. A prediction
    counts as a hit when it appears in ``actual`` and has not already
    appeared earlier in the scored predictions, so repeated predictions
    are never rewarded twice.

    Parameters
    ----------
    actual : list
             The relevant elements (order doesn't matter)
    predicted : list
                The predicted elements, best first (order does matter)
    k : int, optional
        The maximum number of predicted elements that are scored

    Returns
    -------
    score : double
            The average precision at k, or 0.0 when ``actual`` is empty

    """
    top_k = predicted[:k] if len(predicted) > k else predicted

    hits = 0.0
    precision_sum = 0.0

    for rank, item in enumerate(top_k, start=1):
        # Skip misses and items already predicted at a better rank.
        if item not in actual or item in top_k[:rank - 1]:
            continue
        hits += 1.0
        precision_sum += hits / rank

    if not actual:
        return 0.0

    # Normalise by the best achievable number of hits within k positions.
    return precision_sum / min(len(actual), k)

def mapk(actual, predicted, k=10):
    """
    Compute the mean average precision at k over many queries.

    ``actual`` and ``predicted`` are paired positionally; each pair is
    scored with ``apk`` and the scores are averaged.

    Parameters
    ----------
    actual : list
             A list of lists of relevant elements
             (order doesn't matter in the inner lists)
    predicted : list
                A list of lists of predicted elements
                (order matters in the inner lists)
    k : int, optional
        The maximum number of predicted elements scored per query

    Returns
    -------
    score : double
            The mean of the per-query average precision at k

    """
    per_query_scores = []
    for relevant, ranked in zip(actual, predicted):
        per_query_scores.append(apk(relevant, ranked, k))
    return np.mean(per_query_scores)

# https://www.kaggle.com/c/h-and-m-personalized-fashion-recommendations/discussion/308635
def customer_hex_id_to_int(series):
    """Convert a Series of hexadecimal customer id strings to integers.

    Each id is truncated to its last 16 characters and parsed with
    ``hex_id_to_int``.
    """
    trailing_hex = series.str[-16:]
    return trailing_hex.apply(hex_id_to_int)

def hex_id_to_int(str):
    """Parse the last 16 characters of a hexadecimal string as an integer.

    NOTE: the parameter name shadows the built-in ``str``; it is kept
    unchanged so that keyword callers keep working.
    """
    trailing_hex = str[-16:]
    return int(trailing_hex, base=16)

def article_id_str_to_int(series):
    """Cast a Series of article ids to the ``int32`` dtype."""
    target_dtype = 'int32'
    return series.astype(target_dtype)

def article_id_int_to_str(series):
    """Render article ids as strings with a single ``'0'`` prepended."""
    as_text = series.astype('str')
    return '0' + as_text

class Categorize(BaseEstimator, TransformerMixin):
    """Encode every column of a DataFrame as integer category codes.

    ``fit`` records, for each column, the values that occur more than
    ``min_examples`` times. ``transform`` replaces each value with its
    position in that list; values that were not recorded are encoded
    as -1.

    Parameters
    ----------
    min_examples : int, optional
        A value is kept as a category only if it occurs strictly more
        than this many times in the fitted column.
    """

    def __init__(self, min_examples=0):
        self.min_examples = min_examples
        # One list of kept values per fitted column, in column order.
        self.categories = []

    def fit(self, X, y=None):
        """Learn the kept categories of every column of ``X``.

        ``y`` is accepted only for scikit-learn API compatibility
        (pipelines and ``fit_transform(X, y)``) and is ignored.
        """
        # Build a fresh list so that calling fit again replaces the previous
        # state instead of appending to it.
        categories = []
        for i in range(X.shape[1]):
            vc = X.iloc[:, i].value_counts()
            categories.append(vc[vc > self.min_examples].index.tolist())
        self.categories = categories
        return self

    def transform(self, X):
        """Return a DataFrame of category codes with the same column names.

        Columns are matched to the fitted categories by position. The
        result is built from raw code arrays, so the index of ``X`` is not
        carried over.
        """
        data = {
            X.columns[i]: pd.Categorical(X.iloc[:, i], categories=self.categories[i]).codes
            for i in range(X.shape[1])
        }
        return pd.DataFrame(data=data)

def calculate_apk(list_of_preds, list_of_gts):
    """Return the mean average precision at 12 over paired predictions and ground truths.

    The two lists are paired positionally and each pair is scored with ``apk``.
    """
    # for fast validation this can be changed to operate on dicts of {'cust_id_int': [art_id_int, ...]}
    # using 'data/val_week_purchases_by_cust.pkl'
    scores = [apk(gt, preds, k=12) for preds, gt in zip(list_of_preds, list_of_gts)]
    return np.mean(scores)

def eval_sub(sub_csv, skip_cust_with_no_purchases=True):
    """Score a submission CSV against the stored validation ground truth.

    Both files must expose a ``prediction`` column of whitespace-separated
    ids. Rows are paired by position, not by customer id, and each pair is
    scored with ``apk`` at k=12.

    Parameters
    ----------
    sub_csv : path of the submission CSV to evaluate
    skip_cust_with_no_purchases : bool, optional
        When true, rows whose ground truth splits into an empty list are
        left out of the mean.

    Returns
    -------
    The mean of the per-row average precision at 12.
    """
    submission = pd.read_csv(sub_csv)
    ground_truth = pd.read_parquet('data/validation_ground_truth.parquet')

    predicted_lists = submission.prediction.str.split()
    purchased_lists = ground_truth.prediction.str.split()

    scores = []
    for pred, gt in zip(predicted_lists, purchased_lists):
        has_no_purchases = gt == []
        if skip_cust_with_no_purchases and has_no_purchases:
            continue
        scores.append(apk(gt, pred, k=12))
    return np.mean(scores)

In [6]:
%%time

BASE_PATH = '../../../data/parquet/'

# Load the three parquet tables in one pass: transactions, customers, articles.
transactions, customers, articles = (
    pd.read_parquet(BASE_PATH + file_name)
    for file_name in (
        'transactions_train.parquet',
        'customers.parquet',
        'articles.parquet',
    )
)

CPU times: user 1 s, sys: 790 ms, total: 1.79 s
Wall time: 863 ms


In [7]:
# The week to predict is the one right after the last week present in the data.
last_week = transactions.week.max()
test_week = last_week + 1

# Keep only the 10 most recent weeks. Take an explicit copy because new
# columns are assigned to `transactions` further down; assigning into a
# filtered slice of the full table would trigger pandas' chained-assignment
# warning.
transactions = transactions[transactions.week > last_week - 10].copy()

In [84]:
# Show which weeks survived the filtering above.
transactions['week'].unique()

array([ 95,  96,  97,  98,  99, 100, 101, 102, 103, 104], dtype=int8)

# Generating candidates

## Last purchase candidates 

In [9]:
%%time

# Weeks in which each customer bought something, in order of first appearance.
c2weeks = transactions.groupby('customer_id')['week'].unique()

# customer -> {purchase week -> the next week listed for that customer};
# the last listed week maps to the test week.
# NOTE(review): this presumably relies on `transactions` being ordered by
# date so that each customer's weeks are ascending - confirm.
c2weeks2shifted_weeks = {}
for c_id, weeks in c2weeks.items():
    shifted = dict(zip(weeks[:-1], weeks[1:]))
    shifted[weeks[-1]] = test_week
    c2weeks2shifted_weeks[c_id] = shifted

# Every purchase becomes a candidate for the customer's next active week.
candidates_last_purchase = transactions.copy()

weeks = [
    c2weeks2shifted_weeks[c_id][week]
    for c_id, week in zip(transactions['customer_id'], transactions['week'])
]

candidates_last_purchase.week = weeks

CPU times: user 12.1 s, sys: 127 ms, total: 12.2 s
Wall time: 12.2 s


In [8]:
# transactions.groupby('week')['t_dat'].agg(['min', 'max'])
# c2weeks
# c2weeks2shifted_weeks[28847241659200]
# transactions[transactions['customer_id'] == 272412481300040]
# candidates_last_purchase[candidates_last_purchase['customer_id']==272412481300040]

## Bestsellers candidates

In [11]:
# Mean price per (week, article).
mean_price = transactions.groupby(['week', 'article_id'])['price'].mean()

# Dense rank of each article's weekly sales count, keeping the first 12
# rows of every week.
sales = (
    transactions
    .groupby('week')['article_id'].value_counts()
    .groupby('week').rank(method='dense', ascending=False)
    .groupby('week').head(12)
    .rename('bestseller_rank')
    .astype('int8')
)

# Shift by one week so that each row describes the bestsellers of the week before.
bestsellers_previous_week = pd.merge(sales, mean_price, on=['week', 'article_id']).reset_index()
bestsellers_previous_week.week += 1

# One row per (week, customer), without the article-specific columns.
unique_transactions = (
    transactions
    .groupby(['week', 'customer_id'])
    .head(1)
    .drop(columns=['article_id', 'price'])
    .copy()
)

# Pair every active (week, customer) with that week's bestseller list.
candidates_bestsellers = pd.merge(unique_transactions, bestsellers_previous_week, on='week')

# Same pairing for the test week: one row per customer seen in the window.
test_set_transactions = unique_transactions.drop_duplicates('customer_id').reset_index(drop=True)
test_set_transactions.week = test_week

candidates_bestsellers_test_week = pd.merge(test_set_transactions, bestsellers_previous_week, on='week')

candidates_bestsellers = pd.concat(
    [candidates_bestsellers, candidates_bestsellers_test_week]
).drop(columns='bestseller_rank')

In [12]:
# mean_price
# sales
# sales.loc[95]
# bestsellers_previous_week.pipe(lambda df: df[df['week']==96])
# unique_transactions
# transactions.drop_duplicates(['week', 'customer_id'])
# test_set_transactions
# candidates_bestsellers[candidates_bestsellers.customer_id == 200292573348128]

# Combining transactions and candidates / negative examples

In [13]:
# Mark every real transaction as a positive example. The candidate frames
# do not carry this column, so their rows come out as missing once all
# frames are concatenated.
transactions['purchased'] = 1

In [14]:
# Real purchases come first so that, when the same (customer, article, week)
# also appears as a candidate, drop_duplicates keeps the purchased row.
data = pd.concat([transactions, candidates_last_purchase, candidates_bestsellers])

# Rows with a missing `purchased` value become negatives. Assign the result
# back instead of calling fillna(inplace=True) on `data.purchased`: that
# chained in-place form acts on a temporary Series and leaves `data`
# unchanged under pandas Copy-on-Write.
data['purchased'] = data['purchased'].fillna(0)
data.drop_duplicates(['customer_id', 'article_id', 'week'], inplace=True)

In [15]:
# Share of positive examples in the combined data.
data['purchased'].mean()

0.13607582749165664

## Add bestseller information

In [16]:
# Attach the previous week's bestseller rank to every (week, article) row.
data = pd.merge(
    data,
    bestsellers_previous_week[['week', 'article_id', 'bestseller_rank']],
    on=['week', 'article_id'],
    how='left'
)

# Articles that were not bestsellers get the sentinel rank 999. This is done
# by plain assignment on the freshly merged frame, before filtering: the
# former `data.bestseller_rank.fillna(999, inplace=True)` was a chained
# in-place call on a filtered slice, which does not update `data` under
# pandas Copy-on-Write.
data['bestseller_rank'] = data['bestseller_rank'].fillna(999)

# Drop the earliest week in the data.
data = data[data.week != data.week.min()]

In [18]:
# Enrich every row with article and customer metadata, keeping all rows of `data`.
data = data.merge(articles, on='article_id', how='left')
data = data.merge(customers, on='customer_id', how='left')

# Features

In [19]:
# bonus feature: price clipped to the range [0, 0.2]
transactions['clipped_price'] = transactions['price'].clip(lower=0, upper=0.2)

In [20]:
# feature1: mean clipped price paid by each customer
customer_avg_price = (
    transactions
    .groupby('customer_id')['clipped_price']
    .mean()
    .to_frame('preferred_price')
)

In [21]:
# feature2: highest clipped price seen for each article
article_max_price = (
    transactions
    .groupby('article_id')['clipped_price']
    .max()
    .to_frame('article_price')
)

In [22]:
# feature3: has kids
# True when any article bought by the customer has index_group_name == 1
# (index_group_name 1 is 'Baby/Children').
customer_article_groups = pd.merge(
    transactions[['customer_id', 'article_id']],
    articles[['article_id', 'index_group_name']],
    on='article_id'
)
buys_for_kids = (
    customer_article_groups
    .groupby('customer_id')
    .index_group_name
    .agg(lambda x: 1 in x.values)
    .to_frame('buys_for_kids')
)

In [23]:
# Left-join each engineered feature table onto the data by its key column.
for feature_frame, key in (
    (customer_avg_price, 'customer_id'),
    (article_max_price, 'article_id'),
    (buys_for_kids, 'customer_id'),
):
    data = pd.merge(data, feature_frame, on=key, how='left')

In [24]:
# Names of the engineered feature columns added to the model inputs.
new_features = [
    'preferred_price',
    'article_price',
    'buys_for_kids',
]

In [25]:
# Order rows by (week, customer) so each customer's weekly rows are
# contiguous, then renumber the index.
data = data.sort_values(['week', 'customer_id']).reset_index(drop=True)

In [26]:
# Everything before the test week is training data; test-week rows are the
# candidates to score.
is_test_week = data.week == test_week
train = data[~is_test_week]
test = (
    data[is_test_week]
    .drop_duplicates(['customer_id', 'article_id', 'sales_channel_id'])
    .copy()
)

In [27]:
# Number of training rows in each (week, customer) group.
rows_per_basket = train.groupby(['week', 'customer_id'])['article_id'].count()
train_baskets = rows_per_basket.values

In [28]:
# Model input columns, in the order they are passed to the ranker.
columns_to_use = [
    'article_id',
    'product_type_no',
    'graphical_appearance_no',
    'colour_group_code',
    'perceived_colour_value_id',
    'perceived_colour_master_id',
    'department_no',
    'index_code',
    'index_group_no',
    'section_no',
    'garment_group_no',
    'FN',
    'Active',
    'club_member_status',
    'fashion_news_frequency',
    'age',
    'postal_code',
    'bestseller_rank',
    *new_features,
]

In [29]:
%%time

# Feature matrix and labels for training; feature matrix for the test week.
train_X, train_y = train[columns_to_use], train['purchased']
test_X = test[columns_to_use]

CPU times: user 170 ms, sys: 430 ms, total: 600 ms
Wall time: 866 ms


In [101]:
# Preview the first ten training rows.
train_X.iloc[:10]

Unnamed: 0,article_id,product_type_no,graphical_appearance_no,colour_group_code,perceived_colour_value_id,perceived_colour_master_id,department_no,index_code,index_group_no,section_no,...,FN,Active,club_member_status,fashion_news_frequency,age,postal_code,bestseller_rank,preferred_price,article_price,buys_for_kids
0,887770001,253,1010016,9,4,5,1510,0,1,6,...,True,True,0,1,21,57896,999.0,0.2,0.2,False
1,762846001,259,1010016,10,3,9,1515,0,1,11,...,True,True,0,1,21,57896,999.0,0.2,0.2,False
2,829308001,273,1010016,9,4,5,8310,9,26,5,...,True,True,0,1,21,57896,999.0,0.2,0.2,False
3,760084003,272,1010016,9,4,5,1747,1,2,53,...,True,True,0,1,21,57896,1.0,0.2,0.2,False
4,866731001,273,1010016,9,4,5,8310,9,26,5,...,True,True,0,1,21,57896,2.0,0.2,0.2,False
5,600886001,59,1010016,9,4,5,4242,7,1,60,...,True,True,0,1,21,57896,3.0,0.2,0.2,False
6,706016001,272,1010016,9,4,5,1747,1,2,53,...,True,True,0,1,21,57896,4.0,0.2,0.2,False
7,372860002,302,1010016,10,3,9,3611,7,1,62,...,True,True,0,1,21,57896,5.0,0.2,0.2,False
8,610776002,255,1010016,9,4,5,1676,0,1,16,...,True,True,0,1,21,57896,6.0,0.2,0.2,False
9,877278002,272,1010001,12,1,11,1636,0,1,15,...,True,True,0,1,21,57896,7.0,0.2,0.2,False


In [31]:
# Display the test-week feature matrix.
test_X

Unnamed: 0,article_id,product_type_no,graphical_appearance_no,colour_group_code,perceived_colour_value_id,perceived_colour_master_id,department_no,index_code,index_group_no,section_no,...,FN,Active,club_member_status,fashion_news_frequency,age,postal_code,bestseller_rank,preferred_price,article_price,buys_for_kids
11381612,925246001,262,1010016,9,4,5,1201,0,1,19,...,True,True,0,1,21,57896,999.0,0.2,0.2,False
11381613,924243001,252,1010016,13,1,1,1626,0,1,15,...,True,True,0,1,21,57896,1.0,0.2,0.2,False
11381614,924243002,252,1010016,9,4,5,1626,0,1,15,...,True,True,0,1,21,57896,2.0,0.2,0.2,False
11381615,918522001,252,1010016,11,3,9,1626,0,1,15,...,True,True,0,1,21,57896,3.0,0.2,0.2,False
11381616,923758001,-1,1010016,10,3,9,1522,0,1,15,...,True,True,0,1,21,57896,4.0,0.2,0.2,False
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
17991757,915529003,252,1010016,9,4,5,1626,0,1,15,...,True,True,0,1,60,96323,8.0,0.2,0.2,False
17991758,915529005,252,1010016,13,1,1,1626,0,1,15,...,True,True,0,1,60,96323,9.0,0.2,0.2,False
17991759,448509014,272,1010016,72,3,2,1747,1,2,53,...,True,True,0,1,60,96323,10.0,0.2,0.2,False
17991760,762846027,259,1010016,13,1,1,1515,0,1,11,...,True,True,0,1,60,96323,11.0,0.2,0.2,False


# Model training

In [32]:
from lightgbm.sklearn import LGBMRanker

In [33]:
# LambdaRank model evaluated with NDCG; a single DART-boosted tree.
ranker_params = dict(
    objective="lambdarank",
    metric="ndcg",
    boosting_type="dart",
    n_estimators=1,
    importance_type='gain',
    verbose=10,
)
ranker = LGBMRanker(**ranker_params)

In [34]:
%%time

# `group` carries the per-(week, customer) row counts computed in
# train_baskets; they are passed alongside the training rows.
ranker = ranker.fit(train_X, train_y, group=train_baskets)

[LightGBM] [Debug] Dataset::GetMultiBinFromSparseFeatures: sparse rate 0.923886
[LightGBM] [Debug] Dataset::GetMultiBinFromAllFeatures: sparse rate 0.322295
[LightGBM] [Debug] init for col-wise cost 0.101143 seconds, init for row-wise cost 0.523474 seconds
[LightGBM] [Info] Auto-choosing row-wise multi-threading, the overhead of testing was 0.216152 seconds.
You can set `force_row_wise=true` to remove the overhead.
And if memory is not enough, you can set `force_col_wise=true`.
[LightGBM] [Debug] Using Sparse Multi-Val Bin
[LightGBM] [Info] Total Bins 1087
[LightGBM] [Info] Number of data points in the train set: 11381612, number of used features: 21
[LightGBM] [Debug] Trained a tree with leaves = 31 and depth = 11
CPU times: user 12.2 s, sys: 1.3 s, total: 13.5 s
Wall time: 4.01 s


In [35]:
# Print every feature with its share of the total importance, largest first.
importances = ranker.feature_importances_
total_importance = importances.sum()
for i in importances.argsort()[::-1]:
    print(columns_to_use[i], importances[i]/total_importance)

bestseller_rank 0.9986468102503531
buys_for_kids 0.00035750481960537616
article_id 0.0002002107021987056
age 0.00018870858486433255
garment_group_no 0.00012491079900885322
index_group_no 0.00010494755016736
section_no 8.300656096389022e-05
department_no 7.067087055189052e-05
postal_code 6.0616819501138794e-05
club_member_status 5.708301834740371e-05
product_type_no 5.0199122789140196e-05
colour_group_code 2.2373548760100817e-05
perceived_colour_value_id 1.7753200580837954e-05
Active 1.520415230786778e-05
fashion_news_frequency 0.0
FN 0.0
article_price 0.0
index_code 0.0
perceived_colour_master_id 0.0
graphical_appearance_no 0.0
preferred_price 0.0


# Calculate predictions

In [74]:
# Load the candidates and enrich them with customer and article metadata.
# No `on=` is given, so each merge joins on the columns the two frames share.
my_test_data = pd.read_parquet(BASE_PATH + 'candidates.parquet')
my_test_data = my_test_data.merge(customers, how='left')
my_test_data = my_test_data.merge(articles, how='left')

In [75]:
# Attach the engineered features built for the training data.
my_test_data = my_test_data.merge(customer_avg_price, on='customer_id', how='left')
my_test_data = my_test_data.merge(article_max_price, on='article_id', how='left')
my_test_data = my_test_data.merge(buys_for_kids, on='customer_id', how='left')

In [76]:
# Every candidate gets the "not a bestseller" sentinel rank used in training.
my_test_data['bestseller_rank'] = 999.0

# Customers absent from `buys_for_kids` come out of the left merge as NaN.
# NaN is truthy, so a bare astype(bool) would flag all of them as buying for
# kids; treat a missing value as False before casting.
my_test_data['buys_for_kids'] = my_test_data['buys_for_kids'].fillna(False).astype(bool)

In [77]:
%time

my_test_data['preds'] = ranker.predict(my_test_data[columns_to_use])

c_id2predicted_article_ids = my_test_data \
    .sort_values(['customer_id', 'preds'], ascending=False) \
    .groupby('customer_id')['article_id'].apply(list).to_dict()

bestsellers_last_week = \
    bestsellers_previous_week[bestsellers_previous_week.week == bestsellers_previous_week.week.max()]['article_id'].tolist()

CPU times: user 1e+03 ns, sys: 0 ns, total: 1e+03 ns
Wall time: 5.01 µs


# Create submission

In [79]:
# The sample submission supplies the customer ids, in submission order.
sample_submission_path = BASE_PATH + '../original/sample_submission.csv'
sub = pd.read_csv(sample_submission_path)

In [80]:
%%time
# Build 12 predictions per customer: the model's ranked candidates first,
# padded with last week's bestsellers.
preds = []
for c_id in customer_hex_id_to_int(sub.customer_id):
    pred = c_id2predicted_article_ids.get(c_id, [])
    # Drop repeated articles while keeping the first (best-ranked)
    # occurrence: a repeated article earns nothing in average precision and
    # would only waste one of the 12 slots.
    pred = list(dict.fromkeys(pred + bestsellers_last_week))
    preds.append(pred[:12])

CPU times: user 2.76 s, sys: 74.2 ms, total: 2.84 s
Wall time: 2.84 s


In [81]:
# Render each prediction list as space-separated ids with '0' prepended.
formatted_preds = []
for ps in preds:
    formatted_preds.append(' '.join('0' + str(p) for p in ps))
preds = formatted_preds
sub.prediction = preds

In [82]:
# Write the submission as a gzip-suffixed CSV without the index column.
sub_name = 'basic_model_submission5'
output_path = f'{sub_name}.csv.gz'
sub.to_csv(output_path, index=False)