In [172]:
import numpy as np

def apk(actual, predicted, k=10):
    """
    Computes the average precision at k.

    This function computes the average prescision at k between two lists of
    items.

    Parameters
    ----------
    actual : list
             A list of elements that are to be predicted (order doesn't matter)
    predicted : list
                A list of predicted elements (order does matter)
    k : int, optional
        The maximum number of predicted elements

    Returns
    -------
    score : double
            The average precision at k over the input lists

    """
    if len(predicted)>k:
        predicted = predicted[:k]

    score = 0.0
    num_hits = 0.0

    for i,p in enumerate(predicted):
        if p in actual and p not in predicted[:i]:
            num_hits += 1.0
            score += num_hits / (i+1.0)

    if not actual:
        return 0.0

    return score / min(len(actual), k)

def mapk(actual, predicted, k=10):
    """
    Computes the mean average precision at k.

    This function computes the mean average prescision at k between two lists
    of lists of items.

    Parameters
    ----------
    actual : list
             A list of lists of elements that are to be predicted 
             (order doesn't matter in the lists)
    predicted : list
                A list of lists of predicted elements
                (order matters in the lists)
    k : int, optional
        The maximum number of predicted elements

    Returns
    -------
    score : double
            The mean average precision at k over the input lists

    """
    return np.mean([apk(a,p,k) for a,p in zip(actual, predicted)])

In [173]:
from sklearn.base import BaseEstimator, TransformerMixin
import numpy as np

# https://www.kaggle.com/c/h-and-m-personalized-fashion-recommendations/discussion/308635
def customer_hex_id_to_int(series):
    return series.str[-16:].apply(hex_id_to_int)

def hex_id_to_int(str):
    return int(str[-16:], 16)

def article_id_str_to_int(series):
    return series.astype('int32')

def article_id_int_to_str(series):
    return '0' + series.astype('str')

class Categorize(BaseEstimator, TransformerMixin):
    def __init__(self, min_examples=0):
        self.min_examples = min_examples
        self.categories = []
        
    def fit(self, X):
        for i in range(X.shape[1]):
            vc = X.iloc[:, i].value_counts()
            self.categories.append(vc[vc > self.min_examples].index.tolist())
        return self

    def transform(self, X):
        data = {X.columns[i]: pd.Categorical(X.iloc[:, i], categories=self.categories[i]).codes for i in range(X.shape[1])}
        return pd.DataFrame(data=data)


def calculate_apk(list_of_preds, list_of_gts):
    # for fast validation this can be changed to operate on dicts of {'cust_id_int': [art_id_int, ...]}
    # using 'data/val_week_purchases_by_cust.pkl'
    apks = []
    for preds, gt in zip(list_of_preds, list_of_gts):
        apks.append(apk(gt, preds, k=12))
    return np.mean(apks)

def eval_sub(sub_csv, skip_cust_with_no_purchases=True):
    sub=pd.read_csv(sub_csv)
    validation_set=pd.read_parquet('data/validation_ground_truth.parquet')

    apks = []

    no_purchases_pattern = []
    for pred, gt in zip(sub.prediction.str.split(), validation_set.prediction.str.split()):
        if skip_cust_with_no_purchases and (gt == no_purchases_pattern): continue
        apks.append(apk(gt, pred, k=12))
    return np.mean(apks)

In [174]:
import pandas as pd

In [233]:
%%time

transactions = pd.read_parquet('../../data/alt/transactions_train.parquet')
customers = pd.read_parquet('../../data/alt/customers.parquet')
articles = pd.read_parquet('../../data/alt/articles.parquet')

# sample = 0.05
# transactions = pd.read_parquet(f'data/transactions_train_sample_{sample}.parquet')
# customers = pd.read_parquet(f'data/customers_sample_{sample}.parquet')
# articles = pd.read_parquet(f'data/articles_train_sample_{sample}.parquet')

CPU times: total: 1.67 s
Wall time: 1.08 s


In [234]:
test_week = transactions.week.max() + 1
transactions = transactions[transactions.week > transactions.week.max() - 10]

# Generating candidates

### Last purchase candidates

In [236]:
%%time

c2weeks = transactions.groupby('customer_id')['week'].unique()

CPU times: total: 14.7 s
Wall time: 19.7 s


In [237]:
c2weeks

customer_id
28847241659200          [95, 96, 101, 102]
41318098387474                        [98]
116809474287335                 [101, 103]
200292573348128          [95, 96, 99, 102]
248294615847351                       [96]
                               ...        
18446624797007271432                  [95]
18446630855572834764                 [103]
18446662237889060501                 [100]
18446705133201055310                 [102]
18446737527580148316                 [104]
Name: week, Length: 437365, dtype: object

In [238]:
# Verification that weeks are actually properly grouped
transactions.groupby('week')['t_dat'].agg(['min', 'max'])

Unnamed: 0_level_0,min,max
week,Unnamed: 1_level_1,Unnamed: 2_level_1
95,2020-07-15,2020-07-21
96,2020-07-22,2020-07-28
97,2020-07-29,2020-08-04
98,2020-08-05,2020-08-11
99,2020-08-12,2020-08-18
100,2020-08-19,2020-08-25
101,2020-08-26,2020-09-01
102,2020-09-02,2020-09-08
103,2020-09-09,2020-09-15
104,2020-09-16,2020-09-22


In [239]:
%%time

c2weeks2shifted_weeks = {}

# Shift all weeks by one, and add the test week as the last week
for c_id, weeks in c2weeks.items():
    c2weeks2shifted_weeks[c_id] = {}
    for i in range(weeks.shape[0]-1):
        c2weeks2shifted_weeks[c_id][weeks[i]] = weeks[i+1]
    c2weeks2shifted_weeks[c_id][weeks[-1]] = test_week

CPU times: total: 438 ms
Wall time: 727 ms


In [241]:
c2weeks2shifted_weeks[28847241659200]

{95: 96, 96: 101, 101: 102, 102: 105}

In [242]:
candidates_last_purchase = transactions.copy()

In [243]:
%%time

# Applies the shifted weeks to the transactions
weeks = []
for i, (c_id, week) in enumerate(zip(transactions['customer_id'], transactions['week'])):
    weeks.append(c2weeks2shifted_weeks[c_id][week])
candidates_last_purchase.week=weeks

CPU times: total: 15.9 s
Wall time: 19.6 s


### Bestsellers candidates

In [245]:
# Gets the mean price of each article in each week
mean_price = transactions.groupby(['week', 'article_id'])['price'].mean()

In [246]:
mean_price

week  article_id
95    108775015     0.004729
      108775044     0.008458
      110065001     0.006085
      110065002     0.006085
      111565001     0.004288
                      ...   
104   952267001     0.013732
      952938001     0.048651
      953450001     0.016932
      953763001     0.021885
      956217002     0.059068
Name: price, Length: 196880, dtype: float32

In [248]:
# Gets the bestsellers of each week
sales = transactions \
    .groupby('week')['article_id'].value_counts() \
    .groupby('week').rank(method='dense', ascending=False) \
    .groupby('week').head(12).rename('bestseller_rank').astype('int8')

In [249]:
# Bestsellers of week 95
sales.loc[95]

article_id
760084003     1
866731001     2
600886001     3
706016001     4
372860002     5
610776002     6
877278002     7
547780003     8
817354001     9
827968001    10
866731003    11
866383006    12
Name: bestseller_rank, dtype: int8

In [250]:
# Merge bestsellers and mean_price of products
bestsellers_previous_week = pd.merge(sales, mean_price, on=['week', 'article_id']).reset_index()
bestsellers_previous_week.week += 1

In [251]:
# Get the transactions in week 95
unique_transactions = transactions \
    .groupby(['week', 'customer_id']) \
    .head(1) \
    .drop(columns=['article_id', 'price']) \
    .copy()
print(hash_dataframe(unique_transactions))


f6697294304f661350e744c6c0822da2ee24edd670d728ee1c3f14aa78929146


In [252]:
# Remove duplicates where the customer has purchased more than one item in the same week
transactions.drop_duplicates(['week', 'customer_id'])
print(hash_dataframe(transactions))

4ef150200c2c4cced13205b6b09b3e050d3384c7b19d58e75ae0d0e2b96fd9d5


In [196]:
# Merge bestsellers of week 96 with transactions of week 95
candidates_bestsellers = pd.merge(
    unique_transactions,
    bestsellers_previous_week,
    on='week',
)
print(hash_dataframe(candidates_bestsellers))


debdba81ee6e49784321699817202694d49705d01c553a06a21e6daf21ad6370


In [197]:
# Keep only one transaction per customer
test_set_transactions = unique_transactions.drop_duplicates('customer_id').reset_index(drop=True)
test_set_transactions.week = test_week
print(hash_dataframe(test_set_transactions))


a1b3911927771e69692a52d0fdcd52cd31575b4621c7265fc9023e318871d852


In [198]:
test_set_transactions

Unnamed: 0,t_dat,customer_id,sales_channel_id,week
0,2020-07-15,272412481300040,1,105
1,2020-07-15,1456826891333599,1,105
2,2020-07-15,2133687643102426,2,105
3,2020-07-15,6010692573790711,1,105
4,2020-07-15,6171059100114610,2,105
...,...,...,...,...
437360,2020-09-22,18410229429441241008,2,105
437361,2020-09-22,18417769707947924979,2,105
437362,2020-09-22,18418054986721795659,2,105
437363,2020-09-22,18421175435799911749,2,105


In [199]:
# Join the rankings of all top products for the test week for each customer
candidates_bestsellers_test_week = pd.merge(
    test_set_transactions,
    bestsellers_previous_week,
    on='week'
)
print(hash_dataframe(candidates_bestsellers_test_week))


80b74a00ca6cdab8ed511bcf179164d8410b1f365b63255f25daf501e042689a


In [200]:
# Join the bestsellers with the bestsellers of the test week generating the complete candidate set
candidates_bestsellers = pd.concat([candidates_bestsellers, candidates_bestsellers_test_week])
candidates_bestsellers.drop(columns='bestseller_rank', inplace=True)
print(hash_dataframe(candidates_bestsellers))


f776275a94b1605aaf13b8ebb67cc98133e6539d84b638a59c09dc003a8ea9e5


# Combining transactions and candidates / negative examples

In [201]:
transactions['purchased'] = 1

In [202]:
# Full candidate set consists of all transactions (bought items), the last bought item of each customer and the bestsellers of each week
data = pd.concat([transactions, candidates_last_purchase, candidates_bestsellers])
data.purchased.fillna(0, inplace=True)
print(hash_dataframe(data))


9f00f26bc2c67bae702c4a6102293fe00e8db3ce51a315037f8727fb05008a07


In [203]:
data.drop_duplicates(['customer_id', 'article_id', 'week'], inplace=True)
print(hash_dataframe(data))


4f66379dabb63dd6fb2b3f976942bc9429eadca9edc512d3d22fc8ec00ad5752


In [204]:
# 13.6% of the data consists of positive examples
data.purchased.mean()

0.13607582749165664

### Add bestseller information

In [205]:
data = pd.merge(
    data,
    bestsellers_previous_week[['week', 'article_id', 'bestseller_rank']],
    on=['week', 'article_id'],
    how='left'
)
print(hash_dataframe(data))


f3fea7fef5b6f1c871aabdf6a6ab8973f697f971ac06d1498a3ab90d2ded2ff0


In [206]:
data = data[data.week != data.week.min()]
data.bestseller_rank.fillna(999, inplace=True)

In [207]:
data = pd.merge(data, articles, on='article_id', how='left')
data = pd.merge(data, customers, on='customer_id', how='left')

In [208]:
data.sort_values(['week', 'customer_id'], inplace=True)
data.reset_index(drop=True, inplace=True)

In [209]:
data

Unnamed: 0,t_dat,customer_id,article_id,price,sales_channel_id,week,purchased,bestseller_rank,product_code,prod_name,...,section_name,garment_group_no,garment_group_name,detail_desc,FN,Active,club_member_status,fashion_news_frequency,age,postal_code
0,2020-07-26,28847241659200,887770001,0.016932,1,96,1.0,999.0,887770,727,...,10,1010,6,3692,1,1,0,1,21,57896
1,2020-07-18,28847241659200,762846001,0.025407,1,96,0.0,999.0,762846,472,...,7,1010,6,492,1,1,0,1,21,57896
2,2020-07-18,28847241659200,829308001,0.033881,1,96,0.0,999.0,829308,11402,...,21,1005,0,9082,1,1,0,1,21,57896
3,2020-07-26,28847241659200,760084003,0.025094,1,96,0.0,1.0,760084,1134,...,1,1009,5,847,1,1,0,1,21,57896
4,2020-07-26,28847241659200,866731001,0.024919,1,96,0.0,2.0,866731,3609,...,21,1005,0,3130,1,1,0,1,21,57896
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
17991757,2020-09-21,18446737527580148316,915529003,0.033439,2,105,0.0,8.0,915529,7046,...,0,1003,3,10909,1,1,0,1,60,96323
17991758,2020-09-21,18446737527580148316,915529005,0.033417,2,105,0.0,9.0,915529,7046,...,0,1003,3,10909,1,1,0,1,60,96323
17991759,2020-09-21,18446737527580148316,448509014,0.041630,2,105,0.0,10.0,448509,259,...,1,1009,5,255,1,1,0,1,60,96323
17991760,2020-09-21,18446737527580148316,762846027,0.025005,2,105,0.0,11.0,762846,472,...,7,1010,6,492,1,1,0,1,60,96323


In [210]:
transactions

Unnamed: 0,t_dat,customer_id,article_id,price,sales_channel_id,week,purchased
29030503,2020-07-15,272412481300040,778064028,0.008458,1,95,1
29030504,2020-07-15,272412481300040,816592008,0.016932,1,95,1
29030505,2020-07-15,272412481300040,621381021,0.033881,1,95,1
29030506,2020-07-15,272412481300040,817477003,0.025407,1,95,1
29030507,2020-07-15,272412481300040,899088002,0.025407,1,95,1
...,...,...,...,...,...,...,...
31774722,2020-09-22,18439937050817258297,891591003,0.084729,2,104,1
31774723,2020-09-22,18439937050817258297,869706005,0.084729,2,104,1
31779097,2020-09-22,18440902715633436014,918894002,0.016932,1,104,1
31779098,2020-09-22,18440902715633436014,761269001,0.016932,1,104,1


In [211]:
train = data[data.week != test_week]
test = data[data.week==test_week].drop_duplicates(['customer_id', 'article_id', 'sales_channel_id']).copy()

In [212]:
# Train baskets
train_baskets = train.groupby(['week', 'customer_id'])['article_id'].count().values

In [213]:
columns_to_use = ['article_id', 'product_type_no', 'graphical_appearance_no', 'colour_group_code', 'perceived_colour_value_id',
'perceived_colour_master_id', 'department_no', 'index_code',
'index_group_no', 'section_no', 'garment_group_no', 'FN', 'Active',
'club_member_status', 'fashion_news_frequency', 'age', 'postal_code', 'bestseller_rank']

In [214]:
%%time

train_X = train[columns_to_use]
train_y = train['purchased']

test_X = test[columns_to_use]

CPU times: total: 344 ms
Wall time: 523 ms


# Model training

In [215]:
from lightgbm.sklearn import LGBMRanker

In [216]:
ranker = LGBMRanker(
    objective="lambdarank",
    metric="ndcg",
    boosting_type="dart",
    n_estimators=1,
    importance_type='gain',
    verbose=10
)

In [217]:
%%time

ranker = ranker.fit(
    train_X,
    train_y,
    group=train_baskets,
)

[LightGBM] [Debug] Dataset::GetMultiBinFromSparseFeatures: sparse rate 0.848850
[LightGBM] [Debug] Dataset::GetMultiBinFromAllFeatures: sparse rate 0.153099
[LightGBM] [Debug] init for col-wise cost 0.150212 seconds, init for row-wise cost 0.229178 seconds
You can set `force_row_wise=true` to remove the overhead.
And if memory is not enough, you can set `force_col_wise=true`.
[LightGBM] [Debug] Using Dense Multi-Val Bin
[LightGBM] [Info] Total Bins 1079
[LightGBM] [Info] Number of data points in the train set: 11381612, number of used features: 18
[LightGBM] [Debug] Trained a tree with leaves = 31 and depth = 12
CPU times: total: 37.1 s
Wall time: 3.66 s


In [218]:
for i in ranker.feature_importances_.argsort()[::-1]:
    print(columns_to_use[i], ranker.feature_importances_[i]/ranker.feature_importances_.sum())

bestseller_rank 0.9989805519216203
age 0.00024136038957903926
article_id 0.00017160828400263902
garment_group_no 0.0001448188543340445
department_no 9.637421875769266e-05
product_type_no 9.014783292439592e-05
section_no 7.067204716548531e-05
postal_code 6.792197441369627e-05
club_member_status 6.519780240033951e-05
colour_group_code 5.358754121027148e-05
perceived_colour_value_id 1.775913359216025e-05
fashion_news_frequency 0.0
Active 0.0
FN 0.0
index_code 0.0
perceived_colour_master_id 0.0
graphical_appearance_no 0.0
index_group_no 0.0


# Calculate predictions

In [219]:
%time

test['preds'] = ranker.predict(test_X)

c_id2predicted_article_ids = test \
    .sort_values(['customer_id', 'preds'], ascending=False) \
    .groupby('customer_id')['article_id'].apply(list).to_dict()

bestsellers_last_week = \
    bestsellers_previous_week[bestsellers_previous_week.week == bestsellers_previous_week.week.max()]['article_id'].tolist()

CPU times: total: 0 ns
Wall time: 0 ns


In [220]:
bestsellers_last_week

[924243001,
 924243002,
 918522001,
 923758001,
 866731001,
 909370001,
 751471001,
 915529003,
 915529005,
 448509014,
 762846027,
 714790020]

# Create submission

In [221]:
sub = pd.read_csv('../../data/submissions/sample_submission.csv')

In [222]:
c_id2predicted_article_ids

{28847241659200: [925246001,
  924243001,
  924243002,
  918522001,
  915529003,
  915529005,
  866731001,
  909370001,
  751471001,
  448509014,
  714790020,
  923758001,
  762846027],
 41318098387474: [868879003,
  924243002,
  918522001,
  924243001,
  915529003,
  915529005,
  866731001,
  909370001,
  751471001,
  448509014,
  714790020,
  923758001,
  762846027],
 116809474287335: [906305002,
  924243001,
  924243002,
  918522001,
  915529003,
  915529005,
  866731001,
  909370001,
  751471001,
  448509014,
  714790020,
  923758001,
  762846027],
 200292573348128: [903861001,
  924243001,
  924243002,
  918522001,
  915529003,
  915529005,
  866731001,
  909370001,
  751471001,
  448509014,
  714790020,
  923758001,
  762846027],
 248294615847351: [720504008,
  337991001,
  878987003,
  471714002,
  924243001,
  924243002,
  918522001,
  915529003,
  915529005,
  866731001,
  909370001,
  751471001,
  448509014,
  714790020,
  923758001,
  762846027],
 272412481300040: [923460002

In [223]:
%%time
preds = []
for c_id in customer_hex_id_to_int(sub.customer_id):
    pred = c_id2predicted_article_ids.get(c_id, [])
    pred = pred + bestsellers_last_week
    preds.append(pred[:12])

CPU times: total: 3.59 s
Wall time: 3.68 s


In [224]:
preds = [' '.join(['0' + str(p) for p in ps]) for ps in preds]
sub.prediction = preds

In [225]:
sub

Unnamed: 0,customer_id,prediction
0,00000dbacae5abe5e23885899a1fa44253a17956c6d1c3...,0568601043 0924243001 0924243002 0918522001 09...
1,0000423b00ade91418cceaf3b26c6af3dd342b51fd051e...,0924243001 0924243002 0918522001 0923758001 08...
2,000058a12d5b43e67d225668fa1f8d618c13dc232df0ca...,0794321007 0924243001 0924243002 0918522001 09...
3,00005ca1c9ed5f5146b52ac8639a40ca9d57aeff4d1bd2...,0924243001 0924243002 0918522001 0923758001 08...
4,00006413d8573cd20ed7128e53b7b13819fe5cfc2d801f...,0927530004 0730683050 0791587015 0896152002 09...
...,...,...
1371975,ffffbbf78b6eaac697a8a5dfbfd2bfa8113ee5b403e474...,0713997002 0720125039 0557599022 0740922009 07...
1371976,ffffcd5046a6143d29a04fb8c424ce494a76e5cdf4fab5...,0762846031 0924243001 0924243002 0918522001 09...
1371977,ffffcf35913a0bee60e8741cb2b4e78b8a98ee5ff2e6a1...,0924243001 0924243002 0918522001 0915529003 09...
1371978,ffffd7744cebcf3aca44ae7049d2a94b87074c3d4ffe38...,0924243001 0924243002 0918522001 0923758001 08...


In [226]:
sub_name = 'basic_model_submission'
sub.to_csv(f'{sub_name}.csv.gz', index=False)