In [1]:
import numpy as np

def apk(actual, predicted, k=10):
    """
    Computes the average precision at k.

    This function computes the average prescision at k between two lists of
    items.

    Parameters
    ----------
    actual : list
             A list of elements that are to be predicted (order doesn't matter)
    predicted : list
                A list of predicted elements (order does matter)
    k : int, optional
        The maximum number of predicted elements

    Returns
    -------
    score : double
            The average precision at k over the input lists

    """
    if len(predicted)>k:
        predicted = predicted[:k]

    score = 0.0
    num_hits = 0.0

    for i,p in enumerate(predicted):
        if p in actual and p not in predicted[:i]:
            num_hits += 1.0
            score += num_hits / (i+1.0)

    if not actual:
        return 0.0

    return score / min(len(actual), k)

def mapk(actual, predicted, k=10):
    """
    Computes the mean average precision at k.

    This function computes the mean average prescision at k between two lists
    of lists of items.

    Parameters
    ----------
    actual : list
             A list of lists of elements that are to be predicted 
             (order doesn't matter in the lists)
    predicted : list
                A list of lists of predicted elements
                (order matters in the lists)
    k : int, optional
        The maximum number of predicted elements

    Returns
    -------
    score : double
            The mean average precision at k over the input lists

    """
    return np.mean([apk(a,p,k) for a,p in zip(actual, predicted)])

In [2]:
from sklearn.base import BaseEstimator, TransformerMixin
import numpy as np

# https://www.kaggle.com/c/h-and-m-personalized-fashion-recommendations/discussion/308635
def customer_hex_id_to_int(series):
    return series.str[-16:].apply(hex_id_to_int)

def hex_id_to_int(str):
    return int(str[-16:], 16)

def article_id_str_to_int(series):
    return series.astype('int32')

def article_id_int_to_str(series):
    return '0' + series.astype('str')

class Categorize(BaseEstimator, TransformerMixin):
    def __init__(self, min_examples=0):
        self.min_examples = min_examples
        self.categories = []
        
    def fit(self, X):
        for i in range(X.shape[1]):
            vc = X.iloc[:, i].value_counts()
            self.categories.append(vc[vc > self.min_examples].index.tolist())
        return self

    def transform(self, X):
        data = {X.columns[i]: pd.Categorical(X.iloc[:, i], categories=self.categories[i]).codes for i in range(X.shape[1])}
        return pd.DataFrame(data=data)


def calculate_apk(list_of_preds, list_of_gts):
    # for fast validation this can be changed to operate on dicts of {'cust_id_int': [art_id_int, ...]}
    # using 'data/val_week_purchases_by_cust.pkl'
    apks = []
    for preds, gt in zip(list_of_preds, list_of_gts):
        apks.append(apk(gt, preds, k=12))
    return np.mean(apks)

def eval_sub(sub_csv, skip_cust_with_no_purchases=True):
    sub=pd.read_csv(sub_csv)
    validation_set=pd.read_parquet('data/validation_ground_truth.parquet')

    apks = []

    no_purchases_pattern = []
    for pred, gt in zip(sub.prediction.str.split(), validation_set.prediction.str.split()):
        if skip_cust_with_no_purchases and (gt == no_purchases_pattern): continue
        apks.append(apk(gt, pred, k=12))
    return np.mean(apks)

In [3]:
import pandas as pd

In [4]:
%%time

transactions = pd.read_parquet('transactions_train.parquet')
customers = pd.read_parquet('customers.parquet')
articles = pd.read_parquet('articles.parquet')

CPU times: user 5.42 s, sys: 2.3 s, total: 7.72 s
Wall time: 4.56 s


# Generating candidates

### Last purchase candidates

In [8]:
test_week = transactions.week.max() + 1
transactions = transactions[transactions.week > transactions.week.max() - 10]

In [9]:
%%time

c2weeks = transactions.groupby('customer_id')['week'].unique()

CPU times: user 7.4 s, sys: 105 ms, total: 7.51 s
Wall time: 7.5 s


In [10]:
transactions.groupby('week')['t_dat'].agg(['min', 'max'])

Unnamed: 0_level_0,min,max
week,Unnamed: 1_level_1,Unnamed: 2_level_1
98,2020-07-19,2020-07-25
99,2020-07-26,2020-08-01
100,2020-08-02,2020-08-08
101,2020-08-09,2020-08-15
102,2020-08-16,2020-08-22
103,2020-08-23,2020-08-29
104,2020-08-30,2020-09-05
105,2020-09-06,2020-09-12
106,2020-09-13,2020-09-19
107,2020-09-20,2020-09-22


In [11]:
c2weeks

customer_id
00000dbacae5abe5e23885899a1fa44253a17956c6d1c3d25f88aa139fdfc657         [104]
000058a12d5b43e67d225668fa1f8d618c13dc232df0cad8ffe7ad4a1091e318         [106]
00006413d8573cd20ed7128e53b7b13819fe5cfc2d801fe7fc0f26dd8d65a85a         [101]
0000757967448a6cb83efb3ea7a3fb9d418ac7adf2379d8cd0c725276a467a2a         [106]
0000945f66de1a11d9447609b8b41b1bc987ba185a5496ae8831e8493afa24ff          [98]
                                                                       ...    
ffff12aa623c69eae8959d673f1f12ad0194ad760d77fd489cd7c5a4aa9ae240    [100, 103]
ffff61677073258d461e043cc9ed4ed97be5617a920640ff61024f4619bf41c4     [98, 102]
ffffbbf78b6eaac697a8a5dfbfd2bfa8113ee5b403e4747568cac33e8c541831    [101, 105]
ffffcd5046a6143d29a04fb8c424ce494a76e5cdf4fab53481233731b5c4f8b7    [100, 102]
ffffcf35913a0bee60e8741cb2b4e78b8a98ee5ff2e6a1778d0116cffd259264         [105]
Name: week, Length: 420641, dtype: object

In [12]:
%%time

c2weeks2shifted_weeks = {}

for c_id, weeks in c2weeks.items():
    c2weeks2shifted_weeks[c_id] = {}
    for i in range(weeks.shape[0]-1):
        c2weeks2shifted_weeks[c_id][weeks[i]] = weeks[i+1]
    c2weeks2shifted_weeks[c_id][weeks[-1]] = test_week

CPU times: user 309 ms, sys: 18.2 ms, total: 327 ms
Wall time: 327 ms


In [13]:
# c2weeks2shifted_weeks[28847241659200]

In [14]:
candidates_last_purchase = transactions.copy()

In [15]:
%%time

weeks = []
for i, (c_id, week) in enumerate(zip(transactions['customer_id'], transactions['week'])):
    weeks.append(c2weeks2shifted_weeks[c_id][week])
    
candidates_last_purchase.week=weeks

CPU times: user 1.58 s, sys: 38.5 ms, total: 1.62 s
Wall time: 1.65 s


In [16]:
candidates_last_purchase[candidates_last_purchase['customer_id']==272412481300040]

Unnamed: 0,t_dat,customer_id,article_id,price,sales_channel_id,Day,Month,Year,total_purchases,most_recent_purchase,recency,week


In [17]:
transactions[transactions['customer_id']==272412481300040]

Unnamed: 0,t_dat,customer_id,article_id,price,sales_channel_id,Day,Month,Year,total_purchases,most_recent_purchase,recency,week


### Bestsellers candidates

In [18]:
mean_price = transactions \
    .groupby(['week', 'article_id'])['price'].mean()

In [19]:
mean_price

week  article_id
98    108775015     0.003842
      108775044     0.008320
      110065002     0.006085
      111565001     0.003868
      111586001     0.011713
                      ...   
107   952267001     0.014992
      952938001     0.050831
      953450001     0.016932
      953763001     0.021701
      956217002     0.058831
Name: price, Length: 189610, dtype: float64

In [20]:
sales = transactions \
    .groupby('week')['article_id'].value_counts() \
    .groupby('week').rank(method='dense', ascending=False) \
    .groupby('week').head(12).rename('bestseller_rank').astype('int8')

In [21]:
sales

week  article_id
98    760084003      1
      827968001      2
      610776002      3
      706016001      4
      372860002      5
                    ..
107   930380001      8
      915529003      9
      923758001     10
      928206001     11
      915529005     12
Name: bestseller_rank, Length: 120, dtype: int8

In [25]:
#sales.loc[95]

In [26]:
bestsellers_previous_week = pd.merge(sales, mean_price, on=['week', 'article_id']).reset_index()
bestsellers_previous_week.week += 1

In [27]:
bestsellers_previous_week.pipe(lambda df: df[df['week']==96])

Unnamed: 0,week,article_id,bestseller_rank,price


In [28]:
unique_transactions = transactions \
    .groupby(['week', 'customer_id']) \
    .head(1) \
    .drop(columns=['article_id', 'price']) \
    .copy()

In [29]:
unique_transactions

Unnamed: 0,t_dat,customer_id,sales_channel_id,Day,Month,Year,total_purchases,most_recent_purchase,recency,week
29202636,2020-07-19,0000f1c71aafe5963c3d195cf273f7bfd50bbf17761c91...,2,19,7,2020,136,2020-07-30,55,98
29202644,2020-07-19,0015e1028e70b159e9bee92455c4782e72fb4ceb3fccb6...,1,19,7,2020,63,2020-07-19,66,98
29202648,2020-07-19,00190ae87b184c5a8886a6024c043275e883636c238fad...,1,19,7,2020,23,2020-08-25,29,98
29202650,2020-07-19,002b985c40b9ab3cd56bccc18e3c1d0a1242a172eb7a69...,1,19,7,2020,77,2020-09-11,12,98
29202655,2020-07-19,002ef9a63f8c49c145b59dbccc5dc540891869cc1c1f03...,2,19,7,2020,24,2020-07-19,66,98
...,...,...,...,...,...,...,...,...,...,...
31788303,2020-09-22,ffcba92c45455b22f54dd02303b057c0a4b9b1f4fcfdf2...,2,22,9,2020,192,2020-09-22,1,107
31788310,2020-09-22,ffd4cf2217de4a0a3f9f610cdec334c803692a18af08ac...,2,22,9,2020,421,2020-09-22,1,107
31788321,2020-09-22,fff380805474b287b05cb2a7507b9a013482f7dd0bce0e...,1,22,9,2020,15,2020-09-22,1,107
31788322,2020-09-22,fff4d3a8b1f3b60af93e78c30a7cb4cf75edaf2590d3e5...,1,22,9,2020,79,2020-09-22,1,107


In [30]:
transactions.drop_duplicates(['week', 'customer_id'])

Unnamed: 0,t_dat,customer_id,article_id,price,sales_channel_id,Day,Month,Year,total_purchases,most_recent_purchase,recency,week
29202636,2020-07-19,0000f1c71aafe5963c3d195cf273f7bfd50bbf17761c91...,917434002,0.011847,2,19,7,2020,136,2020-07-30,55,98
29202644,2020-07-19,0015e1028e70b159e9bee92455c4782e72fb4ceb3fccb6...,879248008,0.014390,1,19,7,2020,63,2020-07-19,66,98
29202648,2020-07-19,00190ae87b184c5a8886a6024c043275e883636c238fad...,823168008,0.009051,1,19,7,2020,23,2020-08-25,29,98
29202650,2020-07-19,002b985c40b9ab3cd56bccc18e3c1d0a1242a172eb7a69...,867044004,0.033881,1,19,7,2020,77,2020-09-11,12,98
29202655,2020-07-19,002ef9a63f8c49c145b59dbccc5dc540891869cc1c1f03...,560208003,0.012186,2,19,7,2020,24,2020-07-19,66,98
...,...,...,...,...,...,...,...,...,...,...,...,...
31788303,2020-09-22,ffcba92c45455b22f54dd02303b057c0a4b9b1f4fcfdf2...,919273002,0.042356,2,22,9,2020,192,2020-09-22,1,107
31788310,2020-09-22,ffd4cf2217de4a0a3f9f610cdec334c803692a18af08ac...,896169005,0.050831,2,22,9,2020,421,2020-09-22,1,107
31788321,2020-09-22,fff380805474b287b05cb2a7507b9a013482f7dd0bce0e...,918325001,0.043203,1,22,9,2020,15,2020-09-22,1,107
31788322,2020-09-22,fff4d3a8b1f3b60af93e78c30a7cb4cf75edaf2590d3e5...,833459002,0.006763,1,22,9,2020,79,2020-09-22,1,107


In [31]:
candidates_bestsellers = pd.merge(
    unique_transactions,
    bestsellers_previous_week,
    on='week',
)

In [32]:
test_set_transactions = unique_transactions.drop_duplicates('customer_id').reset_index(drop=True)
test_set_transactions.week = test_week

In [33]:
test_set_transactions

Unnamed: 0,t_dat,customer_id,sales_channel_id,Day,Month,Year,total_purchases,most_recent_purchase,recency,week
0,2020-07-19,0000f1c71aafe5963c3d195cf273f7bfd50bbf17761c91...,2,19,7,2020,136,2020-07-30,55,108
1,2020-07-19,0015e1028e70b159e9bee92455c4782e72fb4ceb3fccb6...,1,19,7,2020,63,2020-07-19,66,108
2,2020-07-19,00190ae87b184c5a8886a6024c043275e883636c238fad...,1,19,7,2020,23,2020-08-25,29,108
3,2020-07-19,002b985c40b9ab3cd56bccc18e3c1d0a1242a172eb7a69...,1,19,7,2020,77,2020-09-11,12,108
4,2020-07-19,002ef9a63f8c49c145b59dbccc5dc540891869cc1c1f03...,2,19,7,2020,24,2020-07-19,66,108
...,...,...,...,...,...,...,...,...,...,...
420636,2020-09-22,ff5b8a8b26bf93a66290e9bd1b73393ac6a58968a78519...,2,22,9,2020,3,2020-09-22,1,108
420637,2020-09-22,ff6f55a51af284b71dcd264396b299e548f968c1769e71...,2,22,9,2020,10,2020-09-22,1,108
420638,2020-09-22,ff732b98019465f078144bae647d7d9765d76e3431ef92...,2,22,9,2020,10,2020-09-22,1,108
420639,2020-09-22,ffb72741f3bc3d98855703b55d34e05bc7893a5d6a99a3...,2,22,9,2020,23,2020-09-22,1,108


In [34]:
candidates_bestsellers_test_week = pd.merge(
    test_set_transactions,
    bestsellers_previous_week,
    on='week'
)

In [35]:
candidates_bestsellers = pd.concat([candidates_bestsellers, candidates_bestsellers_test_week])
candidates_bestsellers.drop(columns='bestseller_rank', inplace=True)

In [36]:
candidates_bestsellers

Unnamed: 0,t_dat,customer_id,sales_channel_id,Day,Month,Year,total_purchases,most_recent_purchase,recency,week,article_id,price
0,2020-07-26,001968eb5b52125225a9bbca15d0d4e8bd8fd1d6874b76...,1,26,7,2020,31,2020-07-26,59,99,760084003,0.025092
1,2020-07-26,001968eb5b52125225a9bbca15d0d4e8bd8fd1d6874b76...,1,26,7,2020,31,2020-07-26,59,99,827968001,0.016565
2,2020-07-26,001968eb5b52125225a9bbca15d0d4e8bd8fd1d6874b76...,1,26,7,2020,31,2020-07-26,59,99,610776002,0.008302
3,2020-07-26,001968eb5b52125225a9bbca15d0d4e8bd8fd1d6874b76...,1,26,7,2020,31,2020-07-26,59,99,706016001,0.033168
4,2020-07-26,001968eb5b52125225a9bbca15d0d4e8bd8fd1d6874b76...,1,26,7,2020,31,2020-07-26,59,99,372860002,0.013235
...,...,...,...,...,...,...,...,...,...,...,...,...
5047687,2020-09-22,ffc92c3f7b0b302f393c2968b290f6e5c5b5510d1cf1df...,2,22,9,2020,9,2020-09-22,1,108,930380001,0.033645
5047688,2020-09-22,ffc92c3f7b0b302f393c2968b290f6e5c5b5510d1cf1df...,2,22,9,2020,9,2020-09-22,1,108,915529003,0.033479
5047689,2020-09-22,ffc92c3f7b0b302f393c2968b290f6e5c5b5510d1cf1df...,2,22,9,2020,9,2020-09-22,1,108,923758001,0.033418
5047690,2020-09-22,ffc92c3f7b0b302f393c2968b290f6e5c5b5510d1cf1df...,2,22,9,2020,9,2020-09-22,1,108,928206001,0.033473


# Combining transactions and candidates / negative examples

In [37]:
transactions['purchased'] = 1

In [38]:
data = pd.concat([transactions, candidates_last_purchase, candidates_bestsellers])
data.purchased.fillna(0, inplace=True)

In [39]:
data

Unnamed: 0,t_dat,customer_id,article_id,price,sales_channel_id,Day,Month,Year,total_purchases,most_recent_purchase,recency,week,purchased
29202636,2020-07-19,0000f1c71aafe5963c3d195cf273f7bfd50bbf17761c91...,917434002,0.011847,2,19,7,2020,136,2020-07-30,55,98,1.0
29202637,2020-07-19,0000f1c71aafe5963c3d195cf273f7bfd50bbf17761c91...,685814048,0.033881,2,19,7,2020,136,2020-07-30,55,98,1.0
29202638,2020-07-19,0000f1c71aafe5963c3d195cf273f7bfd50bbf17761c91...,895418003,0.033881,2,19,7,2020,136,2020-07-30,55,98,1.0
29202639,2020-07-19,0000f1c71aafe5963c3d195cf273f7bfd50bbf17761c91...,875469002,0.016932,2,19,7,2020,136,2020-07-30,55,98,1.0
29202640,2020-07-19,0000f1c71aafe5963c3d195cf273f7bfd50bbf17761c91...,835851001,0.042356,2,19,7,2020,136,2020-07-30,55,98,1.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...
5047687,2020-09-22,ffc92c3f7b0b302f393c2968b290f6e5c5b5510d1cf1df...,930380001,0.033645,2,22,9,2020,9,2020-09-22,1,108,0.0
5047688,2020-09-22,ffc92c3f7b0b302f393c2968b290f6e5c5b5510d1cf1df...,915529003,0.033479,2,22,9,2020,9,2020-09-22,1,108,0.0
5047689,2020-09-22,ffc92c3f7b0b302f393c2968b290f6e5c5b5510d1cf1df...,923758001,0.033418,2,22,9,2020,9,2020-09-22,1,108,0.0
5047690,2020-09-22,ffc92c3f7b0b302f393c2968b290f6e5c5b5510d1cf1df...,928206001,0.033473,2,22,9,2020,9,2020-09-22,1,108,0.0


In [40]:
data.drop_duplicates(['customer_id', 'article_id', 'week'], inplace=True)

In [41]:
data.purchased.mean()

0.13536595818870187

### Add bestseller information

In [42]:
data = pd.merge(
    data,
    bestsellers_previous_week[['week', 'article_id', 'bestseller_rank']],
    on=['week', 'article_id'],
    how='left'
)

In [43]:
data = data[data.week != data.week.min()]
data.bestseller_rank.fillna(999, inplace=True)

In [44]:
data = pd.merge(data, articles, on='article_id', how='left')
data = pd.merge(data, customers, on='customer_id', how='left')

In [45]:
data.sort_values(['week', 'customer_id'], inplace=True)
data.reset_index(drop=True, inplace=True)

In [46]:
train = data[data.week != test_week]
test = data[data.week==test_week].drop_duplicates(['customer_id', 'article_id', 'sales_channel_id']).copy()

In [47]:
train_baskets = train.groupby(['week', 'customer_id'])['article_id'].count().values

In [53]:
columns_to_use = ['article_id', 'product_type_no', 'graphical_appearance_no', 'colour_group_code', 'perceived_colour_value_id',
'perceived_colour_master_id', 'department_no', 'index_code',
'index_group_no', 'section_no', 'garment_group_no', 'FN', 'Active',
'club_member_status', 'fashion_news_frequency', 'age', 'postal_code', 'bestseller_rank']

In [54]:
%%time

train_X = train[columns_to_use]
train_y = train['purchased']

test_X = test[columns_to_use]

CPU times: user 278 ms, sys: 323 ms, total: 601 ms
Wall time: 606 ms


# Model training

In [55]:
from lightgbm.sklearn import LGBMRanker

In [56]:
ranker = LGBMRanker(
    objective="lambdarank",
    metric="ndcg",
    boosting_type="dart",
    n_estimators=1,
    importance_type='gain',
    verbose=10
)

In [57]:
%%time

ranker = ranker.fit(
    train_X,
    train_y,
    group=train_baskets,
)

ValueError: pandas dtypes must be int, float or bool.
Fields with bad pandas dtypes: index_code: object

In [None]:
for i in ranker.feature_importances_.argsort()[::-1]:
    print(columns_to_use[i], ranker.feature_importances_[i]/ranker.feature_importances_.sum())

# Calculate predictions

In [None]:
%time

test['preds'] = ranker.predict(test_X)

c_id2predicted_article_ids = test \
    .sort_values(['customer_id', 'preds'], ascending=False) \
    .groupby('customer_id')['article_id'].apply(list).to_dict()

bestsellers_last_week = \
    bestsellers_previous_week[bestsellers_previous_week.week == bestsellers_previous_week.week.max()]['article_id'].tolist()

# Create submission

In [None]:
sub = pd.read_csv('../datasets/sample_submission.csv')

In [None]:
%%time
preds = []
for c_id in customer_hex_id_to_int(sub.customer_id):
    pred = c_id2predicted_article_ids.get(c_id, [])
    pred = pred + bestsellers_last_week
    preds.append(pred[:12])

In [None]:
preds = [' '.join(['0' + str(p) for p in ps]) for ps in preds]
sub.prediction = preds

In [None]:
sub_name = 'basic_model_submission3'
sub.to_csv(f'{sub_name}.csv.gz', index=False)