Radek posted about this [here](https://www.kaggle.com/competitions/h-and-m-personalized-fashion-recommendations/discussion/309220), and linked to a GitHub repo with the code.

I just transferred that code here to Kaggle notebooks, that's all.

In [1]:
from sklearn.base import BaseEstimator, TransformerMixin
import numpy as np

# https://www.kaggle.com/c/h-and-m-personalized-fashion-recommendations/discussion/308635
def customer_hex_id_to_int(series):
    return series.str[-16:].apply(hex_id_to_int)

def hex_id_to_int(str):
    return int(str[-16:], 16)

def article_id_str_to_int(series):
    return series.astype('int32')

def article_id_int_to_str(series):
    return '0' + series.astype('str')

class Categorize(BaseEstimator, TransformerMixin):
    def __init__(self, min_examples=0):
        self.min_examples = min_examples
        self.categories = []
        
    def fit(self, X):
        for i in range(X.shape[1]):
            vc = X.iloc[:, i].value_counts()
            self.categories.append(vc[vc > self.min_examples].index.tolist())
        return self

    def transform(self, X):
        data = {X.columns[i]: pd.Categorical(X.iloc[:, i], categories=self.categories[i]).codes for i in range(X.shape[1])}
        return pd.DataFrame(data=data)


def calculate_apk(list_of_preds, list_of_gts):
    # for fast validation this can be changed to operate on dicts of {'cust_id_int': [art_id_int, ...]}
    # using 'data/val_week_purchases_by_cust.pkl'
    apks = []
    for preds, gt in zip(list_of_preds, list_of_gts):
        apks.append(apk(gt, preds, k=12))
    return np.mean(apks)

def eval_sub(sub_csv, skip_cust_with_no_purchases=True):
    sub=pd.read_csv(sub_csv)
    validation_set=pd.read_parquet('data/validation_ground_truth.parquet')

    apks = []

    no_purchases_pattern = []
    for pred, gt in zip(sub.prediction.str.split(), validation_set.prediction.str.split()):
        if skip_cust_with_no_purchases and (gt == no_purchases_pattern): continue
        apks.append(apk(gt, pred, k=12))
    return np.mean(apks)

In [2]:
import pandas as pd

In [3]:
%%time

transactions = pd.read_parquet('../input/warmup/transactions_train.parquet')
customers = pd.read_parquet('../input/warmup/customers.parquet')
articles = pd.read_parquet('../input/warmup/articles.parquet')

# sample = 0.05
# transactions = pd.read_parquet(f'data/transactions_train_sample_{sample}.parquet')
# customers = pd.read_parquet(f'data/customers_sample_{sample}.parquet')
# articles = pd.read_parquet(f'data/articles_train_sample_{sample}.parquet')

CPU times: user 1.47 s, sys: 1.67 s, total: 3.13 s
Wall time: 5.86 s


# Test Week
Change the test week based on what we want to check, if we want to predict -> 'transactions.week.max() + 1'. 
If we want to check the recall (how well it can predict) -> value of the test_week we want to test for

In [4]:
test_week = 104
absolute_max_week = transactions.week.max()
print(test_week)
total_transactions = transactions.copy()
transactions = transactions[(transactions.week > test_week - 11) & (transactions.week < test_week)]
# transactions = transactions[transactions.week < test_week]

104


# Generating candidates

### Last purchase candidates

In [5]:
%%time

c2weeks = transactions.groupby('customer_id')['week'].unique()

CPU times: user 17.9 s, sys: 282 ms, total: 18.2 s
Wall time: 18.1 s


In [6]:
transactions.groupby('week')['t_dat'].agg(['min', 'max'])

Unnamed: 0_level_0,min,max
week,Unnamed: 1_level_1,Unnamed: 2_level_1
94,2020-07-08,2020-07-14
95,2020-07-15,2020-07-21
96,2020-07-22,2020-07-28
97,2020-07-29,2020-08-04
98,2020-08-05,2020-08-11
99,2020-08-12,2020-08-18
100,2020-08-19,2020-08-25
101,2020-08-26,2020-09-01
102,2020-09-02,2020-09-08
103,2020-09-09,2020-09-15


In [7]:
c2weeks

customer_id
28847241659200          [94, 95, 96, 101, 102]
41318098387474                            [98]
116809474287335                     [101, 103]
200292573348128              [95, 96, 99, 102]
208119717816961                           [94]
                                 ...          
18446590778427270109                 [97, 102]
18446624797007271432                      [95]
18446630855572834764                     [103]
18446662237889060501                     [100]
18446705133201055310                     [102]
Name: week, Length: 439368, dtype: object

In [8]:
%%time

c2weeks2shifted_weeks = {}

for c_id, weeks in c2weeks.items():
    c2weeks2shifted_weeks[c_id] = {}
    for i in range(weeks.shape[0]-1):
        c2weeks2shifted_weeks[c_id][weeks[i]] = weeks[i+1]
    c2weeks2shifted_weeks[c_id][weeks[-1]] = test_week

CPU times: user 664 ms, sys: 46 ms, total: 710 ms
Wall time: 710 ms


In [9]:
c2weeks2shifted_weeks[28847241659200]

{94: 95, 95: 96, 96: 101, 101: 102, 102: 104}

In [10]:
candidates_last_purchase = transactions.copy()

In [11]:
%%time

weeks = []
for i, (c_id, week) in enumerate(zip(transactions['customer_id'], transactions['week'])):
    weeks.append(c2weeks2shifted_weeks[c_id][week])
    
candidates_last_purchase.week=weeks

CPU times: user 6.67 s, sys: 28 ms, total: 6.7 s
Wall time: 6.7 s


In [12]:
candidates_last_purchase[candidates_last_purchase['customer_id']==272412481300040]

Unnamed: 0,t_dat,customer_id,article_id,price,sales_channel_id,week
29030503,2020-07-15,272412481300040,778064028,0.008458,1,96
29030504,2020-07-15,272412481300040,816592008,0.016932,1,96
29030505,2020-07-15,272412481300040,621381021,0.033881,1,96
29030506,2020-07-15,272412481300040,817477003,0.025407,1,96
29030507,2020-07-15,272412481300040,899088002,0.025407,1,96
29319533,2020-07-22,272412481300040,885077001,0.008458,1,103
29410772,2020-07-24,272412481300040,850176003,0.029034,2,103
29410773,2020-07-24,272412481300040,875803001,0.064559,2,103
29410774,2020-07-24,272412481300040,892970003,0.020966,2,103
29410775,2020-07-24,272412481300040,854619003,0.020966,2,103


In [13]:
transactions[transactions['customer_id']==272412481300040]

Unnamed: 0,t_dat,customer_id,article_id,price,sales_channel_id,week
29030503,2020-07-15,272412481300040,778064028,0.008458,1,95
29030504,2020-07-15,272412481300040,816592008,0.016932,1,95
29030505,2020-07-15,272412481300040,621381021,0.033881,1,95
29030506,2020-07-15,272412481300040,817477003,0.025407,1,95
29030507,2020-07-15,272412481300040,899088002,0.025407,1,95
29319533,2020-07-22,272412481300040,885077001,0.008458,1,96
29410772,2020-07-24,272412481300040,850176003,0.029034,2,96
29410773,2020-07-24,272412481300040,875803001,0.064559,2,96
29410774,2020-07-24,272412481300040,892970003,0.020966,2,96
29410775,2020-07-24,272412481300040,854619003,0.020966,2,96


### Bestsellers candidates

get the mean of the prices of transactions sorted on the article_id and the week

In [14]:
mean_price = transactions \
    .groupby(['week', 'article_id'])['price'].mean()

In [15]:
mean_price

week  article_id
94    108775044     0.008339
      110065002     0.005508
      111565001     0.005452
      111586001     0.013270
      111593001     0.011669
                      ...   
103   952267001     0.014631
      952938001     0.045746
      953450001     0.016769
      953763001     0.021932
      956217002     0.059203
Name: price, Length: 199492, dtype: float32

make sales which is the ranking of the 12 most bought article_ids in each week

In [16]:
sales = transactions \
    .groupby('week')['article_id'].value_counts() \
    .groupby('week').rank(method='dense', ascending=False) \
    .groupby('week').head(12).rename('bestseller_rank').astype('int8')

In [17]:
sales

week  article_id
94    806388001      1
      730683021      2
      610776002      3
      805308002      4
      866383006      5
                    ..
103   918292001      8
      762846027      9
      809238005     10
      673677002     11
      923758001     12
Name: bestseller_rank, Length: 120, dtype: int8

In [18]:
sales.loc[95]

article_id
760084003     1
866731001     2
600886001     3
706016001     4
372860002     5
610776002     6
877278002     7
547780003     8
817354001     9
827968001    10
866731003    11
866383006    12
Name: bestseller_rank, dtype: int8

bestsellers_previous_week will be a collection for all weeks where the mean price and the rankings are joined. Therefor the dataframe will look like the [week, article] combination with the ranking and then the average price. The week is increased with one.

In [19]:
bestsellers_previous_week = pd.merge(sales, mean_price, on=['week', 'article_id']).reset_index()
bestsellers_previous_week.week += 1

In [20]:
bestsellers_previous_week.pipe(lambda df: df[df['week']==96])

Unnamed: 0,week,article_id,bestseller_rank,price
12,96,760084003,1,0.025094
13,96,866731001,2,0.024919
14,96,600886001,3,0.02298
15,96,706016001,4,0.033197
16,96,372860002,5,0.013193
17,96,610776002,6,0.008318
18,96,877278002,7,0.025036
19,96,547780003,8,0.024814
20,96,817354001,9,0.021913
21,96,827968001,10,0.016436


Unique transactions is a dataframe containing one entry of the [week, customer_id] combination. Here the article_ids are dropped as well as the prices

In [21]:
unique_transactions = transactions \
    .groupby(['week', 'customer_id']) \
    .head(1) \
    .drop(columns=['article_id', 'price']) \
    .copy()

In [22]:
unique_transactions

Unnamed: 0,t_dat,customer_id,sales_channel_id,week
28777300,2020-07-08,857913002275398,1,94
28787123,2020-07-08,1658289241058394,1,94
28788562,2020-07-08,3828854365940846,1,94
28744235,2020-07-08,4195624216542755,1,94
28753719,2020-07-08,4233235614030232,2,94
...,...,...,...,...
31521960,2020-09-15,18439897732908966680,2,103
31531712,2020-09-15,18444276791873187543,2,103
31539937,2020-09-15,18444799607866739422,2,103
31543799,2020-09-15,18446250046654386343,1,103


drop all duplicate [week, customer] baskets from transactions

In [23]:
transactions

Unnamed: 0,t_dat,customer_id,article_id,price,sales_channel_id,week
28777300,2020-07-08,857913002275398,599580068,0.008458,1,94
28777301,2020-07-08,857913002275398,776237011,0.025407,1,94
28777302,2020-07-08,857913002275398,844294001,0.011847,1,94
28787123,2020-07-08,1658289241058394,877773001,0.007610,1,94
28788562,2020-07-08,3828854365940846,507883009,0.013542,1,94
...,...,...,...,...,...,...
31536744,2020-09-15,18446630855572834764,568601045,0.050831,2,103
31536745,2020-09-15,18446630855572834764,568601045,0.050831,2,103
31536746,2020-09-15,18446630855572834764,898713001,0.067780,2,103
31536747,2020-09-15,18446630855572834764,898713001,0.067780,2,103


In [24]:
transactions.drop_duplicates(['week', 'customer_id'])

Unnamed: 0,t_dat,customer_id,article_id,price,sales_channel_id,week
28777300,2020-07-08,857913002275398,599580068,0.008458,1,94
28787123,2020-07-08,1658289241058394,877773001,0.007610,1,94
28788562,2020-07-08,3828854365940846,507883009,0.013542,1,94
28744235,2020-07-08,4195624216542755,817417005,0.022864,1,94
28753719,2020-07-08,4233235614030232,876986001,0.030492,2,94
...,...,...,...,...,...,...
31521960,2020-09-15,18439897732908966680,794321007,0.061000,2,103
31531712,2020-09-15,18444276791873187543,867969008,0.033881,2,103
31539937,2020-09-15,18444799607866739422,909721003,0.042356,2,103
31543799,2020-09-15,18446250046654386343,869872006,0.033881,1,103


In [25]:
transactions

Unnamed: 0,t_dat,customer_id,article_id,price,sales_channel_id,week
28777300,2020-07-08,857913002275398,599580068,0.008458,1,94
28777301,2020-07-08,857913002275398,776237011,0.025407,1,94
28777302,2020-07-08,857913002275398,844294001,0.011847,1,94
28787123,2020-07-08,1658289241058394,877773001,0.007610,1,94
28788562,2020-07-08,3828854365940846,507883009,0.013542,1,94
...,...,...,...,...,...,...
31536744,2020-09-15,18446630855572834764,568601045,0.050831,2,103
31536745,2020-09-15,18446630855572834764,568601045,0.050831,2,103
31536746,2020-09-15,18446630855572834764,898713001,0.067780,2,103
31536747,2020-09-15,18446630855572834764,898713001,0.067780,2,103


bestsellers for each customer from revious week for all the weeks that are in the dataset

### !!!FOR AGE GROUP MERGE ON AGE

In [26]:
candidates_bestsellers = pd.merge(
    unique_transactions,
    bestsellers_previous_week,
    on='week',
)

make a set of users that would buy in the test week

In [27]:
test_set_transactions = unique_transactions.drop_duplicates('customer_id').reset_index(drop=True)
test_set_transactions.week = test_week

In [28]:
test_set_transactions

Unnamed: 0,t_dat,customer_id,sales_channel_id,week
0,2020-07-08,857913002275398,1,104
1,2020-07-08,1658289241058394,1,104
2,2020-07-08,3828854365940846,1,104
3,2020-07-08,4195624216542755,1,104
4,2020-07-08,4233235614030232,2,104
...,...,...,...,...
439363,2020-09-15,18431808737044686839,1,104
439364,2020-09-15,18436707407200418746,2,104
439365,2020-09-15,18439897732908966680,2,104
439366,2020-09-15,18446250046654386343,1,104


all the bestsellers based on the weeks for the test week

In [29]:
candidates_bestsellers_test_week = pd.merge(
    test_set_transactions,
    bestsellers_previous_week,
    on='week'
)

combine both lists of bestsellers

In [30]:
candidates_bestsellers = pd.concat([candidates_bestsellers, candidates_bestsellers_test_week])
candidates_bestsellers.drop(columns='bestseller_rank', inplace=True)

In [31]:
candidates_bestsellers

Unnamed: 0,t_dat,customer_id,sales_channel_id,week,article_id,price
0,2020-07-15,272412481300040,1,95,806388001,0.013301
1,2020-07-15,272412481300040,1,95,730683021,0.025643
2,2020-07-15,272412481300040,1,95,610776002,0.008303
3,2020-07-15,272412481300040,1,95,805308002,0.013609
4,2020-07-15,272412481300040,1,95,866383006,0.024971
...,...,...,...,...,...,...
5272411,2020-09-15,18446630855572834764,2,104,918292001,0.041424
5272412,2020-09-15,18446630855572834764,2,104,762846027,0.025104
5272413,2020-09-15,18446630855572834764,2,104,809238005,0.041656
5272414,2020-09-15,18446630855572834764,2,104,673677002,0.024925


# Making a Recall evaluation function
For the recall evaluation we have several requirements that have to be met:
* We need to compare two sets of items, the first being the items that are in fact bought, the other set of items is the set of items that are candidates for the predictions
* Only compare recall -> amount of positives (candidates that are in fact bought) / total positives (amount of items bought)

!! If we would be to generate more candidates, this would automatically grow as the chance we include one is bigger

## My own first attempt
This part is my own function, created to calculate the recall values. This turned out to be so slow that even subsets took very long computation times. Therefore I asked Noah to help out.

In [32]:
# function that returns the recall value 
# It returns None if the predicted week is not in the purchases set
def recall (purchases, candidates):
    # check if the test_week is the same as the most recent week in the predictions
    if not candidates.week.max() == test_week:
        print("There is something wrong with the predictions")
    
    # check if the predicted week is in fact in the dataset
    if test_week > absolute_max_week:
        return None
    
    # Take the data in question
    test_week_purchases = purchases[purchases.week == test_week]
    test_week_candidates = candidates[candidates.week == test_week]
    
    # take all the users
    users = test_week_purchases.customer_id.drop_duplicates()
    
    # start the total recall value which will be averaged out at the end
    total_recall = 0
    
    # loop over all the users in this week
    for user in users:
        # get the corresponding purchases and candidates
        user_purchases = test_week_purchases[test_week_purchases.customer_id == user]
        user_candidates = test_week_candidates[test_week_candidates.customer_id == user]
        
        # get the article_ids in a Series
        user_purchased_articles = user_purchases.article_id.drop_duplicates()
        user_candidate_articles = user_candidates.article_id.drop_duplicates()
        
        # get the total positives
        total_positives = user_purchased_articles.count()
        
        # get the true positives
        intersection = user_candidate_articles[user_candidate_articles.isin(user_purchased_articles)]
        true_positives = intersection.count()
        
        # calculate the recall value for this user and add it to the total
        recall_value = true_positives / total_positives
        total_recall += recall_value
    
    # take the average of the recalls and return it
    total_recall = total_recall / users.count()
    return total_recall
        

The next cell contains a function I created based on the tips and guidance Noah provided. It merges the two tables of candidates and purchases. By doing this only the ones present in both are kept. So taking the count of the purchases we have the two only values needed.

In [33]:
# return the average recall of generated candidates versus the actual bought items
def average_recall(purchases, candidates):
    joined = pd.merge(purchases, candidates, how='inner').drop_duplicates()
    true_positives = joined.groupby('customer_id').count()
    total_positives = purchases.groupby('customer_id').count()
    recall = true_positives.divide(total_positives, fill_value=0)
    return recall.mean().values[0]

This then checks the recall of the candidates generated by the bestsellers

In [34]:
if not test_week > absolute_max_week:
    purchases = total_transactions[total_transactions.week == test_week][['customer_id', 'article_id']].drop_duplicates()
    candidates = candidates_bestsellers[candidates_bestsellers.week == test_week][['customer_id', 'article_id']].drop_duplicates()
    print(average_recall(purchases, candidates))

0.01692710508568287


In [35]:
# test = candidates_bestsellers[candidates_bestsellers.week == test_week].customer_id.drop_duplicates()
# print(test.count())

In [36]:
# test2 = total_transactions[total_transactions.week == test_week].customer_id.drop_duplicates()
# print(test2.count())

# Making a new group of candidates based on age group
First make a new column and then apply the same process 

In [37]:
# define age groups
def get_age_group(age):
    if age < 18:
        return 'Under 18'
    elif age >= 18 and age < 25:
        return '18-24'
    elif age >= 25 and age < 35:
        return '25-34'
    elif age >= 35 and age < 45:
        return '35-44'
    elif age >= 45 and age < 55:
        return '45-54'
    elif age >= 55 and age < 65:
        return '55-64'
    else:
        return '65+'

Created a function to apply on the ages in the table. Then created a new column containing this. 
For now the age groups are strings, this means they cannot be used as features in the ranker model. 

In [38]:
#apply the age groups on transactions
customers["age_group"] = customers["age"].apply(get_age_group)

## Now make a similar popularity calculation with respect to the age_group popularity 
(how popular is the item in a certain age_group that week)
Then calculate similarly and evaluate
The difference with normal popularity lies in the grouping by week AND age group

In [39]:
# firstly take the age_groups and the cutomer ids
age_groups_customers = customers[['customer_id', 'age_group']].drop_duplicates()

# now join them into the transactions to create a new transactions set to work with
age_group_transactions = pd.merge(transactions, age_groups_customers)
# now the age_group is included, we will have to change some values and names to ensure this is used

In [40]:
# Group the mean_price not per week/article but by week/article/age_group
# this is so we know
mean_price_age_group = age_group_transactions \
    .groupby(['week', 'age_group', 'article_id'])['price'].mean()

# group the sales by week AND the age group and so find the most popular article for each age group in each week
sales_age_group = age_group_transactions \
    .groupby(['week', 'age_group'])['article_id'].value_counts() \
    .groupby(['week', 'age_group']).rank(method='dense', ascending=False) \
    .groupby(['week', 'age_group']).head(12).rename('bestseller_rank').astype('int8')

In [41]:
mean_price_age_group

week  age_group  article_id
94    18-24      108775044     0.008458
                 111565001     0.006789
                 111586001     0.012630
                 111593001     0.012040
                 111609001     0.011610
                                 ...   
103   Under 18   942733001     0.011847
                 944506001     0.042356
                 944989001     0.023373
                 949551001     0.030492
                 953450001     0.016932
Name: price, Length: 570982, dtype: float32

In [42]:
sales_age_group

week  age_group  article_id
94    18-24      806388001     1
                 866383006     2
                 894140004     3
                 850244003     4
                 733749001     5
                              ..
103   Under 18   685813043     5
                 685814063     5
                 706016001     5
                 762143010     5
                 767423001     5
Name: bestseller_rank, Length: 840, dtype: int8

In [43]:
# now calculate the bestsellers for these week - age_group combos
bestsellers_previous_week_age_group = pd.merge(sales_age_group, mean_price_age_group, on=['week', 'age_group', 'article_id']).reset_index()
bestsellers_previous_week_age_group.week += 1

In [44]:
bestsellers_previous_week_age_group.pipe(lambda df: df[(df['week']==96) & (df['age_group']=='18-24')])

Unnamed: 0,week,age_group,article_id,bestseller_rank,price
84,96,18-24,760084003,1,0.02518
85,96,18-24,706016001,2,0.033148
86,96,18-24,759871002,3,0.00615
87,96,18-24,880099001,4,0.016395
88,96,18-24,372860002,5,0.013193
89,96,18-24,600886001,6,0.022921
90,96,18-24,547780003,7,0.024935
91,96,18-24,866731001,8,0.024758
92,96,18-24,895002002,9,0.013611
93,96,18-24,733749001,10,0.004966


In [45]:
unique_age_group_transactions = age_group_transactions \
    .groupby(['week', 'customer_id']) \
    .head(1) \
    .drop(columns=['article_id', 'price']) \
    .copy()

In [46]:
age_group_candidates_bestsellers = pd.merge(
    unique_age_group_transactions,
    bestsellers_previous_week_age_group,
    on=['week', 'age_group'],
)

In [47]:
test_set_age_group_transactions = unique_age_group_transactions.drop_duplicates('customer_id').reset_index(drop=True)
test_set_age_group_transactions.week = test_week

In [48]:
age_group_candidates_bestsellers_test_week = pd.merge(
    test_set_age_group_transactions,
    bestsellers_previous_week_age_group,
    on=['week', 'age_group'],
)

In [49]:
age_group_candidates_bestsellers = pd.concat([age_group_candidates_bestsellers, age_group_candidates_bestsellers_test_week])
age_group_candidates_bestsellers.drop(columns='bestseller_rank', inplace=True)

In [50]:
age_group_candidates_bestsellers

Unnamed: 0,t_dat,customer_id,sales_channel_id,week,age_group,article_id,price
0,2020-07-17,857913002275398,1,95,18-24,806388001,0.013250
1,2020-07-17,857913002275398,1,95,18-24,866383006,0.024987
2,2020-07-17,857913002275398,1,95,18-24,894140004,0.021638
3,2020-07-17,857913002275398,1,95,18-24,850244003,0.004267
4,2020-07-17,857913002275398,1,95,18-24,733749001,0.004952
...,...,...,...,...,...,...,...
5272411,2020-09-15,18275989151869979916,1,104,Under 18,685813043,0.016297
5272412,2020-09-15,18275989151869979916,1,104,Under 18,685814063,0.032186
5272413,2020-09-15,18275989151869979916,1,104,Under 18,706016001,0.032189
5272414,2020-09-15,18275989151869979916,1,104,Under 18,762143010,0.013220


# Check The Recall

In [51]:
if not test_week > absolute_max_week:
    purchases = total_transactions[total_transactions.week == test_week][['customer_id', 'article_id']].drop_duplicates()
    candidates = age_group_candidates_bestsellers[candidates_bestsellers.week == test_week][['customer_id', 'article_id']].drop_duplicates()
    print(average_recall(purchases, candidates))

0.018756583007294242


1. # My code
First a bit of encoding. We will change the missing age value to the median. 

In [52]:
# from sklearn.preprocessing import StandardScaler
# scaler = StandardScaler()
# scaler.fit_transform(train['age'])

ages_copy = customers[customers.age != -1].copy()
median_age = ages_copy['age'].median()
median_age

32.0

In [53]:
customers.loc[customers['age'] == -1] = median_age 
customers['age'].min()

16

## Most popular item types

In [54]:
# transactions.loc[:'year'] = transactions['t_dat'].dt.year
# transactions['year'].value_counts()

## Find the most bought color for a user
We want to know the favourite color of the user based on the items bought.

In [55]:
articles_and_transactions = pd.merge(articles, transactions, on="article_id")
pop_colors = articles_and_transactions.groupby('customer_id')['colour_group_code'].value_counts().groupby('customer_id').rank(method='dense', ascending=False) \
    .groupby('customer_id').head(12).rename('favourite_color').astype('int8')

In [56]:
pop_colors

customer_id           colour_group_code
28847241659200        9                    1
                      10                   2
                      52                   2
41318098387474        73                   1
116809474287335       9                    1
                                          ..
18446624797007271432  51                   1
18446630855572834764  42                   1
                      33                   2
18446662237889060501  9                    1
18446705133201055310  9                    1
Name: favourite_color, Length: 1545671, dtype: int8

# Combining transactions and candidates / negative examples

In [57]:
transactions['purchased'] = 1

In [58]:
data = pd.concat([transactions, candidates_last_purchase, candidates_bestsellers])
data.purchased.fillna(0, inplace=True)

In [59]:
data

Unnamed: 0,t_dat,customer_id,article_id,price,sales_channel_id,week,purchased
28777300,2020-07-08,857913002275398,599580068,0.008458,1,94,1.0
28777301,2020-07-08,857913002275398,776237011,0.025407,1,94,1.0
28777302,2020-07-08,857913002275398,844294001,0.011847,1,94,1.0
28787123,2020-07-08,1658289241058394,877773001,0.007610,1,94,1.0
28788562,2020-07-08,3828854365940846,507883009,0.013542,1,94,1.0
...,...,...,...,...,...,...,...
5272411,2020-09-15,18446630855572834764,918292001,0.041424,2,104,0.0
5272412,2020-09-15,18446630855572834764,762846027,0.025104,2,104,0.0
5272413,2020-09-15,18446630855572834764,809238005,0.041656,2,104,0.0
5272414,2020-09-15,18446630855572834764,673677002,0.024925,2,104,0.0


In [60]:
data.drop_duplicates(['customer_id', 'article_id', 'week'], inplace=True)

In [61]:
data.purchased.mean()

0.1369278646522875

### Add bestseller information

In [62]:
data = pd.merge(
    data,
    bestsellers_previous_week[['week', 'article_id', 'bestseller_rank']],
    on=['week', 'article_id'],
    how='left'
)

In [63]:
data = data[data.week != data.week.min()]
data.bestseller_rank.fillna(999, inplace=True)

In [64]:
data = pd.merge(data, articles, on='article_id', how='left')
data = pd.merge(data, customers, on='customer_id', how='left')

In [65]:
data.sort_values(['week', 'customer_id'], inplace=True)
data.reset_index(drop=True, inplace=True)

# my code

In [66]:
# make a copy of the date column
# we use the date column to replace it with the days  so we still have the date, but now have a day column
data["day"] = data.loc[:,"t_dat"]
data.loc[:, "day"] = data["day"].dt.day
data["day"].value_counts()

12    800948
15    775155
10    680497
11    668513
5     661071
8     659042
13    656858
29    640582
9     632048
4     614292
22    598148
30    597581
27    585844
28    585662
24    583192
6     573094
3     572728
14    561459
20    557972
25    556323
26    553036
23    548998
21    538875
17    534922
7     514924
1     513802
18    508906
16    503085
31    493735
2     480370
19    464138
Name: day, dtype: int64

In [67]:
data["t_dat"].value_counts()

2020-08-12    427360
2020-07-15    418490
2020-08-05    377314
2020-07-30    373500
2020-07-29    367905
               ...  
2020-07-13    153466
2020-09-15    150876
2020-07-14    145520
2020-08-16    142685
2020-07-12    111472
Name: t_dat, Length: 70, dtype: int64

In [68]:
data.loc[:, "day_sin"] = np.sin(2 * np.pi * data["day"]/365)
data["day_sin"]

0           0.304921
1           0.304921
2           0.238673
3           0.238673
4           0.238673
              ...   
18215795    0.103102
18215796    0.103102
18215797    0.103102
18215798    0.103102
18215799    0.103102
Name: day_sin, Length: 18215800, dtype: float64

In [69]:
data.loc[:, "day_cos"] = np.cos(2 * np.pi * data["day"]/365)
data["day_sin"]

0           0.304921
1           0.304921
2           0.238673
3           0.238673
4           0.238673
              ...   
18215795    0.103102
18215796    0.103102
18215797    0.103102
18215798    0.103102
18215799    0.103102
Name: day_sin, Length: 18215800, dtype: float64

In [70]:
train = data[data.week != test_week]
test = data[data.week==test_week].drop_duplicates(['customer_id', 'article_id', 'sales_channel_id']).copy()

In [71]:
train_baskets = train.groupby(['week', 'customer_id'])['article_id'].count().values

In [72]:
train['age'].min()

16.0

In [73]:
columns_to_use = ['article_id', 'product_type_no', 'graphical_appearance_no', 'colour_group_code', 'perceived_colour_value_id',
'perceived_colour_master_id', 'department_no', 'index_code',
'index_group_no', 'section_no', 'garment_group_no', 'FN', 'Active', 'day_sin', 'day_cos',
'club_member_status', 'fashion_news_frequency', 'age', 'postal_code', 'bestseller_rank']

In [74]:
%%time

train_X = train[columns_to_use]
train_y = train['purchased']

test_X = test[columns_to_use]

CPU times: user 266 ms, sys: 175 ms, total: 442 ms
Wall time: 440 ms


# Model training

In [75]:
from lightgbm.sklearn import LGBMRanker

In [76]:
ranker = LGBMRanker(
    objective="lambdarank",
    metric="ndcg",
    boosting_type="dart",
    n_estimators=1,
    importance_type='gain',
    verbose=10
)

In [77]:
%%time

ranker = ranker.fit(
    train_X,
    train_y,
    group=train_baskets,
)

[LightGBM] [Debug] Dataset::GetMultiBinFromSparseFeatures: sparse rate 0.843325
[LightGBM] [Debug] Dataset::GetMultiBinFromAllFeatures: sparse rate 0.135675
[LightGBM] [Debug] init for col-wise cost 0.195535 seconds, init for row-wise cost 0.900390 seconds
You can set `force_col_wise=true` to remove the overhead.
[LightGBM] [Info] Total Bins 1217
[LightGBM] [Info] Number of data points in the train set: 11557594, number of used features: 20
[LightGBM] [Debug] Trained a tree with leaves = 31 and depth = 11
CPU times: user 23 s, sys: 741 ms, total: 23.8 s
Wall time: 7.66 s


In [78]:
for i in ranker.feature_importances_.argsort()[::-1]:
    print(columns_to_use[i], ranker.feature_importances_[i]/ranker.feature_importances_.sum())

bestseller_rank 0.998576263874084
day_sin 0.00048019114217671564
article_id 0.0004462109884813076
age 0.00016144988300781788
garment_group_no 9.808106205412895e-05
product_type_no 8.342173044933447e-05
club_member_status 4.696334666489633e-05
postal_code 4.425691724660244e-05
colour_group_code 3.229196203482674e-05
department_no 3.086909380029724e-05
fashion_news_frequency 0.0
perceived_colour_master_id 0.0
graphical_appearance_no 0.0
perceived_colour_value_id 0.0
index_group_no 0.0
index_code 0.0
FN 0.0
Active 0.0
day_cos 0.0
section_no 0.0


# Calculate predictions

In [79]:
%time

test['preds'] = ranker.predict(test_X)

c_id2predicted_article_ids = test \
    .sort_values(['customer_id', 'preds'], ascending=False) \
    .groupby('customer_id')['article_id'].apply(list).to_dict()

bestsellers_last_week = \
    bestsellers_previous_week[bestsellers_previous_week.week == bestsellers_previous_week.week.max()]['article_id'].tolist()

CPU times: user 3 µs, sys: 0 ns, total: 3 µs
Wall time: 7.87 µs


# Create submission

In [80]:
sub = pd.read_csv('/kaggle/input/h-and-m-personalized-fashion-recommendations/sample_submission.csv')

In [81]:
%%time
preds = []
for c_id in customer_hex_id_to_int(sub.customer_id):
    pred = c_id2predicted_article_ids.get(c_id, [])
    pred = pred + bestsellers_last_week
    preds.append(pred[:12])

CPU times: user 5.59 s, sys: 287 ms, total: 5.88 s
Wall time: 5.88 s


In [82]:
preds = [' '.join(['0' + str(p) for p in ps]) for ps in preds]
sub.prediction = preds

In [83]:
sub_name = 'added_2_features_model_submission'
sub.to_csv(f'{sub_name}.csv.gz', index=False)