Radek posted about this [here](https://www.kaggle.com/competitions/h-and-m-personalized-fashion-recommendations/discussion/309220), and linked to a GitHub repo with the code.

I just transferred that code here to Kaggle notebooks, that's all.

In [95]:
import numpy as np
import tfr


def apk(actual, predicted, k=10):
    """
    Computes the average precision at k.

    Only the first ``k`` predictions are scored. A prediction counts as a
    hit when it occurs in ``actual`` and has not already appeared earlier
    in the (truncated) prediction list.

    Parameters
    ----------
    actual : list
             The elements that should have been predicted (order is ignored)
    predicted : list
                The predicted elements, best first (order matters)
    k : int, optional
        The maximum number of predicted elements to score

    Returns
    -------
    score : double
            The average precision at k; 0.0 when ``actual`` is empty

    """
    top_k = predicted[:k] if len(predicted) > k else predicted

    precision_sum = 0.0
    hits = 0.0

    for rank, item in enumerate(top_k, start=1):
        # Misses and repeated predictions contribute nothing.
        if item not in actual or item in top_k[:rank - 1]:
            continue
        hits += 1.0
        precision_sum += hits / rank

    if not actual:
        return 0.0

    return precision_sum / min(len(actual), k)

def mapk(actual, predicted, k=10):
    """
    Computes the mean average precision at k.

    The two inputs are paired element by element; each pair is scored with
    ``apk`` and the scores are averaged.

    Parameters
    ----------
    actual : list
             A list of lists of elements that are to be predicted
             (order doesn't matter in the lists)
    predicted : list
                A list of lists of predicted elements
                (order matters in the lists)
    k : int, optional
        The maximum number of predicted elements

    Returns
    -------
    score : double
            The mean average precision at k over the input lists

    """
    per_query_scores = []
    for truth, ranking in zip(actual, predicted):
        per_query_scores.append(apk(truth, ranking, k))
    return np.mean(per_query_scores)

In [20]:
from sklearn.base import BaseEstimator, TransformerMixin
import numpy as np

# https://www.kaggle.com/c/h-and-m-personalized-fashion-recommendations/discussion/308635
def customer_hex_id_to_int(series):
    """Convert a Series of hexadecimal customer-id strings to integers.

    Only the last 16 characters of every id are used.
    """
    last_16_chars = series.str[-16:]
    return last_16_chars.apply(hex_id_to_int)

def hex_id_to_int(str):
    """Parse the last 16 characters of a hexadecimal string as an integer."""
    # NOTE: the parameter name shadows the builtin; kept for compatibility.
    tail = str[-16:]
    return int(tail, base=16)

def article_id_str_to_int(series):
    """Cast a Series of article ids to 32-bit integers."""
    as_int32 = series.astype('int32')
    return as_int32

def article_id_int_to_str(series):
    """Render integer article ids as strings with a single '0' prepended."""
    digits = series.astype('str')
    return '0' + digits

class Categorize(BaseEstimator, TransformerMixin):
    """Encode every column of a DataFrame as integer category codes.

    ``fit`` records, per column position, the values that occur more than
    ``min_examples`` times. ``transform`` maps each column onto the codes of
    those recorded values; values that were not recorded get code -1.

    Parameters
    ----------
    min_examples : int, optional
        A value is kept as a category only if its count is strictly greater
        than this threshold.
    """

    def __init__(self, min_examples=0):
        self.min_examples = min_examples
        # Filled by ``fit``: one list of kept values per column position.
        self.categories = []

    def fit(self, X, y=None):
        """Learn the kept values of every column of ``X``.

        ``y`` is accepted (and ignored) so the transformer can be used where
        a target is passed along, e.g. ``fit_transform(X, y)``.
        """
        # Build a fresh list so that fitting again replaces the previous
        # categories instead of appending after them.
        categories = []
        for i in range(X.shape[1]):
            vc = X.iloc[:, i].value_counts()
            categories.append(vc[vc > self.min_examples].index.tolist())
        self.categories = categories
        return self

    def transform(self, X):
        """Return a new DataFrame of category codes with ``X``'s column names.

        Columns are matched to the fitted categories by position. The result
        is built from plain code arrays, so it does not carry ``X``'s index.
        """
        data = {
            X.columns[i]: pd.Categorical(X.iloc[:, i], categories=self.categories[i]).codes
            for i in range(X.shape[1])
        }
        return pd.DataFrame(data=data)


def calculate_apk(list_of_preds, list_of_gts, k=15):
    """Mean average precision at k over paired prediction / ground-truth lists.

    Parameters
    ----------
    list_of_preds : iterable of list
        Ranked predictions, one list per customer.
    list_of_gts : iterable of list
        Ground-truth items, one list per customer, in the same order as
        ``list_of_preds``.
    k : int, optional
        Cut-off passed to ``apk``. Defaults to 15, the value this function
        used before the cut-off became a parameter; ``eval_sub`` scores at 12.

    Returns
    -------
    score : double
        Mean of the per-customer ``apk`` scores (``nan`` for empty input,
        as returned by ``np.mean``).
    """
    # for fast validation this can be changed to operate on dicts of {'cust_id_int': [art_id_int, ...]}
    # using 'data/val_week_purchases_by_cust.pkl'
    apks = [apk(gt, preds, k=k) for preds, gt in zip(list_of_preds, list_of_gts)]
    return np.mean(apks)

def eval_sub(sub_csv, skip_cust_with_no_purchases=True):
    """Score a submission CSV against the stored validation ground truth.

    Rows of the submission and of 'data/validation_ground_truth.parquet' are
    paired by position; both ``prediction`` columns are split on whitespace
    and each pair is scored with ``apk`` at k=12.

    Parameters
    ----------
    sub_csv : str
        Path of the submission CSV file.
    skip_cust_with_no_purchases : bool, optional
        When true, rows whose ground truth splits to an empty list are left
        out of the average.

    Returns
    -------
    score : double
        Mean of the per-row average precision scores.
    """
    submission = pd.read_csv(sub_csv)
    ground_truth = pd.read_parquet('data/validation_ground_truth.parquet')

    predicted_lists = submission.prediction.str.split()
    actual_lists = ground_truth.prediction.str.split()

    scores = [
        apk(gt, pred, k=12)
        for pred, gt in zip(predicted_lists, actual_lists)
        if not (skip_cust_with_no_purchases and (gt == []))
    ]
    return np.mean(scores)

In [21]:
import pandas as pd

In [22]:
%%time

# Load the transactions, customers and articles tables from parquet files.
# NOTE(review): the paths use Windows-style separators and are relative to
# the notebook's working directory.
transactions = pd.read_parquet('..\\..\\Input\\Dataset\\transactions_train.parquet')
customers = pd.read_parquet('..\\..\\Input\\Dataset\\customers.parquet')
articles = pd.read_parquet('..\\..\\Input\\Dataset\\articles.parquet')

# Alternative: load down-sampled copies of the same tables instead.
# sample = 0.05
# transactions = pd.read_parquet(f'data/transactions_train_sample_{sample}.parquet')
# customers = pd.read_parquet(f'data/customers_sample_{sample}.parquet')
# articles = pd.read_parquet(f'data/articles_train_sample_{sample}.parquet')

CPU times: total: 3.38 s
Wall time: 5.98 s


In [23]:
# The week to predict is the one right after the last week in the data.
last_observed_week = transactions.week.max()
test_week = last_observed_week + 1

# Keep only the ten most recent weeks of transactions.
transactions = transactions[transactions.week > last_observed_week - 10]

# Generating candidates

### Last purchase candidates

In [24]:
%%time

# For every customer: the distinct weeks (within the retained history) in
# which that customer has at least one transaction.
c2weeks = transactions.groupby('customer_id')['week'].unique()

CPU times: total: 9.73 s
Wall time: 18.2 s


In [25]:
transactions.groupby('week')['t_dat'].agg(['min', 'max'])

Unnamed: 0_level_0,min,max
week,Unnamed: 1_level_1,Unnamed: 2_level_1
95,2020-07-15,2020-07-21
96,2020-07-22,2020-07-28
97,2020-07-29,2020-08-04
98,2020-08-05,2020-08-11
99,2020-08-12,2020-08-18
100,2020-08-19,2020-08-25
101,2020-08-26,2020-09-01
102,2020-09-02,2020-09-08
103,2020-09-09,2020-09-15
104,2020-09-16,2020-09-22


In [26]:
c2weeks

customer_id
28847241659200          [95, 96, 101, 102]
41318098387474                        [98]
116809474287335                 [101, 103]
200292573348128          [95, 96, 99, 102]
248294615847351                       [96]
                               ...        
18446624797007271432                  [95]
18446630855572834764                 [103]
18446662237889060501                 [100]
18446705133201055310                 [102]
18446737527580148316                 [104]
Name: week, Length: 437365, dtype: object

In [27]:
%%time

# customer -> {purchase week -> that customer's next purchase week}.
# The last purchase week of every customer is mapped to the test week.
c2weeks2shifted_weeks = {}

for c_id, weeks in c2weeks.items():
    next_week_of = dict(zip(weeks[:-1], weeks[1:]))
    next_week_of[weeks[-1]] = test_week
    c2weeks2shifted_weeks[c_id] = next_week_of

CPU times: total: 266 ms
Wall time: 632 ms


In [28]:
# Example lookup for one customer: each purchase week points at the next
# week in which the customer bought; the last one points at test_week.
c2weeks2shifted_weeks[28847241659200]

{95: 96, 96: 101, 101: 102, 102: 105}

In [29]:
# Start the "last purchase" candidates from an independent copy of the
# transactions, so changing its columns leaves `transactions` untouched.
candidates_last_purchase = transactions.copy()

In [30]:
%%time

# For every transaction row, look up the week in which the same customer
# bought next, and use it as the week of the candidate row.
weeks = [
    c2weeks2shifted_weeks[c_id][week]
    for c_id, week in zip(transactions['customer_id'], transactions['week'])
]

candidates_last_purchase.week = weeks

CPU times: total: 13.8 s
Wall time: 21.9 s


In [31]:
candidates_last_purchase[candidates_last_purchase['customer_id']==272412481300040]

Unnamed: 0,t_dat,customer_id,article_id,price,sales_channel_id,week
29030503,2020-07-15,272412481300040,778064028,0.008458,1,96
29030504,2020-07-15,272412481300040,816592008,0.016932,1,96
29030505,2020-07-15,272412481300040,621381021,0.033881,1,96
29030506,2020-07-15,272412481300040,817477003,0.025407,1,96
29030507,2020-07-15,272412481300040,899088002,0.025407,1,96
29319533,2020-07-22,272412481300040,885077001,0.008458,1,103
29410772,2020-07-24,272412481300040,850176003,0.029034,2,103
29410773,2020-07-24,272412481300040,875803001,0.064559,2,103
29410774,2020-07-24,272412481300040,892970003,0.020966,2,103
29410775,2020-07-24,272412481300040,854619003,0.020966,2,103


In [32]:
# transactions[transactions['customer_id']==272412481300040]
customers[customers['customer_id']==272412481300040]

Unnamed: 0,customer_id,FN,Active,club_member_status,fashion_news_frequency,age,postal_code
137308,272412481300040,1,1,0,1,48,333369


### Bestsellers candidates

In [33]:
# Average transaction price of every article within every week.
mean_price = (
    transactions
    .groupby(['week', 'article_id'])['price']
    .mean()
)

In [34]:
mean_price

week  article_id
95    108775015     0.004729
      108775044     0.008458
      110065001     0.006085
      110065002     0.006085
      111565001     0.004288
                      ...   
104   952267001     0.013732
      952938001     0.048651
      953450001     0.016932
      953763001     0.021885
      956217002     0.059068
Name: price, Length: 196880, dtype: float32

In [35]:
# Per week: count the sales of every article, dense-rank the counts
# (highest count gets rank 1) and keep the first 12 rows of each week.
weekly_article_counts = transactions.groupby('week')['article_id'].value_counts()
weekly_ranks = weekly_article_counts.groupby('week').rank(method='dense', ascending=False)
sales = weekly_ranks.groupby('week').head(12).rename('bestseller_rank').astype('int8')

In [36]:
# Rank articles by how recently they first show up in the retained history:
# take each article's first week of sale, then dense-rank those weeks so the
# latest first-sale week gets rank 1.
#   SQL sketch:
#     select article_id,
#            dense_rank() over (order by min(week) desc) as recency_rank
#     from transactions
#     group by article_id
# The weeks are ranked in their original dtype; rank() returns floats either
# way, and no narrow integer cast limits the week numbers that can be handled.
recency = (
    transactions
    .groupby('article_id')['week'].min()
    .sort_values(ascending=False)
    .rename('recency_rank')
    .rank(method='dense', ascending=False)
)
    

recency



article_id
746586002     1.0
703366001     1.0
889974002     1.0
889901002     1.0
889844001     1.0
             ... 
792517001    10.0
792515001    10.0
792507001    10.0
792490002    10.0
805370003    10.0
Name: recency_rank, Length: 38331, dtype: float64

In [37]:
# Per-article price statistics (mean, min, max, std) over the retained
# history, collected into one table indexed by article_id.
price_by_article = transactions.groupby(['article_id'])['price']
mean_price_by_article = price_by_article.mean().rename('mean_price')
min_price_by_article = price_by_article.min().rename('min_price')
max_price_by_article = price_by_article.max().rename('max_price')
std_price_by_article = price_by_article.std().rename('std_price')

# merge the four statistics on article_id, one after the other
price_statistics = pd.merge(mean_price_by_article, min_price_by_article, on=['article_id'])
for extra_statistic in (max_price_by_article, std_price_by_article):
    price_statistics = price_statistics.merge(extra_statistic, on=['article_id'])
price_statistics



Unnamed: 0_level_0,mean_price,min_price,max_price,std_price
article_id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
108775015,0.003842,0.002068,0.005068,0.001573
108775044,0.008076,0.003373,0.008458,0.001120
110065001,0.006424,0.006085,0.006763,0.000479
110065002,0.005860,0.005186,0.006085,0.000449
110065011,0.009305,0.006763,0.011847,0.003595
...,...,...,...,...
952267001,0.014982,0.010153,0.016932,0.002385
952938001,0.048006,0.040661,0.050831,0.004300
953450001,0.016836,0.015305,0.016932,0.000395
953763001,0.021908,0.021169,0.022017,0.000251


In [38]:
sales

week  article_id
95    760084003      1
      866731001      2
      600886001      3
      706016001      4
      372860002      5
                    ..
104   915529003      8
      915529005      9
      448509014     10
      762846027     11
      714790020     12
Name: bestseller_rank, Length: 120, dtype: int8

In [39]:
sales.loc[95]

article_id
760084003     1
866731001     2
600886001     3
706016001     4
372860002     5
610776002     6
877278002     7
547780003     8
817354001     9
827968001    10
866731003    11
866383006    12
Name: bestseller_rank, dtype: int8

In [40]:
# Attach the weekly mean price to the weekly bestsellers, then move every
# row one week forward so week w carries the bestsellers of week w - 1.
bestsellers_with_price = pd.merge(sales, mean_price, on=['week', 'article_id'])
bestsellers_previous_week = bestsellers_with_price.reset_index()
bestsellers_previous_week['week'] += 1

In [41]:
bestsellers_previous_week.pipe(lambda df: df[df['week']==96])

Unnamed: 0,week,article_id,bestseller_rank,price
0,96,760084003,1,0.025094
1,96,866731001,2,0.024919
2,96,600886001,3,0.02298
3,96,706016001,4,0.033197
4,96,372860002,5,0.013193
5,96,610776002,6,0.008318
6,96,877278002,7,0.025036
7,96,547780003,8,0.024814
8,96,817354001,9,0.021913
9,96,827968001,10,0.016436


In [42]:
# One row per (week, customer): keep the first transaction of each pair and
# drop the article-specific columns.
first_row_per_visit = transactions.groupby(['week', 'customer_id']).head(1)
unique_transactions = first_row_per_visit.drop(columns=['article_id', 'price']).copy()

In [43]:
unique_transactions

Unnamed: 0,t_dat,customer_id,sales_channel_id,week
29030503,2020-07-15,272412481300040,1,95
29064059,2020-07-15,1456826891333599,1,95
29067103,2020-07-15,2133687643102426,2,95
29027487,2020-07-15,6010692573790711,1,95
29046403,2020-07-15,6171059100114610,2,95
...,...,...,...,...
31760188,2020-09-22,18435221511488011015,1,104
31782234,2020-09-22,18436859303155335645,1,104
31787251,2020-09-22,18437941771381362708,2,104
31776022,2020-09-22,18438270306572912089,1,104


In [44]:
transactions.drop_duplicates(['week', 'customer_id'])

Unnamed: 0,t_dat,customer_id,article_id,price,sales_channel_id,week
29030503,2020-07-15,272412481300040,778064028,0.008458,1,95
29064059,2020-07-15,1456826891333599,888294001,0.013542,1,95
29067103,2020-07-15,2133687643102426,843642001,0.042356,2,95
29027487,2020-07-15,6010692573790711,857812010,0.039661,1,95
29046403,2020-07-15,6171059100114610,815447007,0.006763,2,95
...,...,...,...,...,...,...
31760188,2020-09-22,18435221511488011015,573085055,0.033881,1,104
31782234,2020-09-22,18436859303155335645,801447001,0.030492,1,104
31787251,2020-09-22,18437941771381362708,907188001,0.050831,2,104
31776022,2020-09-22,18438270306572912089,751471043,0.033881,1,104


In [45]:
# Pair every (week, customer) row with all bestseller rows that carry the
# same week value (those are the previous week's bestsellers, because
# `bestsellers_previous_week.week` was shifted forward by one).
candidates_bestsellers = pd.merge(
    unique_transactions,
    bestsellers_previous_week,
    on='week',
)

In [46]:
# One row per customer (the first one found), re-labelled with the test week.
test_set_transactions = (
    unique_transactions
    .drop_duplicates('customer_id')
    .reset_index(drop=True)
)
test_set_transactions['week'] = test_week

In [47]:
test_set_transactions

Unnamed: 0,t_dat,customer_id,sales_channel_id,week
0,2020-07-15,272412481300040,1,105
1,2020-07-15,1456826891333599,1,105
2,2020-07-15,2133687643102426,2,105
3,2020-07-15,6010692573790711,1,105
4,2020-07-15,6171059100114610,2,105
...,...,...,...,...
437360,2020-09-22,18410229429441241008,2,105
437361,2020-09-22,18417769707947924979,2,105
437362,2020-09-22,18418054986721795659,2,105
437363,2020-09-22,18421175435799911749,2,105


In [48]:
# Bestseller candidates for the test week: every customer row (all labelled
# with test_week) is paired with the bestseller rows of that week value.
candidates_bestsellers_test_week = pd.merge(
    test_set_transactions,
    bestsellers_previous_week,
    on='week'
)

In [49]:
# Stack the historical and the test-week bestseller candidates and remove
# the rank column from the combined frame.
all_bestseller_candidates = pd.concat([candidates_bestsellers, candidates_bestsellers_test_week])
candidates_bestsellers = all_bestseller_candidates.drop(columns='bestseller_rank')

In [50]:
candidates_bestsellers

Unnamed: 0,t_dat,customer_id,sales_channel_id,week,article_id,price
0,2020-07-22,200292573348128,2,96,760084003,0.025094
1,2020-07-22,200292573348128,2,96,866731001,0.024919
2,2020-07-22,200292573348128,2,96,600886001,0.022980
3,2020-07-22,200292573348128,2,96,706016001,0.033197
4,2020-07-22,200292573348128,2,96,372860002,0.013193
...,...,...,...,...,...,...
5248375,2020-09-22,18438270306572912089,1,105,915529003,0.033439
5248376,2020-09-22,18438270306572912089,1,105,915529005,0.033417
5248377,2020-09-22,18438270306572912089,1,105,448509014,0.041630
5248378,2020-09-22,18438270306572912089,1,105,762846027,0.025005


# Combining transactions and candidates / negative examples

In [51]:
# Real transactions are the positive examples. The candidate frames were
# built before this column was added, so they do not have it.
transactions['purchased'] = 1

In [52]:
# Stack positives (transactions) and both candidate sets. Candidate rows have
# no `purchased` value after the concat, so they become the negatives (0).
data = pd.concat([transactions, candidates_last_purchase, candidates_bestsellers])
# Assign the filled column back instead of calling fillna(inplace=True) on
# `data.purchased`: the in-place call acts on a temporary column object and
# is not guaranteed to update `data` (it does not under Copy-on-Write).
data['purchased'] = data['purchased'].fillna(0)

In [63]:
data

Unnamed: 0,t_dat,customer_id,article_id,price,sales_channel_id,week,purchased,bestseller_rank,recency_rank,mean_price,...,garment_group_no,garment_group_name,detail_desc,has_color,FN,Active,club_member_status,fashion_news_frequency,age,postal_code
0,2020-07-26,28847241659200,887770001,0.016932,1,96,1.0,999.0,10.0,0.016389,...,1010,6,3692,True,1,1,0,1,21,57896
1,2020-07-18,28847241659200,762846001,0.025407,1,96,0.0,999.0,10.0,0.024408,...,1010,6,492,True,1,1,0,1,21,57896
2,2020-07-18,28847241659200,829308001,0.033881,1,96,0.0,999.0,10.0,0.033045,...,1005,0,9082,True,1,1,0,1,21,57896
3,2020-07-26,28847241659200,760084003,0.025094,1,96,0.0,1.0,10.0,0.024914,...,1009,5,847,True,1,1,0,1,21,57896
4,2020-07-26,28847241659200,866731001,0.024919,1,96,0.0,2.0,10.0,0.024726,...,1005,0,3130,True,1,1,0,1,21,57896
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
17991757,2020-09-21,18446737527580148316,915529003,0.033439,2,105,0.0,8.0,7.0,0.033351,...,1003,3,10909,True,1,1,0,1,60,96323
17991758,2020-09-21,18446737527580148316,915529005,0.033417,2,105,0.0,9.0,4.0,0.033352,...,1003,3,10909,True,1,1,0,1,60,96323
17991759,2020-09-21,18446737527580148316,448509014,0.041630,2,105,0.0,10.0,10.0,0.041287,...,1009,5,255,True,1,1,0,1,60,96323
17991760,2020-09-21,18446737527580148316,762846027,0.025005,2,105,0.0,11.0,4.0,0.025018,...,1010,6,492,True,1,1,0,1,60,96323


In [54]:
# Keep the first row of every (customer, article, week) combination.
data = data.drop_duplicates(subset=['customer_id', 'article_id', 'week'])

In [55]:
data.purchased.mean()

0.13607582749165664

### Add bestseller, recency and price information

In [56]:
# Left-join three feature sources onto every example:
#   1. bestseller_rank, matched on (week, article_id)
#   2. recency (recency_rank), matched on article_id
#   3. price_statistics (mean/min/max/std price), matched on article_id
# Left joins keep every example; rows without a match get NaN features.
data = pd.merge(
    data,
    bestsellers_previous_week[['week', 'article_id', 'bestseller_rank']],
    on=['week', 'article_id'],
    how='left'
).merge(
    recency,
    on='article_id',
    how='left'
).merge(
    price_statistics,
    on='article_id',
    how='left'
)


In [57]:

data.head(10)

Unnamed: 0,t_dat,customer_id,article_id,price,sales_channel_id,week,purchased,bestseller_rank,recency_rank,mean_price,min_price,max_price,std_price
0,2020-07-15,272412481300040,778064028,0.008458,1,95,1.0,,10.0,0.00562,0.001678,0.008458,0.002007
1,2020-07-15,272412481300040,816592008,0.016932,1,95,1.0,,10.0,0.016344,0.010153,0.016932,0.001248
2,2020-07-15,272412481300040,621381021,0.033881,1,95,1.0,,10.0,0.033032,0.027102,0.033881,0.001641
3,2020-07-15,272412481300040,817477003,0.025407,1,95,1.0,,10.0,0.024747,0.018627,0.025407,0.001481
4,2020-07-15,272412481300040,899088002,0.025407,1,95,1.0,,10.0,0.024468,0.012695,0.025407,0.001917
5,2020-07-15,1456826891333599,888294001,0.013542,1,95,1.0,,10.0,0.022126,0.006763,0.025407,0.005261
6,2020-07-15,1456826891333599,895002002,0.015237,1,95,1.0,,10.0,0.012476,0.003373,0.015237,0.002595
7,2020-07-15,2133687643102426,843642001,0.042356,2,95,1.0,,10.0,0.041549,0.029153,0.042356,0.002169
8,2020-07-15,6010692573790711,857812010,0.039661,1,95,1.0,,10.0,0.034969,0.013542,0.042356,0.007702
9,2020-07-15,6010692573790711,372860001,0.012678,1,95,1.0,,10.0,0.013197,0.003373,0.013542,0.000962


In [58]:
# Drop the earliest week: there is no previous week in the retained history
# to supply its bestseller features.
data = data[data.week != data.week.min()]
# Articles that were not among the previous week's bestsellers get the
# sentinel rank 999. The filled column is assigned back through assign():
# fillna(inplace=True) on `data.bestseller_rank` would act on a temporary
# column of a filtered frame and is not guaranteed to update `data`.
data = data.assign(bestseller_rank=data['bestseller_rank'].fillna(999))

In [59]:
# Attach the article attributes and then the customer attributes; left joins
# keep every example row even when no matching article/customer exists.
data = pd.merge(data, articles, on='article_id', how='left')
data = pd.merge(data, customers, on='customer_id', how='left')

In [60]:
# Order the examples by week and customer and renumber the rows from 0.
data = data.sort_values(['week', 'customer_id']).reset_index(drop=True)

In [64]:
# Everything before the test week is training data; the rows labelled with
# the test week are the candidates to be scored.
train = data[data.week != test_week]
# The test rows are de-duplicated per (customer, article, sales channel) and
# copied so that columns can be added to `test` later on.
test = data[data.week==test_week].drop_duplicates(['customer_id', 'article_id', 'sales_channel_id']).copy()

In [48]:
# Number of training rows in every (week, customer) group, as a plain array.
# NOTE(review): not referenced again in the visible part of the notebook -
# presumably a leftover from a ranker that needs group sizes; confirm.
train_baskets = train.groupby(['week', 'customer_id'])['article_id'].count().values

In [49]:
# Feature columns fed to the model: article attributes, customer attributes
# and the engineered bestseller_rank / recency_rank / std_price features.
columns_to_use = ['article_id', 'product_type_no', 'graphical_appearance_no', 'colour_group_code', 'perceived_colour_value_id',
'perceived_colour_master_id', 'department_no', 'index_code',
'index_group_no', 'section_no', 'garment_group_no', 'FN', 'Active', 'has_color',
'club_member_status', 'fashion_news_frequency', 'age', 'postal_code', 'bestseller_rank', 'recency_rank', 'std_price']

# Optional extra features, currently switched off:
# columns_to_use += ['mean_price', 'min_price', 'max_price']
# columns_to_use += ['sales_channel_id']

In [62]:
%%time
# Select the model inputs / target and persist them as parquet files so the
# training cells can be run without rebuilding the features.
train_X = train[columns_to_use]
print(train_X.columns)
train_y = train['purchased']

test_X = test[columns_to_use]
data_folder = '..\\..\\Input\\Dataset\\'
# export the dataframe test and train set for later use
train_X.to_parquet(data_folder + 'train_X.parquet')
# export series train_y for later use (converted to a one-column frame first)
train_y.to_frame().to_parquet(data_folder + 'train_y.parquet')
test_X.to_parquet(data_folder + 'test_X.parquet')

Index(['article_id', 'product_type_no', 'graphical_appearance_no',
       'colour_group_code', 'perceived_colour_value_id',
       'perceived_colour_master_id', 'department_no', 'index_code',
       'index_group_no', 'section_no', 'garment_group_no', 'FN', 'Active',
       'has_color', 'club_member_status', 'fashion_news_frequency', 'age',
       'postal_code', 'bestseller_rank', 'recency_rank', 'std_price'],
      dtype='object')
CPU times: total: 7.16 s
Wall time: 10.5 s


# Model training

In [15]:
import tensorflow as tf
from tensorflow.keras.layers import Input, Dense
from sklearn.preprocessing import MinMaxScaler
from keras_tuner.tuners import RandomSearch
from sklearn.impute import SimpleImputer
import pandas as pd
import numpy as np 

# Load train_X, train_y and test_X from the parquet files unless all three
# names already exist in the current session.
# NOTE(review): after reloading, train_y is a flat numpy array, while the
# feature-building cell leaves it as a pandas Series. Later cells also use
# `columns_to_use` and `test`, which this branch does not restore - confirm
# they are available when starting from the parquet files.
if 'train_X' not in locals() or 'train_y' not in locals() or 'test_X' not in locals():
    data_folder = '..\\..\\Input\\Dataset\\'
    train_X = pd.read_parquet(data_folder + 'train_X.parquet')
    train_y = pd.read_parquet(data_folder + 'train_y.parquet').values.ravel()
    test_X = pd.read_parquet(data_folder + 'test_X.parquet')


In [16]:
# Convert the features and the target to numpy arrays of dtype 'float32'.
# NOTE(review): train_X_tf / train_Y_tf are not referenced again in the
# visible notebook; the cells below work with train_X / train_y - confirm
# whether these arrays are still needed.
train_X_tf = np.asarray(train_X).astype('float32')
train_Y_tf = np.asarray(train_y).astype('float32')


In [17]:

# Pre-processing: fill missing values with the training mean of each column,
# then rescale every column with a min-max scaler. Both steps are fitted on
# the training data only and re-applied unchanged to the test data.

# Create an imputer
imputer = SimpleImputer(strategy='mean')

# Fit and transform the imputer on the training data
train_X_imputed = imputer.fit_transform(train_X)

# Transform the test data using the same imputer
test_X_imputed = imputer.transform(test_X)


# Create a scaler
scaler = MinMaxScaler()

# Fit and transform the training data
train_X_scaled = scaler.fit_transform(train_X_imputed)

# Transform the test data using the same scaler
test_X_scaled = scaler.transform(test_X_imputed)

In [None]:


# Define your neural network model for ranking
def create_ranking_model(input_dim):
    """Build (without compiling) a small feed-forward scoring network.

    The network takes ``input_dim`` features, passes them through one hidden
    layer of 64 ReLU units and outputs a single linear score.
    """
    features = Input(shape=(input_dim,))
    hidden = Dense(64, activation='relu')(features)
    score = Dense(1, activation='linear')(hidden)
    return tf.keras.Model(inputs=features, outputs=score)

# Create and compile the neural network model: one input per feature column,
# trained as a regression on the 0/1 `purchased` label (mean squared error).
ranking_model = create_ranking_model(input_dim=len(columns_to_use))
ranking_model.compile(loss='mean_squared_error', optimizer='adam')




# Train the ranking model on the scaled training features
ranking_model.fit(train_X_scaled, train_y, batch_size=1024*8, epochs=10, verbose=1)




In [54]:

# Hyperparameter tuning

# Define a function to create the ranking model using the Functional API
def build_model(hp):
    """Build and compile one candidate model for the hyperparameter search.

    Parameters
    ----------
    hp : keras_tuner HyperParameters
        Supplies ``units``, the width of the hidden layer (32 to 512 in
        steps of 32).

    Returns
    -------
    model : tf.keras.Model
        A compiled model (mean squared error, Adam) with one hidden layer
        and a single linear output.
    """
    input_layer = Input(shape=(len(columns_to_use),))
    # The hidden layer needs a non-linearity: two stacked linear layers are
    # equivalent to a single linear layer, which would make `units`
    # irrelevant. ReLU matches `create_ranking_model`.
    x = Dense(
        units=hp.Int('units', min_value=32, max_value=512, step=32),
        activation='relu',
    )(input_layer)
    output_layer = Dense(1)(x)
    model = tf.keras.Model(inputs=input_layer, outputs=output_layer)
    model.compile(loss='mean_squared_error', optimizer='adam')
    return model

# Hold out the last 10% of the training rows for validation. Scoring the
# trials on the very rows they were trained on turns `val_loss` into a
# training loss and cannot tell good configurations from over-fitted ones.
# NOTE(review): the rows are expected to be ordered by week, so the tail is
# the most recent data - confirm. Results already stored in 'my_tuner_dir'
# are reloaded by the tuner; clear that directory to re-run the search.
tuning_targets = np.asarray(train_y)
n_validation = max(1, int(len(train_X_scaled) * 0.1))

tuner = RandomSearch(build_model, objective='val_loss', max_trials=10, directory='my_tuner_dir', project_name='my_ranking_model')
tuner.search(
    train_X_scaled[:-n_validation],
    tuning_targets[:-n_validation],
    epochs=10,
    validation_data=(train_X_scaled[-n_validation:], tuning_targets[-n_validation:]),
    verbose=1,
    batch_size=1024*10,
)


Using TensorFlow backend
Reloading Tuner from my_tuner_dir\my_ranking_model\tuner0.json


In [55]:
# Rebuild a fresh model from the best hyperparameters found by the tuner,
# train it on the full training set and save it to disk.
best_hyperparameters = tuner.get_best_hyperparameters(num_trials=1)[0]
print(best_hyperparameters.values)
best_model = build_model(best_hyperparameters)
# build_model already compiles with the same loss and optimizer; this call
# simply repeats that configuration.
best_model.compile(loss='mean_squared_error', optimizer='adam')
best_model.fit(train_X_scaled, train_y, batch_size=1024*10, epochs=10, verbose=1)
best_model.save('best_ranking_model.keras')

{'units': 32}
Epoch 1/10
Epoch 2/10
Epoch 3/10
Epoch 4/10
Epoch 5/10
Epoch 6/10
Epoch 7/10
Epoch 8/10
Epoch 9/10
Epoch 10/10


  saving_api.save_model(


In [62]:
best_model.summary()

Model: "model_3"
_________________________________________________________________
 Layer (type)                Output Shape              Param #   
 input_4 (InputLayer)        [(None, 21)]              0         
                                                                 
 dense_6 (Dense)             (None, 32)                704       
                                                                 
 dense_7 (Dense)             (None, 1)                 33        
                                                                 
Total params: 737 (2.88 KB)
Trainable params: 737 (2.88 KB)
Non-trainable params: 0 (0.00 Byte)
_________________________________________________________________


# Calculate predictions

In [94]:
%time
# load the best model from the file the training cell saves it to
# ('best_ranking_model.keras'; the previous '.h5' name did not match)
best_model = tf.keras.models.load_model('best_ranking_model.keras')

# Predict using the trained ranking model. The model returns one column of
# scores with shape (n, 1); flatten it to one score per candidate row.
test['preds'] = best_model.predict(test_X_scaled, batch_size=1024*10, verbose=1).ravel()


CPU times: total: 0 ns
Wall time: 0 ns


In [128]:
# customer -> list of candidate article ids, highest predicted score first
# (rows are sorted by customer and score, both descending, before grouping).
c_id2predicted_article_ids = test \
    .sort_values(['customer_id', 'preds'], ascending=False) \
    .groupby('customer_id')['article_id'].apply(list).to_dict()

# Article ids of the bestseller rows carrying the largest week value.
bestsellers_last_week = \
    bestsellers_previous_week[bestsellers_previous_week.week == bestsellers_previous_week.week.max()]['article_id'].tolist()

In [76]:
# perform precision at k evaluation on the test set
# NOTE(review): `test` holds the candidate rows labelled with test_week
# (one week after the last observed transaction), not observed purchases.
# This dict therefore contains the same articles per customer as
# c_id2predicted_article_ids, so any score computed against it is trivially
# perfect. A held-out week with real purchases is needed for a meaningful
# evaluation - confirm.
test_week_purchases_by_cust = test.groupby('customer_id')['article_id'].apply(list).to_dict()

[685816059,
 919786002,
 806388012,
 924243002,
 762846027,
 918522001,
 915529005,
 924243001,
 923758001,
 909370001,
 915529003,
 866731001,
 448509014,
 751471001,
 714790020]

In [84]:

# view the first 10 customers and their purchases
test_week_purchases_by_cust[42984229297455520]
# a = calculate_apk([c_id2predicted_article_ids[c_id] for c_id in test_week_purchases_by_cust.keys()], test_week_purchases_by_cust.values())

[919786002,
 806388012,
 685816059,
 924243001,
 924243002,
 918522001,
 923758001,
 866731001,
 909370001,
 751471001,
 915529003,
 915529005,
 448509014,
 762846027,
 714790020]

In [None]:
c_id2predicted_article_ids[42984229297455520]

In [91]:
# find the number of items in the test set that are also present in the predictions for customer 42984229297455520
# (only the last expression of a cell is displayed, so this count is computed but not shown)
len(set(c_id2predicted_article_ids[42984229297455520]).intersection(set(test_week_purchases_by_cust[42984229297455520])))
# apk expects (actual, predicted): ground truth first, ranked predictions second
apk(test_week_purchases_by_cust[42984229297455520], c_id2predicted_article_ids[42984229297455520], k=15)


1.0

In [96]:
# calculate the mean average precision at k for the test set
# (predictions are looked up in the key order of the ground-truth dict, so
# both arguments are aligned customer by customer)
calculate_apk([c_id2predicted_article_ids[c_id] for c_id in test_week_purchases_by_cust.keys()], test_week_purchases_by_cust.values())

1.0

AttributeError: module 'tensorflow.core.protobuf.error_codes_pb2' has no attribute 'tsl'

# Create submission

In [110]:
# Load the sample submission; it supplies the customer ids to predict for.
sub = pd.read_csv('..\\..\\Input\\Dataset\\sample_submission.csv')

In [109]:
%%time
# Build up to 12 article ids per submission row: the customer's ranked
# candidates first, then last week's bestsellers as filler.
preds = []
for c_id in customer_hex_id_to_int(sub.customer_id):
    pred = c_id2predicted_article_ids.get(c_id, [])
    # Drop repeated articles (keeping the first occurrence and the order) so
    # a bestseller that was already predicted does not use up a second slot.
    pred = list(dict.fromkeys(pred + bestsellers_last_week))
    preds.append(pred[:12])

NameError: name 'sub' is not defined

In [60]:
# Turn every list of integer article ids into one space-separated string,
# with '0' prepended to each id.
preds = [
    ' '.join('0' + str(article) for article in customer_preds)
    for customer_preds in preds
]
sub['prediction'] = preds

In [61]:
# Write the submission without the index column.
# NOTE(review): Windows-style relative path; the Output folder must exist.
sub_name = 'basic_model_submission'
sub.to_csv(f'..\\..\\Output\\{sub_name}.csv', index=False)