Radek posted about this [here](https://www.kaggle.com/competitions/h-and-m-personalized-fashion-recommendations/discussion/309220), and linked to a GitHub repo with the code.

I just transferred that code here to Kaggle notebooks, that's all.

In [1]:
import numpy as np

def apk(actual, predicted, k=10):
    """
    Computes the average precision at k.

    This function computes the average prescision at k between two lists of
    items.

    Parameters
    ----------
    actual : list
             A list of elements that are to be predicted (order doesn't matter)
    predicted : list
                A list of predicted elements (order does matter)
    k : int, optional
        The maximum number of predicted elements

    Returns
    -------
    score : double
            The average precision at k over the input lists

    """
    if len(predicted)>k:
        predicted = predicted[:k]

    score = 0.0
    num_hits = 0.0

    for i,p in enumerate(predicted):
        if p in actual and p not in predicted[:i]:
            num_hits += 1.0
            score += num_hits / (i+1.0)

    if not actual:
        return 0.0

    return score / min(len(actual), k)

def mapk(actual, predicted, k=10):
    """
    Computes the mean average precision at k.

    This function computes the mean average prescision at k between two lists
    of lists of items.

    Parameters
    ----------
    actual : list
             A list of lists of elements that are to be predicted 
             (order doesn't matter in the lists)
    predicted : list
                A list of lists of predicted elements
                (order matters in the lists)
    k : int, optional
        The maximum number of predicted elements

    Returns
    -------
    score : double
            The mean average precision at k over the input lists

    """
    return np.mean([apk(a,p,k) for a,p in zip(actual, predicted)])

In [2]:
from sklearn.base import BaseEstimator, TransformerMixin
import numpy as np

# https://www.kaggle.com/c/h-and-m-personalized-fashion-recommendations/discussion/308635
def customer_hex_id_to_int(series):
    return series.str[-16:].apply(hex_id_to_int)

def hex_id_to_int(str):
    return int(str[-16:], 16)

def article_id_str_to_int(series):
    return series.astype('int32')

def article_id_int_to_str(series):
    return '0' + series.astype('str')

class Categorize(BaseEstimator, TransformerMixin):
    def __init__(self, min_examples=0):
        self.min_examples = min_examples
        self.categories = []
        
    def fit(self, X):
        for i in range(X.shape[1]):
            vc = X.iloc[:, i].value_counts()
            self.categories.append(vc[vc > self.min_examples].index.tolist())
        return self

    def transform(self, X):
        data = {X.columns[i]: pd.Categorical(X.iloc[:, i], categories=self.categories[i]).codes for i in range(X.shape[1])}
        return pd.DataFrame(data=data)


def calculate_apk(list_of_preds, list_of_gts):
    # for fast validation this can be changed to operate on dicts of {'cust_id_int': [art_id_int, ...]}
    # using 'data/val_week_purchases_by_cust.pkl'
    apks = []
    for preds, gt in zip(list_of_preds, list_of_gts):
        apks.append(apk(gt, preds, k=12))
    return np.mean(apks)

def eval_sub(sub_csv, skip_cust_with_no_purchases=True):
    sub=pd.read_csv(sub_csv)
    validation_set=pd.read_parquet('data/validation_ground_truth.parquet')

    apks = []

    no_purchases_pattern = []
    for pred, gt in zip(sub.prediction.str.split(), validation_set.prediction.str.split()):
        if skip_cust_with_no_purchases and (gt == no_purchases_pattern): continue
        apks.append(apk(gt, pred, k=12))
    return np.mean(apks)

In [3]:
import pandas as pd

In [4]:
%%time

transactions = pd.read_parquet('../input/warmup/transactions_train.parquet')
customers = pd.read_parquet('../input/parquet-new-assignement/new_customer_one.parquet')
articles = pd.read_parquet('../input/parquet-new-assignement/new_articles_one.parquet')

# sample = 0.05
# transactions = pd.read_parquet(f'data/transactions_train_sample_{sample}.parquet')
# customers = pd.read_parquet(f'data/customers_sample_{sample}.parquet')
# articles = pd.read_parquet(f'data/articles_train_sample_{sample}.parquet')

CPU times: user 1.4 s, sys: 1.47 s, total: 2.87 s
Wall time: 6.04 s


In [5]:
#adding of the column week to the dataframe transactions
transactions.t_dat = pd.to_datetime(transactions.t_dat, format='%Y-%m-%d')
transactions['week'] = 104 - (transactions.t_dat.max() - transactions.t_dat).dt.days // 7

In [6]:
#definition of the week for test
test_week = transactions.week.max() + 1 
#transactions = transactions[transactions.week > transactions.week.max() - 10]

# Generating candidates

### Last purchase candidates

In [7]:
'''deciding to work only with data of the months of September since probably articles
bought in this period could be the same and then the reccomendation could be more 
related also to the period in which we need to give them'''

#filtering only the weeks of the month of semptember for the years 2018-2020
filtered_transactions = transactions[(transactions['t_dat'].dt.month == 9) &
                                     (transactions['t_dat'].dt.year.between(2019, 2020))]

#groupby of candidates based on the customer_id and extraction of weeks
c2weeks = filtered_transactions.groupby('customer_id')['week'].unique()

#creation of a dictionary for shifted weeks
c2weeks2shifted_weeks = {}
candidates_last_purchase_september = filtered_transactions.copy()

for c_id, weeks in c2weeks.items():
    c2weeks2shifted_weeks[c_id] = {}
    for i in range(weeks.shape[0]-1):
        c2weeks2shifted_weeks[c_id][weeks[i]] = weeks[i+1]
    c2weeks2shifted_weeks[c_id][weeks[-1]] = test_week

#update of the dataframe candidates_last_purchase_september
weeks = []
for i, (c_id, week) in enumerate(zip(filtered_transactions['customer_id'], filtered_transactions['week'])):
    weeks.append(c2weeks2shifted_weeks[c_id][week])
    
candidates_last_purchase_september.week = weeks

### candidates

In [8]:
#creation of the mean price of purchases for each candidate only in the filtered transactions
mean_price_purchases = filtered_transactions.groupby('customer_id')['price'].mean().rename('mean price purchases').reset_index()
mean_price_purchases

Unnamed: 0,customer_id,mean price purchases
0,28847241659200,0.062695
1,77117344919861,0.033233
2,116809474287335,0.033881
3,200292573348128,0.039305
4,272412481300040,0.029856
...,...,...
364719,18446571879212697038,0.031621
364720,18446590778427270109,0.029644
364721,18446630855572834764,0.061000
364722,18446705133201055310,0.050831


In [9]:
#identification of the gender group that is the most present in a specific geographic area
most_present_gender = customers.groupby(['customer_id', 'postal_code', 'numerical_gender']).size().reset_index(name='count')
idx = most_present_gender.groupby(['customer_id', 'postal_code'])['count'].transform(max) == most_present_gender['count']
most_present_gender = most_present_gender[idx][['customer_id', 'postal_code', 'numerical_gender']]
most_present_gender = most_present_gender.rename(columns = {'numerical_gender': 'most_present_gender_in_area'})
most_present_gender.drop(columns = 'postal_code', inplace = True)
most_present_gender

Unnamed: 0,customer_id,most_present_gender_in_area
0,4245900472157,2
1,23962613628581,0
2,25398598941468,2
3,28847241659200,2
4,41046458195168,2
...,...,...
1371975,18446630855572834764,0
1371976,18446662237889060501,2
1371977,18446705133201055310,0
1371978,18446723086055369602,2


In [10]:
#identification of the medium age for geographic area
median_age = customers.groupby(['customer_id', 'postal_code'])['age'].median().reset_index(name = 'median_age_area')
median_age.drop(columns = 'postal_code', inplace = True)
median_age

Unnamed: 0,customer_id,median_age_area
0,4245900472157,21.0
1,23962613628581,34.0
2,25398598941468,21.0
3,28847241659200,21.0
4,41046458195168,18.0
...,...,...
1371975,18446630855572834764,33.0
1371976,18446662237889060501,75.0
1371977,18446705133201055310,60.0
1371978,18446723086055369602,33.0


In [11]:
candidates_definitive = pd.merge(candidates_last_purchase_september, mean_price_purchases, on = 'customer_id') #merge of values of mean fo the purchases
candidates_definitive = pd.merge(candidates_definitive, most_present_gender, on = 'customer_id') #merge of values of the dominant gender in teh geographic area of the customer
candidates_definitive = pd.merge(candidates_definitive, median_age, on = 'customer_id') #merge of values of the dominant gender in teh geographic area of the customer

candidates_definitive.columns

Index(['t_dat', 'customer_id', 'article_id', 'price', 'sales_channel_id',
       'week', 'mean price purchases', 'most_present_gender_in_area',
       'median_age_area'],
      dtype='object')

In [12]:
candidates_definitive = candidates_definitive.drop_duplicates(subset = 'customer_id')

In [13]:
#dataset about candidates in the analyzed period
candidates_definitive.reset_index(drop = True, inplace = True)
candidates_definitive

Unnamed: 0,t_dat,customer_id,article_id,price,sales_channel_id,week,mean price purchases,most_present_gender_in_area,median_age_area
0,2019-09-01,5341589326356166,767893004,0.016932,2,105,0.016085,2,20.0
1,2019-09-01,6553270874922497,708138021,0.067780,2,105,0.045181,2,24.0
2,2019-09-01,9020202416008033,696744001,0.049983,2,105,0.057126,2,34.0
3,2019-09-01,9395514529856290,779781003,0.042356,1,53,0.023712,2,54.0
4,2019-09-01,12485207671530100,715546002,0.011356,1,105,0.011346,0,25.0
...,...,...,...,...,...,...,...,...,...
364719,2020-09-22,18417769707947924979,729860001,0.022017,2,105,0.019475,2,40.0
364720,2020-09-22,18418054986721795659,873279003,0.042356,2,105,0.042356,0,28.0
364721,2020-09-22,18421175435799911749,863583001,0.033881,2,105,0.033881,2,28.0
364722,2020-09-22,18422784312842572958,639448011,0.050831,1,105,0.032186,2,25.0


In [14]:
#test set about transactions
test_set_transactions = filtered_transactions.drop_duplicates('customer_id')
test_set_transactions.reset_index(drop = True, inplace = True)
test_set_transactions

Unnamed: 0,t_dat,customer_id,article_id,price,sales_channel_id,week
0,2019-09-01,5341589326356166,767893004,0.016932,2,49
1,2019-09-01,6553270874922497,708138021,0.067780,2,49
2,2019-09-01,9020202416008033,696744001,0.049983,2,49
3,2019-09-01,9395514529856290,779781003,0.042356,1,49
4,2019-09-01,12485207671530100,715546002,0.011356,1,49
...,...,...,...,...,...,...
364719,2020-09-22,18417769707947924979,729860001,0.022017,2,104
364720,2020-09-22,18418054986721795659,873279003,0.042356,2,104
364721,2020-09-22,18421175435799911749,863583001,0.033881,2,104
364722,2020-09-22,18422784312842572958,639448011,0.050831,1,104


In [15]:
#bestsellers for septembers
mean_price = filtered_transactions \
    .groupby(['week', 'article_id'])['price'].mean()

sales = filtered_transactions \
    .groupby('week')['article_id'].value_counts() \
    .groupby('week').rank(method='dense', ascending=False) \
    .groupby('week').head(12).rename('bestseller_rank').astype('int8')

bestsellers_september = pd.merge(sales, mean_price, on=['week', 'article_id']).reset_index()

# Combining transactions and candidates / negative examples

In [16]:
filtered_transactions.loc[:, 'purchased'] = 1

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  self.obj[key] = value


In [17]:
data = pd.concat([filtered_transactions, candidates_last_purchase_september])
data = pd.merge(data, bestsellers_september[['article_id', 'bestseller_rank']], on = 'article_id', how = 'left')
data.fillna(0, inplace = True)

In [18]:
#adding candidates info
data = pd.merge(data, candidates_definitive[['customer_id', 'mean price purchases', 'most_present_gender_in_area', 'median_age_area']], on = 'customer_id', how = 'left')

In [19]:
#adding initial datasets to my new dataset
data = pd.merge(data, articles, on = 'article_id', how = 'left')
data = pd.merge(data, customers, on = 'customer_id', how = 'left')

In [20]:
data.sort_values(['week', 'customer_id'], inplace=True)
data.reset_index(drop=True, inplace=True)

In [21]:
train = data[data.week != test_week]
test = data[data.week==test_week].drop_duplicates(['customer_id', 'article_id', 'sales_channel_id']).copy()

In [22]:
train_baskets = train.groupby(['week', 'customer_id'])['article_id'].count().values

In [23]:
columns_to_use = ['article_id', 'product_type_no', 'graphical_appearance_no', 'colour_group_code', 'perceived_colour_value_id',
'perceived_colour_master_id', 'department_no', 'index_code', 'index_group_no', 'section_no', 'garment_group_no', 'FN', 'Active',
'club_member_status', 'fashion_news_frequency', 'bestseller_rank', 'mean price purchases', 'most_present_gender_in_area', 'median_age_area']

In [24]:
%%time

train_X = train[columns_to_use]
train_y = train['purchased']

test_X = test[columns_to_use]

CPU times: user 48.2 ms, sys: 34 ms, total: 82.3 ms
Wall time: 82.4 ms


# Model training

In [25]:
from lightgbm.sklearn import LGBMRanker

In [26]:
ranker = LGBMRanker(
    objective="lambdarank",
    metric="ndcg",
    boosting_type="dart",
    n_estimators=1,
    importance_type='gain',
    verbose=10
)

In [27]:
%%time

ranker = ranker.fit(
    train_X,
    train_y,
    group=train_baskets,
)

[LightGBM] [Debug] Dataset::GetMultiBinFromSparseFeatures: sparse rate 0.940203
[LightGBM] [Debug] Dataset::GetMultiBinFromAllFeatures: sparse rate 0.173201
[LightGBM] [Debug] init for col-wise cost 0.044369 seconds, init for row-wise cost 0.227277 seconds
You can set `force_col_wise=true` to remove the overhead.
[LightGBM] [Info] Total Bins 1112
[LightGBM] [Info] Number of data points in the train set: 2918658, number of used features: 19
[LightGBM] [Debug] Trained a tree with leaves = 31 and depth = 13
CPU times: user 6.42 s, sys: 274 ms, total: 6.69 s
Wall time: 2.14 s


In [28]:
for i in ranker.feature_importances_.argsort()[::-1]:
    print(columns_to_use[i], ranker.feature_importances_[i]/ranker.feature_importances_.sum())

article_id 0.8722422174316217
department_no 0.038587174903066034
bestseller_rank 0.021223372398009654
product_type_no 0.017165271720638777
index_code 0.012796393446133906
graphical_appearance_no 0.011042943359889881
section_no 0.007719192940979198
mean price purchases 0.005560238069510006
garment_group_no 0.005365664105602769
colour_group_code 0.003067872673784903
index_group_no 0.0029463242620862283
perceived_colour_master_id 0.0022833346886769463
FN 0.0
most_present_gender_in_area 0.0
Active 0.0
club_member_status 0.0
perceived_colour_value_id 0.0
fashion_news_frequency 0.0
median_age_area 0.0


# Calculate predictions

In [29]:
%time

test['preds'] = ranker.predict(test_X)

c_id2predicted_article_ids = test \
    .sort_values(['customer_id', 'preds'], ascending=False) \
    .groupby('customer_id')['article_id'].apply(list).to_dict()

bestsellers_september_all = \
    bestsellers_september[bestsellers_september.week == bestsellers_september.week.max()]['article_id'].tolist()

CPU times: user 3 µs, sys: 0 ns, total: 3 µs
Wall time: 5.96 µs


# Create submission

In [30]:
sub = pd.read_csv('/kaggle/input/h-and-m-personalized-fashion-recommendations/sample_submission.csv')

In [31]:
%%time
preds = []
for c_id in customer_hex_id_to_int(sub.customer_id):
    pred = c_id2predicted_article_ids.get(c_id, [])
    pred = pred + bestsellers_september_all
    preds.append(pred[:12])

CPU times: user 4.72 s, sys: 147 ms, total: 4.87 s
Wall time: 4.87 s


In [32]:
preds = [' '.join(['0' + str(p) for p in ps]) for ps in preds]
sub.prediction = preds

In [33]:
sub_name = 'submission'
sub.to_csv(f'{sub_name}.csv', index=False)