Based on the baseline "Radek's LGBMRanker starter-pack”: https://www.kaggle.com/code/marcogorelli/radek-s-lgbmranker-starter-pack


# Helper functions

In [1]:
import numpy as np

def apk(actual, predicted, k=12):
    """
    Computes the average precision at k.

    This function computes the average prescision at k between two lists of
    items.

    Parameters
    ----------
    actual : list
             A list of elements that are to be predicted (order doesn't matter)
    predicted : list
                A list of predicted elements (order does matter)
    k : int, optional
        The maximum number of predicted elements

    Returns
    -------
    score : double
            The average precision at k over the input lists

    """
    if len(predicted)>k:
        predicted = predicted[:k]

    score = 0.0
    num_hits = 0.0

    for i,p in enumerate(predicted):
        if p in actual and p not in predicted[:i]:
            num_hits += 1.0
            score += num_hits / (i+1.0)

    if not actual:
        return 0.0

    return score / min(len(actual), k)

def mapk(actual, predicted, k=12):
    """
    Computes the mean average precision at k.

    This function computes the mean average prescision at k between two lists
    of lists of items.

    Parameters
    ----------
    actual : list
             A list of lists of elements that are to be predicted
             (order doesn't matter in the lists)
    predicted : list
                A list of lists of predicted elements
                (order matters in the lists)
    k : int, optional
        The maximum number of predicted elements

    Returns
    -------
    score : double
            The mean average precision at k over the input lists

    """
    return np.mean([apk(a,p,k) for a,p in zip(actual, predicted)])

In [2]:
def precision(actual, predicted, k=12):
    if len(predicted) > k:
        predicted = predicted[:k]

    correct_predictions = [p for p in predicted if p in actual]

    return len(correct_predictions) / len(predicted)

def recall(actual, predicted, k=12):
    if len(predicted) > k:
        predicted = predicted[:k]

    correct_predictions = [p for p in predicted if p in actual]

    return len(correct_predictions) / len(actual)

def precision12(actual, predicted, k=12):
    return np.mean([precision(a,p,k) for a,p in zip(actual, predicted)])

def recall12(actual, predicted, k=12):
    return np.mean([recall(a,p,k) for a,p in zip(actual, predicted)])

In [3]:
from sklearn.base import BaseEstimator, TransformerMixin
import numpy as np

# https://www.kaggle.com/c/h-and-m-personalized-fashion-recommendations/discussion/308635
def customer_hex_id_to_int(series):
    return series.str[-16:].apply(hex_id_to_int)

def hex_id_to_int(str):
    return int(str[-16:], 16)

def article_id_str_to_int(series):
    return series.astype('int32')

def article_id_int_to_str(series):
    return '0' + series.astype('str')

class Categorize(BaseEstimator, TransformerMixin):
    def __init__(self, min_examples=0):
        self.min_examples = min_examples
        self.categories = []

    def fit(self, X):
        for i in range(X.shape[1]):
            vc = X.iloc[:, i].value_counts()
            self.categories.append(vc[vc > self.min_examples].index.tolist())
        return self

    def transform(self, X):
        data = {X.columns[i]: pd.Categorical(X.iloc[:, i], categories=self.categories[i]).codes for i in range(X.shape[1])}
        return pd.DataFrame(data=data)


def calculate_apk(list_of_preds, list_of_gts):
    # for fast validation this can be changed to operate on dicts of {'cust_id_int': [art_id_int, ...]}
    # using 'data/val_week_purchases_by_cust.pkl'
    apks = []
    for preds, gt in zip(list_of_preds, list_of_gts):
        apks.append(apk(gt, preds, k=12))
    return np.mean(apks)

def eval_sub(sub_csv, skip_cust_with_no_purchases=True):
    sub=pd.read_csv(sub_csv)
    validation_set=pd.read_parquet('data/validation_ground_truth.parquet')

    apks = []

    no_purchases_pattern = []
    for pred, gt in zip(sub.prediction.str.split(), validation_set.prediction.str.split()):
        if skip_cust_with_no_purchases and (gt == no_purchases_pattern): continue
        apks.append(apk(gt, pred, k=12))
    return np.mean(apks)

In [4]:
import pandas as pd

# Read input files

In [5]:
%%time
pad = "/kaggle/input/makeparquet"
transactions = pd.read_parquet(pad+'/transactions_train.parquet')
customers = pd.read_parquet(pad+'/customers.parquet')
articles = pd.read_parquet(pad+'/articles.parquet')

CPU times: user 3.81 s, sys: 4.79 s, total: 8.6 s
Wall time: 6.4 s


In [6]:
val = transactions[transactions.week == transactions.week.max()]

# Testing week or latest week for submission

In [7]:
Testting = True
test_week = transactions.week.max()
transactions = transactions[transactions.week > transactions.week.max() - 9]
transactions = transactions[transactions.week < transactions.week.max()]

# Testting = False
# test_week = transactions.week.max()+1
# transactions = transactions[transactions.week > transactions.week.max() - 10]


# Generating candidates

### Last purchase candidates

In [8]:
%%time

c2weeks = transactions.groupby('customer_id')['week'].unique()

CPU times: user 30.2 s, sys: 225 ms, total: 30.4 s
Wall time: 30.3 s


In [9]:
transactions.groupby('week')['t_dat'].agg(['min', 'max'])

Unnamed: 0_level_0,min,max
week,Unnamed: 1_level_1,Unnamed: 2_level_1
95,2020-07-15,2020-07-21
96,2020-07-22,2020-07-28
97,2020-07-29,2020-08-04
98,2020-08-05,2020-08-11
99,2020-08-12,2020-08-18
100,2020-08-19,2020-08-25
101,2020-08-26,2020-09-01
102,2020-09-02,2020-09-08
103,2020-09-09,2020-09-15


In [10]:
%%time

c2weeks2shifted_weeks = {}

for c_id, weeks in c2weeks.items():
    c2weeks2shifted_weeks[c_id] = {}
    for i in range(weeks.shape[0]-1):
        c2weeks2shifted_weeks[c_id][weeks[i]] = weeks[i+1]
    c2weeks2shifted_weeks[c_id][weeks[-1]] = test_week

CPU times: user 1.08 s, sys: 59.8 ms, total: 1.14 s
Wall time: 1.14 s


In [11]:
candidates_last_purchase = transactions.copy()

In [12]:
%%time

weeks = []
for i, (c_id, week) in enumerate(zip(transactions['customer_id'], transactions['week'])):
    weeks.append(c2weeks2shifted_weeks[c_id][week])

candidates_last_purchase.week=weeks

CPU times: user 37.2 s, sys: 178 ms, total: 37.4 s
Wall time: 37.2 s


# Over write with my own implementation of candidate generation

In [13]:
# Generating candidates based on repurchasing
repurchase_candidates = transactions.copy()
repurchase_candidates['week'] += 1 

# Merge repurchase candidates with customer information
repurchase_candidates = pd.merge(repurchase_candidates, customers, on='customer_id', how='left')
repurchase_candidates = pd.merge(repurchase_candidates, articles, on='article_id', how='left')

# Drop unnecessary columns
repurchase_candidates.drop(['price', 'sales_channel_id', 'year', 'month'], axis=1, inplace=True)

# Rename columns to match the rest of the data
repurchase_candidates.rename(columns={'week': 'previous_purchase_week'}, inplace=True)

candidates_last_purchase = repurchase_candidates.copy()
# Fill NaN values in 'previous_purchase_week' with a value that represents no repurchase
candidates_last_purchase['previous_purchase_week'].fillna(-1, inplace=True)

# Create a new column 'repurchased' indicating repurchase
candidates_last_purchase['repurchased'] = (candidates_last_purchase['previous_purchase_week'] != -1).astype(int)

# Drop the 'previous_purchase_week' column as it's no longer needed
# candidates_last_purchase.drop('previous_purchase_week', axis=1, inplace=True)
candidates_last_purchase.rename(columns={'previous_purchase_week': 'week'}, inplace=True)

### Bestsellers candidates

In [14]:
# mean_price = transactions \
#     .groupby(['week', 'article_id'])['price'].mean()
# sales = transactions \
#     .groupby('week')['article_id'].value_counts() \
#     .groupby('week').rank(method='dense', ascending=False) \
#     .groupby('week').head(12).rename('bestseller_rank').astype('int8')
# bestsellers_previous_week = pd.merge(sales, mean_price, on=['week', 'article_id']).reset_index()
# bestsellers_previous_week.week += 1
# bestsellers_previous_week.week
# bestsellers_previous_week.pipe(lambda df: df[df['week']==96])

**Use either the above one or my own implmentation below**

In [15]:
# Calculate the number of unique customers who bought each article
unique_customers_per_article = transactions.groupby('article_id')['customer_id'].nunique().reset_index()
unique_customers_per_article.rename(columns={'customer_id': 'unique_customers'}, inplace=True)

# Rank articles based on the number of unique customers
bestsellers_ranked = unique_customers_per_article.sort_values('unique_customers', ascending=False).reset_index(drop=True)
bestsellers_ranked['bestseller_rank'] = bestsellers_ranked.index + 1

# Merge the bestseller ranks with the articles information
bestsellers_previous_week = pd.merge(bestsellers_ranked[['article_id', 'bestseller_rank']], articles, on='article_id')

# Calculate mean price for each article in each week
mean_price = transactions.groupby(['week', 'article_id'])['price'].mean().reset_index()

# Identify the top 12 bestsellers for each week based on mean price
bestsellers_previous_week = mean_price.groupby('week', group_keys=False).apply(
    lambda x: x.sort_values('price', ascending=False).head(12)
).reset_index(drop=True)


bestsellers_previous_week['bestseller_rank'] = bestsellers_previous_week.groupby('week').cumcount() + 1
bestsellers_previous_week = pd.merge(bestsellers_previous_week, articles, on='article_id')

# Adjust week for the next week
bestsellers_previous_week['week'] += 1


In [16]:
unique_transactions = transactions \
    .groupby(['week', 'customer_id']) \
    .head(1) \
    .drop(columns=['article_id', 'price']) \
    .copy()

In [17]:
transactions.drop_duplicates(['week', 'customer_id'])

Unnamed: 0,t_dat,customer_id,article_id,price,sales_channel_id,season,year,month,week
29030503,2020-07-15,272412481300040,778064028,0.008458,1,2,-28,7,95
29064059,2020-07-15,1456826891333599,888294001,0.013542,1,2,-28,7,95
29067103,2020-07-15,2133687643102426,843642001,0.042356,2,2,-28,7,95
29027487,2020-07-15,6010692573790711,857812010,0.039661,1,2,-28,7,95
29046403,2020-07-15,6171059100114610,815447007,0.006763,2,2,-28,7,95
...,...,...,...,...,...,...,...,...,...
31521960,2020-09-15,18439897732908966680,794321007,0.061000,2,3,-28,9,103
31531712,2020-09-15,18444276791873187543,867969008,0.033881,2,3,-28,9,103
31539937,2020-09-15,18444799607866739422,909721003,0.042356,2,3,-28,9,103
31543799,2020-09-15,18446250046654386343,869872006,0.033881,1,3,-28,9,103


In [18]:
candidates_bestsellers = pd.merge(
    unique_transactions,
    bestsellers_previous_week,
    on='week',
)

In [19]:
test_set_transactions = unique_transactions.drop_duplicates('customer_id').reset_index(drop=True)
test_set_transactions.week = test_week

In [20]:
candidates_bestsellers_test_week = pd.merge(
    test_set_transactions,
    bestsellers_previous_week,
    on='week'
)

In [21]:
candidates_bestsellers_test_week.columns

Index(['t_dat', 'customer_id', 'sales_channel_id', 'season', 'year', 'month',
       'week', 'article_id', 'price', 'bestseller_rank', 'product_code',
       'prod_name', 'product_type_no', 'product_type_name',
       'product_group_name', 'graphical_appearance_no',
       'graphical_appearance_name', 'colour_group_code', 'colour_group_name',
       'perceived_colour_value_id', 'perceived_colour_value_name',
       'perceived_colour_master_id', 'perceived_colour_master_name',
       'department_no', 'department_name', 'index_code', 'index_name',
       'index_group_no', 'index_group_name', 'section_no', 'section_name',
       'garment_group_no', 'garment_group_name', 'detail_desc', 'fabric_jeans',
       'fabric_cotton', 'fabric_wool', 'fabric_polyester', 'fabric_silk',
       'fabric_denim', 'fabric_linen', 'fabric_spandex', 'fabric_rayon',
       'fabric_nylon', 'fabric_leather', 'fabric_suede'],
      dtype='object')

In [22]:
candidates_bestsellers = pd.concat([candidates_bestsellers, candidates_bestsellers_test_week])
candidates_bestsellers.drop(columns='bestseller_rank', inplace=True)

In [23]:
candidates_bestsellers.columns

Index(['t_dat', 'customer_id', 'sales_channel_id', 'season', 'year', 'month',
       'week', 'article_id', 'price', 'product_code', 'prod_name',
       'product_type_no', 'product_type_name', 'product_group_name',
       'graphical_appearance_no', 'graphical_appearance_name',
       'colour_group_code', 'colour_group_name', 'perceived_colour_value_id',
       'perceived_colour_value_name', 'perceived_colour_master_id',
       'perceived_colour_master_name', 'department_no', 'department_name',
       'index_code', 'index_name', 'index_group_no', 'index_group_name',
       'section_no', 'section_name', 'garment_group_no', 'garment_group_name',
       'detail_desc', 'fabric_jeans', 'fabric_cotton', 'fabric_wool',
       'fabric_polyester', 'fabric_silk', 'fabric_denim', 'fabric_linen',
       'fabric_spandex', 'fabric_rayon', 'fabric_nylon', 'fabric_leather',
       'fabric_suede'],
      dtype='object')

# Combining transactions and candidates / negative examples

In [24]:
transactions['purchased'] = 1

In [25]:
data = pd.concat([transactions, candidates_last_purchase, candidates_bestsellers])
data.purchased.fillna(0, inplace=True)

In [26]:
data.drop_duplicates(['customer_id', 'article_id', 'week'], inplace=True)

In [27]:
data.purchased.mean()

0.1352913103560726

### Add bestseller information

In [28]:
data = pd.merge(
    data,
    bestsellers_previous_week[['week', 'article_id', 'bestseller_rank']],
    on=['week', 'article_id'],
    how='left'
)

In [29]:
data = data[data.week != data.week.min()]
data.bestseller_rank.fillna(999, inplace=True)

In [30]:
data.sort_values(['week', 'customer_id'], inplace=True)
data.reset_index(drop=True, inplace=True)

In [31]:
train = data[data.week != test_week]
test = data[data.week==test_week].drop_duplicates(['customer_id', 'article_id', 'sales_channel_id']).copy()

In [32]:
train_baskets = train.groupby(['week', 'customer_id'])['article_id'].count().values

In [33]:
columns_to_use = ['article_id', 'graphical_appearance_no', 'colour_group_code', 'perceived_colour_value_id',
'perceived_colour_master_id', 'department_no', 'index_code',
'index_group_no', 'section_no', 'garment_group_no', 'FN', 'Active',
'club_member_status', 'fashion_news_frequency', 'age', 'postal_code', 'bestseller_rank',
'buys_womens_clothing', 'buys_mens_clothes', 'buys_kid_baby_clothes']
#  'year', 'month',, 'product_type_no'

In [34]:
%%time

train_X = train[columns_to_use]
train_y = train['purchased']

test_X = test[columns_to_use]

CPU times: user 694 ms, sys: 734 ms, total: 1.43 s
Wall time: 1.43 s


# Model training

In [35]:
from lightgbm.sklearn import LGBMRanker
from tqdm import tqdm


In [36]:
ranker = LGBMRanker(
    objective="lambdarank",
    metric="ndcg",
    boosting_type="dart",
    n_estimators=1,
    importance_type='gain',
    verbose=10
)

In [37]:
%%time

ranker = ranker.fit(
    train_X,
    train_y,
    group=train_baskets,
)

[LightGBM] [Debug] Dataset::GetMultiBinFromSparseFeatures: sparse rate 0.799041
[LightGBM] [Debug] Dataset::GetMultiBinFromAllFeatures: sparse rate 0.439472
[LightGBM] [Debug] init for col-wise cost 0.821670 seconds, init for row-wise cost 2.279373 seconds
You can set `force_row_wise=true` to remove the overhead.
And if memory is not enough, you can set `force_col_wise=true`.
[LightGBM] [Debug] Using Sparse Multi-Val Bin
[LightGBM] [Info] Total Bins 1052
[LightGBM] [Info] Number of data points in the train set: 11337104, number of used features: 20
[LightGBM] [Debug] Trained a tree with leaves = 4 and depth = 2
CPU times: user 28.6 s, sys: 3.77 s, total: 32.4 s
Wall time: 13.6 s


In [38]:
for i in ranker.feature_importances_.argsort()[::-1]:
    print(columns_to_use[i], ranker.feature_importances_[i]/ranker.feature_importances_.sum())

graphical_appearance_no 0.9999999999999999
article_id 1.2566101113223014e-16
buys_mens_clothes 0.0
colour_group_code 0.0
perceived_colour_value_id 0.0
perceived_colour_master_id 0.0
department_no 0.0
index_code 0.0
index_group_no 0.0
section_no 0.0
buys_kid_baby_clothes 0.0
FN 0.0
Active 0.0
club_member_status 0.0
fashion_news_frequency 0.0
age 0.0
postal_code 0.0
bestseller_rank 0.0
buys_womens_clothing 0.0
garment_group_no 0.0


# Calculate predictions

In [39]:
%time

test['preds'] = ranker.predict(test_X)

c_id2predicted_article_ids = test \
    .sort_values(['customer_id', 'preds'], ascending=False) \
    .groupby('customer_id')['article_id'].apply(list).to_dict()

bestsellers_last_week = \
    bestsellers_previous_week[bestsellers_previous_week.week == bestsellers_previous_week.week.max()]['article_id'].tolist()

CPU times: user 3 µs, sys: 1e+03 ns, total: 4 µs
Wall time: 9.06 µs


# Create submission

In [40]:
sub = pd.read_csv('/kaggle/input/h-and-m-personalized-fashion-recommendations/sample_submission.csv')

In [41]:
%%time
preds = []
for c_id in customer_hex_id_to_int(sub.customer_id):
    pred = c_id2predicted_article_ids.get(c_id, [])
    pred = pred + bestsellers_last_week
    preds.append(pred[:12])

CPU times: user 7.22 s, sys: 274 ms, total: 7.49 s
Wall time: 7.49 s


In [42]:
if Testting:
    positive_items_val = val.groupby(['customer_id'])['article_id'].apply(list)
    # creating validation set for metrics use case
    val_users = positive_items_val.keys()
    val_items = []
    for i,user in tqdm(enumerate(val_users)):
        val_items.append(positive_items_val[user])

    print("Total users in validation:", len(val_users))
    print("mAP12 Score on Validation set:", mapk(val_items, preds))
    print("precision Score on Validation set:", precision12(val_items, preds))
    print("recall Score on Validation set:", recall12(val_items, preds))

68984it [00:00, 143298.85it/s]


Total users in validation: 68984
mAP12 Score on Validation set: 0.00013450753137673294
precision Score on Validation set: 9.543275735436235e-05
recall Score on Validation set: 0.0004961334593930558


In [43]:
preds = [' '.join(['0' + str(p) for p in ps]) for ps in preds]
sub.prediction = preds

In [44]:
sub_name = 'candidateRepurchase_oldBestseller'
sub.to_csv(f'{sub_name}.csv.gz', index=False)