Radek posted about this [here](https://www.kaggle.com/competitions/h-and-m-personalized-fashion-recommendations/discussion/309220), and linked to a GitHub repo with the code.

I just transferred that code here to Kaggle notebooks, that's all.

In [None]:
from sklearn.base import BaseEstimator, TransformerMixin
import numpy as np

# https://www.kaggle.com/c/h-and-m-personalized-fashion-recommendations/discussion/308635
def customer_hex_id_to_int(series):
    """Convert a Series of hexadecimal customer-id strings to integers.

    Only the last 16 characters of every string are kept before each value is
    parsed with ``hex_id_to_int``.
    """
    last_16_chars = series.str[-16:]
    return last_16_chars.apply(hex_id_to_int)

def hex_id_to_int(str):
    """Parse the last 16 characters of a hexadecimal string as an integer."""
    # NOTE: the parameter name shadows the builtin ``str``; it is kept so that
    # existing keyword callers keep working.
    tail = str[-16:]
    return int(tail, base=16)

def article_id_str_to_int(series):
    """Cast a Series of article ids to the ``int32`` dtype."""
    as_int32 = series.astype('int32')
    return as_int32

def article_id_int_to_str(series):
    """Render article ids as strings with a single '0' prepended to each value."""
    digits = series.astype('str')
    return '0' + digits

class Categorize(BaseEstimator, TransformerMixin):
    """Encode every column of a DataFrame as integer category codes.

    Parameters
    ----------
    min_examples : int, default 0
        A value is kept as a category only when it occurs strictly more than
        ``min_examples`` times in the fitted column.

    Attributes
    ----------
    categories : list of list
        One list of kept values per fitted column, in ``value_counts`` order.
        Empty until ``fit`` has been called.
    """

    def __init__(self, min_examples=0):
        self.min_examples = min_examples
        self.categories = []

    def fit(self, X, y=None):
        """Learn the kept categories of every column of ``X``.

        ``y`` is accepted (and ignored) so the transformer can be used inside
        scikit-learn pipelines. Returns ``self``.
        """
        # Build a fresh list on every call: appending to the list created in
        # __init__ made a second fit() stack its categories behind the stale
        # ones, so transform() kept using the categories of the first fit.
        learned = []
        for i in range(X.shape[1]):
            counts = X.iloc[:, i].value_counts()
            learned.append(counts[counts > self.min_examples].index.tolist())
        self.categories = learned
        return self

    def transform(self, X):
        """Return a new DataFrame holding the category code of every cell.

        Columns are matched to the fitted categories by position. Values that
        are not among the fitted categories get the code -1. The result is
        built from plain code arrays, so it carries a fresh default index
        rather than the index of ``X``.
        """
        data = {
            X.columns[i]: pd.Categorical(X.iloc[:, i], categories=self.categories[i]).codes
            for i in range(X.shape[1])
        }
        return pd.DataFrame(data=data)


def calculate_apk(list_of_preds, list_of_gts):
    """Return the mean of ``apk(gt, preds, k=12)`` over paired predictions and ground truths.

    ``apk`` is not defined in the visible part of this notebook and has to be
    provided elsewhere.
    """
    # for fast validation this can be changed to operate on dicts of {'cust_id_int': [art_id_int, ...]}
    # using 'data/val_week_purchases_by_cust.pkl'
    per_customer_scores = [
        apk(actual, predicted, k=12)
        for predicted, actual in zip(list_of_preds, list_of_gts)
    ]
    return np.mean(per_customer_scores)

def eval_sub(sub_csv, skip_cust_with_no_purchases=True):
    """Score a submission CSV against the stored validation ground truth.

    The ``prediction`` columns of both tables are split on whitespace and
    compared row by row with ``apk(gt, pred, k=12)``; the mean score is
    returned. When ``skip_cust_with_no_purchases`` is true, rows whose ground
    truth splits to an empty list are left out of the mean.

    ``apk`` is not defined in the visible part of this notebook.
    """
    submission = pd.read_csv(sub_csv)
    ground_truth = pd.read_parquet('data/validation_ground_truth.parquet')

    empty_basket = []
    paired_rows = zip(submission.prediction.str.split(), ground_truth.prediction.str.split())
    scores = [
        apk(gt, pred, k=12)
        for pred, gt in paired_rows
        if not (skip_cust_with_no_purchases and gt == empty_basket)
    ]
    return np.mean(scores)

In [None]:
import pandas as pd

In [None]:
%%time

# Load the transactions, customers and articles tables from the parquet files
# of the "warmup" input dataset.
transactions = pd.read_parquet('../input/warmup/transactions_train.parquet')
customers = pd.read_parquet('../input/warmup/customers.parquet')
articles = pd.read_parquet('../input/warmup/articles.parquet')

# Alternative kept for reference: load down-sampled copies of the same three
# tables from a local data/ folder.
# sample = 0.05
# transactions = pd.read_parquet(f'data/transactions_train_sample_{sample}.parquet')
# customers = pd.read_parquet(f'data/customers_sample_{sample}.parquet')
# articles = pd.read_parquet(f'data/articles_train_sample_{sample}.parquet')

# Test Week
Set the test week according to what we want to do. To generate predictions for the week after the data ends, use `transactions.week.max() + 1`.
To measure the recall (how well the candidates cover the real purchases), set `test_week` to a week that is present in the data.

In [None]:
# Week to predict / evaluate on.
test_week = 104
# Remember the last week present in the full data before it is filtered below.
absolute_max_week = transactions.week.max()
print(test_week)

# Keep the real purchases of the test week aside for the recall evaluation.
test_week_transactions = transactions[transactions.week == test_week]

# Train only on the weeks strictly between test_week - 11 and test_week.
after_window_start = transactions.week > test_week - 11
before_test_week = transactions.week < test_week
transactions = transactions[after_window_start & before_test_week].reset_index(drop=True)
# transactions = transactions[transactions.week < test_week]

# Generating candidates

### Last purchase candidates

In [None]:
%%time

# For every customer, collect the distinct weeks (within the filtered
# transactions) in which they bought something.
c2weeks = transactions.groupby('customer_id')['week'].unique()

In [None]:
# Display the earliest and latest transaction date of every week number.
transactions.groupby('week')['t_dat'].agg(['min', 'max'])

In [None]:
# Display the per-customer arrays of purchase weeks.
c2weeks

In [None]:
%%time

# For every customer, map each purchase week to the customer's next purchase
# week; the last purchase week is mapped to the test week.
c2weeks2shifted_weeks = {}

for c_id, weeks in c2weeks.items():
    shifted = dict(zip(weeks[:-1], weeks[1:]))
    shifted[weeks[-1]] = test_week
    c2weeks2shifted_weeks[c_id] = shifted

In [None]:
# Spot check: display the week -> shifted-week mapping of one hard-coded
# customer id (raises KeyError when that id is not a key of the mapping).
c2weeks2shifted_weeks[28847241659200]

In [None]:
# Start the "last purchase" candidates from an independent copy of the
# transactions, so that changing the copy leaves `transactions` untouched.
candidates_last_purchase = transactions.copy()

In [None]:
%%time

# Look up the shifted week of every transaction row, in row order, and store it
# as the week of the corresponding last-purchase candidate.
weeks = [
    c2weeks2shifted_weeks[c_id][week]
    for c_id, week in zip(transactions['customer_id'], transactions['week'])
]

candidates_last_purchase['week'] = weeks

In [None]:
# Spot check: display the last-purchase candidates of one hard-coded customer.
candidates_last_purchase[candidates_last_purchase['customer_id']==272412481300040]

In [None]:
# Display the original transactions of the same customer, to compare their
# weeks with the shifted weeks of the candidates.
transactions[transactions['customer_id']==272412481300040]

### Bestsellers candidates

Compute the mean price of the transactions, grouped by week and article_id.

In [None]:
# Average price paid for every article in every week.
price_by_week_article = transactions.groupby(['week', 'article_id'])['price']
mean_price = price_by_week_article.mean()

In [None]:
# Display the mean price per (week, article_id).
mean_price

make sales which is the ranking of the 12 most bought article_ids in each week

In [None]:
# Count how often every article was bought in each week.
weekly_article_counts = transactions.groupby('week')['article_id'].value_counts()

# Turn the counts into dense ranks within each week (rank 1 = most bought).
weekly_dense_ranks = weekly_article_counts.groupby('week').rank(method='dense', ascending=False)

# Keep the first 12 rows of every week and store the rank as a small integer.
sales = (
    weekly_dense_ranks
    .groupby('week').head(12)
    .rename('bestseller_rank')
    .astype('int8')
)

In [None]:
# Display the bestseller ranks per (week, article_id).
sales

In [None]:
# Spot check: display the bestseller ranks of week 95.
sales.loc[95]

`bestsellers_previous_week` joins the mean price and the bestseller rank for all weeks. Each row is therefore a [week, article] combination with its rank and its average price. The week is then increased by one, so that the bestsellers of a week are attached to the following week.

In [None]:
# Join rank and mean price per (week, article_id) and flatten the index into columns.
rank_and_price = pd.merge(sales, mean_price, on=['week', 'article_id'])
bestsellers_previous_week = rank_and_price.reset_index()

# Shift by one week: a row now describes the bestsellers of the week before `week`.
bestsellers_previous_week['week'] = bestsellers_previous_week['week'] + 1

In [None]:
# Spot check: display the bestseller rows attached to week 96.
bestsellers_previous_week.pipe(lambda df: df[df['week']==96])

Unique transactions is a dataframe containing one entry of the [week, customer_id] combination. Here the article_ids are dropped as well as the prices

In [None]:
# Keep the first transaction row of every (week, customer_id) pair ...
first_row_per_week_customer = transactions.groupby(['week', 'customer_id']).head(1)

# ... and drop the article-specific columns, leaving one row per customer per week.
unique_transactions = first_row_per_week_customer.drop(columns=['article_id', 'price']).copy()

In [None]:
# Display the one-row-per-(week, customer) table.
unique_transactions

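Preview `transactions` with the duplicate [week, customer] baskets dropped. The result of `drop_duplicates` below is only displayed, not assigned, so `transactions` itself stays unchanged.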

In [None]:
# Display the transactions before the drop_duplicates preview.
transactions

In [None]:
# Display a de-duplicated view with one row per (week, customer_id).
# The result is not assigned, so `transactions` is not modified.
transactions.drop_duplicates(['week', 'customer_id'])

In [None]:
# Display the transactions again; they are unchanged by the preview.
transactions

bestsellers for each customer from previous week for all the weeks that are in the dataset

### !!!FOR AGE GROUP MERGE ON AGE

In [None]:
# Give every (week, customer) row all bestseller rows attached to that week.
candidates_bestsellers = unique_transactions.merge(bestsellers_previous_week, on='week')

make a set of users that would buy in the test week

In [None]:
# One row per customer seen in the training window, relabelled to the test week.
test_set_transactions = (
    unique_transactions
    .drop_duplicates('customer_id')
    .reset_index(drop=True)
)
test_set_transactions['week'] = test_week

In [None]:
# Display the per-customer rows prepared for the test week.
test_set_transactions

all the bestsellers based on the weeks for the test week

In [None]:
# Attach the bestseller rows of the test week to every test-set customer.
candidates_bestsellers_test_week = test_set_transactions.merge(
    bestsellers_previous_week,
    on='week',
)

combine both lists of bestsellers

In [None]:
# Stack the training-week and test-week bestseller candidates and drop the rank
# column from the combined table.
stacked_bestsellers = pd.concat([candidates_bestsellers, candidates_bestsellers_test_week])
candidates_bestsellers = stacked_bestsellers.drop(columns='bestseller_rank')

In [None]:
# Display the combined bestseller candidates.
candidates_bestsellers

# Making a Recall evaluation function
For the recall evaluation we have several requirements that have to be met:
* We need to compare two sets of items, the first being the items that are in fact bought, the other set of items is the set of items that are candidates for the predictions
* Only compare recall -> amount of positives (candidates that are in fact bought) / total positives (amount of items bought)

!! If we were to generate more candidates, the recall would automatically grow, because the chance that a purchased item is among the candidates becomes bigger.

## First attempt of my own
This is the code I wrote to compute the recall by hand. It is immensely slow because of the loop and the separate dataframes that are created inside it. Below it is a recall function, inspired by the one from Noah Daniels, that uses table and array operations, which are many times faster.

In [None]:
# Slow, loop-based reference implementation of the average per-customer recall.
# It returns None if the predicted week is not in the purchases set
def recall(purchases, candidates):
    """Return the mean recall of the candidates over the test-week customers.

    Reads the module-level ``test_week`` and ``absolute_max_week``.

    purchases:  table with ``week``, ``customer_id`` and ``article_id`` columns.
    candidates: table with the same three columns.

    For every customer who bought something in ``test_week``, the recall is the
    number of distinct candidate articles that were bought, divided by the
    number of distinct articles bought. The mean over those customers is
    returned, or None when ``test_week`` lies beyond ``absolute_max_week``.
    """
    # Warn (but carry on) when the newest candidate week is not the test week.
    if candidates.week.max() != test_week:
        print("There is something wrong with the predictions")

    # Without real purchases for the test week there is nothing to compare with.
    if test_week > absolute_max_week:
        return None

    bought = purchases[purchases.week == test_week]
    proposed = candidates[candidates.week == test_week]

    # Every customer with at least one purchase in the test week.
    users = bought.customer_id.drop_duplicates()

    recall_sum = 0
    for user in users:
        # Distinct articles bought by / proposed to this customer.
        bought_articles = bought.loc[bought.customer_id == user, 'article_id'].drop_duplicates()
        proposed_articles = proposed.loc[proposed.customer_id == user, 'article_id'].drop_duplicates()

        # True positives: proposed articles that were really bought.
        hits = proposed_articles[proposed_articles.isin(bought_articles)]
        recall_sum += hits.count() / bought_articles.count()

    # Average the per-customer recalls.
    return recall_sum / users.count()
        

In [None]:
# Vectorised average recall of generated candidates versus the actually bought items.
def average_recall(purchases, candidates):
    """Return the mean per-customer recall of ``candidates`` against ``purchases``.

    Both tables are joined on the columns they share. Per customer, the number
    of purchase rows that also occur among the candidates is divided by the
    number of purchase rows; customers without any hit count as recall 0.
    The mean of the first resulting column is returned.
    """
    hits = purchases.merge(candidates, how='inner').drop_duplicates()
    hits_per_customer = hits.groupby('customer_id').count()
    bought_per_customer = purchases.groupby('customer_id').count()
    # fill_value=0 turns customers that are missing from the hits into 0 / n = 0.
    per_customer_recall = hits_per_customer.div(bought_per_customer, fill_value=0)
    return per_customer_recall.mean().values[0]

In [None]:
# Only evaluate when the test week is covered by the real data.
if not test_week > absolute_max_week:
    pair_columns = ['customer_id', 'article_id']

    # Distinct (customer, article) pairs really bought in the test week.
    bought_in_test_week = test_week_transactions.week == test_week
    purchases = test_week_transactions.loc[bought_in_test_week, pair_columns].drop_duplicates()

    # Distinct (customer, article) pairs proposed by the bestseller candidates.
    proposed_for_test_week = candidates_bestsellers.week == test_week
    candidates = candidates_bestsellers.loc[proposed_for_test_week, pair_columns].drop_duplicates()

    print(average_recall(purchases, candidates))

In [None]:
# test = candidates_bestsellers[candidates_bestsellers.week == test_week].customer_id.drop_duplicates()
# print(test.count())

# Making a new group of candidates based on age group
First make a new column and then apply the same process, but group on both the age group and the week instead of on the week alone.

In [None]:
# define age groups
def get_age_group(age):
    """Return the integer code (0-6) of the age group that ``age`` falls in.

    Codes: 0 = under 18, 1 = 18-24, 2 = 25-34, 3 = 35-44, 4 = 45-54,
    5 = 55-64, 6 = everything else (65 and older, and any value that is not
    smaller than any of the bounds, such as NaN).
    """
    # Exclusive upper bound of groups 0..5, in ascending order.
    upper_bounds = (18, 25, 35, 45, 55, 65)
    for group, bound in enumerate(upper_bounds):
        if age < bound:
            return group
    return len(upper_bounds)

Created a function to apply to the ages in the table, then created a new column containing the result.
Note: `get_age_group` returns integer codes (0–6), not string labels such as '18-24', so the column can be used directly as a numeric feature in the ranker model.

In [None]:
# Apply the age grouping to the customers table: every customer gets the
# integer age-group code of their age in a new "age_group" column.
customers["age_group"] = customers["age"].apply(get_age_group)

## Now make a similar popularity calculation with respect to the age_group popularity 
(how popular is the item in a certain age_group that week)
Then calculate similarly and evaluate

In [None]:
# First take the customer ids together with their age groups.
age_groups_customers = customers[['customer_id', 'age_group']].drop_duplicates()

# Then join them onto the transactions (on the columns both tables share) to
# get a transactions table that also carries the buyer's age group.
age_group_transactions = transactions.merge(age_groups_customers)
# With the age_group included, the following steps repeat the bestseller logic
# per age group, under new variable and column names.

In [None]:
week_age_group = ['week', 'age_group']

# Mean price per week / age_group / article instead of per week / article.
mean_price_age_group = age_group_transactions \
    .groupby(week_age_group + ['article_id'])['price'].mean()

# Count the purchases of every article per week AND age group ...
age_group_article_counts = age_group_transactions.groupby(week_age_group)['article_id'].value_counts()

# ... rank them densely within each (week, age_group), rank 1 = most bought ...
age_group_dense_ranks = age_group_article_counts.groupby(week_age_group).rank(method='dense', ascending=False)

# ... and keep the first 12 rows of every (week, age_group).
sales_age_group = (
    age_group_dense_ranks
    .groupby(week_age_group).head(12)
    .rename('age_group_bestseller_rank')
    .astype('int8')
)

In [None]:
# Display the mean price per (week, age_group, article_id).
mean_price_age_group

In [None]:
# Display the bestseller ranks per (week, age_group, article_id).
sales_age_group

In [None]:
# Join rank and mean price for every week / age_group / article combination.
age_group_rank_and_price = pd.merge(
    sales_age_group,
    mean_price_age_group,
    on=['week', 'age_group', 'article_id'],
)
bestsellers_previous_week_age_group = age_group_rank_and_price.reset_index()

# Shift by one week so the rows describe the bestsellers of the week before `week`.
bestsellers_previous_week_age_group['week'] = bestsellers_previous_week_age_group['week'] + 1

In [None]:
# Spot check: display the age-group bestsellers attached to week 96 for the
# 18-24 age group. get_age_group encodes the groups as integers and 18-24 is
# code 1; comparing against the label '18-24' can never match an integer code
# and would always display an empty frame.
bestsellers_previous_week_age_group.pipe(lambda df: df[(df['week']==96) & (df['age_group']==1)])

In [None]:
# One row per (week, customer_id), now including the customer's age group;
# the article-specific columns are dropped.
first_age_group_row_per_week_customer = age_group_transactions.groupby(['week', 'customer_id']).head(1)
unique_age_group_transactions = (
    first_age_group_row_per_week_customer
    .drop(columns=['article_id', 'price'])
    .copy()
)

In [None]:
# Give every (week, customer) row the bestsellers of the customer's own age
# group that are attached to that week.
age_group_candidates_bestsellers = unique_age_group_transactions.merge(
    bestsellers_previous_week_age_group,
    on=['week', 'age_group'],
)

In [None]:
# One row per customer (with age group), relabelled to the test week.
test_set_age_group_transactions = (
    unique_age_group_transactions
    .drop_duplicates('customer_id')
    .reset_index(drop=True)
)
test_set_age_group_transactions['week'] = test_week

In [None]:
# Attach the test-week bestsellers of the matching age group to every
# test-set customer.
age_group_candidates_bestsellers_test_week = test_set_age_group_transactions.merge(
    bestsellers_previous_week_age_group,
    on=['week', 'age_group'],
)

In [None]:
# Stack the training-week and test-week age-group candidates and drop the rank
# column from the combined table.
stacked_age_group_bestsellers = pd.concat(
    [age_group_candidates_bestsellers, age_group_candidates_bestsellers_test_week]
)
age_group_candidates_bestsellers = stacked_age_group_bestsellers.drop(columns='age_group_bestseller_rank')

In [None]:
# Display the combined age-group bestseller candidates.
age_group_candidates_bestsellers

# Check The Recall

In [None]:
# Only evaluate when the test week is covered by the real data.
if not test_week > absolute_max_week:
    pair_columns = ['customer_id', 'article_id']

    # Distinct (customer, article) pairs really bought in the test week.
    bought_in_test_week = test_week_transactions.week == test_week
    purchases = test_week_transactions.loc[bought_in_test_week, pair_columns].drop_duplicates()

    # Distinct (customer, article) pairs proposed by the age-group candidates.
    proposed_for_test_week = age_group_candidates_bestsellers.week == test_week
    candidates = age_group_candidates_bestsellers.loc[proposed_for_test_week, pair_columns].drop_duplicates()

    print(average_recall(purchases, candidates))

# My code
First a bit of encoding. We will change the missing age value to the median. 

In [None]:
# from sklearn.preprocessing import StandardScaler
# scaler = StandardScaler()
# scaler.fit_transform(train['age'])

# Median age over the customers whose age is not the -1 placeholder.
known_age_mask = customers.age != -1
ages_copy = customers[known_age_mask].copy()
median_age = ages_copy['age'].median()
median_age

In [None]:
# Replace the -1 placeholder ages by the median age.
# The column label is required: with only a row mask, .loc assigns the value to
# EVERY column of the matching rows, which would also overwrite customer_id,
# postal_code and the other customer attributes with the median age.
# NOTE(review): age_group was derived from the raw ages earlier on, so the
# customers with age -1 keep age group 0 after this imputation.
customers.loc[customers['age'] == -1, 'age'] = median_age
customers['age'].min()

## Most popular item types

In [None]:
# transactions.loc[:'year'] = transactions['t_dat'].dt.year
# transactions['year'].value_counts()

## Find the most bought color for a user
We want to know the favourite color of the user based on the items bought.

In [None]:
# Attach the article attributes to every transaction.
articles_and_transactions = articles.merge(transactions, on="article_id")

# Count how often every customer bought each colour group ...
colour_counts = articles_and_transactions.groupby('customer_id')['colour_group_code'].value_counts()

# ... rank the colour groups densely within each customer (rank 1 = most bought) ...
colour_ranks = colour_counts.groupby('customer_id').rank(method='dense', ascending=False)

# ... and keep the first 12 rows per customer. The values are ranks; the colour
# group code itself is part of the index.
pop_colors = (
    colour_ranks
    .groupby('customer_id').head(12)
    .rename('favourite_color')
    .astype('int8')
)

In [None]:
# Display the per-customer colour-group ranks.
pop_colors

# Combining transactions and candidates / negative examples

## what are they doing here?

In [None]:
# Label every real transaction as a positive example.
age_group_transactions['purchased'] = 1

In [None]:
# Stack the real purchases (purchased == 1) on top of all candidate tables.
# The candidate tables have no 'purchased' column, so their rows arrive as NaN.
data = pd.concat([age_group_transactions, candidates_last_purchase, candidates_bestsellers, age_group_candidates_bestsellers])

# Candidates count as negatives. Assign the filled column back instead of
# calling fillna(inplace=True) on `data.purchased`: the in-place call works on
# an intermediate Series and is not guaranteed to reach `data` (pandas warns
# about it, and it has no effect under Copy-on-Write).
data['purchased'] = data['purchased'].fillna(0)

The age_group has many NaN values

In [None]:
# Display the number of missing values per column.
data.isna().sum()

## checking for NAN age groups
here the results showed that there were NAN values in the age groups

In [None]:
# Keep only the first row of every (customer, article, week) combination.
# The real purchases were stacked first, so they win over duplicate candidates.
data = data.drop_duplicates(['customer_id', 'article_id', 'week'])

In [None]:
# Display the share of rows that are real purchases.
data.purchased.mean()

### Add bestseller information

## why add the bestseller this way? How to do with age_group

In [None]:
# Look up the bestseller rank of every (week, article) row; rows whose article
# is not a bestseller of that week keep NaN because of the left join.
rank_lookup = bestsellers_previous_week[['week', 'article_id', 'bestseller_rank']]
data = data.merge(rank_lookup, on=['week', 'article_id'], how='left')

In [None]:
# Drop the earliest week. The bestseller table is shifted forward by one week,
# so presumably this week is removed because it has no bestseller rows of its own.
# .copy() makes `data` an independent frame, so the column assignment below
# does not write into a slice of the previous frame.
data = data[data.week != data.week.min()].copy()

# Articles outside the weekly bestsellers get the sentinel rank 999. The filled
# column is assigned back because fillna(inplace=True) on `data.bestseller_rank`
# works on an intermediate Series and is not guaranteed to update `data`.
data['bestseller_rank'] = data['bestseller_rank'].fillna(999)

In [None]:
# Display the combined data with the bestseller rank attached.
data

Does age_group need to be included in the other bestseller ranking?

In [None]:
# Merge in the bestseller rank from the age-group popularity study. Rows
# without a matching (week, age_group, article) keep NaN because of the left join.
age_group_rank_lookup = bestsellers_previous_week_age_group[
    ['week', 'age_group', 'article_id', 'age_group_bestseller_rank']
]
data = data.merge(
    age_group_rank_lookup,
    on=['week', 'age_group', 'article_id'],
    how='left',
)

In [None]:
# The earliest week was already removed after the global bestseller merge.
# Filtering on data.week.min() a second time here would remove the NEXT week
# as well, discarding a week of training data that does have bestseller
# information, so the filter is not repeated.

# Articles outside the age-group bestsellers get the sentinel rank 999. The
# filled column is assigned back because fillna(inplace=True) on an attribute
# access is not guaranteed to update `data`.
data['age_group_bestseller_rank'] = data['age_group_bestseller_rank'].fillna(999)

Merging the age group bestsellers in the same way as normal bestsellers should result in normal outputs. 
Only this would then be improved as the recall was higher.

If you look closely the data is first concatenated and then merged on age groups. But the non age group sources like repurchase do not have these age groups. They are NAN values and aren't merged correctly and later are wrongly used during training.

In [None]:
# Attach the article attributes to every row.
data = data.merge(articles, on='article_id', how='left')

# Attach the customer attributes, matching on customer AND age group.
# NOTE(review): rows that have no age_group value (the candidates that do not
# come from the age-group tables) find no matching customer row here, so their
# customer features stay empty.
data = data.merge(customers, on=['customer_id', 'age_group'], how='left')

In [None]:
# Order the rows by week and customer and renumber the index, so that all rows
# of one (week, customer) basket are adjacent.
data = data.sort_values(['week', 'customer_id']).reset_index(drop=True)

# my code

In [None]:
# Add a "day" column holding the day of the month of the transaction date;
# the original t_dat column is kept.
# The column is created directly from .dt.day. Copying t_dat first and then
# overwriting it through data.loc[:, "day"] = ... tries to write the integers
# into the existing datetime column, which recent pandas versions warn about
# or reject instead of changing the dtype.
data["day"] = data["t_dat"].dt.day
data["day"].value_counts()

In [None]:
# Display the number of rows per transaction date.
data["t_dat"].value_counts()

In [None]:
# Sine component of a cyclical encoding of the day.
# NOTE(review): "day" holds the day of the month (1-31) while the period used
# here is 365, so the encoding does not wrap around - confirm the intended period.
day_angle = 2 * np.pi * data["day"] / 365
data["day_sin"] = np.sin(day_angle)
data["day_sin"]

In [None]:
# Cosine component of a cyclical encoding of the day.
# NOTE(review): "day" holds the day of the month (1-31) while the period used
# here is 365, so the encoding does not wrap around - confirm the intended period.
data["day_cos"] = np.cos(2 * np.pi * data["day"] / 365)
# Display the column that was just computed (day_cos, not day_sin).
data["day_cos"]

In [None]:
# Everything outside the test week is used for training.
is_test_week = data.week == test_week
train = data[~is_test_week]

# The test week rows are scored later; keep one row per
# (customer, article, sales channel).
test = data[is_test_week].drop_duplicates(['customer_id', 'article_id', 'sales_channel_id']).copy()

In [None]:
# Number of rows in every (week, customer_id) basket, passed as the group
# sizes when fitting the ranker.
# NOTE(review): the sizes have to line up with the row order of `train`; this
# relies on `data` having been sorted by week and customer_id before the split.
train_baskets = train.groupby(['week', 'customer_id'])['article_id'].count().values

In [None]:
# Display the columns available for training.
train.columns

### change to the columns
After an attempt at using only the age-group bestseller rank, which lowered the results, I decided to add the age group itself as well. This later proved to be redundant, but at the time it produced a very promising (yet misleading) result in the importance scores.

In [None]:
# Feature columns fed to the ranker, in this exact order.
columns_to_use = [
    # article attributes
    'article_id',
    'product_type_no',
    'graphical_appearance_no',
    'colour_group_code',
    'perceived_colour_value_id',
    'perceived_colour_master_id',
    'department_no',
    'index_code',
    'index_group_no',
    'section_no',
    'garment_group_no',
    # customer attributes
    'FN',
    'Active',
    'club_member_status',
    'fashion_news_frequency',
    'age',
    'postal_code',
    # popularity features
    'bestseller_rank',
    'age_group_bestseller_rank',
    'age_group',
]

In [None]:
%%time

# Feature matrix and target (the purchased flag) for training.
train_X = train[columns_to_use]
train_y = train['purchased']

# Feature matrix of the test week rows that will be scored.
test_X = test[columns_to_use]

# Model training

In [None]:
from lightgbm.sklearn import LGBMRanker

In [None]:
# Configure the LightGBM ranker: LambdaRank objective evaluated with NDCG,
# DART boosting, gain-based feature importances and verbose logging.
# NOTE(review): n_estimators=1 builds a single tree only - presumably a
# quick-run setting; confirm before relying on the resulting scores.
ranker = LGBMRanker(
    objective="lambdarank",
    metric="ndcg",
    boosting_type="dart",
    n_estimators=1,
    importance_type='gain',
    verbose=10
)

In [None]:
%%time

# Fit the ranker on the training rows; `group` passes the basket sizes
# computed in train_baskets.
ranker = ranker.fit(
    train_X,
    train_y,
    group=train_baskets,
)

In [None]:
# Print every feature with its share of the total importance, most important first.
importances = ranker.feature_importances_
total_importance = importances.sum()
for i in importances.argsort()[::-1]:
    print(columns_to_use[i], importances[i] / total_importance)

The outputs of this cell were not saved for some reason, but the two most important features were the age-group bestseller rank and the age group itself. The reason is that the model learned whether or not a candidate came from the age-group candidates, as indicated by the absence of an age group value.

# Calculate predictions

In [None]:
%time

test['preds'] = ranker.predict(test_X)

c_id2predicted_article_ids = test \
    .sort_values(['customer_id', 'preds'], ascending=False) \
    .groupby('customer_id')['article_id'].apply(list).to_dict()

bestsellers_last_week = \
    bestsellers_previous_week[bestsellers_previous_week.week == bestsellers_previous_week.week.max()]['article_id'].tolist()

In [None]:
# Rebuild the per-customer prediction lists, this time keeping them as a
# pandas Series indexed by customer_id.
# NOTE(review): this rebinds c_id2predicted_article_ids from a dict to a
# Series; later .get() lookups still work on a Series but are slower than on
# the dict.
ranked_test_rows = test.sort_values(['customer_id', 'preds'], ascending=False)
c_id2predicted_article_ids = ranked_test_rows.groupby('customer_id')['article_id'].apply(list)

In [None]:
# Display the ranked article lists per customer.
c_id2predicted_article_ids

In [None]:
# Display the fallback bestseller articles.
bestsellers_last_week

# Create submission

In [None]:
# Load the competition's sample submission; its customer_id column is used
# below to build the predictions and its prediction column is overwritten.
sub = pd.read_csv('/kaggle/input/h-and-m-personalized-fashion-recommendations/sample_submission.csv')

In [None]:
%%time
# For every customer of the submission: take their ranked predictions (an empty
# list when there are none), append the fallback bestsellers and keep the
# first 12 articles.
preds = [
    (c_id2predicted_article_ids.get(c_id, []) + bestsellers_last_week)[:12]
    for c_id in customer_hex_id_to_int(sub.customer_id)
]

In [None]:
# Render every prediction list as one space-separated string of article ids,
# each id prefixed with a '0', and store it in the submission.
preds = [' '.join('0' + str(article) for article in articles_of_customer) for articles_of_customer in preds]
sub['prediction'] = preds

In [None]:
# Write the submission, without the index column, to '<sub_name>.csv.gz' in
# the working directory.
sub_name = 'added_2_features_model_submission'
sub.to_csv(f'{sub_name}.csv.gz', index=False)