Radek posted about this [here](https://www.kaggle.com/competitions/h-and-m-personalized-fashion-recommendations/discussion/309220), and linked to a GitHub repo with the code.

I just transferred that code here to Kaggle notebooks, that's all.

In [29]:
import numpy as np

def apk(actual, predicted, k=10):
    """
    Computes the average precision at k.

    This function computes the average prescision at k between two lists of
    items.

    Parameters
    ----------
    actual : list
             A list of elements that are to be predicted (order doesn't matter)
    predicted : list
                A list of predicted elements (order does matter)
    k : int, optional
        The maximum number of predicted elements

    Returns
    -------
    score : double
            The average precision at k over the input lists

    """
    if len(predicted)>k:
        predicted = predicted[:k]

    score = 0.0
    num_hits = 0.0

    for i,p in enumerate(predicted):
        if p in actual and p not in predicted[:i]:
            num_hits += 1.0
            score += num_hits / (i+1.0)

    if not actual:
        return 0.0

    return score / min(len(actual), k)

def mapk(actual, predicted, k=10):
    """
    Computes the mean average precision at k.

    This function computes the mean average prescision at k between two lists
    of lists of items.

    Parameters
    ----------
    actual : list
             A list of lists of elements that are to be predicted 
             (order doesn't matter in the lists)
    predicted : list
                A list of lists of predicted elements
                (order matters in the lists)
    k : int, optional
        The maximum number of predicted elements

    Returns
    -------
    score : double
            The mean average precision at k over the input lists

    """
    return np.mean([apk(a,p,k) for a,p in zip(actual, predicted)])

In [30]:
from sklearn.base import BaseEstimator, TransformerMixin
import numpy as np

# https://www.kaggle.com/c/h-and-m-personalized-fashion-recommendations/discussion/308635
def customer_hex_id_to_int(series):
    return series.str[-16:].apply(hex_id_to_int)

def hex_id_to_int(str):
    return int(str[-16:], 16)

def article_id_str_to_int(series):
    return series.astype('int32')

def article_id_int_to_str(series):
    return '0' + series.astype('str')

class Categorize(BaseEstimator, TransformerMixin):
    def __init__(self, min_examples=0):
        self.min_examples = min_examples
        self.categories = []
        
    def fit(self, X):
        for i in range(X.shape[1]):
            vc = X.iloc[:, i].value_counts()
            self.categories.append(vc[vc > self.min_examples].index.tolist())
        return self

    def transform(self, X):
        data = {X.columns[i]: pd.Categorical(X.iloc[:, i], categories=self.categories[i]).codes for i in range(X.shape[1])}
        return pd.DataFrame(data=data)


def calculate_apk(list_of_preds, list_of_gts):
    # for fast validation this can be changed to operate on dicts of {'cust_id_int': [art_id_int, ...]}
    # using 'data/val_week_purchases_by_cust.pkl'
    apks = []
    for preds, gt in zip(list_of_preds, list_of_gts):
        apks.append(apk(gt, preds, k=12))
    return np.mean(apks)

def eval_sub(sub_csv, skip_cust_with_no_purchases=True):
    sub=pd.read_csv(sub_csv)
    validation_set=pd.read_parquet('data/validation_ground_truth.parquet')

    apks = []

    no_purchases_pattern = []
    for pred, gt in zip(sub.prediction.str.split(), validation_set.prediction.str.split()):
        if skip_cust_with_no_purchases and (gt == no_purchases_pattern): continue
        apks.append(apk(gt, pred, k=12))
    return np.mean(apks)

In [None]:
# Copied from NickWils https://github.com/LienM/ai-project-23-24/blob/main/NickWils/Lecture6/candidate-repurchase.ipynb
def recall(actual, predicted, k=12):
    if len(predicted) > k:
        predicted = predicted[:k]

    correct_predictions = [p for p in predicted if p in actual]

    return len(correct_predictions) / len(actual)

def recall12(actual, predicted, k=12):
    return np.mean([recall(a,p, k) for a,p in zip(actual, predicted)])

In [None]:
def calculateRecall(expected, retrieved):
    """
    R(ecall) = TP/(TP+FN) how much of correct
    :param expected: list of expected values 
    :param retrieved: list of retrieved values
    :return:
    """
    # number of retrieved values that are also in expected (True positive)
    TP = len([ret for ret in retrieved if ret in expected])
    # number of expected values that aren't retrieved (False negative)
    FN = len([ex for ex in expected if ex not in retrieved])
    # recall calculation (by formula)
    recall = TP/(TP+FN)
    return recall

def mean_recall(expected, retrieved):
    """
    Calculate mean recall for all users
    :param expected: list of expected values
    :param retrieved: list of retrieved values
    :return:
    """
    recalls = [calculateRecall(ex, ret) for ex, ret in zip(expected, retrieved)]
    mean_recall = np.mean(recalls)
    return mean_recall

In [31]:
import pandas as pd
import time

transactions = pd.read_parquet('../input/transactions_train.parquet')
customers = pd.read_parquet('../input/customers.parquet')
articles = pd.read_parquet('../input/articles.parquet')

transactions_copy = transactions.copy()
validation = transactions[transactions.week == transactions.week.max()]

test_week = transactions.week.max() + 1
transactions = transactions[transactions.week > transactions.week.max() - 10]

# Generating candidates

### Last purchase candidates

In [32]:
c2weeks = transactions.groupby('customer_id')['week'].unique()
c2weeks2shifted_weeks = {}

for c_id, weeks in c2weeks.items():
    c2weeks2shifted_weeks[c_id] = {}
    for i in range(weeks.shape[0]-1):
        c2weeks2shifted_weeks[c_id][weeks[i]] = weeks[i+1]
    c2weeks2shifted_weeks[c_id][weeks[-1]] = test_week
    
candidates_last_purchase = transactions.copy()
weeks = []
for i, (c_id, week) in enumerate(zip(transactions['customer_id'], transactions['week'])):
    weeks.append(c2weeks2shifted_weeks[c_id][week])
candidates_last_purchase.week=weeks

### Create three features: age, index_group_name and mean purchase price of the customer.

In [33]:
# first create mean price per customer
mean_price_per_c = transactions.groupby('customer_id')['price'].mean().rename('mean_price_per_c').reset_index()
# figure out the customers preferred index group name by looking at past transactions articles' index_group_name
ttransactions = pd.merge(transactions, articles[['article_id', 'index_group_name']], on='article_id', how='left')
# per customer count the number of purchases of each index_group_name
c2index_group_name = ttransactions.groupby('customer_id')['index_group_name'].value_counts().rename('count').reset_index()
c2index_group_name = c2index_group_name.sort_values('count', ascending=False).groupby('customer_id').head(1).rename(columns={'index_group_name': 'highest_count_ign_per_c'})
# merge the mean_price_per_c and c2index_group_name dataframes
t = pd.merge(c2index_group_name[['customer_id', 'highest_count_ign_per_c']], mean_price_per_c, on='customer_id', how='left')

# per article find the average age of the customers who bought it
transactions_with_age = pd.merge(transactions, customers[['customer_id', 'age']], on='customer_id', how='left')
mean_age_per_a = transactions_with_age.groupby('article_id')['age'].mean().rename('mean_age_per_a').astype('int8').reset_index()

# merge to get the mean_price_per_c, highest_count_ign_per_c and mean_age_per_a columns
transactions_with_age_feat = pd.merge(transactions_with_age, t, on='customer_id', how='left')
transactions_with_3feat = pd.merge(transactions_with_age_feat, mean_age_per_a, on='article_id', how='left')
transactions_with_3feat = pd.merge(transactions_with_3feat, articles[['article_id', 'index_group_name']], on='article_id', how='left')

In [34]:
transactions_with_3feat

Unnamed: 0,t_dat,customer_id,article_id,price,sales_channel_id,week,age,highest_count_ign_per_c,mean_price_per_c,mean_age_per_a,index_group_name
0,2020-07-15,272412481300040,778064028,0.008458,1,95,48,0,0.030597,32,0
1,2020-07-15,272412481300040,816592008,0.016932,1,95,48,0,0.030597,37,2
2,2020-07-15,272412481300040,621381021,0.033881,1,95,48,0,0.030597,35,2
3,2020-07-15,272412481300040,817477003,0.025407,1,95,48,0,0.030597,33,0
4,2020-07-15,272412481300040,899088002,0.025407,1,95,48,0,0.030597,35,2
...,...,...,...,...,...,...,...,...,...,...,...
2762867,2020-09-22,18439937050817258297,891591003,0.084729,2,104,49,0,0.040379,41,0
2762868,2020-09-22,18439937050817258297,869706005,0.084729,2,104,49,0,0.040379,39,0
2762869,2020-09-22,18440902715633436014,918894002,0.016932,1,104,18,0,0.028232,30,0
2762870,2020-09-22,18440902715633436014,761269001,0.016932,1,104,18,0,0.028232,33,0


### Generate personalized candidates for each customer based on three features: age, index_group_name and mean purchase price of the customer.

In [35]:
# merge t with customers columns customer_id, age on customer_id for later use
t_merged = pd.merge(t, customers[['customer_id', 'age']], on='customer_id', how='left')
# Create a DataFrame with unique articles
unique_articles = transactions_with_3feat[['t_dat', 'sales_channel_id', 'article_id', 'price', 'index_group_name', 'week']].drop_duplicates()
# we only keep latest purchase of each article
unique_articles = unique_articles.sort_values('t_dat', ascending=False).drop_duplicates('article_id')

unique_customers = mean_price_per_c['customer_id'].unique()[:1000]
chunk_size = 1000  # Define the number of customers to process at once
chunks = [unique_customers[i:i + chunk_size] for i in range(0, len(unique_customers), chunk_size)]

topX_price=50
topX_age=25

def get_candidates(customer_chunks, topX_price=50, topX_age=25):
    result_candidates_3feat = pd.DataFrame()  # DataFrame to store final candidates
    result_candidates_3feat_chunks = []
    
    for idx, customer_chunk in enumerate(customer_chunks):
        start = time.time()
        # Cartesian product of unique articles and customers, since we want to choose candidates out of all unique articles for each customer
        candidate_articles = pd.merge(
            unique_articles,
            pd.DataFrame({'customer_id': customer_chunk}),
            how='cross'
        )
        # get the necessary columns to filter out the candidates
        candidate_articles = pd.merge(candidate_articles, t_merged, on='customer_id', how='left')
        candidate_articles = pd.merge(candidate_articles, mean_age_per_a, on='article_id', how='left')
        
        # Select all candidates per customer_id where highest_count_ign_per_c is equal to index_group_name
        candidate_articles = candidate_articles[candidate_articles['highest_count_ign_per_c']==candidate_articles['index_group_name']]
        
        # Calculate price difference for each combination
        candidate_articles['price_difference'] = abs(candidate_articles['mean_price_per_c'] - candidate_articles['price'])
        # Rank articles within each customer group based on price difference
        candidate_articles['price_rank'] = (
            candidate_articles
            .groupby(['week', 'customer_id'])['price_difference']
            .rank(ascending=True, method='min')
        )
        # Select the top 50 candidates for each customer
        top_candidates = (
            candidate_articles
            .sort_values(by=['customer_id', 'week', 'price_rank'])
            .groupby(['week', 'customer_id'])
            .head(topX_price)
        )
        
        # Calculate age difference for each combination
        top_candidates['age_difference'] = abs(top_candidates['age'] - top_candidates['mean_age_per_a'])
        # Rank articles within each customer group based on age difference
        top_candidates['age_rank'] = (
            top_candidates
            .groupby(['week', 'customer_id'])['age_difference']
            .rank(ascending=True, method='min')
        )
        # Select the top 12 candidates for each customer based on age difference
        top_candidates = (
            top_candidates
            .sort_values(by=['customer_id', 'week', 'age_rank'])
            .groupby(['week', 'customer_id'])
            .head(topX_age)
        )
        
        result_candidates_3feat_chunks.append(top_candidates) # Append current chunk's candidates to result
        
        print(f'Chunk {idx} processed in {time.time() - start:.2f} seconds')
    
    # Concatenate all chunks into the final result    
    result_candidates_3feat = pd.concat(result_candidates_3feat_chunks, ignore_index=True)

    top_candidates_3feat = result_candidates_3feat.drop(columns=['price_difference', 'age_difference'])
    return top_candidates_3feat

In [36]:
top_candidates_3feat = get_candidates(chunks, topX_price, topX_age)

Chunk 0 processed in 25.52 seconds


In [37]:
top_candidates_3feat

Unnamed: 0,t_dat,sales_channel_id,article_id,price,index_group_name,week,customer_id,highest_count_ign_per_c,mean_price_per_c,age,mean_age_per_a,price_rank,age_rank
0,2020-07-16,1,754852002,0.050831,0,95,28847241659200,0,0.046085,21,21,21.0,1.0
1,2020-07-21,1,747764002,0.047441,0,95,28847241659200,0,0.046085,21,22,2.0,2.0
2,2020-07-16,1,685944002,0.042356,0,95,28847241659200,0,0.046085,21,20,7.0,2.0
3,2020-07-19,1,746344001,0.033881,0,95,28847241659200,0,0.046085,21,20,36.0,2.0
4,2020-07-15,1,796042001,0.033881,0,95,28847241659200,0,0.046085,21,22,36.0,2.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...
249995,2020-09-22,2,761406005,0.022017,2,104,42984229297455520,2,0.021452,18,31,40.0,15.0
249996,2020-09-22,1,911034002,0.021678,2,104,42984229297455520,2,0.021452,18,32,12.0,22.0
249997,2020-09-22,1,670698020,0.021169,2,104,42984229297455520,2,0.021452,18,32,13.0,22.0
249998,2020-09-22,1,884405005,0.021169,2,104,42984229297455520,2,0.021452,18,32,13.0,22.0


In [38]:
top_candidates_3feat[top_candidates_3feat['customer_id'] == 272412481300040]

Unnamed: 0,t_dat,sales_channel_id,article_id,price,index_group_name,week,customer_id,highest_count_ign_per_c,mean_price_per_c,age,mean_age_per_a,price_rank,age_rank
1250,2020-07-20,1,578317003,0.027102,0,95,272412481300040,0,0.030597,48,48,26.0,1.0
1251,2020-07-18,1,690803001,0.033881,0,95,272412481300040,0,0.030597,48,47,11.0,2.0
1252,2020-07-17,1,559626001,0.025424,0,95,272412481300040,0,0.030597,48,47,45.0,2.0
1253,2020-07-21,2,594834018,0.030492,0,95,272412481300040,0,0.030597,48,46,1.0,4.0
1254,2020-07-15,2,757333004,0.030492,0,95,272412481300040,0,0.030597,48,50,1.0,4.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...
1495,2020-09-22,2,557994003,0.030492,0,104,272412481300040,0,0.030597,48,34,4.0,18.0
1496,2020-09-22,1,843380004,0.030492,0,104,272412481300040,0,0.030597,48,34,4.0,18.0
1497,2020-09-22,1,883689001,0.030492,0,104,272412481300040,0,0.030597,48,34,4.0,18.0
1498,2020-09-22,2,874264001,0.030492,0,104,272412481300040,0,0.030597,48,34,4.0,18.0


In [39]:
top_candidates_3feat_prev_week = top_candidates_3feat.copy()
del top_candidates_3feat
top_candidates_3feat_prev_week.week += 1

In [40]:
top_candidates_3feat_prev_week

Unnamed: 0,t_dat,sales_channel_id,article_id,price,index_group_name,week,customer_id,highest_count_ign_per_c,mean_price_per_c,age,mean_age_per_a,price_rank,age_rank
0,2020-07-16,1,754852002,0.050831,0,96,28847241659200,0,0.046085,21,21,21.0,1.0
1,2020-07-21,1,747764002,0.047441,0,96,28847241659200,0,0.046085,21,22,2.0,2.0
2,2020-07-16,1,685944002,0.042356,0,96,28847241659200,0,0.046085,21,20,7.0,2.0
3,2020-07-19,1,746344001,0.033881,0,96,28847241659200,0,0.046085,21,20,36.0,2.0
4,2020-07-15,1,796042001,0.033881,0,96,28847241659200,0,0.046085,21,22,36.0,2.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...
249995,2020-09-22,2,761406005,0.022017,2,105,42984229297455520,2,0.021452,18,31,40.0,15.0
249996,2020-09-22,1,911034002,0.021678,2,105,42984229297455520,2,0.021452,18,32,12.0,22.0
249997,2020-09-22,1,670698020,0.021169,2,105,42984229297455520,2,0.021452,18,32,13.0,22.0
249998,2020-09-22,1,884405005,0.021169,2,105,42984229297455520,2,0.021452,18,32,13.0,22.0


In [41]:
unique_transactions_ = transactions.groupby(['week', 'customer_id']).head(1).drop(columns=['article_id', 'price']).copy()
candidates_ = top_candidates_3feat_prev_week.copy()
candidates_.drop(columns=['price_rank', 'age_rank', 'highest_count_ign_per_c', 'mean_price_per_c', 'index_group_name', 'mean_age_per_a', 'age'], inplace=True)

In [43]:
candidates_

Unnamed: 0,t_dat,sales_channel_id,article_id,price,week,customer_id
0,2020-07-16,1,754852002,0.050831,96,28847241659200
1,2020-07-21,1,747764002,0.047441,96,28847241659200
2,2020-07-16,1,685944002,0.042356,96,28847241659200
3,2020-07-19,1,746344001,0.033881,96,28847241659200
4,2020-07-15,1,796042001,0.033881,96,28847241659200
...,...,...,...,...,...,...
249995,2020-09-22,2,761406005,0.022017,105,42984229297455520
249996,2020-09-22,1,911034002,0.021678,105,42984229297455520
249997,2020-09-22,1,670698020,0.021169,105,42984229297455520
249998,2020-09-22,1,884405005,0.021169,105,42984229297455520


In [44]:
transactions['purchased'] = 1
data = pd.concat([transactions, candidates_last_purchase, candidates_])
data.purchased.fillna(0, inplace=True)
data.drop_duplicates(['customer_id', 'article_id', 'week'], inplace=True)

In [45]:
data

Unnamed: 0,t_dat,customer_id,article_id,price,sales_channel_id,week,purchased
29030503,2020-07-15,272412481300040,778064028,0.008458,1,95,1.0
29030504,2020-07-15,272412481300040,816592008,0.016932,1,95,1.0
29030505,2020-07-15,272412481300040,621381021,0.033881,1,95,1.0
29030506,2020-07-15,272412481300040,817477003,0.025407,1,95,1.0
29030507,2020-07-15,272412481300040,899088002,0.025407,1,95,1.0
...,...,...,...,...,...,...,...
249995,2020-09-22,42984229297455520,761406005,0.022017,2,105,0.0
249996,2020-09-22,42984229297455520,911034002,0.021678,1,105,0.0
249997,2020-09-22,42984229297455520,670698020,0.021169,1,105,0.0
249998,2020-09-22,42984229297455520,884405005,0.021169,1,105,0.0


In [46]:
data = pd.merge(
    data,
    top_candidates_3feat_prev_week[['week', 'article_id', 'price_rank', 'age_rank']],
    on=['week', 'article_id'],
    how='left'
)
data = data[data.week != data.week.min()]
data.price_rank.fillna(999, inplace=True)
data.age_rank.fillna(999, inplace=True)

In [47]:
data

Unnamed: 0,t_dat,customer_id,article_id,price,sales_channel_id,week,purchased,price_rank,age_rank
261987,2020-07-22,200292573348128,880777001,0.025407,2,96,1.0,999.0,999.0
261988,2020-07-22,200292573348128,784332002,0.025407,2,96,1.0,999.0,999.0
261989,2020-07-22,200292573348128,827968001,0.016932,2,96,1.0,999.0,999.0
261990,2020-07-22,200292573348128,599580086,0.011847,2,96,1.0,999.0,999.0
261991,2020-07-22,248294615847351,720504008,0.031458,1,96,1.0,999.0,999.0
...,...,...,...,...,...,...,...,...,...
16993901,2020-09-22,42984229297455520,884405005,0.021169,1,105,0.0,13.0,22.0
16993902,2020-09-20,42984229297455520,850622002,0.021169,1,105,0.0,15.0,21.0
16993903,2020-09-20,42984229297455520,850622002,0.021169,1,105,0.0,15.0,21.0
16993904,2020-09-20,42984229297455520,850622002,0.021169,1,105,0.0,1.0,23.0


In [48]:
data = pd.merge(data, articles, on='article_id', how='left')
data = pd.merge(data, customers, on='customer_id', how='left')
data.sort_values(['week', 'customer_id'], inplace=True)
data.reset_index(drop=True, inplace=True)

In [49]:
train = data[data.week != test_week]
test = data[data.week==test_week].drop_duplicates(['customer_id', 'article_id', 'sales_channel_id']).copy()

In [50]:
train

Unnamed: 0,t_dat,customer_id,article_id,price,sales_channel_id,week,purchased,price_rank,age_rank,product_code,...,section_name,garment_group_no,garment_group_name,detail_desc,FN,Active,club_member_status,fashion_news_frequency,age,postal_code
0,2020-07-26,28847241659200,887770001,0.016932,1,96,1.0,999.0,999.0,887770,...,10,1010,6,3692,1,1,0,1,21,57896
1,2020-07-18,28847241659200,762846001,0.025407,1,96,0.0,999.0,999.0,762846,...,7,1010,6,492,1,1,0,1,21,57896
2,2020-07-18,28847241659200,829308001,0.033881,1,96,0.0,999.0,999.0,829308,...,21,1005,0,9082,1,1,0,1,21,57896
3,2020-07-16,28847241659200,754852002,0.050831,1,96,0.0,21.0,1.0,754852,...,45,1001,10,17015,1,1,0,1,21,57896
4,2020-07-16,28847241659200,754852002,0.050831,1,96,0.0,5.0,1.0,754852,...,45,1001,10,17015,1,1,0,1,21,57896
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
12340983,2020-09-20,18445641720816255142,923037003,0.033881,2,104,1.0,999.0,999.0,923037,...,31,1017,4,10043,1,1,0,1,31,147613
12340984,2020-09-21,18446737527580148316,547780001,0.023712,2,104,1.0,999.0,999.0,547780,...,27,1002,2,271,1,1,0,1,60,96323
12340985,2020-09-21,18446737527580148316,763988001,0.023712,2,104,1.0,999.0,999.0,763988,...,16,1002,2,1107,1,1,0,1,60,96323
12340986,2020-09-21,18446737527580148316,763988003,0.023712,2,104,1.0,999.0,999.0,763988,...,16,1002,2,1107,1,1,0,1,60,96323


In [51]:
test

Unnamed: 0,t_dat,customer_id,article_id,price,sales_channel_id,week,purchased,price_rank,age_rank,product_code,...,section_name,garment_group_no,garment_group_name,detail_desc,FN,Active,club_member_status,fashion_news_frequency,age,postal_code
12340988,2020-09-03,28847241659200,925246001,0.128797,2,105,0.0,999.0,999.0,925246,...,40,1007,9,27855,1,1,0,1,21,57896
12340989,2020-09-22,28847241659200,779068002,0.045746,2,105,0.0,7.0,1.0,779068,...,10,1003,3,15430,1,1,0,1,21,57896
12340996,2020-09-22,28847241659200,754323004,0.045746,2,105,0.0,7.0,2.0,754323,...,0,1003,3,4874,1,1,0,1,21,57896
12341003,2020-09-22,28847241659200,721481003,0.045746,2,105,0.0,7.0,3.0,721481,...,0,1010,6,11900,1,1,0,1,21,57896
12341010,2020-09-22,28847241659200,904225001,0.045746,1,105,0.0,7.0,3.0,904225,...,20,1020,7,26543,1,1,0,1,21,57896
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
16731910,2020-09-06,18446705133201055310,875784002,0.050831,2,105,0.0,999.0,999.0,875784,...,15,1003,3,11763,1,1,0,1,60,177867
16731911,2020-09-21,18446737527580148316,547780001,0.023712,2,105,0.0,13.0,15.0,547780,...,27,1002,2,271,1,1,0,1,60,96323
16731914,2020-09-21,18446737527580148316,763988001,0.023712,2,105,0.0,1.0,16.0,763988,...,16,1002,2,1107,1,1,0,1,60,96323
16731915,2020-09-21,18446737527580148316,763988003,0.023712,2,105,0.0,999.0,999.0,763988,...,16,1002,2,1107,1,1,0,1,60,96323


In [52]:
train_baskets = train.groupby(['week', 'customer_id'])['article_id'].count().values

In [53]:
train_baskets

array([2172, 2121, 3607, ...,    2,    7,    4], dtype=int64)

In [54]:
extra_columns = ['price_rank', 'age_rank']
columns_to_use = ['article_id', 'product_type_no', 'graphical_appearance_no', 'colour_group_code', 'perceived_colour_value_id',
'perceived_colour_master_id', 'department_no', 'index_code',
'index_group_no', 'section_no', 'garment_group_no', 'FN', 'Active',
'club_member_status', 'fashion_news_frequency', 'age', 'postal_code']
columns_to_use.extend(extra_columns)

train_X = train[columns_to_use]
train_y = train['purchased']

test_X = test[columns_to_use]

In [55]:
test_X

Unnamed: 0,article_id,product_type_no,graphical_appearance_no,colour_group_code,perceived_colour_value_id,perceived_colour_master_id,department_no,index_code,index_group_no,section_no,garment_group_no,FN,Active,club_member_status,fashion_news_frequency,age,postal_code,price_rank,age_rank
12340988,925246001,262,1010016,9,4,5,1201,0,1,19,1007,1,1,0,1,21,57896,999.0,999.0
12340989,779068002,252,1010016,13,2,11,1610,0,1,6,1003,1,1,0,1,21,57896,7.0,1.0
12340996,754323004,252,1010010,14,4,11,1626,0,1,15,1003,1,1,0,1,21,57896,7.0,2.0
12341003,721481003,258,1010017,73,2,6,1522,0,1,15,1010,1,1,0,1,21,57896,7.0,3.0
12341010,904225001,87,1010016,9,4,5,3529,6,1,64,1020,1,1,0,1,21,57896,7.0,3.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
16731910,875784002,252,1010016,9,4,12,1949,0,1,2,1003,1,1,0,1,60,177867,999.0,999.0
16731911,547780001,272,1010016,9,4,5,1643,1,2,51,1002,1,1,0,1,60,96323,13.0,15.0
16731914,763988001,272,1010016,9,4,5,5833,2,3,26,1002,1,1,0,1,60,96323,1.0,16.0
16731915,763988003,272,1010010,9,4,12,5833,2,3,26,1002,1,1,0,1,60,96323,999.0,999.0


In [56]:
from lightgbm.sklearn import LGBMRanker
ranker = LGBMRanker(
    objective="lambdarank",
    metric="ndcg",
    boosting_type="dart",
    n_estimators=1,
    importance_type='gain',
    verbose=10
)
ranker = ranker.fit(
    train_X,
    train_y,
    group=train_baskets,
)

for i in ranker.feature_importances_.argsort()[::-1]:
    print(columns_to_use[i], ranker.feature_importances_[i]/ranker.feature_importances_.sum())
    
test['preds'] = ranker.predict(test_X)

c_id2predicted_article_ids = test \
    .sort_values(['customer_id', 'preds'], ascending=False) \
    .groupby('customer_id')['article_id'].apply(list).to_dict()

best_last_week = \
    top_candidates_3feat_prev_week[top_candidates_3feat_prev_week.week == top_candidates_3feat_prev_week.week.max()]['article_id'].tolist()

[LightGBM] [Debug] Dataset::GetMultiBinFromSparseFeatures: sparse rate 0.889245
[LightGBM] [Debug] Dataset::GetMultiBinFromAllFeatures: sparse rate 0.155714
[LightGBM] [Debug] init for col-wise cost 0.070614 seconds, init for row-wise cost 0.109762 seconds
[LightGBM] [Info] Auto-choosing row-wise multi-threading, the overhead of testing was 0.121514 seconds.
You can set `force_row_wise=true` to remove the overhead.
And if memory is not enough, you can set `force_col_wise=true`.
[LightGBM] [Debug] Using Dense Multi-Val Bin
[LightGBM] [Info] Total Bins 1158
[LightGBM] [Info] Number of data points in the train set: 12340988, number of used features: 19
[LightGBM] [Debug] Trained a tree with leaves = 31 and depth = 13


In [59]:
sub = pd.read_csv('../input/sample_submission.csv')
preds = []
for c_id in customer_hex_id_to_int(sub.customer_id):
    pred = c_id2predicted_article_ids.get(c_id, [])
    pred = pred + best_last_week
    preds.append(pred[:12])

In [65]:
validation_corresp_customers = validation[validation['customer_id'].isin(unique_customers)]

# Get the corresponding candidates you generated for those 6000 customers in the last week
candidates_last_week = top_candidates_3feat_prev_week[
    (top_candidates_3feat_prev_week['week'] == validation_corresp_customers['week'].max()) &
    (top_candidates_3feat_prev_week['customer_id'].isin(validation_corresp_customers['customer_id'].unique()))
]

# # Filter validation to include only the corresponding 6000 customers
# validation = validation[validation['customer_id'].isin(unique_customers)]

# Merge validation with candidates_last_week to include only common customers
# validation = validation[validation['customer_id'].isin(candidates_last_week['customer_id'].unique())]

validation = validation_corresp_customers.sort_values(['customer_id', 'article_id'])
candidates_last_week = candidates_last_week.sort_values(['customer_id', 'article_id'])

if len(validation['customer_id'].unique())* == candidates_last_week.shape[0]:
    print("Validation and candidates_last_week have the same number of customers")
else:
    print("Validation and candidates_last_week do not have the same number of customers")

# Group purchases and candidates by customer_id
actual_purchases_last_week = validation_corresp_customers.groupby('customer_id')['article_id'].apply(list)
predicted_candidates_last_week = candidates_last_week.groupby('customer_id')['article_id'].apply(list)

In [68]:
candidates_last_week

Unnamed: 0,t_dat,sales_channel_id,article_id,price,index_group_name,week,customer_id,highest_count_ign_per_c,mean_price_per_c,age,mean_age_per_a,price_rank,age_rank
8465,2020-09-10,2,559607002,0.023712,0,104,1402273113592184,0,0.023712,20,31,1.0,15.0
8468,2020-09-15,2,567731001,0.023712,0,104,1402273113592184,0,0.023712,20,32,1.0,18.0
8460,2020-09-12,2,687034022,0.022864,0,104,1402273113592184,0,0.023712,20,29,45.0,10.0
8451,2020-09-10,1,699303001,0.024390,0,104,1402273113592184,0,0.023712,20,24,41.0,2.0
8457,2020-09-15,2,699423003,0.023712,0,104,1402273113592184,0,0.023712,20,28,1.0,7.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...
249471,2020-09-13,1,855262002,0.038966,0,104,42910407533866495,0,0.039249,49,37,3.0,22.0
249461,2020-09-11,2,856113002,0.040661,0,104,42910407533866495,0,0.039249,49,41,18.0,12.0
249468,2020-09-13,2,859399001,0.040661,0,104,42910407533866495,0,0.039249,49,38,18.0,17.0
249459,2020-09-15,1,860045002,0.040153,0,104,42910407533866495,0,0.039249,49,42,14.0,10.0


In [69]:
actual_purchases_last_week

customer_id
1402273113592184                                [611415001, 885951001]
1827730561464445                                [918603001, 921380001]
1951136007097426                                           [778745010]
2639747769247776                                           [819547001]
3177658828628418                                [866731001, 869331006]
                                           ...                        
42518724531228943    [896169002, 909370001, 918835001, 918836001, 9...
42590836772269170                               [829618005, 900388001]
42607389388893103         [855816004, 875350002, 899247005, 921090003]
42610653102618094                                          [873884006]
42910407533866495                    [771693001, 786336005, 912100001]
Name: article_id, Length: 159, dtype: object

In [70]:
predicted_candidates_last_week

customer_id
1402273113592184     [559607002, 567731001, 687034022, 699303001, 6...
1827730561464445     [568838009, 590919006, 610665001, 635049002, 7...
1951136007097426     [456163069, 752509001, 756209003, 781400001, 7...
2639747769247776     [628466001, 632159002, 652367002, 665481012, 7...
3177658828628418     [533404046, 609170005, 653706039, 669993016, 6...
                                           ...                        
42518724531228943    [522461016, 599719019, 599719038, 599719055, 5...
42590836772269170    [521805011, 567535019, 585130004, 615192004, 6...
42607389388893103    [558524013, 578113002, 640664005, 640664010, 6...
42610653102618094    [608007017, 614423001, 630339001, 698387016, 7...
42910407533866495    [533592001, 646846002, 659330001, 685468001, 7...
Name: article_id, Length: 159, dtype: object

In [71]:
# Calculate recall between actual purchases and predicted candidates for the last week
recall_last_week = mean_recall(actual_purchases_last_week, predicted_candidates_last_week)

print("Recall Score on Candidates for Last Week:", recall_last_week)

TP: 0, FN: 2, recall: 0.0
TP: 0, FN: 2, recall: 0.0
TP: 0, FN: 1, recall: 0.0
TP: 0, FN: 1, recall: 0.0
TP: 0, FN: 2, recall: 0.0
TP: 0, FN: 13, recall: 0.0
TP: 0, FN: 1, recall: 0.0
TP: 0, FN: 9, recall: 0.0
TP: 0, FN: 1, recall: 0.0
TP: 0, FN: 10, recall: 0.0
TP: 0, FN: 2, recall: 0.0
TP: 0, FN: 11, recall: 0.0
TP: 0, FN: 3, recall: 0.0
TP: 0, FN: 4, recall: 0.0
TP: 0, FN: 5, recall: 0.0
TP: 0, FN: 1, recall: 0.0
TP: 0, FN: 3, recall: 0.0
TP: 0, FN: 3, recall: 0.0
TP: 0, FN: 2, recall: 0.0
TP: 0, FN: 2, recall: 0.0
TP: 0, FN: 3, recall: 0.0
TP: 0, FN: 3, recall: 0.0
TP: 0, FN: 2, recall: 0.0
TP: 0, FN: 2, recall: 0.0
TP: 0, FN: 6, recall: 0.0
TP: 0, FN: 1, recall: 0.0
TP: 0, FN: 1, recall: 0.0
TP: 0, FN: 2, recall: 0.0
TP: 0, FN: 2, recall: 0.0
TP: 0, FN: 4, recall: 0.0
TP: 0, FN: 2, recall: 0.0
TP: 0, FN: 15, recall: 0.0
TP: 0, FN: 1, recall: 0.0
TP: 0, FN: 2, recall: 0.0
TP: 0, FN: 1, recall: 0.0
TP: 0, FN: 5, recall: 0.0
TP: 0, FN: 3, recall: 0.0
TP: 0, FN: 12, recall: 0.0
TP: 0, 

In [72]:
from tqdm import tqdm
# Creating validation set for metrics use case
positive_items_val = validation.groupby(['customer_id'])['article_id'].apply(list)
val_users = positive_items_val.keys()
val_items = []
for i, user in tqdm(enumerate(val_users)):
    val_items.append(positive_items_val[user])
    
print("Total users in validation:", len(val_users))
print("mAP12 Score on Validation set:", mapk(val_items, preds))
print("recall Score on Validation set:", recall12(val_items, preds))

Total users in validation: 68984
mAP12 Score on Validation set: 0.00020435931735109805
recall Score on Validation set: 0.0008370739349937931


In [73]:
preds = [' '.join(['0' + str(p) for p in ps]) for ps in preds]
sub.prediction = preds
sub.to_csv(f'basic_model_submission.csv.gz', index=False)

Radek

In [74]:
transactions = transactions_copy.copy()
del transactions_copy

In [75]:
### Bestsellers candidates
mean_price = transactions.groupby(['week', 'article_id'])['price'].mean()
sales = transactions \
    .groupby('week')['article_id'].value_counts() \
    .groupby('week').rank(method='dense', ascending=False) \
    .groupby('week').head(12).rename('bestseller_rank').astype('int8')
bestsellers_previous_week = pd.merge(sales, mean_price, on=['week', 'article_id']).reset_index()
bestsellers_previous_week.week += 1

In [76]:
# bestsellers_previous_week.pipe(lambda df: df[df['week']==96])
bestsellers_previous_week

Unnamed: 0,week,article_id,bestseller_rank,price
0,1,685687003,1,0.016912
1,1,685687001,2,0.016904
2,1,685687004,3,0.016857
3,1,685687002,4,0.016922
4,1,562245001,5,0.023881
...,...,...,...,...
1255,105,915529003,8,0.033439
1256,105,915529005,9,0.033417
1257,105,448509014,10,0.041630
1258,105,762846027,11,0.025005


In [77]:
unique_transactions = transactions.groupby(['week', 'customer_id']).head(1).drop(columns=['article_id', 'price']).copy()
candidates_bestsellers = pd.merge(unique_transactions, bestsellers_previous_week, on='week',)
test_set_transactions = unique_transactions.drop_duplicates('customer_id').reset_index(drop=True)
test_set_transactions.week = test_week

In [78]:
# candidates_bestsellers

In [79]:
# test_set_transactions
# unique_transactions

In [80]:
candidates_bestsellers_test_week = pd.merge(test_set_transactions, bestsellers_previous_week, on='week')
candidates_bestsellers = pd.concat([candidates_bestsellers, candidates_bestsellers_test_week])
candidates_bestsellers.drop(columns='bestseller_rank', inplace=True)

In [81]:
candidates_bestsellers

Unnamed: 0,t_dat,customer_id,sales_channel_id,week,article_id,price
0,2018-09-26,857913002275398,1,1,685687003,0.016912
1,2018-09-26,857913002275398,1,1,685687001,0.016904
2,2018-09-26,857913002275398,1,1,685687004,0.016857
3,2018-09-26,857913002275398,1,1,685687002,0.016922
4,2018-09-26,857913002275398,1,1,562245001,0.023881
...,...,...,...,...,...,...
16347367,2020-09-22,18417769707947924979,2,105,915529003,0.033439
16347368,2020-09-22,18417769707947924979,2,105,915529005,0.033417
16347369,2020-09-22,18417769707947924979,2,105,448509014,0.041630
16347370,2020-09-22,18417769707947924979,2,105,762846027,0.025005


# Combining transactions and candidates / negative examples

In [82]:
transactions['purchased'] = 1
data = pd.concat([transactions, candidates_last_purchase, candidates_bestsellers])
data.purchased.fillna(0, inplace=True)
data.drop_duplicates(['customer_id', 'article_id', 'week'], inplace=True)

In [83]:
data

Unnamed: 0,t_dat,customer_id,article_id,price,sales_channel_id,week,purchased
25784,2018-09-20,1728846800780188,519773001,0.028458,2,0,1.0
25785,2018-09-20,1728846800780188,578472001,0.032525,2,0,1.0
5389,2018-09-20,2076973761519164,661795002,0.167797,2,0,1.0
5390,2018-09-20,2076973761519164,684080003,0.101678,2,0,1.0
47429,2018-09-20,2918879973994241,662980001,0.033881,1,0,1.0
...,...,...,...,...,...,...,...
16347367,2020-09-22,18417769707947924979,915529003,0.033439,2,105,0.0
16347368,2020-09-22,18417769707947924979,915529005,0.033417,2,105,0.0
16347369,2020-09-22,18417769707947924979,448509014,0.041630,2,105,0.0
16347370,2020-09-22,18417769707947924979,762846027,0.025005,2,105,0.0


### Add bestseller information

In [84]:
data = pd.merge(
    data,
    bestsellers_previous_week[['week', 'article_id', 'bestseller_rank']],
    on=['week', 'article_id'],
    how='left'
)
data = data[data.week != data.week.min()]
data.bestseller_rank.fillna(999, inplace=True)
data = pd.merge(data, articles, on='article_id', how='left')
data = pd.merge(data, customers, on='customer_id', how='left')
data.sort_values(['week', 'customer_id'], inplace=True)
data.reset_index(drop=True, inplace=True)

In [85]:
train = data[data.week != test_week]
test = data[data.week==test_week].drop_duplicates(['customer_id', 'article_id', 'sales_channel_id']).copy()

In [86]:
test

Unnamed: 0,t_dat,customer_id,article_id,price,sales_channel_id,week,purchased,bestseller_rank,product_code,prod_name,...,section_name,garment_group_no,garment_group_name,detail_desc,FN,Active,club_member_status,fashion_news_frequency,age,postal_code
124800519,2020-04-10,4245900472157,924243001,0.041535,2,105,0.0,1.0,924243,19190,...,0,1003,3,13007,-1,-1,0,0,21,64922
124800520,2020-04-10,4245900472157,924243002,0.041877,2,105,0.0,2.0,924243,19190,...,0,1003,3,13007,-1,-1,0,0,21,64922
124800521,2020-04-10,4245900472157,918522001,0.041435,2,105,0.0,3.0,918522,26372,...,0,1003,3,28633,-1,-1,0,0,21,64922
124800522,2020-04-10,4245900472157,923758001,0.033462,2,105,0.0,4.0,923758,19359,...,0,1010,6,27869,-1,-1,0,0,21,64922
124800523,2020-04-10,4245900472157,866731001,0.025015,2,105,0.0,5.0,866731,3609,...,21,1005,0,3130,-1,-1,0,0,21,64922
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
142509656,2018-09-29,18446737527580148316,915529003,0.033439,1,105,0.0,8.0,915529,7046,...,0,1003,3,10909,1,1,0,1,60,96323
142509657,2018-09-29,18446737527580148316,915529005,0.033417,1,105,0.0,9.0,915529,7046,...,0,1003,3,10909,1,1,0,1,60,96323
142509658,2018-09-29,18446737527580148316,448509014,0.041630,1,105,0.0,10.0,448509,259,...,1,1009,5,255,1,1,0,1,60,96323
142509659,2018-09-29,18446737527580148316,762846027,0.025005,1,105,0.0,11.0,762846,472,...,7,1010,6,492,1,1,0,1,60,96323


In [87]:
train_baskets = train.groupby(['week', 'customer_id'])['article_id'].count().values

In [88]:
train_baskets

array([18, 13, 17, ..., 14, 19, 16], dtype=int64)

In [89]:
extra_columns = []
columns_to_use = ['article_id', 'product_type_no', 'graphical_appearance_no', 'colour_group_code', 'perceived_colour_value_id',
'perceived_colour_master_id', 'department_no', 'index_code',
'index_group_no', 'section_no', 'garment_group_no', 'FN', 'Active',
'club_member_status', 'fashion_news_frequency', 'age', 'postal_code', 'bestseller_rank']
columns_to_use.extend(extra_columns)

In [90]:
train_X = train[columns_to_use]
train_y = train['purchased']

test_X = test[columns_to_use]

In [91]:
test_X

Unnamed: 0,article_id,product_type_no,graphical_appearance_no,colour_group_code,perceived_colour_value_id,perceived_colour_master_id,department_no,index_code,index_group_no,section_no,garment_group_no,FN,Active,club_member_status,fashion_news_frequency,age,postal_code,bestseller_rank
124800519,924243001,252,1010016,13,1,1,1626,0,1,15,1003,-1,-1,0,0,21,64922,1.0
124800520,924243002,252,1010016,9,4,5,1626,0,1,15,1003,-1,-1,0,0,21,64922,2.0
124800521,918522001,252,1010016,11,3,9,1626,0,1,15,1003,-1,-1,0,0,21,64922,3.0
124800522,923758001,-1,1010016,10,3,9,1522,0,1,15,1010,-1,-1,0,0,21,64922,4.0
124800523,866731001,273,1010016,9,4,5,8310,9,26,5,1005,-1,-1,0,0,21,64922,5.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
142509656,915529003,252,1010016,9,4,5,1626,0,1,15,1003,1,1,0,1,60,96323,8.0
142509657,915529005,252,1010016,13,1,1,1626,0,1,15,1003,1,1,0,1,60,96323,9.0
142509658,448509014,272,1010016,72,3,2,1747,1,2,53,1009,1,1,0,1,60,96323,10.0
142509659,762846027,259,1010016,13,1,1,1515,0,1,11,1010,1,1,0,1,60,96323,11.0


# Model training

In [92]:
from lightgbm.sklearn import LGBMRanker
ranker = LGBMRanker(
    objective="lambdarank",
    metric="ndcg",
    boosting_type="dart",
    n_estimators=1,
    importance_type='gain',
    verbose=10
)
ranker = ranker.fit(
    train_X,
    train_y,
    group=train_baskets,
)

[LightGBM] [Debug] Dataset::GetMultiBinFromAllFeatures: sparse rate 0.111309
[LightGBM] [Debug] init for col-wise cost 0.000244 seconds, init for row-wise cost 1.156635 seconds
[LightGBM] [Info] Auto-choosing row-wise multi-threading, the overhead of testing was 0.555679 seconds.
You can set `force_row_wise=true` to remove the overhead.
And if memory is not enough, you can set `force_col_wise=true`.
[LightGBM] [Debug] Using Dense Multi-Val Bin
[LightGBM] [Info] Total Bins 1096
[LightGBM] [Info] Number of data points in the train set: 124800519, number of used features: 18
[LightGBM] [Debug] Trained a tree with leaves = 31 and depth = 8


In [93]:
for i in ranker.feature_importances_.argsort()[::-1]:
    print(columns_to_use[i], ranker.feature_importances_[i]/ranker.feature_importances_.sum())

bestseller_rank 0.9997377003617672
article_id 0.0002089729146411682
age 2.4592794035160452e-05
index_group_no 9.986910440894015e-06
department_no 9.032626764532162e-06
colour_group_code 4.8489558499091254e-06
section_no 3.1959499676713425e-06
perceived_colour_master_id 1.6694865334151692e-06
fashion_news_frequency 0.0
club_member_status 0.0
Active 0.0
FN 0.0
garment_group_no 0.0
postal_code 0.0
index_code 0.0
perceived_colour_value_id 0.0
graphical_appearance_no 0.0
product_type_no 0.0


# Calculate predictions

In [94]:
test['preds'] = ranker.predict(test_X)

c_id2predicted_article_ids = test \
    .sort_values(['customer_id', 'preds'], ascending=False) \
    .groupby('customer_id')['article_id'].apply(list).to_dict()

bestsellers_last_week = \
    bestsellers_previous_week[bestsellers_previous_week.week == bestsellers_previous_week.week.max()]['article_id'].tolist()

# Create submission

In [95]:
sub = pd.read_csv('../input/sample_submission.csv')
preds = []
for c_id in customer_hex_id_to_int(sub.customer_id):
    pred = c_id2predicted_article_ids.get(c_id, [])
    pred = pred + bestsellers_last_week
    preds.append(pred[:12])

In [96]:
from tqdm import tqdm
positive_items_val = validation.groupby(['customer_id'])['article_id'].apply(list)
# creating validation set for metrics use case
val_users = positive_items_val.keys()
val_items = []
for i, user in tqdm(enumerate(val_users)):
    val_items.append(positive_items_val[user])

print("Total users in validation:", len(val_users))
print("mAP12 Score on Validation set:", mapk(val_items, preds))
print("recall Score on Validation set:", recall12(val_items, preds))

159it [00:00, 314424.49it/s]

Total users in validation: 159
mAP12 Score on Validation set: 0.006231656184486374
recall Score on Validation set: 0.020789273619462298





In [97]:
preds = [' '.join(['0' + str(p) for p in ps]) for ps in preds]
# sub.prediction = preds
# sub_name = 'basic_model_submission'
# sub.to_csv(f'{sub_name}.csv.gz', index=False)