Based on the baseline "Radek's LGBMRanker starter-pack": https://www.kaggle.com/code/marcogorelli/radek-s-lgbmranker-starter-pack


# Helper functions

In [1]:
import numpy as np

def apk(actual, predicted, k=10):
    """
    Average precision at k for one list of predictions.

    Only the first ``k`` entries of ``predicted`` are scored. A prediction
    counts as a hit when it occurs in ``actual`` and has not already appeared
    earlier in the truncated prediction list, so repeated items earn nothing.

    Parameters
    ----------
    actual : list
             The elements that are to be predicted (order doesn't matter)
    predicted : list
                The predicted elements, best guess first (order does matter)
    k : int, optional
        The maximum number of predicted elements to score

    Returns
    -------
    score : double
            Sum of the precision at every hit position, divided by
            ``min(len(actual), k)``; 0.0 when ``actual`` is empty

    """
    top_k = predicted[:k] if len(predicted) > k else predicted

    hit_count = 0.0
    precision_sum = 0.0
    for position, candidate in enumerate(top_k):
        is_relevant = candidate in actual
        if is_relevant and candidate not in top_k[:position]:
            hit_count += 1.0
            # Precision at this rank: hits so far over items inspected so far.
            precision_sum += hit_count / (position + 1.0)

    # Guards the division below: with nothing to find the score is 0.
    if not actual:
        return 0.0

    return precision_sum / min(len(actual), k)

def mapk(actual, predicted, k=10):
    """
    Mean average precision at k over several prediction lists.

    ``actual`` and ``predicted`` are walked in parallel; each pair is scored
    with ``apk`` and the scores are averaged with ``np.mean``.

    Parameters
    ----------
    actual : list
             A list of lists of elements that are to be predicted
             (order doesn't matter in the inner lists)
    predicted : list
                A list of lists of predicted elements
                (order matters in the inner lists)
    k : int, optional
        The maximum number of predicted elements scored per list

    Returns
    -------
    score : double
            The mean of the per-pair average precision at k

    """
    per_pair_scores = []
    for relevant, ranked in zip(actual, predicted):
        per_pair_scores.append(apk(relevant, ranked, k))
    return np.mean(per_pair_scores)

In [2]:
from sklearn.base import BaseEstimator, TransformerMixin
import numpy as np

# https://www.kaggle.com/c/h-and-m-personalized-fashion-recommendations/discussion/308635
def customer_hex_id_to_int(series):
    """
    Convert a Series of hexadecimal id strings to integers.

    Parameters
    ----------
    series : pandas.Series
             String ids; only the last 16 characters of each are used

    Returns
    -------
    pandas.Series
        ``hex_id_to_int`` applied element-wise to the trimmed strings

    """
    last_16_chars = series.str[-16:]
    return last_16_chars.apply(hex_id_to_int)

def hex_id_to_int(str):
    """
    Parse the last 16 characters of a hexadecimal string as an integer.

    Parameters
    ----------
    str : str
          Hexadecimal text. The parameter name shadows the built-in ``str``
          inside this function; it is kept so keyword callers keep working.

    Returns
    -------
    int
        Value of the trailing (at most 16) hex digits

    """
    tail = str[-16:]
    return int(tail, base=16)

def article_id_str_to_int(series):
    """
    Cast a Series of article ids to ``int32``.

    Parameters
    ----------
    series : pandas.Series
             Article ids in any form ``astype('int32')`` accepts

    Returns
    -------
    pandas.Series
        The same ids with dtype ``int32``

    """
    as_int32 = series.astype('int32')
    return as_int32

def article_id_int_to_str(series):
    """
    Render article ids as text with a single ``'0'`` prepended.

    Parameters
    ----------
    series : pandas.Series
             Article ids (converted with ``astype('str')``)

    Returns
    -------
    pandas.Series
        ``'0'`` followed by the string form of each id

    """
    as_text = series.astype('str')
    return '0' + as_text

class Categorize(BaseEstimator, TransformerMixin):
    """
    Encode every column of a DataFrame as integer category codes.

    ``fit`` records, for each column position, the values that occur more
    than ``min_examples`` times (in ``value_counts`` order). ``transform``
    replaces each value by its position in that recorded list, using
    ``pd.Categorical(...).codes``; values that were not recorded get -1.

    Parameters
    ----------
    min_examples : int, optional
        A value is kept only if its count is strictly greater than this.

    Attributes
    ----------
    categories : list of list
        One list of kept values per fitted column, in column order.
        Empty until ``fit`` has been called.
    """

    def __init__(self, min_examples=0):
        self.min_examples = min_examples
        # Filled by fit(); present from construction so callers can inspect it.
        self.categories = []

    def fit(self, X, y=None):
        """
        Learn the kept values of every column of ``X``.

        Parameters
        ----------
        X : pandas.DataFrame
            Training data; columns are addressed by position.
        y : ignored
            Accepted so the transformer can be fitted when a target is
            passed along (for example from a scikit-learn Pipeline).

        Returns
        -------
        self
        """
        # Start from a fresh list on every call. Appending to the previous
        # fit's list would leave stale entries in front, and transform()
        # looks categories up by column position.
        learned = []
        for i in range(X.shape[1]):
            vc = X.iloc[:, i].value_counts()
            learned.append(vc[vc > self.min_examples].index.tolist())
        self.categories = learned
        return self

    def transform(self, X):
        """
        Replace the values of ``X`` by their category codes.

        Parameters
        ----------
        X : pandas.DataFrame
            Data with the same column positions as the frame given to ``fit``.

        Returns
        -------
        pandas.DataFrame
            Same column names as ``X``, holding integer codes. The frame is
            built from plain arrays, so it carries a default index rather
            than the index of ``X``.
        """
        data = {
            X.columns[i]: pd.Categorical(X.iloc[:, i], categories=self.categories[i]).codes
            for i in range(X.shape[1])
        }
        return pd.DataFrame(data=data)


def calculate_apk(list_of_preds, list_of_gts):
    """
    Mean average precision at 12 over paired predictions and ground truths.

    Note the argument order: predictions first, ground truths second (the
    reverse of ``mapk``). Pairs are formed by position with ``zip``.

    Parameters
    ----------
    list_of_preds : iterable of list
        Ranked predictions, one list per case
    list_of_gts : iterable of list
        Ground-truth items, one list per case

    Returns
    -------
    double
        ``np.mean`` of ``apk(gt, preds, k=12)`` over all pairs

    """
    # for fast validation this can be changed to operate on dicts of {'cust_id_int': [art_id_int, ...]}
    # using 'data/val_week_purchases_by_cust.pkl'
    scores = [apk(gt, preds, k=12) for preds, gt in zip(list_of_preds, list_of_gts)]
    return np.mean(scores)

def eval_sub(sub_csv, skip_cust_with_no_purchases=True):
    """
    Score a submission file against the stored validation ground truth.

    Both files are expected to have a ``prediction`` column of
    whitespace-separated ids. Rows are paired by position; customer ids are
    not compared.

    Parameters
    ----------
    sub_csv : str or path-like
        CSV file with the submission to evaluate
    skip_cust_with_no_purchases : bool, optional
        When true, rows whose ground truth splits to an empty list are
        left out of the average

    Returns
    -------
    double
        ``np.mean`` of ``apk(gt, pred, k=12)`` over the retained rows

    """
    submission = pd.read_csv(sub_csv)
    ground_truth = pd.read_parquet('data/validation_ground_truth.parquet')

    predicted_lists = submission.prediction.str.split()
    purchased_lists = ground_truth.prediction.str.split()

    scores = [
        apk(gt, pred, k=12)
        for pred, gt in zip(predicted_lists, purchased_lists)
        if not (skip_cust_with_no_purchases and gt == [])
    ]
    return np.mean(scores)

In [3]:
import pandas as pd

# Read input files

In [4]:
%%time
# Directory holding the parquet versions of the three input tables.
pad = "/kaggle/input/makeparquet"
# Read the tables in a fixed order: transactions, customers, articles.
transactions, customers, articles = (
    pd.read_parquet(pad + table_file)
    for table_file in ('/transactions_train.parquet', '/customers.parquet', '/articles.parquet')
)

CPU times: user 3.47 s, sys: 4.35 s, total: 7.81 s
Wall time: 7.12 s


In [5]:
# The week to predict is the one right after the last week in the data.
test_week = transactions.week.max() + 1
# Keep only the ten most recent weeks of transactions.
transactions = transactions.loc[lambda df: df.week > df.week.max() - 10]

# Generating bestsellers

In [6]:
# First and last transaction date of every retained week.
(
    transactions
    .groupby('week')['t_dat']
    .agg(['min', 'max'])
)

Unnamed: 0_level_0,min,max
week,Unnamed: 1_level_1,Unnamed: 2_level_1
95,2020-07-15,2020-07-21
96,2020-07-22,2020-07-28
97,2020-07-29,2020-08-04
98,2020-08-05,2020-08-11
99,2020-08-12,2020-08-18
100,2020-08-19,2020-08-25
101,2020-08-26,2020-09-01
102,2020-09-02,2020-09-08
103,2020-09-09,2020-09-15
104,2020-09-16,2020-09-22


### Bestseller candidates

In [7]:
# Average price of each article within each week.
mean_price = (
    transactions
    .groupby(['week', 'article_id'])['price']
    .mean()
)

In [8]:
# Per week: count sales per article, turn the counts into dense ranks
# (1 = most sold, ties share a rank) and keep the first 24 rows of each week.
sales = (
    transactions
    .groupby('week')['article_id'].value_counts()
    .groupby('week').rank(method='dense', ascending=False)
    .groupby('week').head(24)
    .rename('bestseller_rank')
    .astype('int8')
)

In [9]:
# Attach the weekly mean price to each bestseller, then shift the week label
# forward by one so a row describes the bestsellers of the week before it.
bestsellers_previous_week2 = (
    pd.merge(sales, mean_price, on=['week', 'article_id'])
    .reset_index()
    .assign(week=lambda df: df.week + 1)
)

In [10]:
# Rows labelled week 96, i.e. the bestsellers of the earliest retained week.
bestsellers_previous_week2.loc[lambda df: df['week'] == 96]

Unnamed: 0,week,article_id,bestseller_rank,price
0,96,760084003,1,0.025094
1,96,866731001,2,0.024919
2,96,600886001,3,0.02298
3,96,706016001,4,0.033197
4,96,372860002,5,0.013193
5,96,610776002,6,0.008318
6,96,877278002,7,0.025036
7,96,547780003,8,0.024814
8,96,817354001,9,0.021913
9,96,827968001,10,0.016436


In [11]:
# Display the full bestseller table (week label, article, rank, mean price).
bestsellers_previous_week2

Unnamed: 0,week,article_id,bestseller_rank,price
0,96,760084003,1,0.025094
1,96,866731001,2,0.024919
2,96,600886001,3,0.022980
3,96,706016001,4,0.033197
4,96,372860002,5,0.013193
...,...,...,...,...
235,105,935541001,20,0.024353
236,105,934835001,20,0.024789
237,105,894780001,21,0.033416
238,105,673677002,22,0.024958


In [12]:
%time

bestsellers_last_week = \
    bestsellers_previous_week2[bestsellers_previous_week2.week == bestsellers_previous_week2.week.max()]['article_id'].tolist()

CPU times: user 2 µs, sys: 2 µs, total: 4 µs
Wall time: 7.87 µs


# Create submission

In [13]:
# Load the competition's sample submission as the frame to fill in.
# NOTE(review): predictions are assigned to it by row position further down,
# so its row order presumably matches the repurchase file - confirm.
sub = pd.read_csv('/kaggle/input/h-and-m-personalized-fashion-recommendations/sample_submission.csv')

Repurchase information

In [14]:
# Load the precomputed repurchase candidates.
# NOTE(review): the later cells treat each row as a list of article ids with
# 0 marking an empty slot - confirm against the notebook that wrote this file.
latest_bought_articles = pd.read_csv('/kaggle/input/workingpurchase/filename.csv')

In [15]:
# Turn the frame into a plain list of rows so single slots can be overwritten.
# The name is rebound from a DataFrame to a list, so this cell runs only once
# per load of the CSV.
latest_bought_articles = latest_bought_articles.to_numpy().tolist()


In [16]:
# Display the bestseller article ids selected for the most recent week.
bestsellers_last_week

[924243001,
 924243002,
 918522001,
 923758001,
 866731001,
 909370001,
 751471001,
 915529003,
 915529005,
 448509014,
 762846027,
 714790020,
 918292001,
 865799006,
 850917001,
 929275001,
 896169005,
 919273002,
 889550002,
 935541001,
 934835001,
 894780001,
 673677002,
 788575004]

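Combine bestsellers and repurchases: every empty slot (0) in a customer's repurchase row is filled with the best-ranked bestseller not already used in that row.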

In [17]:
# Fill, in place, every empty slot (encoded as 0) of each customer's row with
# the best-ranked bestseller that the row does not contain yet.
for row in latest_bought_articles:
    # Seed with every article of the row, not only those left of the gap, so a
    # bestseller can never duplicate a repurchase that appears later in the row.
    seen = {article for article in row if article != 0}
    # Lazy: each candidate is checked against `seen` at the moment it is drawn.
    candidates = (best for best in bestsellers_last_week if best not in seen)

    for slot, article in enumerate(row):
        if article != 0:
            continue
        filler = next(candidates, None)
        if filler is None:
            # Bestsellers exhausted: the remaining empty slots stay 0.
            break
        row[slot] = filler
        seen.add(filler)

            

In [18]:
# Spot-check the first three rows after the empty slots have been filled.
print(latest_bought_articles[:3])

[[568601043, 924243001, 924243002, 918522001, 923758001, 866731001, 909370001, 751471001, 915529003, 915529005, 448509014, 762846027], [924243001, 924243002, 918522001, 923758001, 866731001, 909370001, 751471001, 915529003, 915529005, 448509014, 762846027, 714790020], [794321007, 924243001, 924243002, 918522001, 923758001, 866731001, 909370001, 751471001, 915529003, 915529005, 448509014, 762846027]]


In [19]:
# One space-separated string per row; each id gets a '0' prepended.
preds = [
    ' '.join('0' + str(article_id) for article_id in customer_articles)
    for customer_articles in latest_bought_articles
]
# Assigned by position: row n of the list goes to row n of the submission.
sub.prediction = preds

In [20]:
# Base name of the output file; '.csv.gz' is appended when writing.
sub_name = 'bestseller_repurchas_submission'
# Alternative base name, currently disabled:
# sub_name = 'popularity_submission'
# Write the submission without the index column.
sub.to_csv(f'{sub_name}.csv.gz', index=False)