# Popularity with repurchase information with age buckets

# Read input data

In [1]:
%%time
import pandas as pd
pad = "/kaggle/input/makeparquet"
transactions = pd.read_parquet(pad+'/transactions_train.parquet')
customers = pd.read_parquet(pad+'/customers.parquet')
articles = pd.read_parquet(pad+'/articles.parquet')

CPU times: user 3.96 s, sys: 4 s, total: 7.96 s
Wall time: 7.84 s


# Helper functions from Radek's LGBMRanker starter-pack

In [2]:
from sklearn.base import BaseEstimator, TransformerMixin
import numpy as np

# https://www.kaggle.com/c/h-and-m-personalized-fashion-recommendations/discussion/308635
def customer_hex_id_to_int(series):
    return series.str[-16:].apply(hex_id_to_int)

def hex_id_to_int(str):
    return int(str[-16:], 16)

def article_id_str_to_int(series):
    return series.astype('int32')

def article_id_int_to_str(series):
    return '0' + series.astype('str')

class Categorize(BaseEstimator, TransformerMixin):
    def __init__(self, min_examples=0):
        self.min_examples = min_examples
        self.categories = []

    def fit(self, X):
        for i in range(X.shape[1]):
            vc = X.iloc[:, i].value_counts()
            self.categories.append(vc[vc > self.min_examples].index.tolist())
        return self

    def transform(self, X):
        data = {X.columns[i]: pd.Categorical(X.iloc[:, i], categories=self.categories[i]).codes for i in range(X.shape[1])}
        return pd.DataFrame(data=data)


def calculate_apk(list_of_preds, list_of_gts):
    # for fast validation this can be changed to operate on dicts of {'cust_id_int': [art_id_int, ...]}
    # using 'data/val_week_purchases_by_cust.pkl'
    apks = []
    for preds, gt in zip(list_of_preds, list_of_gts):
        apks.append(apk(gt, preds, k=12))
    return np.mean(apks)

def eval_sub(sub_csv, skip_cust_with_no_purchases=True):
    sub=pd.read_csv(sub_csv)
    validation_set=pd.read_parquet('data/validation_ground_truth.parquet')

    apks = []

    no_purchases_pattern = []
    for pred, gt in zip(sub.prediction.str.split(), validation_set.prediction.str.split()):
        if skip_cust_with_no_purchases and (gt == no_purchases_pattern): continue
        apks.append(apk(gt, pred, k=12))
    return np.mean(apks)

# get last week information

In [3]:

last_week_transactions = transactions[transactions['week'] >= transactions['week'].max()]


In [4]:
import pandas as pd


# Step 1: Group transactions by 'article_id' and count unique 'customer_id'
article_customer_count = last_week_transactions.groupby('article_id')['customer_id'].nunique().reset_index(name='customer_count')

# Step 2: Sort articles based on customer count in descending order
sorted_articles = article_customer_count.sort_values(by='customer_count', ascending=False)

# Step 3: Take the top 12 articles
top_12_articles = sorted_articles.head(12)
top_12_articles = (top_12_articles.article_id.to_list())
# Display the result
print(top_12_articles)

[924243001, 918522001, 924243002, 923758001, 866731001, 915529003, 909370001, 915529005, 751471001, 918292001, 762846027, 448509014]


# Make submission

In [5]:
sub = pd.read_csv('/kaggle/input/h-and-m-personalized-fashion-recommendations/sample_submission.csv')

Read repurchase info

In [6]:
latest_bought_articles = pd.read_csv('/kaggle/input/repurchase4weeks/repurchase4Weeks.csv')
latest_bought_articles = latest_bought_articles.values.tolist()

In [7]:
%%time
from tqdm import tqdm
pop_items = []
outputs = []
user_cnt=0
for c_id in tqdm(customer_hex_id_to_int(sub.customer_id)):
    extraOutput = []
    for articleRepurchase in latest_bought_articles[user_cnt]:
        if articleRepurchase !=0:
            extraOutput.append(articleRepurchase)

    user_output = extraOutput + top_12_articles

    user_output = [int(j) for j in user_output] 
    user_output = pd.Series(user_output).drop_duplicates().tolist()

    user_output = user_output[:12]
    outputs.append(user_output)
    user_cnt+=1


100%|██████████| 1371980/1371980 [05:03<00:00, 4525.41it/s]

CPU times: user 5min 1s, sys: 7.2 s, total: 5min 8s
Wall time: 5min 5s





In [8]:
preds = [' '.join(['0' + str(p) for p in ps]) for ps in outputs]
sub.prediction = preds

In [9]:
sub_name = 'Repurchase4pop'
sub.to_csv(f'{sub_name}.csv.gz', index=False)