# Popularity with repurchase information with age buckets

# Read input data

In [14]:
%%time
import pandas as pd
pad = "/kaggle/input/makeparquet"
transactions = pd.read_parquet(pad+'/transactions_train.parquet')
customers = pd.read_parquet(pad+'/customers.parquet')
articles = pd.read_parquet(pad+'/articles.parquet')

CPU times: user 3.47 s, sys: 2.6 s, total: 6.07 s
Wall time: 2.68 s


# Helper functions from Radek's LGBMRanker starter-pack

In [15]:
from sklearn.base import BaseEstimator, TransformerMixin
import numpy as np

# https://www.kaggle.com/c/h-and-m-personalized-fashion-recommendations/discussion/308635
def customer_hex_id_to_int(series):
    return series.str[-16:].apply(hex_id_to_int)

def hex_id_to_int(str):
    return int(str[-16:], 16)

def article_id_str_to_int(series):
    return series.astype('int32')

def article_id_int_to_str(series):
    return '0' + series.astype('str')

class Categorize(BaseEstimator, TransformerMixin):
    def __init__(self, min_examples=0):
        self.min_examples = min_examples
        self.categories = []

    def fit(self, X):
        for i in range(X.shape[1]):
            vc = X.iloc[:, i].value_counts()
            self.categories.append(vc[vc > self.min_examples].index.tolist())
        return self

    def transform(self, X):
        data = {X.columns[i]: pd.Categorical(X.iloc[:, i], categories=self.categories[i]).codes for i in range(X.shape[1])}
        return pd.DataFrame(data=data)


def calculate_apk(list_of_preds, list_of_gts):
    # for fast validation this can be changed to operate on dicts of {'cust_id_int': [art_id_int, ...]}
    # using 'data/val_week_purchases_by_cust.pkl'
    apks = []
    for preds, gt in zip(list_of_preds, list_of_gts):
        apks.append(apk(gt, preds, k=12))
    return np.mean(apks)

def eval_sub(sub_csv, skip_cust_with_no_purchases=True):
    sub=pd.read_csv(sub_csv)
    validation_set=pd.read_parquet('data/validation_ground_truth.parquet')

    apks = []

    no_purchases_pattern = []
    for pred, gt in zip(sub.prediction.str.split(), validation_set.prediction.str.split()):
        if skip_cust_with_no_purchases and (gt == no_purchases_pattern): continue
        apks.append(apk(gt, pred, k=12))
    return np.mean(apks)

# get last week information

In [16]:
# Step 1: Merge transactions with selected columns from customers
selected_columns = ['customer_id', 'age','buys_kid_baby_clothes', 'buys_mens_clothes', 'buys_womens_clothing']
merged_data = pd.merge(transactions, customers[selected_columns], on='customer_id', how='inner')

# Step 2: Filter transactions for the last week
last_week_transactions = merged_data[merged_data['week'] >= merged_data['week'].max()]
# last_10week_transactions = merged_data[merged_data['week'] >= merged_data['week'].max()-9]

# Split transactions based on conditions

In [17]:
import pandas as pd

# Condition 1: age < 25
condition_1 = last_week_transactions[last_week_transactions['age'] < 25]

# Condition 2: 25 <= age < 50
condition_2 = last_week_transactions[(last_week_transactions['age'] >= 25) & (last_week_transactions['age'] < 50)]


# Condition 3: 50 <= age
condition_4 = last_week_transactions[last_week_transactions['age'] >= 50]


# Get popularity for each age bucket

In [18]:
import pandas as pd


# Step 1: Group transactions by 'article_id' and count unique 'customer_id'
article_customer_countold = last_week_transactions.groupby('article_id')['customer_id'].nunique().reset_index(name='customer_count')

# Step 2: Sort articles based on customer count in descending order
sorted_articlesold = article_customer_countold.sort_values(by='customer_count', ascending=False)

# Step 3: Take the top 12 articles
top_12_articlesold = sorted_articlesold.head(12)
top_12_articlesold= (top_12_articlesold.article_id.to_list())
# Display the result
print(top_12_articlesold)

[924243001, 918522001, 924243002, 923758001, 866731001, 915529003, 909370001, 915529005, 751471001, 918292001, 762846027, 448509014]


In [19]:
import pandas as pd


# Step 1: Group transactions by 'article_id' and count unique 'customer_id'
article_customer_count1 = condition_1.groupby('article_id')['customer_id'].nunique().reset_index(name='customer_count')

# Step 2: Sort articles based on customer count in descending order
sorted_articles1 = article_customer_count1.sort_values(by='customer_count', ascending=False)

# Step 3: Take the top 12 articles
top_12_articles1 = sorted_articles1.head(12)
top_12_articles1= (top_12_articles1.article_id.to_list())
# Display the result
print(top_12_articles1)

[918522001, 924243001, 448509014, 866731001, 915526001, 915529005, 915529003, 924243002, 918292001, 923758001, 911699002, 715624001]


In [20]:
import pandas as pd


# Step 1: Group transactions by 'article_id' and count unique 'customer_id'
article_customer_count2 = condition_2.groupby('article_id')['customer_id'].nunique().reset_index(name='customer_count')

# Step 2: Sort articles based on customer count in descending order
sorted_articles2 = article_customer_count2.sort_values(by='customer_count', ascending=False)

# Step 3: Take the top 12 articles
top_12_articles2 = sorted_articles2.head(12)
top_12_articles2= (top_12_articles2.article_id.to_list())

# Display the result
print(top_12_articles2)

[924243001, 909370001, 866731001, 923758001, 924243002, 915529003, 918292001, 919273002, 915529005, 889550002, 762846027, 935541001]


In [22]:

import pandas as pd


# Step 1: Group transactions by 'article_id' and count unique 'customer_id'
article_customer_count4 = last_week_transactions.groupby('article_id')['customer_id'].nunique().reset_index(name='customer_count')

# Step 2: Sort articles based on customer count in descending order
sorted_articles4 = article_customer_count4.sort_values(by='customer_count', ascending=False)

# Step 3: Take the top 12 articles
top_12_articles4 = sorted_articles4.head(12)
top_12_articles4= (top_12_articles4.article_id.to_list())

# Display the result
print(top_12_articles4)

[924243001, 918522001, 924243002, 923758001, 866731001, 915529003, 909370001, 915529005, 751471001, 918292001, 762846027, 448509014]


# Make submission

In [None]:
sub = pd.read_csv('/kaggle/input/h-and-m-personalized-fashion-recommendations/sample_submission.csv')

Read repurchase info

In [26]:
latest_bought_articles = pd.read_csv('/kaggle/input/repurchase4weeks/repurchase4Weeks.csv')
latest_bought_articles = latest_bought_articles.values.tolist()

In [27]:
%%time
from tqdm import tqdm
pop_items = []
outputs = []
user_cnt=0
for c_id in tqdm(customer_hex_id_to_int(sub.customer_id)):
    extraOutput = []
    for articleRepurchase in latest_bought_articles[user_cnt]:
        if articleRepurchase !=0:
            extraOutput.append(articleRepurchase)
    # Check in which condition the target_customer_id falls
    if c_id in condition_1['customer_id'].values:
        pop_items = (top_12_articles1)
    elif c_id in condition_2['customer_id'].values:
        pop_items = (top_12_articlesold)
    elif c_id in condition_4['customer_id'].values:
        pop_items = (top_12_articles4)
    else:
        pop_items = (top_12_articlesold)
    user_output = extraOutput + pop_items

    user_output = [int(j) for j in user_output] 
    user_output = pd.Series(user_output).drop_duplicates().tolist()

    user_output = user_output[:12]
    outputs.append(user_output)
    user_cnt+=1


100%|██████████| 1371980/1371980 [05:01<00:00, 4549.93it/s]

CPU times: user 4min 59s, sys: 8.44 s, total: 5min 8s
Wall time: 5min 3s





In [28]:
preds = [' '.join(['0' + str(p) for p in ps]) for ps in outputs]
sub.prediction = preds

In [30]:
sub_name = 'Repurchase4pop'
sub.to_csv(f'{sub_name}.csv.gz', index=False)