# Read input files

In [1]:
%%time
import pandas as pd
pad = "/kaggle/input/makeparquet"
transactions = pd.read_parquet(pad+'/transactions_train.parquet')
customers = pd.read_parquet(pad+'/customers.parquet')
articles = pd.read_parquet(pad+'/articles.parquet')

CPU times: user 2.34 s, sys: 1.18 s, total: 3.53 s
Wall time: 5.43 s


In [2]:
from sklearn.base import BaseEstimator, TransformerMixin
import numpy as np

# https://www.kaggle.com/c/h-and-m-personalized-fashion-recommendations/discussion/308635
def customer_hex_id_to_int(series):
    return series.str[-16:].apply(hex_id_to_int)

def hex_id_to_int(str):
    return int(str[-16:], 16)

def article_id_str_to_int(series):
    return series.astype('int32')

def article_id_int_to_str(series):
    return '0' + series.astype('str')

class Categorize(BaseEstimator, TransformerMixin):
    def __init__(self, min_examples=0):
        self.min_examples = min_examples
        self.categories = []

    def fit(self, X):
        for i in range(X.shape[1]):
            vc = X.iloc[:, i].value_counts()
            self.categories.append(vc[vc > self.min_examples].index.tolist())
        return self

    def transform(self, X):
        data = {X.columns[i]: pd.Categorical(X.iloc[:, i], categories=self.categories[i]).codes for i in range(X.shape[1])}
        return pd.DataFrame(data=data)


def calculate_apk(list_of_preds, list_of_gts):
    # for fast validation this can be changed to operate on dicts of {'cust_id_int': [art_id_int, ...]}
    # using 'data/val_week_purchases_by_cust.pkl'
    apks = []
    for preds, gt in zip(list_of_preds, list_of_gts):
        apks.append(apk(gt, preds, k=12))
    return np.mean(apks)

def eval_sub(sub_csv, skip_cust_with_no_purchases=True):
    sub=pd.read_csv(sub_csv)
    validation_set=pd.read_parquet('data/validation_ground_truth.parquet')

    apks = []

    no_purchases_pattern = []
    for pred, gt in zip(sub.prediction.str.split(), validation_set.prediction.str.split()):
        if skip_cust_with_no_purchases and (gt == no_purchases_pattern): continue
        apks.append(apk(gt, pred, k=12))
    return np.mean(apks)

In [3]:
# Step 1: Merge transactions with selected columns from customers
selected_columns = ['customer_id','age', 'buys_kid_baby_clothes', 'buys_mens_clothes', 'buys_womens_clothing']
merged_data = pd.merge(transactions, customers[selected_columns], on='customer_id', how='inner')

# Step 1: Filter transactions for the last week
last_week_transactions = merged_data[merged_data['week'] >= merged_data['week'].max()]
last_10week_transactions = merged_data[merged_data['week'] >= merged_data['week'].max()-9]

# Make condiditons

In [4]:
import pandas as pd


# Condition 1: age < 25
condition_1 = last_week_transactions[last_week_transactions['age'] < 25]

# Condition 2: 25 <= age < 35
condition_2 = last_week_transactions[(last_week_transactions['age'] >= 25) & (last_week_transactions['age'] < 50)]

# Condition 4: 50 <= age
condition_3 = last_week_transactions[last_week_transactions['age'] >= 50]


# Get popularity for each condition and general popularity

In [5]:
import pandas as pd


# Step 1: Group transactions by 'article_id' and count unique 'customer_id'
article_customer_countold = last_week_transactions.groupby('article_id')['customer_id'].nunique().reset_index(name='customer_count')

# Step 2: Sort articles based on customer count in descending order
sorted_articlesold = article_customer_countold.sort_values(by='customer_count', ascending=False)

# Step 3: Take the top 12 articles
top_12_articlesold = sorted_articlesold.head(12)
top_12_articlesold= (top_12_articlesold.article_id.to_list())
# Display the result
print(top_12_articlesold)

[924243001, 918522001, 924243002, 923758001, 866731001, 915529003, 909370001, 915529005, 751471001, 918292001, 762846027, 448509014]


In [6]:
import pandas as pd


# Step 1: Group transactions by 'article_id' and count unique 'customer_id'
article_customer_count1 = condition_1.groupby('article_id')['customer_id'].nunique().reset_index(name='customer_count')

# Step 2: Sort articles based on customer count in descending order
sorted_articles1 = article_customer_count1.sort_values(by='customer_count', ascending=False)

# Step 3: Take the top 12 articles
top_12_articles1 = sorted_articles1.head(12)
top_12_articles1= (top_12_articles1.article_id.to_list())
# Display the result
print(top_12_articles1)

[918522001, 924243001, 448509014, 866731001, 915526001, 915529005, 915529003, 924243002, 918292001, 923758001, 911699002, 715624001]


In [7]:
import pandas as pd


# Step 1: Group transactions by 'article_id' and count unique 'customer_id'
article_customer_count2 = condition_2.groupby('article_id')['customer_id'].nunique().reset_index(name='customer_count')

# Step 2: Sort articles based on customer count in descending order
sorted_articles2 = article_customer_count2.sort_values(by='customer_count', ascending=False)

# Step 3: Take the top 12 articles
top_12_articles2 = sorted_articles2.head(12)
top_12_articles2= (top_12_articles2.article_id.to_list())

# Display the result
print(top_12_articles2)

[924243001, 909370001, 866731001, 923758001, 924243002, 915529003, 918292001, 919273002, 915529005, 889550002, 762846027, 935541001]


In [8]:
import pandas as pd


# Step 1: Group transactions by 'article_id' and count unique 'customer_id'
article_customer_count3 = condition_3.groupby('article_id')['customer_id'].nunique().reset_index(name='customer_count')

# Step 2: Sort articles based on customer count in descending order
sorted_articles3 = article_customer_count3.sort_values(by='customer_count', ascending=False)

# Step 3: Take the top 12 articles
top_12_articles3 = sorted_articles3.head(12)
top_12_articles3= (top_12_articles3.article_id.to_list())

# Display the result
print(top_12_articles3)

[924243001, 930380001, 924243002, 928206001, 918522001, 751471043, 910601003, 751471001, 923758001, 863646001, 865799006, 673677002]


In [9]:
sub = pd.read_csv('/kaggle/input/h-and-m-personalized-fashion-recommendations/sample_submission.csv')

# Make submission

In [10]:
%%time
from tqdm import tqdm
latest_bought_articles = []
for c_id in tqdm(customer_hex_id_to_int(sub.customer_id)):
    # Check in which condition the target_customer_id falls
    if c_id in condition_1['customer_id'].values:
        latest_bought_articles.append(top_12_articles1)
    elif c_id in condition_2['customer_id'].values:
        latest_bought_articles.append(top_12_articles2)
    elif c_id in condition_3['customer_id'].values:
        latest_bought_articles.append(top_12_articles3)
    else:
        latest_bought_articles.append(top_12_articlesold)




100%|██████████| 1371980/1371980 [01:54<00:00, 11938.70it/s]

CPU times: user 1min 55s, sys: 472 ms, total: 1min 56s
Wall time: 1min 55s





In [11]:
preds = [' '.join(['0' + str(p) for p in ps]) for ps in latest_bought_articles]
sub.prediction = preds

In [12]:
sub_name = 'Popularity_ageGroup'
sub.to_csv(f'{sub_name}.csv.gz', index=False)