# Experiment with different time cut-offs

# Helper functions from Radek's LGBMRanker starter-pack

In [1]:
import numpy as np

def apk(actual, predicted, k=10):
    """
    Computes the average precision at k.

    This function computes the average prescision at k between two lists of
    items.

    Parameters
    ----------
    actual : list
             A list of elements that are to be predicted (order doesn't matter)
    predicted : list
                A list of predicted elements (order does matter)
    k : int, optional
        The maximum number of predicted elements

    Returns
    -------
    score : double
            The average precision at k over the input lists

    """
    if len(predicted)>k:
        predicted = predicted[:k]

    score = 0.0
    num_hits = 0.0

    for i,p in enumerate(predicted):
        if p in actual and p not in predicted[:i]:
            num_hits += 1.0
            score += num_hits / (i+1.0)

    if not actual:
        return 0.0

    return score / min(len(actual), k)

def mapk(actual, predicted, k=10):
    """
    Computes the mean average precision at k.

    This function computes the mean average prescision at k between two lists
    of lists of items.

    Parameters
    ----------
    actual : list
             A list of lists of elements that are to be predicted
             (order doesn't matter in the lists)
    predicted : list
                A list of lists of predicted elements
                (order matters in the lists)
    k : int, optional
        The maximum number of predicted elements

    Returns
    -------
    score : double
            The mean average precision at k over the input lists

    """
    return np.mean([apk(a,p,k) for a,p in zip(actual, predicted)])

# Read input data

In [2]:
%%time
import pandas as pd
import tqdm as tqdm
pad = "/kaggle/input/makeparquet"
transactions = pd.read_parquet(pad+'/transactions_train.parquet')
customers = pd.read_parquet(pad+'/customers.parquet')
articles = pd.read_parquet(pad+'/articles.parquet')

CPU times: user 2.37 s, sys: 2.4 s, total: 4.77 s
Wall time: 6.18 s


# Validation last x week information

In [3]:
val = transactions[transactions['week'] >= transactions['week'].max()]


In [4]:
positive_items_val = val.groupby(['customer_id'])['article_id'].apply(list)

In [5]:
# creating validation set for metrics use case
val_users = positive_items_val.keys()
val_items = []

for i,user in (enumerate(val_users)):
    val_items.append(positive_items_val[user])
    
print("Total users in validation:", len(val_users))

Total users in validation: 68984


Use the last x week to calculate popularity

In [6]:
last_x_weeks = 1
last_x_weeks -= 1

In [7]:
import pandas as pd

# Step 1: Filter transactions for the last week
last_week_transactions = transactions[transactions['week'] >= transactions['week'].max()-(last_x_weeks+1)]

# Step 2: Group transactions by 'article_id' and count unique 'customer_id'
article_customer_count = last_week_transactions.groupby('article_id')['customer_id'].nunique().reset_index(name='customer_count')

# Step 3: Sort articles based on customer count in descending order
sorted_articles = article_customer_count.sort_values(by='customer_count', ascending=False)

# Step 4: Take the top 12 articles
top_12_articles = sorted_articles.head(12)

# Display the result
print(top_12_articles)


       article_id  customer_count
20744   909370001            1474
21887   924243001            1364
21513   918522001            1212
15285   865799006            1020
5999    751471001             905
21495   918292001             895
21866   923758001             891
21305   915529003             872
535     448509014             844
6706    762846027             837
21888   924243002             830
3116    673677002             757


In [8]:
x = (top_12_articles.article_id.to_list())


In [9]:
sub = pd.read_csv('/kaggle/input/h-and-m-personalized-fashion-recommendations/sample_submission.csv')

In [10]:
latest_bought_articles=[]
for i in range(len(sub)):
    latest_bought_articles.append(x)

In [11]:
print("mAP Score on Validation set:", mapk(val_items, latest_bought_articles))

mAP Score on Validation set: 0.008877847770291514


# get last x week information

In [12]:
import pandas as pd

# Step 1: Filter transactions for the last week
last_week_transactions = transactions[transactions['week'] >= transactions['week'].max()-last_x_weeks]

# Step 2: Group transactions by 'article_id' and count unique 'customer_id'
article_customer_count = last_week_transactions.groupby('article_id')['customer_id'].nunique().reset_index(name='customer_count')

# Step 3: Sort articles based on customer count in descending order
sorted_articles = article_customer_count.sort_values(by='customer_count', ascending=False)

# Step 4: Take the top 12 articles
top_12_articles = sorted_articles.head(12)

# Display the result
print(top_12_articles)


       article_id  customer_count
17308   924243001             763
16961   918522001             569
17309   924243002             533
17287   923758001             513
11827   866731001             481
16773   915529003             456
16264   909370001             444
16774   915529005             426
4496    751471001             418
16944   918292001             402
5030    762846027             397
430     448509014             388


In [13]:
x = (top_12_articles.article_id.to_list())


# Make submission

In [14]:
sub = pd.read_csv('/kaggle/input/h-and-m-personalized-fashion-recommendations/sample_submission.csv')

In [15]:
latest_bought_articles=[]
for i in range(len(sub)):
    latest_bought_articles.append(x)

In [16]:
preds = [' '.join(['0' + str(p) for p in ps]) for ps in latest_bought_articles]
sub.prediction = preds

In [17]:
sub_name = 'popularityLast2Years_submission'
sub.to_csv(f'{sub_name}.csv.gz', index=False)