# Experiment with different time cut-offs

# Helper functions from Radek's LGBMRanker starter-pack

In [24]:
import numpy as np

def apk(actual, predicted, k=10):
    """
    Computes the average precision at k.

    This function computes the average prescision at k between two lists of
    items.

    Parameters
    ----------
    actual : list
             A list of elements that are to be predicted (order doesn't matter)
    predicted : list
                A list of predicted elements (order does matter)
    k : int, optional
        The maximum number of predicted elements

    Returns
    -------
    score : double
            The average precision at k over the input lists

    """
    if len(predicted)>k:
        predicted = predicted[:k]

    score = 0.0
    num_hits = 0.0

    for i,p in enumerate(predicted):
        if p in actual and p not in predicted[:i]:
            num_hits += 1.0
            score += num_hits / (i+1.0)

    if not actual:
        return 0.0

    return score / min(len(actual), k)

def mapk(actual, predicted, k=10):
    """
    Computes the mean average precision at k.

    This function computes the mean average prescision at k between two lists
    of lists of items.

    Parameters
    ----------
    actual : list
             A list of lists of elements that are to be predicted
             (order doesn't matter in the lists)
    predicted : list
                A list of lists of predicted elements
                (order matters in the lists)
    k : int, optional
        The maximum number of predicted elements

    Returns
    -------
    score : double
            The mean average precision at k over the input lists

    """
    return np.mean([apk(a,p,k) for a,p in zip(actual, predicted)])

# Read input data

In [25]:
%%time
import pandas as pd
import tqdm as tqdm
pad = "/kaggle/input/makeparquet"
transactions = pd.read_parquet(pad+'/transactions_train.parquet')
customers = pd.read_parquet(pad+'/customers.parquet')
articles = pd.read_parquet(pad+'/articles.parquet')

CPU times: user 3.73 s, sys: 3.29 s, total: 7.01 s
Wall time: 2.96 s


# Validation last x week information

In [26]:
val = transactions[transactions['week'] >= transactions['week'].max()]


In [27]:
positive_items_val = val.groupby(['customer_id'])['article_id'].apply(list)

In [28]:
# creating validation set for metrics use case
val_users = positive_items_val.keys()
val_items = []

for i,user in (enumerate(val_users)):
    val_items.append(positive_items_val[user])
    
print("Total users in validation:", len(val_users))

Total users in validation: 68984


In [29]:
last_x_weeks = 1

In [30]:
import pandas as pd

# Step 1: Filter transactions for the last week
last_week_transactions = transactions[transactions['week'] >= transactions['week'].max()-(last_x_weeks+1)]

# Step 2: Group transactions by 'article_id' and count unique 'customer_id'
article_customer_count = last_week_transactions.groupby('article_id')['customer_id'].nunique().reset_index(name='customer_count')

# Step 3: Sort articles based on customer count in descending order
sorted_articles = article_customer_count.sort_values(by='customer_count', ascending=False)

# Step 4: Take the top 12 articles
top_12_articles = sorted_articles.head(12)

# Display the result
print(top_12_articles)


       article_id  customer_count
24733   918522001            1516
23926   909370001            1514
7135    751471001            1502
25121   924243001            1472
24711   918292001            1403
24515   915529003            1398
24512   915526001            1392
7152    751471043            1256
604     448509014            1237
4936    706016001            1170
17617   863595006            1158
17963   865799006            1127


In [31]:
x = (top_12_articles.article_id.to_list())


In [32]:
sub = pd.read_csv('/kaggle/input/h-and-m-personalized-fashion-recommendations/sample_submission.csv')

In [33]:
latest_bought_articles=[]
for i in range(len(sub)):
    latest_bought_articles.append(x)

In [34]:
print("mAP Score on Validation set:", mapk(val_items, latest_bought_articles))

mAP Score on Validation set: 0.00863305437926649


# get last x week information

In [35]:
import pandas as pd

# Step 1: Filter transactions for the last week
last_week_transactions = transactions[transactions['week'] >= transactions['week'].max()-last_x_weeks]

# Step 2: Group transactions by 'article_id' and count unique 'customer_id'
article_customer_count = last_week_transactions.groupby('article_id')['customer_id'].nunique().reset_index(name='customer_count')

# Step 3: Sort articles based on customer count in descending order
sorted_articles = article_customer_count.sort_values(by='customer_count', ascending=False)

# Step 4: Take the top 12 articles
top_12_articles = sorted_articles.head(12)

# Display the result
print(top_12_articles)


       article_id  customer_count
20744   909370001            1474
21887   924243001            1364
21513   918522001            1212
15285   865799006            1020
5999    751471001             905
21495   918292001             895
21866   923758001             891
21305   915529003             872
535     448509014             844
6706    762846027             837
21888   924243002             830
3116    673677002             757


In [36]:
x = (top_12_articles.article_id.to_list())


# Make submission

In [37]:
sub = pd.read_csv('/kaggle/input/h-and-m-personalized-fashion-recommendations/sample_submission.csv')

In [38]:
latest_bought_articles=[]
for i in range(len(sub)):
    latest_bought_articles.append(x)

In [39]:
preds = [' '.join(['0' + str(p) for p in ps]) for ps in latest_bought_articles]
sub.prediction = preds

In [40]:
sub_name = 'popularityLastWeek_submission'
sub.to_csv(f'{sub_name}.csv.gz', index=False)