# Prune the leaderbord by just giving recommendations for customers that only bought 1 article

In [1]:
import numpy as np
import pandas as pd

def apk(actual, predicted, k=10):
    """
    Computes the average precision at k.

    This function computes the average prescision at k between two lists of
    items.

    Parameters
    ----------
    actual : list
             A list of elements that are to be predicted (order doesn't matter)
    predicted : list
                A list of predicted elements (order does matter)
    k : int, optional
        The maximum number of predicted elements

    Returns
    -------
    score : double
            The average precision at k over the input lists

    """
    if len(predicted)>k:
        predicted = predicted[:k]

    score = 0.0
    num_hits = 0.0

    for i,p in enumerate(predicted):
        if p in actual and p not in predicted[:i]:
            num_hits += 1.0
            score += num_hits / (i+1.0)

    if not actual:
        return 0.0

    return score / min(len(actual), k)

def mapk(actual, predicted, k=10):
    """
    Computes the mean average precision at k.

    This function computes the mean average prescision at k between two lists
    of lists of items.

    Parameters
    ----------
    actual : list
             A list of lists of elements that are to be predicted
             (order doesn't matter in the lists)
    predicted : list
                A list of lists of predicted elements
                (order matters in the lists)
    k : int, optional
        The maximum number of predicted elements

    Returns
    -------
    score : double
            The mean average precision at k over the input lists

    """
    return np.mean([apk(a,p,k) for a,p in zip(actual, predicted)])
from sklearn.base import BaseEstimator, TransformerMixin
import numpy as np

# https://www.kaggle.com/c/h-and-m-personalized-fashion-recommendations/discussion/308635
def customer_hex_id_to_int(series):
    return series.str[-16:].apply(hex_id_to_int)

def hex_id_to_int(str):
    return int(str[-16:], 16)

def article_id_str_to_int(series):
    return series.astype('int32')

def article_id_int_to_str(series):
    return '0' + series.astype('str')

class Categorize(BaseEstimator, TransformerMixin):
    def __init__(self, min_examples=0):
        self.min_examples = min_examples
        self.categories = []

    def fit(self, X):
        for i in range(X.shape[1]):
            vc = X.iloc[:, i].value_counts()
            self.categories.append(vc[vc > self.min_examples].index.tolist())
        return self

    def transform(self, X):
        data = {X.columns[i]: pd.Categorical(X.iloc[:, i], categories=self.categories[i]).codes for i in range(X.shape[1])}
        return pd.DataFrame(data=data)


def calculate_apk(list_of_preds, list_of_gts):
    # for fast validation this can be changed to operate on dicts of {'cust_id_int': [art_id_int, ...]}
    # using 'data/val_week_purchases_by_cust.pkl'
    apks = []
    for preds, gt in zip(list_of_preds, list_of_gts):
        apks.append(apk(gt, preds, k=12))
    return np.mean(apks)

def eval_sub(sub_csv, skip_cust_with_no_purchases=True):
    sub=pd.read_csv(sub_csv)
    validation_set=pd.read_parquet('data/validation_ground_truth.parquet')

    apks = []

    no_purchases_pattern = []
    for pred, gt in zip(sub.prediction.str.split(), validation_set.prediction.str.split()):
        if skip_cust_with_no_purchases and (gt == no_purchases_pattern): continue
        apks.append(apk(gt, pred, k=12))
    return np.mean(apks)


# Load input

In [2]:
%%time
pad = "/kaggle/input/makeparquet"
transactions = pd.read_parquet(pad+'/transactions_train.parquet')
customers = pd.read_parquet(pad+'/customers.parquet')
articles = pd.read_parquet(pad+'/articles.parquet')


CPU times: user 3.75 s, sys: 4.42 s, total: 8.17 s
Wall time: 5.94 s


# Items bought by most customers

In [3]:
import pandas as pd

# Step 1: Filter transactions for the last week
last_week_transactions = transactions[transactions['week'] >= transactions['week'].max()]

# Step 2: Group transactions by 'article_id' and count unique 'customer_id'
article_customer_count = last_week_transactions.groupby('article_id')['customer_id'].nunique().reset_index(name='customer_count')

# Step 3: Sort articles based on customer count in descending order
sorted_articles = article_customer_count.sort_values(by='customer_count', ascending=False)

# Step 4: Take the top 12 articles
top_12_articles = sorted_articles.head(12)

# Display the result
print(top_12_articles)
article_ids = top_12_articles['article_id'].tolist()



       article_id  customer_count
17308   924243001             763
16961   918522001             569
17309   924243002             533
17287   923758001             513
11827   866731001             481
16773   915529003             456
16264   909370001             444
16774   915529005             426
4496    751471001             418
16944   918292001             402
5030    762846027             397
430     448509014             388


# Only look at customers bying only 1 item

In [4]:
# Count the number of unique article_ids for each customer
customer_article_count = transactions.groupby('customer_id')['article_id'].nunique()

# Get customers with only one unique article_id
customers_with_one_article = customer_article_count[customer_article_count == 1].index

# Filter the customers dataframe
result = customers[customers['customer_id'].isin(customers_with_one_article)]

print(len(result))

154947


In [5]:
print(len(customers))

1371980


In [6]:
sub = pd.read_csv('/kaggle/input/h-and-m-personalized-fashion-recommendations/sample_submission.csv')

# Make submission

In [7]:
latest_bought_articles = pd.read_csv('/kaggle/input/workingpurchase/filename.csv')
latest_bought_articles = latest_bought_articles.values.tolist()

In [8]:
%%time

preds = []
i=-1
for c_id in customer_hex_id_to_int(sub.customer_id):
    i+=1
    if c_id in result['customer_id'].values:
        pred = latest_bought_articles[i]
        while len(pred)<12:
            pred.append(0)
        pred = article_ids
    else:
        zeros = [0,0,0,0,0,0,0,0,0,0,0,0]
        pred = zeros
    #     pred = article_ids

        # Take the top 12 predictions (excluding any replaced '000000000' articles)
    preds.append(pred[:12])

CPU times: user 1min 54s, sys: 499 ms, total: 1min 54s
Wall time: 1min 54s


In [9]:
preds = [' '.join(['0' + str(p) for p in ps]) for ps in preds]
sub.prediction = preds

In [10]:
sub_name = 'bought1articlePop_submission'
sub.to_csv(f'{sub_name}.csv.gz', index=False)