In [1]:
import numpy as np

def apk(actual, predicted, k=10):
    """
    Computes the average precision at k.

    This function computes the average prescision at k between two lists of
    items.

    Parameters
    ----------
    actual : list
             A list of elements that are to be predicted (order doesn't matter)
    predicted : list
                A list of predicted elements (order does matter)
    k : int, optional
        The maximum number of predicted elements

    Returns
    -------
    score : double
            The average precision at k over the input lists

    """
    if len(predicted)>k:
        predicted = predicted[:k]

    score = 0.0
    num_hits = 0.0

    for i,p in enumerate(predicted):
        if p in actual and p not in predicted[:i]:
            num_hits += 1.0
            score += num_hits / (i+1.0)

    if not actual:
        return 0.0

    return score / min(len(actual), k)

def mapk(actual, predicted, k=10):
    """
    Computes the mean average precision at k.

    This function computes the mean average prescision at k between two lists
    of lists of items.

    Parameters
    ----------
    actual : list
             A list of lists of elements that are to be predicted
             (order doesn't matter in the lists)
    predicted : list
                A list of lists of predicted elements
                (order matters in the lists)
    k : int, optional
        The maximum number of predicted elements

    Returns
    -------
    score : double
            The mean average precision at k over the input lists

    """
    return np.mean([apk(a,p,k) for a,p in zip(actual, predicted)])

In [2]:
from sklearn.base import BaseEstimator, TransformerMixin
import numpy as np

# https://www.kaggle.com/c/h-and-m-personalized-fashion-recommendations/discussion/308635
def customer_hex_id_to_int(series):
    return series.str[-16:].apply(hex_id_to_int)

def hex_id_to_int(str):
    return int(str[-16:], 16)

def article_id_str_to_int(series):
    return series.astype('int32')

def article_id_int_to_str(series):
    return '0' + series.astype('str')

class Categorize(BaseEstimator, TransformerMixin):
    def __init__(self, min_examples=0):
        self.min_examples = min_examples
        self.categories = []

    def fit(self, X):
        for i in range(X.shape[1]):
            vc = X.iloc[:, i].value_counts()
            self.categories.append(vc[vc > self.min_examples].index.tolist())
        return self

    def transform(self, X):
        data = {X.columns[i]: pd.Categorical(X.iloc[:, i], categories=self.categories[i]).codes for i in range(X.shape[1])}
        return pd.DataFrame(data=data)


def calculate_apk(list_of_preds, list_of_gts):
    # for fast validation this can be changed to operate on dicts of {'cust_id_int': [art_id_int, ...]}
    # using 'data/val_week_purchases_by_cust.pkl'
    apks = []
    for preds, gt in zip(list_of_preds, list_of_gts):
        apks.append(apk(gt, preds, k=12))
    return np.mean(apks)

def eval_sub(sub_csv, skip_cust_with_no_purchases=True):
    sub=pd.read_csv(sub_csv)
    validation_set=pd.read_parquet('data/validation_ground_truth.parquet')

    apks = []

    no_purchases_pattern = []
    for pred, gt in zip(sub.prediction.str.split(), validation_set.prediction.str.split()):
        if skip_cust_with_no_purchases and (gt == no_purchases_pattern): continue
        apks.append(apk(gt, pred, k=12))
    return np.mean(apks)

In [3]:
import pandas as pd

# Read input files

In [4]:
%%time
pad = "/kaggle/input/makeparquet"
transactions = pd.read_parquet(pad+'/transactions_train.parquet')
customers = pd.read_parquet(pad+'/customers.parquet')
articles = pd.read_parquet(pad+'/articles.parquet')

CPU times: user 2.13 s, sys: 2.34 s, total: 4.47 s
Wall time: 5.19 s


# Choose how many last weeks to look at

In [5]:
how_many_last_weeks = 4

In [6]:
how_many_last_weeks -= 1
test_week = transactions.week.max() + 1
transactions = transactions[transactions.week >= transactions.week.max() - how_many_last_weeks]

In [7]:
sub = pd.read_csv('/kaggle/input/h-and-m-personalized-fashion-recommendations/sample_submission.csv')

# Find the last 12 owned items in the timeframe for each customers fill up with 0000

In [8]:
# Create a dictionary of owned articles for each customer
owned_articles = transactions.groupby('customer_id')['article_id'].apply(list).to_dict()

def get_latest_bought_articles2(customer_id, owned_articles):
    new_predictions = []
    bought_articles = owned_articles.get(customer_id, [])
    
    # Create a DataFrame with the bought articles for the specific customer
    customer_transactions = transactions[(transactions['customer_id'] == customer_id) & (transactions['article_id'].isin(bought_articles))]
    
    # Sort by purchase date in descending order
    customer_transactions = customer_transactions.sort_values(by='t_dat', ascending=False)
    
    # Take the latest 12 unique bought articles
    latest_bought = customer_transactions['article_id'].unique()[:12]
    
    for i in range(12):
        if i<len(latest_bought):
            new_predictions.append(latest_bought[i])
        else:
            new_predictions.append('000000000')
    return new_predictions


In [9]:
%%time
from tqdm import tqdm
preds = []
for c_id in tqdm(customer_hex_id_to_int(sub.customer_id)):
    
    pred = get_latest_bought_articles2(c_id, owned_articles)
    preds.append(pred[:12])

100%|██████████| 1371980/1371980 [48:26<00:00, 471.97it/s]

CPU times: user 48min 28s, sys: 11.9 s, total: 48min 40s
Wall time: 48min 27s





# Write Dictionary away

In [10]:
df = pd.DataFrame(preds)
df.to_csv(f"repurchase{how_many_last_weeks}Weeks.csv", index=False)

# Make Submission

In [11]:
preds2 = [' '.join(['0' + str(p) for p in ps]) for ps in preds]
sub.prediction = preds2

In [12]:
sub_name = f"Repurchase_sub_{how_many_last_weeks+1}weeks"
sub.to_csv(f'{sub_name}.csv.gz', index=False)