In [1]:
import matplotlib
import seaborn as sns
from matplotlib import pyplot as plt

import torch
import torch.nn as nn
import torch.nn.functional as F
from torch.utils.data import Dataset, DataLoader

import pandas as pd

# Compute device for the notebook: CUDA when available, otherwise the CPU.
device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')

In [2]:
def filter_transactions_by_weeks(transactions, column="article_id_mapped", max_week=None, n_training_weeks=None):
    """
    Build one history list per customer from the training weeks.

    Keeps the rows whose ``week`` lies in
    ``[max_week - n_training_weeks, max_week - 1]`` (both ends inclusive) and
    collects the values of the selected column per customer.

    Args:
        transactions (DataFrame): Must contain ``customer_id``, ``week`` and the selected value column.
        column (str): Column whose values make up the history. ``"article_id"`` is redirected to ``"article_id_mapped"``.
        max_week (int, optional): First week *after* the training window. Defaults to the notebook-level ``MAX_WEEK``.
        n_training_weeks (int, optional): Number of weeks in the window. Defaults to the notebook-level ``N_TRAINING_WEEKS``.

    Returns:
        DataFrame: One row per customer with the columns ``customer_id`` and ``history`` (a list of values).
    """
    # Fall back to the notebook configuration only when the caller gave no override,
    # so existing calls keep their behaviour.
    if max_week is None:
        max_week = MAX_WEEK
    if n_training_weeks is None:
        n_training_weeks = N_TRAINING_WEEKS

    col = column if column != "article_id" else "article_id_mapped"
    _transactions = transactions[["customer_id", col, "week"]]

    # Series.between is inclusive on both ends by default.
    in_window = _transactions.week.between(max_week - n_training_weeks, max_week - 1)
    filtered_transactions = _transactions[in_window]

    return (
        filtered_transactions.groupby("customer_id")[col]
        .apply(list)
        .reset_index(name="history")
    )

In [3]:
class LSTMRecommender(nn.Module):
    """
    LSTM Recommender model.
    A simple LSTM model that takes a sequence of feature values and outputs one score (logit) per possible value.

    Args:
        embedding_dim (int): The size of the embedding vector.
        input_dim (int): Stored on the instance only; no layer is sized from it.
        hidden_dim (int): The size of the LSTM hidden state (per direction).
        n_articles (int): The number of possible values for the feature (embedding rows and output logits).
        num_layers (int): The number of stacked LSTM layers.
        bidirectional (bool): Whether the LSTM model is bidirectional or not.
        dropout (float): The dropout probability applied to the LSTM output.
    """
    def __init__(self, embedding_dim, input_dim, hidden_dim, n_articles, num_layers=2, bidirectional=True, dropout=0.2):
        super().__init__()
        self.input_dim = input_dim
        self.hidden_dim = hidden_dim
        self.n_articles = n_articles
        self.n_directions = 2 if bidirectional else 1
        self.num_layers = num_layers

        # Embedding articles to a lower dimension
        self.embedding = nn.Embedding(n_articles, embedding_dim)
        self.lstm = nn.LSTM(embedding_dim, hidden_dim, num_layers=num_layers, batch_first=True, bidirectional=bidirectional)
        # The per-step LSTM output has width hidden_dim * n_directions; the number of
        # stacked layers does not change it. Sizing by num_layers only worked when
        # num_layers happened to equal n_directions (as with the defaults).
        self.fc = nn.Linear(hidden_dim * self.n_directions, n_articles)
        self.dropout = nn.Dropout(dropout)

    def forward(self, x):
        """
        Score every possible value given a batch of index sequences.

        Args:
            x (Tensor): Integer indices; the code indexes the LSTM output as
                (batch, step, feature), so x is expected to be (batch, sequence).

        Returns:
            Tensor: Unnormalised scores of shape (batch, n_articles).
        """
        # Embed
        embedded_sequence = self.embedding(x)
        # Forward propagate LSTM. With no initial state given, nn.LSTM starts from
        # zeros on the input's device/dtype, so no global `device` is needed here.
        out, _ = self.lstm(embedded_sequence)
        # Dropout
        out = self.dropout(out)
        # Decode the output of the last time step.
        # NOTE(review): for a bidirectional LSTM the backward half at the last step
        # has only seen the final element; kept as in the original design.
        return self.fc(out[:, -1, :])

In [4]:
class TransactionsDataset(Dataset):
    """
    Transactions dataset.
    Contains the filtered transactions dataframe to be used in the training process. (to use in the dataloader)

    Every item is the customer's history, left-padded or truncated to its most
    recent entries so that it always has ``num_articles_in_sequence`` elements.

    Args:
        transactions (DataFrame): Filtered transactions dataframe; each row must unpack into (customer_id, history).
        padding_value (int): Padding value, not necessarily article id.
        num_articles_in_sequence (int): Number of articles in a sequence.
    """
    def __init__(self, transactions, padding_value, num_articles_in_sequence):
        self.transactions_df = transactions
        self.padding_value = padding_value
        self.num_articles_in_sequence = num_articles_in_sequence

    def __len__(self):
        return len(self.transactions_df)

    def __getitem__(self, idx):
        # The customer id is not part of the returned sample.
        _customer_id, history = self.transactions_df.iloc[idx]
        history = list(history)

        # Use the configured length for both padding and truncation. The earlier
        # hard-coded 12 produced tensors of varying length for any other setting.
        seq_len = self.num_articles_in_sequence
        n_missing = seq_len - len(history)
        if n_missing > 0:
            # Pad at the front so the most recent purchases stay at the end.
            history = [self.padding_value] * n_missing + history
        return torch.tensor(history[-seq_len:], dtype=torch.int32)

In [5]:
class SequenceDataset(Dataset):
    """
    Sequence dataset.
    Pairs every input sequence with the target stored at the same position.

    Args:
        sequences: Indexable collection of input sequences.
        targets: Indexable collection of targets, aligned with ``sequences``.
    """
    def __init__(self, sequences, targets):
        self.sequences, self.targets = sequences, targets

    def __len__(self):
        # The dataset size follows the sequences; targets are assumed to line up.
        n_samples = len(self.sequences)
        return n_samples

    def __getitem__(self, idx):
        sequence = self.sequences[idx]
        target = self.targets[idx]
        return sequence, target

In [6]:
class HistoryDataset(Dataset):
    """
    History dataset.
    Contains the histories of customers, like the TransactionsDataset, but uses default values

    Args:
        history: Indexable collection of per-customer histories (sequences of ints).
        padding_value (int, optional): Value used for left padding. Defaults to the notebook-level ``PADDING_ARTICLE``.
        num_articles_in_sequence (int, optional): Length of every returned sequence. Defaults to the notebook-level ``NUM_ARTICLES_IN_SEQUENCE``.
    """
    def __init__(self, history, padding_value=None, num_articles_in_sequence=None):
        self.histories = history
        self.padding_value = padding_value
        self.num_articles_in_sequence = num_articles_in_sequence

    def __len__(self):
        return len(self.histories)

    def __getitem__(self, idx):
        # Resolve the notebook defaults lazily (at access time, as before) so the
        # globals only have to exist when no override was supplied.
        padding_value = PADDING_ARTICLE if self.padding_value is None else self.padding_value
        seq_len = NUM_ARTICLES_IN_SEQUENCE if self.num_articles_in_sequence is None else self.num_articles_in_sequence

        history = list(self.histories[idx])
        # One length drives both padding and truncation; the earlier hard-coded 12
        # disagreed with NUM_ARTICLES_IN_SEQUENCE whenever that was not 12.
        n_missing = seq_len - len(history)
        if n_missing > 0:
            history = [padding_value] * n_missing + history
        return torch.tensor(history[-seq_len:], dtype=torch.int32)

In [7]:
def combine_sequences(user_transactions):
    """
    Turn per-customer histories into one shuffled DataLoader of (sequence, target) pairs.

    Args:
        user_transactions (DataFrame): Each row must unpack into (customer, history).

    Returns:
        DataLoader: Shuffled loader over a SequenceDataset with batch size ``BATCH_SIZE``.
    """
    all_sequences, all_targets = [], []

    for _, (_customer, history) in user_transactions.iterrows():
        sequences, targets = create_batch(history)
        # create_batch signals "nothing to learn from" with (None, None).
        if sequences is not None and targets is not None:
            all_sequences.extend(sequences)
            all_targets.extend(targets)

    return DataLoader(
        SequenceDataset(all_sequences, all_targets),
        batch_size=BATCH_SIZE,
        shuffle=True,
    )

In [8]:
def create_batch(history, padding_value=None, sequence_length=None):
    """
    Expand one history into next-item training samples.

    For every position ``i >= 1`` the sample is the (up to) ``sequence_length``
    items preceding ``history[i]``, left-padded to exactly ``sequence_length``,
    and the target is ``history[i]``.

    Args:
        history (list): Ordered item ids of one customer.
        padding_value (int, optional): Value used for left padding. Defaults to the notebook-level ``PADDING_ARTICLE``.
        sequence_length (int, optional): Length of every input sequence. Defaults to the notebook-level ``NUM_ARTICLES_IN_SEQUENCE``.

    Returns:
        tuple: ``(history_batch, target_batch)`` as lists of tensors (int32 sequences,
        float32 scalar targets), or ``(None, None)`` when the history has fewer than two items.
    """
    if len(history) <= 1:
        return None, None

    # Notebook defaults are only looked up when no override was given.
    if padding_value is None:
        padding_value = PADDING_ARTICLE
    if sequence_length is None:
        sequence_length = NUM_ARTICLES_IN_SEQUENCE

    history = list(history)
    history_batch = []
    target_batch = []
    for i in range(1, len(history)):
        # Window and padding come from the same length; the earlier hard-coded 12
        # disagreed with NUM_ARTICLES_IN_SEQUENCE whenever that was not 12.
        window = history[max(0, i - sequence_length):i]
        # Add padding to the beginning of the sequence
        padded = [padding_value] * (sequence_length - len(window)) + window
        history_batch.append(torch.tensor(padded, dtype=torch.int32))
        # Targets are labels, not parameters, so they must not require gradients.
        # The float32 dtype is kept so existing callers see the same dtype.
        target_batch.append(torch.tensor(history[i], dtype=torch.float32))
    return history_batch, target_batch

In [None]:
def customer_hex_id_to_int(series):
    """Convert a Series of hexadecimal customer id strings to integers, using only the last 16 characters of each."""
    trailing_hex = series.str[-16:]
    # Each element is already cut down to its last 16 characters at this point.
    return trailing_hex.apply(lambda digits: int(digits, 16))

def hex_id_to_int(str):
    """Interpret the last 16 characters of ``str`` as a base-16 number and return it as an int."""
    last_sixteen = str[-16:]
    return int(last_sixteen, base=16)

In [None]:
def sample_lstm(logits, temperature=1.0):
    """
    Temperature scaled sampling.

    Divides the logits by ``temperature``, turns them into probabilities with a
    softmax over the last dimension and draws one index per row.

    Args:
        logits (Tensor): Scores of shape (n_classes,) or (batch, n_classes).
        temperature (float): Must be > 0. Values below 1 sharpen the distribution, values above 1 flatten it.

    Returns:
        Tensor: Sampled indices with a trailing dimension of size 1.

    Raises:
        ValueError: If ``temperature`` is not strictly positive.
    """
    # A zero temperature yields inf/NaN probabilities; reject it (and negative
    # values) up front with a readable error.
    if temperature <= 0:
        raise ValueError(f"temperature must be > 0, got {temperature}")

    scaled = logits / temperature
    # dim=-1 equals dim=1 for (batch, n_classes) input and also supports a single 1-D logit vector.
    probabilities = F.softmax(scaled, dim=-1)
    return torch.multinomial(probabilities, 1)

In [None]:
def process_predictions(predictions, articles_from_column, default_value):
    """
    Turn predicted feature values into 12 distinct article ids.

    For each predicted value, the first article of that value that is not yet
    recommended is taken; when the value has none left, the articles of
    ``default_value`` are tried next.
    (i.e. product_type_name -> article_id; the lists are presumably ordered by popularity.)

    Args:
        predictions (iterable): Predicted feature values, one per recommendation slot.
        articles_from_column (dict): Maps a feature value to a list of article ids. It is not modified.
        default_value: Feature value whose articles serve as the fallback.

    Returns:
        list: Exactly 12 distinct article ids.

    Raises:
        AssertionError: If fewer or more than 12 recommendations were produced.
    """
    fallback_articles = articles_from_column.get(default_value, [])
    customer_preds = []

    for pred in predictions:
        own_articles = articles_from_column.get(pred, [])
        # Walk the lists one after the other instead of `own_articles += fallback`:
        # the in-place `+=` extended the list stored inside the caller's dict, so
        # every call made that entry longer.
        if pred != default_value:
            sources = (own_articles, fallback_articles)
        else:
            sources = (own_articles,)

        for article in (candidate for source in sources for candidate in source):
            if article not in customer_preds:
                customer_preds.append(article)
                break

    assert len(customer_preds) == 12, f"Could not generate 12 recommendations with the provided predictions. Only {len(customer_preds)} can be generated"

    return customer_preds

In [None]:
def process_mapped_predictions(predictions, inverse_article_id_map, most_popular):
    """
    Translate model-side article indices back to original article ids.

    A prediction found in ``inverse_article_id_map`` is replaced by its mapped
    id. A prediction without a mapping is replaced by the first entry of
    ``most_popular`` that is not yet recommended.

    Args:
        predictions (iterable): Predicted indices, one per recommendation slot.
        inverse_article_id_map (dict): Maps a model index to the original article id.
        most_popular (iterable): Fallback article ids, tried in order.

    Returns:
        list: Exactly 12 article ids.

    Raises:
        AssertionError: If the resulting list does not have 12 entries.
    """
    customer_preds = []

    for pred in predictions:
        mapped = inverse_article_id_map.get(pred)
        if mapped is None:
            # No mapping for this prediction: use the first unused fallback article.
            for candidate in most_popular:
                if candidate in customer_preds:
                    continue
                customer_preds.append(candidate)
                break
        else:
            customer_preds.append(mapped)

    assert len(customer_preds) == 12, f"Could not generate 12 recommendations with the provided predictions. Only {len(customer_preds)} can be generated"

    return customer_preds

# Util functions for baseline

In [None]:
def get_purchases(transactions):
    """
    Collapse a transactions dataframe into one row per customer, holding the set of article ids that customer bought.

    @param transactions: a dataframe of transactions with the columns customer_id and article_id
    """
    per_customer = transactions.groupby("customer_id", as_index=False)
    purchase_sets = per_customer.article_id.apply(set)
    renamed = purchase_sets.rename(columns={"article_id": "purchases"})
    return renamed[["customer_id", "purchases"]]

In [None]:
def get_predictions(candidates, features, ranker, k=12):
    """
    Score every candidate with the ranker and keep the k highest-scoring articles per customer.
    The candidates dataframe must have the same index as the features dataframe.

    The ranker must have a predict method that takes a dataframe of features and returns a series of scores.

    @candidates: a dataframe of candidates (customer_id, article_id)
    @features: a dataframe of features belonging to the candidates
    @ranker: a trained ranker
    @k: the number of predictions to generate for each customer
    """
    # Work on a copy so the caller's candidates do not gain a score column.
    scored = candidates.copy()
    scored["score"] = ranker.predict(features)

    # Descending sort puts each customer's best-scored candidates first.
    ranked = scored.sort_values(["customer_id", "score"], ascending=False)
    top_k = ranked.groupby("customer_id").head(k)

    article_lists = top_k.groupby("customer_id", as_index=False).article_id.apply(list)
    renamed = article_lists.rename(columns={"article_id": "prediction"})
    return renamed[["customer_id", "prediction"]]

In [None]:
def fill_missing_predictions(predictions, customers, prediction):
    """
    Append a fixed prediction for every requested customer that has no row in the predictions dataframe yet.

    @param predictions: the original predictions dataframe
    @param customers: a list of customer ids for which the prediction should be added if they are missing
    @param prediction: a list of article ids that is to be used as the prediction
    """
    already_covered = set(predictions.customer_id)
    absent = set(customers) - already_covered

    missing_customers = pd.Series(list(absent), name="customer_id")
    # A one-row series holding the whole list; the cross join attaches it to every missing customer.
    fallback = pd.Series([prediction], name="prediction")
    missing_predictions = pd.merge(missing_customers, fallback, how="cross")

    return pd.concat((predictions, missing_predictions))


In [None]:
def mean_average_precision(predictions, purchases, k=12):
    """
    Calculates the mean average precision for a set of predictions and purchases.
    Each row in the predictions and purchases has a customer_id and a list of purchases or predictions.

    Customers that appear in purchases but not in predictions contribute a score of 0.
    Degenerate inputs (no purchases at all, no customer in common, an empty purchase
    collection, k == 0) give 0.0 instead of raising.

    @param predictions: a dataframe of predictions (customer_id, prediction)
    @param purchases: a dataframe of ground truth purchases (customer_id, purchases)
    @param k: only the first k predictions of every customer are evaluated
    @return: the mean average precision at k, averaged over all rows of purchases
    """

    def average_precision(bought, predicted):
        denominator = min(len(bought), k)
        if denominator == 0:
            # Nothing could have been hit; avoid dividing by zero.
            return 0.0

        score = 0.0
        num_hits = 0
        for i, p in enumerate(predicted[:k]):
            # A repeated prediction only counts the first time it appears.
            if p in bought and p not in predicted[:i]:
                num_hits += 1
                score += num_hits / (i + 1)

        return score / denominator

    if len(purchases) == 0:
        return 0.0

    result = pd.merge(purchases, predictions, on="customer_id", how="inner")

    # Iterate the two columns directly: a row-wise DataFrame.apply on an empty
    # merge result does not return a Series and broke the column assignment.
    total = sum(
        average_precision(bought, predicted)
        for bought, predicted in zip(result.purchases, result.prediction)
    )

    return total / len(purchases)

In [None]:
def create_submission(predictions, sample_submission):
    """
    Fill a copy of the sample submission with the predictions of every listed customer.

    Each prediction list is written as a space separated string in which every
    article id is prefixed with a literal "0".

    @param predictions: a dataframe with the columns customer_id (integer form) and prediction (list of article ids)
    @param sample_submission: a dataframe with a customer_id column of hexadecimal id strings
    @return: a copy of sample_submission whose prediction column holds the formatted predictions

    Raises KeyError when a customer of the sample submission has no prediction.
    """
    prediction_by_customer = predictions.set_index("customer_id").prediction.to_dict()
    result = sample_submission.copy()

    # The sample submission stores hex ids; predictions are keyed by the integer form.
    customer_ids = customer_hex_id_to_int(result.customer_id)

    # Bracket assignment always writes a column. Attribute assignment
    # (`result.prediction = ...`) only sets a Python attribute when the column
    # does not exist yet, leaving the returned frame without predictions.
    result["prediction"] = [
        " ".join(f"0{article_id}" for article_id in prediction_by_customer[customer_id])
        for customer_id in customer_ids
    ]
    return result

In [None]:
def print_importance(ranker, features):
    """
    Print every feature with its share of the total importance, most important first.

    @param ranker: a fitted model exposing a feature_importances_ array
    @param features: feature names, indexable by the positions of feature_importances_
    """
    importances = ranker.feature_importances_
    total = importances.sum()
    descending_order = importances.argsort()[::-1]

    for position in descending_order:
        share = importances[position] / total
        print(f"{features[position]:>30} {share:.5f}")