### This notebook was originally written by Noah Daniels implementing Radek's LGBMRanker algorithm for this competition
I adapted this notebook to work with my LSTM model and extra engineered features

In [1]:
import numpy as np
import pandas as pd
from lightgbm.sklearn import LGBMRanker

# make external scripts auto reload
%load_ext autoreload
%autoreload 2

import torch
from tqdm.notebook import trange, tqdm
import pickle

import sklearn.metrics as metrics

from experiment_template import *
from LSTMRecommender import *

In [2]:
BASE_PATH = '../data/'

# make sure the same data preprocessing as in the radek notebook have been performed
# (see 02 FE/DataProcessingRadek.ipynb)
transactions = pd.read_parquet(BASE_PATH + 'transactions_train.parquet')
customers = pd.read_parquet(BASE_PATH + 'customers.parquet')
articles = pd.read_parquet(BASE_PATH + 'articles.parquet')
sample_submission = pd.read_csv(BASE_PATH + 'sample_submission.csv')

In [3]:
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
SEQUENCE_COLUMN = "prod_name"
# SEQUENCE_COLUMN = "product_type_name"
N_PREDICTIONS = 6
MOST_POPULAR_VALUE = articles[articles["article_id"] == transactions["article_id"] \
                              .value_counts().index[0]][SEQUENCE_COLUMN].item()
EMBEDDING_DIM = 64
HIDDEN_DIM = 100

BATCH_SIZE = 512

PADDING_ARTICLE = articles[SEQUENCE_COLUMN].nunique()

NUM_ARTICLES_IN_SEQUENCE = 12
N_ARTICLES = articles[SEQUENCE_COLUMN].nunique()

model = LSTMRecommender(
    input_dim=NUM_ARTICLES_IN_SEQUENCE,
    embedding_dim=EMBEDDING_DIM,
    hidden_dim=HIDDEN_DIM,
    # Output dim is only the number of articles while n_articles is for the embedding and has to include the padding
    n_articles=N_ARTICLES+1,
    bidirectional=False,
    num_layers=1,
    dropout=0.2
    )

model.load_state_dict(torch.load(f"./models/{SEQUENCE_COLUMN}_model/LSTM_Model_Epoch_19.pt"))
model.to(device)

LSTMRecommender(
  (embedding): Embedding(45876, 64)
  (lstm): LSTM(64, 100, batch_first=True)
  (fc): Linear(in_features=100, out_features=45876, bias=True)
  (dropout): Dropout(p=0.2, inplace=False)
)

In [4]:
def get_padded_sequence(sequence, max_length=12, padding_value=PADDING_ARTICLE):
    if len(sequence) > max_length:
        return sequence[-max_length:]
    else:
        return [padding_value] * (max_length - len(sequence)) + sequence

In [5]:
def add_lstm_ranker_features(data, test_week, lstm_data):
    data = pd.merge(data, articles, how='left', on='article_id')
    model.eval()
    
    data["lstm_score"] = 0.0

    # Define batches
    unique_customers = data["customer_id"].unique()
    n_batches = int(np.ceil(len(unique_customers) / BATCH_SIZE))

    for batch_idx in tqdm(range(n_batches)):
        batch_customers = unique_customers[batch_idx * BATCH_SIZE : (batch_idx + 1) * BATCH_SIZE]
        sequences = []
        index_to_seq_map = []

        for customer_id in batch_customers:
            customer_data = data[data["customer_id"] == customer_id]
            for week in customer_data["week"].unique():
                previous_weeks = customer_data[customer_data["week"] < week].copy()
                previous_weeks.sort_values("t_dat", inplace=True)
                sequence = get_padded_sequence(lstm_data.get(customer_id, []) + (previous_weeks[SEQUENCE_COLUMN].tolist()))
                sequences.append(sequence)

                current_week_indices = customer_data[customer_data['week'] == week].index
                for idx in current_week_indices:
                    index_to_seq_map.append((idx, len(sequences) - 1))
            
        input_tensor = torch.tensor(sequences, dtype=torch.long, device=device)
        with torch.no_grad():
            logits = model(input_tensor)
            probs = F.softmax(logits, dim=1)

        for idx, seq_idx in index_to_seq_map:
            col_record = data.at[idx, SEQUENCE_COLUMN]
            data.loc[idx, "lstm_score"] = probs[seq_idx, col_record].item()
    # data.week += 1

    return data[["customer_id", "week", "article_id", "lstm_score"]]

In [6]:
# Candidate generation of Radek notebook
def get_data(data, test_week):
    ### Add lstm score
    candidates_lstm = add_lstm_ranker_features(data, test_week, lstm_data)
    data["lstm_score"] = candidates_lstm["lstm_score"]
    
    ### repurchase
    # each week is seen as a basket
    # the items bought in one basket, will be example for the next basket
    # the items bought in the last basket, will be candidates for the test basket
    c2weeks = data.groupby('customer_id')['week'].unique()
    c2weeks2shifted_weeks = {}
    for c_id, weeks in c2weeks.items():
        c2weeks2shifted_weeks[c_id] = {}
        for i in range(weeks.shape[0]-1):
            c2weeks2shifted_weeks[c_id][weeks[i]] = weeks[i+1]
        c2weeks2shifted_weeks[c_id][weeks[-1]] = test_week
    candidates_last_purchase = data.copy()
    weeks = []
    for i, (c_id, week) in enumerate(zip(data['customer_id'], data['week'])):
        weeks.append(c2weeks2shifted_weeks[c_id][week])
    candidates_last_purchase.week=weeks

    ### bestseller
    # if a user bought an item in a given week, the 12 most popular items in the previous week are example for that week
    # the best selling items in the last week are candidates for all users
    mean_price = data \
        .groupby(['week', 'article_id'])['price'].mean()
    
    sales = data \
        .groupby('week')['article_id'].value_counts() \
        .groupby('week').rank(method='dense', ascending=False) \
        .groupby('week').head(12).rename('bestseller_rank').astype('int8')

    bestsellers_previous_week = pd.merge(sales, mean_price, on=['week', 'article_id']).reset_index()
    bestsellers_previous_week.week += 1
    unique_transactions = data \
        .groupby(['week', 'customer_id']) \
        .head(1) \
        .drop(columns=['article_id', 'price']) \
        .copy()
    candidates_bestsellers = pd.merge(
        unique_transactions,
        bestsellers_previous_week,
        on='week',
    )
    test_set_transactions = unique_transactions.drop_duplicates('customer_id').reset_index(drop=True)
    test_set_transactions.week = test_week
    candidates_bestsellers_test_week = pd.merge(
        test_set_transactions,
        bestsellers_previous_week,
        on='week'
    )
    candidates_bestsellers = pd.concat([candidates_bestsellers, candidates_bestsellers_test_week])
    candidates_bestsellers.drop(columns='bestseller_rank', inplace=True)

    ### combine
    d = data.copy()
    d['purchased'] = True
    
    result = pd.concat([
        d, candidates_last_purchase, candidates_bestsellers
    ])
    result.purchased.fillna(False, inplace=True)
    result.drop_duplicates(['customer_id', 'article_id', 'week'], inplace=True)

    result = pd.merge(
        result,
        bestsellers_previous_week[['week', 'article_id', 'bestseller_rank']],
        on=['week', 'article_id'],
        how='left'
    )

    result = result[result.week != result.week.min()]
    result.bestseller_rank.fillna(999, inplace=True)
    result["lstm_score"].fillna(0.0, inplace=True)
    result["lstm_score"] = result["lstm_score"] / result["lstm_score"].max()

    result.sort_values(['week', 'customer_id'], inplace=True)
    result.reset_index(drop=True, inplace=True)
    return result

def get_examples(data, test_week):
    return data[data.week != test_week]

def get_candidates(data, test_week):
    return data[data.week == test_week]

def add_features(data):
    columns_to_use = [
        'article_id', 
        'prod_name',
        'product_type_name',
        'product_type_no', 
        'graphical_appearance_no', 
        'colour_group_code', 
        'perceived_colour_value_id',
        'perceived_colour_master_id', 
        'department_no', 
        'index_code',
        'index_group_no', 
        'section_no', 
        'garment_group_no', 
        'FN', 
        'Active',
        'club_member_status', 
        'fashion_news_frequency', 
        'age', 
        'postal_code',
        'bestseller_rank',
        'discount',
        'price',
        'price_sensitivity',
        "lstm_score"
    ]

    result = data
    result = pd.merge(result, customers, how='left', on='customer_id')
    result = pd.merge(result, articles, how='left', on='article_id')

    # features from assignment 2 could go here

    # Clipping price
    result["price"] = result["price"].clip(0, 0.17)

    customer_avg_price = transactions.groupby('customer_id')['price'].mean().to_frame('preferred_price')
    result = pd.merge(result, customer_avg_price, how="left", on="customer_id")

    # Customer price senstivity
    customer_data = result.groupby('customer_id')
    price_sensitivity = customer_data['price'].std() / customer_data['price'].mean()
    price_sensitivity_df = pd.DataFrame({'customer_id': price_sensitivity.index, 'price_sensitivity': price_sensitivity.values})
    result = pd.merge(result, price_sensitivity_df, on='customer_id', how='left')

    max_price = transactions.groupby("article_id")["price"].max().reset_index().rename(columns={"price": "max_price"})
    result["discount"] = 1 - result["price"] / result.merge(max_price, on="article_id", how="left")["max_price"]

    
    return result[columns_to_use]

In [7]:
### split into training and testing
# one week is used for testing
# a number of weeks leading up to the test week are used to train the ranker
test_week = 105
num_training_weeks = 10
testing_weeks = np.arange(test_week-num_training_weeks, test_week)
train_data = transactions[transactions.week.isin(testing_weeks)].reset_index(drop=True)

# Get 5 weeks before first training week for lstm
lstm_weeks = np.arange(test_week-num_training_weeks-5, test_week-num_training_weeks)
lstm_data = transactions[transactions.week.isin(lstm_weeks)].reset_index(drop=True)
lstm_data = lstm_data.merge(articles, how='left', on='article_id')
lstm_data = lstm_data.groupby('customer_id')[SEQUENCE_COLUMN].apply(list).reset_index(name='history').set_index('customer_id')['history'].to_dict()

##### assemble training data (positive + negative examples)
each example has at least a customer_id, article_id and whether it was purchased or not (positive/negative)

add_features extracts and adds features to the examples

In [174]:
data = get_data(train_data, test_week)
pickle.dump(data, open(f"./data_lstm_{SEQUENCE_COLUMN}_{test_week}.pkl", "wb"))

In [175]:
data = pickle.load(open(f"./data_lstm_{SEQUENCE_COLUMN}_{test_week}.pkl", "rb"))
train_examples = get_examples(data, test_week)
X_train = add_features(train_examples)
Y_train = train_examples['purchased']

In [176]:
### fit ranker
# training_groups tells LGBM that each (week, customer_id) combination is a seperate basket
# !!! it is important that the training_examples are sorted according to week, customer_id for this to work
ranker = LGBMRanker(
    force_row_wise=True,
    objective="lambdarank",
    metric="ndcg",
    boosting_type="dart",
    n_estimators=1,
    importance_type='gain',
    verbose=10
)
train_groups = train_examples.groupby(['week', 'customer_id'])['article_id'].count().values
ranker.fit(X_train, Y_train, group=train_groups)
print_importance(ranker, X_train.columns)

[LightGBM] [Debug] Dataset::GetMultiBinFromAllFeatures: sparse rate 0.139723
[LightGBM] [Info] Total Bins 1755
[LightGBM] [Info] Number of data points in the train set: 11557594, number of used features: 21
[LightGBM] [Debug] Trained a tree with leaves = 31 and depth = 9
               bestseller_rank 0.99602
                    lstm_score 0.00255
                     prod_name 0.00043
              garment_group_no 0.00041
                    article_id 0.00022
                           age 0.00011
               product_type_no 0.00009
            club_member_status 0.00005
                   postal_code 0.00004
        fashion_news_frequency 0.00003
             product_type_name 0.00003
                        Active 0.00000
                            FN 0.00000
                    section_no 0.00000
                    index_code 0.00000
                 department_no 0.00000
    perceived_colour_master_id 0.00000
     perceived_colour_value_id 0.00000
             colour_group_

In [177]:
### test
# candidates are generated similarly to the examples, only we don't know whether they are purchased
# the same features are extracted and added
# each candidate is scored by the ranker and predictions are generated using the highest scoring candidates
test_candidates = get_candidates(data, test_week)
X_test = add_features(test_candidates)
predictions = get_predictions(test_candidates, X_test, ranker, 12)

## LSTM prediction replacement part

In [None]:
# Get purchases in testing weeks as a list sorted by date
# Merge articles into transactions
df = pd.merge(transactions, articles, how='left', on='article_id')
df = df.sort_values(by=['customer_id', 't_dat'], ascending=[True, True])

# Group by "customer_id" and get the last 12 transactions for each customer
df_grouped = df.groupby('customer_id')[SEQUENCE_COLUMN].apply(list)
transactions_filtered = pd.DataFrame({"customer_id": df_grouped.index, "sequence": df_grouped.apply(lambda x: x[-12:])})

In [None]:
transactions_filtered.reset_index(drop=True, inplace=True)

In [None]:
history_dataset = TransactionsDataset(transactions_filtered, PADDING_ARTICLE, NUM_ARTICLES_IN_SEQUENCE)
history_dataloader = DataLoader(history_dataset, batch_size=BATCH_SIZE, shuffle=False)

In [None]:
def sample_lstm(logits, temperature=1.0):
    scaled = logits / temperature
    probabilities = F.softmax(scaled, dim=1)
    return torch.multinomial(probabilities, 1)

In [None]:
preds = []
counter = 0
history_batches = []
history_batch = []

for idx, sequences in enumerate(tqdm(history_dataloader)):
    sequences = sequences.to(device)
    
    for i in range(N_PREDICTIONS):
        # Pass padded batches to the model
        with torch.no_grad():
            out = model(sequences[:, -12:])
            out = sample_lstm(out, temperature=0.1)
            # out = torch.argmax(out, dim=1).unsqueeze(1)

        # Append model's output to each transaction in the batch
        sequences = torch.cat((sequences, out), dim=1)
    for i in range(sequences.shape[0]):
        preds.append(sequences[i, -N_PREDICTIONS:].tolist())
transactions_filtered["preds"] = preds

In [None]:
def replace_last_k(predictions, k):
    most_popular_filtered = df.groupby([SEQUENCE_COLUMN, "article_id"]).size().reset_index(name="count").sort_values([SEQUENCE_COLUMN, "count"], ascending=[True, False])
    most_popular_filtered = most_popular_filtered.groupby(SEQUENCE_COLUMN)["article_id"].apply(list).reset_index(name="articles")
    popular_articles_dict = dict(zip(most_popular_filtered[SEQUENCE_COLUMN], most_popular_filtered["articles"]))

    # lstm preds to dict
    lstm_prediction_dict = transactions_filtered.set_index("customer_id")["preds"].to_dict()

    for idx, (customer_id, prediction) in tqdm(predictions.iterrows(), total=predictions.shape[0]):
        new_preds = prediction[:-k]
        lstm_preds = lstm_prediction_dict.get(customer_id, [])

        for i in range(k):
            cat_value = lstm_preds[i] if i < len(lstm_preds) and lstm_preds[i] != PADDING_ARTICLE else MOST_POPULAR_VALUE
            for article in popular_articles_dict.get(cat_value, []):
                if article not in new_preds:
                    new_preds.append(article)
                    break

        if len(new_preds) != 12:
            new_preds += prediction[-k:]
            new_preds = new_preds[:12]
        predictions.at[idx, "prediction"] = new_preds

In [None]:
predictions.head(5)

In [None]:
# replace_most_popular(predictions, 5)
replace_last_k(predictions, N_PREDICTIONS)

In [None]:
predictions.head(5)

In [178]:
def recall(preds, purchases, k=12):
    df = pd.merge(preds, purchases, how='left', on='customer_id')
    # Recall = TP / (TP + FN) => TP / 12
    return df.apply(lambda x: len(set(x["prediction"][:k]).intersection(x["purchases"])) / k, axis=1).mean()

def hits(preds, purchases, k=12):
    df = pd.merge(preds, purchases, how='left', on='customer_id')
    # Hits = TP / (TP + FP) => TP / 12
    return df.apply(lambda x: len(set(x["prediction"][:k]).intersection(x["purchases"])), axis=1).mean()

In [179]:
### evaluate
if test_week < transactions.week.max() + 1:
    # get ground truth data for test week
    purchases = get_purchases(transactions[transactions.week == test_week])
    
    # fill missing prediction for customers in test set with popular items in last week
    # only for customers in test set because only those are evaluated
    popular = transactions[transactions.week == test_week-1].article_id.value_counts().head(12).index.values
    predictions = fill_missing_predictions(predictions, purchases.customer_id, popular)
    
    # calculate score
    score = mean_average_precision(predictions, purchases, 12)
    print(f"MAP@12: {score:.4f}    Recall@12: {recall(purchases, predictions):.4f}    Hits@12: {hits(purchases, predictions):.4f}")

### submit
else:
    # fill missing predictions for all customers with popular items in last week
    # all customers because we don't know which ones will be evaluated
    popular = transactions[transactions.week == test_week-1].article_id.value_counts().head(12).index.values
    predictions = fill_missing_predictions(predictions, customers.customer_id, popular)

    # write submission
    sub = create_submission(predictions, sample_submission)
    sub.to_csv(BASE_PATH + 'sub1.csv.gz', index=False)

MAP@12: 0.0256    Recall@12: 0.0102    Hits@12: 0.1225
