In [6]:
import numpy as np
import pandas as pd
from lightgbm.sklearn import LGBMRanker

# make external scripts auto reload
%load_ext autoreload
%autoreload 2

from experiment_template import *

The autoreload extension is already loaded. To reload it, use:
  %reload_ext autoreload


In [7]:
BASE_PATH = '../input/'

# make sure the same data preprocessing as in the radek notebook have been performed
# (see 02 FE/DataProcessingRadek.ipynb)
transactions = pd.read_parquet(BASE_PATH + 'transactions_train.parquet')
customers = pd.read_parquet(BASE_PATH + 'customers.parquet')
articles = pd.read_parquet(BASE_PATH + 'articles.parquet')
sample_submission = pd.read_csv(BASE_PATH + 'sample_submission.csv')

In [8]:
# Candidate generation of Radek notebook
def get_data(data, test_week):
    ### repurchase
    # each week is seen as a basket
    # the items bought in one basket, will be example for the next basket
    # the items bought in the last basket, will be candidates for the test basket
    c2weeks = data.groupby('customer_id')['week'].unique()
    c2weeks2shifted_weeks = {}
    for c_id, weeks in c2weeks.items():
        c2weeks2shifted_weeks[c_id] = {}
        for i in range(weeks.shape[0]-1):
            c2weeks2shifted_weeks[c_id][weeks[i]] = weeks[i+1]
        c2weeks2shifted_weeks[c_id][weeks[-1]] = test_week
    candidates_last_purchase = data.copy()
    weeks = []
    for i, (c_id, week) in enumerate(zip(data['customer_id'], data['week'])):
        weeks.append(c2weeks2shifted_weeks[c_id][week])
    candidates_last_purchase.week=weeks

    ### bestseller
    # if a user bought an item in a given week, the 12 most popular items in the previous week are example for that week
    # the best selling items in the last week are candidates for all users
    mean_price = data \
        .groupby(['week', 'article_id'])['price'].mean()
    sales = data \
        .groupby('week')['article_id'].value_counts() \
        .groupby('week').rank(method='dense', ascending=False) \
        .groupby('week').head(12).rename('bestseller_rank').astype('int8')
    bestsellers_previous_week = pd.merge(sales, mean_price, on=['week', 'article_id']).reset_index()
    bestsellers_previous_week.week += 1
    unique_transactions = data \
        .groupby(['week', 'customer_id']) \
        .head(1) \
        .drop(columns=['article_id', 'price']) \
        .copy()
    candidates_bestsellers = pd.merge(
        unique_transactions,
        bestsellers_previous_week,
        on='week',
    )
    test_set_transactions = unique_transactions.drop_duplicates('customer_id').reset_index(drop=True)
    test_set_transactions.week = test_week
    candidates_bestsellers_test_week = pd.merge(
        test_set_transactions,
        bestsellers_previous_week,
        on='week'
    )
    candidates_bestsellers = pd.concat([candidates_bestsellers, candidates_bestsellers_test_week])
    candidates_bestsellers.drop(columns='bestseller_rank', inplace=True)

    ### combine
    d = data.copy()
    d['purchased'] = True
    
    result = pd.concat([
        d, candidates_last_purchase, candidates_bestsellers
    ])
    result.purchased.fillna(False, inplace=True)
    result.drop_duplicates(['customer_id', 'article_id', 'week'], inplace=True)

    result = pd.merge(
        result,
        bestsellers_previous_week[['week', 'article_id', 'bestseller_rank']],
        on=['week', 'article_id'],
        how='left'
    )
    result = result[result.week != result.week.min()]
    result.bestseller_rank.fillna(999, inplace=True)

    result.sort_values(['week', 'customer_id'], inplace=True)
    result.reset_index(drop=True, inplace=True)
    return result
    

# these functions don't necessarily need to use the same underlying data function, but this is how Radek did it
# !!! it is important that the examples are sorted according to (week, customer_id) for LGBM ranker
def get_examples(data, test_week):
    data = get_data(data, test_week)
    return data[data.week != test_week]

def get_candidates(data, test_week):
    data = get_data(data, test_week)
    return data[data.week == test_week]

def add_features(data):
    columns_to_use = [
        'article_id', 
        'product_type_no', 
        'graphical_appearance_no', 
        'colour_group_code', 
        'perceived_colour_value_id',
        'perceived_colour_master_id', 
        'department_no', 
        'index_code',
        'index_group_no', 
        'section_no', 
        'garment_group_no', 
        'FN', 
        'Active',
        'club_member_status', 
        'fashion_news_frequency', 
        'age', 
        'postal_code',
        'bestseller_rank'
    ]

    result = data
    result = pd.merge(result, customers, how='left', on='customer_id')
    result = pd.merge(result, articles, how='left', on='article_id')

    # features from assignment 2 could go here
    customer_avg_price = transactions.groupby('customer_id')['price'].mean().to_frame('preferred_price')
    result = pd.merge(result, customer_avg_price, how="left", on="customer_id")
    
    return result[columns_to_use]

In [9]:
import networkx as nx
import matplotlib.pyplot as plt

user_item_graph = nx.Graph()

user_ids = customers['customer_id'].unique()
article_ids = articles['article_id'].unique()

user_item_graph.add_nodes_from(user_ids, bipartite=0)
user_item_graph.add_nodes_from(article_ids, bipartite=1)

# # edges representing interactions (e.g., purchases) between customers and articles
# for _, transaction in transactions.iterrows():
#     user_id = transaction['customer_id']
#     article_id = transaction['article_id']
#     user_item_graph.add_edge(user_id, article_id)

# Create a DataFrame with unique user-article interactions
interactions = transactions[['customer_id', 'article_id']].drop_duplicates()
# Add edges representing interactions
user_item_graph.add_edges_from(interactions.to_records(index=False))

print('Number of nodes: %d' % user_item_graph.number_of_nodes())
print('Number of edges: %d' % user_item_graph.number_of_edges())
# nx.draw(user_item_graph)

Number of nodes: 1477522
Number of edges: 27306439


In [13]:
import torch
import torch.nn as nn
import torch.optim as optim
import numpy as np
from sklearn.model_selection import train_test_split

# Convert the interaction graph to a sparse matrix
from scipy.sparse import csr_matrix

num_users = len(user_ids)
num_items = len(article_ids)

# Create a mapping from user and article IDs to non-negative integers
user_id_to_index = {customer_id: i for i, customer_id in enumerate(user_ids)}
item_id_to_index = {article_id: i for i, article_id in enumerate(article_ids)}

# Update the interaction DataFrame with non-negative integer indices
interactions['customer_id'] = interactions['customer_id'].map(user_id_to_index)
interactions['article_id'] = interactions['article_id'].map(item_id_to_index)

# Create a CSR sparse matrix
interaction_matrix = csr_matrix(
    (np.ones(len(interactions), dtype=np.float32), (interactions['customer_id'], interactions['article_id'])),
    shape=(len(user_ids), len(article_ids))
)

# Split data into training and validation
train_data, val_data = train_test_split(interaction_matrix, test_size=0.2, random_state=42)

# LightGCN Model
class LightGCN(nn.Module):
    def __init__(self, num_users, num_items, embed_dim):
        super(LightGCN, self).__init__()
        self.embed_dim = embed_dim
        self.embedding_user = nn.Embedding(num_users, embed_dim)
        self.embedding_item = nn.Embedding(num_items, embed_dim)
        
    def forward(self, interaction_matrix):
        user_embedding = self.embedding_user.weight
        item_embedding = self.embedding_item.weight
        embeddings = torch.cat([user_embedding, item_embedding], dim=0)
        
        user_output = interaction_matrix @ embeddings
        item_output = interaction_matrix.t() @ embeddings
        
        return user_output, item_output

# Instantiate the model
embed_dim = 64
model = LightGCN(num_users, num_items, embed_dim)

# Loss function and optimizer
criterion = nn.MSELoss()
optimizer = optim.Adam(model.parameters(), lr=0.001)

# Training loop
num_epochs = 10
for epoch in range(num_epochs):
    user_output, item_output = model(interaction_matrix)
    loss = criterion(user_output, interaction_matrix) + criterion(item_output, interaction_matrix.t())
    optimizer.zero_grad()
    loss.backward()
    optimizer.step()


  arr = np.asarray(values, dtype=dtype)


ValueError: negative row index found

In [None]:
# Function to generate personalized candidates for a list of customers using LightGCN
def generate_personalized_candidates_lightgcn(model, customer_ids, num_candidates):
    personalized_candidates = {}

    user_embeddings, _ = model(interaction_matrix)
    
    for customer_id in customer_ids:
        user_idx = np.where(user_ids == customer_id)[0][0]  # Find the index of the user in the user_ids array
        user_embedding = user_embeddings[user_idx]
        
        # Calculate item scores by dot product of user embedding and item embeddings
        item_scores = torch.matmul(user_embedding, model.embedding_item.weight.t())
        
        # Get the indices of the top-k items with the highest scores
        top_item_indices = torch.topk(item_scores, num_candidates).indices
        
        # Map item indices back to article IDs
        recommended_items = [article_ids[i] for i in top_item_indices]
        personalized_candidates[customer_id] = recommended_items

    return personalized_candidates

# Usage example
customer_ids = user_ids
personalized_candidates_lightgcn = generate_personalized_candidates_lightgcn(model, customer_ids, num_candidates=100)


In [None]:
personalized_candidates_lightgcn

In [None]:
### split into training and testing
# one week is used for testing
# a number of weeks leading up to the test week are used to train the ranker
test_week = 104
num_training_weeks = 10
testing_weeks = np.arange(test_week-num_training_weeks, test_week)
train_data = transactions[transactions.week.isin(testing_weeks)].reset_index(drop=True)

### assemble training data (positive + negative examples)
# each example has at least a customer_id, article_id and whether it was purchased or not (positive/negative)
# add_features extracts and adds features to the examples
train_examples = get_examples(train_data, test_week)
X_train = add_features(train_examples)
Y_train = train_examples['purchased']

### fit ranker
# training_groups tells LGBM that each (week, customer_id) combination is a seperate basket
# !!! it is important that the training_examples are sorted according to week, customer_id for this to work
ranker = LGBMRanker(
    force_row_wise=True,
    objective="lambdarank",
    metric="ndcg",
    boosting_type="dart",
    n_estimators=1,
    importance_type='gain',
    verbose=10
)
train_groups = train_examples.groupby(['week', 'customer_id'])['article_id'].count().values
ranker.fit(X_train, Y_train, group=train_groups)
print_importance(ranker, X_train.columns)

### test
# candidates are generated similarly to the examples, only we don't know whether they are purchased
# the same features are extracted and added
# each candidate is scored by the ranker and predictions are generated using the highest scoring candidates
test_candidates = get_candidates(train_data, test_week)
X_test = add_features(test_candidates)
predictions = get_predictions(test_candidates, X_test, ranker, 12)

### evaluate
if test_week < transactions.week.max() + 1:
    # get ground truth data for test week
    purchases = get_purchases(transactions[transactions.week == test_week])
    
    # fill missing prediction for customers in test set with popular items in last week
    # only for customers in test set because only those are evaluated
    popular = transactions[transactions.week == test_week-1].article_id.value_counts().head(12).index.values
    predictions = fill_missing_predictions(predictions, purchases.customer_id, popular)
    
    # calculate score
    score = mean_average_precision(predictions, purchases, 12)
    print(score)

### submit
else:
    # fill missing predictions for all customers with popular items in last week
    # all customers because we don't know which ones will be evaluated
    popular = transactions[transactions.week == test_week-1].article_id.value_counts().head(12).index.values
    predictions = fill_missing_predictions(predictions, customers.customer_id, popular)

    # write submission
    sub = create_submission(predictions)
    sub.to_csv(BASE_PATH + 'sub1.csv.gz', index=False)

Scores from using various weeks as the test week:

+ 105: 0.02087 (kaggle)
+ 104: 0.025080605661718477
+ 103: 0.023774082148643252
+ 102: 0.022159069556621
+ 101: 0.01881722188115503
+ 100: 0.019754936922870146

I am pretty sure that my implementation of MAP@12 is correct and these deviations are due to noise in the dataset. The submission generated by this code for week 105 has the same score as the submission from the Radek notebook.