Radek posted about this [here](https://www.kaggle.com/competitions/h-and-m-personalized-fashion-recommendations/discussion/309220), and linked to a GitHub repo with the code.

I just transferred that code here to Kaggle notebooks, that's all.

In [1]:
import numpy as np

def apk(actual, predicted, k=10):
    """
    Computes the average precision at k.

    This function computes the average precision at k between two lists of
    items.

    Parameters
    ----------
    actual : list
             A list of elements that are to be predicted (order doesn't matter)
    predicted : list
                A list of predicted elements (order does matter)
    k : int, optional
        The maximum number of predicted elements

    Returns
    -------
    score : double
            The average precision at k over the input lists; 0.0 when
            ``actual`` is empty (or None)

    """
    # Check emptiness with len() instead of truthiness: the truth value of a
    # NumPy array with more than one element is ambiguous and raises.
    if actual is None or len(actual) == 0:
        return 0.0

    if len(predicted) > k:
        predicted = predicted[:k]

    score = 0.0
    num_hits = 0.0

    for i, p in enumerate(predicted):
        # A repeated prediction is only credited the first time it appears.
        if p in actual and p not in predicted[:i]:
            num_hits += 1.0
            score += num_hits / (i + 1.0)

    return score / min(len(actual), k)

def mapk(actual, predicted, k=10):
    """
    Computes the mean average precision at k.

    Pairs up the entries of ``actual`` and ``predicted`` positionally, scores
    every pair with ``apk`` and averages the resulting scores.

    Parameters
    ----------
    actual : list
             A list of lists of elements that are to be predicted
             (order doesn't matter in the lists)
    predicted : list
                A list of lists of predicted elements
                (order matters in the lists)
    k : int, optional
        The maximum number of predicted elements

    Returns
    -------
    score : double
            The mean average precision at k over the input lists

    """
    per_query_scores = []
    for truth, ranked in zip(actual, predicted):
        per_query_scores.append(apk(truth, ranked, k))
    return np.mean(per_query_scores)

In [2]:
from sklearn.base import BaseEstimator, TransformerMixin
import numpy as np

# https://www.kaggle.com/c/h-and-m-personalized-fashion-recommendations/discussion/308635
def customer_hex_id_to_int(series):
    """Convert a Series of hexadecimal customer id strings to integers.

    Only the last 16 characters of each string are parsed (by
    ``hex_id_to_int``).
    """
    last_sixteen = series.str[-16:]
    return last_sixteen.apply(hex_id_to_int)

def hex_id_to_int(str):
    """Parse the last 16 characters of a hexadecimal string as an integer."""
    # NOTE: the parameter name shadows the builtin `str`; it is kept because it
    # is part of the function's call signature.
    tail = str[-16:]
    return int(tail, base=16)

def article_id_str_to_int(series):
    """Cast a Series of article ids to the ``int32`` dtype."""
    as_int32 = series.astype('int32')
    return as_int32

def article_id_int_to_str(series):
    """Render a Series of integer article ids as strings prefixed with '0'."""
    as_text = series.astype('str')
    return '0' + as_text

class Categorize(BaseEstimator, TransformerMixin):
    """Encode every column of a DataFrame as integer category codes.

    ``fit`` learns, per column position, the values that occur strictly more
    than ``min_examples`` times. ``transform`` replaces each value with its
    position in that learned list; values that are not in the list get the
    code -1 (the code pandas assigns to entries outside the categories).

    Parameters
    ----------
    min_examples : int, optional
        A value must occur more than this many times in the fitted data to
        receive its own category.
    """

    def __init__(self, min_examples=0):
        self.min_examples = min_examples
        # Per-column category lists; replaced on every call to fit().
        self.categories = []

    def fit(self, X, y=None):
        """Learn the category list of every column of ``X``.

        ``y`` is accepted and ignored so the transformer can be used through
        ``fit_transform(X, y)`` and inside pipelines.
        """
        # Build a fresh list so that refitting does not append to (and
        # misalign with) the categories learned by a previous fit.
        categories = []
        for i in range(X.shape[1]):
            vc = X.iloc[:, i].value_counts()
            categories.append(vc[vc > self.min_examples].index.tolist())
        self.categories = categories
        return self

    def transform(self, X, y=None):
        """Return a new DataFrame holding the category codes of ``X``.

        Columns are matched to the fitted categories by position. The result
        keeps the column names of ``X`` but gets a fresh default index.
        """
        data = {
            X.columns[i]: pd.Categorical(X.iloc[:, i], categories=self.categories[i]).codes
            for i in range(X.shape[1])
        }
        return pd.DataFrame(data=data)


def calculate_apk(list_of_preds, list_of_gts):
    """Return the mean average precision at 12 over paired predictions and ground truths.

    Note the argument order: predictions first, ground truths second (the
    reverse of ``apk``).
    """
    # for fast validation this can be changed to operate on dicts of {'cust_id_int': [art_id_int, ...]}
    # using 'data/val_week_purchases_by_cust.pkl'
    per_customer = [
        apk(truth, ranked, k=12)
        for ranked, truth in zip(list_of_preds, list_of_gts)
    ]
    return np.mean(per_customer)

def eval_sub(sub_csv, skip_cust_with_no_purchases=True):
    """Score a submission CSV against the stored validation ground truth.

    Reads ``sub_csv`` and 'data/validation_ground_truth.parquet', splits the
    whitespace-separated ``prediction`` column of both, pairs the rows
    positionally and returns the mean ``apk`` at k=12.

    Parameters
    ----------
    sub_csv : path of the submission CSV to evaluate
    skip_cust_with_no_purchases : bool, optional
        When true, rows whose ground truth splits to an empty list are left
        out of the mean.
    """
    submission = pd.read_csv(sub_csv)
    ground_truth = pd.read_parquet('data/validation_ground_truth.parquet')

    empty_basket = []
    scores = []

    row_pairs = zip(submission.prediction.str.split(), ground_truth.prediction.str.split())
    for predicted_ids, true_ids in row_pairs:
        if skip_cust_with_no_purchases and (true_ids == empty_basket):
            continue
        scores.append(apk(true_ids, predicted_ids, k=12))
    return np.mean(scores)

In [3]:
import pandas as pd

# Load the three input tables.
transactions = pd.read_parquet('../input/transactions_train.parquet')
customers = pd.read_parquet('../input/customers.parquet')
articles = pd.read_parquet('../input/articles.parquet')

# The most recent week present in the data is used as the test week, and only
# rows from weeks greater than (latest week - 2) are kept.
latest_week = transactions.week.max()
test_week = latest_week # + 1
transactions = transactions[transactions.week > latest_week - 2]


In [4]:
# Display the transactions that remain after the week filter.
transactions

Unnamed: 0,t_dat,customer_id,article_id,price,sales_channel_id,week
31317806,2020-09-09,4920151714340210,564358058,0.033881,1,103
31317807,2020-09-09,4920151714340210,568601044,0.050831,1,103
31317808,2020-09-09,4920151714340210,779781013,0.042356,1,103
31317809,2020-09-09,4920151714340210,843465004,0.050831,1,103
31317810,2020-09-09,4920151714340210,715828013,0.033881,1,103
...,...,...,...,...,...,...
31774722,2020-09-22,18439937050817258297,891591003,0.084729,2,104
31774723,2020-09-22,18439937050817258297,869706005,0.084729,2,104
31779097,2020-09-22,18440902715633436014,918894002,0.016932,1,104
31779098,2020-09-22,18440902715633436014,761269001,0.016932,1,104


# Generating candidates

### Last purchase candidates

In [5]:
import pandas as pd
import numpy as np
import torch
import torch.nn as nn
import torch.optim as optim
from sklearn.model_selection import train_test_split

# transactions = pd.DataFrame({
#     'customer_id': [1, 1, 1, 2, 2, 3, 3, 3, 3],
#     'article_id': [11, 22, 33, 11, 22, 11, 22, 33, 44],
# })
# user_ids = transactions['customer_id'].unique()
# article_ids = transactions['article_id'].unique()
# # Create a mapping from user and article IDs to non-negative integers
# user_id_to_index = {customer_id: i for i, customer_id in enumerate(user_ids)}
# item_id_to_index = {article_id: i for i, article_id in enumerate(article_ids)}
# 
# # Update the interaction DataFrame with non-negative integer indices
# transactions['customer_id'] = transactions['customer_id'].map(user_id_to_index)
# transactions['article_id'] = transactions['article_id'].map(item_id_to_index)
# print(transactions)
# num_users = transactions['customer_id'].nunique()
# num_items = transactions['article_id'].nunique()
# # create a small example matrix that represents the user-item interaction matrix
# interactions = np.zeros((num_users, num_items), dtype=np.float32)
# for _, row in transactions.iterrows():
#     user_id = row['customer_id']
#     item_id = row['article_id']
#     interactions[user_id, item_id] = 1.0
#     
# # print interactions np adjacency matrix in a table form
# print(interactions)
#     
# # draw this matrix in graph form
# import networkx as nx
# import matplotlib.pyplot as plt
# 
# # create graph from biadjacency matrix, where rows are users and columns are items
# 
# # create a graph and then loop over the adjacency matrix and add edges where there is an interaction
# G = nx.Graph()
# 
# for i in range(interactions.shape[0]):
#     for j in range(interactions.shape[1]):
#         if interactions[i, j] == 1:
#             G.add_edge(i, j + num_users)
#                         
# # draw the bipartite graph
# nx.draw(G, with_labels=True)
# plt.show()



In [6]:
# import pandas as pd
# import numpy as np
# import torch
# import torch.nn as nn
# import torch.optim as optim
# from sklearn.model_selection import train_test_split
# 
# user_ids = transactions['customer_id'].unique()
# article_ids = transactions['article_id'].unique()
# interactions_ = transactions[['customer_id', 'article_id']].drop_duplicates()
# 
# # Create a user-item interaction matrix
# num_users = transactions['customer_id'].nunique()
# num_items = transactions['article_id'].nunique()
# 
# # Create a mapping from user and article IDs to non-negative integers
# user_id_to_index = {customer_id: i for i, customer_id in enumerate(user_ids)}
# item_id_to_index = {article_id: i for i, article_id in enumerate(article_ids)}
# 
# # map user and article IDs to non-negative integers
# interactions_['customer_id'] = interactions_['customer_id'].map(user_id_to_index)
# interactions_['article_id'] = interactions_['article_id'].map(item_id_to_index)
# interactions = np.zeros((num_users, num_items), dtype=np.float32)
# for _, row in interactions_.iterrows():
#     user_id = row['customer_id']
#     item_id = row['article_id']
#     interactions[user_id, item_id] = 1.0
# 
# # Define a simple LightGCN model
# class LightGCN(nn.Module):
#     def __init__(self, num_users, num_items, embedding_dim):
#         super(LightGCN, self).__init__()
#         self.embedding_dim = embedding_dim
#         self.user_embedding = nn.Embedding(num_users, embedding_dim)
#         self.item_embedding = nn.Embedding(num_items, embedding_dim)
# 
#     def forward(self, user_indices, item_indices):
#         user_embed = self.user_embedding(user_indices)
#         item_embed = self.item_embedding(item_indices)
#         return user_embed, item_embed
# 
# # Define training parameters
# embedding_dim = 64
# num_epochs = 10
# learning_rate = 0.001
# 
# # Split data into train and test sets
# train_interactions, test_interactions = train_test_split(interactions, test_size=0.2, random_state=42)
# 
# # Initialize the LightGCN model
# model = LightGCN(num_users, num_items, embedding_dim)
# optimizer = optim.Adam(model.parameters(), lr=learning_rate)
# 
# # Training loop
# for epoch in range(num_epochs):
#     for user_id in range(num_users):
#         for item_id in range(num_items):
#             if train_interactions[user_id, item_id] == 1:
#                 user_indices = torch.tensor([user_id], dtype=torch.long)
#                 item_indices = torch.tensor([item_id], dtype=torch.long)
# 
#                 user_embed, item_embed = model(user_indices, item_indices)
#                 
#                 # Compute loss (e.g., BPR loss) and perform backpropagation
#                 loss = -torch.log(torch.sigmoid(torch.sum(user_embed * item_embed)))
#                 optimizer.zero_grad()
#                 loss.backward()
#                 optimizer.step()
# 
# # Inference for candidate generation
# def generate_candidates(user_id, num_candidates=10):
#     user_indices = torch.tensor([user_id], dtype=torch.long)
#     user_embed, _ = model(user_indices, torch.arange(num_items, dtype=torch.long))
#     
#     # Calculate scores for all items
#     scores = torch.mm(user_embed, model.item_embedding.weight.t())
#     scores = torch.sigmoid(scores)
#     
#     # Get top-k recommended item indices
#     _, top_item_indices = torch.topk(scores, num_candidates)
#     return top_item_indices.numpy()[0]
# 
# # Evaluate the model
# map_scores = []
# for user_id in range(num_users):
#     recommended_items = generate_candidates(user_id)
#     true_items = np.where(test_interactions[user_id] == 1)[0]
#     # Calculate MAP score
#     map_score = apk(true_items, recommended_items, k=10)
#     map_scores.append(map_score)
# 
# # Calculate the mean MAP score
# mean_map = np.mean(map_scores)
# print("Mean Average Precision (MAP):", mean_map)

In [7]:
# data = pd.merge(transactions, customers, on='customer_id', how='left')
# data = pd.merge(data, articles, on='article_id', how='left')
data = transactions.copy()
data.sort_values(['week', 'customer_id'], inplace=True)
data.reset_index(drop=True, inplace=True)

user_ids = data['customer_id'].unique()
article_ids = data['article_id'].unique()

# Create a mapping from user and article IDs to non-negative integers
user_id_to_index = {customer_id: i for i, customer_id in enumerate(user_ids)}
item_id_to_index = {article_id: i for i, article_id in enumerate(article_ids)}

# map user and article IDs to non-negative integers
data['customer_id'] = data['customer_id'].map(user_id_to_index)
data['article_id'] = data['article_id'].map(item_id_to_index)

# Rows of the test week form the test split, everything else the train split;
# each split keeps one row per (customer, article, sales channel).
train = data[data.week != test_week].drop_duplicates(['customer_id', 'article_id', 'sales_channel_id']).copy()
test = data[data.week == test_week].drop_duplicates(['customer_id', 'article_id', 'sales_channel_id']).copy()


def _write_id_pairs(frame, path, max_rows=101):
    """Write (customer_id, article_id) pairs of ``frame`` to ``path``.

    One pair per line, separated by a tab. At most ``max_rows`` leading rows
    are written; pass ``None`` to write every row.
    """
    pairs = frame[['customer_id', 'article_id']]
    if max_rows is not None:
        pairs = pairs.head(max_rows)
    # The with-block closes the file; no explicit close() is needed.
    with open(path, 'w') as out:
        out.writelines(
            f'{customer_id}\t{article_id}\n'
            for customer_id, article_id in pairs.itertuples(index=False, name=None)
        )


# NOTE(review): only the first 101 rows of each split are exported (the
# default of max_rows), which looks like a debugging limit; pass
# max_rows=None to export the complete splits.
_write_id_pairs(train, '../input/train.txt')
_write_id_pairs(test, '../input/test.txt')


In [8]:
# train

In [9]:
# test

In [10]:
import model
import dataloader

# Build the dataset object from the "../input" directory using the
# project-local `dataloader` module (presumably it reads the train.txt /
# test.txt pair files written to that directory — TODO confirm).
dataset = dataloader.LastFM(path="../input")

LastFm Sparsity : 3.868428021135469e-09
(1362281, 38331)
  (0, 0)	1.0
  (1, 1)	1.0
  (1, 2)	1.0
  (1, 3)	1.0
  (2, 4)	1.0
  (2, 5)	1.0
  (3, 6)	1.0
  (3, 7)	1.0
  (4, 8)	1.0
  (4, 9)	1.0
  (5, 10)	1.0
  (6, 11)	1.0
  (7, 12)	1.0
  (7, 13)	1.0
  (7, 14)	1.0
  (7, 15)	1.0
  (7, 16)	1.0
  (8, 17)	1.0
  (8, 18)	1.0
  (8, 19)	1.0
  (8, 20)	1.0
  (8, 21)	1.0
  (9, 22)	1.0
  (9, 23)	1.0
  (10, 24)	1.0
  :	:
  (24, 76)	1.0
  (24, 77)	1.0
  (25, 78)	1.0
  (25, 79)	1.0
  (25, 80)	1.0
  (26, 81)	1.0
  (26, 82)	1.0
  (27, 83)	1.0
  (27, 84)	1.0
  (27, 85)	1.0
  (27, 86)	1.0
  (28, 87)	1.0
  (28, 88)	1.0
  (29, 89)	1.0
  (29, 90)	1.0
  (29, 91)	1.0
  (30, 92)	1.0
  (30, 93)	1.0
  (31, 64)	1.0
  (31, 94)	1.0
  (31, 95)	1.0
  (31, 96)	1.0
  (31, 97)	1.0
  (31, 98)	1.0
  (31, 99)	1.0
LastFm is ready to go


In [11]:
# Instantiate the project-local LightGCN model on the loaded dataset.
# NOTE(review): the recorded run failed here with an out-of-memory error for
# 7,846,855,898,176 bytes, which is exactly 4 * (1362281 + 38331) ** 2, i.e.
# the size of a dense float32 square matrix over the (users + items) counts
# printed by the loader. Presumably the model (or the dataset) densifies the
# user-item graph; confirm in the `model` / `dataloader` modules.
lightgcn_model = model.LightGCN(dataset)

RuntimeError: [enforce fail at alloc_cpu.cpp:80] data. DefaultCPUAllocator: not enough memory: you tried to allocate 7846855898176 bytes.

In [None]:
# NOTE: this rebinds `user_ids`, which an earlier cell set to the array of
# unique customer ids.
user_ids = [0, 1, 2] # [272412481300040, 1456826891333599, 2133687643102426]  # Replace with user IDs for which you want recommendations
ratings = lightgcn_model.getUsersRating(user_ids)  # Get recommendations for users
print(ratings)

In [None]:
# import torch
# import torch.nn as nn
# import torch.optim as optim
# from torch_geometric.data import Data
# from torch_geometric.nn import SAGEConv
# from torch_geometric.utils import train_test_split
# import networkx as nx
# import numpy as np
# 
# knowledge_graph = nx.Graph()
# knowledge_graph.add_node('CustomerA', type='Customer')
# knowledge_graph.add_node('CustomerB', type='Customer')
# knowledge_graph.add_node('ArticleX', type='Article')
# knowledge_graph.add_node('ArticleY', type='Article')
# 
# knowledge_graph.add_edge('CustomerA', 'ArticleX', interaction='purchased')
# knowledge_graph.add_edge('CustomerA', 'ArticleY', interaction='purchased')
# knowledge_graph.add_edge('CustomerB', 'ArticleX', interaction='purchased')
# 
# class GraphSAGENet(nn.Module):
#     def __init__(self, in_channels, hidden_channels, out_channels):
#         super(GraphSAGENet, self).__init__()
#         self.conv1 = SAGEConv(in_channels, hidden_channels)
#         self.conv2 = SAGEConv(hidden_channels, out_channels)
# 
#     def forward(self, data):
#         x, edge_index = data.x, data.edge_index
#         x = self.conv1(x, edge_index)
#         x = nn.functional.relu(x)
#         x = self.conv2(x, edge_index)
#         return x
# 
# data = Data(x=torch.eye(4), edge_index=torch.tensor([[0, 0, 1, 1, 2, 3], [2, 3, 2, 3, 0, 1]]).long())
# data.train_mask = torch.tensor([0, 1, 2, 3], dtype=torch.bool)
# 
# device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
# model = GraphSAGENet(4, 16, 1).to(device)
# optimizer = optim.Adam(model.parameters(), lr=0.01)
# criterion = nn.BCEWithLogitsLoss()
# 
# def train(data):
#     model.train()
#     optimizer.zero_grad()
#     out = model(data)
#     loss = criterion(out[data.train_mask], data.y[data.train_mask].view(-1, 1).to(torch.float))
#     loss.backward()
#     optimizer.step()
# 
# for epoch in range(1000):
#     train(data)
# 
# def generate_recommendations(data, user_id):
#     # You can modify this part to suit your use case
#     user_idx = int(user_id[7:])  # Extract the user index from user_id
#     user_embeddings = model(data)[user_idx]
#     item_scores = torch.matmul(model(data), user_embeddings)
#     recommendations = np.argsort(item_scores.cpu().detach().numpy())[::-1]  # Sort in descending order
#     return recommendations
# 
# user_id = 'CustomerA'
# recommendations = generate_recommendations(data, user_id)
# print(f"Recommendations for {user_id}: {recommendations}")

In [None]:
# For every customer, map each week in which they have a transaction to the
# next such week; their last week maps to test_week.
c2weeks = transactions.groupby('customer_id')['week'].unique()
c2weeks2shifted_weeks = {}

for c_id, purchase_weeks in c2weeks.items():
    next_week_of = dict(zip(purchase_weeks[:-1], purchase_weeks[1:]))
    next_week_of[purchase_weeks[-1]] = test_week
    c2weeks2shifted_weeks[c_id] = next_week_of

# Re-label every transaction with its shifted week: the items a customer
# bought become candidates for that later week.
candidates_last_purchase = transactions.copy()
weeks = [
    c2weeks2shifted_weeks[c_id][week]
    for c_id, week in zip(transactions['customer_id'], transactions['week'])
]
candidates_last_purchase.week = weeks

In [None]:
### Bestsellers candidates
# Mean price of every article within every week.
mean_price = transactions.groupby(['week', 'article_id'])['price'].mean()

# Per week: count sales per article, dense-rank the counts (highest count gets
# rank 1) and keep the first 12 entries of each week.
weekly_counts = transactions.groupby('week')['article_id'].value_counts()
weekly_ranks = weekly_counts.groupby('week').rank(method='dense', ascending=False)
top_of_week = weekly_ranks.groupby('week').head(12)
sales = top_of_week.rename('bestseller_rank').astype('int8')

bestsellers_previous_week = pd.merge(sales, mean_price, on=['week', 'article_id']).reset_index()
# Shift by one week so that each week is paired with the bestsellers of the
# week before it.
bestsellers_previous_week.week += 1

In [None]:
# Inspect the bestseller rows attached to the test week. A hard-coded week
# number would fall outside the short window of weeks kept when the
# transactions were filtered and show an empty frame.
bestsellers_previous_week.pipe(lambda df: df[df['week'] == test_week])

In [None]:
# One row per (week, customer): the first transaction of each group, without
# the article-specific columns.
first_rows = transactions.groupby(['week', 'customer_id']).head(1)
unique_transactions = first_rows.drop(columns=['article_id', 'price']).copy()

# Pair every (week, customer) row with the bestseller rows of the same week.
candidates_bestsellers = pd.merge(unique_transactions, bestsellers_previous_week, on='week')

# One row per customer, re-labelled as belonging to the test week.
test_set_transactions = unique_transactions.drop_duplicates('customer_id').reset_index(drop=True)
test_set_transactions['week'] = test_week

In [None]:
# test_set_transactions

In [None]:
# Bestseller candidates for the test-week rows of every customer.
candidates_bestsellers_test_week = test_set_transactions.merge(
    bestsellers_previous_week,
    on='week',
)
# Append them to the per-week candidates and drop the rank column from the
# combined frame.
candidates_bestsellers = pd.concat(
    [candidates_bestsellers, candidates_bestsellers_test_week]
).drop(columns='bestseller_rank')

In [None]:
# Display the combined bestseller candidate rows.
candidates_bestsellers

# Combining transactions and candidates / negative examples

In [None]:
# Real transactions are the positive examples.
transactions['purchased'] = 1
# Candidate rows carry no 'purchased' value, so it is NaN for them after the
# concat and is filled with 0 (negative example).
data = pd.concat([transactions, candidates_last_purchase, candidates_bestsellers])
# Assign the filled column back instead of calling fillna(inplace=True) on
# `data.purchased`: that chained in-place call acts on a temporary Series and
# does not update `data` when pandas Copy-on-Write is active.
data['purchased'] = data['purchased'].fillna(0)
# drop_duplicates keeps the first occurrence; the real transactions come first
# in the concat, so a purchased row wins over a candidate row for the same
# (customer, article, week).
data.drop_duplicates(['customer_id', 'article_id', 'week'], inplace=True)

### Add bestseller information

In [None]:
# Attach the bestseller rank of each (week, article) pair; pairs that are not
# among the bestsellers get no rank from the left join.
data = pd.merge(
    data,
    bestsellers_previous_week[['week', 'article_id', 'bestseller_rank']],
    on=['week', 'article_id'],
    how='left'
)
# Drop the earliest week present in the data.
data = data[data.week != data.week.min()]
# Missing ranks become 999. The column is assigned back rather than filled
# through `data.bestseller_rank.fillna(..., inplace=True)`, a chained in-place
# call that does not update `data` when pandas Copy-on-Write is active.
data['bestseller_rank'] = data['bestseller_rank'].fillna(999)
data = pd.merge(data, articles, on='article_id', how='left')
data = pd.merge(data, customers, on='customer_id', how='left')
# Keep rows ordered by week and customer.
data.sort_values(['week', 'customer_id'], inplace=True)
data.reset_index(drop=True, inplace=True)

In [None]:
# Rows of the test week form the test set (one row per customer, article and
# sales channel); all remaining rows form the training set.
in_test_week = data.week == test_week
train = data[~in_test_week]
test = data[in_test_week].drop_duplicates(['customer_id', 'article_id', 'sales_channel_id']).copy()

In [None]:
# Number of (non-null) article_id rows in each (week, customer_id) group of the
# training set; passed as `group=` when the ranker is fitted.
# `data` was sorted by ['week', 'customer_id'] before the split and groupby
# returns its groups in sorted key order, so the sizes follow the row order of
# `train`.
train_baskets = train.groupby(['week', 'customer_id'])['article_id'].count().values

In [None]:
# Additional feature columns can be listed here; they are appended after the
# base columns.
extra_columns = []
columns_to_use = [
    'article_id',
    'product_type_no',
    'graphical_appearance_no',
    'colour_group_code',
    'perceived_colour_value_id',
    'perceived_colour_master_id',
    'department_no',
    'index_code',
    'index_group_no',
    'section_no',
    'garment_group_no',
    'FN',
    'Active',
    'club_member_status',
    'fashion_news_frequency',
    'age',
    'postal_code',
    'bestseller_rank',
] + extra_columns

In [None]:
# Feature columns and the 'purchased' label of the training rows.
train_X = train[columns_to_use]
train_y = train['purchased']

# Feature columns of the test-week rows.
test_X = test[columns_to_use]

In [None]:
# Display the feature frame of the test-week rows.
test_X

# Model training

In [None]:
from lightgbm.sklearn import LGBMRanker
# Settings for the LightGBM ranker. Note that n_estimators=1 fits a single
# boosting round.
ranker_params = dict(
    objective="lambdarank",
    metric="ndcg",
    boosting_type="dart",
    n_estimators=1,
    importance_type='gain',
    verbose=10,
)
ranker = LGBMRanker(**ranker_params)
# `group` carries the number of consecutive training rows that belong to each
# query group.
ranker = ranker.fit(train_X, train_y, group=train_baskets)

In [None]:
# Print every feature with its share of the total importance, largest first.
importances = ranker.feature_importances_
total_importance = importances.sum()
for col_idx in importances.argsort()[::-1]:
    print(columns_to_use[col_idx], importances[col_idx] / total_importance)

# Calculate predictions

In [None]:
# Score every test-week candidate row.
test['preds'] = ranker.predict(test_X)

# Per customer, the candidate articles ordered from highest to lowest score.
ranked_test = test.sort_values(['customer_id', 'preds'], ascending=False)
c_id2predicted_article_ids = ranked_test.groupby('customer_id')['article_id'].apply(list).to_dict()

# Articles listed for the latest week of the bestseller table.
latest_bestseller_week = bestsellers_previous_week.week.max()
is_latest_week = bestsellers_previous_week.week == latest_bestseller_week
bestsellers_last_week = bestsellers_previous_week[is_latest_week]['article_id'].tolist()

# Create submission

In [None]:
sub = pd.read_csv('../input/sample_submission.csv')
preds = []
for c_id in customer_hex_id_to_int(sub.customer_id):
    # Customers without any scored candidate fall back to the bestsellers only.
    pred = c_id2predicted_article_ids.get(c_id, [])
    # Append the bestsellers as filler and remove repeated article ids while
    # keeping the first (best-ranked) occurrence: a repeated id earns no extra
    # credit in average precision and would waste one of the 12 slots.
    pred = list(dict.fromkeys(pred + bestsellers_last_week))
    preds.append(pred[:12])
# Article ids are written as strings with a leading '0', separated by spaces.
preds = [' '.join(['0' + str(p) for p in ps]) for ps in preds]
sub.prediction = preds
sub_name = 'basic_model_submission'
sub.to_csv(f'{sub_name}.csv.gz', index=False)