In [1]:
import os
os.environ['CUDA_LAUNCH_BLOCKING'] = "1"

import pandas as pd
import numpy as np
from pathlib import Path
from tqdm.notebook import tqdm

import torch
import torch.nn as nn
from torch.nn import functional as F
from torch.utils.data import Dataset, DataLoader
from sklearn.preprocessing import StandardScaler, LabelBinarizer

pd.options.mode.chained_assignment = None
dataDir = Path.cwd().parent/'Data/'
np.random.seed(66)

transactions_usecols = ["t_dat", "customer_id", "article_id"]
articles_usecols = ["article_id", "product_group_name", "colour_group_name", "index_name"]
customers_usecols = ["customer_id", "club_member_status", "age"]

transactions = pd.read_csv(dataDir/'transactions.csv', usecols=transactions_usecols)
articles = pd.read_csv(dataDir/'articles.csv', usecols=articles_usecols)
customers = pd.read_csv(dataDir/'customers.csv', usecols=customers_usecols)

In [None]:
len(transactions.customer_id.unique())

In [None]:
""" Preprocessing on articles and customers data
    Articles: product_group_name, colour_group_name, index_name
    Customers: club_member_status, age
"""

articles = articles.loc[articles.product_group_name != "Unknown"]
articles = articles.loc[articles.colour_group_name != "Unknown"]
customers.dropna(axis=0, how='any', subset=["club_member_status", "age"], inplace=True)
# filter out transactions data with articles and customers ID
transactions = transactions.loc[transactions['article_id'].isin(articles.article_id.unique())]
transactions = transactions.loc[transactions['customer_id'].isin(customers.customer_id.unique())]

# filter out duplicated user-item interactions within transactions data
transactions.drop_duplicates(subset=["customer_id", "article_id"], inplace=True)

In [None]:
customers_list = transactions.customer_id.unique()
# randomly select part of the transaction data
np.random.seed(35)
rand_userIds = np.random.choice(customers_list, 
                                size=int(len(customers_list)*0.007), 
                                replace=False)
transactions = transactions.loc[transactions['customer_id'].isin(rand_userIds)]

In [None]:
""" Ensure that dataset contain sufficient user-item interactions to avoid cold-start problem """

# avoid cold-start problem
min_article_interactions = 8
min_customer_interactions = 8
transactions["interaction"] = 1

while True:
    # Filter articles with more than min_article_interactions interactions
    comb_transactions_article = transactions[["article_id", "interaction"]].groupby(by=["article_id"], sort=False, as_index=False).sum(["interaction"])
    comb_transactions_article = comb_transactions_article.loc[comb_transactions_article.interaction >= min_article_interactions]
    articles_list = comb_transactions_article.article_id.unique()
    
    # Filter customers with more than min_customer_interactions interactions
    comb_transactions_customer = transactions[["customer_id", "interaction"]].groupby(by=["customer_id"], sort=False, as_index=False).sum(["interaction"])
    comb_transactions_customer = comb_transactions_customer.loc[comb_transactions_customer.interaction >= min_customer_interactions]
    customers_list = comb_transactions_customer.customer_id.unique()
    
    # Apply the filters
    filtered_transactions = transactions.loc[transactions['article_id'].isin(articles_list) & transactions['customer_id'].isin(customers_list)]
    
    # Check if the filtered transactions are the same as the original transactions
    if len(filtered_transactions) == len(transactions):
        break
    else:
        transactions = filtered_transactions

print('There are {} rows of data from {} users (users with suffication data)'.format(len(transactions), len(transactions.customer_id.unique())))

In [None]:
# binarize categorical features club_member_status and sales_channel_id in customer data
lb = LabelBinarizer()
customers.club_member_status = lb.fit_transform(customers.club_member_status)
# standardize numerical features age and price in transaction data
std = StandardScaler()
customers[["age"]] = std.fit_transform(customers[["age"]])

# merge transaction data with article and customer data
transactions = transactions.merge(customers, how='left', left_on=["customer_id"], right_on=["customer_id"])
transactions = transactions.merge(articles, how='left', left_on=["article_id"], right_on=["article_id"])

### Still exist cold-start problem, it could be further relieved by external information

In [None]:
# Get the value_counts for the 'article_id' column
article_id_counts = transactions['article_id'].value_counts()
print(len(article_id_counts))
# Filter the article_ids with count 1
articles_with_count_1 = article_id_counts[article_id_counts == 1].index.tolist()
print(len(articles_with_count_1))

In [None]:
article_id_counts

In [None]:
# fetch IDs from filtered transaction data
trans_article_ids = transactions.article_id.unique()
trans_customer_ids = transactions.customer_id.unique()
# get the articles and customers data based on the IDs from filtered transaction data
article_data = articles.loc[articles.article_id.isin(trans_article_ids)].copy()
customer_data = customers.loc[customers.customer_id.isin(trans_customer_ids)].copy()

"""reindex"""
# map string type customer_id to int type
customer_mapper = {}
customer_keys = transactions.customer_id.unique()
customer_values = list(range(len(transactions.customer_id.unique())))
customer_mapper.update(zip(customer_keys, customer_values))

# map string type article_id to int type
product_mapper = {}
product_keys = transactions.article_id.unique()
product_values = list(range(len(transactions.article_id.unique())))
product_mapper.update(zip(product_keys, product_values))

# map color_group_name to int type
color_mapper = {}
color_keys = transactions.colour_group_name.unique()
color_values = list(range(len(transactions.colour_group_name.unique())))
color_mapper.update(zip(color_keys, color_values))

# map color_group_name to int type
product_group_mapper = {}
product_group_keys = transactions.product_group_name.unique()
product_group_values = list(range(len(transactions.product_group_name.unique())))
product_group_mapper.update(zip(product_group_keys, product_group_values))

# map index_name to int type
index_name_mapper = {}
index_name_keys = transactions.index_name.unique()
index_name_values = list(range(len(transactions.index_name.unique())))
index_name_mapper.update(zip(index_name_keys, index_name_values))

# reindex categorical features based on feature mappers
transactions["customer_id"] = transactions["customer_id"].map(customer_mapper)
transactions["article_id"] = transactions["article_id"].map(product_mapper)
transactions["colour_group_name"] = transactions["colour_group_name"].map(color_mapper)
transactions["product_group_name"] = transactions["product_group_name"].map(product_group_mapper)
transactions["index_name"] = transactions["index_name"].map(index_name_mapper)

# reindex features in article and customer dataset
article_data["article_id"] = article_data["article_id"].map(product_mapper)
article_data["colour_group_name"] = article_data["colour_group_name"].map(color_mapper)
article_data["product_group_name"] = article_data["product_group_name"].map(product_group_mapper)
article_data["index_name"] = article_data["index_name"].map(index_name_mapper)
customer_data["customer_id"] = customer_data["customer_id"].map(customer_mapper)

# training set and test set
transactions['rank_latest'] = transactions.groupby(['customer_id'])['t_dat'].rank(method='first', ascending=False)
train_transactions = transactions[transactions['rank_latest'] != 1]
test_transactions = transactions[transactions['rank_latest'] == 1]

# drop articles that do not exist in training set
test_product_list = list(set(test_transactions.article_id.unique()) & set(train_transactions.article_id.unique()))
test_transactions = test_transactions.loc[test_transactions['article_id'].isin(test_product_list)]

# drop columns that we no longer need
drop_cols = ["t_dat", "interaction", "rank_latest"]
train_transactions.drop(labels=drop_cols, axis=1, inplace=True)
test_transactions.drop(labels=drop_cols, axis=1, inplace=True)

# get a list of all articles id
all_products_id = transactions["article_id"].unique()

train_transactions.head()

In [31]:
len(all_products_id)

2362

In [32]:
# article_data.to_csv(dataDir/"article_data_large.csv", index=False)
# customer_data.to_csv(dataDir/"customer_data_large.csv", index=False)
# train_transactions.to_csv(dataDir/"train_transactions_large.csv", index=False)
# test_transactions.to_csv(dataDir/"test_transactions_large.csv", index=False)

In [8]:
from scipy.sparse import csr_matrix

def sparse_to_tensor(sparse_matrix):
    """
    Transform scipy coo matrix to pytorch sparse tensor
    """
    sparse_matrix = sparse_matrix.tocoo()
    values = sparse_matrix.data
    indices = (sparse_matrix.row, sparse_matrix.col) # np.vstack
    shape = sparse_matrix.shape

    i = torch.LongTensor(indices)
    v = torch.DoubleTensor(values)
    s = torch.Size(shape)

    return torch.sparse.DoubleTensor(i, v, s)

def sparse_batch_collate(batch): 
    """
    Collate function which to transform scipy csr matrix to pytorch sparse tensor
    """
    # batch[0] since it is returned as a one element list

    customer_batch, product_batch, targets_batch = batch[0]
    
    if type(customer_batch[0]) == csr_matrix:
        customer_batch = customer_batch.tocoo() # removed vstack
        customer_batch = sparse_to_tensor(customer_batch)
    else:
        customer_batch = torch.DoubleTensor(customer_batch)

    if type(product_batch[0]) == csr_matrix:
        product_batch = product_batch.tocoo() # removed vstack
        product_batch = sparse_to_tensor(product_batch)
    else:
        product_batch = torch.DoubleTensor(product_batch)
    
    if type(targets_batch[0]) == csr_matrix:
        targets_batch = targets_batch.tocoo() # removed vstack
        targets_batch = sparse_to_tensor(targets_batch)
    else:
        targets_batch = torch.DoubleTensor(targets_batch)
    
    return customer_batch, product_batch, targets_batch

In [2]:
class HMSaleTrainDataLoader(Dataset):
    """HMSaleTrainDataLoader Training set of HM sales data

    Args:
        transactions (pd.DataFrame): Dataframe of transaction records
        all_products_id (list): A list contains all product ids
    """
    def __init__(self, transactions, all_products_id, all_customer_product_set):
        self.customers, self.products, self.club_status, self.age_groups, self.product_groups, self.color_groups, \
        self.index_name, self.labels = self.get_dataset(transactions, all_products_id, all_customer_product_set)

    def __len__(self):
        return len(self.customers)
    
    def __getitem__(self, idx):
        return self.customers[idx], self.products[idx], self.club_status[idx], self.age_groups[idx], self.product_groups[idx], \
        self.color_groups[idx], self.index_name[idx], self.labels[idx]
    
    def get_dataset(self, transactions, all_products_id, all_customer_product_set):
        customers, products, club_status, age_groups, product_groups, color_groups, index_name, labels  = [], [], [], [], [], [], [], []
        
        """negative sampling"""
        # set up negative:positive ratio as 5:1
        negative_samples = 3
        customer_product_set = set(zip(transactions["customer_id"], transactions["article_id"], 
                                       transactions["club_member_status"], transactions["age"], 
                                       transactions["product_group_name"], transactions["colour_group_name"], transactions["index_name"]))

        for u, i, club, age, product, color, index in tqdm(customer_product_set):
            customers.append(u)
            products.append(i)
            club_status.append(club)
            age_groups.append(age)
            product_groups.append(product)
            color_groups.append(color)
            index_name.append(index)
            labels.append(1)
            for _ in range(negative_samples):
                negative_product = np.random.choice(all_products_id)
                while (u, negative_product, club, age, product, color, index) in all_customer_product_set:
                    negative_product = np.random.choice(all_products_id)
                customers.append(u)
                products.append(negative_product)
                club_status.append(club)
                age_groups.append(age)
                product_groups.append(product)
                color_groups.append(color)
                index_name.append(index)
                labels.append(0)
        return customers, products, club_status, age_groups, product_groups, color_groups, index_name, torch.tensor(labels)

def binary_acc(y_pred, y_test):
    y_pred_label = torch.softmax(y_pred, dim=1)
    _, y_pred_label = torch.max(y_pred_label, dim = 1)
    correct_pred = (y_pred_label == y_test).sum()
    acc = correct_pred/y_test.shape[0]
    return acc  
    

In [3]:
class NCF(nn.Module):
    """NCF - Neural Collaborative Filtering proposed by He et al.

    Args:
        num_users (int): Number of users
        num_items (iut): Number of products
        num_product_groups (int): Number of product groups
        num_color_groups: (int): Number of color groups
        num_index_name: (int): Number of index name
    """
    def __init__(
            self, 
            num_users: int, 
            num_items: int,
            num_product_groups: int,
            num_color_groups: int,
            num_index_name: int,
            user_embedding_dim: int = 32,
            item_embedding_dim: int = 64,
            product_group_embedding_dim: int = 8,
            color_group_embedding_dim: int = 16,
            index_name_embedding_dim: int = 6,
            input_size: int = 128,
            hidden_size_1: int = 128,
            hidden_size_2: int = 256,
            output_size: int = 1,
        ):
            super().__init__()
            # embedding layers for categorical features and user-item interaction
            self.user_embedding_layer = nn.Embedding(num_embeddings=num_users, embedding_dim=user_embedding_dim)
            self.item_embedding_layer = nn.Embedding(num_embeddings=num_items, embedding_dim=item_embedding_dim)
            self.product_group_embedding_layer = nn.Embedding(num_embeddings=num_product_groups, embedding_dim=product_group_embedding_dim)
            self.color_group_embedding_layer = nn.Embedding(num_embeddings=num_color_groups, embedding_dim=color_group_embedding_dim)
            self.index_name_embedding_layer = nn.Embedding(num_embeddings=num_index_name, embedding_dim=index_name_embedding_dim)

            self.relu = nn.LeakyReLU()
            if user_embedding_dim and item_embedding_dim is not None:
                in_channels = (
                    [input_size] 
                    + [hidden_size_2]
                    + [hidden_size_1]
                    + [hidden_size_1]
                    + [output_size]
                )
            else:
                raise ValueError
            self.fcs = nn.Sequential(
                *[nn.Linear(in_features=in_channels[i], out_features=in_channels[i+1]) for i in range(len(in_channels)-1) if i != len(in_channels)-1]
            )
            
            # in_channels_1 = (
            #     [48] 
            #     + [hidden_size_2]
            #     + [output_size]
            # )
            # in_channels_2 = (
            #     [30] 
            #     + [hidden_size_1]
            #     + [16]
            # )
            
            # self.encoder1 = nn.Sequential(
            #     *[nn.Linear(in_features=in_channels_1[i], out_features=in_channels_1[i+1]) for i in range(len(in_channels_1)-1) if i != len(in_channels_1)-1]
            # )    
            
            # self.encoder2 = nn.Linear(in_features=2, out_features=5, bias=True)
            
            # self.encoder3 = nn.Sequential(
            # *[nn.Linear(in_features=in_channels_2[i], out_features=in_channels_2[i+1]) for i in range(len(in_channels_2)-1) if i != len(in_channels_2)-1]
            # )
            
    
    def forward(self, user_input, item_input, club_status, age_groups, product_groups, color_groups, index_name):

        user_embedding = self.user_embedding_layer(user_input)
        item_embedding = self.item_embedding_layer(item_input)
        product_group_embedding = self.product_group_embedding_layer(product_groups)
        color_group_embedding = self.color_group_embedding_layer(color_groups)
        index_name_embedding = self.index_name_embedding_layer(index_name)
        concat_embedding = torch.cat([user_embedding, item_embedding, product_group_embedding, color_group_embedding, index_name_embedding], dim=-1)
        
        latent_vec_1 = torch.cat([user_embedding, item_embedding], dim=-1)
        # latent_vec_1 = self.encoder1(latent_vec_1)

        latent_vec_2 = torch.cat([club_status.float().reshape(-1, 1), age_groups.reshape(-1, 1)], dim=-1)
        # latent_vec_2 = self.encoder2(latent_vec_2)

        latent_vec_3 = torch.cat([product_group_embedding, color_group_embedding, index_name_embedding], dim=-1)
        # latent_vec_3 = self.encoder3(latent_vec_3)

        concat_embedding = torch.cat([torch.squeeze(latent_vec_1), torch.squeeze(latent_vec_2), torch.squeeze(latent_vec_3)], dim=-1)
        
#         concat_input = torch.cat([prices.reshape(-1, 1), sales_channels.reshape(-1, 1), club_status.reshape(-1, 1), age_groups.reshape(-1, 1)], dim=-1)
#         concat_embedding = torch.cat([concat_embedding, concat_input], dim=-1)

        for fc_layer in self.fcs:
            concat_embedding = fc_layer(concat_embedding)
            concat_embedding = self.relu(concat_embedding)
        #pred = F.softmax(concat_embedding)
        return concat_embedding
    

In [4]:
def binary_acc(y_pred, y_test):
    acc = 0.0
    
    y_pred_label = torch.round(torch.sigmoid(y_pred))
    correct_pred = (y_pred_label == y_test).sum()
    acc = correct_pred.item()/y_test.shape[0]
    return acc

In [5]:
train_transactions = pd.read_csv(dataDir/'train_transactions_large.csv')
test_transactions = pd.read_csv(dataDir/'test_transactions_large.csv')
transactions = pd.concat([train_transactions, test_transactions], ignore_index=True)
# get a list of all articles id
article_data = pd.read_csv(dataDir/'article_data_large.csv')
all_products_id = article_data["article_id"].unique()

customer_product_set = set(zip(transactions["customer_id"], transactions["article_id"], 
                               transactions["club_member_status"], transactions["age"], 
                               transactions["product_group_name"], transactions["colour_group_name"], transactions["index_name"]))

# set up hyper-parameters
learning_rate = 0.005
epoch = 200
device = torch.device("cuda:0" if torch.cuda.is_available() else "cpu")
print("device:", device)
# device = torch.device("cpu")

torch.autograd.set_detect_anomaly(True)

# set up dataset for training
num_users = len(train_transactions.customer_id.unique())
print("num_users:", num_users)
num_items = len(all_products_id)
print("num_items:", num_items)
num_product_groups = len(train_transactions.product_group_name.unique())
print("num_product_groups:", num_product_groups)
num_color_groups = len(train_transactions.colour_group_name.unique())
print("num_color_groups:", num_color_groups)
num_index_name = len(train_transactions.index_name.unique())
print("num_index_name:", num_index_name)
train_data = HMSaleTrainDataLoader(train_transactions, all_products_id, customer_product_set)
train_loader = DataLoader(train_data, batch_size=1024, shuffle=True)

device: cuda:0
num_users: 10346
num_items: 15516
num_product_groups: 11
num_color_groups: 48
num_index_name: 9


  0%|          | 0/293769 [00:00<?, ?it/s]

In [9]:
# initiate model for training
model = NCF(num_users=num_users, num_items=num_items, num_product_groups=num_product_groups, num_color_groups=num_color_groups, num_index_name=num_index_name)
model.to(device)
print(model)
optimizer = torch.optim.Adam(model.parameters(), lr=learning_rate)
weights = torch.tensor([1.0, 3.0]).to(device)
loss_fn = nn.BCEWithLogitsLoss(pos_weight=weights[1])
model.train()
best_acc = 0.0  
for e in tqdm(range(epoch)):
    epoch_loss = 0.0
    epoch_acc = 0.0
    for customer_batch, product_batch, club_status_batch, age_groups_batch, product_groups_batch, color_groups_batch, index_name_batch, label_batch in train_loader:
        customer_batch, product_batch, club_status_batch, age_groups_batch, product_groups_batch, color_groups_batch, index_name_batch, label_batch \
        = customer_batch.to(device), product_batch.to(device), club_status_batch.to(device), age_groups_batch.to(device), \
        product_groups_batch.to(device), color_groups_batch.to(device), index_name_batch.to(device), label_batch.to(device), 
        optimizer.zero_grad()
        y_pred = model(customer_batch, product_batch, club_status_batch.float(), age_groups_batch.float(), product_groups_batch, color_groups_batch, index_name_batch)
        loss = loss_fn(y_pred, label_batch.reshape(-1, 1).float())
        acc = binary_acc(y_pred, label_batch.reshape(-1, 1).float())
        loss.backward()
        optimizer.step()
        epoch_loss += loss.item()
        epoch_acc += acc
        cur_acc = epoch_acc/len(train_loader)
    if cur_acc > best_acc:
        best_acc = cur_acc
        torch.save(model.state_dict(), 'best_model_large.pt')
    print(f'Epoch {e+0:03}: | Loss: {epoch_loss/len(train_loader):.6f} | Acc: {epoch_acc/len(train_loader):.4f}')

print(f'\nBest Accuracy: {best_acc:.3f}')


NCF(
  (user_embedding_layer): Embedding(10346, 32)
  (item_embedding_layer): Embedding(15516, 64)
  (product_group_embedding_layer): Embedding(11, 8)
  (color_group_embedding_layer): Embedding(48, 16)
  (index_name_embedding_layer): Embedding(9, 6)
  (relu): LeakyReLU(negative_slope=0.01)
  (fcs): Sequential(
    (0): Linear(in_features=128, out_features=256, bias=True)
    (1): Linear(in_features=256, out_features=128, bias=True)
    (2): Linear(in_features=128, out_features=128, bias=True)
    (3): Linear(in_features=128, out_features=1, bias=True)
  )
)


  0%|          | 0/200 [00:00<?, ?it/s]

Epoch 000: | Loss: 0.764069 | Acc: 0.7938
Epoch 001: | Loss: 0.245110 | Acc: 0.9340
Epoch 002: | Loss: 0.132892 | Acc: 0.9673
Epoch 003: | Loss: 0.100629 | Acc: 0.9762
Epoch 004: | Loss: 0.085627 | Acc: 0.9801
Epoch 005: | Loss: 0.078762 | Acc: 0.9817
Epoch 006: | Loss: 0.071113 | Acc: 0.9833
Epoch 007: | Loss: 0.068036 | Acc: 0.9840
Epoch 008: | Loss: 0.063859 | Acc: 0.9845
Epoch 009: | Loss: 0.058819 | Acc: 0.9853
Epoch 010: | Loss: 0.052431 | Acc: 0.9862
Epoch 011: | Loss: 0.047454 | Acc: 0.9872
Epoch 012: | Loss: 0.042727 | Acc: 0.9884
Epoch 013: | Loss: 0.038584 | Acc: 0.9897
Epoch 014: | Loss: 0.032041 | Acc: 0.9919
Epoch 015: | Loss: 0.029138 | Acc: 0.9928
Epoch 016: | Loss: 0.024775 | Acc: 0.9940
Epoch 017: | Loss: 0.019388 | Acc: 0.9953
Epoch 018: | Loss: 0.017251 | Acc: 0.9960
Epoch 019: | Loss: 0.014865 | Acc: 0.9966
Epoch 020: | Loss: 0.016057 | Acc: 0.9964
Epoch 021: | Loss: 0.013459 | Acc: 0.9972
Epoch 022: | Loss: 0.009250 | Acc: 0.9980
Epoch 023: | Loss: 0.009564 | Acc:

KeyboardInterrupt: 

In [6]:
def dcg_at_k(r, k):
    r = np.asfarray(r)[:k]
    if r.size:
        return np.sum((2**r - 1) / np.log2(np.arange(2, r.size + 2)))
    return 0.

def ndcg_at_k(r, k):
    dcg_max = dcg_at_k(sorted(r, reverse=True), k)
    if not dcg_max:
        return 0.
    return dcg_at_k(r, k) / dcg_max

In [7]:
# model test
train_transactions = pd.read_csv(dataDir/'train_transactions_large.csv')
test_transactions = pd.read_csv(dataDir/'test_transactions_large.csv')
article_data = pd.read_csv(dataDir/'article_data_large.csv')
customer_data = pd.read_csv(dataDir/'customer_data_large.csv')

all_products_id = article_data["article_id"].unique()
customer_product_test = set(zip(test_transactions["customer_id"], test_transactions["article_id"], 
                                test_transactions["club_member_status"], test_transactions["age"], 
                                test_transactions["product_group_name"], test_transactions["colour_group_name"], 
                                test_transactions["index_name"]))

# set up dataset for training
num_users = len(train_transactions.customer_id.unique())
print("num_users:", num_users)
num_items = len(all_products_id)
print("num_items:", num_items)
num_product_groups = len(train_transactions.product_group_name.unique())
print("num_product_groups:", num_product_groups)
num_color_groups = len(train_transactions.colour_group_name.unique())
print("num_color_groups:", num_color_groups)
num_index_name = len(train_transactions.index_name.unique())
print("num_index_name:", num_index_name)

# Dict of all items that are interacted with by each user
user_interacted_items = train_transactions.groupby('customer_id')['article_id'].apply(list).to_dict()

model = NCF(num_users=num_users, num_items=num_items, num_product_groups=num_product_groups, num_color_groups=num_color_groups, num_index_name=num_index_name)

model.load_state_dict(torch.load('best_model_large.pt'))
device = torch.device("cuda:0" if torch.cuda.is_available() else "cpu")
model.to(device)
print(device)

model.eval()
hits_10 = []
hits_5 = []
pred_prob = []
recall = []
ndcgs = []

with torch.no_grad():
    for customer, product, club_status, age_groups, product_groups, color_groups, index_name in tqdm(customer_product_test):
        
        # select 29 products from item set that customer has no interactions
        interacted_items = user_interacted_items[customer]
        interacted_items = interacted_items + [product]
        not_interacted_items = set(all_products_id) - set(interacted_items)
        selected_not_interacted = list(np.random.choice(list(not_interacted_items), 10, replace=False))
        test_items = [product] + selected_not_interacted

        # get the other product features based on the selected test items
        product_groups_batch = torch.tensor(article_data.loc[article_data["article_id"].isin(test_items)]["product_group_name"].to_numpy()).reshape(-1, 1).to(device)
        color_groups_batch = torch.tensor(article_data.loc[article_data["article_id"].isin(test_items)]["colour_group_name"].to_numpy()).reshape(-1, 1).to(device)
        index_name_batch = torch.tensor(article_data.loc[article_data["article_id"].isin(test_items)]["index_name"].to_numpy()).reshape(-1, 1).to(device)

        test_items = torch.tensor(test_items).reshape(-1, 1).to(device)
        customer_batch = torch.tensor([customer]*11).reshape(-1, 1).to(device)
        club_status_batch = torch.tensor([club_status]*11).reshape(-1, 1).to(device)
        age_groups_batch = torch.tensor([age_groups]*11).reshape(-1, 1).to(device)

        y_single_pred = model(torch.tensor(customer).to(device), torch.tensor(product).to(device), torch.tensor(club_status).to(device), torch.tensor(age_groups).to(device), torch.tensor(product_groups).to(device), torch.tensor(color_groups).to(device), torch.tensor(index_name).to(device))
        y_pred = model(customer_batch, test_items, club_status_batch, age_groups_batch, product_groups_batch, color_groups_batch, index_name_batch)
#         y_pred = torch.sigmoid(y_pred)
        y_pred = y_pred.squeeze()

        top10_probs, top10_indices = torch.topk(y_pred, 10)
        # Convert the top 10 probabilities to a list
        top10_items = [test_items[i].item() for i in top10_indices]
        if product in top10_items:
            hits_10.append(1)
        else:
            hits_10.append(0)

        top5_probs, topk_indices = torch.topk(y_pred, 5)
        # Convert the top 5 probabilities to a list
        top5_items = [test_items[i].item() for i in topk_indices]
        if product in top5_items:
            hits_5.append(1)
        else:
            hits_5.append(0)

        
        pred_prob.append(torch.sigmoid(y_single_pred))
        if torch.round(torch.sigmoid(y_single_pred)) == 1:
            recall.append(1)
        else:
            recall.append(0)
        
        # Calculate the NDCG
        relevance_scores = np.zeros(11)
        relevance_scores[0] = 1  # The first item is the ground truth (relevant) item
        top10_items = [test_items[i].item() for i in top10_indices]
        ndcg_score = ndcg_at_k(relevance_scores[top10_indices.cpu().numpy()], 10)
        ndcgs.append(ndcg_score)

            
print("The Hit Rate@5 is {:.3f}".format(np.average(hits_5)))
print("The Hit Rate@10 is {:.3f}".format(np.average(hits_10)))
print("The average NDCG@10 is {:.3f}".format(np.average(ndcgs)))
print("The Recall is {:.3f}".format(np.average(recall)))

num_users: 10346
num_items: 15516
num_product_groups: 11
num_color_groups: 48
num_index_name: 9
cuda:0


  0%|          | 0/10338 [00:00<?, ?it/s]

The Hit Rate@5 is 0.405
The Hit Rate@10 is 0.878
The average NDCG@10 is 0.392
The Recall is 0.982
