In [1]:
import os
os.environ['CUDA_LAUNCH_BLOCKING'] = "1"

import pandas as pd
import numpy as np
from pathlib import Path
from tqdm.notebook import tqdm

import torch
import torch.nn as nn
from torch.nn import functional as F
from torch.utils.data import Dataset, DataLoader
from sklearn.preprocessing import StandardScaler, LabelBinarizer

pd.options.mode.chained_assignment = None
dataDir = Path.cwd().parent.parent.parent/'backup/HM_data'
np.random.seed(66)

transactions_usecols = ["t_dat", "customer_id", "article_id"]
articles_usecols = ["article_id", "product_group_name", "colour_group_name", "index_name"]
customers_usecols = ["customer_id", "club_member_status", "age"]

transactions = pd.read_csv(dataDir/'transactions.csv', usecols=transactions_usecols)
articles = pd.read_csv(dataDir/'articles.csv', usecols=articles_usecols)
customers = pd.read_csv(dataDir/'customers.csv', usecols=customers_usecols)

In [422]:
len(transactions.customer_id.unique())

1362281

In [7]:
""" Preprocessing on articles and customers data
    Articles: product_group_name, colour_group_name, index_name
    Customers: club_member_status, age
"""

articles = articles.loc[articles.product_group_name != "Unknown"]
articles = articles.loc[articles.colour_group_name != "Unknown"]
customers.dropna(axis=0, how='any', subset=["club_member_status", "age"], inplace=True)
# filter out transactions data with articles and customers ID
transactions = transactions.loc[transactions['article_id'].isin(articles.article_id.unique())]
transactions = transactions.loc[transactions['customer_id'].isin(customers.customer_id.unique())]

# filter out duplicated user-item interactions within transactions data
transactions.drop_duplicates(subset=["customer_id", "article_id"], inplace=True)

In [8]:
customers_list = transactions.customer_id.unique()

# randomly select part of the transaction data
rand_userIds = np.random.choice(customers_list, 
                                size=int(len(customers_list)*0.003), 
                                replace=False)
transactions = transactions.loc[transactions['customer_id'].isin(rand_userIds)]

In [9]:
""" Ensure that dataset contain sufficient user-item interactions to avoid cold-start problem """

# avoid cold-start problem
min_article_interactions = 3
min_customer_interactions = 5
transactions["interaction"] = 1

while True:
    # Filter articles with more than min_article_interactions interactions
    comb_transactions_article = transactions[["article_id", "interaction"]].groupby(by=["article_id"], sort=False, as_index=False).sum(["interaction"])
    comb_transactions_article = comb_transactions_article.loc[comb_transactions_article.interaction >= min_article_interactions]
    articles_list = comb_transactions_article.article_id.unique()
    
    # Filter customers with more than min_customer_interactions interactions
    comb_transactions_customer = transactions[["customer_id", "interaction"]].groupby(by=["customer_id"], sort=False, as_index=False).sum(["interaction"])
    comb_transactions_customer = comb_transactions_customer.loc[comb_transactions_customer.interaction >= min_customer_interactions]
    customers_list = comb_transactions_customer.customer_id.unique()
    
    # Apply the filters
    filtered_transactions = transactions.loc[transactions['article_id'].isin(articles_list) & transactions['customer_id'].isin(customers_list)]
    
    # Check if the filtered transactions are the same as the original transactions
    if len(filtered_transactions) == len(transactions):
        break
    else:
        transactions = filtered_transactions

print('There are {} rows of data from {} users (users with suffication data)'.format(len(transactions), len(transactions.customer_id.unique())))

There are 48316 rows of data from 2152 users (users with suffication data)


In [10]:
# binarize categorical features club_member_status and sales_channel_id in customer data
lb = LabelBinarizer()
customers.club_member_status = lb.fit_transform(customers.club_member_status)
# standardize numerical features age and price in transaction data
std = StandardScaler()
customers[["age"]] = std.fit_transform(customers[["age"]])

# merge transaction data with article and customer data
transactions = transactions.merge(customers, how='left', left_on=["customer_id"], right_on=["customer_id"])
transactions = transactions.merge(articles, how='left', left_on=["article_id"], right_on=["article_id"])

### Still exist cold-start problem, it could be further relieved by external information

In [11]:
# Get the value_counts for the 'article_id' column
article_id_counts = transactions['article_id'].value_counts()
print(len(article_id_counts))
# Filter the article_ids with count 1
articles_with_count_1 = article_id_counts[article_id_counts == 1].index.tolist()
print(len(articles_with_count_1))

9006
0


In [12]:
article_id_counts

706016001    83
372860001    75
706016002    71
610776002    60
351484002    56
             ..
627197001     3
708258002     3
752379002     3
407653002     3
911981001     3
Name: article_id, Length: 9006, dtype: int64

In [13]:
# fetch IDs from filtered transaction data
trans_article_ids = transactions.article_id.unique()
trans_customer_ids = transactions.customer_id.unique()
# get the articles and customers data based on the IDs from filtered transaction data
article_data = articles.loc[articles.article_id.isin(trans_article_ids)].copy()
customer_data = customers.loc[customers.customer_id.isin(trans_customer_ids)].copy()

"""reindex"""
# map string type customer_id to int type
customer_mapper = {}
customer_keys = transactions.customer_id.unique()
customer_values = list(range(len(transactions.customer_id.unique())))
customer_mapper.update(zip(customer_keys, customer_values))

# map string type article_id to int type
product_mapper = {}
product_keys = transactions.article_id.unique()
product_values = list(range(len(transactions.article_id.unique())))
product_mapper.update(zip(product_keys, product_values))

# map color_group_name to int type
color_mapper = {}
color_keys = transactions.colour_group_name.unique()
color_values = list(range(len(transactions.colour_group_name.unique())))
color_mapper.update(zip(color_keys, color_values))

# map color_group_name to int type
product_group_mapper = {}
product_group_keys = transactions.product_group_name.unique()
product_group_values = list(range(len(transactions.product_group_name.unique())))
product_group_mapper.update(zip(product_group_keys, product_group_values))

# map index_name to int type
index_name_mapper = {}
index_name_keys = transactions.index_name.unique()
index_name_values = list(range(len(transactions.index_name.unique())))
index_name_mapper.update(zip(index_name_keys, index_name_values))

# reindex categorical features based on feature mappers
transactions["customer_id"] = transactions["customer_id"].map(customer_mapper)
transactions["article_id"] = transactions["article_id"].map(product_mapper)
transactions["colour_group_name"] = transactions["colour_group_name"].map(color_mapper)
transactions["product_group_name"] = transactions["product_group_name"].map(product_group_mapper)
transactions["index_name"] = transactions["index_name"].map(index_name_mapper)

# reindex features in article and customer dataset
article_data["article_id"] = article_data["article_id"].map(product_mapper)
article_data["colour_group_name"] = article_data["colour_group_name"].map(color_mapper)
article_data["product_group_name"] = article_data["product_group_name"].map(product_group_mapper)
article_data["index_name"] = article_data["index_name"].map(index_name_mapper)
customer_data["customer_id"] = customer_data["customer_id"].map(customer_mapper)

# training set and test set
transactions['rank_latest'] = transactions.groupby(['customer_id'])['t_dat'].rank(method='first', ascending=False)
train_transactions = transactions[transactions['rank_latest'] != 1]
test_transactions = transactions[transactions['rank_latest'] == 1]

# drop articles that do not exist in training set
test_product_list = list(set(test_transactions.article_id.unique()) & set(train_transactions.article_id.unique()))
test_transactions = test_transactions.loc[test_transactions['article_id'].isin(test_product_list)]

# drop columns that we no longer need
drop_cols = ["t_dat", "interaction", "rank_latest"]
train_transactions.drop(labels=drop_cols, axis=1, inplace=True)
test_transactions.drop(labels=drop_cols, axis=1, inplace=True)

# get a list of all articles id
all_products_id = transactions["article_id"].unique()

train_transactions.head()

Unnamed: 0,customer_id,article_id,club_member_status,age,product_group_name,colour_group_name,index_name
0,0,0,1,1.230802,0,0,0
1,0,1,1,1.230802,0,1,1
2,0,2,1,1.230802,1,2,2
3,1,3,1,0.043197,2,3,1
4,2,4,1,-0.585535,0,4,3


In [14]:
len(all_products_id)

9006

In [431]:
# article_data.to_csv(dataDir/"article_data_large.csv", index=False)
# customer_data.to_csv(dataDir/"customer_data_large.csv", index=False)
# train_transactions.to_csv(dataDir/"train_transactions_large.csv", index=False)
# test_transactions.to_csv(dataDir/"test_transactions_large.csv", index=False)

In [432]:
from scipy.sparse import csr_matrix

def sparse_to_tensor(sparse_matrix):
    """
    Transform scipy coo matrix to pytorch sparse tensor
    """
    sparse_matrix = sparse_matrix.tocoo()
    values = sparse_matrix.data
    indices = (sparse_matrix.row, sparse_matrix.col) # np.vstack
    shape = sparse_matrix.shape

    i = torch.LongTensor(indices)
    v = torch.DoubleTensor(values)
    s = torch.Size(shape)

    return torch.sparse.DoubleTensor(i, v, s)

def sparse_batch_collate(batch): 
    """
    Collate function which to transform scipy csr matrix to pytorch sparse tensor
    """
    # batch[0] since it is returned as a one element list

    customer_batch, product_batch, targets_batch = batch[0]
    
    if type(customer_batch[0]) == csr_matrix:
        customer_batch = customer_batch.tocoo() # removed vstack
        customer_batch = sparse_to_tensor(customer_batch)
    else:
        customer_batch = torch.DoubleTensor(customer_batch)

    if type(product_batch[0]) == csr_matrix:
        product_batch = product_batch.tocoo() # removed vstack
        product_batch = sparse_to_tensor(product_batch)
    else:
        product_batch = torch.DoubleTensor(product_batch)
    
    if type(targets_batch[0]) == csr_matrix:
        targets_batch = targets_batch.tocoo() # removed vstack
        targets_batch = sparse_to_tensor(targets_batch)
    else:
        targets_batch = torch.DoubleTensor(targets_batch)
    
    return customer_batch, product_batch, targets_batch

In [15]:
class HMSaleTrainDataLoader(Dataset):
    """HMSaleTrainDataLoader Training set of HM sales data

    Args:
        transactions (pd.DataFrame): Dataframe of transaction records
        all_products_id (list): A list contains all product ids
    """
    def __init__(self, transactions, all_products_id, all_customer_product_set):
        self.customers, self.products, self.club_status, self.age_groups, self.product_groups, self.color_groups, \
        self.index_name, self.labels = self.get_dataset(transactions, all_products_id, all_customer_product_set)

    def __len__(self):
        return len(self.customers)
    
    def __getitem__(self, idx):
        return self.customers[idx], self.products[idx], self.club_status[idx], self.age_groups[idx], self.product_groups[idx], \
        self.color_groups[idx], self.index_name[idx], self.labels[idx]
    
    def get_dataset(self, transactions, all_products_id, all_customer_product_set):
        customers, products, club_status, age_groups, product_groups, color_groups, index_name, labels  = [], [], [], [], [], [], [], []
        
        """negative sampling"""
        # set up negative:positive ratio as 4:1
        negative_samples = 4
        customer_product_set = set(zip(transactions["customer_id"], transactions["article_id"], 
                                       transactions["club_member_status"], transactions["age"], 
                                       transactions["product_group_name"], transactions["colour_group_name"], transactions["index_name"]))

        for u, i, club, age, product, color, index in tqdm(customer_product_set):
            customers.append(u)
            products.append(i)
            club_status.append(club)
            age_groups.append(age)
            product_groups.append(product)
            color_groups.append(color)
            index_name.append(index)
            labels.append(1)
            for _ in range(negative_samples):
                negative_product = np.random.choice(all_products_id)
                while (u, negative_product, club, age, product, color, index) in all_customer_product_set:
                    negative_product = np.random.choice(all_products_id)
                customers.append(u)
                products.append(negative_product)
                club_status.append(club)
                age_groups.append(age)
                product_groups.append(product)
                color_groups.append(color)
                index_name.append(index)
                labels.append(0)
        return customers, products, club_status, age_groups, product_groups, color_groups, index_name, torch.tensor(labels)

def binary_acc(y_pred, y_test):
    y_pred_label = torch.softmax(y_pred, dim=1)
    _, y_pred_label = torch.max(y_pred_label, dim = 1)
    correct_pred = (y_pred_label == y_test).sum()
    acc = correct_pred/y_test.shape[0]
    return acc  
    

In [16]:
class NCF(nn.Module):
    """NCF - Neural Collaborative Filtering proposed by He et al.

    Args:
        num_users (int): Number of users
        num_items (iut): Number of products
        num_product_groups (int): Number of product groups
        num_color_groups: (int): Number of color groups
        num_index_name: (int): Number of index name
    """
    def __init__(
            self, 
            num_users: int, 
            num_items: int,
            num_product_groups: int,
            num_color_groups: int,
            num_index_name: int,
            user_embedding_dim: int = 16,
            item_embedding_dim: int = 32,
            product_group_embedding_dim: int = 8,
            color_group_embedding_dim: int = 16,
            index_name_embedding_dim: int = 6,
            input_size: int = 80,
            hidden_size_1: int = 128,
            hidden_size_2: int = 256,
            output_size: int = 1,
        ):
            super().__init__()
            # embedding layers for categorical features and user-item interaction
            self.user_embedding_layer = nn.Embedding(num_embeddings=num_users, embedding_dim=user_embedding_dim)
            self.item_embedding_layer = nn.Embedding(num_embeddings=num_items, embedding_dim=item_embedding_dim)
            self.product_group_embedding_layer = nn.Embedding(num_embeddings=num_product_groups, embedding_dim=product_group_embedding_dim)
            self.color_group_embedding_layer = nn.Embedding(num_embeddings=num_color_groups, embedding_dim=color_group_embedding_dim)
            self.index_name_embedding_layer = nn.Embedding(num_embeddings=num_index_name, embedding_dim=index_name_embedding_dim)

            self.relu = nn.LeakyReLU()
            if user_embedding_dim and item_embedding_dim is not None:
                in_channels = (
                    [input_size] 
                    + [hidden_size_2]
                    + [hidden_size_2]
                    + [hidden_size_1]
                    + [output_size]
                )
            else:
                raise ValueError
            self.fcs = nn.Sequential(
                *[nn.Linear(in_features=in_channels[i], out_features=in_channels[i+1]) for i in range(len(in_channels)-1) if i != len(in_channels)-1]
            )
            
            # in_channels_1 = (
            #     [48] 
            #     + [hidden_size_2]
            #     + [output_size]
            # )
            # in_channels_2 = (
            #     [30] 
            #     + [hidden_size_1]
            #     + [16]
            # )
            
            # self.encoder1 = nn.Sequential(
            #     *[nn.Linear(in_features=in_channels_1[i], out_features=in_channels_1[i+1]) for i in range(len(in_channels_1)-1) if i != len(in_channels_1)-1]
            # )    
            
            # self.encoder2 = nn.Linear(in_features=2, out_features=5, bias=True)
            
            # self.encoder3 = nn.Sequential(
            # *[nn.Linear(in_features=in_channels_2[i], out_features=in_channels_2[i+1]) for i in range(len(in_channels_2)-1) if i != len(in_channels_2)-1]
            # )
            
    
    def forward(self, user_input, item_input, club_status, age_groups, product_groups, color_groups, index_name):

        user_embedding = self.user_embedding_layer(user_input)
        item_embedding = self.item_embedding_layer(item_input)
        product_group_embedding = self.product_group_embedding_layer(product_groups)
        color_group_embedding = self.color_group_embedding_layer(color_groups)
        index_name_embedding = self.index_name_embedding_layer(index_name)
        concat_embedding = torch.cat([user_embedding, item_embedding, product_group_embedding, color_group_embedding, index_name_embedding], dim=-1)
        
        latent_vec_1 = torch.cat([user_embedding, item_embedding], dim=-1)
        # latent_vec_1 = self.encoder1(latent_vec_1)

        latent_vec_2 = torch.cat([club_status.reshape(-1, 1), age_groups.reshape(-1, 1)], dim=-1)
        # latent_vec_2 = self.encoder2(latent_vec_2)

        latent_vec_3 = torch.cat([product_group_embedding, color_group_embedding, index_name_embedding], dim=-1)
        # latent_vec_3 = self.encoder3(latent_vec_3)

        concat_embedding = torch.cat([torch.squeeze(latent_vec_1), torch.squeeze(latent_vec_2), torch.squeeze(latent_vec_3)], dim=-1)
        
#         concat_input = torch.cat([prices.reshape(-1, 1), sales_channels.reshape(-1, 1), club_status.reshape(-1, 1), age_groups.reshape(-1, 1)], dim=-1)
#         concat_embedding = torch.cat([concat_embedding, concat_input], dim=-1)

        for fc_layer in self.fcs:
            concat_embedding = fc_layer(concat_embedding)
            concat_embedding = self.relu(concat_embedding)
        #pred = F.softmax(concat_embedding)
        return concat_embedding
    

In [17]:
def binary_acc(y_pred, y_test):
    acc = 0.0
    
    y_pred_label = torch.round(torch.sigmoid(y_pred))
    correct_pred = (y_pred_label == y_test).sum()
    acc = correct_pred.item()/y_test.shape[0]
    return acc

In [18]:
train_transactions = pd.read_csv(dataDir/'train_transactions_medium.csv')
# get a list of all articles id
all_products_id = transactions["article_id"].unique()
customer_product_set = set(zip(transactions["customer_id"], transactions["article_id"], 
                               transactions["club_member_status"], transactions["age"], 
                               transactions["product_group_name"], transactions["colour_group_name"], transactions["index_name"]))

# set up hyper-parameters
learning_rate = 0.005
epoch = 120
device = torch.device("cuda:0" if torch.cuda.is_available() else "cpu")
print("device:", device)
# device = torch.device("cpu")

torch.autograd.set_detect_anomaly(True)

# set up dataset for training
num_users = len(train_transactions.customer_id.unique())
print("num_users:", num_users)
num_items = len(all_products_id)
print("num_items:", num_items)
num_product_groups = len(train_transactions.product_group_name.unique())
print("num_product_groups:", num_product_groups)
num_color_groups = len(train_transactions.colour_group_name.unique())
print("num_color_groups:", num_color_groups)
num_index_name = len(train_transactions.index_name.unique())
print("num_index_name:", num_index_name)
train_data = HMSaleTrainDataLoader(train_transactions, all_products_id, customer_product_set)
train_loader = DataLoader(train_data, batch_size=512, shuffle=True)

device: cuda:0
num_users: 2152
num_items: 9006
num_product_groups: 11
num_color_groups: 48
num_index_name: 9


  0%|          | 0/46164 [00:00<?, ?it/s]

In [441]:
# initiate model for training
model = NCF(num_users=num_users, num_items=num_items, num_product_groups=num_product_groups, num_color_groups=num_color_groups, num_index_name=num_index_name)
model.to(device)
print(model)
optimizer = torch.optim.Adam(model.parameters(), lr=learning_rate)
weights = torch.tensor([1.0, 4.0]).to(device)
loss_fn = nn.BCEWithLogitsLoss(pos_weight=weights[1])
model.train()
best_acc = 0.0  
for e in tqdm(range(epoch)):
    epoch_loss = 0.0
    epoch_acc = 0.0
    for customer_batch, product_batch, club_status_batch, age_groups_batch, product_groups_batch, color_groups_batch, index_name_batch, label_batch in train_loader:
        customer_batch, product_batch, club_status_batch, age_groups_batch, product_groups_batch, color_groups_batch, index_name_batch, label_batch \
        = customer_batch.to(device), product_batch.to(device), club_status_batch.to(device), age_groups_batch.to(device), \
        product_groups_batch.to(device), color_groups_batch.to(device), index_name_batch.to(device), label_batch.to(device), 
        optimizer.zero_grad()
        y_pred = model(customer_batch, product_batch, club_status_batch.float(), age_groups_batch.float(), product_groups_batch, color_groups_batch, index_name_batch)
        loss = loss_fn(y_pred, label_batch.reshape(-1, 1).float())
        acc = binary_acc(y_pred, label_batch.reshape(-1, 1).float())
        loss.backward()
        optimizer.step()
        epoch_loss += loss.item()
        epoch_acc += acc
        cur_acc = epoch_acc/len(train_loader)
    if cur_acc > best_acc:
        best_acc = cur_acc
        torch.save(model.state_dict(), 'best_model.pt')
    print(f'Epoch {e+0:03}: | Loss: {epoch_loss/len(train_loader):.6f} | Acc: {epoch_acc/len(train_loader):.4f}')

print(f'\nBest Accuracy: {best_acc:.3f}')


NCF(
  (user_embedding_layer): Embedding(2152, 16)
  (item_embedding_layer): Embedding(9006, 32)
  (product_group_embedding_layer): Embedding(11, 8)
  (color_group_embedding_layer): Embedding(48, 16)
  (index_name_embedding_layer): Embedding(9, 6)
  (relu): LeakyReLU(negative_slope=0.01)
  (fcs): Sequential(
    (0): Linear(in_features=80, out_features=256, bias=True)
    (1): Linear(in_features=256, out_features=256, bias=True)
    (2): Linear(in_features=256, out_features=128, bias=True)
    (3): Linear(in_features=128, out_features=1, bias=True)
  )
)


  0%|          | 0/120 [00:00<?, ?it/s]

Epoch 000: | Loss: 1.105720 | Acc: 0.7943
Epoch 001: | Loss: 0.959788 | Acc: 0.7599
Epoch 002: | Loss: 0.673682 | Acc: 0.8108
Epoch 003: | Loss: 0.461173 | Acc: 0.8702
Epoch 004: | Loss: 0.329104 | Acc: 0.9084
Epoch 005: | Loss: 0.250944 | Acc: 0.9325
Epoch 006: | Loss: 0.198203 | Acc: 0.9492
Epoch 007: | Loss: 0.161478 | Acc: 0.9598
Epoch 008: | Loss: 0.142326 | Acc: 0.9647
Epoch 009: | Loss: 0.127131 | Acc: 0.9686
Epoch 010: | Loss: 0.110005 | Acc: 0.9734
Epoch 011: | Loss: 0.097026 | Acc: 0.9762
Epoch 012: | Loss: 0.091868 | Acc: 0.9783
Epoch 013: | Loss: 0.087041 | Acc: 0.9792
Epoch 014: | Loss: 0.081309 | Acc: 0.9804
Epoch 015: | Loss: 0.075202 | Acc: 0.9817
Epoch 016: | Loss: 0.064519 | Acc: 0.9844
Epoch 017: | Loss: 0.066544 | Acc: 0.9842
Epoch 018: | Loss: 0.053912 | Acc: 0.9870
Epoch 019: | Loss: 0.056987 | Acc: 0.9866
Epoch 020: | Loss: 0.054893 | Acc: 0.9871
Epoch 021: | Loss: 0.047375 | Acc: 0.9885
Epoch 022: | Loss: 0.045348 | Acc: 0.9897
Epoch 023: | Loss: 0.045707 | Acc:

In [57]:
# model test
train_transactions = pd.read_csv(dataDir/'train_transactions_medium.csv')
test_transactions = pd.read_csv(dataDir/'test_transactions_medium.csv')
article_data = pd.read_csv(dataDir/'article_data_medium.csv')
customer_data = pd.read_csv(dataDir/'customer_data_medium.csv')

all_products_id = article_data["article_id"].unique()
customer_product_test = set(zip(test_transactions["customer_id"], test_transactions["article_id"], 
                                test_transactions["club_member_status"], test_transactions["age"], 
                                test_transactions["product_group_name"], test_transactions["colour_group_name"], 
                                test_transactions["index_name"]))

# Dict of all items that are interacted with by each user
user_interacted_items = train_transactions.groupby('customer_id')['article_id'].apply(list).to_dict()

model = NCF(num_users=num_users, num_items=num_items, num_product_groups=num_product_groups, num_color_groups=num_color_groups, num_index_name=num_index_name)

model.load_state_dict(torch.load('best_model.pt'))
device = torch.device("cuda:0" if torch.cuda.is_available() else "cpu")
model.to(device)
print(device)

model.eval()
hits = []
pred_prob = []
acc = []
with torch.no_grad():
    for customer, product, club_status, age_groups, product_groups, color_groups, index_name in tqdm(customer_product_test):
        
        # select 29 products from item set that customer has no interactions
        interacted_items = user_interacted_items[customer]
        interacted_items = interacted_items + [product]
        not_interacted_items = set(all_products_id) - set(interacted_items)
        selected_not_interacted = list(np.random.choice(list(not_interacted_items), 49, replace=False))
        test_items = [product] + selected_not_interacted

        # get the other product features based on the selected test items
        product_groups_batch = torch.tensor(article_data.loc[article_data["article_id"].isin(test_items)]["product_group_name"].to_numpy()).reshape(-1, 1).to(device)
        color_groups_batch = torch.tensor(article_data.loc[article_data["article_id"].isin(test_items)]["colour_group_name"].to_numpy()).reshape(-1, 1).to(device)
        index_name_batch = torch.tensor(article_data.loc[article_data["article_id"].isin(test_items)]["index_name"].to_numpy()).reshape(-1, 1).to(device)

        test_items = torch.tensor(test_items).reshape(-1, 1).to(device)
        customer_batch = torch.tensor([customer]*50).reshape(-1, 1).to(device)
        club_status_batch = torch.tensor([club_status]*50).reshape(-1, 1).to(device)
        age_groups_batch = torch.tensor([age_groups]*50).reshape(-1, 1).to(device)

        y_single_pred = model(torch.tensor(customer).to(device), torch.tensor(product).to(device), torch.tensor(club_status).to(device), torch.tensor(age_groups).to(device), torch.tensor(product_groups).to(device), torch.tensor(color_groups).to(device), torch.tensor(index_name).to(device))
        y_pred = model(customer_batch, test_items, club_status_batch, age_groups_batch, product_groups_batch, color_groups_batch, index_name_batch)
        y_pred = torch.sigmoid(y_pred)
        y_pred = y_pred.squeeze()
        topk_probs, topk_indices = torch.topk(torch.sigmoid(y_pred), 10)
        # Convert the top 10 probabilities to a list
        # topk_probs_list = topk_probs.numpy().tolist()
        top10_items = [test_items[i].item() for i in topk_indices]

        if product in top10_items:
            hits.append(1)
        else:
            hits.append(0)
        
        pred_prob.append(torch.sigmoid(y_single_pred))
        if torch.round(torch.sigmoid(y_single_pred)) == 1:
            acc.append(1)
        else:
            acc.append(0)
    
print("The Hit Ratio@10 is {:.2f}".format(np.average(hits)))
print("The Accuracy is {:.2f}".format(np.average(acc)))

cuda:0


  0%|          | 0/2128 [00:00<?, ?it/s]

The Hit Ratio@10 is 0.68
The Accuracy is 0.91


In [58]:
# model test
train_transactions = pd.read_csv(dataDir/'train_transactions_medium.csv')
test_transactions = pd.read_csv(dataDir/'test_transactions_medium.csv')
article_data = pd.read_csv(dataDir/'article_data_medium.csv')
customer_data = pd.read_csv(dataDir/'customer_data_medium.csv')

all_products_id = article_data["article_id"].unique()
customer_product_test = set(zip(test_transactions["customer_id"], test_transactions["article_id"], 
                                test_transactions["club_member_status"], test_transactions["age"], 
                                test_transactions["product_group_name"], test_transactions["colour_group_name"], 
                                test_transactions["index_name"]))

# Dict of all items that are interacted with by each user
user_interacted_items = train_transactions.groupby('customer_id')['article_id'].apply(list).to_dict()

model = NCF(num_users=num_users, num_items=num_items, num_product_groups=num_product_groups, num_color_groups=num_color_groups, num_index_name=num_index_name)

model.load_state_dict(torch.load('best_model.pt'))
device = torch.device("cuda:0" if torch.cuda.is_available() else "cpu")
model.to(device)
print(device)

model.eval()
hits_5 = []
hits_10 = []
true_top5_label = torch.tensor([1] + [0]*4).to(device)
true_top10_label = torch.tensor([1] + [0]*9).to(device)
true_positives_5 = 0
false_positives_5 = 0
false_negatives_5 = 0
true_positives_10 = 0
false_positives_10 = 0
false_negatives_10 = 0

with torch.no_grad():
    for customer, product, club_status, age_groups, product_groups, color_groups, index_name in tqdm(customer_product_test):
        
        # select 29 products from item set that customer has no interactions
        interacted_items = user_interacted_items[customer]
        interacted_items = interacted_items + [product]
        not_interacted_items = set(all_products_id) - set(interacted_items)
        selected_not_interacted = list(np.random.choice(list(not_interacted_items), 39, replace=False))
        test_items = [product] + selected_not_interacted

        # get the other product features based on the selected test items
        product_groups_batch = torch.tensor(article_data.loc[article_data["article_id"].isin(test_items)]["product_group_name"].to_numpy()).reshape(-1, 1).to(device)
        color_groups_batch = torch.tensor(article_data.loc[article_data["article_id"].isin(test_items)]["colour_group_name"].to_numpy()).reshape(-1, 1).to(device)
        index_name_batch = torch.tensor(article_data.loc[article_data["article_id"].isin(test_items)]["index_name"].to_numpy()).reshape(-1, 1).to(device)

        test_items = torch.tensor(test_items).reshape(-1, 1).to(device)
        customer_batch = torch.tensor([customer]*40).reshape(-1, 1).to(device)
        club_status_batch = torch.tensor([club_status]*40).reshape(-1, 1).to(device)
        age_groups_batch = torch.tensor([age_groups]*40).reshape(-1, 1).to(device)

        y_pred = model(customer_batch, test_items, club_status_batch, age_groups_batch, product_groups_batch, color_groups_batch, index_name_batch)
        y_pred = torch.sigmoid(y_pred)
        y_pred = y_pred.squeeze()
        # select top 5, 10 items for evaluations
        top5_probs, top5_indices = torch.topk(torch.sigmoid(y_pred), 5)
        top10_probs, top10_indices = torch.topk(torch.sigmoid(y_pred), 10)

        # Convert the top 5, 10 probabilities to a list
        top5_items = [test_items[i].item() for i in top5_indices]
        top10_items = [test_items[i].item() for i in top10_indices]

        # Hit Rate@5
        if product in top5_items:
            hits_5.append(1)
        else:
            hits_5.append(0)

        # Hit Rate@10
        if product in top10_items:
            hits_10.append(1)
        else:
            hits_10.append(0)
        
        # Precision and Recall for k = 5, 10
        pred_top5_label = torch.round(torch.sigmoid(top5_probs))
        pred_top10_label = torch.round(torch.sigmoid(top10_probs))

        TP_5 = (pred_top5_label * true_top5_label).sum().item()
        FP_5 = (pred_top5_label * (1 - true_top5_label)).sum().item()
        FN_5 = ((1 - pred_top5_label) * true_top5_label).sum().item()
        TP_10 = (pred_top10_label * true_top10_label).sum().item()
        FP_10 = (pred_top10_label * (1 - true_top10_label)).sum().item()
        FN_10 = ((1 - pred_top10_label) * true_top10_label).sum().item()
        
        true_positives_5 += TP_5
        false_positives_5 += FP_5
        false_negatives_5 += FN_5
        true_positives_10 = TP_10
        false_positives_10 = FP_10
        false_negatives_10 = FN_10

precision_at_5 = true_positives_5 / (true_positives_5 + false_positives_5)
recall_at_5 = true_positives_5 / (true_positives_5 + false_negatives_5)
precision_at_10 = true_positives_10 / (true_positives_10 + false_positives_10)
recall_at_10 = true_positives_10 / (true_positives_10 + false_negatives_10)

print("The Hit Ratio@5 is {:.2f}".format(np.average(hits_5)))
print("The Precision@5 is {:.2f}".format(precision_at_5))
print("The Recall@5 is {:.2f}".format(recall_at_5))
print("The Hit Ratio@10 is {:.2f}".format(np.average(hits_10)))
print("The Precision@10 is {:.2f}".format(precision_at_10))
print("The Recall@10 is {:.2f}".format(recall_at_10))
          

cuda:0


  0%|          | 0/2128 [00:00<?, ?it/s]

The Hit Ratio@5 is 0.23
The Hit Ratio@10 is 0.87
