In [19]:
import os
os.environ['CUDA_LAUNCH_BLOCKING'] = "1"

import pandas as pd
import numpy as np
from pathlib import Path
from tqdm.notebook import tqdm

import torch
import torch.nn as nn
from torch.nn import functional as F
from torch.utils.data import Dataset, DataLoader
from sklearn.preprocessing import StandardScaler, LabelBinarizer

pd.options.mode.chained_assignment = None
dataDir = Path.cwd().parent/'Data'
np.random.seed(66)

articles_usecols = ["article_id", "product_group_name", "colour_group_name", "index_name"]
customers_usecols = ["customer_id", "club_member_status", "age"]

transactions = pd.read_csv(dataDir/'transactions.csv')
articles = pd.read_csv(dataDir/'articles.csv', usecols=articles_usecols)
customers = pd.read_csv(dataDir/'customers.csv', usecols=customers_usecols)

In [20]:
len(transactions.customer_id.unique())

1362281

In [21]:
""" Preprocessing on articles and customers data
    Articles: product_group_name, colour_group_name, index_name
    Customers: club_member_status, age
"""

articles = articles.loc[articles.product_group_name != "Unknown"]
articles = articles.loc[articles.colour_group_name != "Unknown"]
customers.dropna(axis=0, how='any', subset=["club_member_status", "age"], inplace=True)
# filter out transactions data with articles and customers ID
transactions = transactions.loc[transactions['article_id'].isin(articles.article_id.unique())]
transactions = transactions.loc[transactions['customer_id'].isin(customers.customer_id.unique())]

In [None]:
# calculate customer interactions in transaction data
# drop customers that only contain few interactions
transactions["interaction"] = 1
transactions_temp = transactions.drop_duplicates(subset=["customer_id", "article_id"])
comb_transactions = transactions_temp[["customer_id", "interaction"]].groupby(by=["customer_id"], sort=False, as_index=False).sum(["interaction"])
comb_transactions = comb_transactions.loc[comb_transactions.interaction >= 5]

# randomly select part of the transaction data
rand_userIds = np.random.choice(comb_transactions.customer_id.unique(), 
                                size=int(len(comb_transactions['customer_id'].unique())*0.0005), 
                                replace=False)

transactions = transactions.loc[transactions['customer_id'].isin(rand_userIds)]

print('There are {} rows of data from {} users (users with suffication data)'.format(len(transactions), len(rand_userIds)))

transactions.drop_duplicates(subset=["customer_id", "article_id"], keep="first", inplace=True)

# merge transaction data with article and customer data
transactions = transactions.merge(customers, how='left', left_on=["customer_id"], right_on=["customer_id"])
transactions = transactions.merge(articles, how='left', left_on=["article_id"], right_on=["article_id"])

In [5]:
# binarize categorical features club_member_status and sales_channel_id in transaction data
lb = LabelBinarizer()
transactions.sales_channel_id = lb.fit_transform(transactions.sales_channel_id)
transactions.club_member_status = lb.fit_transform(transactions.club_member_status)

# standardize numerical features age and price in transaction data
std = StandardScaler()
transactions[["price", "age"]] = std.fit_transform(transactions[["price", "age"]])

# training set and test set
transactions['rank_latest'] = transactions.groupby(['customer_id'])['t_dat'].rank(method='first', ascending=False)

train_transactions = transactions[transactions['rank_latest'] != 1]
test_transactions = transactions[transactions['rank_latest'] == 1]

# drop articles that do not exist in training set
test_product_list = list(set(test_transactions.article_id.unique()) & set(train_transactions.article_id.unique()))
test_transactions = test_transactions.loc[test_transactions['article_id'].isin(test_product_list)]

# drop columns that we no longer need
drop_cols = ["t_dat", "interaction", "rank_latest"]
train_transactions.drop(labels=drop_cols, axis=1, inplace=True)
test_transactions.drop(labels=drop_cols, axis=1, inplace=True)

"""reindex"""
# map string type customer_id to int type
customer_mapper = {}
customer_keys = train_transactions.customer_id.unique()
customer_values = list(range(len(train_transactions.customer_id.unique())))
customer_mapper.update(zip(customer_keys, customer_values))

# map string type article_id to int type
product_mapper = {}
product_keys = train_transactions.article_id.unique()
product_values = list(range(len(train_transactions.article_id.unique())))
product_mapper.update(zip(product_keys, product_values))

# map color_group_name to int type
color_mapper = {}
color_keys = train_transactions.colour_group_name.unique()
color_values = list(range(len(train_transactions.colour_group_name.unique())))
color_mapper.update(zip(color_keys, color_values))

# map color_group_name to int type
product_group_mapper = {}
product_group_keys = train_transactions.product_group_name.unique()
product_group_values = list(range(len(train_transactions.product_group_name.unique())))
product_group_mapper.update(zip(product_group_keys, product_group_values))

# map index_name to int type
index_name_mapper = {}
index_name_keys = train_transactions.index_name.unique()
index_name_values = list(range(len(train_transactions.index_name.unique())))
index_name_mapper.update(zip(index_name_keys, index_name_values))

# reindex categorical features based on feature mappers
train_transactions["customer_id"] = train_transactions["customer_id"].map(customer_mapper)
train_transactions["article_id"] = train_transactions["article_id"].map(product_mapper)
train_transactions["colour_group_name"] = train_transactions["colour_group_name"].map(color_mapper)
train_transactions["product_group_name"] = train_transactions["product_group_name"].map(product_group_mapper)
train_transactions["index_name"] = train_transactions["index_name"].map(index_name_mapper)
test_transactions["customer_id"] = test_transactions["customer_id"].map(customer_mapper)
test_transactions["article_id"] = test_transactions["article_id"].map(product_mapper)
test_transactions["colour_group_name"] = test_transactions["colour_group_name"].map(color_mapper)
test_transactions["product_group_name"] = test_transactions["product_group_name"].map(product_group_mapper)
test_transactions["index_name"] = test_transactions["index_name"].map(index_name_mapper)

# get a list of all articles id
all_products_id = train_transactions["article_id"].unique()

train_transactions.head()

Unnamed: 0,customer_id,article_id,price,sales_channel_id,club_member_status,age,product_group_name,colour_group_name,index_name
0,0,0,-0.28801,0,0,-0.9531,0,0,0
1,1,1,-0.712215,0,0,-0.717387,0,0,0
2,2,2,-0.966737,1,0,0.382604,1,1,1
3,2,3,-0.966737,1,0,0.382604,1,2,1
4,2,4,-0.966737,1,0,0.382604,1,0,1


In [6]:
len(train_transactions.article_id.unique())

16450

In [7]:
from scipy.sparse import csr_matrix

def sparse_to_tensor(sparse_matrix):
    """
    Transform scipy coo matrix to pytorch sparse tensor
    """
    sparse_matrix = sparse_matrix.tocoo()
    values = sparse_matrix.data
    indices = (sparse_matrix.row, sparse_matrix.col) # np.vstack
    shape = sparse_matrix.shape

    i = torch.LongTensor(indices)
    v = torch.DoubleTensor(values)
    s = torch.Size(shape)

    return torch.sparse.DoubleTensor(i, v, s)

def sparse_batch_collate(batch): 
    """
    Collate function which to transform scipy csr matrix to pytorch sparse tensor
    """
    # batch[0] since it is returned as a one element list

    customer_batch, product_batch, targets_batch = batch[0]
    
    if type(customer_batch[0]) == csr_matrix:
        customer_batch = customer_batch.tocoo() # removed vstack
        customer_batch = sparse_to_tensor(customer_batch)
    else:
        customer_batch = torch.DoubleTensor(customer_batch)

    if type(product_batch[0]) == csr_matrix:
        product_batch = product_batch.tocoo() # removed vstack
        product_batch = sparse_to_tensor(product_batch)
    else:
        product_batch = torch.DoubleTensor(product_batch)
    
    if type(targets_batch[0]) == csr_matrix:
        targets_batch = targets_batch.tocoo() # removed vstack
        targets_batch = sparse_to_tensor(targets_batch)
    else:
        targets_batch = torch.DoubleTensor(targets_batch)
    
    return customer_batch, product_batch, targets_batch

In [17]:
articles

Unnamed: 0,article_id,product_group_name,colour_group_name,index_name
0,108775015,Garment Upper body,Black,Ladieswear
1,108775044,Garment Upper body,White,Ladieswear
2,108775051,Garment Upper body,Off White,Ladieswear
3,110065001,Underwear,Black,Lingeries/Tights
4,110065002,Underwear,White,Lingeries/Tights
...,...,...,...,...
105537,953450001,Socks & Tights,Black,Menswear
105538,953763001,Garment Upper body,Black,Ladieswear
105539,956217002,Garment Full body,Black,Ladieswear
105540,957375001,Accessories,Black,Divided


In [16]:
articles["price"].loc[articles.article_id == 108775044]

KeyError: 'price'

In [8]:
class HMSaleTrainDataLoader(Dataset):
    """HMSaleTrainDataLoader Training set of HM sales data

    Args:
        transactions (pd.DataFrame): Dataframe of transaction records
        all_products_id (list): A list contains all product ids
    """
    def __init__(self, transactions, all_products_id, all_customer_product_set):
        self.customers, self.products, self.prices, self.sales_channels, \
        self.club_status, self.age_groups, self.product_groups, self.color_groups, \
        self.index_name, self.labels = self.get_dataset(transactions, all_products_id, all_customer_product_set)

    def __len__(self):
        return len(self.customers)
    
    def __getitem__(self, idx):
        return self.customers[idx], self.products[idx], self.prices[idx], self.sales_channels[idx], self.club_status[idx], \
               self.age_groups[idx], self.product_groups[idx], self.color_groups[idx], self.index_name[idx], self.labels[idx]
    
    def get_dataset(self, transactions, all_products_id, all_customer_product_set):
        customers, products, prices, sales_channels, club_status, age_groups, product_groups, color_groups, index_name, labels  = [], [], [], [], [], [], [], [], [], []
        
        """negative sampling"""
        # set up negative:positive ratio as 4:1
        negative_samples = 4
        customer_product_set = set(zip(transactions["customer_id"], transactions["article_id"], 
                                       transactions["price"], transactions["sales_channel_id"], 
                                       transactions["club_member_status"], transactions["age"], 
                                       transactions["product_group_name"], transactions["colour_group_name"], transactions["index_name"]))

        for u, i, price, sale, club, age, product, color, index in tqdm(customer_product_set):
            customers.append(u)
            products.append(i)
            sales_channels.append(sale)
            club_status.append(club)
            age_groups.append(age)
            product_groups.append(product)
            color_groups.append(color)
            index_name.append(index)
            labels.append(1)
            for _ in range(negative_samples):
                negative_product = np.random.choice(all_products_id)
                while (u, negative_product, price, sale, club, age, product, color, index) in all_customer_product_set:
                    negative_product = np.random.choice(all_products_id)
                customers.append(u)
                products.append(negative_product)
                prices.append(price)
                sales_channels.append(sale)
                club_status.append(club)
                age_groups.append(age)
                product_groups.append(product)
                color_groups.append(color)
                index_name.append(index)
                labels.append(0)
        return customers, products, prices, sales_channels, club_status, age_groups, product_groups, color_groups, index_name, torch.tensor(labels)

class HMSaleTestDataLoader(Dataset):
    """HMSaleTestataLoader Test set of HM sales data

    Args:
        transactions (pd.DataFrame): Dataframe of transaction records
        all_products_id (list): A list contains all product ids
    """
    def __init__(self, transactions, all_products_id):
        self.customers, self.products, self.prices, self.sales_channels, \
        self.club_status, self.age_groups, self.product_groups, self.color_groups, \
        self.index_name, self.labels = self.get_dataset(transactions, all_products_id)

    def __len__(self):
        return len(self.customers)
    
    def __getitem__(self, idx):
        return self.customers[idx], self.products[idx], self.prices[idx], self.sales_channels[idx], self.club_status[idx], \
               self.age_groups[idx], self.product_groups[idx], self.color_groups[idx], self.index_name[idx], self.labels[idx]
    
    def get_dataset(self, transactions, all_products_id):
        customers, products, prices, sales_channels, club_status, age_groups, product_groups, color_groups, index_name, labels  = [], [], [], [], [], [], [], [], [], []
        customer_product_set = set(zip(transactions["customer_id"], transactions["article_id"], 
                                       transactions["price"], transactions["sales_channel_id"], 
                                       transactions["club_member_status"], transactions["age"], 
                                       transactions["product_group_name"], transactions["colour_group_name"], transactions["index_name"]))
      
        for u, i, price, sale, club, age, product, color, index in tqdm(customer_product_set):
            customers.append(u)
            products.append(i)
            prices.append(price)
            sales_channels.append(sale)
            club_status.append(club)
            age_groups.append(age)
            product_groups.append(product)
            color_groups.append(color)
            index_name.append(index)
            labels.append(1)

        return torch.tensor(customers), torch.tensor(products), torch.tensor(prices), torch.tensor(sales_channels), torch.tensor(club_status), torch.tensor(age_groups), torch.tensor(product_groups), torch.tensor(color_groups), torch.tensor(index_name), torch.tensor(labels)    

def binary_acc(y_pred, y_test):
    y_pred_label = torch.softmax(y_pred, dim=1)
    _, y_pred_label = torch.max(y_pred_label, dim = 1)
    correct_pred = (y_pred_label == y_test).sum()
    acc = correct_pred/y_test.shape[0]
    return acc  
    

In [9]:
class NCF(nn.Module):
    """NCF - Neural Collaborative Filtering proposed by He et al.

    Args:
        num_users (int): Number of users
        num_items (iut): Number of products
        num_product_groups (int): Number of product groups
        num_color_groups: (int): Number of color groups
        num_index_name: (int): Number of index name
    """
    def __init__(
            self, 
            num_users: int, 
            num_items: int,
            num_product_groups: int,
            num_color_groups: int,
            num_index_name: int,
            user_embedding_dim: int = 16,
            item_embedding_dim: int = 32,
            product_group_embedding_dim: int = 8,
            color_group_embedding_dim: int = 16,
            index_name_embedding_dim: int = 6,
            input_size: int = 53,
            hidden_size_1: int = 64,
            hidden_size_2: int = 128,
            output_size: int = 32,
            num_classes: int = 2,
        ):
            super().__init__()
            # embedding layers for categorical features and user-item interaction
            self.user_embedding_layer = nn.Embedding(num_embeddings=num_users, embedding_dim=user_embedding_dim)
            self.item_embedding_layer = nn.Embedding(num_embeddings=num_items, embedding_dim=item_embedding_dim)
            self.product_group_embedding_layer = nn.Embedding(num_embeddings=num_product_groups, embedding_dim=product_group_embedding_dim)
            self.color_group_embedding_layer = nn.Embedding(num_embeddings=num_color_groups, embedding_dim=color_group_embedding_dim)
            self.index_name_embedding_layer = nn.Embedding(num_embeddings=num_index_name, embedding_dim=index_name_embedding_dim)

            self.relu = nn.LeakyReLU()
            if user_embedding_dim and item_embedding_dim is not None:
                in_channels = (
                    [input_size] 
                    + [hidden_size_2]
                    + [hidden_size_1]
                    + [num_classes]
                )
            else:
                raise ValueError
            self.fcs = nn.Sequential(
                *[nn.Linear(in_features=in_channels[i], out_features=in_channels[i+1]) for i in range(len(in_channels)-1) if i != len(in_channels)-1]
            )
            
            in_channels_1 = (
                [50] 
                + [hidden_size_2]
                + [output_size]
            )
            in_channels_2 = (
                [30] 
                + [hidden_size_1]
                + [16]
            )
            
            self.encoder1 = nn.Sequential(
                *[nn.Linear(in_features=in_channels_1[i], out_features=in_channels_1[i+1]) for i in range(len(in_channels_1)-1) if i != len(in_channels_1)-1]
            )    
            
            self.encoder2 = nn.Linear(in_features=2, out_features=5, bias=True)
            
            self.encoder3 = nn.Sequential(
            *[nn.Linear(in_features=in_channels_2[i], out_features=in_channels_2[i+1]) for i in range(len(in_channels_2)-1) if i != len(in_channels_2)-1]
            )
            
    
    def forward(self, user_input, item_input, prices, sales_channels, club_status, age_groups, product_groups, color_groups, index_name):

        user_embedding = self.user_embedding_layer(user_input)
        item_embedding = self.item_embedding_layer(item_input)
        product_group_embedding = self.product_group_embedding_layer(product_groups)
        color_group_embedding = self.color_group_embedding_layer(color_groups)
        index_name_embedding = self.index_name_embedding_layer(index_name)
        concat_embedding = torch.cat([user_embedding, item_embedding, product_group_embedding, color_group_embedding, index_name_embedding], dim=-1)
        
        latent_vec_1 = torch.cat([user_embedding, item_embedding, prices.reshape(-1, 1), sales_channels.reshape(-1, 1)], dim=-1)
        latent_vec_1 = self.encoder1(latent_vec_1)

        latent_vec_2 = torch.cat([club_status.reshape(-1, 1), age_groups.reshape(-1, 1)], dim=-1)
        latent_vec_2 = self.encoder2(latent_vec_2)

        latent_vec_3 = torch.cat([product_group_embedding, color_group_embedding, index_name_embedding], dim=-1)
        latent_vec_3 = self.encoder3(latent_vec_3)

        concat_embedding = torch.cat([latent_vec_1, latent_vec_2, latent_vec_3], dim=-1)
        
#         concat_input = torch.cat([prices.reshape(-1, 1), sales_channels.reshape(-1, 1), club_status.reshape(-1, 1), age_groups.reshape(-1, 1)], dim=-1)
#         concat_embedding = torch.cat([concat_embedding, concat_input], dim=-1)

        for fc_layer in self.fcs:
            concat_embedding = fc_layer(concat_embedding)
            concat_embedding = self.relu(concat_embedding)
        #pred = F.softmax(concat_embedding)
        return concat_embedding
    

In [10]:
def binary_acc(y_pred, y_test):
    acc = 0.0
    
    y_pred_label = torch.softmax(y_pred, dim=1)
    _, y_pred_label = torch.max(y_pred_label, dim = 1)
    correct_pred = (y_pred_label == y_test).sum()
    acc = correct_pred.item()/y_test.shape[0]
    return acc

In [11]:
train_transactions = pd.read_csv(dataDir/'medium_train.csv')
# get a list of all articles id
all_products_id = train_transactions["article_id"].unique()
customer_product_set = set(zip(transactions["customer_id"], transactions["article_id"], 
                               transactions["price"], transactions["sales_channel_id"], 
                               transactions["club_member_status"], transactions["age"], 
                               transactions["product_group_name"], transactions["colour_group_name"], transactions["index_name"]))

# set up hyper-parameters
learning_rate = 0.005
epoch = 80
device = torch.device("cuda:0" if torch.cuda.is_available() else "cpu")
# device = torch.device("cpu")

torch.autograd.set_detect_anomaly(True)

# set up dataset for training
num_users = len(train_transactions.customer_id.unique())
print("num_users:", num_users)
num_items = len(all_products_id)
print("num_items:", num_items)
num_product_groups = len(train_transactions.product_group_name.unique())
print("num_product_groups:", num_product_groups)
num_color_groups = len(train_transactions.colour_group_name.unique())
print("num_color_groups:", num_color_groups)
num_index_name = len(train_transactions.index_name.unique())
print("num_index_name:", num_index_name)
train_data = HMSaleTrainDataLoader(train_transactions, all_products_id, customer_product_set)
train_loader = DataLoader(train_data, batch_size=1024, shuffle=True)

num_users: 880
num_items: 16450
num_product_groups: 13
num_color_groups: 49
num_index_name: 10


  0%|          | 0/26416 [00:00<?, ?it/s]

In [23]:
len(train_data)

132080

In [19]:
len(all_products_id)

16450

In [6]:
# initiate model for training
model = NCF(num_users=num_users, num_items=num_items, num_product_groups=num_product_groups, num_color_groups=num_color_groups, num_index_name=num_index_name)
model.to(device)
print(model)
optimizer = torch.optim.Adam(model.parameters(), lr=learning_rate)
loss_fn = nn.CrossEntropyLoss()
model.train()
best_acc = 0.0
for e in tqdm(range(epoch)):
    epoch_loss = 0.0
    epoch_acc = 0.0
    for customer_batch, product_batch, prices_batch, sales_channels_batch, club_status_batch, age_groups_batch, product_groups_batch, color_groups_batch, index_name_batch, label_batch in train_loader:
        customer_batch, product_batch, prices_batch, sales_channels_batch, club_status_batch, \
        age_groups_batch, product_groups_batch, color_groups_batch, index_name_batch, label_batch \
        = customer_batch.to(device), product_batch.to(device), prices_batch.to(device), \
        sales_channels_batch.to(device), club_status_batch.to(device), age_groups_batch.to(device), \
        product_groups_batch.to(device), color_groups_batch.to(device), index_name_batch.to(device), label_batch.to(device), 
        optimizer.zero_grad()
        y_pred = model(customer_batch, product_batch, prices_batch.float(), sales_channels_batch.float(), club_status_batch.float(), age_groups_batch.float(), product_groups_batch, color_groups_batch, index_name_batch)
        loss = loss_fn(y_pred, label_batch)
        acc = binary_acc(y_pred, label_batch)
        loss.backward()
        optimizer.step()
        epoch_loss += loss.item()
        epoch_acc += acc
        cur_acc = epoch_acc/len(train_loader)
    if cur_acc > best_acc:
        best_acc = cur_acc
        torch.save(model.state_dict(), 'best_model.pt')
    print(f'Epoch {e+0:03}: | Loss: {epoch_loss/len(train_loader):.6f} | Acc: {epoch_acc/len(train_loader):.4f}')

print(f'\nBest Accuracy: {best_acc:.3f}')


num_users: 880
num_items: 16450
num_product_groups: 13
num_color_groups: 49
num_index_name: 10


  0%|          | 0/26416 [00:00<?, ?it/s]

NCF(
  (user_embedding_layer): Embedding(880, 16)
  (item_embedding_layer): Embedding(16450, 32)
  (product_group_embedding_layer): Embedding(13, 8)
  (color_group_embedding_layer): Embedding(49, 16)
  (index_name_embedding_layer): Embedding(10, 6)
  (relu): LeakyReLU(negative_slope=0.01)
  (fcs): Sequential(
    (0): Linear(in_features=53, out_features=128, bias=True)
    (1): Linear(in_features=128, out_features=64, bias=True)
    (2): Linear(in_features=64, out_features=2, bias=True)
  )
  (encoder1): Sequential(
    (0): Linear(in_features=50, out_features=128, bias=True)
    (1): Linear(in_features=128, out_features=32, bias=True)
  )
  (encoder2): Linear(in_features=2, out_features=5, bias=True)
  (encoder3): Sequential(
    (0): Linear(in_features=30, out_features=64, bias=True)
    (1): Linear(in_features=64, out_features=16, bias=True)
  )
)


  0%|          | 0/80 [00:00<?, ?it/s]

Epoch 000: | Loss: 0.510221 | Acc: 0.7953
Epoch 001: | Loss: 0.498899 | Acc: 0.8000
Epoch 002: | Loss: 0.484177 | Acc: 0.8012
Epoch 003: | Loss: 0.448124 | Acc: 0.8132
Epoch 004: | Loss: 0.399691 | Acc: 0.8364
Epoch 005: | Loss: 0.349653 | Acc: 0.8611
Epoch 006: | Loss: 0.306397 | Acc: 0.8817
Epoch 007: | Loss: 0.270558 | Acc: 0.8978
Epoch 008: | Loss: 0.239634 | Acc: 0.9107
Epoch 009: | Loss: 0.213120 | Acc: 0.9214
Epoch 010: | Loss: 0.192285 | Acc: 0.9287
Epoch 011: | Loss: 0.170711 | Acc: 0.9365
Epoch 012: | Loss: 0.152721 | Acc: 0.9434
Epoch 013: | Loss: 0.137393 | Acc: 0.9495
Epoch 014: | Loss: 0.121147 | Acc: 0.9545
Epoch 015: | Loss: 0.111538 | Acc: 0.9573
Epoch 016: | Loss: 0.099764 | Acc: 0.9608
Epoch 017: | Loss: 0.091812 | Acc: 0.9632
Epoch 018: | Loss: 0.082614 | Acc: 0.9665
Epoch 019: | Loss: 0.076051 | Acc: 0.9685
Epoch 020: | Loss: 0.070681 | Acc: 0.9700
Epoch 021: | Loss: 0.065541 | Acc: 0.9726
Epoch 022: | Loss: 0.063165 | Acc: 0.9741
Epoch 023: | Loss: 0.057886 | Acc:

In [28]:
def hit(gt_items, pred_items):
    count = 0
    for item in gt_items:
        if item in pred_items:
            count += 1
    return count / len(gt_items)

def ndcg(gt_items, pred_items):
    dcg, idcg = 0.0, 0.0
    for i, item in enumerate(pred_items[:len(gt_items)]):
        if item in gt_items:
            dcg += 1 / np.log2(i + 2)
            idcg += 1 / np.log2(i + 2)
    return dcg / idcg

In [None]:
customer_product_test = set(zip(test_transactions["customer_id"], test_transactions["article_id"], 
                                test_transactions["price"], test_transactions["sales_channel_id"], 
                                test_transactions["club_member_status"], test_transactions["age"], 
                                test_transactions["product_group_name"], test_transactions["colour_group_name"], test_transactions["index_name"]))

In [12]:
# Dict of all items that are interacted with by each user
user_interacted_items = transactions.groupby('customer_id')['article_id'].apply(list).to_dict()

In [45]:
# model test

test_transactions = pd.read_csv(dataDir/'medium_test.csv')

model = NCF(num_users=num_users, num_items=num_items, num_product_groups=num_product_groups, num_color_groups=num_color_groups, num_index_name=num_index_name)
test_data = HMSaleTestDataLoader(test_transactions, all_products_id, customer_product_set)
test_dataloader = DataLoader(test_data, batch_size=64, shuffle=True)

model.load_state_dict(torch.load('best_model.pt'))
model.to(device)

print(device)

model.eval()
test_acc = 0.0
top_k = 10
HR, NDCG = [], []
with torch.no_grad():
    for customer, product, prices, sales_channels, club_status, age_groups, product_groups, color_groups, index_name, label in tqdm(customer_product_test):
        customer, product, prices, sales_channels, club_status, age_groups, product_groups, color_groups, index_name, label \
        = customer.to(device), product.to(device), prices.to(device), sales_channels.to(device), club_status.to(device), \
        age_groups.to(device), product_groups.to(device), color_groups.to(device), index_name.to(device), label.to(device)
        
        # select 99 products from item set that customer has no interactions
        interacted_items = user_interacted_items[customer_batch]
        not_interacted_items = set(all_products_id) - set(interacted_items)
        selected_not_interacted = list(np.random.choice(list(not_interacted_items), 66))
        test_items = selected_not_interacted + [i]
        
        y_pred = model(torch.tensor([customer]*100), torch.tensor(test_items), prices.float(), sales_channels.float(), club_status.float(), age_groups.float(), product_groups, color_groups, index_name)
        acc = binary_acc(y_pred, label)
        test_acc += acc
        
        y_pred_label = torch.softmax(y_pred, dim=1)
        _, y_pred_label = torch.max(y_pred_label, dim = 1)
        _, indices = torch.topk(y_pred_label, top_k, dim=0)
        recommends = torch.take(product_batch, indices).cpu().numpy().tolist()

        gt_items = [product.item() for product, label in zip(product_batch, label_batch) if label.item() == 1]
        HR.append(hit(gt_items, recommends))
        NDCG.append(ndcg(gt_items, recommends))
        
    test_acc = test_acc / len(test_dataloader)
    hr_mean = np.mean(HR)
    ndcg_mean = np.mean(NDCG)

print(f'\nClassification Accuracy on test set: {test_acc:.3f}')
print(f'HR@{top_k}: {hr_mean:.3f}')
print(f'NDCG@{top_k}: {ndcg_mean:.3f}')
        

  0%|          | 0/498 [00:00<?, ?it/s]

cuda:0

Classification Accuracy on test set: 0.894
HR@10: 0.000
NDCG@10: 0.000


In [44]:
recommends

[13598, 2269, 7943, 14184, 13396, 3247, 14136, 5368, 15693, 5810]

In [40]:
gt_items

[5218,
 16142,
 584,
 11495,
 15576,
 7677,
 1330,
 2378,
 3521,
 11354,
 5796,
 1702,
 8054,
 4111,
 8054]

In [35]:
ng_item

12774

In [36]:
recommends

[[15428, 11222],
 [16104, 5305],
 [16425, 12041],
 [2044, 2866],
 [14601, 16124],
 [8933, 8991],
 [5457, 10828],
 [15123, 13445],
 [7441, 16376],
 [16350, 13129]]

In [14]:
print(len(test_data))

2490


In [None]:
HR, NDCG = metrics(model, test_loader, args.top_k, device)

In [9]:
def hit(ng_item, pred_items):
    if ng_item in pred_items:
        return 1
    return 0


def ndcg(ng_item, pred_items):
    if ng_item in pred_items:
        index = pred_items.index(ng_item)
        return np.reciprocal(np.log2(index+2))
    return 0


def metrics(model, test_loader, top_k, device):
    HR, NDCG = [], []

    for user, item, label in test_loader:
        user = user.to(device)
        item = item.to(device)

        predictions = model(user, item)
        _, indices = torch.topk(predictions, top_k)
        recommends = torch.take(item, indices).cpu().numpy().tolist()

        ng_item = item[0].item() # leave one-out evaluation has only one item per user
        HR.append(hit(ng_item, recommends))
        NDCG.append(ndcg(ng_item, recommends))

    return np.mean(HR), np.mean(NDCG)