In [1]:
import os
os.environ['CUDA_LAUNCH_BLOCKING'] = "1"

import pandas as pd
import numpy as np
from pathlib import Path
from tqdm.notebook import tqdm

import torch
import torch.nn as nn
from torch.nn import functional as F
from torch.utils.data import Dataset, DataLoader

pd.options.mode.chained_assignment = None
dataDir = Path.cwd().parent.parent.parent/'backup/HM_data'
np.random.seed(66)

transactions = pd.read_csv(dataDir/'transactions.csv')
# articles = pd.read_csv(dataDir/'articles.csv')
# customers = pd.read_csv(dataDir/'customers.csv')

In [71]:
len(transactions.customer_id.unique())

1362281

In [2]:
# calculate customer interactions in transaction data
# drop customers that only contain few interactions
transactions["interaction"] = 1
transactions_temp = transactions.drop_duplicates(subset=["customer_id", "article_id"])
comb_transactions = transactions_temp[["customer_id", "interaction"]].groupby(by=["customer_id"], sort=False, as_index=False).sum(["interaction"])
comb_transactions = comb_transactions.loc[comb_transactions.interaction >= 5]

# randomly select part of the transaction data
rand_userIds = np.random.choice(comb_transactions.customer_id.unique(), 
                                size=int(len(comb_transactions['customer_id'].unique())*0.003), 
                                replace=False)

transactions = transactions.loc[transactions['customer_id'].isin(rand_userIds)]

print('There are {} rows of data from {} users (users with suffication data)'.format(len(transactions), len(rand_userIds)))

There are 92598 rows of data from 2668 users (users with suffication data)


In [3]:
transactions.drop_duplicates(subset=["customer_id", "article_id"], keep="first", inplace=True)
transactions

Unnamed: 0,t_dat,customer_id,article_id,price,sales_channel_id,interaction
248,2018-09-20,015e83393e4fc3b071ba6fc5f174bf3bf9813c88dcaf3e...,640244003,0.033881,1,1
498,2018-09-20,0277917f4faac280023baecd03c3790b951a9403085da1...,686269001,0.016932,2,1
499,2018-09-20,0277917f4faac280023baecd03c3790b951a9403085da1...,633130002,0.016932,2,1
1458,2018-09-20,07a55454c1c21b932c594eb7ca3ca52ae208cbbcc1552c...,532954003,0.003373,2,1
1459,2018-09-20,07a55454c1c21b932c594eb7ca3ca52ae208cbbcc1552c...,200182001,0.013542,2,1
...,...,...,...,...,...,...
31782166,2020-09-22,cf61e83193fa7b1877fcb795e555150570082c7c2da055...,523489001,0.001678,1,1
31782167,2020-09-22,cf61e83193fa7b1877fcb795e555150570082c7c2da055...,678942057,0.016932,1,1
31786935,2020-09-22,f479d7e42d8415dd7cfd4f7ba79ab811ca1d77fa85f2f8...,869331006,0.030492,1,1
31786936,2020-09-22,f479d7e42d8415dd7cfd4f7ba79ab811ca1d77fa85f2f8...,853881001,0.016932,1,1


In [4]:
# training set and test set

transactions['rank_latest'] = transactions.groupby(['customer_id'])['t_dat'].rank(method='first', ascending=False)

train_transactions = transactions[transactions['rank_latest'] != 1]
test_transactions = transactions[transactions['rank_latest'] == 1]

# drop articles that do not exist in training set
test_product_list = list(set(test_transactions.article_id.unique()) & set(train_transactions.article_id.unique()))
test_transactions = test_transactions.loc[test_transactions['article_id'].isin(test_product_list)]

"""reindex"""
# map string type customer_id to int type
customer_mapper = {}
customer_keys = train_transactions.customer_id.unique()
customer_values = list(range(len(train_transactions.customer_id.unique())))
customer_mapper.update(zip(customer_keys, customer_values))

# map string type article_id to int type
product_mapper = {}
product_keys = train_transactions.article_id.unique()
product_values = list(range(len(train_transactions.article_id.unique())))
product_mapper.update(zip(product_keys, product_values))

train_transactions["customer_id"] = train_transactions["customer_id"].map(customer_mapper)
train_transactions["article_id"] = train_transactions["article_id"].map(product_mapper)
test_transactions["customer_id"] = test_transactions["customer_id"].map(customer_mapper)
test_transactions["article_id"] = test_transactions["article_id"].map(product_mapper)

# get a list of all articles id
all_products_id = train_transactions["article_id"].unique()

train_transactions.head()

# drop columns that we no longer need
# train_transactions = train_transactions[['customer_id', 'article_id', 'price']]
# test_transactions = test_transactions[['customer_id', 'article_id', 'price']]
# comb_transactions = train_transactions.groupby(by=["customer_id", "article_id"], sort=False, as_index=False).sum(["interaction"])

Unnamed: 0,t_dat,customer_id,article_id,price,sales_channel_id,interaction,rank_latest
248,2018-09-20,0,0,0.033881,1,1,25.0
498,2018-09-20,1,1,0.016932,2,1,29.0
499,2018-09-20,1,2,0.016932,2,1,30.0
1458,2018-09-20,2,3,0.003373,2,1,41.0
1459,2018-09-20,2,4,0.013542,2,1,42.0


In [5]:
len(train_transactions.customer_id.unique())

2668

In [6]:
from scipy.sparse import csr_matrix

def sparse_to_tensor(sparse_matrix):
    """
    Transform scipy coo matrix to pytorch sparse tensor
    """
    sparse_matrix = sparse_matrix.tocoo()
    values = sparse_matrix.data
    indices = (sparse_matrix.row, sparse_matrix.col) # np.vstack
    shape = sparse_matrix.shape

    i = torch.LongTensor(indices)
    v = torch.DoubleTensor(values)
    s = torch.Size(shape)

    return torch.sparse.DoubleTensor(i, v, s)

def sparse_batch_collate(batch): 
    """
    Collate function which to transform scipy csr matrix to pytorch sparse tensor
    """
    # batch[0] since it is returned as a one element list

    customer_batch, product_batch, targets_batch = batch[0]
    
    if type(customer_batch[0]) == csr_matrix:
        customer_batch = customer_batch.tocoo() # removed vstack
        customer_batch = sparse_to_tensor(customer_batch)
    else:
        customer_batch = torch.DoubleTensor(customer_batch)

    if type(product_batch[0]) == csr_matrix:
        product_batch = product_batch.tocoo() # removed vstack
        product_batch = sparse_to_tensor(product_batch)
    else:
        product_batch = torch.DoubleTensor(product_batch)
    
    if type(targets_batch[0]) == csr_matrix:
        targets_batch = targets_batch.tocoo() # removed vstack
        targets_batch = sparse_to_tensor(targets_batch)
    else:
        targets_batch = torch.DoubleTensor(targets_batch)
    
    return customer_batch, product_batch, targets_batch

In [7]:
class HMSaleTrainDataLoader(Dataset):
    """HMSaleTrainDataLoader Training set of HM sales data

    Args:
        transactions (pd.DataFrame): Dataframe of transaction records
        all_products_id (list): A list contains all product ids
    """
    def __init__(self, transactions, all_products_id):
        self.customers, self.products, self.labels = self.get_dataset(transactions, all_products_id)

    def __len__(self):
        return len(self.customers)
    
    def __getitem__(self, idx):
        return self.customers[idx], self.products[idx], self.labels[idx]
    
    def get_dataset(self, transactions, all_products_id):
        customers, products, labels = [], [], []
        customer_product_set = set(zip(transactions['customer_id'], transactions['article_id']))
        
        """negative sampling"""
        # set up negative:positive ratio as 4:1
        negative_samples = 4

        for u, i in tqdm(customer_product_set):
            customers.append(u)
            products.append(i)
            labels.append(1)
            for _ in range(negative_samples):
                negative_product = np.random.choice(all_products_id)
                while (u, negative_product) in customer_product_set:
                    negative_product = np.random.choice(all_products_id)
                customers.append(u)
                products.append(negative_product)
                labels.append(0)
        return customers, products, torch.tensor(labels)
    
def binary_acc(y_pred, y_test):
    y_pred_label = torch.softmax(y_pred, dim=1)
    _, y_pred_label = torch.max(y_pred_label, dim = 1)
    correct_pred = (y_pred_label == y_test).sum()
    acc = correct_pred/y_test.shape[0]
    return acc  
    

In [8]:
class NCF(nn.Module):
    """NCF - Neural Collaborative Filtering proposed by He et al.

    Args:
        num_users (int): Number of users
        num_items (iut): Number of products
        transactions (pd.DataFrame): Dataframe of transaction records
        all_products_id (list): A list contains all product ids
    """
    def __init__(
            self, 
            num_users, 
            num_items, 
            embedding_dim: int = 10,
            hidden_size: int = 64,
            output_size: int = 32,
            num_hidden_layers: int = 1,
            num_classes: int = 2,
        ):
            super().__init__()
            self.user_embedding_layer = nn.Embedding(num_embeddings=num_users, embedding_dim=embedding_dim)
            self.item_embedding_layer = nn.Embedding(num_embeddings=num_items, embedding_dim=embedding_dim)
            self.relu = nn.ReLU()
            self.fcs = nn.Sequential()
            if embedding_dim is not None:
                in_channels = (
                    [embedding_dim + embedding_dim] 
                    + [hidden_size]*num_hidden_layers 
                    + [output_size]
                )
            else:
                raise ValueError
            for i in range(len(in_channels)):
                if i != len(in_channels)-1:
                    self.fcs.append(nn.Linear(in_features=in_channels[i],  out_features=in_channels[i+1]))
                else:
                    self.fcs.append(nn.Linear(in_features=in_channels[i],  out_features=num_classes))
    
    def forward(self, user_input, item_input):

        user_embedding = self.user_embedding_layer(user_input)
        item_embedding = self.item_embedding_layer(item_input)
        concat_embedding = torch.cat([user_embedding, item_embedding], dim=-1)
        
        for fc_layer in self.fcs:
            concat_embedding = fc_layer(concat_embedding)
            concat_embedding = self.relu(concat_embedding)
        # print("TEST")
        # print(concat_embedding)
        #pred = F.softmax(concat_embedding)
        return concat_embedding
    

In [9]:
# set up hyper-parameters
learning_rate = 0.01
epoch = 40
device = torch.device("cuda:0" if torch.cuda.is_available() else "cpu")
torch.autograd.set_detect_anomaly(True)

# set up dataset for training
num_users = len(train_transactions.customer_id.unique())
print("num_users:", num_users)
num_items = len(all_products_id)
print("num_items:", num_items)
train_data = HMSaleTrainDataLoader(train_transactions, all_products_id)
train_loader = DataLoader(train_data, batch_size=1024, shuffle=True)

# initiate model for training
model = NCF(num_users=num_users, num_items=num_items)
model.to(device)
print(model)
optimizer = torch.optim.Adam(model.parameters(), lr=learning_rate)
loss_fn = nn.CrossEntropyLoss()
model.train()
best_acc = 0.0
for e in tqdm(range(epoch)):
    epoch_loss = 0.0
    epoch_acc = 0.0
    for customer_batch, product_batch, label_batch in train_loader:
        customer_batch, product_batch, label_batch = customer_batch.to(device), product_batch.to(device), label_batch.to(device)
        optimizer.zero_grad()
        y_pred = model(customer_batch, product_batch)
        loss = loss_fn(y_pred, label_batch)
        acc = binary_acc(y_pred, label_batch)
        loss.backward()
        optimizer.step()
        epoch_loss += loss.item()
        epoch_acc += acc.item()
        cur_acc = epoch_acc/len(train_loader)
    if cur_acc > best_acc:
        best_acc = cur_acc
        torch.save(model.state_dict(), 'best_model.pt')
    print(f'Epoch {e+0:03}: | Loss: {epoch_loss/len(train_loader):.6f} | Acc: {epoch_acc/len(train_loader):.4f}')

print(f'\nBest Accuracy: {best_acc:.3f}')


num_users: 2668
num_items: 31159


  0%|          | 0/76784 [00:00<?, ?it/s]

NCF(
  (user_embedding_layer): Embedding(2668, 10)
  (item_embedding_layer): Embedding(31159, 10)
  (relu): ReLU()
  (fcs): Sequential(
    (0): Linear(in_features=20, out_features=64, bias=True)
    (1): Linear(in_features=64, out_features=32, bias=True)
    (2): Linear(in_features=32, out_features=2, bias=True)
  )
)


  0%|          | 0/40 [00:00<?, ?it/s]

Epoch 000: | Loss: 0.503244 | Acc: 0.8000
Epoch 001: | Loss: 0.480977 | Acc: 0.8000
Epoch 002: | Loss: 0.460374 | Acc: 0.8000
Epoch 003: | Loss: 0.445362 | Acc: 0.8000
Epoch 004: | Loss: 0.426127 | Acc: 0.8000
Epoch 005: | Loss: 0.401630 | Acc: 0.8000
Epoch 006: | Loss: 0.373798 | Acc: 0.8000
Epoch 007: | Loss: 0.342097 | Acc: 0.8493
Epoch 008: | Loss: 0.308760 | Acc: 0.8705
Epoch 009: | Loss: 0.277887 | Acc: 0.8861
Epoch 010: | Loss: 0.250040 | Acc: 0.9001
Epoch 011: | Loss: 0.224752 | Acc: 0.9120
Epoch 012: | Loss: 0.201982 | Acc: 0.9218
Epoch 013: | Loss: 0.181652 | Acc: 0.9290
Epoch 014: | Loss: 0.162129 | Acc: 0.9362
Epoch 015: | Loss: 0.143862 | Acc: 0.9432
Epoch 016: | Loss: 0.128279 | Acc: 0.9490
Epoch 017: | Loss: 0.116868 | Acc: 0.9531
Epoch 018: | Loss: 0.106592 | Acc: 0.9568
Epoch 019: | Loss: 0.098257 | Acc: 0.9599
Epoch 020: | Loss: 0.091570 | Acc: 0.9620
Epoch 021: | Loss: 0.085412 | Acc: 0.9643
Epoch 022: | Loss: 0.082107 | Acc: 0.9651
Epoch 023: | Loss: 0.077983 | Acc:

In [15]:
len(train_transactions.article_id.unique())

31735

In [84]:
y = torch.tensor([1, 1])

# a batch of 2 samples of 4 indices each
input = torch.tensor([[0.0004, 0.0100],
                     [0.1527, 0.1276]])

y_pred_softmax = torch.softmax(input, dim = 1)
_, y_pred_tags = torch.max(y_pred_softmax, dim = 1)

y_pred_softmax

tensor([[0.4976, 0.5024],
        [0.5063, 0.4937]])

In [94]:
print(_)

tensor([0.5024, 0.5063])


In [85]:
y_pred_tags

tensor([1, 0])

In [86]:
correct_pred = (y_pred_tags == y).sum()
correct_pred

tensor(1)