In [1]:
import dataframe

In [2]:
import numpy as np
import pandas as pd
import random
from timeit import default_timer as timer

In [3]:
import torch
import torch.nn as nn
import torch.optim as optim
from torch.utils.data import DataLoader, Dataset

In [4]:
# Select GPU Number
import os
os.environ["CUDA_VISIBLE_DEVICES"] = "2"

In [5]:
# check if cuda available
device = "cuda" if torch.cuda.is_available() else "cpu"

torch.manual_seed(315)
if device == "cuda":
    torch.cuda.manual_seed_all(912)

device

'cuda'

In [6]:
train = dataframe.train
test = dataframe.test
ratings = dataframe.ratings

In [7]:
user_pool = set(train["userId"].unique()) # 6040
item_pool = set(train["itemId"].unique()) # 3706
interact_status = train.groupby("userId")["itemId"].apply(set).reset_index().rename(columns = {"itemId" : "interacted_items"})
interact_status["negative_items"] = interact_status["interacted_items"].apply(lambda x: item_pool - x)
train_loader = pd.merge(train, interact_status, on="userId")
train_loader["negatives"] = train_loader["negative_items"].apply(lambda x: random.sample(x, 10))

In [8]:
users, items, neg_items = [], [], []
for row in train_loader.itertuples():
    users.append(int(row.userId))
    items.append(int(row.itemId))
    neg_items.append(row.negatives)

In [9]:
class RatingDataset(Dataset):
    """
    torch.utils.data.Dataset 상속
    """
    def __init__(self, user_tensor, item_tensor, neg_item_list):
        self.user_tensor = user_tensor
        self.item_tensor = item_tensor
        self.neg_items = neg_item_list
        
    def __len__(self):
        return self.user_tensor.size(0)
    
    def __getitem__(self, index):
        return self.user_tensor[index], self.item_tensor[index], self.neg_items[index]

In [10]:
dataset = RatingDataset(user_tensor = torch.LongTensor(users),
                        item_tensor = torch.LongTensor(items),
                        neg_item_list = torch.LongTensor(neg_items))

In [11]:
# create test set
test_user, test_item = [], []
for i in range(len(test)):
    test_user.append(test["userId"][i])
    test_item.append(test["itemId"][i])

In [12]:
class CML(nn.Module):
    
    def __init__(self, config):
        
        super(CML, self).__init__()
        self.config = config
        self.num_users = config["num_users"]
        self.num_items = config["num_items"]
        self.latent_dim = config["latent_dim"]
        self.margin = config["margin"]
        self.lambda_c = config["lambda_c"]
        
        self.user_embedding = nn.Embedding(self.num_users, self.latent_dim, max_norm = 1) # restrict norms
        self.item_embedding = nn.Embedding(self.num_items, self.latent_dim, max_norm = 1)
        
    
    def distance_loss(self, i, j, k):
        """
        compute distance loss
        """
        
        user = self.user_embedding(i).view(len(i), 1, self.latent_dim) # batchsize, X, latent_dim
        item = self.item_embedding(j).view(len(i), 1, self.latent_dim)
        neg_item = self.item_embedding(k)
        d_ij = torch.cdist(user, item).view(-1, 1)**2 #(1024, 1)
        d_ik = torch.cdist(user, neg_item).view(-1, 10)**2 #(1024, 10)
        
        metric = self.margin + d_ij - d_ik # (1024, 10)
        loss = 0
        for i in range(len(metric)):
            temp_metric = metric[i][metric[i]>0]
            rank_d_ij = 3676 * len(temp_metric) / 10
            w_ij = np.log(rank_d_ij + 1)
            loss +=  (w_ij * temp_metric).sum()
        
        return loss
    
    
    def cov_loss(self):
        
        self.U = self.user_embedding(torch.LongTensor([x for x in range(self.num_users)]).cuda())
        self.V = self.item_embedding(torch.LongTensor([x for x in range(self.num_items)]).cuda())
        
        matrix = torch.cat([self.U, self.V])
        n_rows = matrix.shape[0]
        matrix = matrix - torch.mean(matrix, dim=0)
        cov = torch.matmul(matrix.T, matrix) / n_rows
        loss = (torch.linalg.norm(cov) - torch.linalg.norm(torch.diagonal(cov),2))/self.num_users
        
        return loss * self.lambda_c
    
    
    def evaluate(self, train_user, train_item, test_user, test_item):
        
        self.U = self.user_embedding(torch.LongTensor([x for x in range(self.num_users)]).cuda())
        self.V = self.item_embedding(torch.LongTensor([x for x in range(self.num_items)]).cuda())
        
        x = torch.cdist(self.U, self.V)
        for i, j in zip(train_user, train_item):
            x[i, j] = 100
        _, indices = x.topk(50, largest = False)
        indices = indices.cpu().detach().numpy()
        hit = 0
        count = 0
        for i in range(len(test_user)):
            count += 1
            if test_item[i] in indices[test_user[i]]:
                hit += 1
        
        return hit/count
        

In [13]:
CML_config = {
    "num_users" : 6040,
    "num_items" : 3706,
    "latent_dim" : 64,
    "margin" : 0.5,
    "lambda_c": 10
}

In [14]:
model = CML(CML_config).cuda()
optimizer = optim.Adagrad(model.parameters(), lr=0.01)

In [16]:
num_epochs = 100
training_process = []
# epoch
for epoch_id in range(1, num_epochs + 1):
    train_loader = DataLoader(dataset, batch_size = 1024, shuffle = True)
    start_epoch = timer()
    model.train()
    total_loss = 0
    for batch_idx, batch in enumerate(train_loader):
        user, item, neg_items = batch[0], batch[1], batch[2]
        user, item, neg_items = user.cuda(), item.cuda(), neg_items.cuda()
        
        optimizer.zero_grad()
        loss = model.distance_loss(user, item, neg_items) + model.cov_loss()
        loss.backward()
        optimizer.step()
        total_loss += loss.item()
    
    model.eval()
    recall_50 = model.evaluate(users, items, test_user, test_item)
    training_process.append([epoch_id, recall_50])
    print("epoch = {:d}, total_loss = {:.4f}, recall@50 = {:.4f}, epoch_time = {:.4f}sec".format(epoch_id, total_loss, recall_50, timer()-start_epoch))

epoch = 1, total_loss = 13378622.1079, recall@50 = 0.1196, epoch_time = 326.1814sec
epoch = 2, total_loss = 12666869.5400, recall@50 = 0.1214, epoch_time = 326.2458sec
epoch = 3, total_loss = 12172309.0938, recall@50 = 0.1269, epoch_time = 324.8080sec
epoch = 4, total_loss = 11735678.8247, recall@50 = 0.1374, epoch_time = 324.3977sec
epoch = 5, total_loss = 11311266.4707, recall@50 = 0.1494, epoch_time = 324.3144sec
epoch = 6, total_loss = 10884358.3149, recall@50 = 0.1619, epoch_time = 325.5522sec
epoch = 7, total_loss = 10456512.7041, recall@50 = 0.1736, epoch_time = 324.7538sec
epoch = 8, total_loss = 10036451.0552, recall@50 = 0.1838, epoch_time = 325.9228sec
epoch = 9, total_loss = 9634979.2900, recall@50 = 0.1923, epoch_time = 324.7940sec
epoch = 10, total_loss = 9261420.7554, recall@50 = 0.1995, epoch_time = 326.7753sec
epoch = 11, total_loss = 8920677.0181, recall@50 = 0.2059, epoch_time = 325.6648sec
epoch = 12, total_loss = 8613178.1440, recall@50 = 0.2115, epoch_time = 324.7