In [None]:
import os
import sys
import json
import torch
import random
import pickle
import argparse
import numpy as np
from tqdm import tqdm
from torch.utils.data import Dataset
from torch.utils.data import DataLoader
from torch.optim import Adam

In [2]:
def seed_everything(seed):
    """Seed every random number generator used in this notebook.

    Seeds Python's `random`, NumPy's legacy global RNG and PyTorch (CPU plus
    all CUDA devices), exports `PYTHONHASHSEED`, and sets the cuDNN flags to
    deterministic / non-benchmarking mode while leaving cuDNN enabled.

    Args:
        seed: Integer seed handed to each generator.
    """
    # NOTE(review): the interpreter reads PYTHONHASHSEED at start-up, so this
    # export presumably only matters for child processes -- confirm if relied on.
    os.environ["PYTHONHASHSEED"] = str(seed)

    seeders = (
        random.seed,
        np.random.seed,
        torch.manual_seed,
        torch.cuda.manual_seed,
        torch.cuda.manual_seed_all,
    )
    for seeder in seeders:
        seeder(seed)

    cudnn = torch.backends.cudnn
    cudnn.deterministic = True
    cudnn.benchmark = False
    cudnn.enabled = True

# Fix every RNG seed once, before any data is loaded or any model is built.
seed_everything(2022)

In [3]:
# Run-level configuration: dataset location and optimisation hyper-parameters.
args = {
    "data_dir": "../data/ml-1m",
    "epochs": 20,
    # Prefer the first GPU, but fall back to the CPU so the notebook still
    # runs end-to-end on machines without CUDA.
    "device": "cuda:0" if torch.cuda.is_available() else "cpu",
    "lr": 5e-4,
}

# Expose the settings with attribute access (args.lr, args.device, ...).
args = argparse.Namespace(**args)


In [4]:
def load_ndjson(input_file):
    """Load a newline-delimited JSON (NDJSON) file.

    Args:
        input_file: Path of a UTF-8 text file holding one JSON document per line.

    Returns:
        list: The decoded documents, in file order. Blank lines are skipped.

    Raises:
        OSError: If the file cannot be opened.
        json.JSONDecodeError: If a non-blank line is not valid JSON.
    """
    records = []
    with open(input_file, "r", encoding="utf-8") as f:
        # Iterate the file object instead of calling str.splitlines(): the
        # latter also breaks on characters such as U+2028/U+2029, which may
        # appear unescaped inside JSON strings and would cut a record in two.
        for line in f:
            line = line.strip()
            if not line:
                # Tolerate blank lines (e.g. a trailing empty line).
                continue
            records.append(json.loads(line))
    return records


def load_seq_txt(input_file):
    """Load whitespace-separated integer sequences, one sequence per line.

    Args:
        input_file: Path of the text file to read.

    Returns:
        list[list[int]]: One list of integers per non-blank line, in file order.

    Raises:
        OSError: If the file cannot be opened.
        ValueError: If a token cannot be parsed as an integer.
    """
    output = []
    with open(input_file, "r") as f:
        for line in f:
            # split() with no argument collapses runs of whitespace and drops
            # leading/trailing blanks, so trailing spaces, tabs or "\r\n"
            # endings no longer produce empty tokens that break int().
            tokens = line.split()
            if not tokens:
                # Skip blank lines instead of failing on int("").
                continue
            output.append([int(token) for token in tokens])
    return output


def load_dataset(data_dir, mode=""):
    """Read the behaviour sequences and item content of one dataset split.

    Args:
        data_dir: Root directory of the dataset.
        mode: One of "", "train", "dev" or "test". A non-empty mode selects
            the sub-directory of that name; "" reads directly from `data_dir`.

    Returns:
        tuple: `(behavior, content)` -- the parsed `sequential_data.txt` and
        the parsed `content.json` of the selected directory.

    Raises:
        ValueError: If `mode` is not one of the accepted values.
    """
    if mode not in ("", "train", "dev", "test"):
        raise ValueError("Incorrect mode. Must be `train`|`dev`|`test`.")

    split_dir = os.path.join(data_dir, mode) if mode != "" else data_dir
    behavior_path = os.path.join(split_dir, "sequential_data.txt")
    content_path = os.path.join(split_dir, "content.json")
    return load_seq_txt(behavior_path), load_ndjson(content_path)


# mode="" reads the files located directly under `args.data_dir` (no split sub-directory).
behavior, content = load_dataset(args.data_dir, mode="")

In [None]:
class Seq_Dataset(Dataset):
    """Next-item prediction dataset built from per-user interaction histories.

    Each sample is a pair `(seq, tgt)` of fixed-length `LongTensor`s, where
    `tgt` is `seq` shifted one step into the future. Histories longer than
    `max_len` keep their most recent items; shorter ones are left-padded
    with 0.

    In "train" mode the last item of every history is withheld before the
    pair is built; in "test" mode the full history is used, so the final
    target position is that withheld item.

    Args:
        purchase_history: Sequence of item-id lists, one per user.
        mode: "train" or "test". The default "" is rejected, so callers must
            choose a mode explicitly.
        max_len: Length of the returned tensors. Defaults to 10; a model
            consuming the samples must be configured with the same length.

    Raises:
        AssertionError: If `mode` is neither "train" nor "test".
        ValueError: If `max_len` is smaller than 1.
    """

    def __init__(self, purchase_history, mode="", max_len=10) -> None:
        super().__init__()
        # Kept as an assert so callers observe the same exception type as before.
        assert mode in ["train", "test"], "mode must be `train` or `test`"
        if max_len < 1:
            raise ValueError("max_len must be a positive integer.")
        self.max_len = max_len
        self.purchase_history = purchase_history
        self.mode = mode

    def __len__(self):
        """Return the number of user histories."""
        return len(self.purchase_history)

    def __getitem__(self, index):
        """Return the `(seq, tgt)` tensors of the history at `index`."""
        purchase_history = self.purchase_history[index]
        if self.mode == "train":
            # Withhold the final interaction; only "test" mode sees it.
            purchase_history = purchase_history[0:-1]

        # Inputs are all items but the last; targets are shifted by one step.
        seq = self.truncate_and_pad(purchase_history[0:-1])
        tgt = self.truncate_and_pad(purchase_history[1:])
        return torch.LongTensor(seq), torch.LongTensor(tgt)

    def truncate_and_pad(self, input_list):
        """Return the last `max_len` items of `input_list`, left-padded with 0."""
        recent = input_list[-self.max_len:]
        return [0] * (self.max_len - len(recent)) + recent


# Largest item id seen so far; updated as a side effect of process_dataset().
max_item = 0
def process_dataset(behavior, content):
    """Filter the raw behaviour rows and strip their leading entry.

    Rows that are empty or have fewer than 5 entries are dropped. From every
    remaining row the first entry is removed (presumably a user id -- verify
    against the data format) and the rest is kept as that user's history.
    The number of kept histories is printed.

    Side effect: the module-level `max_item` is raised to the largest value
    found in the kept histories.

    Args:
        behavior: Iterable of integer lists, one per user.
        content: Unused.

    Returns:
        list[list[int]]: The kept histories, in input order.
    """
    global max_item
    purchase_history = [row[1:] for row in behavior if row and len(row) >= 5]
    for items in purchase_history:
        max_item = max(max_item, max(items))
    print(len(purchase_history))
    return purchase_history


# Both splits are built from the same filtered histories (Seq_Dataset only
# differs in whether the last item is withheld), so filter once and share the
# result instead of processing -- and printing -- the data twice.
purchase_history = process_dataset(behavior, content)
train_dataset = Seq_Dataset(purchase_history, mode="train")
test_dataset = Seq_Dataset(purchase_history, mode="test")

In [7]:
# Shuffle the training stream so every epoch sees the users in a new order;
# keep evaluation in dataset order so runs are directly comparable.
train_dataloader = DataLoader(dataset=train_dataset, batch_size=32, shuffle=True, num_workers=0)
test_dataloader = DataLoader(dataset=test_dataset, batch_size=32, shuffle=False, num_workers=0)

In [14]:
import torch.nn as nn
import torch.nn.functional as F
from torch.nn.init import xavier_normal_, xavier_uniform_, uniform_, constant_

class Caser(nn.Module):
    """Convolutional sequence encoder with vertical and horizontal filters.

    The item-id sequence is embedded into a `(max_len, d_model)` "image".
    A vertical convolution spans the whole sequence axis, and one horizontal
    convolution per window height `1..max_len` spans the whole embedding
    axis and is max-pooled over time. The concatenated features go through
    two ReLU-activated linear layers to produce the sequence representation.

    Args:
        args: Namespace-like object providing
            `num_item` (largest item id; one extra row is added for id 0,
            which is used as the padding index),
            `d_model` (embedding size),
            `dropout` (dropout probability on the convolutional features),
            `max_len` (input sequence length),
            `embedding_sharing` (score items with the embedding matrix
            instead of the separate output layer),
            `caser_nh` / `caser_nv` (number of horizontal / vertical filters;
            0 disables that branch).

    Raises:
        ValueError: If both `caser_nh` and `caser_nv` are 0.
    """

    def __init__(self, args):
        super(Caser, self).__init__()
        self.num_item = args.num_item + 1
        self.embedding_size = args.d_model
        self.dropout = args.dropout
        self.max_len = args.max_len
        self.embedding_sharing = args.embedding_sharing
        self.n_h = args.caser_nh
        self.n_v = args.caser_nv

        if not self.n_h and not self.n_v:
            # With both branches disabled there would be nothing to feed fc1.
            raise ValueError("At least one of `caser_nh` and `caser_nv` must be non-zero.")

        self.item_embedding = nn.Embedding(self.num_item, self.embedding_size, padding_idx=0)

        # Vertical filters: kernel covers the full sequence axis.
        self.conv_v = nn.Conv2d(in_channels=1, out_channels=self.n_v, kernel_size=(self.max_len, 1))

        # Horizontal filters: one bank per window height 1..max_len, each
        # kernel covering the full embedding axis.
        lengths = [i + 1 for i in range(self.max_len)]
        self.conv_h = nn.ModuleList([
            nn.Conv2d(in_channels=1, out_channels=self.n_h, kernel_size=(i, self.embedding_size)) for i in lengths
        ])

        self.fc1_dim_v = self.n_v * self.embedding_size
        self.fc1_dim_h = self.n_h * len(lengths)
        fc1_dim_in = self.fc1_dim_v + self.fc1_dim_h
        self.fc1 = nn.Linear(fc1_dim_in, self.embedding_size)
        self.fc2 = nn.Linear(self.embedding_size, self.embedding_size)
        self.final_layer = nn.Linear(self.embedding_size, self.num_item)

        self.dropout_layer = nn.Dropout(self.dropout)
        self.ac_conv = nn.ReLU()
        self.ac_fc = nn.ReLU()

        self.apply(self._init_weights)

    def _init_weights(self, module):
        """Initialise embeddings uniformly and linear layers with Xavier-normal."""
        if isinstance(module, nn.Embedding):
            stdv = np.sqrt(1. / self.num_item)
            uniform_(module.weight.data, -stdv, stdv)
            if module.padding_idx is not None:
                # uniform_ also overwrote the all-zero padding row that
                # nn.Embedding set up; that row receives no gradient, so it
                # would stay a random constant for the whole run. Restore it.
                module.weight.data[module.padding_idx].zero_()
        elif isinstance(module, nn.Linear):
            xavier_normal_(module.weight.data)
            if module.bias is not None:
                constant_(module.bias.data, 0.1)

    def forward(self, seqs):
        """Encode a batch of item-id sequences.

        Args:
            seqs: Integer tensor of item ids; the convolution kernels require
                the sequence axis to have length `max_len`.

        Returns:
            tuple: `(seq_output, scores)` -- the sequence representation of
            size `d_model` and one score per embedding row (`num_item + 1`).
        """
        # Add a channel axis so the embedded sequence can be convolved as an image.
        item_seq_emb = self.item_embedding(seqs).unsqueeze(1)

        # Collect only the enabled branches; a disabled one contributes no
        # features (its share of fc1's input width is 0).
        features = []
        if self.n_v:
            out_v = self.conv_v(item_seq_emb)
            features.append(out_v.view(-1, self.fc1_dim_v))

        if self.n_h:
            out_hs = []
            for conv in self.conv_h:
                conv_out = self.ac_conv(conv(item_seq_emb).squeeze(3))
                # Max over every time position the filter was applied to.
                pool_out = F.max_pool1d(conv_out, conv_out.size(2)).squeeze(2)
                out_hs.append(pool_out)
            features.append(torch.cat(out_hs, 1))

        out = torch.cat(features, 1)
        out = self.dropout_layer(out)
        z = self.ac_fc(self.fc1(out))
        seq_output = self.ac_fc(self.fc2(z))
        if self.embedding_sharing:
            return seq_output, F.linear(seq_output, self.item_embedding.weight)
        else:
            return seq_output, self.final_layer(seq_output)

In [47]:
def train(model, train_dataloader, test_dataloader, opt, loss_func, max_item):
    """Train `model` for `args.epochs` epochs on `train_dataloader`.

    Only the last column of each target batch is used as the label, and the
    running mean loss of the current epoch is shown on the progress bar.
    The epoch count and device are read from the module-level `args`.

    Args:
        model: Module returning a `(representation, scores)` pair.
        train_dataloader: Iterable of `(seq, tgt)` batches.
        test_dataloader: Unused.
        opt: Optimizer over the model parameters.
        loss_func: Callable taking `(scores, labels)` and returning the loss.
        max_item: Unused.
    """
    for epoch in range(args.epochs):
        model.train()
        running_loss = 0.0
        progress = tqdm(train_dataloader, ncols=100)

        for step, (seq, tgt) in enumerate(progress, start=1):
            seq = seq.to(args.device)
            # Supervise on the final position of the shifted sequence only.
            next_item = tgt.to(args.device)[:, -1]

            _, scores = model(seq)
            loss = loss_func(scores.view(len(next_item), -1), next_item.view(-1))

            opt.zero_grad()
            loss.backward()
            opt.step()

            running_loss += loss.cpu().item()
            progress.set_postfix({"loss": running_loss / step})

In [48]:
def _dcg_score(y_true, order, k=10):
    """Discounted cumulative gain of the first `k` positions of a ranking.

    Args:
        y_true: Relevance values, indexable by the entries of `order`.
        order: Indices into `y_true`, best-ranked first.
        k: Number of leading positions to score.

    Returns:
        The sum of `(2 ** relevance - 1) / log2(rank + 1)` over the first
        `k` ranked entries, with ranks starting at 1.
    """
    ranked = np.take(y_true, order[:k])
    positions = np.arange(len(ranked))
    return np.sum((2 ** ranked - 1) / np.log2(positions + 2))

def _ndcg_score(y_true, order, k=10):
    """Return the DCG@k of a ranking divided by a fixed ideal DCG of 1.

    NOTE(review): an ideal DCG of 1 corresponds to exactly one relevant item
    with relevance 1 -- confirm that callers never pass more than one hit.
    """
    ideal = 1.
    return _dcg_score(y_true, order, k) / ideal


class MetricScores(object):
    """Accumulate Recall@{1,5,10,20} and nDCG@{5,10,20} over evaluation batches.

    Call the instance once per batch with the true item ids and the ranked
    predictions, then call `output()` to print and return the averaged
    scores. Per-row nDCG values are computed with the module-level
    `_ndcg_score` helper.
    """

    def __init__(self) -> None:
        # Per-sample nDCG values, averaged in output().
        self.ndcg5s = []
        self.ndcg10s = []
        self.ndcg20s = []
        # Hit counters and sample counters per cut-off.
        self.recall1_true = 0
        self.recall1_total = 0
        self.recall5_true = 0
        self.recall5_total = 0
        self.recall10_true = 0
        self.recall10_total = 0
        self.recall20_true = 0
        self.recall20_total = 0
        self.k = [1, 5, 10, 20]

    def __call__(self, label_ids: torch.Tensor, pred_ids: torch.Tensor):
        """Update the counters with one batch.

        Args:
            label_ids: Tensor with one true item id per row.
            pred_ids: 2-D tensor of predicted item ids per row, best first.
        """
        assert len(label_ids) == len(pred_ids)
        if not self.k:
            return

        # Convert once, and only the columns any cut-off can look at, instead
        # of moving a fresh slice to the CPU for every k.
        top = pred_ids[:, 0:max(self.k)].detach().cpu().numpy()
        label = label_ids.detach().cpu().numpy().reshape(-1, 1)

        for k in self.k:
            # 1 where the prediction at that rank equals the row's label.
            binary_labels = np.where(top[:, 0:k] == label, 1, 0)
            hits = np.sum(binary_labels)
            rows = binary_labels.shape[0]
            # Identity order over the columns actually present, so rankings
            # shorter than k do not index out of range.
            order = list(range(binary_labels.shape[1]))

            if k == 10:
                self.recall10_true += hits
                self.recall10_total += rows
                for y_true in binary_labels:
                    self.ndcg5s.append(_ndcg_score(y_true, order, 5))
                    self.ndcg10s.append(_ndcg_score(y_true, order, 10))
            elif k == 5:
                self.recall5_true += hits
                self.recall5_total += rows
            elif k == 1:
                self.recall1_true += hits
                self.recall1_total += rows
            elif k == 20:
                self.recall20_true += hits
                self.recall20_total += rows
                for y_true in binary_labels:
                    self.ndcg20s.append(_ndcg_score(y_true, order, 20))
        return

    @staticmethod
    def _ratio(hits, total):
        """Return `hits / total`, or NaN when nothing was accumulated."""
        return hits / total if total else float("nan")

    @staticmethod
    def _mean(values):
        """Return the mean of `values`, or NaN for an empty list."""
        return np.mean(values) if len(values) else float("nan")

    def output(self):
        """Print the averaged metrics and return them.

        Returns:
            dict: `{"scores": {metric name: value}}`. A metric for which no
            sample was accumulated is reported as NaN.
        """
        ndcg5 = self._mean(self.ndcg5s)
        ndcg10 = self._mean(self.ndcg10s)
        ndcg20 = self._mean(self.ndcg20s)
        # Each recall uses its own sample counter.
        recall1 = self._ratio(self.recall1_true, self.recall1_total)
        recall5 = self._ratio(self.recall5_true, self.recall5_total)
        recall10 = self._ratio(self.recall10_true, self.recall10_total)
        recall20 = self._ratio(self.recall20_true, self.recall20_total)
        print(
            "Recall@1: {:.4f}\nRecall@5: {:.4f}\nRecall@10: {:.4f}\nRecall@20: {:.4f}\nnDCG@5: {:.4f}\nnDCG@10: {:.4f}\nnDCG@20: {:.4f}\n".format(
                recall1, recall5, recall10, recall20, ndcg5, ndcg10, ndcg20
            )
        )
        res = {}
        res["scores"] = {
            "Recall@1": recall1,
            "Recall@5": recall5,
            "Recall@10": recall10,
            "Recall@20": recall20,
            "nDCG@5": ndcg5,
            "nDCG@10": ndcg10,
            "nDCG@20": ndcg20,
        }
        return res

In [49]:
def evaluate(model, test_dataloader, max_item, device):
    """Score `model` on `test_dataloader` and return the metric report.

    For every batch the model's scores are sorted in descending order and the
    resulting ranking is compared against the last column of the targets.

    Args:
        model: Module returning a `(representation, scores)` pair.
        test_dataloader: Iterable of `(seq, tgt)` batches.
        max_item: Unused.
        device: Device the batches are moved to before the forward pass.

    Returns:
        The value of `MetricScores.output()` after all batches were consumed.
    """
    scorer = MetricScores()
    model.eval()
    with torch.no_grad():
        for seq, tgt in tqdm(test_dataloader):
            _, scores = model(seq.to(device))
            ranking = torch.argsort(scores, dim=-1, descending=True)
            scorer(tgt.to(device)[:, -1], ranking)

    return scorer.output()

In [None]:
# Model hyper-parameters consumed by Caser.__init__.
caser_args = {
    # Largest item id found by process_dataset(); Caser adds one row for padding id 0.
    "num_item": max_item,
    # Must equal the sequence length produced by Seq_Dataset.
    "max_len": 10,
    "caser_nh": 8,
    "caser_nv": 8,
    "d_model": 512,
    "dropout": 0.1,
    "embedding_sharing": False,
}

model = Caser(argparse.Namespace(**caser_args))

model = model.to(args.device)
# ignore_index=0: samples whose target is 0 are excluded from the loss.
CE = torch.nn.CrossEntropyLoss(ignore_index=0)
opt = torch.optim.Adam(model.parameters(), lr=args.lr)
# The last argument is accepted by train() but not used by it.
train(model, train_dataloader, test_dataloader, opt, CE, max_item+1)

# Record the configuration next to the evaluation report.
print(args)
print(caser_args)
res = evaluate(model, test_dataloader, max_item, args.device)