In [8]:
import numpy as np
import pandas as pd
import random
from matplotlib import pyplot as plt
import torch
from torch.utils.data import Dataset, DataLoader
from torch import nn
from tqdm import tqdm
import time

In [9]:
class Caser(nn.Module):
    def __init__(self,num_factors, num_users, num_items, L=5, d=16,
                 d_prime=4, drop_ratio=0.05):
        """
        :param num_factors: 嵌入向量长度
        :param num_users:
        :param num_items:
        :param L:
        :param d:
        :param d_prime:
        :param drop_ratio:
        """
        super(Caser, self).__init__()
        self.P = nn.Embedding(num_users,num_factors)
        self.Q = nn.Embedding(num_items,num_factors)
        self.d_prime, self.d = d_prime, d
        # Vertical convolution layer
        self.conv_v = nn.Conv2d(in_channels=1,out_channels=d_prime,kernel_size=(L, 1))
        # Horizontal convolution layer
        h = [i + 1 for i in range(L)]
        self.conv_h, self.max_pool = nn.Sequential(), nn.Sequential()
        for i in h:
            self.conv_h.add_module('conv_h'+str(i),nn.Conv2d(in_channels=1,out_channels=d,kernel_size=(i, num_factors)))
            self.max_pool.add_module('max_pool'+str(i),nn.MaxPool1d(L - i + 1))
        # Fully-connected layer
        self.fc1_dim_v, self.fc1_dim_h = d_prime * num_factors, d * len(h)
        self.fc = nn.Sequential(
            nn.Linear(in_features=d_prime * num_factors + d * L,out_features=num_factors),
            nn.ReLU()
        )
        self.V = nn.Embedding(num_items, num_factors * 2)
        self.b = nn.Embedding(num_items, 1)
        self.dropout = nn.Dropout(drop_ratio)
        self.relu = nn.ReLU()
    def forward(self, user_id, seq, item_id):
        item_embs = torch.unsqueeze(self.Q(seq), 1) # (batch_size,1,L,num_factors)
        user_emb = self.P(user_id) # (batch_size,num_factors)
        out, out_h, out_v, out_hs = None, None, None, []
        if self.d_prime:
            out_v = self.conv_v(item_embs) # (batch_size,d_prime,1,num_factors)
            out_v = out_v.reshape(out_v.shape[0], self.fc1_dim_v) # (batch_size,d_prime*num_factors)
        if self.d:
            for conv, maxp in zip(self.conv_h, self.max_pool):
                conv_out = torch.squeeze(self.relu(conv(item_embs)), dim=3) # (batch_size,1,L,num_factors) -conv> (batch_size,d,L->1,1) -squeeze> (batch_size,d,L->1)
                t = maxp(conv_out) # (batch_size,d,1)
                pool_out = torch.squeeze(t, dim=2) # (batch_size,d)
                out_hs.append(pool_out)
            out_h = torch.concat(out_hs, dim=1) # (batch_size,L*d)
        out = torch.concat([out_v, out_h], dim=1) # (batch_size,L*d+d_prime*num_factors)
        z = self.fc(self.dropout(out)) # (batch_size,num_factors)
        x = torch.concat([z, user_emb], dim=1) # (batch_size,2*num_factors)
        q_prime_i = torch.squeeze(self.V(item_id)) # (batch_size,2*num_factors)
        b = torch.squeeze(self.b(item_id)) # (batch_size,1)
        res = (x * q_prime_i).sum(1) + b # (batch_size,2*num_factors) -sum> (batch_size,1)+(batch_size,1)
        return res

In [10]:
class SeqDataset(Dataset):
    def __init__(self, user_ids, item_ids, L, num_users, num_items,candidates):
        """
        :param user_ids:数据集中按user_id和时间排序后的user_id，范围[0,num_users-1]
        :param item_ids:数据集中对应user_ids顺序的item_id，范围[0,num_items-1]
        :param L:输出前L个交互项目作为一个序列，下一个项目作为目标
        :param num_users:
        :param num_items:
        :param candidates: 字典 key为user，value为按时间排序的item list
        """
        user_ids, item_ids = np.array(user_ids), np.array(item_ids)
        temp, self.cand = {}, candidates
        self.all_items = set([i for i in range(num_items)]) #set [0,num_items-1]
        [temp.setdefault(user_ids[i], []).append(i) for i, _ in enumerate(user_ids)] #字典，user_id作为key，value为相同user_id重复出现的索引列表，和item_ids一致
        temp = sorted(temp.items(), key=lambda x: x[0]) #变成list，并按value的第一个索引排序
        u_ids = np.array([i[0] for i in temp])  #所有temp中的user_id
        idx = np.array([i[1][0] for i in temp]) #不同user_id交互的开始的索引
        self.ns = int(sum([c - L if c >= L + 1 else 1 for c  # 计算每个user_id有多少窗口数，求和也就是SeqDataset能提供的训练数据的数量
                                in np.array([len(i[1]) for i in temp])]))
        self.seq_items = np.zeros((self.ns, L))  #长度不超过L的训练item X
        self.seq_users = np.zeros(self.ns, dtype='int32')
        self.seq_tgt = np.zeros((self.ns, 1)) #预测目标 Y
        self.test_seq = np.zeros((num_users, L)) #测试集 X
        test_users, _uid = np.empty(num_users), None
        for i, (uid, i_seq) in enumerate(self._seq(u_ids, item_ids, idx, L + 1)):
            if uid != _uid:
                self.test_seq[uid][:] = i_seq[-L:] #第一个产生的是最新的序列，i_seq最后L个作为test_seq[uid]，是测试集的输入X
                test_users[uid], _uid = uid, uid
            self.seq_tgt[i][:] = i_seq[-1:] #i_seq最后1个作为预测目标
            self.seq_items[i][:], self.seq_users[i] = i_seq[:L], uid #前L个作为输入X

    def _win(self, tensor, window_size, step_size=1): #生成窗口大小最大为window_size的序列
        if len(tensor) - window_size >= 0:
            for i in range(len(tensor), 0, - step_size): #从后到前
                if i - window_size >= 0:
                    yield tensor[i - window_size:i]
                else:
                    break
        else:
            yield tensor

    def _seq(self, u_ids, item_ids, idx, max_len): #生成器，每调用一次生成一个训练数据
        for i in range(len(idx)): #对所有唯一的user_id
            stop_idx = None if i >= len(idx) - 1 else int(idx[i + 1]) # 本user_id的最后一个索引是下一个user_id第一个索引的前一个
            for s in self._win(item_ids[int(idx[i]):stop_idx], max_len):
                yield (int(u_ids[i]), s)

    def __len__(self):
        return self.ns

    def __getitem__(self, idx):
        """
        :return: 第i个训练数据的uid,第i个训练数据的序列,第i个训练数据的目标,负例
        """
        neg = list(self.all_items - set(self.cand[int(self.seq_users[idx])])) #负例
        i = random.randint(0, len(neg) - 1)
        return (self.seq_users[idx], self.seq_items[idx], self.seq_tgt[idx],
                neg[i])

In [11]:
class ArrayDataset(Dataset):
    def __init__(self, ArrayData,seq):
        self._data  = ArrayData
        self.seq = seq
    def __len__(self):
        return len(self._data[0])

    def __getitem__(self, idx):
        return self._data[0][idx],self._data[1][idx],self.seq

In [12]:
def hit_and_auc(rankedlist, test_matrix, k):
    """
    计算每个用户的命中数和 AUC。
    :param rankedlist:
    :param test_matrix: 用户真正交互的item
    :param k:
    :return:
    """
    hits_k = [(idx, val) for idx, val in enumerate(rankedlist[:k])
              if val in set(test_matrix)]
    hits_all = [(idx, val) for idx, val in enumerate(rankedlist)
                if val in set(test_matrix)]
    max = len(rankedlist) - 1
    auc = 1.0 * (max - hits_all[0][0]) / max if len(hits_all) > 0 else 0
    return len(hits_k), auc


def evaluate_ranking(net, test_input, candidates,num_users, num_items,device):
    ranked_list, ranked_items, hit_rate, auc = {}, {}, [], []
    all_items = set([i for i in range(num_items)])
    for u in tqdm(range(num_users)):
        u_seq = test_input[0][u]
        true_item = test_input[1][u]
        neg_items = list(all_items - set(candidates[int(u)]))
        user_ids, item_ids, x, scores = [], [], [], []
        [item_ids.append(i) for i in neg_items] #记录u没有评价的item id
        [user_ids.append(u) for _ in neg_items] #u的id，和item_ids长度相同
        x.extend([np.array(user_ids)])
        x.extend([np.array(item_ids)])
        # x[0]:len=len(neg_items) 元素全为u,x[1]:len=len(neg_items) 元素为neg_items的id
        x = np.array(x)
        x = torch.tensor(x)
        test_data_set = ArrayDataset(x,u_seq)
        test_data_iter = DataLoader(test_data_set, shuffle=False, batch_size=1024)
        for user_id, item_id,seq in test_data_iter:
            user_id = torch.as_tensor(user_id,dtype=torch.int,device=device)
            item_id = torch.as_tensor(item_id,dtype=torch.int,device=device)
            seq = torch.as_tensor(seq,dtype=torch.int,device=device)
            score = net(user_id,seq,item_id)
            scores.extend(score)
        # print(scores)
        # scores = [item for sublist in scores for item in sublist]
        item_scores = list(zip(item_ids, scores))
        ranked_list[u] = sorted(item_scores, key=lambda t: t[1], reverse=True)
        ranked_items[u] = [r[0] for r in ranked_list[u]]

        temp = hit_and_auc(ranked_items[u], true_item, 50)
        hit_rate.append(temp[0])
        auc.append(temp[1])
    return np.mean(np.array(hit_rate)), np.mean(np.array(auc))

def train_ranking(net, train_iter, test_input, optimizer,loss,num_users,num_items, num_epochs, device, evaluator,candidates):
    net = net.to(device)
    print("training on ", device)
    plt_epoch = []
    for epoch in range(num_epochs):
        plt_epoch.append(epoch)
        train_l_sum, train_acc_sum, n, batch_count, start = 0.0, 0.0, 0, 0, time.time()
        for u,seq,pos_item,neg_item in tqdm(train_iter):
            seq = torch.as_tensor(seq,dtype=torch.int,device=device)
            u = torch.as_tensor(u,dtype=torch.int,device=device)
            pos_item = torch.as_tensor(pos_item,dtype=torch.int,device=device)
            neg_item = torch.as_tensor(neg_item,dtype=torch.int,device=device)

            p_pos = net(u,seq,pos_item)
            p_neg = net(u,seq,neg_item)
            l = loss(p_pos, p_neg)
            train_l_sum += l.sum().cpu().item()
            optimizer.zero_grad()
            l.backward()
            optimizer.step()
            batch_count+=1
        with torch.no_grad():
            hit_rate, auc = evaluator(net, test_input,candidates,num_users, num_items,device)
        print('epoch %d,train_loss %.4f,hit_rate %.4f,auc %.4f, time %.1f sec'
              % (epoch + 1,train_l_sum / batch_count,hit_rate,auc, time.time() - start))


In [None]:
def read_data_ml100k():
    """
    从u.data读取数据转成dataFrame
    :return:
    """
    names = ['user_id', 'item_id', 'rating', 'timestamp']
    data = pd.read_csv('../../data/ml-100k/u.data', '\t', names=names,engine='python')
    num_users = data.user_id.unique().shape[0]
    num_items = data.item_id.unique().shape[0]
    return data, num_users, num_items

def split_data_ml100k(data, num_users, split_mode='random', test_ratio=0.1):
    """
    将数据集按随机模式或时序模式分割成训练集和测试集
    :param data:
    :param num_users:
    :param split_mode:
    :param test_ratio:
    :return:
    """
    if split_mode == 'seq-aware':
        train_items, test_items, train_list = {}, {}, []
        for line in data.itertuples():
            u, i, rating, time = line[1], line[2], line[3], line[4]
            train_items.setdefault(u, []).append((u, i, rating, time)) # 如果键不在字典里，setdefault将键和默认值添加到字典中，最后返回该键对应的值。这里返回list，通过append将所有相同user的item放到同一个key对应的value里。
            if u not in test_items or test_items[u][-1] < time: # 将最新的item放到test_items
                test_items[u] = (i, rating, time)
        for u in range(1, num_users + 1): #导致train_data按u从小到大排序
            train_list.extend(sorted(train_items[u], key=lambda k: k[3])) # 将每个user对应的value 按照时间戳从小到大排序放到train_list
        test_data = [(key, *value) for key, value in test_items.items()] # 将test_items变成list
        train_data = [item for item in train_list if item not in test_data] #将train_list不在test_data里的元素放到train_data
        train_data = pd.DataFrame(train_data,columns=['u', 'i', 'rating', 'time'])
        test_data = pd.DataFrame(test_data,columns=['u', 'i', 'rating', 'time']).sort_values(by='u')
    else:
        mask = np.random.uniform(0, 1, (len(data))) < (1 - test_ratio)# 生成(len(data),)大小的bool类型数组 随机test_ratio比例的元素为False，其余为True
        neg_mask = [not x for x in mask] # 生成len(data)长度的bool类型list 元素和mask相反
        train_data, test_data = data[mask], data[neg_mask]
    return train_data, test_data

def load_data_ml100k(data, num_users, num_items, feedback='explicit'):
    """
    从训练集或测试集中分出用户列表、item列表、得分列表，给出显式或隐式交互矩阵
    :param data:
    :param num_users:
    :param num_items:
    :param feedback:
    :return:
    """
    users, items, scores = [], [], []
    inter = np.zeros((num_items, num_users)) if feedback == 'explicit' else {}
    for line in data.itertuples():
        user_index, item_index = int(line[1] - 1), int(line[2] - 1) #0~942 0~1681
        score = int(line[3]) if feedback == 'explicit' else 1
        users.append(user_index)
        items.append(item_index)
        scores.append(score)
        if feedback == 'implicit':
            inter.setdefault(user_index, []).append(item_index) # 隐式则为字典 key为user，value为按时间排序的item list
        else:
            inter[item_index, user_index] = score
    return users, items, scores, inter

In [13]:
TARGET_NUM, L, batch_size = 1, 5, 2048
df, num_users, num_items = read_data_ml100k()
train_data, test_data = split_data_ml100k(df, num_users, 'seq-aware')
users_train, items_train, ratings_train, candidates = load_data_ml100k(
    train_data, num_users, num_items, feedback="implicit")
_, _, _, test_seq_Y = load_data_ml100k(
    test_data, num_users, num_items, feedback="implicit")

train_set = SeqDataset(users_train, items_train, L, num_users,
                            num_items, candidates)
train_iter = DataLoader(train_set, batch_size, True)
test_seq_X = train_set.test_seq



device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
net = Caser(10, num_users, num_items, L)
num_epochs, wd, optimizer = 3, 1e-5, torch.optim.Adam(net.parameters(), lr=0.04)
def BPRLoss(positive,negative):
    sigmoid = nn.Sigmoid()
    return - torch.sum(torch.log(sigmoid(positive - negative)), dim=0, keepdim=True)
loss = BPRLoss

  df, num_users, num_items = d2l.read_data_ml100k()


In [14]:
train_ranking(net, train_iter, [test_seq_X,test_seq_Y], optimizer,loss,num_users,num_items, num_epochs, device, evaluate_ranking,candidates)

training on  cuda


100%|██████████| 47/47 [00:06<00:00,  7.37it/s]
100%|██████████| 943/943 [19:09<00:00,  1.22s/it]


epoch 1,train_loss 2556.7216,hit_rate 0.0604,auc 0.6387, time 1156.3 sec


100%|██████████| 47/47 [00:07<00:00,  6.11it/s]
100%|██████████| 943/943 [19:55<00:00,  1.27s/it]


epoch 2,train_loss 1001.9722,hit_rate 0.1241,auc 0.7295, time 1203.6 sec


100%|██████████| 47/47 [00:06<00:00,  7.35it/s]
100%|██████████| 943/943 [21:07<00:00,  1.34s/it]


epoch 3,train_loss 718.7881,hit_rate 0.1569,auc 0.7626, time 1274.4 sec
