In [1]:
from google.colab import drive
drive.mount('/content/drive')

Mounted at /content/drive


In [1]:
!pip install pickle5



In [2]:
from collections import defaultdict
import pickle5 as pickle
import numpy as np 
import argparse
import random
import time 
import torch
import torch.nn as nn
import torch.nn.functional as F
import copy
import bottleneck as bn
import time
import os

In [3]:
os.environ['CUDA_LAUNCH_BLOCKING'] = "1"

In [46]:
class AdditiveAttention(nn.Module):
    
    def __init__(self,
                 query_vector_dim,
                 candidate_vector_dim,
                 ):
        super(AdditiveAttention, self).__init__()
        self.linear = nn.Linear(candidate_vector_dim, query_vector_dim)
        self.attention_query_vector = nn.Parameter(
            torch.empty(query_vector_dim).uniform_(-0.1, 0.1))
        

    def forward(self, candidate_vector):
        """
        Args:
            candidate_vector: batch_size, candidate_size, candidate_vector_dim
        Returns:
            (shape) batch_size, candidate_vector_dim
        """
        # batch_size, candidate_size, query_vector_dim
        temp = torch.tanh(self.linear(candidate_vector))
        # batch_size, candidate_size
        candidate_weights = F.softmax(torch.matmul(
            temp, self.attention_query_vector),
                                      dim=1)
        
        # batch_size, candidate_vector_dim
        target = torch.bmm(candidate_weights.unsqueeze(dim=1),
                           candidate_vector).squeeze(dim=1)
        return target

class MovieEncoder(nn.Module):
    def __init__(self,config):
        super(MovieEncoder,self).__init__()
        self.config=config
        self.title_embedding=nn.Embedding(
            self.config.num_items+1,
            self.config.title_embed_dim,
            padding_idx=0
        )
        self.title_embedding.weight.data.copy_(torch.from_numpy(config.pre_train_title))
        self.title_embedding.requires_grad=config.finetune_title

        self.cats=nn.Embedding(
            self.config.num_items+1,
            self.config.num_cat+1,
            # padding_idx=0
        )
        self.cats.weight.data.copy_(torch.from_numpy(self.config.cats))
        self.cats.requires_grad=False

        self.cats_embedding=nn.Embedding(
            self.config.num_cat+1,
            self.config.cat_embed_dim,
            padding_idx=0
        )
        self.mask=nn.Embedding(
            self.config.num_cat+1,
            self.config.cat_embed_dim,
            padding_idx=0
        )
        self.mask.weight.data.copy_(torch.from_numpy(self.config.masking_cat))
        self.mask.requires_grad=False
        self.attension=AdditiveAttention(
            self.config.cat_embed_dim,
            self.config.cat_embed_dim
        )
    
    def forward(self,seq_history):
        """
        imput: batch_size,seq_length
        output: batch_size,seq_length,vector_embedding
        """
        # print('size of seq',seq_history.shape)
        # print('seq',seq_history)
        title_embed=self.title_embedding(seq_history)
        cat=self.cats(seq_history)
        # print(cat.size())
        # print(cat)
        cat=cat.type(torch.LongTensor)
    
        cat=cat.to(self.config.device)
        # print(cat.size())
        cat_embed=self.cats_embedding(cat)
        mask=self.mask(cat)
        cat_embed=cat_embed*mask
        x,y,z,t=cat_embed.shape

        cat_embed=cat_embed.view(x*y,z,t)
        cat_embed=self.attension(cat_embed)
        cat_embed=cat_embed.view(x,y,t)
        
        return torch.cat((title_embed.permute(0,2,1),cat_embed.permute(0,2,1)),dim=1).permute(0,2,1)


class LSTUR(torch.nn.Module):
    def __init__(self, config):
        super(LSTUR, self).__init__()
        self.config = config

        self.gru = nn.GRU(
                config.hidden_gru_size,
                config.hidden_gru_size )
            
        self.movie_encoder=MovieEncoder(self.config).to(self.config.device)
 
        self.user_embedding = nn.Embedding(
            config.num_users+1,
            config.hidden_gru_size ,
            padding_idx=0)
 
    def forward(self, user, clicked_movies,candidate_movies):
        user = F.dropout2d(self.user_embedding(
            user.to(self.config.device)).unsqueeze(dim=0),
                           p=self.config.masking_probability,
                           training=self.training).squeeze(dim=0)
 
        clicked_moives_vector = self.movie_encoder(clicked_movies).permute(1,0,2)
        # batch_size, num_filters * 2
        
        _, last_hidden = self.gru(clicked_moives_vector,
                                    user.unsqueeze(dim=0))
        user_vector = last_hidden.squeeze(dim=0)
 
        candidate_movies_vector=self.movie_encoder(candidate_movies).permute(1,0,2)
        # batch_size, 1 + K
        click_probability = torch.bmm(candidate_movies_vector.permute(1,0,2),
                                user_vector.unsqueeze(dim=-1)).squeeze(dim=-1)
        return click_probability
 
    def get_prediction(self, user, clicked_moives,candidate_movies):
        user=self.user_embedding(user)
        clicked_moives_vector = self.movie_encoder(clicked_moives).permute(1,0,2)
        
        # batch_size, num_filters * 2
        _, last_hidden = self.gru(clicked_moives_vector,
                                    user.unsqueeze(dim=0))
        user_vector = last_hidden.squeeze(dim=0)

        candidate_movies_vector=self.movie_encoder(candidate_movies).permute(1,0,2)

        click_probability = torch.bmm(candidate_movies_vector.permute(1,0,2),
                                user_vector.unsqueeze(dim=-1)).squeeze(dim=-1)
        return click_probability
 
class LSTUR_for_news:
    def __init__(self,config_):
        self.config=config_
        self.device=torch.device('cuda:0' if torch.cuda.is_available() else 'cpu')
        self.net=LSTUR(self.config).to(self.device)
        self.optimizer=torch.optim.Adam(self.net.parameters(),lr=1e-4)
        self.recall_5=[]
        self.recall_10=[]
        self.recall_20=[]
        self.recall_40=[]
        self.recall_30=[]
        self.recall_50=[]
        self.ndcg_10=[]
        self.ndcg_20=[]
        self.ndcg_30=[]
        self.ndcg_40=[]
        self.ndcg_50=[]
        

    def trainer(self,max_iter):
        data_train=DataReader(self.config.path_train,self.config,data_train=True)
        data_val=DataReader(self.config.path_val,self.config,data_train=False)
        
        it=0
        old_epoch=0
        best_ndcg=0
        t=time.time()
        while it<max_iter:
            self.net.train()
            batch_users,batch_seq_history,batch_pos_neg,batch_labels,_=data_train.next_batch()
            self.optimizer.zero_grad()
 
            batch_users=torch.from_numpy(batch_users).to(self.device)
            batch_seq_history=torch.from_numpy(batch_seq_history).to(self.device)
            
            batch_pos_neg=torch.from_numpy(batch_pos_neg).to(self.device)
            clicked_prob=self.net(batch_users,batch_seq_history,batch_pos_neg)
            
            loss=self.loss_func(clicked_prob)
            loss.backward()

            self.optimizer.step()

            if(it%300==0):
                print('epoch {} : it = {}, loss = {}'.format(old_epoch,it,loss.item()))
                # eval_ndcg=self.eval(data_val)
                # print('epoch {} : NDCG_50 = {}'.format(old_epoch,eval_ndcg))

            if(old_epoch!=data_train.epoch):
                print('time train =',time.time()-t,end=' ')
                t=time.time()
                old_epoch=data_train.epoch
                eval_ndcg=self.eval(data_val)
                print('epoch {} : NDCG_50 = {}'.format(old_epoch,eval_ndcg))
                if(best_ndcg<eval_ndcg):
                    best_ndcg=eval_ndcg
                    self.save_model(old_epoch,loss.item())       
            it+=1
 
    def eval(self,data_val,k=10):
        self.net.eval()
        list_ndcg_10=[]
        list_ndcg_20=[]
        list_ndcg_30=[]
        list_ndcg_40=[]
        list_ndcg_50=[]
        
        list_recall_5=[]
        list_recall_10=[]
        list_recall_20=[]
        list_recall_30=[]
        list_recall_40=[]
        list_recall_50=[]
        
        with torch.no_grad():
            # get candidate 
            while True:
                batch_users,batch_seq_history,_,batch_labels,candidate=data_val.next_batch()

                heldout=np.zeros((self.config.num_candidate,self.config.batch_size),dtype=np.int)
                heldout[0]=1
                heldout=heldout.T

                batch_seq_history=torch.from_numpy(batch_seq_history).to(self.device)
                batch_users=torch.from_numpy(np.array(batch_users)).to(self.device)
                candidate=torch.from_numpy(np.array(candidate,dtype=np.int)).to(self.device)
 
                x_pred=self.net(batch_users,batch_seq_history,candidate)
                x_pred=x_pred.detach().cpu().numpy()
                
                #list_ndcg_100.append(self.NDCG_binary_at_k_batch(x_pred,heldout,k=100))
                list_ndcg_50.append(self.NDCG_binary_at_k_batch(x_pred,heldout,k=50))
                list_ndcg_40.append(self.NDCG_binary_at_k_batch(x_pred,heldout,k=40))
                list_ndcg_30.append(self.NDCG_binary_at_k_batch(x_pred,heldout,k=30))
                list_ndcg_20.append(self.NDCG_binary_at_k_batch(x_pred,heldout,k=20))
                list_ndcg_10.append(self.NDCG_binary_at_k_batch(x_pred,heldout,k=10))
                list_recall_5.append(self.Recall_K(x_pred,heldout,k=5))
                list_recall_10.append(self.Recall_K(x_pred,heldout,k=10))
                list_recall_20.append(self.Recall_K(x_pred,heldout,k=20))
                list_recall_30.append(self.Recall_K(x_pred,heldout,k=30))
                list_recall_40.append(self.Recall_K(x_pred,heldout,k=40))
                list_recall_50.append(self.Recall_K(x_pred,heldout,k=50))
                if(data_val.epoch==1):
                    data_val.epoch=0
                    break
            
            self.recall_5.append(np.mean(list_recall_5))
            self.recall_10.append(np.mean(list_recall_10))
            self.recall_20.append(np.mean(list_recall_20))
            self.recall_30.append(np.mean(list_recall_30))
            self.recall_40.append(np.mean(list_recall_40))
            self.recall_50.append(np.mean(list_recall_50))
            self.ndcg_10.append(np.mean(list_ndcg_10))
            self.ndcg_20.append(np.mean(list_ndcg_20))
            self.ndcg_30.append(np.mean(list_ndcg_30))
            self.ndcg_40.append(np.mean(list_ndcg_40))
            self.ndcg_50.append(np.mean(list_ndcg_50))
            #self.ndcg_100.append(np.mean(list_ndcg_100))
            return np.array(list_ndcg_50).mean()
    
        
    def loss_func(self,batch_prob):
        batch_prob=torch.exp(batch_prob)
        
        pos=batch_prob.T[0]
        
        sum_pos_neg=torch.sum(batch_prob,axis=1)
        loss=pos/sum_pos_neg
        
        loss=sum(torch.log(loss))
        
        return -loss
 
 
    def Recall_K(self,x_pred,heldout_batch,k=100):
        n_users = x_pred.shape[0]
        idx = bn.argpartition(-x_pred, k, axis=1)
        x_pred_binary = np.zeros_like(x_pred, dtype=bool)
        x_pred_binary[np.arange(n_users)[:, np.newaxis], idx[:, :k]] = True
 
        x_true_binary = np.array(heldout_batch) > 0
        tmp = (np.logical_and(x_true_binary, x_pred_binary).sum(axis=1)).astype(np.float32)
        recall = tmp / np.minimum(k, x_true_binary.sum(axis=1))
        return np.mean(recall)
    
    def NDCG_binary_at_k_batch(self,X_pred, heldout_batch, k=100):
        heldout_batch=np.array(heldout_batch)
        batch_users = X_pred.shape[0]
        idx_topk_part = bn.argpartition(-X_pred, k, axis=1)
        topk_part = X_pred[np.arange(batch_users)[:, np.newaxis],
                           idx_topk_part[:, :k]]
        idx_part = np.argsort(-topk_part, axis=1)
        # X_pred[np.arange(batch_users)[:, np.newaxis], idx_topk] is the sorted
        # topk predicted score
        idx_topk = idx_topk_part[np.arange(batch_users)[:, np.newaxis], idx_part]
        # build the discount template
        tp = 1. / np.log2(np.arange(2, k + 2))
 
        DCG = (heldout_batch[np.arange(batch_users)[:, np.newaxis],
                             idx_topk] * tp).sum(axis=1)
        IDCG = np.array([(tp[:min(int(n), k)]).sum() for n in heldout_batch.sum(axis=1)])
        return (DCG / IDCG).mean()
    
    def save_model(self,epoch,loss):
        torch.save({
            'epoch': epoch,
            'model_state_dict': self.net.state_dict(),
            'optimizer_state_dict': self.optimizer.state_dict(),
            'loss': loss
            }, '/content/drive/MyDrive/Project3/save_data_lda.pth')
    
class DataReader:
    def __init__(self,path,config_,data_train=True):
        self.config=config_
        self.epoch=0
        self.batch_id=0
        self.users=[]
        self.seq_history=[]
        self.candidate=[]
        self.labels=[]
        self.batch_size=self.config.batch_size
        self.data_train=data_train
        
        with open(path,'r') as f:
            data=f.read().splitlines()
        
        for i in range(len(data)):
            data[i]=data[i].split()
 
        for i in range(len(data)):
            self.users.append(data[i][0])
            self.seq_history.append(data[i][1:10])
            self.labels.append(data[i][10])
            self.candidate.append(data[i][10:])
                
        self.users=np.array(self.users,dtype=np.int)
        self.seq_history=np.array(self.seq_history,dtype=np.int)
        self.labels=np.array(self.labels,dtype=np.int)
        self.candidate=np.array(self.candidate,dtype=np.int)        
        self.n_batch=int(np.ceil(len(data)/self.batch_size))
        
    def get_negative(self,seq_history,labels):
        if(self.data_train==False):
            return None
        
        batch_negative=[]
        
        for i in range(self.batch_size):
            tmp_seq_history=np.array(copy.copy(seq_history[i]))
            
            tmp_seq_history=list(tmp_seq_history[tmp_seq_history>0])
            while True:
                item_negative=list(np.random.choice(self.config.num_items,self.config.negative_sample_ratio,replace=False)+1)
                if(len(set(tmp_seq_history+item_negative+[labels[i]]))==(len(set(tmp_seq_history))+self.config.negative_sample_ratio+1)):
                    batch_negative.append([labels[i]]+item_negative)
                    break
            
        return np.array(batch_negative,dtype=np.int)
 
    def next_batch(self):
        if(self.batch_id>=self.n_batch-1):
            self.batch_id=0
            self.epoch+=1
            tmp=list(zip(self.users,self.seq_history,self.labels,self.candidate))
            random.shuffle(tmp)
            self.users,self.seq_history,self.labels,self.candidate=list(zip(*tmp))
            
            self.users=np.array(list(self.users),dtype=np.int)
            self.seq_history=np.array(list(self.seq_history),dtype=np.int)
            self.labels=np.array(list(self.labels),dtype=np.int)
            self.candidate=np.array(list(self.candidate),dtype=np.int)
                   
            
        start=self.batch_id*self.batch_size
        end=start+self.batch_size
        self.batch_id+=1
        return self.users[start:end],\
                self.seq_history[start:end],\
                self.get_negative(self.seq_history[start:end],self.labels[start:end]),\
                self.labels[start:end],\
                self.candidate[start:end]
        

In [47]:
class Config:
    def __init__(self,
                 path_train=None,
                 path_test=None,
                 path_val=None,
                 batch_size=100,
                 title_embed_dim=50,
                 cat_embed_dim=540,                 
                 hidden_gru_size=590,
                 num_users=0,
                 num_items=0,
                 num_cat=0,
                 max_iter=15000,
                 negative_sample_ratio=10,
                 masking_probability=0.5,
                 num_candidate=100,
                 mask_cat=None,
                 pre_train_title=None,
                 cats=None,
                 finetune_title=False,
                 lr=1e-4,):
        self.path_train=path_train
        self.path_val=path_val
        self.path_test=path_test
        self.batch_size=batch_size
        self.title_embed_dim=title_embed_dim
        self.cat_embed_dim=cat_embed_dim
        self.hidden_gru_size=hidden_gru_size
        self.num_users=num_users
        self.num_items=num_items
        self.num_cat=num_cat
        self.max_iter=max_iter
        self.num_candidate=num_candidate
        self.negative_sample_ratio=negative_sample_ratio
        self.masking_probability=masking_probability
        self.masking_cat=np.array(mask_cat,dtype=np.int)
        self.pre_train_title=np.array(pre_train_title)
        self.cats=np.array(cats,dtype=np.int)
        self.finetune_title=finetune_title
        self.device=torch.device('cuda:0' if torch.cuda.is_available() else 'cpu')



In [48]:
def load_data(path):
    data=open(path,'rb') 
    data=pickle.load(data)
    data=np.array(list(data.values()))
    return data

path_train='/content/drive/MyDrive/Project3/data_train.txt'
path_val='/content/drive/MyDrive/Project3/data_val.txt'
path_test='/content/drive/MyDrive/Project3/data_test.txt'
path_title_embed='/content/drive/MyDrive/Project3/movieId2embed.p'
path_masking='/content/drive/MyDrive/Project3/cat_masking.p'
path_cat='/content/drive/MyDrive/Project3/movieId2cats.p'


title_embedding=load_data(path_title_embed)
masking=load_data(path_masking)
cats=load_data(path_cat)

n_items=len(title_embedding)-1
n_users=6040

In [49]:
config=Config(path_train=path_train,
              path_test=path_test,
              path_val=path_val,
              num_cat=18,
              num_users=n_users,
              num_items=n_items,
              pre_train_title=title_embedding,
              mask_cat=masking,
              cats=cats)

In [51]:
model=LSTUR_for_news(config)
model.trainer(max_iter=35000)


epoch 0 : it = 0, loss = 410.0194396972656
epoch 0 : it = 300, loss = 236.35240173339844
epoch 0 : it = 600, loss = 235.57308959960938
epoch 0 : it = 900, loss = 185.66293334960938
epoch 0 : it = 1200, loss = 187.6563262939453
epoch 0 : it = 1500, loss = 208.79196166992188
epoch 0 : it = 1800, loss = 185.57232666015625
epoch 0 : it = 2100, loss = 188.76455688476562
epoch 0 : it = 2400, loss = 178.26141357421875
epoch 0 : it = 2700, loss = 131.4141845703125
epoch 0 : it = 3000, loss = 160.55343627929688
epoch 0 : it = 3300, loss = 269.1005554199219
epoch 0 : it = 3600, loss = 133.6104736328125
epoch 0 : it = 3900, loss = 135.75086975097656
epoch 0 : it = 4200, loss = 143.3461151123047
epoch 0 : it = 4500, loss = 166.97366333007812
epoch 0 : it = 4800, loss = 183.9907684326172
epoch 0 : it = 5100, loss = 146.13563537597656
epoch 0 : it = 5400, loss = 238.38563537597656
epoch 0 : it = 5700, loss = 151.50294494628906
epoch 0 : it = 6000, loss = 138.80532836914062
epoch 0 : it = 6300, loss 

KeyboardInterrupt: ignored

In [None]:
checkpoint=torch.load('/content/drive/MyDrive/Project3/save_data_lda.pth')
model.net.load_state_dict(checkpoint['model_state_dict'])
# model.optimizer.load_state_dict(checkpoint['optimizer_state_dict'])
# model.trainer(path_train=config.path_train,
#               path_val=config.path_val,
#               max_iter=35000,
#               k=50)

data_test=DataReader(path_test,config,data_train=False)
print(model.eval(data_test,k=10))

In [9]:
data_train=DataReader(path_train,config,data_train=True)

In [10]:
print(data_train.seq_history[0])

[   0    0    0    0    0    0    0    0 3186]


In [52]:
kq=[]
kq.append(model.recall_5)
kq.append(model.recall_10)
kq.append(model.recall_20)
kq.append(model.recall_30)
kq.append(model.recall_40)
kq.append(model.recall_50)
kq.append(model.ndcg_10)
kq.append(model.ndcg_20)
kq.append(model.ndcg_30)
kq.append(model.ndcg_40)
kq.append(model.ndcg_50)
#kq.append(model.ndcg_100)

print(kq)
kq=np.array(kq,dtype=str)
for i in range(len(kq)):
    kq[i]=' '.join(kq[i])



[[0.24593647316538886, 0.26413377192982457, 0.30240131578947366, 0.3625328947368421], [0.40306681270536693, 0.42639254385964914, 0.4691995614035087, 0.5362390350877193], [0.6196276013143484, 0.6377412280701754, 0.6709649122807019, 0.7315350877192983], [0.762026286966046, 0.7737938596491228, 0.7949232456140352, 0.8384320175438598], [0.8580284775465499, 0.8619188596491228, 0.8764802631578947, 0.903201754385965], [0.9209090909090909, 0.9205921052631578, 0.9265131578947369, 0.9437390350877193], [0.20503503721730154, 0.2206490757631248, 0.25083760080013606, 0.29534587128142825], [0.2595829311075138, 0.2738707631961871, 0.301774053112967, 0.3447022925976146], [0.28990026913359673, 0.30282508345365194, 0.32818006398094424, 0.36749845873365033], [0.30847775134702105, 0.319877449673898, 0.34396257321912405, 0.38003178121918707], [0.31986052005528276, 0.33049962980233333, 0.3530216770420636, 0.38737110556328697]]


FileNotFoundError: ignored