In [1]:
import os
os.environ["CUDA_VISIBLE_DEVICES"] = '0'
device = 'cuda'

In [2]:
import json
import matplotlib.pyplot as plt
import numpy as np
import pickle
import random
import time
import torch
import torch.nn as nn
import torch.nn.functional as F
import torch.optim as optim
from copy import deepcopy
from sklearn.metrics import roc_auc_score
from torch.utils.data import Dataset, DataLoader
from tqdm.notebook import tqdm
from pytorchtools import EarlyStopping
assert(torch.cuda.is_available())

## Data Preprocessing

In [3]:
# import logging
# def logger(content):
#     logging.getLogger('matplotlib.font_manager').disabled = True
#     log_format = '[%(asctime)s] %(message)s'
#     date_format = '%Y-%m-%d %H:%M:%S'
#     logging.basicConfig(level = logging.DEBUG, format = log_format, datefmt = date_format)
#     logging.info(content)

In [None]:
# # merge news to one document
# news_set = set()
# news = []
# with open('/data/Recommend/MIND/MINDsmall_train/news.tsv', 'r') as f:
#     for line in f:
#         data = line.split('\t')
#         news_id = data[0]
#         if news_id not in news_set:
#             news.append(line)
#             news_set.add(news_id)
# with open('/data/Recommend/MIND/MINDsmall_dev/news.tsv') as f:
#     for line in f:
#         data = line.split('\t')
#         news_id = data[0]
#         if news_id not in news_set:
#             news.append(line)
#             news_set.add(news_id)
# # with open('/data/Recommend/MIND/MINDlarge_test/news.tsv') as f:
# #     for line in f:
# #         data = line.split('\t')
# #         news_id = data[0]
# #         if news_id not in news_set:
# #             news.append(line)
# #             news_set.add(news_id)

# # with open('/data/Recommend/MIND/small_news.tsv', 'w') as f:
# #     f.writelines(news)

# print(len(news))

In [3]:
def load_news(path):
    news_dict = {} # index -> news
    news_list = [] # index -> news
    newsid_dict = {} # newsid -> index
    word_dict = {'<PAD>': 0, '<OOV>': 1}
    cate_dict = {'<PAD>': 0, '<OOV>': 1}
    with open(path, 'r') as f:
        for line in f.readlines():
            news_id, category, subcategory, title, abstract, \
                url, title_entities, abstract_entities = line.strip().split('\t')
            title = title.lower().replace('.', '').replace(',', '').replace(';', '').replace(':', '').replace('\'', '').replace('"', '').replace('?', '').replace('!', '').replace('(', '').replace(')', '').split(' ')
            abstract = abstract.lower().replace('.', '').replace(',', '').replace(';', '').replace(':', '').replace('\'', '').replace('"', '').replace('?', '').replace('!', '').replace('(', '').replace(')', '').split(' ')
            for word in title + abstract:
                if word not in word_dict:
                    word_dict[word] = len(word_dict)
            if category not in cate_dict:
                cate_dict[category] = len(cate_dict)
            if subcategory not in cate_dict:
                cate_dict[subcategory] = len(cate_dict)
            if news_id not in newsid_dict:
                newsid_dict[news_id] = len(newsid_dict)
                news_list.append([category, subcategory, title, abstract])
    print(len(news_list))
    return news_list, newsid_dict, word_dict, cate_dict

In [4]:
max_title = 30
max_body = 100
def map_news_input(news_list, word_dict, cate_dict):
    n_news = len(news_list)
    titles = np.zeros((n_news, max_title), dtype = 'int32')
    bodys = np.zeros((n_news, max_body), dtype = 'int32')
    cates = np.zeros((n_news,1), dtype = 'int32')
    subcates = np.zeros((n_news,1), dtype = 'int32')
    for i in range(n_news):
        category, subcategory, title, abstract = news_list[i]
        titles[i, :len(title)] = [word_dict[word] for word in title[:max_title]]
        bodys[i, :len(abstract)] = [word_dict[word] for word in abstract[:max_body]]
        cates[i] = cate_dict[category]
        subcates[i] = cate_dict[subcategory]
    news_info = np.concatenate((titles, bodys, cates, subcates), axis = 1)
    print(news_info.shape)
    return news_info # index -> news_info

In [5]:
'''
news_list: original news
news_info: mapped news(word ids)
'''
news_list, newsid_dict, word_dict, cate_dict = load_news('/data/Recommend/MIND/small_news.tsv')
news_info = map_news_input(news_list, word_dict, cate_dict)

65238
(65238, 132)


In [6]:
def load_glove(word_to_ix, dim = 100):
    if dim == 100:
        path = '/data/pretrained/Glove/glove.6B.100d.txt'
    elif dim == 300:
        path = '/data/pretrained/Glove/glove.840B.300d.txt'
    word_emb = []
    word_emb = np.zeros((len(word_to_ix), dim), dtype = float)
    with open(path, 'r') as f:
        for line in f:
            data = line.strip().split(' ') # [word emb1 emb2 ... emb n]
            word = data[0]
            if word in word_to_ix:
                word_emb[word_to_ix[word]] = [float(i) for i in data[1:]]
    print(word_emb.shape)
    return torch.tensor(word_emb, dtype = torch.float)

In [7]:
word_emb = load_glove(word_dict, 300)
cate_emb = load_glove(cate_dict, 100)

(80416, 300)
(282, 100)


In [8]:
def load_train_impression(path, newsid_dict): # train&dev
    logs = []
    with open(path, 'r') as f:
        for line in f:
            imp_id, user_id, time, history, impression = line.strip().split('\t')
            if history:
                history = [newsid_dict[news_id] for news_id in history.split(' ')]
            else:
                history = []
            positive = []
            negative = []
            for item in impression.split(' '):
                news_id, num = item.split('-')
                if num == '1':
                    positive.append(newsid_dict[news_id])
                else:
                    negative.append(newsid_dict[news_id])
            logs.append([history, positive, negative]) # indexs
    return logs

In [9]:
max_history = 50
def map_user(logs): # index -> history, 用 index 代表 user_id, train&dev
    n_user = len(logs)
    user_hist = np.zeros((n_user, max_history), dtype = 'int32') # index -> history
    for i in range(n_user):
        history, positive, negative = logs[i]
        n_hist = len(history)
        if n_hist == 0:
            continue
        user_hist[i, -n_hist:] = history[-max_history:]
    return user_hist         

In [10]:
neg_ratio = 4
def neg_sample(negative):
    if len(negative) < neg_ratio:
        return random.sample(negative * (neg_ratio // len(negative) + 1), neg_ratio)
    else:
        return random.sample(negative, neg_ratio)

def get_train_input(logs): # 和 map_user 使用同一个 log
    all_pos = [] # 每个 sample 的 pos
    all_neg = []
    user_id = [] # 每个 sample 的 user，用 index 表示，和 map_user 的结果对应
    for i in range(len(logs)):
        history, positive, negative = logs[i]
        for pos in positive:
            all_pos.append(pos)
            all_neg.append(neg_sample(negative))
            user_id.append(i)
    n_imps = len(all_pos)
    imps = np.zeros((n_imps, 1 + neg_ratio), dtype = 'int32')
    for i in range(len(all_pos)):
        imps[i, 0] = all_pos[i]
        imps[i, 1:] = all_neg[i]
    user_id = np.array(user_id, dtype = 'int32')
    labels = np.zeros((n_imps, 1 + neg_ratio), dtype = 'int32')
    labels[:, 0] = 1
    print(n_imps)
    return imps, user_id, labels

def get_dev_input(logs): # 和 map_user 使用同一个 log
    imps = []
    labels = []
    user_id = np.zeros((len(logs)), dtype = 'int32') # 每个 sample 的 user index，和 map_user 的结果对应
    for i in range(len(logs)):
        history, positive, negative = logs[i]
        imps.append(np.array(positive + negative, dtype = 'int32'))
        labels.append([1] * len(positive) + [0] * len(negative))
        user_id[i] = i
    print(len(logs))
    return imps, user_id, labels

In [11]:
# # merge entity embedding to one document
# ent_set = set()
# ents = []
# with open('/data/Recommend/MIND/MINDsmall_train/entity_embedding.vec', 'r') as f:
#     for line in f:
#         ent_id = line.split('\t')[0]
#         if ent_id not in ent_set:
#             ents.append(line)
#             ent_set.add(ent_id)
# with open('/data/Recommend/MIND/MINDsmall_dev/entity_embedding.vec') as f:
#     for line in f:
#         ent_id = line.split('\t')[0]
#         if ent_id not in ent_set:
#             ents.append(line)
#             ent_set.add(ent_id)
# # with open('/data/Recommend/MIND/MINDlarge_test/entity_embedding.vec') as f:
# #     for line in f:
# #         ent_id = line.split('\t')[0]
# #         if ent_id not in ent_set:
# #             ents.append(line)
# #             ent_set.add(ent_id)

# with open('/data/Recommend/MIND/small_entity_embedding.vec', 'w') as f:
#     f.writelines(ents)

# print(len(ents))

In [11]:
def load_ent_emb(path):
    ent_emb = []
    ent_dict = {'<PAD>': 0, '<OOV>': 1}
    with open(path, 'r') as f:
        for line in f:
            data = line.strip().split('\t')
            ent_id = data[0]
            ent_dict[ent_id] = len(ent_dict)
            ent_emb.append([float(i) for i in data[1:]])
    ent_emb.insert(0, [0.] * len(ent_emb[0]))
    ent_emb.insert(0, [0.] * len(ent_emb[0]))
    ent_emb = torch.tensor(ent_emb, dtype = torch.float)
    print(ent_emb.shape)
    return ent_emb, ent_dict

In [12]:
max_ents = 5
def load_news_ent(path, ent_dict):
    n_news = len(news_list)
    news_ents = np.zeros((n_news, max_ents), dtype = 'int32')
    i = 0
    with open(path, 'r') as f:
        for line in f.readlines():
            data = line.strip().split('\t')
            ents = [ent['WikidataId'] for ent in json.loads(data[6])] + [ent['WikidataId'] for ent in json.loads(data[7])]
            news_ents[i, :len(ents)] = [ent_dict[ent] if ent in ent_dict else ent_dict['<OOV>'] for ent in ents[:max_ents]]
            i += 1
    print(len(news_ents))
    return news_ents # index -> ent_index

In [13]:
ent_emb, ent_dict = load_ent_emb('/data/Recommend/MIND/small_entity_embedding.vec')
news_ents = load_news_ent('/data/Recommend/MIND/small_news.tsv', ent_dict)

torch.Size([31453, 100])
65238


In [32]:
class TrainDataset(Dataset):
    def __init__(self, imp_datas, imp_users, imp_labels, news_info, user_clicks, batch_size, news_ents = None, news_urls = None):
        self.imp_datas = imp_datas # (n_imps, 1 + k)
        self.imp_users = imp_users
        self.imp_labels = imp_labels
        self.news = news_info
        self.user_clicks = user_clicks
        self.batch_size = batch_size
        self.news_ents = news_ents
        self.news_urls = news_urls
        
        self.n_data = imp_datas.shape[0]
        
    def __len__(self):
        return int(np.ceil(self.n_data / self.batch_size))

    def __getitem__(self, idx):
        start = idx * self.batch_size
        end = min((idx + 1) * self.batch_size, self.n_data)
        
        data_id = self.imp_datas[start: end] # (n_batch, 1 + k)
        data_news = self.news[data_id] # (n_batch, 1 + k, news_len)
        user_id = self.imp_users[start: end] # (n_batch)
        user_news_id = self.user_clicks[user_id] # (n_batch, n_hist)
        user_news = self.news[user_news_id] # (n_batch, n_hist, news_len)
        labels = self.imp_labels[start: end] # (n_batch, 1 + k)
        
        if self.news_ents is not None:
            samp_ents = self.news_ents[data_id]
            user_ents = self.news_ents[user_news_id]
            return data_news, user_news, labels, samp_ents, user_ents
        
        if self.news_urls is not None:
            samp_urls = self.news_urls[data_id]
            user_urls = self.news_urls[user_news_id]
            return data_news, user_news, labels, samp_urls, user_urls
        
        return data_news, user_news, labels
    
class DevDataset(Dataset): # data 和 label 是 list，每条数据不同长度
    def __init__(self, imp_datas, imp_users, imp_labels, news_info, user_clicks, batch_size):
        self.imp_datas = imp_datas # [imp1, imp2, ..., impn]
        self.imp_users = imp_users # (n_imps)
        self.imp_labels = imp_labels
        self.news = news_info
        self.user_clicks = user_clicks
        self.batch_size = batch_size
        
        self.n_data = len(imp_datas)
        
    def __len__(self):
        return int(np.ceil(self.n_data / self.batch_size))

    def __getitem__(self, idx):
        start = idx * self.batch_size
        end = min((idx + 1) * self.batch_size, self.n_data)
        
        data_ids = []
        data_news = [] # [(n_imp, news_len)]
        labels = [] # [(n_imp)]
        for i in range(start, end):
            data_id = self.imp_datas[i] # (n_imp)
            data_ids.append(data_id)
            # data_news.append(self.news[data_id]) # (n_imp, news_len)
            labels.append(self.imp_labels[i]) # (n_imp)
        user_id = self.imp_users[start: end] # (n_batch)
        user_news_id = self.user_clicks[user_id] # (n_batch, n_hist)
        # user_news = self.news[user_news_id] # (n_batch, n_hist, news_len)
        
        #return data_news, user_news, labels
        return data_ids, user_news_id, labels

In [12]:
n_batch = 16
train_logs = load_train_impression('/data/Recommend/MIND/MINDsmall_train/behaviors.tsv', newsid_dict)
train_user_hist = map_user(train_logs)
train_datas, train_users, train_labels = get_train_input(train_logs)
train_dataset = TrainDataset(train_datas, train_users, train_labels, news_info, train_user_hist, n_batch)

dev_logs = load_train_impression('/data/Recommend/MIND/MINDsmall_dev/behaviors.tsv', newsid_dict)
dev_user_hist = map_user(dev_logs)
dev_datas, dev_users, dev_labels = get_dev_input(dev_logs)
dev_dataset = DevDataset(dev_datas, dev_users, dev_labels, news_info, dev_user_hist, 64)

valid_datas, valid_users, valid_labels = get_train_input(dev_logs) # 用 train 的方法构造 dev_set
valid_dataset = TrainDataset(valid_datas, valid_users, valid_labels, news_info, dev_user_hist, n_batch)

236344
73152
111383


In [13]:
def encode_all_news(news_info, news_encoder):
    n_news = len(news_info)
    news_rep = []
    n_batch = 32
    for i in range((len(news_info) + n_batch - 1) // n_batch):
        batch_news = torch.tensor(news_info[i * n_batch: (i + 1) * n_batch], dtype = torch.long, device = 'cuda')
        batch_rep = news_encoder(batch_news).detach().cpu().numpy()
        news_rep.append(batch_rep)
    news_rep = np.concatenate(news_rep, axis = 0)
    return news_rep # (n_news, n_title, n_emb)

def encode_all_user(user_ids, user_hist, user_encoder, news_rep):
    user_rep = []
    with torch.no_grad():
        for _, batch in enumerate(dev_dataset):
            if len(batch[0]) == 0:
                break
            user_hist_rep = torch.tensor(news_rep[batch[1]], device = 'cuda') # (n_batch, n_hist)
            user = model.user_encoder(user_hist_rep).detach().cpu().numpy() # (n_batch, emb_dim)
            user_rep.append(user)
    # user_rep = np.concatenate(user_rep, axis = 0)
    return user_rep # [user_rep, ...]

In [14]:
def dcg_score(y_true, y_score, k=10):
    order = np.argsort(y_score)[::-1]
    y_true = np.take(y_true, order[:k])
    gains = 2 ** y_true - 1
    discounts = np.log2(np.arange(len(y_true)) + 2)
    return np.sum(gains / discounts)

def ndcg_score(y_true, y_score, k=10):
    best = dcg_score(y_true, y_true, k)
    actual = dcg_score(y_true, y_score, k)
    return actual / best

def mrr_score(y_true, y_score):
    order = np.argsort(y_score)[::-1]
    y_true = np.take(y_true, order)
    rr_score = y_true / (np.arange(len(y_true)) + 1)
    return np.sum(rr_score) / np.sum(y_true)

## Training & evaluating

In [15]:
# train with valid
def train(model, train_dataset, valid_dataset = None, epochs = 4):
    optimizer = optim.Adam(model.parameters(), lr = 1e-4)
    entrophy = nn.CrossEntropyLoss()
    for epoch in range(epochs):
        train_losses = []
        valid_losses = []
        model.train()
        for _, batch in tqdm(enumerate(train_dataset)):
            if batch[0].shape[0] == 0:
                break
            # torch.Size([16, 5, 30]) torch.Size([16, 50, 30]) torch.Size([16])
            sample = torch.tensor(batch[0], dtype = torch.long, device = device)
            history = torch.tensor(batch[1], dtype = torch.long, device = device)
            correct = torch.argmax(torch.tensor(batch[2], dtype = torch.long, device = device), dim = 1)
            optimizer.zero_grad()
            output = model(history, sample)
            loss = entrophy(output, correct)
            train_losses.append(loss.item())
            loss.backward()
            optimizer.step()

        if valid_dataset is not None:
            model.eval()
            with torch.no_grad():
                for _, batch in enumerate(valid_dataset):
                    if batch[0].shape[0] == 0:
                        break
                    sample = torch.tensor(batch[0], dtype = torch.long, device = device)
                    history = torch.tensor(batch[1], dtype = torch.long, device = device)
                    correct = torch.argmax(torch.tensor(batch[2], dtype = torch.long, device = device), dim = 1)
                    output = model(sample, history)
                    loss = entrophy(output, correct)
                    valid_losses.append(loss.item())
                print('[epoch {:d}] train_loss: {:.4f} valid_loss: {:.4f}'.format(epoch + 1, np.average(train_losses), np.average(valid_losses)))
        else:
            print('[epoch {:d}] train_loss: {:.4f}'.format(epoch + 1, np.average(train_losses)))

In [16]:
def evaluate(model, dev_dataset, news_info, dev_users, dev_user_hist):
    news_rep = encode_all_news(news_info, model.news_encoder) # (65238, 400)
    user_rep = encode_all_user(dev_users, dev_user_hist, model.user_encoder, news_rep)
    
    model.eval()
    with torch.no_grad():
        auc_scores = []
        mrr_scores = []
        ndcg5_scores = []
        ndcg10_scores = []
        for i, batch in tqdm(enumerate(dev_dataset)):
            if len(batch[0]) == 0:
                break
            user = user_rep[i]
            for j in range(len(batch[0])):
                sample = news_rep[batch[0][j]] # (n_imp, emb_dim)
                positive = batch[2][j] # (1, n_imp)

                score = np.matmul(sample, user[j]) # (1, n_imp)
                predict = np.exp(score) / np.sum(np.exp(score))

                auc_scores.append(roc_auc_score(positive, predict))
                mrr_scores.append(mrr_score(positive, predict))
                ndcg5_scores.append(ndcg_score(positive, predict, k = 5))
                ndcg10_scores.append(ndcg_score(positive, predict, k = 10))
    print('[Test] AUC: {:4f}, MRR: {:4f}, nDCG5:{:4f}, nDCG10: {:4f}'.format(
        np.mean(auc_scores), np.mean(mrr_scores), np.mean(ndcg5_scores), np.mean(ndcg10_scores)
    ))

In [17]:
def train_and_eval(model, train_dataset, dev_dataset, news_info, dev_users, dev_user_hist, epochs = 4):
    optimizer = optim.Adam(model.parameters(), lr = 1e-4)
    entrophy = nn.CrossEntropyLoss()
    for epoch in range(epochs):
        train_losses = []
        valid_losses = []
        model.train()
        for _, batch in tqdm(enumerate(train_dataset)):
            if batch[0].shape[0] == 0:
                break
            # torch.Size([16, 5, 30]) torch.Size([16, 50, 30]) torch.Size([16])
            sample = torch.tensor(batch[0], dtype = torch.long, device = device)
            history = torch.tensor(batch[1], dtype = torch.long, device = device)
            correct = torch.argmax(torch.tensor(batch[2], dtype = torch.long, device = device), dim = 1)
            optimizer.zero_grad()
            output = model(history, sample)
            loss = entrophy(output, correct)
            train_losses.append(loss.item())
            loss.backward()
            optimizer.step()
        print('[epoch {:d}] train_loss: {:.4f}'.format(epoch + 1, np.average(train_losses)))
        evaluate(model, dev_dataset, news_info, dev_users, dev_user_hist)

## Provider

In [116]:
def load_provider(news_list, newsid_dict):
    n_news = len(news_list)
    urlid_dict = {'<PAD>': 0, '<OOV>': 1}
    news_urls = np.zeros((n_news), dtype = 'int32')
    with open('/data/Recommend/MIND/news_urls_v2.json', 'r') as f:
        data = json.load(f)
        for news_id in data:
            url = data[news_id]
            if url not in urlid_dict:
                urlid_dict[url] = len(urlid_dict)
            if news_id in newsid_dict:
                news_urls[newsid_dict[news_id]] = urlid_dict[url]
    print(len(urlid_dict))
    return urlid_dict, news_urls

In [117]:
urlid_dict, news_urls = load_provider(news_list, newsid_dict)

1708


In [167]:
def load_freq_provider(news_list, newsid_dict):
    n_news = len(news_list)
    urlid_dict = {'<PAD>': 0, '<OOV>': 1}
    url_freq_dict = {} # 
    news_urls = np.zeros((n_news), dtype = 'int32')
    with open('/data/Recommend/MIND/news_urls_v2.json', 'r') as f:
        data = json.load(f)
        for news_id in data:
            url = data[news_id]
            if url not in urlid_dict:
                urlid_dict[url] = len(urlid_dict)
                url_freq_dict[urlid_dict[url]] = 1
            else:
                url_freq_dict[urlid_dict[url]] += 1
            if news_id in newsid_dict:
                news_urls[newsid_dict[news_id]] = urlid_dict[url]
    print(len(urlid_dict))
    return urlid_dict, news_urls, url_freq_dict

In [168]:
urlid_dict, news_urls, url_freq_dict = load_freq_provider(news_list, newsid_dict)

1708


In [176]:
# [(url_id, freq)]
sorted_url_freq = sorted([(i, url_freq_dict[i]) for i in url_freq_dict], key = lambda x: x[1], reverse = True)

s = 0
url_trans_dict = {} # url_id -> trans_id
trans = [[]] # trans_id -> url_id
idx = 0
while idx < len(sorted_url_freq):
    if s < 1000:
        trans[-1].append(sorted_url_freq[idx][0])
        s += sorted_url_freq[idx][1]
    else:
        trans.append([sorted_url_freq[idx][0]])
        s = sorted_url_freq[idx][1]
    url_trans_dict[sorted_url_freq[idx][0]] = len(trans) - 1
    idx += 1
print(len(trans))

105


In [183]:
trans_news_urls = np.array([url_trans_dict[i] for i in news_urls], dtype = int)

In [None]:
news_url_freq = 

In [184]:
url_train_dataset = TrainDataset(train_datas, train_users, train_labels, news_info, train_user_hist, n_batch, news_urls = trans_news_urls)

In [40]:
class AttentionPooling(nn.Module):
    def __init__(self, emb_dim, query_dim):
        super().__init__()
        self.fc1 = nn.Linear(emb_dim, query_dim)
        self.fc2 = nn.Linear(query_dim, 1)
        
    def forward(self, x, mask = None):
        '''
        (n_batch, n_seq, emb_dim) -> (n_batch, emb_dim)
        a = q^T tanh(V * k + v)
        alpha = softmax(a)
        '''
        a = self.fc2(torch.tanh(self.fc1(x))) # (n_batch, n_seq, 1)
        if mask is not None:
            a = a.masked_fill(mask.unsqueeze(-1) == 0, -1e9)
        alpha = F.softmax(a, dim = -2) # (n_batch, n_seq, 1)
        r = torch.matmul(alpha.transpose(-2, -1), x).squeeze(-2) # (n_batch, emb_dim)
        return r

In [97]:
class ProNewsEncoder(nn.Module):
    def __init__(self, args, word_emb, cate_emb, n_urls, url_dim, news_dim):
        super().__init__()
        self.args = args
        
        self.word_embedding = nn.Embedding.from_pretrained(word_emb)
        self.title_cnn = nn.Conv1d(word_emb.shape[1], news_dim, 3, padding = 1)
        self.title_attn = AttentionPooling(news_dim, 200)
        out_dim = news_dim
        if args['use_url']:
            self.url_embedding = nn.Embedding(n_urls, url_dim)
            self.url_fc1 = nn.Linear(url_dim, 200)
            self.url_fc2 = nn.Linear(200, news_dim)
            out_dim += news_dim
        if args['use_cate']:
            self.cate_embedding = nn.Embedding(n_urls, url_dim)
            self.cate_fc1 = nn.Linear(cate_emb.shape[1], 200)
            self.cate_fc2 = nn.Linear(200, news_dim)
            self.aggr_fc = nn.Linear
            out_dim += news_dim * 2
        # self.aggr_attn = AttentionPooling(news_dim, 200)
        self.aggr_fc = nn.Linear(out_dim, news_dim)
        self.drop = nn.Dropout(0.2)
    
    def forward(self, news, urls = None):
        title, body, cate = news[:, :30], news[:, 30: -2], news[:, -2:]
        
        t_rep = self.word_embedding(title) # (n_batch, n_seq, emb_dim)
        t_rep = self.drop(t_rep)
        t_rep = self.title_cnn(t_rep.transpose(2, 1)).transpose(2, 1) # (n_batch, n_seq, news_dim)
        t_rep = self.drop(t_rep) # no relu
        t_rep = self.title_attn(t_rep) # (n_batch, news_dim)
        
        if self.args['use_url']:
            u_rep = self.url_embedding(urls) # (n_news, emb_dim)
            u_rep = self.drop(u_rep)
            u_rep = F.relu(self.url_fc1(u_rep))
            u_rep = self.url_fc2(u_rep) # (n_news, news_dim)
            t_rep = torch.cat((t_rep, u_rep), dim = -1)
            #t_rep = torch.stack((t_rep, u_rep), dim = -2)
        #else:
            #t_rep = t_rep.unsqueeze(-2)
        if self.args['use_cate']:
            c_rep = self.cate_embedding(cate) # (n_news, 2, emb_dim)
            c_rep = self.drop(c_rep)
            c_rep = F.relu(self.cate_fc1(c_rep))
            c_rep = self.cate_fc2(c_rep) # (n_news, 2, news_dim)
            t_rep = torch.cat((t_rep, c_rep.reshape(c_rep.shape[0], -1)), dim = -1)
            # t_rep = torch.cat((t_rep, c_rep), dim = -2)
        # t_rep = self.aggr_attn(t_rep)
        t_rep = self.aggr_fc(t_rep)
        return t_rep # (n_news, news_dim)

class UserEncoder(nn.Module):
    def __init__(self, news_dim):
        super().__init__()
        self.attn = AttentionPooling(news_dim, 200)
    
    def forward(self, h): 
        u = self.attn(h)
        return u

In [51]:
class Provider(nn.Module):
    def __init__(self, args, word_emb, cate_emb, n_urls):
        super().__init__()
        url_dim, news_dim = 100, 256
        self.news_encoder = ProNewsEncoder(args, word_emb, cate_emb, n_urls, url_dim, news_dim)
        self.user_encoder = UserEncoder(news_dim)
    
    def forward(self, hist, samp, hist_urls, samp_urls):
        n_batch, n_news, n_sequence = hist.shape
        n_samp = samp.shape[1] # k + 1
        
        hist = hist.reshape(n_batch * n_news, n_sequence)
        hist_urls = hist_urls.reshape(n_batch * n_news)
        h = self.news_encoder(hist, hist_urls) # (n_batch*n_news, n_filter)
        h = h.reshape(n_batch, n_news, -1)  # (n_batch, n_news, n_filter)
        u = self.user_encoder(h) # (n_batch, n_filter)
        
        samp = samp.reshape(n_batch * n_samp, n_sequence)
        samp_urls = samp_urls.reshape(n_batch * n_samp)
        r = self.news_encoder(samp, samp_urls) # (n_batch*(k+1), n_filter)
        r = r.reshape(n_batch, n_samp, -1) # (n_batch, k + 1, n_filter)
        
        y = torch.bmm(r, u.unsqueeze(2)) # (n_batch, K + 1, 1)
        return y.squeeze(2)

In [88]:
args = {'model': 'Provider',
        'use_cate': False,
        'use_url': True,}
print(args)
model = Provider(args, word_emb, cate_emb, len(urlid_dict)).to(device)
train_and_eval_provider(model, url_train_dataset, dev_dataset, news_info, news_urls, epochs = 10)

{'model': 'Provider', 'use_cate': False, 'use_url': True}


0it [00:00, ?it/s]

0it [00:00, ?it/s]

[epoch 1] loss: 1.4302 AUC: 0.614185, MRR: 0.273071, nDCG5:0.297681, nDCG10: 0.363926


0it [00:00, ?it/s]

0it [00:00, ?it/s]

[epoch 2] loss: 1.3785 AUC: 0.640705, MRR: 0.291117, nDCG5:0.320171, nDCG10: 0.383687


0it [00:00, ?it/s]

0it [00:00, ?it/s]

[epoch 3] loss: 1.3606 AUC: 0.628058, MRR: 0.286336, nDCG5:0.313614, nDCG10: 0.377902


0it [00:00, ?it/s]

0it [00:00, ?it/s]

[epoch 4] loss: 1.3491 AUC: 0.635592, MRR: 0.290645, nDCG5:0.318341, nDCG10: 0.382643


0it [00:00, ?it/s]

0it [00:00, ?it/s]

[epoch 5] loss: 1.3411 AUC: 0.641448, MRR: 0.295913, nDCG5:0.322590, nDCG10: 0.386736


0it [00:00, ?it/s]

0it [00:00, ?it/s]

[epoch 6] loss: 1.3321 AUC: 0.635583, MRR: 0.286755, nDCG5:0.316828, nDCG10: 0.380306


0it [00:00, ?it/s]

0it [00:00, ?it/s]

[epoch 7] loss: 1.3277 AUC: 0.624161, MRR: 0.284997, nDCG5:0.312688, nDCG10: 0.377207


0it [00:00, ?it/s]

0it [00:00, ?it/s]

[epoch 8] loss: 1.3211 AUC: 0.633644, MRR: 0.290238, nDCG5:0.317840, nDCG10: 0.382220


0it [00:00, ?it/s]

0it [00:00, ?it/s]

[epoch 9] loss: 1.3155 AUC: 0.634868, MRR: 0.289577, nDCG5:0.318457, nDCG10: 0.381380


0it [00:00, ?it/s]

0it [00:00, ?it/s]

[epoch 10] loss: 1.3110 AUC: 0.639893, MRR: 0.295128, nDCG5:0.322584, nDCG10: 0.386759


In [89]:
args = {'model': 'Provider',
        'use_cate': False,
        'use_url': False,}
print(args)
model = Provider(args, word_emb, cate_emb, len(urlid_dict)).to(device)
train_and_eval_provider(model, url_train_dataset, dev_dataset, news_info, news_urls, epochs = 10)

{'model': 'Provider', 'use_cate': False, 'use_url': False}


0it [00:00, ?it/s]

0it [00:00, ?it/s]

[epoch 1] loss: 1.4413 AUC: 0.628879, MRR: 0.284624, nDCG5:0.313193, nDCG10: 0.378957


0it [00:00, ?it/s]

0it [00:00, ?it/s]

[epoch 2] loss: 1.3947 AUC: 0.616490, MRR: 0.277957, nDCG5:0.299881, nDCG10: 0.368338


0it [00:00, ?it/s]

0it [00:00, ?it/s]

[epoch 3] loss: 1.3783 AUC: 0.613622, MRR: 0.280652, nDCG5:0.303172, nDCG10: 0.370235


0it [00:00, ?it/s]

0it [00:00, ?it/s]

[epoch 4] loss: 1.3674 AUC: 0.623822, MRR: 0.287230, nDCG5:0.311453, nDCG10: 0.376722


0it [00:00, ?it/s]

0it [00:00, ?it/s]

[epoch 5] loss: 1.3590 AUC: 0.625941, MRR: 0.286986, nDCG5:0.312302, nDCG10: 0.378597


0it [00:00, ?it/s]

0it [00:00, ?it/s]

[epoch 6] loss: 1.3529 AUC: 0.637591, MRR: 0.290674, nDCG5:0.317976, nDCG10: 0.383654


0it [00:00, ?it/s]

0it [00:00, ?it/s]

[epoch 7] loss: 1.3466 AUC: 0.627673, MRR: 0.286657, nDCG5:0.311742, nDCG10: 0.379550


0it [00:00, ?it/s]

0it [00:00, ?it/s]

[epoch 8] loss: 1.3431 AUC: 0.641195, MRR: 0.298448, nDCG5:0.325959, nDCG10: 0.391161


0it [00:00, ?it/s]

0it [00:00, ?it/s]

[epoch 9] loss: 1.3382 AUC: 0.637345, MRR: 0.299605, nDCG5:0.327699, nDCG10: 0.391572


0it [00:00, ?it/s]

0it [00:00, ?it/s]

[epoch 10] loss: 1.3333 AUC: 0.632965, MRR: 0.290412, nDCG5:0.315021, nDCG10: 0.381709


In [95]:
args = {'model': 'Provider',
        'use_cate': True,
        'use_url': True,}
print(args)
model = Provider(args, word_emb, cate_emb, len(urlid_dict)).to(device)
train_and_eval_provider(model, url_train_dataset, dev_dataset, news_info, news_urls, epochs = 10)

{'model': 'Provider', 'use_cate': True, 'use_url': True}


0it [00:00, ?it/s]

0it [00:00, ?it/s]

[epoch 1] loss: 1.4012, AUC: 0.640653, MRR: 0.303924, nDCG5:0.335263, nDCG10: 0.397745


0it [00:00, ?it/s]

0it [00:00, ?it/s]

[epoch 2] loss: 1.3563, AUC: 0.643656, MRR: 0.304700, nDCG5:0.335069, nDCG10: 0.398930


0it [00:00, ?it/s]

0it [00:00, ?it/s]

[epoch 3] loss: 1.3396, AUC: 0.640591, MRR: 0.299477, nDCG5:0.330489, nDCG10: 0.392819


0it [00:00, ?it/s]

0it [00:00, ?it/s]

[epoch 4] loss: 1.3283, AUC: 0.647465, MRR: 0.303471, nDCG5:0.334918, nDCG10: 0.396716


0it [00:00, ?it/s]

0it [00:00, ?it/s]

[epoch 5] loss: 1.3200, AUC: 0.653494, MRR: 0.312777, nDCG5:0.343799, nDCG10: 0.405539


0it [00:00, ?it/s]

0it [00:00, ?it/s]

[epoch 6] loss: 1.3133, AUC: 0.656056, MRR: 0.314353, nDCG5:0.349014, nDCG10: 0.408039


0it [00:00, ?it/s]

0it [00:00, ?it/s]

[epoch 7] loss: 1.3072, AUC: 0.650733, MRR: 0.311483, nDCG5:0.345407, nDCG10: 0.405460


0it [00:00, ?it/s]

0it [00:00, ?it/s]

[epoch 8] loss: 1.3009, AUC: 0.656592, MRR: 0.317243, nDCG5:0.352243, nDCG10: 0.411916


0it [00:00, ?it/s]

0it [00:00, ?it/s]

[epoch 9] loss: 1.2957, AUC: 0.650683, MRR: 0.313092, nDCG5:0.346717, nDCG10: 0.407398


0it [00:00, ?it/s]

0it [00:00, ?it/s]

[epoch 10] loss: 1.2915, AUC: 0.652786, MRR: 0.309348, nDCG5:0.342627, nDCG10: 0.403183


In [96]:
args = {'model': 'Provider',
        'use_cate': True,
        'use_url': False,}
print(args)
model = Provider(args, word_emb, cate_emb, len(urlid_dict)).to(device)
train_and_eval_provider(model, url_train_dataset, dev_dataset, news_info, news_urls, epochs = 10)

{'model': 'Provider', 'use_cate': True, 'use_url': False}


0it [00:00, ?it/s]

0it [00:00, ?it/s]

[epoch 1] loss: 1.4078, AUC: 0.634319, MRR: 0.293220, nDCG5:0.320965, nDCG10: 0.386314


0it [00:00, ?it/s]

0it [00:00, ?it/s]

[epoch 2] loss: 1.3670, AUC: 0.648170, MRR: 0.308181, nDCG5:0.338436, nDCG10: 0.402698


0it [00:00, ?it/s]

0it [00:00, ?it/s]

[epoch 3] loss: 1.3523, AUC: 0.649158, MRR: 0.311340, nDCG5:0.340253, nDCG10: 0.402825


0it [00:00, ?it/s]

0it [00:00, ?it/s]

[epoch 4] loss: 1.3421, AUC: 0.647363, MRR: 0.305711, nDCG5:0.334785, nDCG10: 0.399599


0it [00:00, ?it/s]

0it [00:00, ?it/s]

[epoch 5] loss: 1.3338, AUC: 0.653449, MRR: 0.311382, nDCG5:0.342792, nDCG10: 0.405395


0it [00:00, ?it/s]

0it [00:00, ?it/s]

[epoch 6] loss: 1.3261, AUC: 0.656934, MRR: 0.312819, nDCG5:0.343344, nDCG10: 0.407314


0it [00:00, ?it/s]

0it [00:00, ?it/s]

[epoch 7] loss: 1.3211, AUC: 0.658063, MRR: 0.309421, nDCG5:0.341891, nDCG10: 0.405290


0it [00:00, ?it/s]

0it [00:00, ?it/s]

[epoch 8] loss: 1.3173, AUC: 0.655687, MRR: 0.313527, nDCG5:0.343807, nDCG10: 0.407653


0it [00:00, ?it/s]

0it [00:00, ?it/s]

[epoch 9] loss: 1.3120, AUC: 0.652762, MRR: 0.307561, nDCG5:0.338517, nDCG10: 0.403030


0it [00:00, ?it/s]

0it [00:00, ?it/s]

[epoch 10] loss: 1.3090, AUC: 0.656447, MRR: 0.312309, nDCG5:0.344102, nDCG10: 0.406753


In [55]:
args = {'model': 'Provider',
        'use_cate': False,
        'use_url': True,}
print(args)
model = Provider(args, word_emb, cate_emb, len(urlid_dict)).to(device)
train_provider(model, url_train_dataset)
# train_and_eval(model, url_train_dataset, dev_dataset, news_info, dev_users, dev_user_hist, epochs = 10)

{'model': 'Provider', 'use_cate': False, 'use_url': True}


0it [00:00, ?it/s]

[epoch 1] train_loss: 1.4308


0it [00:00, ?it/s]

[epoch 2] train_loss: 1.3787


0it [00:00, ?it/s]

[epoch 3] train_loss: 1.3608


0it [00:00, ?it/s]

[epoch 4] train_loss: 1.3494


### train and eval

In [98]:
def train_epoch(model, train_dataset, optimizer, entrophy):
    train_losses = []
    model.train()
    for _, batch in enumerate(train_dataset):
        if batch[0].shape[0] == 0:
            break
        sample = torch.tensor(batch[0], dtype = torch.long, device = device)
        history = torch.tensor(batch[1], dtype = torch.long, device = device)
        correct = torch.argmax(torch.tensor(batch[2], dtype = torch.long, device = device), dim = 1)
        samp_urls = torch.tensor(batch[3], dtype = torch.long, device = device)
        user_urls = torch.tensor(batch[4], dtype = torch.long, device = device)
        optimizer.zero_grad()
        output = model(history, sample, user_urls, samp_urls)
        loss = entrophy(output, correct)
        train_losses.append(loss.item())
        loss.backward()
        optimizer.step()
    return np.average(train_losses)

In [73]:
def encode_provider_all_news(news_info, news_urls, news_encoder):
    n_news = len(news_info)
    news_rep = []
    n_batch = 32
    for i in range((len(news_info) + n_batch - 1) // n_batch):
        batch_news = torch.tensor(news_info[i * n_batch: (i + 1) * n_batch], dtype = torch.long, device = 'cuda')
        batch_urls = torch.tensor(news_urls[i * n_batch: (i + 1) * n_batch], dtype = torch.long, device = 'cuda')
        batch_rep = news_encoder(batch_news, batch_urls).detach().cpu().numpy()
        news_rep.append(batch_rep)
    news_rep = np.concatenate(news_rep, axis = 0)
    return news_rep # (n_news, n_title, n_emb)

def encode_all_user(user_encoder, news_rep):
    user_rep = []
    with torch.no_grad():
        for _, batch in enumerate(dev_dataset):
            if len(batch[0]) == 0:
                break
            user_hist_rep = torch.tensor(news_rep[batch[1]], device = 'cuda') # (n_batch, n_hist)
            user = model.user_encoder(user_hist_rep).detach().cpu().numpy() # (n_batch, emb_dim)
            user_rep.append(user)
    return user_rep # [user_rep, ...]

In [54]:
def train_provider(model, train_dataset, epochs = 4):
    optimizer = optim.Adam(model.parameters(), lr = 1e-4)
    entrophy = nn.CrossEntropyLoss()
    for epoch in range(epochs):
        train_losses = []
        valid_losses = []
        model.train()
        for _, batch in tqdm(enumerate(train_dataset)):
            if batch[0].shape[0] == 0:
                break
            sample = torch.tensor(batch[0], dtype = torch.long, device = device)
            history = torch.tensor(batch[1], dtype = torch.long, device = device)
            correct = torch.argmax(torch.tensor(batch[2], dtype = torch.long, device = device), dim = 1)
            samp_urls = torch.tensor(batch[3], dtype = torch.long, device = device)
            user_urls = torch.tensor(batch[4], dtype = torch.long, device = device)
            optimizer.zero_grad()
            output = model(history, sample, user_urls, samp_urls)
            loss = entrophy(output, correct)
            train_losses.append(loss.item())
            loss.backward()
            optimizer.step()
        print('[epoch {:d}] train_loss: {:.4f}'.format(epoch + 1, np.average(train_losses)))

In [99]:
def evaluate_provider(model, news_info, news_urls):
    news_rep = encode_provider_all_news(news_info, news_urls, model.news_encoder)
    user_rep = encode_all_user(model.user_encoder, news_rep)
    model.eval()
    with torch.no_grad():
        auc_scores = []
        mrr_scores = []
        ndcg5_scores = []
        ndcg10_scores = []
        for i, batch in enumerate(dev_dataset):
            if len(batch[0]) == 0:
                break
            for j in range(len(batch[0])): # n_batch
                sample = news_rep[batch[0][j]] # (n_imp, emb_dim)
                score = np.matmul(sample, user_rep[i][j]) # (1, n_imp)
                predict = np.exp(score) / np.sum(np.exp(score))

                positive = batch[2][j] # (1, n_imp)
                
                auc_scores.append(roc_auc_score(positive, predict))
                mrr_scores.append(mrr_score(positive, predict))
                ndcg5_scores.append(ndcg_score(positive, predict, k = 5))
                ndcg10_scores.append(ndcg_score(positive, predict, k = 10))
    return np.mean(auc_scores), np.mean(mrr_scores), np.mean(ndcg5_scores), np.mean(ndcg10_scores)

In [186]:
def train_and_eval_provider(model, train_dataset, dev_dataset, news_info, news_urls, epochs = 10):
    optimizer = optim.Adam(model.parameters(), lr = 1e-4)
    entrophy = nn.CrossEntropyLoss()
    for epoch in range(epochs):
        loss = train_epoch(model, train_dataset, optimizer, entrophy)
        auc, mrr, ndcg5, ndcg10 = evaluate_provider(model, news_info, news_urls)
        print('[epoch {:d}] loss: {:.4f}, AUC: {:.4f}, MRR: {:.4f}, nDCG5:{:.4f}, nDCG10: {:.4f}'.format(
            epoch + 1, loss, auc, mrr, ndcg5, ndcg10))

In [None]:
## NRMS Provider

In [100]:
class ScaledDotProductAttention(nn.Module):
    def __init__(self, d_k):
        super().__init__()
        self.d_k = d_k

    def forward(self, Q, K, V, attn_mask=None):
        scores = torch.matmul(Q, K.transpose(-1, -2)) / np.sqrt(self.d_k)
        scores = torch.exp(scores)
        if attn_mask is not None:
            scores = scores * attn_mask
        attn = scores / (torch.sum(scores, dim=-1, keepdim=True)  + 1e-8)
        
        context = torch.matmul(attn, V)
        return context, attn

class MultiHeadSelfAttention(nn.Module):
    def __init__(self, d_model, n_heads, d_k, d_v):
        super().__init__()
        self.d_model = d_model # 300
        self.n_heads = n_heads # 20
        self.d_k = d_k # 20
        self.d_v = d_v # 20
        
        self.W_Q = nn.Linear(d_model, d_k * n_heads) # 300, 400
        self.W_K = nn.Linear(d_model, d_k * n_heads) # 300, 400
        self.W_V = nn.Linear(d_model, d_v * n_heads) # 300, 400
        
        self._initialize_weights()
                
    def _initialize_weights(self):
        for m in self.modules():
            if isinstance(m, nn.Linear):
                nn.init.xavier_uniform_(m.weight, gain=1)
                
    def forward(self, Q, K, V, attn_mask=None):
        residual, batch_size = Q, Q.size(0)
        
        q_s = self.W_Q(Q).view(batch_size, -1, self.n_heads, self.d_k).transpose(1,2)
        k_s = self.W_K(K).view(batch_size, -1, self.n_heads, self.d_k).transpose(1,2)
        v_s = self.W_V(V).view(batch_size, -1, self.n_heads, self.d_v).transpose(1,2)
        
        if attn_mask is not None:
            attn_mask = attn_mask.unsqueeze(1).expand(batch_size, max_len, max_len) 
            attn_mask = attn_mask.unsqueeze(1).repeat(1, self.n_heads, 1, 1) 
        
        context, attn = ScaledDotProductAttention(self.d_k)(q_s, k_s, v_s, attn_mask) 
        context = context.transpose(1, 2).contiguous().view(batch_size, -1, self.n_heads * self.d_v) 
        return context # (n_batch, n_seq, emb_dim)

In [None]:
class ProAttnNewsEncoder(nn.Module):
    def __init__(self, word_emb, cate_emb, n_urls, url_dim, news_dim):
        super().__init__()
        self.args = args
        self.word_embedding = nn.Embedding.from_pretrained(word_emb)
        emb_dim = word_emb.shape[1]
        self.self_attn = MultiHeadSelfAttention(emb_dim, 16, 16, 16)
        self.addi_attn = AttentionPooling(news_dim, 200)
        self.dropout = nn.Dropout(0.2)
        if args['use_url']:
            self.url_embedding = nn.Embedding(n_urls, url_dim)
            #self.url_fc1 = nn.Linear(url_dim, 200)
            #self.url_fc2 = nn.Linear(200, news_dim)
        #if args['use_cate']:
            #self.cate_embedding = nn.Embedding(n_urls, url_dim)
            #self.cate_fc1 = nn.Linear(cate_emb.shape[1], 200)
            #self.cate_fc2 = nn.Linear(200, news_dim)
        self.aggr_attn = AttentionPooling(news_dim, 200)
    
    def forward(self, news, urls = None):
        title, body, cate, subcate = news[:, :max_title], news[:, max_title: -2], news[:, -2], news[:, -1]
        
        t_rep = self.word_embedding(title) # (n_batch, n_seq, emb_dim)
        if self.args['use_url']:
            u_rep = self.url_embedding(urls) # (n_news, emb_dim)
            t_rep = torch.cat((t_rep, u_rep), dim = -1)
        t_rep = self.dropout(t_rep)
        t_rep = self.self_attn(t_rep, t_rep, t_rep) # (n_batch, n_seq, 256)
        t_rep = self.addi_attn(t_rep) # (n_batch, 256)
            #u_rep = self.dropout(u_rep)
            #u_rep = F.relu(self.url_fc1(u_rep))
            #u_rep = self.url_fc2(u_rep) # (n_news, news_dim)
            # t_rep = torch.cat((t_rep, u_rep), dim = -1)
            t_rep = torch.stack((t_rep, u_rep), dim = -2)
#         if self.args['use_cate']:
#             c_rep = self.cate_embedding(cate) # (n_news, 2, emb_dim)
#             c_rep = self.drop(c_rep)
#             c_rep = F.relu(self.cate_fc1(c_rep))
#             c_rep = self.cate_fc2(c_rep) # (n_news, 2, news_dim)
#             # t_rep = torch.cat((t_rep, c_rep.reshape(c_rep.shape[0], -1)), dim = -1)
#             t_rep = torch.cat((t_rep, c_rep), dim = -2)
#         if self.args['use_url'] or self.args['use_cate']:
#             t_rep = self.aggr_attn(t_rep)
        
        return t_rep # (n_news, 256)

class AttnUserEncoder(nn.Module):
    def __init__(self, n_head, news_dim, query_dim):
        super().__init__()
        self.self_attn = MultiHeadSelfAttention(news_dim, n_head, 16, 16)
        self.addi_attn = AttentionPooling(news_dim, query_dim)
    
    def forward(self, h): # (n_batch, n_news, 256)
        u = self.self_attn(h, h, h) # (n_batch, n_news, 256)
        u = self.addi_attn(u) # (n_batch, 256)
        return u

In [110]:
# emb+dense，拼起来
class AttnNewsEncoder(nn.Module):
    def __init__(self, word_emb, cate_emb, n_urls, url_dim, news_dim):
        super().__init__()
        self.args = args
        self.word_embedding = nn.Embedding.from_pretrained(word_emb)
        emb_dim = word_emb.shape[1]
        self.self_attn = MultiHeadSelfAttention(emb_dim, 16, 16, 16)
        self.addi_attn = AttentionPooling(news_dim, 200)
        self.dropout = nn.Dropout(0.2)
        if args['use_url']:
            self.url_embedding = nn.Embedding(n_urls, url_dim)
            self.url_fc1 = nn.Linear(url_dim, 200)
            self.url_fc2 = nn.Linear(200, news_dim)
        if args['use_cate']:
            self.cate_embedding = nn.Embedding(n_urls, url_dim)
            self.cate_fc1 = nn.Linear(cate_emb.shape[1], 200)
            self.cate_fc2 = nn.Linear(200, news_dim)
        self.aggr_attn = AttentionPooling(news_dim, 200)
    
    def forward(self, news, urls = None):
        title, body, cate, subcate = news[:, :max_title], news[:, max_title: -2], news[:, -2], news[:, -1]
        
        t_rep = self.word_embedding(title) # (n_batch, n_seq, emb_dim)
        t_rep = self.dropout(t_rep)
        t_rep = self.self_attn(t_rep, t_rep, t_rep) # (n_batch, n_seq, 256)
        t_rep = self.addi_attn(t_rep) # (n_batch, 256)
        if self.args['use_url']:
            u_rep = self.url_embedding(urls) # (n_news, emb_dim)
            u_rep = self.dropout(u_rep)
            u_rep = F.relu(self.url_fc1(u_rep))
            u_rep = self.url_fc2(u_rep) # (n_news, news_dim)
            # t_rep = torch.cat((t_rep, u_rep), dim = -1)
            t_rep = torch.stack((t_rep, u_rep), dim = -2)
        if self.args['use_cate']:
            c_rep = self.cate_embedding(cate) # (n_news, 2, emb_dim)
            c_rep = self.drop(c_rep)
            c_rep = F.relu(self.cate_fc1(c_rep))
            c_rep = self.cate_fc2(c_rep) # (n_news, 2, news_dim)
            # t_rep = torch.cat((t_rep, c_rep.reshape(c_rep.shape[0], -1)), dim = -1)
            t_rep = torch.cat((t_rep, c_rep), dim = -2)
        if self.args['use_url'] or self.args['use_cate']:
            t_rep = self.aggr_attn(t_rep)
        
        return t_rep # (n_news, 256)

class AttnUserEncoder(nn.Module):
    def __init__(self, n_head, news_dim, query_dim):
        super().__init__()
        self.self_attn = MultiHeadSelfAttention(news_dim, n_head, 16, 16)
        self.addi_attn = AttentionPooling(news_dim, query_dim)
    
    def forward(self, h): # (n_batch, n_news, 256)
        u = self.self_attn(h, h, h) # (n_batch, n_news, 256)
        u = self.addi_attn(u) # (n_batch, 256)
        return u

In [108]:
class NRMSProvider(nn.Module):
    def __init__(self, args, word_emb, cate_emb, n_urls):
        super().__init__()
        n_head, query_dim, news_dim, url_dim = 16, 200, 256, 100
        self.news_encoder = AttnNewsEncoder(word_emb, cate_emb, n_urls, url_dim, news_dim)
        self.user_encoder = AttnUserEncoder(n_head, news_dim, query_dim)
    
    def forward(self, hist, samp, hist_urls, samp_urls):
        n_batch, n_news, n_sequence = hist.shape
        n_samp = samp.shape[1] # k + 1
        
        hist = hist.reshape(n_batch * n_news, n_sequence)
        hist_urls = hist_urls.reshape(n_batch * n_news)
        h = self.news_encoder(hist, hist_urls) # (n_batch*n_news, n_filter)
        h = h.reshape(n_batch, n_news, -1)  # (n_batch, n_news, n_filter)
        u = self.user_encoder(h) # (n_batch, n_filter)
        
        samp = samp.reshape(n_batch * n_samp, n_sequence)
        samp_urls = samp_urls.reshape(n_batch * n_samp)
        r = self.news_encoder(samp, samp_urls) # (n_batch*(k+1), n_filter)
        r = r.reshape(n_batch, n_samp, -1) # (n_batch, k + 1, n_filter)
        
        y = torch.bmm(r, u.unsqueeze(2)) # (n_batch, K + 1, 1)
        return y.squeeze(2)

In [112]:
# baseline
args = {'use_cate': False,
        'use_url': False,}
print(args)
model = NRMSProvider(args, word_emb, cate_emb, len(urlid_dict)).to(device)
train_and_eval_provider(model, url_train_dataset, dev_dataset, news_info, news_urls, epochs = 10)

{'use_cate': False, 'use_url': False}
[epoch 1] loss: 1.4438, AUC: 0.626256, MRR: 0.287423, nDCG5:0.313649, nDCG10: 0.379230
[epoch 2] loss: 1.3633, AUC: 0.644016, MRR: 0.294473, nDCG5:0.324997, nDCG10: 0.390357
[epoch 3] loss: 1.3351, AUC: 0.647987, MRR: 0.306748, nDCG5:0.333062, nDCG10: 0.399120
[epoch 4] loss: 1.3170, AUC: 0.657707, MRR: 0.304572, nDCG5:0.335827, nDCG10: 0.400480
[epoch 5] loss: 1.3028, AUC: 0.649721, MRR: 0.303463, nDCG5:0.332081, nDCG10: 0.398349
[epoch 6] loss: 1.2898, AUC: 0.661174, MRR: 0.312188, nDCG5:0.344431, nDCG10: 0.407761
[epoch 7] loss: 1.2792, AUC: 0.660339, MRR: 0.311623, nDCG5:0.344018, nDCG10: 0.406630
[epoch 8] loss: 1.2695, AUC: 0.652947, MRR: 0.305997, nDCG5:0.338246, nDCG10: 0.401636
[epoch 9] loss: 1.2600, AUC: 0.653206, MRR: 0.307962, nDCG5:0.338948, nDCG10: 0.402673
[epoch 10] loss: 1.2516, AUC: 0.658162, MRR: 0.308586, nDCG5:0.340811, nDCG10: 0.404065


In [None]:
# emb+dense，拼起来
class AttnNewsEncoder(nn.Module):
    def __init__(self, word_emb, cate_emb, n_urls, url_dim, news_dim):
        super().__init__()
        self.args = args
        self.word_embedding = nn.Embedding.from_pretrained(word_emb)
        emb_dim = word_emb.shape[1]
        self.self_attn = MultiHeadSelfAttention(emb_dim, 16, 16, 16)
        self.addi_attn = AttentionPooling(news_dim, 200)
        self.dropout = nn.Dropout(0.2)
        if args['use_url']:
            self.url_embedding = nn.Embedding(n_urls, url_dim)
            self.url_fc1 = nn.Linear(url_dim, 200)
            self.url_fc2 = nn.Linear(200, news_dim)
        if args['use_cate']:
            self.cate_embedding = nn.Embedding(n_urls, url_dim)
            self.cate_fc1 = nn.Linear(cate_emb.shape[1], 200)
            self.cate_fc2 = nn.Linear(200, news_dim)
        self.aggr_attn = AttentionPooling(news_dim, 200)
    
    def forward(self, news, urls = None):
        title, body, cate, subcate = news[:, :max_title], news[:, max_title: -2], news[:, -2], news[:, -1]
        
        t_rep = self.word_embedding(title) # (n_batch, n_seq, emb_dim)
        t_rep = self.dropout(t_rep)
        t_rep = self.self_attn(t_rep, t_rep, t_rep) # (n_batch, n_seq, 256)
        t_rep = self.addi_attn(t_rep) # (n_batch, 256)
        if self.args['use_url']:
            u_rep = self.url_embedding(urls) # (n_news, emb_dim)
            u_rep = self.dropout(u_rep)
            u_rep = F.relu(self.url_fc1(u_rep))
            u_rep = self.url_fc2(u_rep) # (n_news, news_dim)
            # t_rep = torch.cat((t_rep, u_rep), dim = -1)
            t_rep = torch.stack((t_rep, u_rep), dim = -2)
        if self.args['use_cate']:
            c_rep = self.cate_embedding(cate) # (n_news, 2, emb_dim)
            c_rep = self.drop(c_rep)
            c_rep = F.relu(self.cate_fc1(c_rep))
            c_rep = self.cate_fc2(c_rep) # (n_news, 2, news_dim)
            # t_rep = torch.cat((t_rep, c_rep.reshape(c_rep.shape[0], -1)), dim = -1)
            t_rep = torch.cat((t_rep, c_rep), dim = -2)
        if self.args['use_url'] or self.args['use_cate']:
            t_rep = self.aggr_attn(t_rep)
        
        return t_rep # (n_news, 256)

In [111]:
args = {'use_cate': False,
        'use_url': True,}
print(args)
model = NRMSProvider(args, word_emb, cate_emb, len(urlid_dict)).to(device)
train_and_eval_provider(model, url_train_dataset, dev_dataset, news_info, news_urls, epochs = 10)

{'use_cate': False, 'use_url': True}
[epoch 1] loss: 1.4431, AUC: 0.629754, MRR: 0.285048, nDCG5:0.311885, nDCG10: 0.378244
[epoch 2] loss: 1.3589, AUC: 0.635085, MRR: 0.290121, nDCG5:0.318390, nDCG10: 0.383206
[epoch 3] loss: 1.3306, AUC: 0.643643, MRR: 0.293269, nDCG5:0.322231, nDCG10: 0.388091
[epoch 4] loss: 1.3123, AUC: 0.646461, MRR: 0.299256, nDCG5:0.329119, nDCG10: 0.393342
[epoch 5] loss: 1.2970, AUC: 0.645105, MRR: 0.300852, nDCG5:0.330124, nDCG10: 0.394462
[epoch 6] loss: 1.2847, AUC: 0.650236, MRR: 0.296123, nDCG5:0.327296, nDCG10: 0.392557
[epoch 7] loss: 1.2728, AUC: 0.646378, MRR: 0.299704, nDCG5:0.329079, nDCG10: 0.393694
[epoch 8] loss: 1.2626, AUC: 0.647762, MRR: 0.301701, nDCG5:0.333167, nDCG10: 0.396765
[epoch 9] loss: 1.2534, AUC: 0.651204, MRR: 0.299765, nDCG5:0.332111, nDCG10: 0.395607
[epoch 10] loss: 1.2435, AUC: 0.641509, MRR: 0.296343, nDCG5:0.325201, nDCG10: 0.389937


In [114]:
# emb+dense，拼起来过 dense
class AttnNewsEncoder(nn.Module):
    def __init__(self, word_emb, cate_emb, n_urls, url_dim, news_dim):
        super().__init__()
        self.args = args
        self.word_embedding = nn.Embedding.from_pretrained(word_emb)
        emb_dim = word_emb.shape[1]
        self.self_attn = MultiHeadSelfAttention(emb_dim, 16, 16, 16)
        self.addi_attn = AttentionPooling(news_dim, 200)
        self.dropout = nn.Dropout(0.2)
        if args['use_url']:
            self.url_embedding = nn.Embedding(n_urls, url_dim)
            self.url_fc1 = nn.Linear(url_dim, 200)
            self.url_fc2 = nn.Linear(200, 100)
        self.aggr_fc = nn.Linear(news_dim + 100, news_dim)
    
    def forward(self, news, urls = None):
        title, body, cate, subcate = news[:, :max_title], news[:, max_title: -2], news[:, -2], news[:, -1]
        
        t_rep = self.word_embedding(title) # (n_batch, n_seq, emb_dim)
        t_rep = self.dropout(t_rep)
        t_rep = self.self_attn(t_rep, t_rep, t_rep) # (n_batch, n_seq, 256)
        t_rep = self.addi_attn(t_rep) # (n_batch, 256)
        if self.args['use_url']:
            u_rep = self.url_embedding(urls) # (n_news, emb_dim)
            u_rep = self.dropout(u_rep)
            u_rep = F.relu(self.url_fc1(u_rep))
            u_rep = self.url_fc2(u_rep) # (n_news, news_dim)
            t_rep = torch.cat((t_rep, u_rep), dim = -1)
        t_rep = self.aggr_fc(t_rep)
        
        return t_rep # (n_news, 256)

In [115]:
args = {'use_cate': False,
        'use_url': True,}
print(args)
model = NRMSProvider(args, word_emb, cate_emb, len(urlid_dict)).to(device)
train_and_eval_provider(model, url_train_dataset, dev_dataset, news_info, news_urls, epochs = 10)

{'use_cate': False, 'use_url': True}
[epoch 1] loss: 1.4463, AUC: 0.613157, MRR: 0.271900, nDCG5:0.295055, nDCG10: 0.361679
[epoch 2] loss: 1.3671, AUC: 0.628813, MRR: 0.289894, nDCG5:0.315237, nDCG10: 0.380686
[epoch 3] loss: 1.3380, AUC: 0.644556, MRR: 0.295952, nDCG5:0.326661, nDCG10: 0.390277
[epoch 4] loss: 1.3186, AUC: 0.632399, MRR: 0.296454, nDCG5:0.323740, nDCG10: 0.387720
[epoch 5] loss: 1.3039, AUC: 0.638446, MRR: 0.293820, nDCG5:0.319958, nDCG10: 0.386343
[epoch 6] loss: 1.2897, AUC: 0.642365, MRR: 0.302987, nDCG5:0.330598, nDCG10: 0.393458
[epoch 7] loss: 1.2773, AUC: 0.643093, MRR: 0.297057, nDCG5:0.325500, nDCG10: 0.389809
[epoch 8] loss: 1.2659, AUC: 0.639924, MRR: 0.295067, nDCG5:0.323120, nDCG10: 0.389134
[epoch 9] loss: 1.2545, AUC: 0.642639, MRR: 0.302348, nDCG5:0.329903, nDCG10: 0.393644
[epoch 10] loss: 1.2437, AUC: 0.645298, MRR: 0.297470, nDCG5:0.326656, nDCG10: 0.390972


In [185]:
# transed_url，dense
args = {'use_cate': False,
        'use_url': True,}
print(args)
model = NRMSProvider(args, word_emb, cate_emb, len(urlid_dict)).to(device)
train_and_eval_provider(model, url_train_dataset, dev_dataset, news_info, news_urls, epochs = 10)

{'use_cate': False, 'use_url': True}
[epoch 1] loss: 1.4527, AUC: 0.621859, MRR: 0.283489, nDCG5:0.307231, nDCG10: 0.374937
[epoch 2] loss: 1.3707, AUC: 0.635717, MRR: 0.290229, nDCG5:0.316730, nDCG10: 0.384467
[epoch 3] loss: 1.3418, AUC: 0.651020, MRR: 0.305289, nDCG5:0.337063, nDCG10: 0.400292
[epoch 4] loss: 1.3234, AUC: 0.646394, MRR: 0.301809, nDCG5:0.331351, nDCG10: 0.396631
[epoch 5] loss: 1.3072, AUC: 0.656247, MRR: 0.309307, nDCG5:0.342439, nDCG10: 0.405441
[epoch 6] loss: 1.2942, AUC: 0.654781, MRR: 0.305495, nDCG5:0.337555, nDCG10: 0.401396
[epoch 7] loss: 1.2829, AUC: 0.654060, MRR: 0.307587, nDCG5:0.338681, nDCG10: 0.402192
[epoch 8] loss: 1.2699, AUC: 0.657118, MRR: 0.309369, nDCG5:0.341139, nDCG10: 0.405270
[epoch 9] loss: 1.2602, AUC: 0.654281, MRR: 0.308069, nDCG5:0.338826, nDCG10: 0.402803
[epoch 10] loss: 1.2490, AUC: 0.658029, MRR: 0.313175, nDCG5:0.345284, nDCG10: 0.407919


In [190]:
# emb+dense，attn
class AttnNewsEncoder(nn.Module):
    def __init__(self, word_emb, cate_emb, n_urls, url_dim, news_dim):
        super().__init__()
        self.args = args
        self.word_embedding = nn.Embedding.from_pretrained(word_emb)
        emb_dim = word_emb.shape[1]
        self.self_attn = MultiHeadSelfAttention(emb_dim, 16, 16, 16)
        self.addi_attn = AttentionPooling(news_dim, 200)
        self.dropout = nn.Dropout(0.2)
        if args['use_url']:
            self.url_embedding = nn.Embedding(n_urls, url_dim)
            self.url_fc1 = nn.Linear(url_dim, 200)
            self.url_fc2 = nn.Linear(200, news_dim)
        if args['use_cate']:
            self.cate_embedding = nn.Embedding(n_urls, url_dim)
            self.cate_fc1 = nn.Linear(cate_emb.shape[1], 200)
            self.cate_fc2 = nn.Linear(200, news_dim)
        self.aggr_attn = AttentionPooling(news_dim, 200)
    
    def forward(self, news, urls = None):
        title, body, cate, subcate = news[:, :max_title], news[:, max_title: -2], news[:, -2], news[:, -1]
        
        t_rep = self.word_embedding(title) # (n_batch, n_seq, emb_dim)
        t_rep = self.dropout(t_rep)
        t_rep = self.self_attn(t_rep, t_rep, t_rep) # (n_batch, n_seq, 256)
        t_rep = self.addi_attn(t_rep) # (n_batch, 256)
        if self.args['use_url']:
            u_rep = self.url_embedding(urls) # (n_news, emb_dim)
            u_rep = self.dropout(u_rep)
            u_rep = F.relu(self.url_fc1(u_rep))
            u_rep = self.url_fc2(u_rep) # (n_news, news_dim)
            # t_rep = torch.cat((t_rep, u_rep), dim = -1)
            t_rep = torch.stack((t_rep, u_rep), dim = -2)
        if self.args['use_cate']:
            c_rep = self.cate_embedding(cate) # (n_news, 2, emb_dim)
            c_rep = self.drop(c_rep)
            c_rep = F.relu(self.cate_fc1(c_rep))
            c_rep = self.cate_fc2(c_rep) # (n_news, 2, news_dim)
            # t_rep = torch.cat((t_rep, c_rep.reshape(c_rep.shape[0], -1)), dim = -1)
            t_rep = torch.cat((t_rep, c_rep), dim = -2)
        if self.args['use_url'] or self.args['use_cate']:
            t_rep = self.aggr_attn(t_rep)
        
        return t_rep # (n_news, 256)

In [191]:
# emb+dense，attn
args = {'use_cate': False,
        'use_url': True,}
print(args)
model = NRMSProvider(args, word_emb, cate_emb, len(urlid_dict)).to(device)
train_and_eval_provider(model, url_train_dataset, dev_dataset, news_info, news_urls, epochs = 10)

{'use_cate': False, 'use_url': True}
[epoch 1] loss: 1.4477, AUC: 0.6189, MRR: 0.2774, nDCG5:0.2987, nDCG10: 0.3673
[epoch 2] loss: 1.3638, AUC: 0.6364, MRR: 0.2875, nDCG5:0.3137, nDCG10: 0.3801
[epoch 3] loss: 1.3333, AUC: 0.6449, MRR: 0.2958, nDCG5:0.3214, nDCG10: 0.3877
[epoch 5] loss: 1.2997, AUC: 0.6547, MRR: 0.3095, nDCG5:0.3369, nDCG10: 0.4008
[epoch 6] loss: 1.2866, AUC: 0.6514, MRR: 0.3049, nDCG5:0.3331, nDCG10: 0.3975
[epoch 7] loss: 1.2754, AUC: 0.6561, MRR: 0.3057, nDCG5:0.3330, nDCG10: 0.3995
[epoch 8] loss: 1.2641, AUC: 0.6628, MRR: 0.3084, nDCG5:0.3408, nDCG10: 0.4051
[epoch 9] loss: 1.2550, AUC: 0.6558, MRR: 0.3031, nDCG5:0.3346, nDCG10: 0.3990
[epoch 10] loss: 1.2451, AUC: 0.6544, MRR: 0.3051, nDCG5:0.3328, nDCG10: 0.3985


In [192]:
# emb+dense，attn
args = {'use_cate': False,
        'use_url': True,}
print(args)
model = NRMSProvider(args, word_emb, cate_emb, len(urlid_dict)).to(device)
train_and_eval_provider(model, url_train_dataset, dev_dataset, news_info, news_urls, epochs = 10)

{'use_cate': False, 'use_url': True}
[epoch 1] loss: 1.4455, AUC: 0.6131, MRR: 0.2763, nDCG5:0.3001, nDCG10: 0.3672
[epoch 2] loss: 1.3645, AUC: 0.6428, MRR: 0.2960, nDCG5:0.3239, nDCG10: 0.3885
[epoch 3] loss: 1.3355, AUC: 0.6396, MRR: 0.2966, nDCG5:0.3244, nDCG10: 0.3893
[epoch 4] loss: 1.3159, AUC: 0.6439, MRR: 0.2982, nDCG5:0.3278, nDCG10: 0.3923
[epoch 5] loss: 1.3007, AUC: 0.6523, MRR: 0.3059, nDCG5:0.3366, nDCG10: 0.4004
[epoch 6] loss: 1.2894, AUC: 0.6540, MRR: 0.3080, nDCG5:0.3396, nDCG10: 0.4029
[epoch 7] loss: 1.2778, AUC: 0.6610, MRR: 0.3111, nDCG5:0.3437, nDCG10: 0.4061
[epoch 8] loss: 1.2682, AUC: 0.6488, MRR: 0.3038, nDCG5:0.3350, nDCG10: 0.3992
[epoch 9] loss: 1.2589, AUC: 0.6532, MRR: 0.3033, nDCG5:0.3343, nDCG10: 0.3995
[epoch 10] loss: 1.2479, AUC: 0.6540, MRR: 0.3061, nDCG5:0.3364, nDCG10: 0.4013
