In [31]:
from tqdm import tqdm
import pickle as pkl

from General.Utils import ValidateModel
from DataIteratorTransformer import NewsDataset

from DataIterator import NewsDataset as NewsDatasetOld
from torch.utils.data import DataLoader


from TestData.MindDependencies.Metrics import cal_metric


import torch as th
import numpy as np
import yaml

# Load the model hyper-parameters from the LSTUR YAML config.
# yaml.safe_load only constructs plain Python objects from the file.
with open('Data/MINDdemo_utils/lstur.yaml', 'r') as stream:
    hparams = yaml.safe_load(stream)

# Load the word-embedding matrix a single time and cast it to float32;
# the result is later passed to the news encoder as `word_vectors`.
word_embedding = np.load('Data/MINDdemo_utils/embedding_all.npy').astype(np.float32)


# %%
# Pick the compute device: 'cuda' when CUDA is available, 'cpu' otherwise.
if th.cuda.is_available():
    device = 'cuda'
else:
    device = 'cpu'

# Training split files.
train_behaviors_file = 'Data/MINDdemo_train/behaviors.tsv'
train_news_file = 'Data/MINDdemo_train/news.tsv'

# Validation split files.
valid_behaviors_file = 'Data/MINDdemo_dev/behaviors.tsv'
valid_news_file = 'Data/MINDdemo_dev/news.tsv'

# Lookup-table pickles shared by both splits.
word_dict_file = 'Data/MINDdemo_utils/word_dict_all.pkl'
user_dict_file = 'Data/MINDdemo_utils/uid2index.pkl'

# %%
import pickle

# NOTE(review): this reads word_dict.pkl, whereas `word_dict_file` points at
# word_dict_all.pkl -- confirm which vocabulary file is the intended one.
# pickle.load can execute code embedded in the file; only load trusted data.
with open("Data/MINDdemo_utils/word_dict.pkl", "rb") as word_dict_fh:
    word_dict = pickle.load(word_dict_fh)

# User-id lookup table; passed to the datasets and used to size the user vocabulary.
with open("Data/MINDdemo_utils/uid2index.pkl", "rb") as uid_fh:
    uid2index = pickle.load(uid_fh)

from dataclasses import dataclass

@dataclass
class HyperParams:
    """Container for the data-pipeline settings used by this notebook."""
    # Mini-batch size; forwarded to the training DataLoader.
    batch_size: int
    # Presumably the number of tokens kept per news title -- TODO confirm;
    # it is not read by any cell visible here.
    title_size: int
    # History length; used for the transformer's his_size and the history key mask.
    his_size: int
    # Path to the word-dictionary pickle.
    wordDict_file: str
    # Path to the user-id index pickle.
    userDict_file: str

# Data-pipeline settings. `his_size` is the single source of truth for the
# history truncation length used by the training datasets below.
hparamsdata = HyperParams(
    batch_size=32,
    title_size=20,
    his_size=50,
    wordDict_file=word_dict_file,
    userDict_file=user_dict_file,
)

# Training set built with the DataIteratorTransformer dataset class.
TrainData = NewsDataset(
    train_behaviors_file,
    train_news_file,
    word_dict_file,
    userid_dict=uid2index,
    max_history_length=hparamsdata.his_size,
)

# Training set built with the older DataIterator dataset class; this is the
# one that is wrapped by the DataLoader.
LSTURData = NewsDatasetOld(
    train_behaviors_file,
    train_news_file,
    word_dict_file,
    userid_dict=uid2index,
    max_history_length=hparamsdata.his_size,
    train=True,
)

# Validation set.
# NOTE(review): no max_history_length is passed here, so the dataset's own
# default applies -- confirm that is intended.
# NOTE(review): the variable name `TestData` is also the name of the package
# imported from at the top of the notebook.
TestData = NewsDataset(
    valid_behaviors_file,
    valid_news_file,
    word_dict_file,
    userid_dict=uid2index,
)

Max history length:  351
Max impressions length:  299


A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  self.user_data['history_length'][self.user_data['history_length'] > 50] = 50


Max history length:  351
Max impressions length:  295


In [32]:
# Batch the old-iterator training set. Shuffling is off, so batches come out
# in dataset order; num_workers=0 keeps loading in the main process.
TrainDataLoader = DataLoader(
    dataset=LSTURData,
    batch_size=hparamsdata.batch_size,
    shuffle=False,
    num_workers=0,
)

In [33]:
from TestData.LSTURMind import NewsEncoder

# Every encoder size comes from the 'model' section of the YAML hyper-parameters.
model_hparams = hparams['model']

newsencoder = NewsEncoder(
    attention_dim=model_hparams['attention_hidden_dim'],
    word_emb_dim=model_hparams['word_emb_dim'],
    dropout=model_hparams['dropout'],
    filter_num=model_hparams['filter_num'],
    # The keyword is 'windows_size' while the YAML key is 'window_size'.
    windows_size=model_hparams['window_size'],
    gru_unit=model_hparams['gru_unit'],
    word_vectors=word_embedding,
    device=device,
)




#%%
# Import Model
from Models.Transformer import lstransformer

# Candidate size handed to the model. It has its own name so that re-running
# this cell cannot overwrite the `impressions_length` tensor that is unpacked
# from a data-loader batch further down.
candidate_size = 50

TransformerModule = lstransformer(
    his_size=hparamsdata.his_size,
    candidate_size=candidate_size,
    # d_model reuses the news encoder's gru_unit setting.
    d_model=hparams['model']['gru_unit'],
    ffdim=800,
    nhead=1,
    num_layers=3,
    newsencoder=newsencoder,
    # One more than the number of known users; presumably reserves an extra
    # index (e.g. for unknown users) -- TODO confirm against lstransformer.
    user_vocab_size=len(uid2index) + 1,
    device=device,
    dropout=0.2,
)

# Move to device
model = TransformerModule.to(device)

In [34]:
def get_mask_key(batch_size, data_length, actual_length):
    """Build a boolean mask of shape (batch_size, data_length).

    For row ``i`` every position from index ``actual_length[i]`` to the end
    is ``True``; all earlier positions are ``False``.

    Args:
        batch_size: number of rows in the mask.
        data_length: number of positions per row.
        actual_length: indexable collection with, per row, the index at
            which the ``True`` region starts.

    Returns:
        A ``torch.bool`` tensor created by ``th.zeros`` (no explicit device).
    """
    padding = th.zeros(batch_size, data_length, dtype=th.bool)

    for row in range(batch_size):
        first_masked = actual_length[row]
        padding[row, first_masked:] = True

    return padding

def get_mask(batch_size, data_length, actual_length):
    """Build a float mask of shape (batch_size, data_length, data_length).

    For batch entry ``i`` every column from index ``actual_length[i]`` to the
    end is set to ``-inf`` in all rows of that entry; everything else is 0.

    Args:
        batch_size: number of square masks to build.
        data_length: side length of each square mask.
        actual_length: indexable collection with, per batch entry, the column
            index at which the ``-inf`` region starts.

    Returns:
        A float tensor created by ``th.zeros`` (no explicit device or dtype).
    """
    blocked = float('-inf')
    additive = th.zeros(batch_size, data_length, data_length)

    for row in range(batch_size):
        additive[row, :, actual_length[row]:] = blocked

    return additive

In [36]:
# Pull the first batch from the training loader and unpack its nine fields.
(
    user_id,
    history_title,
    history_abstract,
    history_length,
    impressions_title,
    impressions_abstract,
    impressions_length,
    labels,
    n_positive,
) = next(iter(TrainDataLoader))

In [40]:
# Boolean mask over the history positions: True from each sample's
# history_length onwards.
src_key_mask = get_mask_key(
    batch_size=user_id.shape[0],
    data_length=hparamsdata.his_size,
    actual_length=history_length,
)

In [41]:
# Forward pass with only the history key mask supplied; the other three
# mask arguments are passed as None.
# NOTE(review): argument roles are inferred from the names used in this
# notebook -- confirm against the lstransformer forward signature.
score = model(
    user_id,
    history_title,
    None,
    src_key_mask,
    impressions_title,
    None,
    None,
)

In [43]:
# Inspect the shape of the unmasked forward-pass output.
score.shape

torch.Size([32, 5, 1])

In [14]:
# Show the scores at index 0 along the first dimension of `score`.
score[0]

tensor([[0.0056],
        [0.0033],
        [0.0033],
        [0.0018],
        [0.0028],
        [0.0026],
        [0.0013],
        [0.0045],
        [0.0028],
        [0.0025],
        [0.0025],
        [0.0027],
        [0.0058],
        [0.0050],
        [0.0039],
        [0.0020],
        [0.0043],
        [0.0038],
        [0.0023],
        [0.0024],
        [0.0017],
        [0.0024],
        [0.0025],
        [0.0023],
        [0.0026],
        [0.0027],
        [0.0025],
        [0.0033],
        [0.0029],
        [0.0044],
        [0.0027],
        [0.0024],
        [0.0027],
        [0.0034],
        [0.0047],
        [0.0029],
        [0.0049],
        [0.0030],
        [0.0030],
        [0.0040],
        [0.0018],
        [0.0035],
        [0.0024],
        [0.0029],
        [0.0055],
        [0.0048],
        [0.0092],
        [0.0038],
        [0.0015],
        [0.0062],
        [0.0022],
        [0.0027],
        [0.0023],
        [0.0056],
        [0.0049],
        [0

In [25]:
# Take the batch size from the batch itself rather than hard-coding 32, so the
# mask helpers never index past the end of the length tensors when the loader
# yields a smaller batch.
cur_batch_size = user_id.shape[0]
# History length of the masks.
# NOTE(review): must stay equal to hparamsdata.his_size -- keep in sync.
cur_history_size = 50
# Number of candidate slots in this batch.
cur_candidate_size = impressions_title.shape[1]

# Float (-inf / 0) masks and boolean key masks for history and impressions.
src_mask = get_mask(cur_batch_size, cur_history_size, history_length)
tgt_mask = get_mask(cur_batch_size, cur_candidate_size, impressions_length)
his_mask = get_mask_key(cur_batch_size, cur_history_size, history_length)
impressions_mask = get_mask_key(cur_batch_size, cur_candidate_size, impressions_length)

# Forward pass with all four masks supplied.
score_mask = model(
    user_id,
    history_title,
    src_mask,
    his_mask,
    impressions_title,
    tgt_mask,
    impressions_mask,
)

In [27]:
# Show the impressions lengths unpacked from the current batch.
impressions_length

tensor([ 87,   7,   3,  70,   2,  34, 128,  11,  93,  56,  49,   9,  16,  26,
         23,  37,  31,  23,  69,  19,  26, 120,   2,  34,  38,  35,  87,   4,
         13,  21,  21, 124])

In [28]:
# Show the masked forward-pass scores at index 1 along the first dimension.
score_mask[1]

tensor([[0.0038],
        [0.0038],
        [0.0044],
        [0.0034],
        [0.0026],
        [0.0031],
        [0.0061],
        [0.0047],
        [0.0044],
        [0.0055],
        [0.0032],
        [0.0033],
        [0.0033],
        [0.0048],
        [0.0020],
        [0.0045],
        [0.0017],
        [0.0052],
        [0.0028],
        [0.0032],
        [0.0028],
        [0.0020],
        [0.0044],
        [0.0038],
        [0.0037],
        [0.0037],
        [0.0032],
        [0.0028],
        [0.0033],
        [0.0032],
        [0.0021],
        [0.0035],
        [0.0026],
        [0.0021],
        [0.0033],
        [0.0019],
        [0.0026],
        [0.0027],
        [0.0014],
        [0.0026],
        [0.0047],
        [0.0045],
        [0.0051],
        [0.0025],
        [0.0058],
        [0.0018],
        [0.0034],
        [0.0017],
        [0.0023],
        [0.0033],
        [0.0039],
        [0.0019],
        [0.0022],
        [0.0014],
        [0.0023],
        [0

In [23]:
# Show the masked forward-pass scores at index 0 along the first dimension.
score_mask[0]

tensor([[0.0023],
        [0.0055],
        [0.0030],
        [0.0020],
        [0.0026],
        [0.0023],
        [0.0021],
        [0.0033],
        [0.0027],
        [0.0027],
        [0.0040],
        [0.0027],
        [0.0026],
        [0.0014],
        [0.0046],
        [0.0044],
        [0.0028],
        [0.0040],
        [0.0028],
        [0.0022],
        [0.0026],
        [0.0026],
        [0.0026],
        [0.0049],
        [0.0023],
        [0.0018],
        [0.0024],
        [0.0023],
        [0.0037],
        [0.0032],
        [0.0039],
        [0.0029],
        [0.0016],
        [0.0032],
        [0.0032],
        [0.0035],
        [0.0048],
        [0.0048],
        [0.0032],
        [0.0019],
        [0.0042],
        [0.0030],
        [0.0025],
        [0.0019],
        [0.0026],
        [0.0029],
        [0.0020],
        [0.0019],
        [0.0035],
        [0.0043],
        [0.0036],
        [0.0022],
        [0.0019],
        [0.0031],
        [0.0019],
        [0

In [10]:
# NOTE(review): `mask` is not assigned in any cell visible here, so this line
# relies on leftover kernel state and would raise NameError on a fresh
# Restart & Run All -- define it explicitly or remove this cell.
mask.shape

torch.Size([32, 50, 50])

In [46]:

# Turn the training flag off before running the two forward passes.
model.train(False)

# NOTE(review): these calls pass five positional arguments, while the earlier
# cells call model(...) with seven -- confirm against the current forward
# signature.
eval_inputs = (user_id, history_title, his_mask, impressions_title, impressions_mask)

# The same inputs are run twice and stored under two names.
score_mask = model(*eval_inputs)
score = model(*eval_inputs)