In [24]:
import tensorflow as tf

from TestData.MindDependencies.MindIt import MINDIterator
from TestData.MindDependencies.Utils import get_mind_data_set, validate_model


from tqdm import tqdm
import pickle as pkl

from General.Utils import ValidateModel
from DataIterator import NewsDataset
from torch.utils.data import DataLoader


import torch as th
import numpy as np
import yaml

# Import Hparam
# The YAML config is parsed with safe_load; later cells read hparams['model'].
with open('Data/MINDdemo_utils/lstur.yaml','r') as yaml_file:
    hparams = yaml.safe_load(yaml_file)

# Import word_vec
# Load the word-embedding matrix and cast it to float32 in a single expression.
word_embedding = np.load('Data/MINDdemo_utils/embedding_all.npy').astype(np.float32)


In [26]:
# Define Device
# Prefer CUDA, then Apple's MPS backend, and finally fall back to the CPU.
# MPS must be probed explicitly: selecting 'mps' on a machine without it makes
# every later `.to(device)` call fail.
if th.cuda.is_available():
    device = 'cuda'
elif getattr(th.backends, 'mps', None) is not None and th.backends.mps.is_available():
    device = 'mps'
else:
    device = 'cpu'

# Define Data, Dataset and DataLoaders
# Training split plus the vocabulary / user-index dictionaries.
train_behaviors_file = 'Data/MINDdemo_train/behaviors.tsv'
train_news_file = 'Data/MINDdemo_train/news.tsv'
word_dict_file = 'Data/MINDdemo_utils/word_dict_all.pkl'
user_dict_file = 'Data/MINDdemo_utils/uid2index.pkl'

# Validation split.
valid_behaviors_file = 'Data/MINDdemo_dev/behaviors.tsv'
valid_news_file = 'Data/MINDdemo_dev/news.tsv'

In [27]:
import pickle

# Load the word -> index and user-id -> index dictionaries.
# NOTE(review): pickle.load can execute arbitrary code; only use it on trusted files.
# NOTE(review): this reads word_dict.pkl, whereas word_dict_file points at
# word_dict_all.pkl — confirm the difference is intentional.
with open("Data/MINDdemo_utils/word_dict.pkl", "rb") as word_dict_handle:
    word_dict = pickle.load(word_dict_handle)
with open("Data/MINDdemo_utils/uid2index.pkl", "rb") as uid2index_handle:
    uid2index = pickle.load(uid2index_handle)

from dataclasses import dataclass

@dataclass
class HyperParams:
    """Settings bundle handed to ``MINDIterator`` in this notebook.

    NOTE(review): the per-field meanings below are inferred from the field
    names only; confirm them against the ``MINDIterator`` implementation.
    """
    batch_size: int  # presumably the number of samples per yielded batch — TODO confirm
    title_size: int  # presumably the fixed number of word ids kept per title — TODO confirm
    his_size: int  # presumably the number of clicked-history items kept per user — TODO confirm
    wordDict_file: str  # path to a pickled word dictionary
    userDict_file: str  # path to a pickled user dictionary

# Settings shared by the training and validation iterators.
hparamsdata = HyperParams(
    wordDict_file=word_dict_file,
    userDict_file=user_dict_file,
    batch_size=32,
    title_size=20,
    his_size=50,
)

# The training iterator is built with npratio=4; the validation iterator keeps
# the default. (presumably npratio is the negatives-per-positive ratio — TODO confirm)
train_iterator = MINDIterator(hparamsdata, npratio=4)
test_iterator = MINDIterator(hparamsdata)

# Batch sources for the training and validation splits.
batch_loader_train = train_iterator.load_data_from_file(
    train_news_file,
    train_behaviors_file,
)
batch_loader_valid = test_iterator.load_data_from_file(
    valid_news_file,
    valid_behaviors_file,
)

In [28]:
from TestData.LSTURMind import LSTURini


# Set Model Architecture
# All architecture values come from the 'model' section of the YAML config.
model_hparams = hparams['model']

LSTUR_con_module = LSTURini(
    attention_dim=model_hparams['attention_hidden_dim'],
    word_emb_dim=model_hparams['word_emb_dim'],
    dropout=model_hparams['dropout'],
    filter_num=model_hparams['filter_num'],
    windows_size=model_hparams['window_size'],
    gru_unit=model_hparams['gru_unit'],
    # One slot more than the number of known users
    # (presumably reserved for unknown/padding users — TODO confirm).
    user_size=len(train_iterator.uid2index) + 1,
    word_vectors=th.from_numpy(word_embedding).to(device),
    device=device,
)

# Place the module's parameters on the selected device.
model = LSTUR_con_module.to(device)

# Define Optimizer
optimizer = th.optim.Adam(model.parameters(), lr=0.0001)

# Training criterion used by the training loop.
loss_fn = th.nn.CrossEntropyLoss()

# Define Loss
# def loss_fn(Scores,n_positive):
#     n = Scores.shape[0]

#     loss = 0
#     for i in range(n):
#         loss += -th.log(th.exp(Scores[i,:n_positive[i],0])/th.exp(Scores[i,:n_positive[i],:]).sum(dim=1)).sum()

#     return loss/n

def loss_fn_vali(Scores,labels):
    """Negative log-likelihood of the positively labelled scores.

    Computes ``-log(exp(sum of positive scores) / sum(exp(all scores)))``,
    rewritten as ``logsumexp(all scores) - sum of positive scores``. The two
    forms are mathematically identical, but this one does not overflow to
    inf/NaN when scores are large.

    Args:
        Scores: tensor of scores; every element enters the normaliser.
        labels: tensor broadcast-compatible with ``Scores`` for boolean
            indexing; entries equal to 1 mark the positive scores.

    Returns:
        A 0-dimensional tensor holding the loss.
    """
    # log(sum(exp(Scores))) over every element, evaluated stably.
    log_normaliser = th.logsumexp(Scores.reshape(-1), dim=0)

    # The log of exp(sum of positives) is simply the sum of positives.
    positive_total = Scores[labels == 1].sum()

    return log_normaliser - positive_total

  self.word_embedding = nn.Embedding.from_pretrained(th.tensor(word_vectors,dtype=th.float32), freeze=False, padding_idx=0)


In [29]:
# Metrics on the validation split before the training loop runs; they seed the
# per-epoch metric histories.
Pre_training = validate_model(
    model,
    valid_news_file,
    valid_behaviors_file,
    test_iterator,
    device,
    metrics=['group_auc', 'mean_mrr', 'ndcg@5;10'],
)

586it [00:06, 85.49it/s]
236it [02:30,  1.57it/s]
7538it [00:38, 193.99it/s]


In [38]:
def batch_to_tensor(batch, device):
    """Turn one iterator batch of NumPy arrays into tensors on ``device``.

    Args:
        batch: mapping holding NumPy arrays under the keys
            'user_index_batch', 'clicked_title_batch',
            'candidate_title_batch' and 'labels'.
        device: target device accepted by ``Tensor.to``.

    Returns:
        Tuple ``(user_id, history_title, impressions_title, labels)``;
        ``user_id`` is flattened to one dimension, the others keep the
        shape of their source arrays.
    """
    def _as_device_tensor(key):
        # Wrap the array as a tensor, then move it to the target device.
        return th.from_numpy(batch[key]).to(device)

    return (
        _as_device_tensor('user_index_batch').flatten(),
        _as_device_tensor('clicked_title_batch'),
        _as_device_tensor('candidate_title_batch'),
        _as_device_tensor('labels'),
    )

In [42]:
# Train the model
# Metric histories: index 0 holds the pre-training value, and one entry is
# appended after every epoch.
AUC = [Pre_training['group_auc']]
MRR = [Pre_training['mean_mrr']]
NDCG5 = [Pre_training['ndcg@5']]
NDCG10 = [Pre_training['ndcg@10']]


for epoch in range(1):

    # The evaluation step at the end of each epoch switches the model to eval
    # mode, so training mode is re-enabled once per epoch.
    model.train()

    # The training file is re-read every epoch to obtain a fresh pass over the batches.
    for batch in tqdm(train_iterator.load_data_from_file(train_news_file, train_behaviors_file)):

        user_id, history_title, impressions_title, labels = batch_to_tensor(batch,device)

        optimizer.zero_grad()

        Scores = model(user_id, history_title, impressions_title)

        loss = loss_fn(Scores,labels)

        loss.backward()

        optimizer.step()

    # Evaluate in eval mode and without gradient tracking.
    model.eval()
    with th.no_grad():
        result = validate_model(model, valid_news_file, valid_behaviors_file, test_iterator, device, metrics=['group_auc', 'mean_mrr', 'ndcg@5;10'])

    AUC.append(result['group_auc'])
    MRR.append(result['mean_mrr'])
    NDCG5.append(result['ndcg@5'])
    # The key must match the one read from Pre_training above ('ndcg@10');
    # 'ndcg@510' is not a key validate_model returns.
    NDCG10.append(result['ndcg@10'])

    print(result)

443it [19:29,  2.60s/it]