In [None]:
from Data_loaders import load_batch

In [None]:
# Description: Load tsv file

# Import libraries
import pandas as pd
import random


# Load tsv file
News = pd.read_csv('MINDsmall_train/news.tsv', sep='\t', header=None)
News.columns = ['news_id', 'category', 'subcategory', 'title', 'abstract', 'url', 'title_entities', 'abstract_entities']
News_vali = pd.read_csv('MINDsmall_dev/news.tsv', sep='\t', header=None)
News_vali.columns = ['news_id', 'category', 'subcategory', 'title', 'abstract', 'url', 'title_entities', 'abstract_entities']

News = pd.concat([News, News_vali], ignore_index=True)


UserData = pd.read_csv('MINDsmall_train/behaviors.tsv', sep='\t', header=None)
UserData.columns = ['impression_id', 'user_id', 'time', 'history', 'impressions']

UserData = UserData.dropna()

topic_size = News['category'].nunique()
subtopic_size = News['subcategory'].nunique()

print(f"Data contains {topic_size} topics and {subtopic_size} subtopics")

Data contains 17 topics and 264 subtopics


In [15]:
# Define Vocabulary for users and topics
from torchtext import vocab
from torchtext.data.utils import get_tokenizer
import torch as th
from LSTUR import GloVe

tokenizer = get_tokenizer('basic_english')

User_vocab = vocab.build_vocab_from_iterator([[id] for id in UserData['user_id']])
News_vocab = vocab.build_vocab_from_iterator([[id] for id in  News['news_id']])
Category_vocab = vocab.build_vocab_from_iterator([[Category] for Category in News['category']])
Subcategory_vocab = vocab.build_vocab_from_iterator([[Category] for Category in News['subcategory']])

# Define Vocabulary for title and abstract
max_title_length = max([len(tokenizer(title)) for title in News['title']])
max_history_length = max([len(history.split(" ")) for history in UserData['history']])
max_history_length = 50 # Overwrite

max_impressions_length = max([len(impressions.split(" ")) for impressions in UserData['impressions']])
max_impressions_length = 5 # Overwrite

# Define Datapoint to tensor
def Datapoint_to_Encodings(User):

    History = News_vocab.lookup_indices(User.history.split(" "))
    User_en = User_vocab.__getitem__(User.user_id)
    Impressions = User.impressions.split(" ")
    Impressions,Clicked = map(list, zip(*[Impression.split("-") for Impression in Impressions]))
    
    Positive, Negative = [],[]
    for idx, click in enumerate(Clicked):
        if click == "1":
            Positive.append(Impressions[idx])
        else:
            Negative.append(Impressions[idx])

    Impressions = [Positive[0]]

   

    if len(Negative) > 3:
        for _ in random.sample(Negative,4):
            Impressions.append(_)
    else:
        for _ in range(4):
            Impressions.append(random.choice(Negative))

    Clicked = [1,0,0,0,0]

    # Shuffle
    shuffled_index = [0,1,2,3,4]
    random.shuffle(shuffled_index)


    Impressions = [Impressions[i] for i in shuffled_index]
    Clicked = [Clicked[i] for i in shuffled_index]


    # Convert to tensor
    Impressions = News_vocab.lookup_indices(Impressions)
    History, User_en, Impressions, Clicked = map(th.tensor, [History, User_en, Impressions, Clicked])

    return History, User_en, Impressions, Clicked

# Pack Title
def pack_Title(title,max_length):

    src_len, _ = title.size()

    title_reformated = th.zeros(max_length,300)

    title_reformated[:src_len,:] = title

    return title_reformated, src_len


# Get Numeric Artikles representation
def get_Article_Encodings(Artikle):


    title = GloVe.get_vecs_by_tokens(tokenizer(Artikle['title']))
    
    #Abstract = [tokenizer(abstract) for abstract in Artikle['abstract']]
    Category = Category_vocab.__getitem__(Artikle['category'])
    Subcategory = Subcategory_vocab.__getitem__(Artikle['subcategory'])

    title, title_len = pack_Title(title,max_title_length)

    Category, Subcategory, title_len = map(th.tensor, [Category, Subcategory, title_len])

    

    return Category, Subcategory, title, title_len

# Store all News in Dictionary for faster access
News_tensors = {}

for i in range(len(News)):
    News_tensors[News_vocab.__getitem__(News['news_id'][i])] = get_Article_Encodings(News.loc[i])

# Get Numeric User representation
def Datapoint_to_tensor(User):

    History, User_en, Impressions, Clicked = Datapoint_to_Encodings(User)

    History_tensor = th.zeros(max_history_length,max_title_length,300)
    Category = th.zeros(max_history_length)
    Subcategory = th.zeros(max_history_length)
    history_len = min(len(History),max_history_length)

    for idx,article in enumerate(History[-history_len:]):
        Category[idx], Subcategory[idx], History_tensor[idx], _ = News_tensors[article.item()]

    Impressions_tensor = th.zeros(max_impressions_length,max_title_length,300)
    Category_Impressions = th.zeros(max_impressions_length)
    Subcategory_Impressions = th.zeros(max_impressions_length)
    Impressions_len = len(Impressions)

    history_len, Impressions_len = map(th.tensor, [history_len, Impressions_len])


    for idx,article in enumerate(Impressions):
        Category_Impressions[idx], Subcategory_Impressions[idx], Impressions_tensor[idx], _ = News_tensors[article.item()]
    
    Clicked = Clicked.argmax()

    return User_en, Category, Subcategory, History_tensor, history_len, Category_Impressions, Subcategory_Impressions, Impressions_tensor, Impressions_len, Clicked


# Def load batch
def load_batch(User, batch_size, device='cpu'):
    
        # User = User.sample(frac=1).reset_index(drop=True)
    
        for i in range(0, len(User), batch_size):
    
            User_batch = User[i:i+batch_size]
    
            User_en = []
            Category = []
            Subcategory = []
            History_tensor = []
            history_len = []
            Category_Impressions = []
            Subcategory_Impressions = []
            Impressions_tensor = []
            Impressions_len = []
            Clicked = []
    
            for i in range(len(User_batch)):
                User_en_, Category_, Subcategory_, History_tensor_, history_len_, Category_Impressions_, Subcategory_Impressions_, Impressions_tensor_, Impressions_len_, Clicked_ = Datapoint_to_tensor(User_batch.iloc[i])
                User_en.append(User_en_)
                Category.append(Category_)
                Subcategory.append(Subcategory_)
                History_tensor.append(History_tensor_)
                history_len.append(history_len_)
                Category_Impressions.append(Category_Impressions_)
                Subcategory_Impressions.append(Subcategory_Impressions_)
                Impressions_tensor.append(Impressions_tensor_)
                Impressions_len.append(Impressions_len_)
                Clicked.append(Clicked_)
    
            User_en, Category, Subcategory, History_tensor, history_len, Category_Impressions, Subcategory_Impressions, Impressions_tensor, Impressions_len, Clicked = map(th.stack, [User_en, Category, Subcategory, History_tensor, history_len, Category_Impressions, Subcategory_Impressions, Impressions_tensor, Impressions_len, Clicked])
            User_en, Category, Subcategory, history_len, Category_Impressions, Subcategory_Impressions, Impressions_len, Clicked = map(lambda x: x.long(), [User_en, Category, Subcategory, history_len, Category_Impressions, Subcategory_Impressions, Impressions_len, Clicked])
            yield User_en.to(device), Category.to(device), Subcategory.to(device), History_tensor.to(device), history_len.to(device), Category_Impressions.to(device), Subcategory_Impressions.to(device), Impressions_tensor.to(device), Impressions_len.to(device), Clicked.to(device)

            #yield User_en, Category, Subcategory, History_tensor, history_len, Category_Impressions, Subcategory_Impressions, Impressions_tensor, Impressions_len, Clicked



  from .autonotebook import tqdm as notebook_tqdm


Predicted tensor([3, 2, 4, 2, 1, 0, 3, 1, 4, 0], device='cuda:0') with loss 12.576431274414062
Predicted tensor([3, 2, 4, 2, 1, 0, 2, 1, 4, 0], device='cuda:0') with loss 9.68658447265625
Predicted tensor([1, 2, 4, 2, 1, 2, 1, 1, 3, 0], device='cuda:0') with loss 8.20469856262207
Predicted tensor([4, 2, 4, 4, 1, 2, 3, 1, 3, 0], device='cuda:0') with loss 7.97101354598999
Predicted tensor([0, 0, 4, 0, 1, 0, 3, 1, 3, 0], device='cuda:0') with loss 5.2576422691345215
Predicted tensor([1, 2, 4, 0, 2, 0, 3, 1, 3, 0], device='cuda:0') with loss 3.297945499420166
Predicted tensor([1, 0, 4, 0, 2, 0, 3, 0, 1, 0], device='cuda:0') with loss 1.1414740085601807
Predicted tensor([1, 0, 4, 0, 2, 0, 3, 2, 1, 0], device='cuda:0') with loss 1.9333064556121826
Predicted tensor([1, 0, 4, 0, 2, 0, 3, 0, 1, 2], device='cuda:0') with loss 0.02471764385700226
Predicted tensor([1, 0, 4, 0, 2, 0, 3, 0, 1, 2], device='cuda:0') with loss 0.025607779622077942
Predicted tensor([0, 0, 4, 0, 2, 0, 3, 0, 2, 0], devic

In [16]:
History_lengths = th.tensor([len(history.split(" ")) for history in UserData['history']])
Impressions_length = th.tensor([len(impressions.split(" ")) for impressions in UserData['impressions']])
UserData['History Length'] = History_lengths
UserData['Impressions Length'] = Impressions_length

User_low = UserData[UserData['History Length'] < 51].copy()

User_low = User_low.reset_index()


In [None]:
# Load Model
from LSTUR import LSTUR_con
from torch import nn,optim
device = "cuda"

LSTUR_con_module = LSTUR_con(
    seq_len = max_history_length,
    user_dim=300,
    user_size=User_vocab.__len__(),
    topic_size=Category_vocab.__len__(),
    topic_dim=100,
    subtopic_size=Subcategory_vocab.__len__(),
    subtopic_dim=100,
    word_dim=300,
    device=device
)

loss_fn = nn.CrossEntropyLoss()
optimizer = optim.Adam(LSTUR_con_module.parameters(), lr=0.001)




In [24]:
BatchSize = 100

model = LSTUR_con_module.to(device)
loss_fn = nn.CrossEntropyLoss()
optimizer = optim.Adam(model.parameters(), lr=0.001)


for epoch in range(5):

    BatchLoader = load_batch(UserData, batch_size=BatchSize,device=device)

    for _ in range(50):
        User_en, Category, Subcategory, History_tensor, history_len, Category_Impressions, Subcategory_Impressions, Impressions_tensor, Impressions_len, Clicked = BatchLoader.__next__()

        optimizer.zero_grad()

        output = model(User_en, Category, Subcategory, History_tensor, history_len, Category_Impressions, Subcategory_Impressions, Impressions_tensor)

        loss = loss_fn(output, Clicked)
        loss.backward()
        optimizer.step()

        print(loss)


tensor([ 9, 50, 16, 10,  4, 36, 35,  4, 19, 28, 50, 50, 50, 50, 50, 39, 25, 10,
         5, 40, 50, 50, 50, 50, 12, 20, 38, 20, 12, 50,  3, 10, 18, 50, 26, 50,
         9, 14, 50,  3, 32, 50, 19,  6,  9, 12, 22, 17, 50, 35, 27, 11, 38, 11,
        26, 19, 40, 50, 13, 31, 18, 50, 13,  2,  9, 40, 37, 31, 50, 15, 30, 25,
         9, 45, 15, 11, 50,  8,  4, 50, 14, 22, 12,  8,  4, 20, 41, 43,  1, 46,
        18,  5, 50,  3, 39, 20, 50, 33, 20,  3], device='cuda:0')
tensor(5.7061, device='cuda:0', grad_fn=<NllLossBackward0>)


In [25]:
th.save(model, "lstur.pt")

In [None]:
# Load Model
from LSTUR import LSTUR_con
from torch import nn,optim

LSTUR_con_module = LSTUR_con(
    seq_len = max_history_length,
    user_dim=300,
    user_size=User_vocab.__len__(),
    topic_size=Category_vocab.__len__(),
    topic_dim=100,
    subtopic_size=Subcategory_vocab.__len__(),
    subtopic_dim=100,
    word_dim=300
)

loss_fn = nn.CrossEntropyLoss()
optimizer = optim.Adam(LSTUR_con_module.parameters(), lr=0.001)

my_scripted_model = th.jit.script(LSTUR_con_module).to('mps')

for i in range(5):
    out = my_scripted_model(
        User_en.long(),
        Category.long(),
        Subcategory.long(),
        History_tensor,
        history_len.long(),
        Category_Impressions.long(),
        Subcategory_Impressions.long(),
        Impressions_tensor, 
    )

    loss = loss_fn(out, Clicked.long())

    optimizer.zero_grad()
    loss.backward()
    optimizer.step()

    print(loss)
    print(out.argmax(dim=1))
    print("")


NotImplementedError: The following operation failed in the TorchScript interpreter.
Traceback of TorchScript (most recent call last):
RuntimeError: The following operation failed in the TorchScript interpreter.
Traceback of TorchScript (most recent call last):
  File "<string>", line 47, in <forward op>
            p1m = (1. - p) * float(train)
            scale = 1. / (float(p1m == 0.) + p1m)
            res,mask = torch.native_dropout(input, p, train)
                       ~~~~~~~~~~~~~~~~~~~~ <--- HERE

            def backward(grad_output):
RuntimeError: The operator 'aten::native_dropout' is not currently implemented for the MPS device. If you want this op to be added in priority during the prototype phase of this feature, please comment on https://github.com/pytorch/pytorch/issues/77764. As a temporary fix, you can set the environment variable `PYTORCH_ENABLE_MPS_FALLBACK=1` to use the CPU as a fallback for this op. WARNING: this will be slower than running natively on MPS.



In [None]:
Clicked

tensor([3, 4, 4, 1, 2, 0, 0, 1, 3, 0])

In [None]:
test = [1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12]
len(test)

test[-3:]


[10, 11, 12]

In [None]:
for i in range(10):
    out = LSTUR_con_module(
        User_en.long(),
        Category.long(),
        Subcategory.long(),
        History_tensor,
        history_len.long(),
        Category_Impressions.long(),
        Subcategory_Impressions.long(),
        Impressions_tensor, 
    )

    loss = loss_fn(out, Clicked.long())

    optimizer.zero_grad()
    loss.backward()
    optimizer.step()

    print(loss)
    print(out.argmax(dim=1))
    print("")


tensor(0.0099, grad_fn=<NllLossBackward0>)
tensor([ 5, 27,  5, 14,  0])

tensor(0.0023, grad_fn=<NllLossBackward0>)
tensor([ 5, 27,  5, 14,  0])

tensor(0.0022, grad_fn=<NllLossBackward0>)
tensor([ 5, 27,  5, 14,  0])

tensor(0.0008, grad_fn=<NllLossBackward0>)
tensor([ 5, 27,  5, 14,  0])

tensor(0.0262, grad_fn=<NllLossBackward0>)
tensor([ 5, 27,  5, 14,  0])

tensor(0.1536, grad_fn=<NllLossBackward0>)
tensor([ 5, 27,  5, 14,  0])

tensor(0.0006, grad_fn=<NllLossBackward0>)
tensor([ 5, 27,  5, 14,  0])

tensor(0.0050, grad_fn=<NllLossBackward0>)
tensor([ 5, 27,  5, 14,  0])

tensor(0.0052, grad_fn=<NllLossBackward0>)
tensor([ 5, 27,  5, 14,  0])

tensor(0.0159, grad_fn=<NllLossBackward0>)
tensor([ 5, 27,  5, 14,  0])



In [None]:
Clicked

tensor([ 5, 27,  5, 14,  0])

In [None]:
from LSTUR import NewsEncoder, TitleEncoder, TopicEncoder, GloVe
import torch as th




topic_embedding_dim = 100
subtopic_embedding_dim = 100
word_size = 100


NewsEncoderModule = NewsEncoder(100, 100, topic_size, subtopic_size, 10000)
TitleEncoderModule = TitleEncoder(100)
TopicEncoderModule = TopicEncoder(100, 100, topic_size, subtopic_size)

topic = th.tensor([0],dtype=th.int32)
subtopic = th.tensor([0],dtype=th.int32)

title = tokenizer(News['title'][0])
W = GloVe.get_vecs_by_tokens(title)

  from .autonotebook import tqdm as notebook_tqdm


In [None]:
# Encode categories and subcategories:
categories = News['category'].unique()
subcategories = News['subcategory'].unique()

test = [i for i in range(len(categories))]
test2 = [i for i in range(len(subcategories))]

# Add encodings to dict
res = {}
for cat in categories:
    for value in test:
        res[cat] = value
        test.remove(value)
        break


res2 = {}
for subcat in subcategories:
    for value in test2:
        res2[subcat] = value
        test2.remove(value)
        break


# Replace column with encodings:
C = th.tensor(News['category'].replace(res))
SC = th.tensor(News['subcategory'].replace(res2))

print(C.shape)
print(SC.shape)




torch.Size([51282])
torch.Size([51282])


['the',
 'brands',
 'queen',
 'elizabeth',
 ',',
 'prince',
 'charles',
 ',',
 'and',
 'prince',
 'philip',
 'swear',
 'by']

In [None]:
# Get titles lengths
import numpy as np
from torch.nn.utils.rnn import pad_sequence

lengths = th.tensor([len(tokenizer(News['title'][i])) for i in range(News['title'].size)])


# Pad titles
maxlength = max(lengths)
padlengths = (maxlength-lengths).tolist()
padtitles = [tokenizer(News["title"][i] + " <Pad>"*padlengths[i]) for i in range(len(padlengths))]



# Embed titles
Article_embedding = th.zeros(len(padtitles),maxlength,300)
i = 0
for t in padtitles:
    Article_embedding[i] = GloVe.get_vecs_by_tokens(t)
    i+=1




In [None]:
# Dictionary of articles from their ID's
newsid = News['news_id']
article_dict = {newsid[i]:Article_embedding[i] for i in range(len(padtitles))}

print(len(article_dict))
print(article_dict["N55189"].shape)


51282
torch.Size([64, 300])


In [None]:
# User ID's of each impression log:
users = User['user_id'].unique()

test2 = [i for i in range(len(users))]


# Add encodings to dict
userid_dict = {}
for user in users:
    for value in test2:
        userid_dict[user] = value
        test2.remove(value)
        break



# Replace column with encodings:
U = th.tensor(User['user_id'].replace(userid_dict))
U.shape

KeyboardInterrupt: 

In [None]:
# For each impression log: all topics, subtopics and titles embedded and encoded in tensors
# First pad topic/subtopic sequences
N = len(User['history'])


# th.zeros()
seq_lengths = th.zeros(N)
for i in range(N):
    if isinstance(User['history'][i], str):
        seq_lengths[i] = len(tokenizer(User['history'][i]))
seq_lengths

# [item.upper() for item in test]

# seq_lengths = [(len(tokenizer(User['history'][i]))) for i in range(len(User['history']))]

# Find max sequence length
max_seq_length = max(seq_lengths)


pad_seq_lengths = ((max_seq_length-seq_lengths).to(th.int32)).tolist()

X = [item.upper() for item in User["history"][0]]
# print(tokenizer(User["history"][0]) + ["Hpad>>"]*2)



# NOT WORKING YET: NEED to encode history first
pad_hists = th.zeors(N,max_seq_length)
for i in range(N):
pad_hists = [th.tensor(tokenizer(User["history"][0]) + ["<HPad>"]*pad_seq_lengths[i]) ]

pad_hists



['n55189', 'n42782', 'n34694', 'n45794', 'n18445', 'n63302', 'n10414', 'n19347', 'n31801', 'Hpad>>', 'Hpad>>']


ValueError: too many dimensions 'str'

In [None]:
NewsEncoderModule(topic, subtopic, W).size()

In [None]:
tokenizer(User["history"][0])

['n55189',
 'n42782',
 'n34694',
 'n45794',
 'n18445',
 'n63302',
 'n10414',
 'n19347',
 'n31801']

In [None]:
(max_seq_length-seq_lengths).to(th.int32)

tensor([549, 476, 542,  ..., 535, 500, 556], dtype=torch.int32)