# Librerías

In [1]:
import json


import torch.nn as nn
import torch


from torch.utils.data import Dataset, DataLoader



from transformers import AutoTokenizer, DataCollatorWithPadding

# Carga dataset

In [2]:
with open('DependencyDataset\ConvAI2\convai_rel_complete.json','r') as f:
    data = json.load(f)

# Funciones y clases auxiliares

In [5]:
def create_vocabulary(dataset,encoding):
    if encoding == 'relative':
        vocab0 = set()
        vocab1 = set()

        for item in list(dataset.values()):
            for dep_label in item[encoding]:
                dep_label_split = dep_label.split('_')
                vocab0.add(dep_label_split[0])
                vocab1.add(dep_label_split[1])

    return (vocab0,vocab1)

In [13]:
def tokenize_dependency_labels(dataset,vocabs,encoding):
    token_data = {}
    if encoding == 'relative':
        for index, item in enumerate(dataset):
            vocab0 , vocab1 = vocabs
            aux = [x.split('_') for x in dataset[item][encoding]]
            token_data[index] = {#'text':item,
                        'tag0':[aux[i][0] for i in range(len(aux))],
                        'tag1':[aux[i][1] for i in range(len(aux))]}

    return token_data


In [19]:
# Pytorch dataset

class MyDependencyDataSet(Dataset):
    def __init__(self,data,word_to_indx):
        self.data = data
        self.word_to_indx0 , self.word_to_indx1 = word_to_indx


    def __getitem__(self,index):
        if torch.is_tensor(index):
            index = index.tolist()
        
        x = self.data[index]

        tag0 = torch.tensor([self.word_to_indx0[tag] for tag in x['tag0']])

        tag1 = torch.tensor([self.word_to_indx1[tag] for tag in x['tag1']])

        sample = {'tag0': tag0,
                'tag1': tag1}

        return  sample


    def __len__(self):
        return len(self.data)


# Creación vocabularios y conversión a índices

Para el relative encoding creo dos vocabularios: Uno con la parte que codifica la posición de la cabeza y otro con la parte que codifica el tag de dependencia.

In [6]:
vocab0, vocab1 = create_vocabulary(data,'relative')

In [10]:
print('Tamaño de cada vocabulario: \n')
print(f'Vocabulario posición: {len(vocab0)} elementos')
print(f'Vocabulario dependency tag: {len(vocab1)} elementos')


Tamaño de cada vocabulario: 

Vocabulario posición: 79 elementos
Vocabulario dependency tag: 49 elementos


Debo crear los diccionarios que me permitan mapear cada palabra a un índice

In [11]:
word_to_indx0 = {word: i for i,word in enumerate(vocab0)}
word_to_indx1 = {word: i for i,word in enumerate(vocab1)}

Creo un diccionario con todos los dependency tags convertidos a índices

In [14]:
tokenized_data = tokenize_dependency_labels(data,(vocab0,vocab1),'relative')

In [17]:
print(tokenized_data[0])

{'tag0': ['1', '-2', '-1', '3', '2', '1', '-5', '-1', '-2', '1', '-2', '1', '-2', '-1', '-13'], 'tag1': ['amod', 'root', 'punct', 'nsubj', 'aux', 'advmod', 'parataxis', 'compound:prt', 'obj', 'nsubj', 'acl:relcl', 'mark', 'xcomp', 'advmod', 'punct']}


# Creación dataset y dataloader

Lo hago ya con dataset y dataloader porque para el modelo global tendré que usarlo (?)

In [20]:
pytorch_tokenized_data = MyDependencyDataSet(tokenized_data,(word_to_indx0,word_to_indx1))

In [21]:
pytorch_tokenized_data[0]

{'tag0': tensor([28, 44, 29, 71, 19, 28, 12, 29, 44, 28, 44, 28, 44, 29, 66]),
 'tag1': tensor([11,  4, 23, 14, 24,  0, 15, 43, 38, 14, 46,  8,  3,  0, 23])}

In [22]:
tokenizer = AutoTokenizer.from_pretrained("distilbert-base-uncased")
data_collator = DataCollatorWithPadding(tokenizer=tokenizer)

In [23]:
dl = DataLoader(pytorch_tokenized_data,batch_size=16,shuffle=True,collate_fn=data_collator)

# Modelo LSTM

In [39]:
class LSTM_enc(torch.nn.Module):
    def __init__(self,embedding_dim,hidden_dim,vocab_sizes):
        super(LSTM_enc,self).__init__()
        self.hidden_dim = hidden_dim

        self.word_emb0 = nn.Embedding(vocab_sizes[0],embedding_dim)
        self.word_emb1 = nn.Embedding(vocab_sizes[1],embedding_dim)

        self.lstm = nn.LSTM(embedding_dim * 2,hidden_dim)

    def forward(self,tag0,tag1):
        emb0 = self.word_emb0(tag0)
        emb1 = self.word_emb1(tag1)

        embeds = torch.cat((emb0,emb1),1)
        
        lstm_out = self.lstm(embeds)

        return lstm_out

In [33]:
hidden_dim = 128
embedding_dim = 100
vocab_sizes = (len(vocab0),len(vocab1))


In [40]:
lstm_model = LSTM_enc(embedding_dim,hidden_dim,vocab_sizes)

In [36]:
dl.dataset[0]

{'tag0': tensor([28, 44, 29, 71, 19, 28, 12, 29, 44, 28, 44, 28, 44, 29, 66]),
 'tag1': tensor([11,  4, 23, 14, 24,  0, 15, 43, 38, 14, 46,  8,  3,  0, 23])}

In [46]:
output , (hidden_state, cell_state) = lstm_model(**pytorch_tokenized_data[0])

In [49]:
hidden_state.shape

torch.Size([1, 128])