<a href="https://colab.research.google.com/github/Neilus03/NLP-2023/blob/main/Neural_language_model.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [None]:
from tqdm import tqdm
import numpy as np

import torch
import torch.nn as nn
import torch.optim as optim

import nltk
# Download the 'cess_cat' corpus package into the local nltk_data directory
# so that it can later be imported from nltk.corpus.
nltk.download('cess_cat')

[nltk_data] Downloading package cess_cat to /root/nltk_data...
[nltk_data]   Unzipping corpora/cess_cat.zip.


True

In [None]:
from nltk.corpus import cess_cat as corpus

# Tokens that are filtered out of every sentence.
words_to_remove = ['*0*', '-Fpa-', '-Fpt-']

# Flat token stream for the whole corpus: every sentence loses its last token
# and is wrapped between the '<s>' and '</s>' boundary markers.
words = []
# To debug or train quickly, iterate over corpus.sents()[:1000] instead.
for sentence in tqdm(corpus.sents()):
    delimited = ['<s>', *sentence[:-1], '</s>']
    words += [token for token in delimited if token not in words_to_remove]


100%|██████████| 17104/17104 [00:17<00:00, 1005.01it/s]


In [None]:
from torch.utils.data import Dataset, DataLoader

class FixedWindow(Dataset):
    """Sliding fixed-length windows over a token stream for next-word prediction.

    Item ``idx`` is the pair ``(first_ids, last_id)``: the ids of the
    ``length_window - 1`` consecutive words starting at position ``idx`` and
    the id of the word that immediately follows them.

    Attributes:
        length_window: total window size (context words plus one target word).
        vocabulary: sorted list of the unique words found in ``words``.
        id_vocabulary: dict mapping id -> word.
        word_to_id: dict mapping word -> id.
        word_ids: id of every word of ``words``, in the original order.
    """

    def __init__(self, words, length_window):
        """Build the vocabulary and encode ``words`` as a list of integer ids.

        Args:
            words: sequence of tokens (strings) forming the training text.
            length_window: window size, context plus target; must be >= 1.

        Raises:
            ValueError: if ``length_window`` is smaller than 1.
        """
        super().__init__()
        if length_window < 1:
            raise ValueError(
                'length_window must be >= 1, got {}'.format(length_window))
        self.length_window = length_window

        # Sort the unique words so that ids are the same on every run. The
        # iteration order of a set of strings changes between interpreter
        # sessions (string hash randomisation), which would silently scramble
        # the word <-> id mapping of a model reloaded after a restart.
        self.vocabulary = sorted(set(words))

        # id -> word
        self.id_vocabulary = dict(enumerate(self.vocabulary))

        # word -> id (reverse mapping)
        self.word_to_id = {word: i for i, word in self.id_vocabulary.items()}

        # One id per word in 'words'
        self.word_ids = [self.word_to_id[word] for word in words]

    def __len__(self):
        """Return the number of complete windows that fit in the text.

        A text of ``n`` words holds ``n - length_window + 1`` windows; the
        result is clamped to 0 when the text is shorter than one window.
        """
        return max(0, len(self.word_ids) - self.length_window + 1)

    def __getitem__(self, idx):
        """Return the window starting at position ``idx``.

        Args:
            idx: window index in ``[0, len(self))``; negative values count
                from the end, as for a list.

        Returns:
            A pair of ``torch.long`` tensors ``(first_ids, last_id)`` where
            ``first_ids`` holds the ids of the ``length_window - 1`` words
            starting at ``idx`` and ``last_id`` is the id at position
            ``idx + length_window - 1``, right after ``first_ids``.

        Raises:
            IndexError: if ``idx`` is outside the valid range.
        """
        n_windows = len(self)
        if idx < 0:
            idx += n_windows
        if not 0 <= idx < n_windows:
            # Without this check a slice near the end would silently return a
            # context shorter than length_window - 1.
            raise IndexError('window index out of range')

        target_pos = idx + self.length_window - 1
        first_ids = torch.as_tensor(self.word_ids[idx:target_pos], dtype=torch.long)
        last_id = torch.as_tensor(self.word_ids[target_pos], dtype=torch.long)

        return first_ids, last_id



In [None]:

# Window of 5 words: 4 context words plus the word to predict.
length_window = 5
dataset = FixedWindow(words, length_window)

# Fetch a single (context, target) pair as a quick sanity check.
x, y = dataset[10]
#print('x = {}, y = {}'.format(x, y))

batch_size = 1000 # 5 to debug
dataloader = DataLoader(dataset, batch_size=batch_size, shuffle=True) # shuffle=False to debug

# Show the first four batches, both as raw ids and decoded back into words.
for nbatch, (X, y) in enumerate(dataloader):
    print('batch {}'.format(nbatch))
    print('X = {}'.format(X))
    print('y = {}'.format(y))
    for context_ids, target_id in zip(X.numpy(), y.numpy()):
        context_words = [dataset.id_vocabulary[w] for w in context_ids]
        print(context_words, end=' ')
        print(dataset.id_vocabulary[target_id])
    if nbatch == 3:
        break


batch 0
X = tensor([[33979, 34125, 33609, 35288],
        [12555,  7850, 11575, 22920],
        [29616, 20266, 31151, 21447],
        ...,
        [10646, 18101, 18629, 38830],
        [21316, 19737,  7739, 23538],
        [ 2497, 19004, 17332, 29270]])
y = tensor([24591, 38591,  2653, 33609, 24591, 22920, 26767, 21743, 26833,  5492,
         7342,  2497, 16069, 21743, 25711,  7047, 19065,  8886,  2653, 29367,
        18101, 20484, 30894, 34298, 33609,  5417, 12891, 39653,  1870, 29367,
        33609, 39084,  8688, 20266, 30894, 33609,  4746, 30894, 10478, 23538,
        25673, 32371, 33609, 20266, 20872,  8688,   917, 35787, 12555,  5023,
        14490, 12555,  2714, 17950, 17632,  2868, 29047, 29075, 17632, 33979,
        20266, 26233,  1504, 18301, 33609, 29367,  7275, 33609, 35237,  3964,
        18132, 21743, 24015,  8287, 26655, 38034, 29047, 33609, 16196, 17632,
        24591, 31970, 26384,  2497,  2653, 25673,  9591, 27139, 33609, 20824,
         6454, 30978, 33609, 25673, 3041

In [None]:
class NNLM(nn.Module):
    """Neural language model over a fixed window of context word ids.

    The context embeddings are concatenated into one vector ``x`` and the
    output scores are

        bias + hidden3(x) + hidden2(tanh(ones + hidden1(x)))

    i.e. a direct linear path plus a tanh hidden path. ``ones`` and ``bias``
    are learnable offset vectors initialised to 1; the three linear layers
    carry no bias of their own.
    """

    def __init__(self, num_classes, dim_input, dim_hidden, dim_embedding):
        """Create the layers.

        Args:
            num_classes: vocabulary size (number of output scores).
            dim_input: number of context words fed to the model.
            dim_hidden: width of the tanh hidden layer.
            dim_embedding: size of each word embedding.
        """
        super().__init__()
        self.num_classes, self.dim_input = num_classes, dim_input
        self.dim_hidden, self.dim_embedding = dim_hidden, dim_embedding

        # Size of the concatenated context embeddings.
        dim_concat = dim_input * dim_embedding

        # Look-up table: word id -> embedding vector.
        self.embeddings = nn.Embedding(num_classes, dim_embedding)
        # Hidden path: concat -> hidden (offset 'ones' added before tanh).
        self.hidden1 = nn.Linear(dim_concat, dim_hidden, bias=False)
        self.ones = nn.Parameter(torch.ones(dim_hidden))
        # Hidden path: hidden -> scores.
        self.hidden2 = nn.Linear(dim_hidden, num_classes, bias=False)
        # Direct path: concat -> scores.
        self.hidden3 = nn.Linear(dim_concat, num_classes, bias=False)
        # Output offset shared by both paths.
        self.bias = nn.Parameter(torch.ones(num_classes))

    def forward(self, X):
        """Return unnormalised scores of shape (rows, num_classes) for ids ``X``."""
        # Embed every id, then flatten each window into a single row.
        flat = self.embeddings(X).view(-1, self.dim_input * self.dim_embedding)
        hidden = torch.tanh(self.ones + self.hidden1(flat))
        direct = self.hidden3(flat)
        return self.bias + direct + self.hidden2(hidden)



In [None]:
# --- Run switches and checkpoint location ---
path = 'NNLM.pt'
do_train = True
do_test = True

# --- Model dimensions ---
num_classes = len(dataset.vocabulary)   # one output score per vocabulary word
dim_input = length_window - 1           # context words per window
dim_hidden = 50
dim_embedding = 32

# --- Optimisation settings ---
learning_rate = 1e-3
num_epochs = 60

model = NNLM(
    num_classes=num_classes,
    dim_input=dim_input,
    dim_hidden=dim_hidden,
    dim_embedding=dim_embedding,
)

# Cross-entropy over the raw scores returned by the model, optimised with Adam.
loss_fn = nn.CrossEntropyLoss()
optimizer = optim.Adam(model.parameters(), lr=learning_rate)

In [None]:
# Use the first CUDA GPU when one is visible, otherwise fall back to the CPU.
# (On Colab: Runtime -> Change runtime type -> Hardware accelerator -> GPU.)
device_name = 'cuda:0' if torch.cuda.is_available() else 'cpu'
device = torch.device(device_name)
print(device)  # shows "cuda:0" when a GPU is available
model = model.to(device)

cuda:0


In [None]:
# NOTE(review): device_count is not used in this cell; the import is kept only
# in case another cell relies on the name.
from torch.cuda.random import device_count

# Train the model and save a checkpoint, or restore a previously saved one.
if do_train:
    model.train()  # make sure the model is in training mode
    size = len(dataloader.dataset)
    for epoch in range(num_epochs):
        for batch, (X, y) in enumerate(dataloader):
            X, y = X.to(device), y.to(device)
            pred = model(X)
            loss = loss_fn(pred, y)
            optimizer.zero_grad()
            loss.backward()
            optimizer.step()

            # Report progress every 100 batches.
            if batch % 100 == 0:
                # Keep the float in its own name so the loss tensor is not
                # overwritten.
                loss_value, current = loss.item(), batch * batch_size
                print('Epoch {} loss: {:>7f}  [{:>5d}/{:>5d}]'
                    .format(epoch+1, loss_value, current, size))

    # The weights are only meaningful together with the word <-> id mapping
    # they were trained with, so store the vocabulary next to them.
    torch.save({'model_state_dict': model.state_dict(),
                'vocabulary': dataset.vocabulary}, path)
else:
    # map_location lets a checkpoint written on a GPU be restored on a
    # CPU-only runtime (and vice versa).
    checkpoint = torch.load(path, map_location=device)

    # Checkpoints written before the vocabulary was stored have no such key;
    # for those the check is skipped.
    saved_vocabulary = checkpoint.get('vocabulary')
    if saved_vocabulary is not None and list(saved_vocabulary) != list(dataset.vocabulary):
        raise ValueError(
            'The vocabulary stored in {!r} does not match dataset.vocabulary; '
            'the word ids of this checkpoint would be decoded to the wrong '
            'words.'.format(path))

    model.load_state_dict(checkpoint['model_state_dict'])

Epoch 1 loss: 10.860778  [    0/509115]
Epoch 1 loss: 7.070927  [100000/509115]
Epoch 1 loss: 6.493715  [200000/509115]
Epoch 1 loss: 6.374540  [300000/509115]
Epoch 1 loss: 6.438286  [400000/509115]
Epoch 1 loss: 6.164059  [500000/509115]
Epoch 2 loss: 5.865498  [    0/509115]
Epoch 2 loss: 5.727062  [100000/509115]
Epoch 2 loss: 5.558372  [200000/509115]
Epoch 2 loss: 5.637250  [300000/509115]
Epoch 2 loss: 5.631076  [400000/509115]
Epoch 2 loss: 5.414221  [500000/509115]
Epoch 3 loss: 5.192548  [    0/509115]
Epoch 3 loss: 5.205191  [100000/509115]
Epoch 3 loss: 5.042065  [200000/509115]
Epoch 3 loss: 5.090976  [300000/509115]
Epoch 3 loss: 5.071109  [400000/509115]
Epoch 3 loss: 5.069546  [500000/509115]
Epoch 4 loss: 4.553745  [    0/509115]
Epoch 4 loss: 4.715072  [100000/509115]
Epoch 4 loss: 4.683568  [200000/509115]
Epoch 4 loss: 4.873069  [300000/509115]
Epoch 4 loss: 4.582596  [400000/509115]
Epoch 4 loss: 4.794263  [500000/509115]
Epoch 5 loss: 4.290627  [    0/509115]
Epoc

In [None]:
# Generate text by repeatedly sampling the next word from the model.
if do_test:
    num_sentences = 5      # stop after this many '</s>' markers ...
    max_num_words = 100    # ... or once this many tokens have been produced

    nsent = 0
    generated_words = ['<s>', 'El', 'dia', 'que']
    assert len(generated_words)==dim_input # length_window-1

    # Fail early, with a readable message, if a seed word was never seen.
    unknown_words = [w for w in generated_words if w not in dataset.word_to_id]
    if unknown_words:
        raise ValueError(
            'seed words not in the vocabulary: {}'.format(unknown_words))

    model.eval()
    with torch.no_grad():
        while (nsent < num_sentences) and (len(generated_words) < max_num_words):
            # Ids of the last dim_input generated words form the model input.
            input_ids = [dataset.word_to_id[word] for word in generated_words[-dim_input:]]

            pred = model(torch.tensor(input_ids).unsqueeze(0).to(device))
            # probs holds one probability per id (word) of the vocabulary.
            probs = torch.nn.functional.softmax(pred, dim=1)

            # Sample the next id according to probs instead of taking the
            # argmax: argmax always picks the same continuation for a given
            # context, whereas sampling yields varied text.
            output_id = torch.multinomial(probs, 1).item()

            # Decode the sampled id back into its word.
            output_word = dataset.id_vocabulary[output_id]

            generated_words.append(output_word)
            if output_word == '</s>':
                nsent += 1

    # Turn the token list into readable text.
    generated_text = ' '.join(generated_words)
    generated_text = generated_text.replace(' </s> <s>', '.').replace('<s> ','').replace(' </s>','.')
    # Expand '_' first: an elision at the end of a '_'-joined token is only
    # followed by a space once the underscores have become spaces.
    generated_text = generated_text.replace('_', ' ')
    # Attach elided articles/pronouns to the following word. The capitalised
    # forms cover sentence starts, which the lower-case patterns miss.
    for elided in ("l'", "s'", "d'", "L'", "S'", "D'"):
        generated_text = generated_text.replace(' ' + elided + ' ', ' ' + elided)
    generated_text = generated_text.replace(' , ', ', ')
    print(generated_text)

El dia que no reunirà operadores de les eleccions autonòmiques, es vol càrrec amb una jornada de 40 necessari de fer del català i Izquierda Unida, poden marcar promocions de professionals, en concepte d' ha creat que l'interlocutor per la categoria : generacional la mundial de mà, en dos anys de presó. L' únic conseller delegat de Presidència de la qualitat que s'envolta malalts al segon nord. L' actriu es va produir a la presó Model de les conferències promocionals de notícies. Paral·lelament propietat que la referència es va pronunciar
