# T5 - Juan Luis Baldelomar Cabrera

In [4]:
# os
import random

# NLP and numpy
import nltk 
import numpy as np
import nltk
from nltk.probability import FreqDist
from nltk import TweetTokenizer
from nltk.corpus import stopwords
import pandas as pd

# torch
import torch
from torch.utils.data import DataLoader, TensorDataset
from torch import nn
from torch.nn import functional as F

# metrics
from sklearn.metrics import accuracy_score as accuracy

In [5]:
seed = 1111
random.seed(seed)
np.random.seed(seed)
torch.manual_seed(seed)
torch.backends.cudnn.benchmark = False

# Load Data

In [6]:
def load_data(filename, labels_filename):
    file = open(filename, 'r')
    labels_file = open(labels_filename, 'r')
    tweets = file.read()
    labels = labels_file.read()
    documents = tweets.split('\n')
    labels = labels.split('\n')
    documents.pop(-1)
    labels.pop(-1)
    file.close()
    labels_file.close()
    return documents, labels

In [7]:
documents, labels = load_data('data/mex_train.txt', 'data/mex_train_labels.txt')
val_documents, val_labels = load_data('data/mex_val.txt', 'data/mex_val_labels.txt')

# Vocabulary Utilities

In [8]:
def print_doc(doc:list, end=' ', stop=-1):
    stop = len(doc) if stop is None else stop
    for token in doc[:stop]:
        print(token, end=end)
    print('')

In [9]:
def get_vocabulary(tokenized_docs, n):
    tokens = [token for doc in tokenized_docs for token in doc]
    unique_tokens = FreqDist(tokens).most_common(n)
    return [token for token, _ in unique_tokens]

def word2ids(vocabulary):
    word2id = {}
    id2word = {}
    
    # build both dictionaries
    for i, word in enumerate(vocabulary):
        word2id[word] = i
        id2word[i] = word
    
    # add special tokens
    n = len(word2id)
    word2id['<s>']   = n 
    word2id['</s>']  = n + 1
    word2id['<unk>'] = n + 2
    id2word[n]       = '<s>'
    id2word[n + 1]   = '</s>'
    id2word[n + 2]   = '<unk>'
    
    return word2id, id2word

# NGram Builder Class

### Punctuation to Ignore

In [361]:
punctuation = ['.', '...', ',', '!', '¡', '¿', '?', ';', ';', '"', '|', '[', ']', '°', '(', ')', '*', '+', '/']

In [362]:
class NGramBuilder:
    def __init__(self, tokenizer=None, embeddings=None, d_model=256, sos='<s>', eos='</s>', unk='<unk>', punctuation=punctuation, postprocess=None):
        self.tokenizer = self.default_tokenizer() if tokenizer == None else tokenizer
        self.embeddings = embeddings
        self.d_model = d_model if embeddings is None else embeddings.d_model
        # special symbols
        self.SOS = sos
        self.EOS = eos
        self.UNK = unk
        # vocabulary 2 id and viceversa
        self.word2id  = None
        self.id2word  = None
        self.voc_size = 0
        # post tokenization functions
        self.punctuation = set(punctuation) if punctuation != None else None
        self.postprocess = postprocess if postprocess is not None else lambda x : x
        
    def default_tokenizer(doc):
        return TweetTokenizer().tokenize
    
    def get_vocabulary(self):
        return set(self.word2id.keys())
    
    def remove_punct(self, tokenized_documents):
        if self.punctuation == None:
            return tokenized_documents
        else:
            return [[token for token in doc if token not in self.punctuation] for doc in tokenized_documents]
        
    def get_ids(self, words:list):
        # transform list of words to list of ids
        unk_id = self.word2id.get(self.UNK, 0)
        ids = [self.word2id.get(word, unk_id) for word in words]
        return ids
    
    def _transform(self, tokenized_docs):
        N = self.N
        # docs and labels lists
        ngram_docs, ngram_targs = [], []
        # traverse each doc
        for doc in tokenized_docs:
            # add padding
            doc = [self.SOS]*(N - 1)  + doc + [self.EOS]
            ids = self.get_ids(doc)
            # traverse each word as center and build ngrams
            for i in range(N-1, len(doc)):    
                ngram_docs.append(ids[i-(N-1): i])
                ngram_targs.append(ids[i])
                
        return np.array(ngram_docs), np.array(ngram_targs)
    
    def _tokenize(self, documents):
        tokenized_docs = [self.tokenizer(doc.lower()) for doc in documents]
        tokenized_docs = self.remove_punct(tokenized_docs)
        tokenized_docs = self.postprocess(tokenized_docs)
        return tokenized_docs
    
    def build_emb_matrix(self):
        dim_v = len(self.word2id)
        if self.embeddings is None:
            self.emb_matrix = np.random.rand(dim_v, self.d_model)
        else:
            self.emb_matrix = np.random.rand(dim_v, self.d_model)
            for word in self.word2id.keys():
                if word in self.embeddings:
                    self.emb_matrix = self.embeddings[word]
                
    def fit(self, documents, N, t=10000):
        self.N = N
        # tokenize documents
        tokenized_docs = self._tokenize(documents)
        
        # get vocabulary and word2id and ids2word dicts
        vocabulary = get_vocabulary(tokenized_docs, t-3)
        self.word2id, self.id2word = word2ids(vocabulary)
        self.voc_size = len(self.word2id)
        self.build_emb_matrix()
        
        return self._transform(tokenized_docs)
    
    def transform(self, documents: list[list or str]):
        # list of documents as strings
        if type(documents[0]) is str:
            # tokenize documents
            tokenized_docs = self._tokenize(documents)
            return self._transform(tokenized_docs)
        
        # list of documents as list of tokens
        elif type(documents[0]) is list:
            return self._transform(documents)
        
        print('[ERR]: documents should be list of strings or list of lists of tokens')
        return None
    
    def inverse(self, docs_as_ids):
        # empty list
        if len(docs_as_ids) == 0:
            return None
        
        # multiple docs
        if type(docs_as_ids[0]) is list:
            return [self.id2word.get(tok_id) for doc in docs_as_ids 
                    for tok_id in doc]
        # single doc
        return [self.id2word.get(tok_id) for tok_id in docs_as_ids]

In [363]:
ngram_builder = NGramBuilder()
ngram_docs, ngram_labels = ngram_builder.fit(documents, N=4)
val_ngram_docs, val_ngram_labels = ngram_builder.transform(val_documents)
ngram_builder.emb_matrix.shape

(10000, 256)

In [365]:
doc = ngram_builder.inverse(ngram_labels)
print_doc(doc[:30])

lo peor de todo es que no me dan por un tiempo y luego vuelven estoy hasta la verga de estl </s> a la vga no seas mamón 45 


In [189]:
ngram_docs[:10]

array([[9997, 9997, 9997],
       [9997, 9997,   28],
       [9997,   28,  282],
       [  28,  282,    1],
       [ 282,    1,   59],
       [   1,   59,   17],
       [  59,   17,    0],
       [  17,    0,    6],
       [   0,    6,    7],
       [   6,    7,  315]])

In [190]:
ngram_builder.transform(['hola como estas'])

(array([[9997, 9997, 9997],
        [9997, 9997,  670],
        [9997,  670,   30],
        [ 670,   30,  215]]),
 array([ 670,   30,  215, 9998]))

In [192]:
ngram_builder.transform([['hola', 'como', 'estas']])

(array([[9997, 9997, 9997],
        [9997, 9997,  670],
        [9997,  670,   30],
        [ 670,   30,  215]]),
 array([ 670,   30,  215, 9998]))

## Char NGram

In [54]:
def char_postprocess(documents):
    return [[c for c in word] for doc in documents for word in doc]        

In [55]:
ngram_builder = NGramBuilder(postprocess=char_postprocess)
ngram_docs, ngram_labels = ngram_builder.fit(documents, N=4)
val_ngram_docs, val_ngram_labels = ngram_builder.transform(val_documents)

In [56]:
word = ngram_builder.inverse(ngram_labels)
print_doc(word[:100], end='', stop=-1)

lo</s>peor</s>de</s>todo</s>es</s>que</s>no</s>me</s>dan</s>por</s>un</s>tiempo</s>y</s>luego</s>vuelven</s>estoy</s>hasta</s>la</s>verga</s>de</s>estl</s>a</s>la</s>vg


In [57]:
ngram_docs[:10]

array([[364, 364, 364],
       [364, 364,   9],
       [364,   9,   2],
       [364, 364, 364],
       [364, 364,  13],
       [364,  13,   1],
       [ 13,   1,   2],
       [  1,   2,   4],
       [364, 364, 364],
       [364, 364,  10]])

# Dataset

In [343]:
def get_datasets(ngram_builder, N, train_docs, val_docs, batch_size=64, num_workers=2):
    ngram_docs, ngram_labels = ngram_builder.fit(documents, N=N)
    val_ngram_docs, val_ngram_labels = ngram_builder.transform(val_documents)
    
    train_ds = TensorDataset(torch.tensor(ngram_docs, dtype=torch.int64), torch.tensor(ngram_labels, dtype=torch.int64))
    train_loader = DataLoader(train_ds, shuffle=True, batch_size=batch_size, num_workers=num_workers)

    val_ds = TensorDataset(torch.tensor(val_ngram_docs, dtype=torch.int64), torch.tensor(val_ngram_labels, dtype=torch.int64))
    val_loader = DataLoader(val_ds, shuffle=False, batch_size=batch_size, num_workers=num_workers)
    
    return train_ds, train_loader, val_ds, val_loader

# Neural Language Model

In [58]:
class BengioModel(nn.Module):
    def __init__(self, N, voc_size, d_model, hidden_size=128, emb_mat=None, dropout=0.1):
        
        super(BengioModel, self).__init__()
        # parameters
        self.N           = N
        self.d_model     = d_model
        self.voc_size    = voc_size
        self.hidden_size = hidden_size
        
        # Matriz entrenable de embeddings, tamaño vocab_size x Ngram.d_model
        self.embeddings = nn.Embedding.from_pretrained(torch.FloatTensor(emb_mat), freeze=False)
        
        # fully connected layers
        self.fc1 = nn.Linear(d_model * (N-1), hidden_size)
        self.fc2 = nn.Linear(hidden_size, voc_size)
        
        # dropout
        self.drop = nn.Dropout(dropout)
        
    
    def forward(self, input_seq):
        # Calcula el embedding para cada palabra.
        x = self.embeddings(input_seq)
        x = x.view(-1, (self.N-1) * self.d_model)
        x = self.fc1(x)
        x = self.drop(x)
        x = F.relu(x)
        x = self.fc2(x)
        return x

In [113]:
def get_preds(raw_logit):
    probs = F.softmax(raw_logit.detach(), dim=1)
    y_pred = torch.argmax(probs, dim=1).cpu().numpy()
    return y_pred

In [120]:
def get_probs(raw_logit):
    probs = F.softmax(raw_logit.detach(), dim=1)
    return probs.cpu().numpy()

## Test Model Forward

In [66]:
voc_size = ngram_builder.voc_size
N = ngram_builder.N
d_model = ngram_builder.d_model

model = BengioModel(N=N, voc_size=voc_size, d_model=d_model, emb_mat=ngram_builder.emb_matrix)

In [176]:
inputs, targs = list(train_loader)[0]

In [180]:
get_preds(model(inputs))

array([7311, 1509, 8174, 9097, 5687, 7311, 3779, 1184, 6567,  844,  692,
       2254,  778, 9837, 3779, 5687, 4098, 9837, 3779, 3779, 9837, 3823,
        778, 7311, 1184, 1623, 6604,  692,  692,  692, 5158, 3885, 3885,
       7311, 4908, 1961, 3885,  692,  692, 6122, 4098, 4098,  775, 7311,
        692,  310, 6604,  778,  692, 6604, 9544, 3779, 3779,  692, 7311,
       7311, 3779, 3779, 5687, 9837,  692,  692, 7788,  692])

In [67]:
model(torch.tensor(ngram_builder.transform(['hola como estas'])[0])).shape

torch.Size([4, 10000])

# Model 

In [203]:
def sample(probs):
    acc = np.cumsum(probs)       # build cumulative probability
    val = np.random.uniform()    # get random number between [0, 1]
    pos = np.argmax((val < acc)) # get the index of the word to sample
    return pos

In [310]:
class NGramNeuralModel:
    def __init__(self, NGram: NGramBuilder, neuralModel:nn.Module):
        self.model = neuralModel
        self.NGram = NGram
        self.model.eval()
    
    def predict(self, context:list):
        context = self.NGram.get_ids(context)
        context = torch.tensor([context])
        logits = self.model(context)
        cond_probs = get_probs(logits)
        index = sample(cond_probs)
        return self.NGram.inverse([index])[0]
    
    def estimate_prob(self, sequence:str):
        # feed model and get probs
        ngrams, targets = self.NGram.transform([sequence])
        ngrams = torch.tensor(ngrams)
        logits = self.model(ngrams)
        probs  = get_probs(logits)
        
        # get prob for each context and target
        num_target = [i for i in range(len(targets))]
        cond_probs = probs[num_target, targets]
        log_prob = np.sum(np.log(cond_probs))
        return np.exp(log_prob)
        
            
    def generate_sequence(self):
        sequence = ['<s>']*(self.NGram.N - 1)
        context = [token for token in sequence]
        while sequence[-1] != '</s>':
            word = self.predict(context)
            context.pop(0)
            context.append(word)
            sequence.append(word)
            
        return sequence
    
    def perplexity(self, test_set):
        ngrams, targets = self.NGram.transform(test_set)
        ngrams = torch.tensor(ngrams)
        logits = self.model(ngrams)
        probs = get_probs(logits)
        
        # get cond probs and perplexity
        num_target = [i for i in range(len(targets))]
        cond_probs = probs[num_target, targets]
        log_perp = np.sum(-np.log(cond_probs))     # log(1/cond_probs) = log(1) - log(cond_probs) = -log(cond_probs)
        perp = np.exp(1/len(targets) * log_perp)   # 1/N = 1/len(targets)
        return perp

# Eval

In [184]:
def eval_model(data, model, gpu=False):
    preds, targets = [], []
    with torch.no_grad():
        for inputs, labels in data:
            if gpu:
                # move inputs to gpu
                inputs = inputs.cuda()
            
            # compute output predictions    
            output = model(inputs)
            batch_preds = get_preds(output)
            # append preds and targets
            preds.append(batch_preds)
            targets.append(labels.numpy())
    
    # remove batch dimension
    preds = [p for batch_pred in preds for p in batch_pred]
    targets = [t for batch_tar in targets for t in batch_tar]
    return accuracy(preds, targets)

In [197]:
def checkpoint(state, path, is_best):
    if is_best:
        torch.save(state, path)

In [109]:
load_state = torch.load('best_model')

In [110]:
model.load_state_dict(load_state['model'])

<All keys matched successfully>

In [268]:
model(torch.tensor([2, 4, 5]))

tensor([[ 9.0609,  9.8458,  7.3451,  ..., -1.5828,  4.8862,  7.6487]],
       grad_fn=<AddmmBackward0>)

In [263]:
model.eval()

BengioModel(
  (embeddings): Embedding(10000, 256)
  (fc1): Linear(in_features=768, out_features=128, bias=True)
  (fc2): Linear(in_features=128, out_features=10000, bias=True)
  (drop): Dropout(p=0.1, inplace=False)
)

# Hyperparameters

In [196]:
lr = 2.3e-1 
epochs = 50
patience = epochs//5

lr_patience = 10
lr_factor = 0.5

# gpu available?
use_gpu = torch.cuda.is_available()

# build model and move to gpu if possible
model = BengioModel(N=N, voc_size=voc_size, d_model=d_model, emb_mat=ngram_builder.emb_matrix)
if use_gpu:
    model = model.cuda()
    
optimizer = torch.optim.SGD(model.parameters(), lr=lr)
scheduler = torch.optim.lr_scheduler.ReduceLROnPlateau(
            optimizer,
            'min',
            patience = lr_patience,
            verbose=True,
            factor = lr_factor
        )

criterion = nn.CrossEntropyLoss()

# Train Step

In [203]:
best_metric = 0
last_metric = 0
val_metrics = []
counter = 0

for epoch in range(epochs):
    print('epoch: ', epoch)
    epoch_metrics = []
    for inputs, targets in train_loader:
        if use_gpu:
            inputs = inputs.cuda()
            targets = targets.cuda()
        
        # feed model and get loss
        output = model(inputs)
        loss = criterion(output, targets)
        
        # metric with train dataset
        preds = get_preds(output)
        epoch_metrics.append(accuracy(preds, targets.cpu().numpy()))
            
        # step to optimize 
        optimizer.zero_grad()
        loss.backward()
        optimizer.step()
    
    # get metric for training set
    train_acc = np.mean(epoch_metrics)
    val_acc = eval_model(val_loader, model, use_gpu)
    val_metrics.append(val_acc)
    
    # print metrics
    print('train accuracy mean: ', train_acc)
    print('validation accuracy: ', val_acc)
    
    # patience and last metric update
    counter = counter + 1 if last_metric > best_metric else 0
    best_metric = val_acc if val_acc > best_metric else best_metric
    last_metric = val_acc
    
    state = {
            'epoch' : epoch + 1,
            'optimizer': optimizer.state_dict(),
            'model': model.state_dict(),
            'scheduler': scheduler.state_dict(),
            'best_metric': best_metric
    }
    
    checkpoint(state, 'best_model', val_acc>=best_metric)
    
    
    if counter > patience:
        break    

epoch:  0
train accuracy mean:  0.24574798145933013
validation accuracy:  0.12834224598930483
epoch:  1
train accuracy mean:  0.25416604366028706
validation accuracy:  0.12264964636881145
epoch:  2
train accuracy mean:  0.26014503588516746
validation accuracy:  0.12937726410212178
epoch:  3
train accuracy mean:  0.2661987888755981
validation accuracy:  0.1395549422114887
epoch:  4
train accuracy mean:  0.2721721740430622
validation accuracy:  0.1354148697602208
epoch:  5
train accuracy mean:  0.27782969497607657
validation accuracy:  0.12782473693289634
epoch:  6
train accuracy mean:  0.28803267045454545
validation accuracy:  0.12282214938761428
epoch:  7
train accuracy mean:  0.29380046351674644
validation accuracy:  0.11997584957736761
epoch:  8
train accuracy mean:  0.3018222936602871
validation accuracy:  0.12627220976367087
epoch:  9
train accuracy mean:  0.30753588516746416
validation accuracy:  0.13170605485596
epoch:  10
train accuracy mean:  0.3132831190191388
validation accur

In [204]:
eval_model(val_loader, model, use_gpu)

0.1217008797653959

# Test NGram Neural Model

In [311]:
NGramModel = NGramNeuralModel(ngram_builder, model)

In [289]:
NGramModel.estimate_prob('vete a la verga')

1.4464763e-05

In [281]:
NGramModel.generate_sequence()

['<s>',
 '<s>',
 '<s>',
 'la',
 'excitación',
 'de',
 'mi',
 'película',
 'si',
 'no',
 'les',
 'gusta',
 'ustds',
 'mi',
 'culo',
 '…',
 '<url>',
 '</s>']

In [312]:
NGramModel.perplexity(val_documents)

381.66231916274586

# Load Embeddings

In [332]:
class Embeddings:
    def __init__(self, filename):
        self.embeddings = {}
        with open(filename, 'r') as file:
            for line in file:
                values = line.split()
                word, rep = values[0], np.array(list(map(float, values[1:])))
                self.embeddings[word] = rep
                
        self.d_model = len(list(self.embeddings.values())[0])
            
    def __getitem__(self, index):
        return self.embeddings[index]

In [333]:
embeddings = Embeddings('data/word2vec_col.txt')

100

# Distancia Coseno

In [None]:
def cos_distance(data):
    N = len(data)
    distances = np.zeros((N, N))
    magnitudes = np.linalg.norm(data, axis=1)
    
    for i in range(N):
        for j in range(i+1):
            distances[i, j] = np.dot(data[i], data[j])/(magnitudes[i] * magnitudes[j])
            if i != j:
                distances[j, i] = distances[i, j]
    
    return distances

In [None]:
def get_most_similar(dist_matrix, n):
    N = len(dist_matrix)
    
    # get indexes of elements to be compared. dist_matrix should be symmetric, so we dont need to consider each pair of distances twice
    indexes = [(i,j) for i in range(N) for j in range(i+1) if i!=j]

    # get x and y indexes
    x_indexes = tuple([ind[0] for ind in indexes])
    y_indexes = tuple([ind[1] for ind in indexes])
    
    # get values of matrix
    row_max = dist_matrix[x_indexes, y_indexes]
    
    # desc sort elements retrieved and get their positions
    max_elements = np.flip(np.argsort(row_max))[:n]
    
    # return indexes in positions retrieved in previous step
    return [indexes[max_index] for max_index in max_elements]