# T5 - Juan Luis Baldelomar Cabrera

In [1]:
# os
import random

# NLP and numpy
import nltk 
import numpy as np
import nltk
from nltk.probability import FreqDist
from nltk import TweetTokenizer
from nltk.corpus import stopwords
import pandas as pd

# torch
import torch
from torch.utils.data import DataLoader, TensorDataset
from torch import nn
from torch.nn import functional as F

# metrics
from sklearn.metrics import accuracy_score as accuracy

In [2]:
seed = 1111
random.seed(seed)
np.random.seed(seed)
torch.manual_seed(seed)
torch.backends.cudnn.benchmark = False

# Load Data

In [3]:
def load_data(filename, labels_filename):
    file = open(filename, 'r')
    labels_file = open(labels_filename, 'r')
    tweets = file.read()
    labels = labels_file.read()
    documents = tweets.split('\n')
    labels = labels.split('\n')
    documents.pop(-1)
    labels.pop(-1)
    file.close()
    labels_file.close()
    return documents, labels

In [4]:
documents, labels = load_data('data/mex_train.txt', 'data/mex_train_labels.txt')
val_documents, val_labels = load_data('data/mex_val.txt', 'data/mex_val_labels.txt')

# Vocabulary Utilities

In [5]:
def print_doc(doc:list, end=' ', stop=-1):
    stop = len(doc) if stop is None else stop
    for token in doc[:stop]:
        print(token, end=end)
    print('')

In [6]:
def get_vocabulary(tokenized_docs, n):
    tokens = [token for doc in tokenized_docs for token in doc]
    unique_tokens = FreqDist(tokens).most_common(n)
    return [token for token, _ in unique_tokens]

def word2ids(vocabulary):
    word2id = {}
    id2word = {}
    
    # build both dictionaries
    for i, word in enumerate(vocabulary):
        word2id[word] = i
        id2word[i] = word
    
    # add special tokens
    n = len(word2id)
    word2id['<s>']   = n 
    word2id['</s>']  = n + 1
    word2id['<unk>'] = n + 2
    id2word[n]       = '<s>'
    id2word[n + 1]   = '</s>'
    id2word[n + 2]   = '<unk>'
    
    return word2id, id2word

# NGram Builder Class

### Punctuation to Ignore

In [7]:
punctuation = ['.', '...', ',', '!', '¡', '¿', '?', ';', ';', '"', '|', '[', ']', '°', '(', ')', '*', '+', '/']

In [8]:
class NGramBuilder:
    def __init__(self, tokenizer=None, embeddings=None, d_model=256, sos='<s>', eos='</s>', unk='<unk>', punctuation=punctuation, postprocess=None):
        self.tokenizer = self.default_tokenizer() if tokenizer == None else tokenizer
        self.embeddings = embeddings
        self.d_model = d_model if embeddings is None else embeddings.d_model
        # special symbols
        self.SOS = sos
        self.EOS = eos
        self.UNK = unk
        # vocabulary 2 id and viceversa
        self.word2id  = None
        self.id2word  = None
        self.voc_size = 0
        # post tokenization functions
        self.punctuation = set(punctuation) if punctuation != None else None
        self.postprocess = postprocess if postprocess is not None else lambda x : x
        
    def default_tokenizer(doc):
        return TweetTokenizer().tokenize
    
    def get_vocabulary(self):
        return set(self.word2id.keys())
    
    def remove_punct(self, tokenized_documents):
        if self.punctuation == None:
            return tokenized_documents
        else:
            return [[token for token in doc if token not in self.punctuation] for doc in tokenized_documents]
        
    def get_ids(self, words:list):
        # transform list of words to list of ids
        unk_id = self.word2id.get(self.UNK, 0)
        ids = [self.word2id.get(word, unk_id) for word in words]
        return ids
    
    def __transform(self, tokenized_docs, start_padding:bool, end_padding:bool):
        N = self.N
        # docs and labels lists
        ngram_docs, ngram_targs = [], []
        # traverse each doc
        for doc in tokenized_docs:
            # add padding
            doc = ([self.SOS]*(N - 1) if start_padding else []) + \
                    doc + ([self.EOS] if end_padding else [])
            # get ids    
            ids = self.get_ids(doc)
            # traverse each word as center and build ngrams
            for i in range(N-1, len(doc)):    
                ngram_docs.append(ids[i-(N-1): i])
                ngram_targs.append(ids[i])
                
        return np.array(ngram_docs), np.array(ngram_targs)
    
    def _tokenize(self, documents):
        tokenized_docs = [self.tokenizer(doc.lower()) for doc in documents]
        tokenized_docs = self.remove_punct(tokenized_docs)
        tokenized_docs = self.postprocess(tokenized_docs)
        return tokenized_docs
    
    def build_emb_matrix(self):
        dim_v = len(self.word2id)
        if self.embeddings is None:
            self.emb_matrix = np.random.rand(dim_v, self.d_model)
        else:
            self.emb_matrix = np.random.rand(dim_v, self.d_model)
            for word in self.word2id.keys():
                if word in self.embeddings:
                    self.emb_matrix = self.embeddings[word]
                
    def fit(self, documents, N, t=10000):
        self.N = N
        # tokenize documents
        tokenized_docs = self._tokenize(documents)
        
        # get vocabulary and word2id and ids2word dicts
        vocabulary = get_vocabulary(tokenized_docs, t-3)
        self.word2id, self.id2word = word2ids(vocabulary)
        self.voc_size = len(self.word2id)
        self.build_emb_matrix()
        
        return self.__transform(tokenized_docs, start_padding=True, end_padding=True)
    
    def transform(self, documents: list[list or str], start_padding=True, end_padding=True):
        # list of documents as strings
        if type(documents[0]) is str:
            # tokenize documents
            tokenized_docs = self._tokenize(documents)
            return self.__transform(tokenized_docs, start_padding, end_padding)
        
        # list of documents as list of tokens
        elif type(documents[0]) is list:
            return self.__transform(documents, start_padding, end_padding)
        
        print('[ERR]: documents should be list of strings or list of lists of tokens')
        return None
    
    def inverse(self, docs_as_ids):
        # empty list
        if len(docs_as_ids) == 0:
            return None
        
        # multiple docs
        if type(docs_as_ids[0]) in (list, np.ndarray):
            return [[self.id2word.get(tok_id) for tok_id in doc] 
                    for doc in docs_as_ids ]
        # single doc
        return [self.id2word.get(tok_id) for tok_id in docs_as_ids]

In [9]:
ngram_builder = NGramBuilder()
ngram_docs, ngram_labels = ngram_builder.fit(documents, N=4)
val_ngram_docs, val_ngram_labels = ngram_builder.transform(val_documents)
ngram_builder.emb_matrix.shape

(10000, 256)

In [10]:
doc = ngram_builder.inverse(ngram_labels)
print_doc(doc[:30])

del(ngram_builder);

lo peor de todo es que no me dan por un tiempo y luego vuelven estoy hasta la verga de estl </s> a la vga no seas mamón 45 


## Char NGram

In [11]:
# function to call after normal tokenization to get each word as a document, i.e <s> word1 </s>, <s> word2 </s>, ...
def char_postprocess(documents):
    return [[c for c in word] for doc in documents for word in doc]

# tokenize documents char by char so you can add <s> and </s> at end of each doc
def char_tokenizer(doc):
    return [char for char in doc]

In [12]:
char_tokenizer('hola mundo')

['h', 'o', 'l', 'a', ' ', 'm', 'u', 'n', 'd', 'o']

In [13]:
char_ngram_builder = NGramBuilder(tokenizer=char_tokenizer, d_model=100, punctuation=punctuation)
ngram_docs, ngram_labels = char_ngram_builder.fit(documents, N=6)
val_ngram_docs, val_ngram_labels = char_ngram_builder.transform(val_documents)

In [53]:
char_ngram_builder.inverse(ngram_docs[:5])

[['<s>', '<s>', '<s>', '<s>', '<s>'],
 ['<s>', '<s>', '<s>', '<s>', 'l'],
 ['<s>', '<s>', '<s>', 'l', 'o'],
 ['<s>', '<s>', 'l', 'o', ' '],
 ['<s>', 'l', 'o', ' ', 'p']]

In [41]:
ngram_docs.shape

(490412, 5)

In [58]:
words = char_ngram_builder.inverse(ngram_labels)
print_doc(words[:100], end='', stop=-1)

lo peor de todo es que no me dan por un tiempo y luego vuelven estoy hasta la verga de estl</s>a la vg


# Dataset

In [14]:
def get_datasets(ngram_builder, N, train_docs, val_docs, batch_size=64, num_workers=2):
    ngram_docs, ngram_labels = ngram_builder.fit(documents, N=N)
    val_ngram_docs, val_ngram_labels = ngram_builder.transform(val_documents)
    
    train_ds = TensorDataset(torch.tensor(ngram_docs, dtype=torch.int64), torch.tensor(ngram_labels, dtype=torch.int64))
    train_loader = DataLoader(train_ds, shuffle=True, batch_size=batch_size, num_workers=num_workers)

    val_ds = TensorDataset(torch.tensor(val_ngram_docs, dtype=torch.int64), torch.tensor(val_ngram_labels, dtype=torch.int64))
    val_loader = DataLoader(val_ds, shuffle=False, batch_size=batch_size, num_workers=num_workers)
    
    return train_ds, train_loader, val_ds, val_loader

# Neural Language Model

In [15]:
class BengioModel(nn.Module):
    def __init__(self, N, voc_size, d_model, hidden_size=128, emb_mat=None, dropout=0.1):
        
        super(BengioModel, self).__init__()
        # parameters
        self.N           = N
        self.d_model     = d_model
        self.voc_size    = voc_size
        self.hidden_size = hidden_size
        
        # Matriz entrenable de embeddings, tamaño vocab_size x Ngram.d_model
        self.embeddings = nn.Embedding.from_pretrained(torch.FloatTensor(emb_mat), freeze=False)
        
        # fully connected layers
        self.fc1 = nn.Linear(d_model * (N-1), hidden_size)
        self.fc2 = nn.Linear(hidden_size, voc_size, bias=False)
        
        # dropout
        self.drop = nn.Dropout(dropout)
        
    
    def forward(self, input_seq):
        # Calcula el embedding para cada palabra.
        x = self.embeddings(input_seq)
        x = x.view(-1, (self.N-1) * self.d_model)
        x = self.fc1(x)
        x = self.drop(x)
        x = F.relu(x)
        x = self.fc2(x)
        return x

In [16]:
def get_preds(raw_logit):
    probs = F.softmax(raw_logit.detach(), dim=1)
    y_pred = torch.argmax(probs, dim=1).cpu().numpy()
    return y_pred

In [17]:
def get_probs(raw_logit):
    probs = F.softmax(raw_logit.detach(), dim=1)
    return probs.cpu().numpy()

## Test Model Forward

# Model 

In [18]:
def sample(probs):
    acc = np.cumsum(probs)       # build cumulative probability
    val = np.random.uniform()    # get random number between [0, 1]
    pos = np.argmax((val < acc)) # get the index of the word to sample
    return pos

In [72]:
class NGramNeuralModel:
    def __init__(self, NGram: NGramBuilder, neuralModel:nn.Module):
        self.model = neuralModel
        self.NGram = NGram
        self.model.eval()
    
    def predict(self, context:list, use_gpu=False):
        context = self.NGram.get_ids(context)
        context = torch.tensor([context])
        if use_gpu:
            context = context.cuda()
            
        logits = self.model(context)
        cond_probs = get_probs(logits)
        index = sample(cond_probs)
        return self.NGram.inverse([index])[0]
    
    def estimate_prob(self, sequence:str, use_gpu=False, ret_probs=False, start_padding=False, end_padding=False):
        # feed model and get probs
        ngrams, targets = self.NGram.transform([sequence], start_padding, end_padding)
        ngrams = torch.tensor(ngrams)
        if use_gpu:
            ngrams = ngrams.cuda()
            
        logits = self.model(ngrams)
        probs  = get_probs(logits)
        
        # get prob for each context and target
        num_target = [i for i in range(len(targets))]
        cond_probs = probs[num_target, targets]
        log_prob = np.sum(np.log(cond_probs))
        return np.exp(log_prob) if ret_probs else log_prob
        
            
    def generate_sequence(self, use_gpu=False, max_length=100):
        sequence = ['<s>']*(self.NGram.N - 1)
        context = [token for token in sequence]
        while sequence[-1] != '</s>' and len(sequence) < max_length:
            word = self.predict(context, use_gpu)
            context.pop(0)
            context.append(word)
            sequence.append(word)
            
        return sequence
    
    def perplexity(self, test_set, use_gpu=False):
        ngrams, targets = self.NGram.transform(test_set)
        ngrams = torch.tensor(ngrams)
        if use_gpu:
            ngrams = ngrams.cuda()
        logits = self.model(ngrams)
        probs = get_probs(logits)
        
        # get cond probs and perplexity
        num_target = [i for i in range(len(targets))]
        cond_probs = probs[num_target, targets]
        log_perp = np.sum(-np.log(cond_probs))     # log(1/cond_probs) = log(1) - log(cond_probs) = -log(cond_probs)
        perp = np.exp(1/len(targets) * log_perp)   # 1/N = 1/len(targets)
        return perp

# Eval

In [20]:
def eval_model(data, model, gpu=False):
    preds, targets = [], []
    with torch.no_grad():
        for inputs, labels in data:
            if gpu:
                # move inputs to gpu
                inputs = inputs.cuda()
            
            # compute output predictions    
            output = model(inputs)
            batch_preds = get_preds(output)
            # append preds and targets
            preds.append(batch_preds)
            targets.append(labels.numpy())
    
    # remove batch dimension
    preds = [p for batch_pred in preds for p in batch_pred]
    targets = [t for batch_tar in targets for t in batch_tar]
    return accuracy(preds, targets)

In [32]:
def checkpoint(state, path, val_acc, best_metric, override=False):
    if val_acc > best_metric or override: 
        print('Storing best model to {0}. Current acc: {1}, last best metric: {2}'.format(path, val_acc, best_metric))
        torch.save(state, path)

# Load Model

In [90]:
load_state = torch.load('best_model')
model.load_state_dict(load_state['model'])

<All keys matched successfully>

# Hyperparameters

In [26]:
# model hyperparameters
voc_size = char_ngram_builder.voc_size
N = char_ngram_builder.N
d_model = char_ngram_builder.d_model

# optimizer hyperparameters
lr = 2.3e-1 
epochs = 100
patience = epochs//5

# scheduler hyperparameters
lr_patience = 10
lr_factor = 0.5

# gpu available?
use_gpu = torch.cuda.is_available()

# build model and move to gpu if possible
model = BengioModel(N=N, voc_size=voc_size, d_model=d_model, emb_mat=char_ngram_builder.emb_matrix)
if use_gpu:
    model = model.cuda()
    
optimizer = torch.optim.SGD(model.parameters(), lr=lr)
scheduler = torch.optim.lr_scheduler.ReduceLROnPlateau(
                optimizer,
                'min',
                patience = lr_patience,
                verbose=True,
                factor = lr_factor
            )

criterion = nn.CrossEntropyLoss()

In [27]:
torch.cuda.is_available()

True

# Training

In [87]:
#import torch.multiprocessing
#torch.multiprocessing.set_sharing_strategy('file_descriptor')

In [24]:
train_ds, train_loader, val_ds, val_loader = get_datasets(char_ngram_builder, 6, documents, val_documents, batch_size=256, num_workers=1)

In [28]:
best_metric = 0
last_metric = 0
val_metrics = []
counter = 0

for epoch in range(epochs):
    print('epoch: ', 1 + epoch)
    epoch_metrics = []
    for inputs, targets in train_loader:
        if use_gpu:
            inputs = inputs.cuda()
            targets = targets.cuda()
        
        # feed model and get loss
        output = model(inputs)
        loss = criterion(output, targets)
        
        # metric with train dataset
        preds = get_preds(output)
        epoch_metrics.append(accuracy(preds, targets.cpu().numpy()))
            
        # step to optimize 
        optimizer.zero_grad()
        loss.backward()
        optimizer.step()
        
    # close for each step
    
    # get metric for training set
    train_acc = np.mean(epoch_metrics)
    val_acc = eval_model(val_loader, model, use_gpu)
    val_metrics.append(val_acc)
    
    # print metrics
    print('train accuracy mean: ', train_acc)
    print('validation accuracy: ', val_acc)
    
    # store model if necessary
    state = {
                'epoch' : epoch + 1,
                'optimizer': optimizer.state_dict(),
                'model': model.state_dict(),
                'scheduler': scheduler.state_dict(),
                'best_metric': best_metric
            }
    checkpoint(state, 'char_best_model', val_acc, best_metric)
    
    # patience and last_metric and best_metric update
    counter = counter + 1 if last_metric > best_metric else 0
    best_metric = val_acc if val_acc > best_metric else best_metric
    last_metric = val_acc
    
    # check if patience run out
    if counter > patience:
        break
# close for each epoch

epoch:  1
train accuracy mean:  0.33339849439542163
validation accuracy:  0.3890792049390152
Storing best model to char_best_model. Current acc: 0.3890792049390152, last best metric: 0
epoch:  2
train accuracy mean:  0.41890894684298685
validation accuracy:  0.42254555036892033
Storing best model to char_best_model. Current acc: 0.42254555036892033, last best metric: 0.3890792049390152
epoch:  3
train accuracy mean:  0.43695091176809725
validation accuracy:  0.4337637404005421
Storing best model to char_best_model. Current acc: 0.4337637404005421, last best metric: 0.42254555036892033
epoch:  4
train accuracy mean:  0.44869194580824884
validation accuracy:  0.4465253726848366
Storing best model to char_best_model. Current acc: 0.4465253726848366, last best metric: 0.4337637404005421
epoch:  5
train accuracy mean:  0.4562218178011361
validation accuracy:  0.453132058424936
Storing best model to char_best_model. Current acc: 0.453132058424936, last best metric: 0.4465253726848366
epoch: 

In [29]:
model.train(False)

BengioModel(
  (embeddings): Embedding(352, 100)
  (fc1): Linear(in_features=500, out_features=128, bias=True)
  (fc2): Linear(in_features=128, out_features=352, bias=False)
  (drop): Dropout(p=0.1, inplace=False)
)

## Store Last Model

In [33]:
# store model if necessary
state = {
            'epoch' : 100,
            'optimizer': optimizer.state_dict(),
            'model': model.state_dict(),
            'scheduler': scheduler.state_dict(),
            'best_metric': best_metric
        }
checkpoint(state, 'char_last_model', 0, best_metric, override=True)

Storing best model to char_last_model. Current acc: 0, last best metric: 0.4938826983887969


In [31]:
eval_model(val_loader, model, use_gpu)

0.5069454901370276

# Test NGram Neural Model

In [34]:
model.eval()

BengioModel(
  (embeddings): Embedding(352, 100)
  (fc1): Linear(in_features=500, out_features=128, bias=True)
  (fc2): Linear(in_features=128, out_features=352, bias=False)
  (drop): Dropout(p=0.1, inplace=False)
)

In [73]:
NGramModel = NGramNeuralModel(char_ngram_builder, model)

## Sequence Generation

In [54]:
seq = NGramModel.generate_sequence(use_gpu=use_gpu)
print_doc(seq, end='')

<s><s><s><s><s>oye el díleco fea por mierda que chingas a ✊ zya agarrasos @usuario la digar y vene un veo no que diga quet pasado nato dwegt a justo 😡😡😡😡


In [76]:
seq = NGramModel.generate_sequence(use_gpu=use_gpu, max_length=300)
print_doc(seq, end='')

<s><s><s><s><s>me voy a chingad y en it paraguador verga en el c_x mamen llagon ensables la kiros patos digo a meses su mise mucho


In [77]:
seq = NGramModel.generate_sequence(use_gpu=use_gpu, max_length=300)
print_doc(seq, end='')

<s><s><s><s><s>el robo son por atuger mañana de puto en no dejones puto así solo madre tonen no de verga que te wará el arme este el tiemposin no lo tembici el poctando maricónsiso me de que marica de verga de páginas del my pagudite y ella a sentalya jajajajajajaja


## Sequence Probability Estimation

In [41]:
NGramModel.estimate_prob('vete a la verga', use_gpu=use_gpu)

-5.2720284

In [43]:
NGramModel.estimate_prob('a la vete verga', use_gpu=use_gpu)

-11.524015

In [59]:
NGramModel.estimate_prob('esos hijos de la chingada', use_gpu=use_gpu)

-17.122591

In [60]:
NGramModel.estimate_prob('esos chingada de los hijos', use_gpu=use_gpu)

-21.678864

In [58]:
NGramModel.estimate_prob('estuvieron', use_gpu=use_gpu)

-4.1125507

In [57]:
NGramModel.estimate_prob('estuveiron', use_gpu=use_gpu)

-20.756233

In [61]:
NGramModel.estimate_prob('vete alv', use_gpu=use_gpu)

-6.3073807

In [62]:
NGramModel.estimate_prob('vete avl', use_gpu=use_gpu)

-16.073044

In [63]:
NGramModel.estimate_prob('veet avl', use_gpu=use_gpu)

-15.13043

## Perplexity

In [79]:
NGramModel.perplexity(val_documents, use_gpu=use_gpu)

5.349459063337847

# Ejercicio 2

## Load Embeddings

In [332]:
class Embeddings:
    def __init__(self, filename):
        self.embeddings = {}
        with open(filename, 'r') as file:
            for line in file:
                values = line.split()
                word, rep = values[0], np.array(list(map(float, values[1:])))
                self.embeddings[word] = rep
                
        self.d_model = len(list(self.embeddings.values())[0])
            
    def __getitem__(self, index):
        return self.embeddings[index]

In [333]:
embeddings = Embeddings('data/word2vec_col.txt')

## Train Embeddings Model

In [None]:
best_metric = 0
last_metric = 0
val_metrics = []
counter = 0

for epoch in range(epochs):
    print('epoch: ', 1 + epoch)
    epoch_metrics = []
    for inputs, targets in train_loader:
        if use_gpu:
            inputs = inputs.cuda()
            targets = targets.cuda()
        
        # feed model and get loss
        output = model(inputs)
        loss = criterion(output, targets)
        
        # metric with train dataset
        preds = get_preds(output)
        epoch_metrics.append(accuracy(preds, targets.cpu().numpy()))
            
        # step to optimize 
        optimizer.zero_grad()
        loss.backward()
        optimizer.step()
        
    # close for each step
    
    # get metric for training set
    train_acc = np.mean(epoch_metrics)
    val_acc = eval_model(val_loader, model, use_gpu)
    val_metrics.append(val_acc)
    
    # print metrics
    print('train accuracy mean: ', train_acc)
    print('validation accuracy: ', val_acc)
    
    # store model if necessary
    state = {
                'epoch' : epoch + 1,
                'optimizer': optimizer.state_dict(),
                'model': model.state_dict(),
                'scheduler': scheduler.state_dict(),
                'best_metric': best_metric
            }
    checkpoint(state, 'char_best_model', val_acc, best_metric)
    
    # patience and last_metric and best_metric update
    counter = counter + 1 if last_metric > best_metric else 0
    best_metric = val_acc if val_acc > best_metric else best_metric
    last_metric = val_acc
    
    # check if patience run out
    if counter > patience:
        break
# close for each epoch

# Distancia Coseno

In [72]:
def cos_distance(data):
    N = len(data)
    distances = np.zeros((N, N))
    magnitudes = np.linalg.norm(data, axis=1)
    
    for i in range(N):
        for j in range(i+1):
            distances[i, j] = np.dot(data[i], data[j])/(magnitudes[i] * magnitudes[j])
            if i != j:
                distances[j, i] = distances[i, j]
    
    return distances

In [73]:
def get_most_similar(dist_matrix, n):
    N = len(dist_matrix)
    
    # get indexes of elements to be compared. dist_matrix should be symmetric, so we dont need to consider each pair of distances twice
    indexes = [(i,j) for i in range(N) for j in range(i+1) if i!=j]

    # get x and y indexes
    x_indexes = tuple([ind[0] for ind in indexes])
    y_indexes = tuple([ind[1] for ind in indexes])
    
    # get values of matrix
    row_max = dist_matrix[x_indexes, y_indexes]
    
    # desc sort elements retrieved and get their positions
    max_elements = np.flip(np.argsort(row_max))[:n]
    
    # return indexes in positions retrieved in previous step
    return [indexes[max_index] for max_index in max_elements]

# Get Most Similar

In [91]:
dist_matrix = cos_distance(model.embeddings.weight.detach().cpu().numpy())

In [92]:
similar = get_most_similar(dist_matrix, 10)
similar = [list(pair) for pair in similar]
ngram_builder.inverse(similar)