# T5 - Juan Luis Baldelomar Cabrera

In [1]:
# os
import random

# NLP and numpy
import nltk 
import numpy as np
import nltk
from nltk.probability import FreqDist
from nltk import TweetTokenizer
from nltk.corpus import stopwords
import pandas as pd

# NGrams File
from NGrams import NGramBuilder
from NGrams import NGramNeuralModel

# torch
import torch
from torch.utils.data import DataLoader, TensorDataset
from torch import nn
from torch.nn import functional as F

# metrics
from sklearn.metrics import accuracy_score as accuracy

In [18]:
seed = 1111
random.seed(seed)
np.random.seed(seed)
torch.manual_seed(seed)
torch.backends.cudnn.benchmark = False

# Load Data

In [19]:
def load_data(filename, labels_filename):
    file = open(filename, 'r')
    labels_file = open(labels_filename, 'r')
    tweets = file.read()
    labels = labels_file.read()
    documents = tweets.split('\n')
    labels = labels.split('\n')
    documents.pop(-1)
    labels.pop(-1)
    file.close()
    labels_file.close()
    return documents, labels

In [20]:
documents, labels = load_data('data/mex_train.txt', 'data/mex_train_labels.txt')
val_documents, val_labels = load_data('data/mex_val.txt', 'data/mex_val_labels.txt')

# Vocabulary Utilities

In [50]:
def bold_string(string):
    return '\033[1m' + string + '\033[0m '

def print_doc(doc:list, end=' ', stop=-1):
    stop = len(doc) if stop is None else stop
    for token in doc[:stop]:
        print(token, end=end)
    print('')

# NGram Builder Class

In [32]:
ngram_builder = NGramBuilder()
ngram_docs, ngram_labels = ngram_builder.fit(documents, N=4)
val_ngram_docs, val_ngram_labels = ngram_builder.transform(val_documents)
ngram_builder.emb_matrix.shape

(10000, 256)

In [10]:
doc = ngram_builder.inverse(ngram_labels)
print_doc(doc[:30])
del(ngram_builder);

lo peor de todo es que no me dan por un tiempo y luego vuelven estoy hasta la verga de estl </s> a la vga no seas mamón 45 


## Char NGram

In [6]:
# function to call after normal tokenization to get each word as a document, i.e <s> word1 </s>, <s> word2 </s>, ...
def char_postprocess(documents):
    return [[c for c in word] for doc in documents for word in doc]

# tokenize documents char by char so you can add <s> and </s> at end of each doc
def char_tokenizer(doc):
    return [char for char in doc]

In [7]:
char_ngram_builder = NGramBuilder(tokenizer=char_tokenizer, d_model=100)
ngram_docs, ngram_labels = char_ngram_builder.fit(documents, N=6)
val_ngram_docs, val_ngram_labels = char_ngram_builder.transform(val_documents)

In [66]:
words = char_ngram_builder.inverse(ngram_labels)
print_doc(words[:100], end='', stop=-1)

lo peor de todo es que no me dan por un tiempo y luego vuelven estoy hasta la verga de estl</s>a la vg


# Dataset

In [22]:
def get_datasets(ngram_builder, N, train_docs, val_docs, batch_size=64, num_workers=2):
    ngram_docs, ngram_labels = ngram_builder.fit(documents, N=N)
    val_ngram_docs, val_ngram_labels = ngram_builder.transform(val_documents)
    
    train_ds = TensorDataset(torch.tensor(ngram_docs, dtype=torch.int64), torch.tensor(ngram_labels, dtype=torch.int64))
    train_loader = DataLoader(train_ds, shuffle=True, batch_size=batch_size, num_workers=num_workers)

    val_ds = TensorDataset(torch.tensor(val_ngram_docs, dtype=torch.int64), torch.tensor(val_ngram_labels, dtype=torch.int64))
    val_loader = DataLoader(val_ds, shuffle=False, batch_size=batch_size, num_workers=num_workers)
    
    return train_ds, train_loader, val_ds, val_loader

# Test Syntactical and Morphological Structures 

In [23]:
from itertools import permutations

def get_perms(tokens):
    perms = set(permutations(tokens))
    return list(perms)

def test_structures(tokens, ngram_model):
    perms = get_perms(tokens)
    likelihoods = [(ngram_model.estimate_prob(' '.join(perm), use_gpu=use_gpu), ' '.join(perm)) for perm in perms]
    likelihoods = sorted(likelihoods, reverse=True)
    for l, sentence in likelihoods:
        print(sentence)
        print('likelihood: ', l, end='\n\n')

# Neural Language Model

In [24]:
class BengioModel(nn.Module):
    def __init__(self, N, voc_size, d_model, hidden_size=128, emb_mat=None, dropout=0.1):
        
        super(BengioModel, self).__init__()
        # parameters
        self.N           = N
        self.d_model     = d_model
        self.voc_size    = voc_size
        self.hidden_size = hidden_size
        
        # Matriz entrenable de embeddings, tamaño vocab_size x Ngram.d_model
        self.embeddings = nn.Embedding.from_pretrained(torch.FloatTensor(emb_mat), freeze=False)
        
        # fully connected layers
        self.fc1 = nn.Linear(d_model * (N-1), hidden_size)
        self.fc2 = nn.Linear(hidden_size, voc_size, bias=False)
        
        # dropout
        self.drop = nn.Dropout(dropout)
        
    
    def forward(self, input_seq):
        # Calcula el embedding para cada palabra.
        x = self.embeddings(input_seq)
        x = x.view(-1, (self.N-1) * self.d_model)
        x = self.fc1(x)
        x = self.drop(x)
        x = F.relu(x)
        x = self.fc2(x)
        return x

In [25]:
def get_preds(raw_logit):
    probs = F.softmax(raw_logit.detach(), dim=1)
    y_pred = torch.argmax(probs, dim=1).cpu().numpy()
    return y_pred

In [26]:
def get_probs(raw_logit):
    probs = F.softmax(raw_logit.detach(), dim=1)
    return probs.cpu().numpy()

# Eval

In [27]:
def eval_model(data, model, gpu=False):
    preds, targets = [], []
    with torch.no_grad():
        for inputs, labels in data:
            if gpu:
                # move inputs to gpu
                inputs = inputs.cuda()
            
            # compute output predictions    
            output = model(inputs)
            batch_preds = get_preds(output)
            # append preds and targets
            preds.append(batch_preds)
            targets.append(labels.numpy())
    
    # remove batch dimension
    preds = [p for batch_pred in preds for p in batch_pred]
    targets = [t for batch_tar in targets for t in batch_tar]
    return accuracy(preds, targets)

In [28]:
def checkpoint(state, path, val_acc, best_metric, override=False):
    if val_acc > best_metric or override: 
        print('Storing best model to {0}. Current acc: {1}, last best metric: {2}'.format(path, val_acc, best_metric))
        torch.save(state, path)

# Hyperparameters

In [15]:
# model hyperparameters
voc_size = char_ngram_builder.voc_size
N = char_ngram_builder.N
d_model = char_ngram_builder.d_model

# optimizer hyperparameters
lr = 2.3e-1 
epochs = 100
patience = epochs//5

# scheduler hyperparameters
lr_patience = 10
lr_factor = 0.5

# gpu available?
use_gpu = torch.cuda.is_available()

# build model and move to gpu if possible
model = BengioModel(N=N, voc_size=voc_size, d_model=d_model, hidden_size=200, emb_mat=char_ngram_builder.emb_matrix)
if use_gpu:
    model = model.cuda()
    
optimizer = torch.optim.SGD(model.parameters(), lr=lr)
scheduler = torch.optim.lr_scheduler.ReduceLROnPlateau(
                optimizer,
                'min',
                patience = lr_patience,
                verbose=True,
                factor = lr_factor
            )

criterion = nn.CrossEntropyLoss()

# Training

In [87]:
#import torch.multiprocessing
#torch.multiprocessing.set_sharing_strategy('file_descriptor')

In [16]:
train_ds, train_loader, val_ds, val_loader = get_datasets(char_ngram_builder, 6, documents, val_documents, batch_size=256, num_workers=1)

In [72]:
best_metric = 0
last_metric = 0
val_metrics = []
counter = 0

for epoch in range(epochs):
    print('epoch: ', 1 + epoch)
    epoch_metrics = []
    epoch_losses = []
    for inputs, targets in train_loader:
        if use_gpu:
            inputs = inputs.cuda()
            targets = targets.cuda()

        # feed model and get loss
        output = model(inputs)
        loss = criterion(output, targets)
        epoch_losses.append(loss.item())

        # metric with train dataset
        preds = get_preds(output)
        epoch_metrics.append(accuracy(preds, targets.cpu().numpy()))

        # step to optimize 
        optimizer.zero_grad()
        loss.backward()
        optimizer.step()

    # close for each step

    # get metric for training set
    train_acc = np.mean(epoch_metrics)
    val_acc = eval_model(val_loader, model, use_gpu)
    val_metrics.append(val_acc)

    # print metrics
    print('train accuracy mean: ', train_acc)
    print('validation accuracy: ', val_acc)
    print('mean loss: ', np.mean(epoch_losses))

    # store model if necessary
    state = {
                'epoch' : epoch + 1,
                'optimizer': optimizer.state_dict(),
                'model': model.state_dict(),
                'scheduler': scheduler.state_dict(),
                'best_metric': best_metric
            }
    checkpoint(state, 'char_best_model', val_acc, best_metric)

    # patience and last_metric and best_metric update
    last_metric = val_acc
    counter = counter + 1 if last_metric <= best_metric else 0
    best_metric = val_acc if val_acc > best_metric else best_metric

    # check if patience run out
    if counter >= patience:
        break
# close for each epoch

epoch:  1
train accuracy mean:  0.3443141102940012
validation accuracy:  0.4061286064491797
mean loss:  2.297941785050635
Storing best model to char_best_model. Current acc: 0.4061286064491797, last best metric: 0
epoch:  2
train accuracy mean:  0.4265518076082308
validation accuracy:  0.4336413350933434
mean loss:  1.960996209173008
Storing best model to char_best_model. Current acc: 0.4336413350933434, last best metric: 0.4061286064491797
epoch:  3
train accuracy mean:  0.44810113517559286
validation accuracy:  0.44148595134829344
mean loss:  1.875410953305868
Storing best model to char_best_model. Current acc: 0.44148595134829344, last best metric: 0.4336413350933434
epoch:  4
train accuracy mean:  0.46165679848009633
validation accuracy:  0.4551386007920045
mean loss:  1.8260459936045255
Storing best model to char_best_model. Current acc: 0.4551386007920045, last best metric: 0.44148595134829344
epoch:  5
train accuracy mean:  0.47037113489838267
validation accuracy:  0.46671695266

# Load Model

In [17]:
load_state = torch.load('char_best_model')
model.load_state_dict(load_state['model'])

<All keys matched successfully>

In [18]:
model.eval()
eval_model(val_loader, model, use_gpu)

0.5226664152366585

## Test NGram Neural Model

In [19]:
NGramModel = NGramNeuralModel(char_ngram_builder, model)

### Sequence Generation

In [20]:
seq = NGramModel.generate_sequence(use_gpu=use_gpu)
print_doc(seq, end='')

<s><s><s><s><s>padre por 35 el que se valer tal al estoy loca jajajaja de lefandarse no tengo monzaba que con


In [22]:
seq = NGramModel.generate_sequence(use_gpu=use_gpu, max_length=300)
print_doc(seq, end='')

<s><s><s><s><s>copanaduerdos aún así juegas a togerdistaso ajenatacianta que se volvera putas cosalo idea


In [77]:
seq = NGramModel.generate_sequence(use_gpu=use_gpu, max_length=300)
print_doc(seq, end='')

<s><s><s><s><s>el robo son por atuger mañana de puto en no dejones puto así solo madre tonen no de verga que te wará el arme este el tiemposin no lo tembici el poctando maricónsiso me de que marica de verga de páginas del my pagudite y ella a sentalya jajajajajajaja


### Sequence Probability Estimation

In [23]:
NGramModel.estimate_prob('vete a la verga', use_gpu=use_gpu)

-4.710024

In [24]:
NGramModel.estimate_prob('a la vete verga', use_gpu=use_gpu)

-13.1000185

In [25]:
NGramModel.estimate_prob('esos hijos de la chingada', use_gpu=use_gpu)

-16.086407

In [60]:
NGramModel.estimate_prob('esos chingada de los hijos', use_gpu=use_gpu)

-21.678864

In [58]:
NGramModel.estimate_prob('estuvieron', use_gpu=use_gpu)

-4.1125507

In [57]:
NGramModel.estimate_prob('estuveiron', use_gpu=use_gpu)

-20.756233

In [61]:
NGramModel.estimate_prob('vete alv', use_gpu=use_gpu)

-6.3073807

In [62]:
NGramModel.estimate_prob('vete avl', use_gpu=use_gpu)

-16.073044

In [63]:
NGramModel.estimate_prob('veet avl', use_gpu=use_gpu)

-15.13043

## Permutations

In [33]:
test_structures(list('amor '), NGramModel)

r o   a m
likelihood:  -8.757793

  r o a m
likelihood:  -9.794529

o r   a m
likelihood:  -10.665074

m o   a r
likelihood:  -11.240616

r   o a m
likelihood:  -11.738606

  m o a r
likelihood:  -11.784495

r m o   a
likelihood:  -12.62795

r a   o m
likelihood:  -12.659768

r m o a  
likelihood:  -12.839628

o m   a r
likelihood:  -12.941523

a r   o m
likelihood:  -12.956074

m   o a r
likelihood:  -13.132235

r m   o a
likelihood:  -13.270688

m r o a  
likelihood:  -13.477239

a r o m  
likelihood:  -13.480266

m r o   a
likelihood:  -13.740321

r o m a  
likelihood:  -13.768693

r m a   o
likelihood:  -13.921881

o m a   r
likelihood:  -13.964831

  r o m a
likelihood:  -14.001951

r m   a o
likelihood:  -14.005707

m r   a o
likelihood:  -14.123345

m r   o a
likelihood:  -14.132148

r   a o m
likelihood:  -14.150971

a r o   m
likelihood:  -14.210142

a m o   r
likelihood:  -14.219949

r o a m  
likelihood:  -14.234589

r   o m a
likelihood:  -14.533393

o r a   m
likelihood:  

### Perplexity

In [34]:
NGramModel.perplexity(val_documents, use_gpu=use_gpu)

5.051324235262274

# Ejercicio 2

## No Embeddings Model

In [54]:
ngram_builder = NGramBuilder(d_model=100)
ngram_docs, ngram_labels = ngram_builder.fit(documents, N=4)

In [14]:
train_ds, train_loader, val_ds, val_loader = get_datasets(ngram_builder, 4, documents, val_documents, batch_size=64, num_workers=2)

In [55]:
# model hyperparameters
voc_size = ngram_builder.voc_size
N = ngram_builder.N
d_model = ngram_builder.d_model

# optimizer hyperparameters
lr = 2.3e-1 
epochs = 100
patience = epochs//5

# scheduler hyperparameters
lr_patience = 10
lr_factor = 0.5

# gpu available?
use_gpu = torch.cuda.is_available()

# build model and move to gpu if possible
model = BengioModel(N=N, voc_size=voc_size, d_model=d_model, hidden_size=200, emb_mat=ngram_builder.emb_matrix)
if use_gpu:
    model = model.cuda()

# optimizer and scheduler
optimizer = torch.optim.SGD(model.parameters(), lr=lr)
scheduler = torch.optim.lr_scheduler.ReduceLROnPlateau(
                optimizer,
                'min',
                patience = lr_patience,
                verbose=True,
                factor = lr_factor
            )

criterion = nn.CrossEntropyLoss()

In [41]:
best_metric = 0
last_metric = 0
val_metrics = []
counter = 0

for epoch in range(epochs):
    print('epoch: ', 1 + epoch)
    epoch_metrics = []
    epoch_losses = []
    for inputs, targets in train_loader:
        if use_gpu:
            inputs = inputs.cuda()
            targets = targets.cuda()

        # feed model and get loss
        output = model(inputs)
        loss = criterion(output, targets)
        epoch_losses.append(loss.item())

        # metric with train dataset
        preds = get_preds(output)
        epoch_metrics.append(accuracy(preds, targets.cpu().numpy()))

        # step to optimize 
        optimizer.zero_grad()
        loss.backward()
        optimizer.step()

    # close for each step

    # get metric for training set
    train_acc = np.mean(epoch_metrics)
    val_acc = eval_model(val_loader, model, use_gpu)
    val_metrics.append(val_acc)

    # print metrics
    print('train accuracy mean: ', train_acc)
    print('validation accuracy: ', val_acc)
    print('mean loss: ', np.mean(epoch_losses))

    # store model if necessary
    state = {
                'epoch' : epoch + 1,
                'optimizer': optimizer.state_dict(),
                'model': model.state_dict(),
                'scheduler': scheduler.state_dict(),
                'best_metric': best_metric
            }
    checkpoint(state, 'no_embeddings_best_model', val_acc, best_metric)

    # patience and last_metric and best_metric update
    last_metric = val_acc
    counter = counter + 1 if last_metric <= best_metric else 0
    best_metric = val_acc if val_acc > best_metric else best_metric

    # check if patience run out
    if counter >= patience:
        break
# close for each epoch

epoch:  1
train accuracy mean:  0.061523921906001984
validation accuracy:  0.04769953051643192
mean loss:  6.571348320382337
Storing best model to no_embeddings_best_model. Current acc: 0.04769953051643192, last best metric: 0
epoch:  2
train accuracy mean:  0.09239850725446429
validation accuracy:  0.06582159624413146
mean loss:  6.177088140820463
Storing best model to no_embeddings_best_model. Current acc: 0.06582159624413146, last best metric: 0.04769953051643192
epoch:  3
train accuracy mean:  0.10496206132192461
validation accuracy:  0.11126760563380282
mean loss:  5.9777576358368
Storing best model to no_embeddings_best_model. Current acc: 0.11126760563380282, last best metric: 0.06582159624413146
epoch:  4
train accuracy mean:  0.11347501240079365
validation accuracy:  0.11774647887323944
mean loss:  5.826832549025615
Storing best model to no_embeddings_best_model. Current acc: 0.11774647887323944, last best metric: 0.11126760563380282
epoch:  5
train accuracy mean:  0.119996570

## Load Model

In [56]:
load_state = torch.load('no_embeddings_best_model')
model.load_state_dict(load_state['model'])
model.eval()
NGramModel = NGramNeuralModel(ngram_builder, model)

BengioModel(
  (embeddings): Embedding(10000, 100)
  (fc1): Linear(in_features=300, out_features=200, bias=True)
  (fc2): Linear(in_features=200, out_features=10000, bias=False)
  (drop): Dropout(p=0.1, inplace=False)
)

## Most Similar Words

In [61]:
word = 'chinga'
indexes = most_similar_to(word, ngram_builder, model.embeddings.weight.detach().cpu().numpy(), 10)
print('Most Similar to: ', bold_string(word))
ngram_builder.inverse(indexes)

Most Similar to:  [1mchinga[0m 


['tata',
 'teléfono',
 'linchar',
 'predicar',
 'rededor',
 'creó',
 'curso',
 'pegara',
 'sufrir',
 'elimino']

In [64]:
word = 'amor'
indexes = most_similar_to(word, ngram_builder, model.embeddings.weight.detach().cpu().numpy(), 10)
print('Most Similar to: ', bold_string(word))
ngram_builder.inverse(indexes)

Most Similar to:  [1mamor[0m 


['puntome',
 'narcotrafico',
 'entender',
 'cool',
 'sobredosis',
 '#milesheizer',
 'clientes',
 'ves',
 'drastico',
 'reparaciones']

In [65]:
word = 'verga'
indexes = most_similar_to(word, ngram_builder, model.embeddings.weight.detach().cpu().numpy(), 10)
print('Most Similar to: ', bold_string(word))
ngram_builder.inverse(indexes)

Most Similar to:  [1mverga[0m 


['clases',
 '#btw',
 'hablen',
 'abdomen',
 'blog',
 'tardas',
 'rayo',
 'cabroncito',
 'inundo',
 'cerebro']

## Perplexity

In [63]:
NGramModel.perplexity(val_documents, use_gpu=use_gpu)

185.78585693298882

# Embeddings Model

## Load Embeddings

In [13]:
class Embeddings:
    def __init__(self, filename):
        self.embeddings = {}
        with open(filename, 'r') as file:
            for line in file:
                values = line.split()
                word, rep = values[0], np.array(list(map(float, values[1:])))
                self.embeddings[word] = rep
                self.d_model = len(rep)
            
    def __getitem__(self, index):
        return self.embeddings.get(index)

In [14]:
embeddings = Embeddings('data/word2vec_col.txt')
embeddings['de']

array([-1.64168 ,  1.447671, -2.283216, -1.965226, -0.222943,  5.105217,
       -0.120701, -0.126822, -3.177338, -3.454396, -0.943083, -0.094476,
       -1.18936 , -0.812092, -2.572975, -0.613877, -2.311841,  1.05097 ,
        5.634725, -5.827006,  1.237639,  1.071621,  3.822072,  2.395414,
        0.169883,  3.256835,  2.897348,  3.274827, -2.936382,  0.272003,
       -1.029505, -2.617288, -1.807143,  1.737624,  0.33913 ,  3.93293 ,
        1.571361, -4.100074,  4.156816,  1.162366, -0.552316, -0.585887,
       -4.767187,  0.253338, -1.124162, -0.115079, -5.606624,  2.976579,
        4.426022,  1.019932,  3.76072 , -2.298347,  4.416567, -1.383988,
       -1.862506,  0.399053, -1.09689 , -2.28599 ,  2.992802,  0.044008,
        3.762375, -6.523126,  0.621278,  2.641829, -1.924327, -1.141184,
       -3.831767,  0.549591,  2.260839, -1.318358, -1.134662, -3.788221,
       -0.775024,  3.956695, -3.579425, -4.423733,  4.505686,  0.719133,
       -1.399557,  3.097209,  0.107541,  2.829867, 

In [32]:
ngram_builder = NGramBuilder(embeddings=embeddings)
ngram_docs, ngram_labels = ngram_builder.fit(documents, N=4)

## Hyperparameters

In [33]:
# model hyperparameters
voc_size = ngram_builder.voc_size
N = ngram_builder.N
d_model = ngram_builder.d_model

# optimizer hyperparameters
lr = 2.3e-1 
epochs = 100
patience = epochs//5

# scheduler hyperparameters
lr_patience = 10
lr_factor = 0.5

# gpu available?
use_gpu = torch.cuda.is_available()

# build model and move to gpu if possible
model = BengioModel(N=N, voc_size=voc_size, d_model=d_model, hidden_size=200, emb_mat=ngram_builder.emb_matrix)
if use_gpu:
    model = model.cuda()
    
optimizer = torch.optim.SGD(model.parameters(), lr=lr)
scheduler = torch.optim.lr_scheduler.ReduceLROnPlateau(
                optimizer,
                'min',
                patience = lr_patience,
                verbose=True,
                factor = lr_factor
            )

criterion = nn.CrossEntropyLoss()

## Train Embeddings Model

In [35]:
best_metric = 0
last_metric = 0
val_metrics = []
counter = 0

for epoch in range(epochs):
    print('epoch: ', 1 + epoch)
    epoch_metrics = []
    epoch_losses = []
    for inputs, targets in train_loader:
        if use_gpu:
            inputs = inputs.cuda()
            targets = targets.cuda()

        # feed model and get loss
        output = model(inputs)
        loss = criterion(output, targets)
        epoch_losses.append(loss.item())

        # metric with train dataset
        preds = get_preds(output)
        epoch_metrics.append(accuracy(preds, targets.cpu().numpy()))

        # step to optimize 
        optimizer.zero_grad()
        loss.backward()
        optimizer.step()

    # close for each step

    # get metric for training set
    train_acc = np.mean(epoch_metrics)
    val_acc = eval_model(val_loader, model, use_gpu)
    val_metrics.append(val_acc)

    # print metrics
    print('train accuracy mean: ', train_acc)
    print('validation accuracy: ', val_acc)
    print('mean loss: ', np.mean(epoch_losses))

    # store model if necessary
    state = {
                'epoch' : epoch + 1,
                'optimizer': optimizer.state_dict(),
                'model': model.state_dict(),
                'scheduler': scheduler.state_dict(),
                'best_metric': best_metric
            }
    checkpoint(state, 'embeddings_best_model', val_acc, best_metric)

    # patience and last_metric and best_metric update
    last_metric = val_acc
    counter = counter + 1 if last_metric <= best_metric else 0
    best_metric = val_acc if val_acc > best_metric else best_metric

    # check if patience run out
    if counter >= patience:
        break
# close for each epoch

epoch:  1
train accuracy mean:  0.09645589192708333
validation accuracy:  0.11126760563380282
mean loss:  6.4376890966668725
Storing best model to embeddings_best_model. Current acc: 0.11126760563380282, last best metric: 0
epoch:  2
train accuracy mean:  0.11189923967633929
validation accuracy:  0.12929577464788733
mean loss:  5.913514082009594
Storing best model to embeddings_best_model. Current acc: 0.12929577464788733, last best metric: 0.11126760563380282
epoch:  3
train accuracy mean:  0.11773778521825397
validation accuracy:  0.13483568075117372
mean loss:  5.65478578582406
Storing best model to embeddings_best_model. Current acc: 0.13483568075117372, last best metric: 0.12929577464788733
epoch:  4
train accuracy mean:  0.12334284706721231
validation accuracy:  0.1031924882629108
mean loss:  5.421841428615153
epoch:  5
train accuracy mean:  0.12587629045758927
validation accuracy:  0.11192488262910798
mean loss:  5.222658311327298
epoch:  6
train accuracy mean:  0.13140966021825

## Load Model

In [34]:
load_state = torch.load('embeddings_best_model')
model.load_state_dict(load_state['model'])
model.eval()

BengioModel(
  (embeddings): Embedding(10000, 100)
  (fc1): Linear(in_features=300, out_features=200, bias=True)
  (fc2): Linear(in_features=200, out_features=10000, bias=False)
  (drop): Dropout(p=0.1, inplace=False)
)

In [35]:
NGramModel = NGramNeuralModel(ngram_builder, model)

## Generate Sequence

In [22]:
seq = NGramModel.generate_sequence('vete a la'.split(), use_gpu=use_gpu)
print_doc(seq)

['vete', 'a', 'la', 'verga', '🤔', '</s>']

In [23]:
seq = NGramModel.generate_sequence('chinga a tu '.split(), use_gpu=use_gpu)
print_doc(seq)

['chinga', 'a', 'tu', 'madre', 'mátenme', '</s>']

In [31]:
seq = NGramModel.generate_sequence('estoy a punto'.split(), use_gpu=use_gpu)
print_doc(seq)

estoy a punto vergazos este capítulo de como 💩 haz … <unk> a unos putos putos 


## Likelihood

In [34]:
NGramModel.estimate_prob('voy a estar en mi casa', use_gpu=use_gpu)

-12.998171

In [35]:
NGramModel.estimate_prob('voy a en estar mi casa', use_gpu=use_gpu)

-19.667858

In [36]:
NGramModel.estimate_prob('chinga a tu madre', use_gpu=use_gpu)

-0.023245202

In [41]:
NGramModel.estimate_prob('chinga a tu padre', use_gpu=use_gpu)

-9.934837

In [37]:
NGramModel.estimate_prob('a madre tu chinga', use_gpu=use_gpu)

-8.591314

## Permutations

In [42]:
test_structures('vas a chingar a tu madre'.split(), NGramModel)

a vas chingar a tu madre
likelihood:  -4.3084855

vas a chingar a tu madre
likelihood:  -5.482989

vas madre a chingar a tu
likelihood:  -7.2738085

chingar a tu madre vas a
likelihood:  -7.515975

vas a chingar tu madre a
likelihood:  -7.5483837

tu vas madre a chingar a
likelihood:  -7.9070125

tu madre vas a chingar a
likelihood:  -7.93376

a chingar tu madre vas a
likelihood:  -7.964194

a vas a chingar tu madre
likelihood:  -8.242043

a chingar vas a tu madre
likelihood:  -8.520243

a vas chingar tu madre a
likelihood:  -8.681716

a madre vas a chingar tu
likelihood:  -8.682941

madre a vas a chingar tu
likelihood:  -8.961501

vas chingar a tu madre a
likelihood:  -9.287172

madre tu vas a chingar a
likelihood:  -9.424377

a chingar vas tu madre a
likelihood:  -9.499789

a vas tu madre chingar a
likelihood:  -9.680814

vas tu madre a chingar a
likelihood:  -9.72591

a vas tu madre a chingar
likelihood:  -9.936181

madre tu chingar a vas a
likelihood:  -10.059231

chingar a vas a t

# Most Similar Words

In [41]:
def most_similar_to(word, ngram_builder, embeddings, N):
    # get word id
    word_id = ngram_builder.get_ids([word])[0]
    word_rep = embeddings[word_id]
    # get norms to normalize later
    embeddings_norm = np.linalg.norm(embeddings, axis=1)
    word_norm = embeddings_norm[word_id]
    # sim distance
    distances = np.dot(word_rep, embeddings.T)
    # normalize distances (cos distance)
    distances = np.squeeze(distances/(word_norm * embeddings_norm))
    
    # most similar word is surely the word itself, so ignore the most similar
    return np.argsort(distances)[-(N+1):-1]

In [51]:
word = 'chinga'
indexes = most_similar_to(word, ngram_builder, model.embeddings.weight.detach().cpu().numpy(), 10)
print('Most Similar to: ', bold_string(word))
ngram_builder.inverse(indexes)

Most Similar to:  [1mchinga[0m 


['mentarte',
 'reputisima',
 'concha',
 'chingue',
 'put',
 'reputa',
 'putisima',
 'chingar',
 'chiga',
 'chingas']

In [52]:
word = 'amor'
indexes = most_similar_to(word, ngram_builder, model.embeddings.weight.detach().cpu().numpy(), 10)
print('Most Similar to: ', bold_string(word))
ngram_builder.inverse(indexes)

Most Similar to:  [1mamor[0m 


['llanto',
 'hombre',
 'pensamiento',
 'desprecio',
 'orgullo',
 'sufrimiento',
 'alma',
 'corazon',
 'corazón',
 'cariño']

In [53]:
word = 'verga'
indexes = most_similar_to(word, ngram_builder, model.embeddings.weight.detach().cpu().numpy(), 10)
print('Most Similar to: ', bold_string(word))
ngram_builder.inverse(indexes)

Most Similar to:  [1mverga[0m 


['berga',
 'caca',
 'fregada',
 'vergaaaaaa',
 'ñonga',
 'mierda',
 'chingada',
 'pija',
 'verg',
 'vrg']

## Cos Distance Among all Data

In [45]:
def cos_distance(data):
    N = len(data)
    distances = np.zeros((N, N))
    magnitudes = np.linalg.norm(data, axis=1)
    
    for i in range(N):
        for j in range(i+1):
            distances[i, j] = np.dot(data[i], data[j])/(magnitudes[i] * magnitudes[j])
            if i != j:
                distances[j, i] = distances[i, j]
    
    return distances

In [46]:
def get_most_similar(dist_matrix, n):
    N = len(dist_matrix)
    
    # get indexes of elements to be compared. dist_matrix should be symmetric, so we dont need to consider each pair of distances twice
    indexes = [(i,j) for i in range(N) for j in range(i+1) if i!=j]

    # get x and y indexes
    x_indexes = tuple([ind[0] for ind in indexes])
    y_indexes = tuple([ind[1] for ind in indexes])
    
    # get values of matrix
    row_max = dist_matrix[x_indexes, y_indexes]
    
    # desc sort elements retrieved and get their positions
    max_elements = np.flip(np.argsort(row_max))[:n]
    
    # return indexes in positions retrieved in previous step
    return [indexes[max_index] for max_index in max_elements]

## Get Most Similar among all words

In [47]:
dist_matrix = cos_distance(model.embeddings.weight.detach().cpu().numpy())

In [48]:
similar = get_most_similar(dist_matrix, 10)
similar = [list(pair) for pair in similar]
ngram_builder.inverse(similar)

[['goooooool', 'gooooool'],
 ['jajajajajajajajajaja', 'jajajajajajajajaja'],
 ['goooool', 'gooooool'],
 ['jajajajajajajajaja', 'jajajajajajajaja'],
 ['goooool', 'gooool'],
 ['goooooool', 'goooool'],
 ['jajajajajajajaja', 'jajajajajajaja'],
 ['jajajajajajajajajajaja', 'jajajajajajajajajaja'],
 ['jajajajajajaja', 'jajajajajaja'],
 ['yaaaaaa', 'yaaaaa']]

## Perplexity

In [32]:
NGramModel.perplexity(val_documents, use_gpu=use_gpu)

258.8639973177619

# Ejercicio 3

In [42]:
class BengioModel(nn.Module):
    def __init__(self, N, voc_size, d_model, hidden_size=128, emb_mat=None, dropout=0.1):
        
        super(BengioModel, self).__init__()
        # parameters
        self.N           = N
        self.d_model     = d_model
        self.voc_size    = voc_size
        self.hidden_size = hidden_size
        
        # Matriz entrenable de embeddings, tamaño vocab_size x Ngram.d_model
        self.embeddings = nn.Embedding.from_pretrained(torch.FloatTensor(emb_mat), freeze=False)
        
        # fully connected layers
        self.fc1 = nn.Linear(d_model * (N-1), hidden_size)
        self.fc2 = nn.Linear(hidden_size, voc_size, bias=False)
        self.W = nn.Linear(d_model * (N-1), voc_size, bias=False)
        
        # dropout
        self.drop = nn.Dropout(dropout)
        
    
    def forward(self, input_seq):
        # Calcula el embedding para cada palabra.
        x = self.embeddings(input_seq)
        x = x.view(-1, (self.N-1) * self.d_model)
        direct_link = self.W(x)
        x = self.fc1(x)
        x = self.drop(x)
        x = F.relu(x)
        x = self.fc2(x)
        return x + direct_link

## Hyperparameters

In [44]:
ngram_builder = NGramBuilder(embeddings=embeddings)
ngram_docs, ngram_labels = ngram_builder.fit(documents, N=4)

In [45]:
# model hyperparameters
voc_size = ngram_builder.voc_size
N = ngram_builder.N
d_model = ngram_builder.d_model

# optimizer hyperparameters
lr = 2.3e-1 
epochs = 100
patience = epochs//5

# scheduler hyperparameters
lr_patience = 10
lr_factor = 0.5

# gpu available?
use_gpu = torch.cuda.is_available()

# build model and move to gpu if possible
model = BengioModel(N=N, voc_size=voc_size, d_model=d_model, hidden_size=200, emb_mat=ngram_builder.emb_matrix)
if use_gpu:
    model = model.cuda()
    
optimizer = torch.optim.SGD(model.parameters(), lr=lr)
scheduler = torch.optim.lr_scheduler.ReduceLROnPlateau(
                optimizer,
                'min',
                patience = lr_patience,
                verbose=True,
                factor = lr_factor
            )

criterion = nn.CrossEntropyLoss()

In [46]:
model.embeddings.weight[ngram_builder.word2id['de']]

tensor([-1.6417,  1.4477, -2.2832, -1.9652, -0.2229,  5.1052, -0.1207, -0.1268,
        -3.1773, -3.4544, -0.9431, -0.0945, -1.1894, -0.8121, -2.5730, -0.6139,
        -2.3118,  1.0510,  5.6347, -5.8270,  1.2376,  1.0716,  3.8221,  2.3954,
         0.1699,  3.2568,  2.8973,  3.2748, -2.9364,  0.2720, -1.0295, -2.6173,
        -1.8071,  1.7376,  0.3391,  3.9329,  1.5714, -4.1001,  4.1568,  1.1624,
        -0.5523, -0.5859, -4.7672,  0.2533, -1.1242, -0.1151, -5.6066,  2.9766,
         4.4260,  1.0199,  3.7607, -2.2983,  4.4166, -1.3840, -1.8625,  0.3991,
        -1.0969, -2.2860,  2.9928,  0.0440,  3.7624, -6.5231,  0.6213,  2.6418,
        -1.9243, -1.1412, -3.8318,  0.5496,  2.2608, -1.3184, -1.1347, -3.7882,
        -0.7750,  3.9567, -3.5794, -4.4237,  4.5057,  0.7191, -1.3996,  3.0972,
         0.1075,  2.8299, -0.7602, -0.2772, -1.2257,  2.1579,  0.5184,  3.4380,
        -1.0875, -1.5299,  0.3995,  0.8874, -1.9949,  2.9122,  3.2580, -2.6000,
        -1.9951, -0.1765,  3.9469, -2.79

## Train Embeddings Model

In [47]:
best_metric = 0
last_metric = 0
val_metrics = []
counter = 0

for epoch in range(epochs):
    print('epoch: ', 1 + epoch)
    epoch_metrics = []
    epoch_losses = []
    for inputs, targets in train_loader:
        if use_gpu:
            inputs = inputs.cuda()
            targets = targets.cuda()

        # feed model and get loss
        output = model(inputs)
        loss = criterion(output, targets)
        epoch_losses.append(loss.item())

        # metric with train dataset
        preds = get_preds(output)
        epoch_metrics.append(accuracy(preds, targets.cpu().numpy()))

        # step to optimize 
        optimizer.zero_grad()
        loss.backward()
        optimizer.step()

    # close for each step

    # get metric for training set
    train_acc = np.mean(epoch_metrics)
    val_acc = eval_model(val_loader, model, use_gpu)
    val_metrics.append(val_acc)

    # print metrics
    print('train accuracy mean: ', train_acc)
    print('validation accuracy: ', val_acc)
    print('mean loss: ', np.mean(epoch_losses))

    # store model if necessary
    state = {
                'epoch' : epoch + 1,
                'optimizer': optimizer.state_dict(),
                'model': model.state_dict(),
                'scheduler': scheduler.state_dict(),
                'best_metric': best_metric
            }
    checkpoint(state, 'dirlink_best_model', val_acc, best_metric)

    # patience and last_metric and best_metric update
    last_metric = val_acc
    counter = counter + 1 if last_metric <= best_metric else 0
    best_metric = val_acc if val_acc > best_metric else best_metric

    # check if patience run out
    if counter >= patience:
        break
# close for each epoch

epoch:  1
train accuracy mean:  0.07270352802579365
validation accuracy:  0.06816901408450704
mean loss:  9.02942226951321
Storing best model to dirlink_best_model. Current acc: 0.06816901408450704, last best metric: 0
epoch:  2
train accuracy mean:  0.087799072265625
validation accuracy:  0.06938967136150234
mean loss:  7.408335320651531
Storing best model to dirlink_best_model. Current acc: 0.06938967136150234, last best metric: 0.06816901408450704
epoch:  3
train accuracy mean:  0.10818626767113095
validation accuracy:  0.07549295774647888
mean loss:  6.580813377474745
Storing best model to dirlink_best_model. Current acc: 0.07549295774647888, last best metric: 0.06938967136150234
epoch:  4
train accuracy mean:  0.13936505998883927
validation accuracy:  0.07971830985915493
mean loss:  6.044397950793306
Storing best model to dirlink_best_model. Current acc: 0.07971830985915493, last best metric: 0.07549295774647888
epoch:  5
train accuracy mean:  0.16627139136904762
validation accura

In [48]:
model.embeddings.weight[ngram_builder.word2id['de']]

tensor([-1.0190,  1.2483, -0.7047, -0.7333, -0.0585,  1.7611, -0.0795, -0.7876,
        -0.5539, -1.0019, -0.2009, -0.3094, -0.8989, -0.5496, -0.8654, -0.6820,
        -0.3051,  0.2228,  2.1209, -2.1065,  0.6612,  0.7110,  1.2170,  0.4393,
        -0.1922,  0.6692,  1.3587,  1.2519, -1.1299,  0.3805, -0.9622, -0.5148,
        -0.9844,  0.7157, -0.2246,  2.2004,  0.4344, -1.5050,  1.8277,  0.2176,
         0.4949, -0.3045, -1.7985, -0.2959, -0.9041,  0.5150, -1.7921,  0.8672,
         3.4313,  0.4163,  0.8759, -0.5347,  1.3212, -1.1592, -0.9640, -0.8578,
         0.1290, -0.3243,  0.9489,  0.2504,  1.6267, -1.8357, -0.1039,  1.0557,
        -0.0244, -0.7485, -1.3023, -0.5901,  0.4038, -0.1492,  0.2911, -2.1364,
        -0.4225,  1.5543, -1.0871, -2.0006,  2.5803, -0.4677, -0.4921, -0.0424,
         0.1109,  1.0093, -0.8610,  0.0930, -0.2415,  0.3658, -0.0446,  0.8197,
        -0.3869, -0.7651,  0.7372,  0.1834, -0.4229,  1.2800,  0.9234, -0.3022,
        -1.0632,  0.7113,  0.9043, -1.06

In [81]:
NGramModel = NGramNeuralModel(ngram_builder, model)

In [90]:
tokens = 'chinga a tu madre'.split()
test_structures(tokens, NGramModel)

a chinga tu madre
likelihood:  -0.02420224

chinga a tu madre
likelihood:  -0.077396

madre a chinga tu
likelihood:  -0.5944432

a madre chinga tu
likelihood:  -0.95215386

madre chinga a tu
likelihood:  -1.2357066

a tu chinga madre
likelihood:  -1.5082778

chinga tu a madre
likelihood:  -2.3956115

tu madre chinga a
likelihood:  -3.570664

chinga madre a tu
likelihood:  -3.5873508

a chinga madre tu
likelihood:  -4.7002716

tu chinga madre a
likelihood:  -4.8660855

tu chinga a madre
likelihood:  -4.930147

chinga a madre tu
likelihood:  -5.6753926

madre tu chinga a
likelihood:  -5.90559

chinga tu madre a
likelihood:  -8.489605

madre tu a chinga
likelihood:  -9.7399435

tu a chinga madre
likelihood:  -12.767059

a tu madre chinga
likelihood:  -13.23884

chinga madre tu a
likelihood:  -14.129946

tu a madre chinga
likelihood:  -16.036716

tu madre a chinga
likelihood:  -18.482548

madre a tu chinga
likelihood:  -21.813423

madre chinga tu a
likelihood:  -24.532393

a madre tu ching