In [44]:
import torch
import torch.nn as nn
import torch.nn.functional as F
import torch.optim as optim
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
import os
import nltk
import shutil
import time

from nltk.corpus import stopwords
from nltk.tokenize import TweetTokenizer
from nltk import FreqDist
from nltk import ngrams
from argparse import Namespace
from typing import Tuple
from sklearn.metrics import r2_score, accuracy_score, f1_score
from torch.utils.data import DataLoader, TensorDataset

In [45]:
seed = 1111
np.random.seed(seed)
torch.manual_seed(seed)
torch.cuda.manual_seed(seed)
torch.backends.cudnn.benchmark = True

In [46]:
X_train = pd.read_csv('./DatasetAgresividad/mex_train.txt', sep='\r\n', engine='python', header=None)[0]
X_test = pd.read_csv('./DatasetAgresividad/mex_val.txt', sep='\r\n', engine='python', header=None)[0]

X_train.head()

0    lo peor de todo es que no me dan por un tiempo...
1    a la vga no seas mamón 45 putos minutos despué...
2    considero que lo más conveniente seria que lo ...
3    el marica de mi ex me tiene bloqueada de todo ...
4    mujer despechadaya pinche amlo hazle esta que ...
Name: 0, dtype: object

In [47]:
class NGramData():

    def __init__(self, N: int, vocab_size: int=5000, tokenizer=None, embedding_model=None):
        self.N = N
        self.vocab_size = vocab_size
        self.tokenizer = tokenizer if tokenizer != None else self.self_tokenizer()
        self.punctuations = set([
            '.', ',', '"', "'", '?', '!', ':', ';', '(', ')', '[', ']', '{', '}','!', '?', '¡', '¿', '💛', '😡''🎵', '...', '😰', ''
        ])
        self.stopwords = set(stopwords.words('spanish'))
        self.embedding_model = embedding_model
        self.UNK = '<unk>'
        self.SOS = '<s>'
        self.EOS = '</s>'
    
    def fit(self, X_train):
        self.vocab = self.get_vocab(X_train)
        self.word2idx = {word: idx for idx, word in enumerate(self.vocab)}
        self.idx2word = {idx: word for idx, word in enumerate(self.vocab)}

        if self.embedding_model is not None:
            self.embedding_matrix = self.get_embedding_matrix()

    def get_embedding_matrix(self) -> np.ndarray:
        embedding_matrix = np.zeros((self.vocab_size, self.embedding_model.vector_size))
        for word in self.vocab:
            if word in self.embedding_model:
                embedding_matrix[self.word2idx[word]] = self.embedding_model[word]
        return embedding_matrix 

    def get_vocab(self, corpus: list) -> set:
        fdist = FreqDist(
            [
                word.lower() 
                for sentence in corpus 
                    for word in self.tokenizer(sentence) 
                        if not self.remove_word(word)
            ]
        )
        fdist = dict(fdist)
        fdist = sorted(fdist, key=fdist.get, reverse=True)
        vocab = fdist[:self.vocab_size]
        vocab = set([self.SOS, self.EOS, self.UNK] + vocab)
        return set(vocab)
    
    def remove_word(self, word: str) -> bool:
        word = word.lower() 
        return word.isnumeric() or word in self.punctuations

    def self_tokenizer(self, doc: str) -> list:
        return doc.split(' ')
    
    def transform(self, corpus: list) -> Tuple[np.ndarray, np.ndarray]:
        X_ngrams, y = [], []
        for doc in corpus:
            doc_ngram = self.get_ngram_doc(doc)
            for word_window in doc_ngram:
                word_window_ids = [
                    self.word2idx[word] 
                    for word in word_window]
                X_ngrams.append(list(word_window_ids[:-1]))
                y.append(word_window_ids[-1])
        return np.array(X_ngrams), np.array(y)
    
    def get_ngram_doc(self, doc: str):
        doc_token = self.tokenizer(doc)
        doc_token = self.replace_unk(doc_token)
        doc_token = [word.lower() for word in doc_token]
        doc_token = [self.SOS] * (self.N-1) + doc_token + [self.EOS]
        return list(ngrams(doc_token, self.N))
    
    def replace_unk(self, doc_token: list) -> list:
        return [word if word in self.vocab else self.UNK for word in doc_token]
    
    def get_vocab_size(self) -> int:
        return len(self.vocab)

In [48]:
args = Namespace()
args.N = 4

tk = TweetTokenizer()
ngram_data = NGramData(args.N, 5000, tk.tokenize)
ngram_data.fit(X_train)

len(ngram_data.vocab)

5003

In [49]:
X_ngram_train, y_ngram_train = ngram_data.transform(X_train)
X_ngram_test, y_ngram_test = ngram_data.transform(X_test)   
X_ngram_train, y_ngram_train

(array([[ 616,  616,  616],
        [ 616,  616, 2183],
        [ 616, 2183, 3387],
        ...,
        [1648, 1472, 2701],
        [1472, 2701, 2337],
        [2701, 2337, 1579]]),
 array([2183, 3387,  380, ..., 2337, 1579, 4981]))

In [50]:
args.batch_size = 32
args.epochs = 100
args.lr = 2.3e-1
args.device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
args.num_workers = 0

args.vocab_size = ngram_data.get_vocab_size()
args.embedding_dim = 30
args.hidden_dim = 64
args.num_layers = 2
args.dropout = 0.25

train_dataset = TensorDataset(
    torch.tensor(X_ngram_train, dtype=torch.int64),
    torch.tensor(y_ngram_train, dtype=torch.int64),
)

train_loader = DataLoader(
    dataset=train_dataset,
    batch_size=args.batch_size,
    shuffle=True,
    num_workers=args.num_workers,
)

test_dataset = TensorDataset(
    torch.tensor(X_ngram_test, dtype=torch.int64),
    torch.tensor(y_ngram_test, dtype=torch.int64),
)

test_loader = DataLoader(
    dataset=test_dataset,
    batch_size=args.batch_size,
    shuffle=False,
    num_workers=args.num_workers,
)


In [51]:
class NeuralLM(nn.Module):

    def __init__(self, args):
        super(NeuralLM, self).__init__()
        self.args = args
        self.window_size = args.N-1
        self.embedding_size = args.embedding_dim
        self.hidden_size = args.hidden_dim

        self.embedding = nn.Embedding(
            num_embeddings=args.vocab_size,
            embedding_dim=args.embedding_dim,
        )
        self.fc1 = nn.Linear(
            self.embedding_size * self.window_size,
            self.hidden_size,
        )
        self.fc2 = nn.Linear(
            self.hidden_size,
            args.vocab_size,
            bias=False
        )
        self.dropout = nn.Dropout(args.dropout)

    def forward(self, x):
        x = self.embedding(x)
        x = x.view(-1, self.embedding_size * self.window_size)
        x = self.dropout(x)
        x = F.relu(self.fc1(x))
        x = self.fc2(x)
        return x


In [52]:
def predict(x):
    with torch.no_grad():
        probs = F.softmax(x.detach(), dim=1)
        return torch.argmax(probs, dim=1).cpu().numpy()

def model_eval(data, model, gpu=False):
    with torch.no_grad():
        preds, targets = [], []
        for window_words, labels in data:
            if gpu:
                window_words = window_words.cuda()
            output = model(window_words)
            pred = predict(output)
            target = labels.numpy()
            preds.extend(pred)
            targets.extend(target)
    return accuracy_score(targets, preds)

def model_eval_f1(data, model, gpu=False):
    with torch.no_grad():
        preds, targets = [], []
        for window_words, labels in data:
            if gpu:
                window_words = window_words.cuda()
            output = model(window_words)
            pred = predict(output)
            target = labels.numpy()
            preds.extend(pred)
            targets.extend(target)
    return f1_score(targets, preds, average='macro')

def save_checkpoint(state, is_best, ckp_path, filename):
    torch.save(state, os.path.join(ckp_path, filename))
    if is_best:
        shutil.copyfile(
            os.path.join(ckp_path, filename),
            os.path.join(ckp_path, 'model_best.pth.tar')
        )

In [53]:
args.patience = 20
args.lr_patience = 10
args.lr_factor = 0.5
args.ckp_path = 'checkpoints'
args.model_name = 'ngram_lm.pt'
args.savedir = 'model'

if not os.path.exists(args.ckp_path):
    os.makedirs(args.ckp_path)
            
model = NeuralLM(args)

if args.device.type == 'cuda':
    model = model.cuda()

criterion = nn.CrossEntropyLoss()
optimizer = optim.SGD(model.parameters(), lr=args.lr)
scheduler = optim.lr_scheduler.ReduceLROnPlateau(
    optimizer,
    mode='min',
    factor=args.lr_factor,
    patience=args.lr_patience,
    verbose=True,
)

In [54]:
if os.path.exists(os.path.join(args.ckp_path, args.model_name)):
    checkpoint = torch.load(os.path.join(args.ckp_path, args.model_name))
    model.load_state_dict(checkpoint['state_dict'])
    optimizer.load_state_dict(checkpoint['optimizer'])
    scheduler.load_state_dict(checkpoint['scheduler'])
    start_epoch = checkpoint['epoch']
    best_loss = checkpoint['loss']
    print('Loaded model from epoch {0}, with best loss {1:.3f}'.format(
        start_epoch, best_loss))
else:
    start_epoch = 1
    best_loss = float('inf')

Loaded model from epoch 100, with best loss 4.517


In [55]:
start_time = time.time()    
best_acc = 0
metric_history = []
train_metric_history = []
for epoch in range(args.epochs):
    epoch_start_time = time.time()
    model.train()
    for window_words, labels in train_loader:
        if args.device.type == 'cuda':
            window_words = window_words.cuda()
            labels = labels.cuda()
        optimizer.zero_grad()
        output = model(window_words)
        loss = criterion(output, labels)
        loss.backward()
        optimizer.step()

    model.eval()
    train_acc = model_eval(train_loader, model, args.device.type == 'cuda')
    train_metric_history.append(train_acc)

    scheduler.step(loss)

    if train_acc > best_acc:
        best_acc = train_acc
        is_best = True
        patience_counter = 0
    else:
        is_best = False
        patience_counter += 1

    if patience_counter >= args.patience:
        print('Early stopping')
        break

    save_checkpoint({
        'epoch': epoch + 1,
        'state_dict': model.state_dict(),
        'best_acc': best_acc,
        'optimizer': optimizer.state_dict(),
        'scheduler': scheduler.state_dict(),
        'loss': loss,
    }, is_best, args.ckp_path, args.model_name)

    epoch_end_time = time.time()
    print(f'Epoch: {epoch+1:02} | Epoch Time: {epoch_end_time-epoch_start_time:.2f}s')
    print(f'\tTrain Loss: {loss:.3f} | Train Acc: {train_acc*100:.2f}%')
    
end_time = time.time()
print(f'Total Training Time: {end_time-start_time:.2f}s')

Epoch: 01 | Epoch Time: 6.34s
	Train Loss: 8.397 | Train Acc: 13.94%
Epoch: 02 | Epoch Time: 6.57s
	Train Loss: 6.564 | Train Acc: 14.92%
Epoch: 03 | Epoch Time: 6.80s
	Train Loss: 6.905 | Train Acc: 15.21%
Epoch: 04 | Epoch Time: 7.12s
	Train Loss: 7.123 | Train Acc: 15.34%
Epoch: 05 | Epoch Time: 7.05s
	Train Loss: 7.103 | Train Acc: 15.36%
Epoch 00107: reducing learning rate of group 0 to 3.5938e-03.
Epoch: 06 | Epoch Time: 7.18s
	Train Loss: 5.047 | Train Acc: 15.40%
Epoch: 07 | Epoch Time: 6.98s
	Train Loss: 7.173 | Train Acc: 15.42%
Epoch: 08 | Epoch Time: 7.01s
	Train Loss: 5.729 | Train Acc: 15.45%
Epoch: 09 | Epoch Time: 7.02s
	Train Loss: 6.025 | Train Acc: 15.67%
Epoch: 10 | Epoch Time: 6.97s
	Train Loss: 4.298 | Train Acc: 15.66%
Epoch: 11 | Epoch Time: 7.05s
	Train Loss: 6.750 | Train Acc: 15.75%
Epoch: 12 | Epoch Time: 7.01s
	Train Loss: 6.388 | Train Acc: 15.76%
Epoch: 13 | Epoch Time: 7.15s
	Train Loss: 4.595 | Train Acc: 15.82%
Epoch: 14 | Epoch Time: 6.50s
	Train Loss

In [56]:
model.eval()
test_acc = model_eval_f1(test_loader, model, args.device.type == 'cuda')
print(f'Test F1: {test_acc}')

Test F1: 0.0011575516149752347


In [57]:
def print_closest_words(embeddings, ngram_data, word, n):
    index = ngram_data.word2idx[word]
    embedding = embeddings[index]
    cosines = np.dot(embeddings, embedding)
    idxs = np.argsort(cosines)[::-1][:n]
    for idx in idxs:
        print(ngram_data.idx2word[idx], cosines[idx])

embeddings = model.embedding.weight.data.cpu().numpy()
print_closest_words(embeddings, ngram_data, 'comida', 10)

comida 21.857433
sistemas 20.6578
aquí 16.491972
jotos 16.309084
obsesionada 15.068522
#panamá 14.623585
profe 13.6933365
voces 13.524374
tico 13.416697
mueras 13.334407


In [58]:
def parse_text(text, tokenizer):
    all_tokens = [ 
        w.lower() if w in ngram_data.word2idx else ngram_data.UNK for w in tokenizer(text)
    ]   
    token_ids = [
        ngram_data.word2idx[w.lower()] for w in all_tokens
    ]
    return all_tokens, token_ids

def generate_sentence(model, initial_text, tokenizer):
    model.eval()
    all_tokens, token_ids = parse_text(initial_text, tokenizer)
    for _ in range(100):
        window_words = torch.tensor(token_ids[-model.window_size:])
        if args.device.type == 'cuda':
            window_words = window_words.cuda()
        output = model(window_words)
        next_id = predict(output)[0]
        all_tokens.append(ngram_data.idx2word[next_id])
        token_ids.append(next_id)
        if all_tokens[-1] == ngram_data.EOS:
            break
    return all_tokens

initial_text = ngram_data.SOS * 3 + 'yo puede ser que'
generated_text = generate_sentence(model, initial_text, ngram_data.tokenizer)
print(' '.join(generated_text))

<s> <s> <s> yo puede ser que <unk> <unk> <unk> <unk> <unk> <unk> <unk> <unk> <unk> <unk> <unk> <unk> <unk> <unk> <unk> <unk> <unk> <unk> <unk> <unk> <unk> <unk> <unk> <unk> <unk> <unk> <unk> <unk> <unk> <unk> <unk> <unk> <unk> <unk> <unk> <unk> <unk> <unk> <unk> <unk> <unk> <unk> <unk> <unk> <unk> <unk> <unk> <unk> <unk> <unk> <unk> <unk> <unk> <unk> <unk> <unk> <unk> <unk> <unk> <unk> <unk> <unk> <unk> <unk> <unk> <unk> <unk> <unk> <unk> <unk> <unk> <unk> <unk> <unk> <unk> <unk> <unk> <unk> <unk> <unk> <unk> <unk> <unk> <unk> <unk> <unk> <unk> <unk> <unk> <unk> <unk> <unk> <unk> <unk> <unk> <unk> <unk> <unk> <unk> <unk>


In [65]:
def log_likelihood(model, text, ngram_model):
    model.eval()
    X, y = ngram_data.transform([text])
    X, y = X[2:], y[2:]
    X = torch.tensor(X).unsqueeze(0)
    y = torch.tensor(y).unsqueeze(0)
    if args.device.type == 'cuda':
        X = X.cuda()
        y = y.cuda()
    output = model(X)
    probs = F.softmax(output, dim=1).cpu().detach().numpy()

    return np.sum([
        np.log(
            probs[i][w]
        )
        for i, w in enumerate(y.cpu())
    ])

log_likelihood(model, '<s> <s> <s> estoy comiendo un', ngram_data)

-9.493666

In [67]:
from itertools import permutations
from random import shuffle

word_list = 'si no gano me voy a la chingada'.split(' ')
perms = [ ' '.join(perm) for perm in permutations(word_list)]

perms.sort(key=lambda x: log_likelihood(model, '<s> <s> <s> ' + x, ngram_data), reverse=True)

for perm in perms[:5]:
    print(perm, log_likelihood(model, '<s> <s> <s> ' + perm, ngram_data))
print('...')
for perm in perms[-5:]:
    print(perm, log_likelihood(model, '<s> <s> <s> ' + perm, ngram_data))

si no gano me voy a la chingada -59.223415
si no gano me voy a chingada la -59.223415
si no gano me voy la a chingada -59.223415
si no gano me voy la chingada a -59.223415
si no gano me voy chingada a la -59.223415
...
chingada la a no si voy gano me -59.223423
chingada la a gano voy si no me -59.223423
chingada la a gano voy no si me -59.223423
chingada la a voy gano si no me -59.223423
chingada la a voy gano no si me -59.223423
