# T5 - Juan Luis Baldelomar Cabrera

In [182]:
# os
import random

# NLP and numpy
import nltk 
import numpy as np
import nltk
from nltk.probability import FreqDist
from nltk import TweetTokenizer
from nltk.corpus import stopwords
import pandas as pd

# torch
import torch
from torch.utils.data import DataLoader, TensorDataset
from torch import nn
from torch.nn import functional as F

# metrics
from sklearn.metrics import accuracy_score as accuracy

In [45]:
seed = 1111
random.seed(seed)
np.random.seed(seed)
torch.manual_seed(seed)
torch.backends.cudnn.benchmark = False

In [132]:
def load_data(filename, labels_filename):
    file = open(filename, 'r')
    labels_file = open(labels_filename, 'r')
    tweets = file.read()
    labels = labels_file.read()
    documents = tweets.split('\n')
    labels = labels.split('\n')
    documents.pop(-1)
    labels.pop(-1)
    file.close()
    labels_file.close()
    return documents, labels

In [133]:
documents, labels = load_data('data/mex_train.txt', 'data/mex_train_labels.txt')
val_documents, val_labels = load_data('data/mex_val.txt', 'data/mex_val_labels.txt')

In [23]:
def print_doc(doc:list, end=' ', stop=-1):
    stop = len(doc) if stop is None else stop
    for token in doc[:stop]:
        print(token, end=end)
    print('')

In [4]:
def get_vocabulary(tokenized_docs, n):
    tokens = [token for doc in tokenized_docs for token in doc]
    unique_tokens = FreqDist(tokens).most_common(n)
    return [token for token, _ in unique_tokens]

def word2ids(vocabulary):
    word2id = {}
    id2word = {}
    
    # build both dictionaries
    for i, word in enumerate(vocabulary):
        word2id[word] = i
        id2word[i] = word
    
    # add special tokens
    n = len(word2id)
    word2id['<s>']   = n 
    word2id['</s>']  = n + 1
    word2id['<unk>'] = n + 2
    id2word[n]       = '<s>'
    id2word[n + 1]   = '</s>'
    id2word[n + 2]   = '<unk>'
    
    return word2id, id2word

In [169]:
class NGramBuilder:
    def __init__(self, tokenizer=None, embeddings=None, d_model=256, sos='<s>', eos='</s>', unk='<unk>', punctuation=None, postprocess=None):
        self.tokenizer = self.default_tokenizer() if tokenizer == None else tokenizer
        self.embeddings = embeddings
        self.d_model = d_model
        # special symbols
        self.SOS = sos
        self.EOS = eos
        self.UNK = unk
        # vocabulary 2 id and viceversa
        self.word2id  = None
        self.id2word  = None
        self.voc_size = 0
        # post tokenization functions
        self.punctuation = set(punctuation) if punctuation != None else None
        self.postprocess = postprocess if postprocess is not None else lambda x : x
        
    def default_tokenizer(doc):
        return TweetTokenizer().tokenize
    
    def get_vocabulary(self):
        return set(self.word2id.keys())
    
    def remove_punct(self, tokenized_documents):
        if self.punctuation == None:
            return tokenized_documents
        else:
            return [[token for token in doc if token not in self.punctuation] for doc in tokenized_documents]
        
    def _transform(self, tokenized_docs):
        N = self.N
        # docs and labels lists
        ngram_docs, ngram_targs = [], []
        # traverse each doc
        for doc in tokenized_docs:
            # add padding
            doc = [self.SOS]*(N - 1)  + doc + [self.EOS]
            # build list of ids from word2id dict and replace with UNK
            unk_id = self.word2id.get(self.UNK, 0)
            ids = [self.word2id.get(word, unk_id) for word in doc]
            
            # traverse each word as center and build ngrams
            for i in range(N-1, len(doc)):    
                ngram_docs.append(ids[i-(N-1): i])
                ngram_targs.append(ids[i])
                
        return np.array(ngram_docs), np.array(ngram_targs)
    
    def _tokenize(self, documents):
        tokenized_docs = [self.tokenizer(doc.lower()) for doc in documents]
        tokenized_docs = self.remove_punct(tokenized_docs)
        tokenized_docs = self.postprocess(tokenized_docs)
        return tokenized_docs
    
    def build_emb_matrix(self):
        dim_v = len(self.word2id)
        if self.embeddings is None:
            self.emb_matrix = np.random.rand(dim_v, self.d_model)
        else:
            self.emb_matrix = np.zeros((dim_v, self.d_model))
            for word in self.word2id.keys():
                if word in self.embeddings:
                    self.emb_matrix = self.embeddings[word]
                
    def fit(self, documents, N, t=10000):
        self.N = N
        # tokenize documents
        tokenized_docs = self._tokenize(documents)
        
        # get vocabulary and word2id and ids2word dicts
        vocabulary = get_vocabulary(tokenized_docs, t-3)
        self.word2id, self.id2word = word2ids(vocabulary)
        self.voc_size = len(self.word2id)
        self.build_emb_matrix()
        
        return self._transform(tokenized_docs)
    
    def transform(self, documents):
        # tokenize, get vocabulary and word2id and ids2word dicts
        tokenized_docs = self._tokenize(documents)
        return self._transform(tokenized_docs)
    
    def inverse(self, docs_as_ids):
        return [self.id2word.get(tok_id) for tok_id in docs_as_ids]

In [170]:
ngram_builder = NGramBuilder()
ngram_docs, ngram_labels = ngram_builder.fit(documents, N=4)
val_ngram_docs, val_ngram_labels = ngram_builder.transform(val_documents)

In [171]:
ngram_builder.emb_matrix.shape

(10000, 256)

In [172]:
doc = ngram_builder.inverse(ngram_labels)
print_doc(doc[:30])

lo peor de todo es que no me dan por un tiempo y luego vuelven estoy hasta la verga de estl </s> a la vga no seas mamón 45 


In [137]:
ngram_docs[:10]

array([[10000, 10000, 10000],
       [10000, 10000,    28],
       [10000,    28,   282],
       [   28,   282,     1],
       [  282,     1,    59],
       [    1,    59,    17],
       [   59,    17,     0],
       [   17,     0,     6],
       [    0,     6,     7],
       [    6,     7,   315]])

## Char NGram

In [8]:
def char_postprocess(documents):
    return [[c for c in word] for doc in documents for word in doc]        

In [77]:
ngram_builder = NGramBuilder(postprocess=char_postprocess)
ngram_docs, ngram_labels = ngram_builder.fit(documents, N=4)
val_ngram_docs, val_ngram_labels = ngram_builder.transform(val_documents)

In [79]:
word = ngram_builder.inverse(ngram_labels)
print_doc(word[:100], end='', stop=-1)

lo</s>peor</s>de</s>todo</s>es</s>que</s>no</s>me</s>dan</s>por</s>un</s>tiempo</s>y</s>luego</s>vuelven</s>estoy</s>hasta</s>la</s>verga</s>de</s>estl</s>a</s>la</s>vg


In [81]:
ngram_builder.word2id['</s>']

365

In [80]:
ngram_docs[:10]

array([[364, 364, 364],
       [364, 364,   9],
       [364,   9,   2],
       [364, 364, 364],
       [364, 364,  13],
       [364,  13,   1],
       [ 13,   1,   2],
       [  1,   2,   4],
       [364, 364, 364],
       [364, 364,  10]])

# Dataset

In [175]:
train_ds = TensorDataset(torch.tensor(ngram_docs, dtype=torch.int64), torch.tensor(ngram_labels, dtype=torch.int64))
train_loader = DataLoader(train_ds, shuffle=True, batch_size=64, num_workers=2)

val_ds = TensorDataset(torch.tensor(val_ngram_docs, dtype=torch.int64), torch.tensor(val_ngram_labels, dtype=torch.int64))
val_loader = DataLoader(val_ds, batch_size=64, num_workers=2)

In [163]:
torch.cuda.is_available()

True

In [None]:
list(train_loader)[0][0].cuda()

# Neural Language Model

In [160]:
class BengioModel(nn.Module):
    def __init__(self, N, voc_size, d_model, hidden_size=128, emb_mat=None, dropout=0.1):
        
        super(BengioModel, self).__init__()
        # parameters
        self.N           = N
        self.d_model     = d_model
        self.voc_size    = voc_size
        self.hidden_size = hidden_size
        
        # Matriz entrenable de embeddings, tamaño vocab_size x Ngram.d_model
        self.embeddings = nn.Embedding.from_pretrained(torch.FloatTensor(emb_mat), freeze=False)
        
        # fully connected layers
        self.fc1 = nn.Linear(d_model * (N-1), hidden_size)
        self.fc2 = nn.Linear(hidden_size, voc_size)
        
        # dropout
        self.drop = nn.Dropout(dropout)
        
    
    def forward(self, input_seq):
        # Calcula el embedding para cada palabra.
        x = self.embeddings(input_seq)
        x = x.view(-1, (self.N-1) * self.d_model)
        x = self.fc1(x)
        x = self.drop(x)
        x = F.relu(x)
        x = self.fc2(x)
        return x

In [164]:
def get_preds(raw_logit):
    probs = F.softmax(raw_logit.detach(), dim=1)
    y_pred = torch.argmax(probs, dim=1).cpu().numpy()
    return y_pred

## Test Model Forward

In [174]:
voc_size = ngram_builder.voc_size
N = ngram_builder.N
d_model = ngram_builder.d_model

model = BengioModel(N=N, voc_size=voc_size, d_model=d_model, emb_mat=ngram_builder.emb_matrix)

In [176]:
inputs, targs = list(train_loader)[0]

In [180]:
get_preds(model(inputs))

array([7311, 1509, 8174, 9097, 5687, 7311, 3779, 1184, 6567,  844,  692,
       2254,  778, 9837, 3779, 5687, 4098, 9837, 3779, 3779, 9837, 3823,
        778, 7311, 1184, 1623, 6604,  692,  692,  692, 5158, 3885, 3885,
       7311, 4908, 1961, 3885,  692,  692, 6122, 4098, 4098,  775, 7311,
        692,  310, 6604,  778,  692, 6604, 9544, 3779, 3779,  692, 7311,
       7311, 3779, 3779, 5687, 9837,  692,  692, 7788,  692])

# Eval

In [184]:
def eval_model(data, model, gpu=False):
    preds, targets = [], []
    with torch.no_grad():
        for inputs, labels in data:
            if gpu:
                # move inputs to gpu
                inputs = inputs.cuda()
            
            # compute output predictions    
            output = model(inputs)
            batch_preds = get_preds(output)
            # append preds and targets
            preds.append(batch_preds)
            targets.append(labels.numpy())
    
    # remove batch dimension
    preds = [p for batch_pred in preds for p in batch_pred]
    targets = [t for batch_tar in targets for t in batch_tar]
    return accuracy(preds, targets)

In [None]:
def checkpoint(state, path, is_best):
    if is_best:
        torch.save(state, path)

# Hyperparameters

In [166]:
criterion = nn.CrossEntropyLoss()

# Train Step

In [192]:
lr = 2.3e-1 
epochs = 50
patience = epochs//5

lr_patience = 10
lr_factor = 0.5

# gpu available?
use_gpu = torch.cuda.is_available()

# build model and move to gpu if possible
model = BengioModel(N=N, voc_size=voc_size, d_model=d_model, emb_mat=ngram_builder.emb_matrix)
if use_gpu:
    model = model.cuda()
    
optimizer = torch.optim.SGD(model.parameters(), lr=lr)
scheduler = torch.optim.lr_scheduler.ReduceLROnPlateau(
            optimizer,
            'min',
            patience = lr_patience,
            verbose=True,
            factor = lr_factor
        )

In [193]:
best_metric = 0
best_model = None
last_metric = 0
val_metrics = []
counter = 0

for epoch in range(epochs):
    print('epoch: ', epoch)
    epoch_metrics = []
    for inputs, targets in train_loader:
        if use_gpu:
            inputs = inputs.cuda()
            targets = targets.cuda()
        
        # feed model and get loss
        output = model(inputs)
        loss = criterion(output, targets)
        
        # metric with train dataset
        preds = get_preds(output)
        epoch_metrics.append(accuracy(preds, targets.cpu().numpy()))
            
        # step to optimize 
        optimizer.zero_grad()
        loss.backward()
        optimizer.step()
    
    # get metric for training set
    train_acc = np.mean(epoch_metrics)
    val_acc = eval_model(val_loader, model, use_gpu)
    val_metrics.append(val_acc)
    
    # print metrics
    print('train accuracy mean: ', train_acc)
    print('validation accuracy: ', val_acc)
    
    # patience and last metric update
    counter = counter + 1 last_metric > val_acc else 0
    best_metric = val_acc if val_acc > best_metric else best_metric
    last_metric = val_acc
    
    state = {
            'epoch' : epoch + 1
            'optimizer': optimizer.state_dict()
            'model': model.state_dict()
            'scheduler': scheduler.state_dict()
            'best_metric': best_metric
    }
    
    checkpoint(state, 'best_model', val_acc>best_metric)
    
    
    if counter > patience:
        break    

epoch:  0
accuracy mean:  0.04045678827751196
epoch:  1
epoch:  2
epoch:  3
epoch:  4
epoch:  5
accuracy mean:  0.0975927033492823
epoch:  6
epoch:  7
epoch:  8
epoch:  9
epoch:  10
accuracy mean:  0.12285249700956938
epoch:  11
epoch:  12
epoch:  13
epoch:  14
epoch:  15
accuracy mean:  0.1393596740430622
epoch:  16
epoch:  17
epoch:  18
epoch:  19


In [195]:
eval_model(val_loader, model, use_gpu)

0.1323098154217699