# In [3]:
#####################################################################################################

# WordVectors.py

import pandas as pd
import torch
from torch.utils.data import Dataset, DataLoader, sampler
import torch.nn as nn
import numpy as np

import itertools

class Vocabulary(object):
    """Bidirectional token <-> index mapping with per-token frequencies.

    Indices are consecutive integers starting at 0. Iterating over a
    vocabulary yields its tokens in index order; ``len`` returns the number
    of tokens and ``token in vocabulary`` tests membership by token.
    """

    def __init__(self, tokens_dict=None, frequencies_dict=None):
        """Build a vocabulary from explicit mappings.

        Args:
            tokens_dict: mapping ``index -> token``. Defaults to empty.
            frequencies_dict: mapping ``index -> frequency``. Defaults to empty.
        """
        # Use None sentinels and copy the inputs: a mutable default ({}) would
        # be shared by every instance created without arguments.
        self._idx_to_tk = dict(tokens_dict) if tokens_dict is not None else {}
        self._tk_to_idx = {tk: idx for idx, tk in self._idx_to_tk.items()}
        self._idx_to_freq = dict(frequencies_dict) if frequencies_dict is not None else {}
        self.max_idx = len(self)
        # Cursor used only by the legacy __next__ protocol.
        self.current = 0

    @classmethod
    def from_corpus(cls, corpus, cutoff_freq=0):
        """Create a vocabulary from a tokenized corpus.

        Args:
            corpus: iterable of documents, each an iterable of tokens.
            cutoff_freq: only tokens whose count is strictly greater than
                this value are kept.

        Returns:
            A vocabulary whose tokens are sorted and numbered from 0.
        """
        # Single pass over the corpus, so one-shot iterables (generators)
        # are counted correctly as well.
        counts = {}
        for doc in corpus:
            for token in doc:
                counts[token] = counts.get(token, 0) + 1

        kept = [tk for tk in sorted(counts) if counts[tk] > cutoff_freq]
        tokens = dict(enumerate(kept))
        freqs = {idx: counts[tk] for idx, tk in enumerate(kept)}
        return cls(tokens, freqs)

    def index_to_token(self, index):
        """Return the token stored at ``index`` (KeyError if absent)."""
        return self._idx_to_tk[index]

    def token_to_index(self, token):
        """Return the index of ``token`` (KeyError if absent)."""
        return self._tk_to_idx[token]

    def get_freq(self, tk_or_idx):
        """Return the frequency of a token (str) or of an index (int).

        An unknown token yields 0; an unknown index raises KeyError.

        Raises:
            KeyError: if the argument is neither an integer nor a string.
        """
        # np.integer is accepted too, so indices taken from arrays work.
        if isinstance(tk_or_idx, (int, np.integer)):
            return self._idx_to_freq[int(tk_or_idx)]
        if isinstance(tk_or_idx, str):
            if tk_or_idx not in self._tk_to_idx:
                return 0
            return self._idx_to_freq[self._tk_to_idx[tk_or_idx]]
        raise KeyError('{} must be either integer or string'.format(tk_or_idx))

    def __str__(self):
        return "<Vocabulary(size={})>".format(len(self))

    def __repr__(self):
        return str(self)

    def __len__(self):
        return len(self._idx_to_tk)

    def __getitem__(self, tk_or_idx):
        """Map an integer index to its token, or a string token to its index."""
        if isinstance(tk_or_idx, (int, np.integer)):
            return self.index_to_token(int(tk_or_idx))
        if isinstance(tk_or_idx, str):
            return self.token_to_index(tk_or_idx)
        raise KeyError('{} must be either integer or string'.format(tk_or_idx))

    def __iter__(self):
        """Return an independent iterator over the tokens in index order."""
        # Reset the legacy cursor so ``iter(v); next(v)`` keeps working, but
        # hand out a separate generator so nested loops do not share state.
        self.current = 0
        return (self._idx_to_tk[idx] for idx in range(self.max_idx))

    def __next__(self):
        """Legacy cursor-based iteration, kept for backward compatibility."""
        if self.current >= self.max_idx:
            raise StopIteration
        token = self._idx_to_tk[self.current]
        self.current += 1
        return token

    def __contains__(self, key):
        return key in self._tk_to_idx
    
    
class Word2VecSamples(Dataset):
    """Dataset of (center word index, padded context indices) pairs.

    The vocabulary is extracted from the tokenized corpus. Every in-vocabulary
    token that has at least one in-vocabulary neighbour inside the window
    becomes one sample. Contexts shorter than ``2 * window_size`` are
    right-padded with ``padding_idx`` (equal to the vocabulary size).
    """

    unk_token = '<UNK>'

    def samples_generator(self, doc):
        """Yield ``(center_index, context_indices)`` for each usable token of ``doc``.

        The context holds up to ``window_size`` tokens on each side of the
        center token; out-of-vocabulary neighbours are dropped. Tokens whose
        filtered context is empty are skipped.
        """
        vocab = self.vocabulary
        radius = self.window_size
        for pos, center in enumerate(doc):
            if center not in vocab:
                continue
            # Clamping the left edge at 0 and letting the right slice run past
            # the end covers all four boundary cases of the window.
            left = doc[max(0, pos - radius):pos]
            right = doc[pos + 1:pos + radius + 1]
            neighbours = [
                vocab.token_to_index(tk)
                for tk in itertools.chain(left, right)
                if tk in vocab
            ]
            if neighbours:
                yield (vocab.token_to_index(center), neighbours)

    def __init__(self, corpus, window_size=2, cutoff_freq=0):
        """Build the vocabulary and the sample tensors.

        Args:
            corpus: list of documents, each a list of string tokens.
            window_size: number of context tokens taken on each side.
            cutoff_freq: forwarded to ``Vocabulary.from_corpus``.
        """
        # Vocabulary extracted from the already tokenized corpus.
        self.vocabulary = Vocabulary.from_corpus(corpus, cutoff_freq=cutoff_freq)

        # The first index past the vocabulary is used as padding.
        self.padding_idx = len(self.vocabulary)
        self.window_size = window_size

        full_width = 2 * window_size
        centers = []
        contexts = []
        for doc in corpus:
            for center_idx, neighbours in self.samples_generator(doc):
                centers.append(center_idx)
                missing = max(full_width - len(neighbours), 0)
                contexts.append(neighbours + [self.padding_idx] * missing)

        self.word_indeces = torch.tensor(centers, dtype=torch.long)
        self.context_indeces = torch.tensor(contexts, dtype=torch.long)

    def __getitem__(self, idx):
        """Return the center word index and its context row for sample ``idx``."""
        return self.word_indeces[idx], self.context_indeces[idx, :]

    def __len__(self):
        """Number of samples."""
        return len(self.word_indeces)

#####################################################################################################

# Training.py

import torch
from torch.utils.data import DataLoader, sampler
import torch.optim as optim
import torch.nn as nn
#from .WordVectors import *
import numpy as np


class ModelTrainer(object):
    """Base class for all trainers.

    Splits a training dataset into train/validation loaders, wraps a test
    dataset in a loader, and provides an optimization loop and accuracy
    evaluation. The model given to ``InitModel`` must be callable on an input
    batch and expose ``loss(scores, targets)``.
    """

    def __init__(self,
                 train_dataset,
                 test_dataset,
                 batch_size=64,
                 val_size=.02):
        """Create the data loaders and print a summary of the split.

        Args:
            train_dataset: dataset yielding ``(input, target)`` pairs; split
                into training and validation subsets.
            test_dataset: dataset yielding ``(input, target)`` pairs.
            batch_size: number of samples per batch.
            val_size: fraction of ``train_dataset`` held out for validation.

        Raises:
            ValueError: if the training split contains no samples.
        """
        # Data:
        tr, val, te = self.generate_data_batches(train_dataset, test_dataset, batch_size, val_size)
        self.train_dataloader, self.val_dataloader, self.test_dataloader = tr, val, te

        # Data types: peek at one batch only (fetching a batch is not free).
        try:
            first_batch = next(iter(self.train_dataloader))
        except StopIteration:
            raise ValueError('The training split is empty; cannot infer data types')
        self.input_dtype = first_batch[0].dtype
        self.target_dtype = first_batch[1].dtype

        self.first_time = True
        self.batch_len = len(self.train_dataloader)

        print('Model trainer created:')
        train_samples = int((1 - val_size) * len(train_dataset))
        val_samples = len(train_dataset) - train_samples
        test_samples = len(test_dataset)
        total_samples = train_samples + val_samples + test_samples
        percent_val, percent_test = int((val_samples / total_samples) * 100), int((test_samples / total_samples) * 100)
        print('Number of training samples: {} ({}%)'.format(train_samples, 100 - percent_val - percent_test))
        print('Number of validation samples: {} ({}%)'.format(val_samples, percent_val))
        print('Number of test samples: {} ({}%)'.format(test_samples, percent_test))
        print('Number of train batches: {}'.format(self.batch_len))
        print('Number of samples per batch: {}'.format(batch_size))
        print()

    @staticmethod
    def _safe_ratio(correct, total):
        """Return ``correct / total`` as a float, or 0.0 when ``total`` is 0."""
        return float(correct) / total if total else 0.0

    @staticmethod
    def _is_cuda_name(device):
        """Tell whether ``device`` is a string of the form 'cuda' or 'cuda:<n>'."""
        if not isinstance(device, str):
            return False
        return device == 'cuda' or (device.startswith('cuda:') and device[5:].isdigit())

    def generate_data_batches(self, train_dataset, test_dataset,  # Train and test datasets
                              batch_size=64,  # Batch size
                              val_size=.02):  # Fraction of samples used for validation
        """Build the train / validation / test data loaders.

        The training dataset is randomly partitioned: the first
        ``int((1 - val_size) * len(train_dataset))`` shuffled indices go to
        training and the rest to validation.

        Returns:
            Tuple ``(train_dataloader, val_dataloader, test_dataloader)``.
        """
        # Random split of the samples into train and validation:
        NUM_TRAIN = int((1 - val_size) * len(train_dataset))
        samples_idx = torch.randperm(len(train_dataset))
        train_samples_idx = samples_idx[:NUM_TRAIN]
        val_samples_idx = samples_idx[NUM_TRAIN:]

        # Loader for the training samples:
        train_dataloader = DataLoader(train_dataset,
                                      batch_size=batch_size,
                                      sampler=sampler.SubsetRandomSampler(train_samples_idx))

        # Loader for the validation samples:
        val_dataloader = DataLoader(train_dataset,
                                    batch_size=batch_size,
                                    sampler=sampler.SubsetRandomSampler(val_samples_idx))

        # Loader for the test samples:
        test_dataloader = DataLoader(test_dataset,
                                     batch_size=batch_size)

        return train_dataloader, val_dataloader, test_dataloader

    def InitModel(self, model, state_dict=None, device='cpu'):
        """Attach a model to the trainer and move it to the chosen device.

        Args:
            model: the model to train.
            state_dict: optional parameters loaded into the model first.
            device: ``None`` or ``'cpu'`` for the CPU, or ``'cuda'`` /
                ``'cuda:<n>'`` for a GPU. Falls back to the CPU when CUDA is
                not available.

        Raises:
            TypeError: if ``device`` is not one of the accepted values.
        """
        # Select the device to work on:
        if device is None:
            self.device = torch.device('cpu')
            print('No se seleccionó ningún dispositivo de entrenamiento. Se asigna la cpu')
        elif device == 'cpu':
            self.device = torch.device('cpu')
            print('Dispositivo seleccionado: cpu')
        elif self._is_cuda_name(device):
            if torch.cuda.is_available():
                self.device = torch.device(device)
                print('Dispositivo seleccionado: {}'.format(device))
            else:
                self.device = torch.device('cpu')
                print('No se dispone de GPUs. Se asigna como dispositivo de entrenamiento la cpu')
        else:
            raise TypeError('No se seleccionó un dispositivo válido')

        # Set the model:
        self.model = model

        # Initialize with the parameters in state_dict, if any:
        if state_dict is not None:
            self.model.load_state_dict(state_dict)

        # Move the model to the device:
        self.model = self.model.to(device=self.device)

    def SaveModel(self, file):
        """Save the model's state dict to ``file`` (best effort, never raises)."""
        # Catch Exception rather than everything, so Ctrl-C still propagates,
        # and report the reason instead of hiding it.
        try:
            torch.save(self.model.state_dict(), file)
        except Exception as err:
            print('Embeddings could not be saved to file: {}'.format(err))
        else:
            print('Embeddings saved to file {}'.format(file))

    def Train(self, algorithm='SGD', epochs=1, sample_loss_every=100, check_on_train=False, **kwargs):
        """Run the optimization loop.

        Every ``sample_loss_every`` iterations the loss and the validation
        accuracy are appended to ``self.performance_history`` (keys
        ``'iter'``, ``'loss'``, ``'accuracy'``). Calling ``Train`` again
        resumes the iteration count where the previous call stopped.
        A KeyboardInterrupt stops training without raising.

        Args:
            algorithm: ``'SGD'`` or ``'Adam'``.
            epochs: number of passes over the training loader.
            sample_loss_every: sampling period, in iterations.
            check_on_train: also report accuracy on the training split.
            **kwargs: forwarded to the optimizer constructor (e.g. ``lr``).

        Raises:
            ValueError: if ``algorithm`` is not supported.
        """
        if algorithm == 'SGD':
            optimizer = optim.SGD(self.model.parameters(), **kwargs)
        elif algorithm == 'Adam':
            optimizer = optim.Adam(self.model.parameters(), **kwargs)
        else:
            raise ValueError("Unsupported algorithm {!r}; expected 'SGD' or 'Adam'".format(algorithm))
        self.model.train()

        if self.first_time:
            print('Starting training...')
            self.performance_history = {'iter': [], 'loss': [], 'accuracy': []}
            # Older name for the same record, kept as an alias.
            self.loss_history = self.performance_history
            self._steps_done = 0
            self.first_time = False
        else:
            print('Resuming training...')
        # Count completed steps so resumed runs do not repeat iteration indices.
        start_step = getattr(self, '_steps_done', 0)

        # The optimizer knows the effective learning rate even when 'lr' was
        # left at its default.
        learning_rate = kwargs.get('lr', optimizer.param_groups[0]['lr'])

        print('Optimization method: {}'.format(algorithm))
        print('Learning Rate: {:.2g}'.format(learning_rate))
        print('Number of epochs: {}'.format(epochs))
        print('Running on device ({})'.format(self.device))
        print()

        try:

            for e in range(epochs):
                for t, (x, y) in enumerate(self.train_dataloader):

                    x = x.to(device=self.device, dtype=self.input_dtype)
                    y = y.to(device=self.device, dtype=self.target_dtype)

                    optimizer.zero_grad()  # Reset the gradients
                    scores = self.model(x)  # Forward pass
                    loss = self.model.loss(scores, y)  # Loss value
                    loss.backward()  # Compute the gradients
                    optimizer.step()  # Update the parameters

                    local_step = e * self.batch_len + t
                    self._steps_done = start_step + local_step + 1

                    if local_step % sample_loss_every == 0:
                        num_correct_val, num_samples_val = self.check_accuracy('validation')
                        val_accuracy = self._safe_ratio(num_correct_val, num_samples_val)
                        self.performance_history['iter'].append(start_step + local_step)
                        self.performance_history['loss'].append(loss.item())
                        self.performance_history['accuracy'].append(val_accuracy)
                        print('Epoch: {}, Batch number: {}'.format(e+1, t))
                        print('Accuracy on validation dataset: {}/{} ({:.2f}%)'.format(num_correct_val, num_samples_val, 100 * val_accuracy))
                        print()

                        if check_on_train:
                            num_correct_train, num_samples_train = self.check_accuracy('train')
                            train_accuracy = self._safe_ratio(num_correct_train, num_samples_train)
                            print('Accuracy on train dataset: {}/{} ({:.2f}%)'.format(num_correct_train, num_samples_train, 100 * train_accuracy))
                            print()

            print('Training finished')
            print()

        except KeyboardInterrupt:

            print('Exiting training...')
            print()

    def check_accuracy(self, dataset='validation'):
        """Count correct predictions on one of the data splits.

        The predicted class is the argmax of the model output along
        dimension 1, compared element-wise with the targets.

        Args:
            dataset: ``'train'``, ``'validation'`` or ``'test'``.

        Returns:
            Tuple ``(num_correct, num_samples)`` of Python ints.

        Raises:
            AttributeError: if ``dataset`` is not one of the accepted names.
        """
        if dataset == 'train':
            loader = self.train_dataloader
        elif dataset == 'validation':
            loader = self.val_dataloader
        elif dataset == 'test':
            loader = self.test_dataloader
        else:
            raise AttributeError('Please specify on which dataset to perform de accuracy calculation')

        num_correct = 0
        num_samples = 0

        # Evaluate in eval mode, then restore whatever mode the model was in.
        was_training = self.model.training
        self.model.eval()
        with torch.no_grad():
            for x, y in loader:
                x = x.to(device=self.device, dtype=self.input_dtype)
                y = y.to(device=self.device, dtype=self.target_dtype)

                scores = self.model(x)
                _, preds = scores.max(1)
                num_correct += int((preds == y).sum().item())
                num_samples += preds.size(0)

        self.model.train(was_training)
        return num_correct, num_samples

    def CheckResultsOnTest(self):
        """Print the accuracy of the model on the test set."""
        # check_accuracy already iterates over the whole test loader.
        total_corrects, total_samples = self.check_accuracy('test')
        total_performance = 100 * self._safe_ratio(total_corrects, total_samples)

        print('Final accuracy on test set: {}/{} ({:.2f}%)'.format(total_corrects, total_samples, total_performance))

# In [5]:
class Word2vecTrainer(ModelTrainer):
    """Trainer for word embeddings.

    Builds a ``Word2VecSamples`` dataset from a tokenized corpus and wraps it
    in a randomly sampled ``DataLoader``.

    NOTE(review): this constructor does not call ``ModelTrainer.__init__``,
    so only ``dataloader`` and ``batch_len`` are set on the instance, and
    ``test_corpus`` is currently unused. Confirm whether that is intended.
    """

    def __init__(self,
                 train_corpus,           # Training corpus (must be a list of lists of strings).
                 test_corpus=None,       # Corpus used to test the results. May be omitted.
                 cutoff_freq=0,          # Frequency threshold forwarded to Word2VecSamples.
                 window_size=2,          # Context window size.
                 batch_size=64):         # Batch size.

        # Build the (word, context) samples from the training corpus:
        dataset = Word2VecSamples(train_corpus, window_size=window_size, cutoff_freq=cutoff_freq)

        # Draw batches from a random permutation of all sample indices:
        shuffled_order = torch.randperm(len(dataset))
        random_sampler = sampler.SubsetRandomSampler(shuffled_order)
        self.dataloader = DataLoader(dataset, sampler=random_sampler, batch_size=batch_size)
        self.batch_len = len(self.dataloader)
        
        
# Tiny tokenized corpus used to smoke-test the trainer construction.
corpus = [
    ['Esto', 'es', 'un', 'corpus', 'de', 'prueba'],
    ['Esto', 'también'],
    ['corpus', 'de', 'prueba'],
]
trainer = Word2vecTrainer(corpus, cutoff_freq=0, window_size=2, batch_size=2)