In [1]:
import torch
import torch.nn as nn
import torch.optim as optim
from torch.utils.data import Dataset, DataLoader, sampler

# Clase madre:

In [12]:
class Trainer(object):
    """
    Generic training harness around a PyTorch model.

    Subclasses (or callers) are expected to provide `Loss(scores, target)` and may
    provide `EvalPerformance()`. The typical flow is:
    build the dataloaders -> `Trainer(...)` -> `InitModel(...)` -> `Train(...)`.
    """

    def generate_data_batches(self,train_dataset, test_dataset, # Train and test datasets
                              batch_size = 64, # Batch size
                              val_size = .02): # Fraction of the train samples held out for validation

        """
            Build the train / validation / test dataloaders.

            The train dataset is randomly split into a train part and a validation
            part of relative size `val_size`.

            Returns:
                (train_dataloader, val_dataloader, test_dataloader), where
                val_dataloader is None when `val_size == 0` and test_dataloader
                is None when `test_dataset is None`.
        """

        # The split is always computed, so the train indices exist even when no
        # validation set is requested (val_size == 0 keeps every sample for training).
        num_samples = len(train_dataset)
        num_train = int((1 - val_size) * num_samples)
        samples_idx = torch.randperm(num_samples)
        train_samples_idx = samples_idx[:num_train]
        val_samples_idx = samples_idx[num_train:]

        if val_size != 0:
            # Validation samples are drawn from the held-out part of the train dataset:
            val_dataloader = DataLoader(train_dataset,
                                        batch_size=batch_size,
                                        sampler=sampler.SubsetRandomSampler(val_samples_idx))
        else:
            val_dataloader = None

        if test_dataset is not None:
            test_dataloader = DataLoader(test_dataset,
                                         batch_size=batch_size)
        else:
            test_dataloader = None

        train_dataloader = DataLoader(train_dataset,
                                      batch_size=batch_size,
                                      sampler=sampler.SubsetRandomSampler(train_samples_idx))

        return train_dataloader, val_dataloader, test_dataloader


    def __init__(self,
                 train_dataloader,
                 val_dataloader,
                 test_dataloader):
        """
        Store the dataloaders and record the dtypes of the inputs and targets.

        The dtypes are read from the first two elements of one batch of
        `train_dataloader` and are later used to cast every training batch.
        """

        # Dataloaders:
        self.train_dataloader = train_dataloader
        self.val_dataloader = val_dataloader
        self.test_dataloader = test_dataloader

        # Data-types (a single batch is fetched and used for both of them):
        first_batch = next(iter(self.train_dataloader))
        self.input_dtype = first_batch[0].dtype
        self.target_dtype = first_batch[1].dtype


    def InitModel(self, model, state_dict=None, device='cpu'):
        """
        Attach `model` to the trainer and move it to the selected device.

        Args:
            model: the nn.Module to train.
            state_dict: optional parameters loaded into the model before moving it.
            device: None or 'cpu' for the CPU, or a CUDA device string such as
                'cuda' / 'cuda:0'. When CUDA is requested but unavailable the CPU is used.

        Raises:
            TypeError: if `device` is not one of the accepted values.
        """

        # Select the device to work on:
        if device is None:
            self.device = torch.device('cpu')
            print('No se seleccionó ningún dispositivo de entrenamiento. Se asigna la cpu')
        elif device == 'cpu':
            self.device = torch.device('cpu')
            print('Dispositivo seleccionado: cpu')
        elif isinstance(device, str) and device.startswith('cuda'):
            # Any CUDA ordinal is accepted, not only cuda:0 / cuda:1.
            if torch.cuda.is_available():
                self.device = torch.device(device)
                print('Dispositivo seleccionado: {}'.format(device))
            else:
                self.device = torch.device('cpu')
                print('No se dispone de GPUs. Se asigna como dispositivo de entrenamiento la cpu')
        else:
            raise TypeError('No se seleccionó un dispositivo válido')

        # Define the model:
        self.model = model

        # Initialise with the parameters in state_dict, if given:
        if state_dict is not None:
            self.model.load_state_dict(state_dict)

        # Move the model to the device:
        self.model = self.model.to(device=self.device)

    def SaveModel(self,file):
        """
        Best-effort save of the model's state_dict to `file`.

        Failures are reported on stdout instead of being raised, but
        KeyboardInterrupt / SystemExit are no longer swallowed.
        """

        try:
            torch.save(self.model.state_dict(),file)
            print('Embeddings saved to file {}'.format(file))
        except Exception as err:
            print('Embeddings could not be saved to file')
            print('Reason: {}'.format(err))

    def Train(self, epochs=1, sample_loss_every=100, algorithm='SGD', **kwargs):
        """
        Run the optimisation loop over `train_dataloader`.

        Args:
            epochs: number of passes over the training dataloader.
            sample_loss_every: the loss is recorded (and `EvalPerformance` called)
                every this many iterations of the current call.
            algorithm: 'SGD' or 'Adam'.
            **kwargs: forwarded to the optimizer constructor (e.g. lr=1e-3).

        Raises:
            ValueError: if `algorithm` is not supported.

        The recorded values are appended to `self.performance_history`
        ({'iter': [...], 'loss': [...]}); calling Train again resumes that history.
        A KeyboardInterrupt stops training without propagating.
        """

        # Choose the optimisation algorithm; fail early on an unknown name
        # instead of hitting an unbound `optimizer` inside the loop.
        if algorithm == 'SGD':
            optimizer = optim.SGD(self.model.parameters(), **kwargs)
        elif algorithm == 'Adam':
            optimizer = optim.Adam(self.model.parameters(), **kwargs)
        else:
            raise ValueError("Unsupported algorithm {!r}; expected 'SGD' or 'Adam'".format(algorithm))
        self.model.train()

        # Detect whether this is the first training run or a continuation:
        try:
            n_iter = self.performance_history['iter'][-1]
            print('Resuming training...')
        except (IndexError, AttributeError):
            print('Starting training...')
            self.performance_history = {'iter': [], 'loss': []}
            n_iter = 0

        # Report the configuration. The learning rate is read back from the
        # optimizer so that an omitted `lr` (optimizer default) does not fail.
        learning_rate = optimizer.param_groups[0]['lr']
        print('Optimization method: {}'.format(algorithm))
        print('Learning Rate: {:.2g}'.format(learning_rate))
        print('Number of epochs: {}'.format(epochs))
        print('Running on device ({})'.format(self.device))
        print()

        # Training loop:
        batch_len = len(self.train_dataloader)
        try:

            for e in range(epochs):
                for t, (x,y) in enumerate(self.train_dataloader):

                    x = x.to(device=self.device, dtype=self.input_dtype)
                    y = y.to(device=self.device, dtype=self.target_dtype)

                    optimizer.zero_grad() # Reset the gradients
                    scores = self.model(x) # Forward pass
                    loss = self.Loss(scores,y) # Loss value
                    loss.backward() # Backward pass
                    optimizer.step() # Parameter update

                    step = e * batch_len + t
                    if step % sample_loss_every == 0:
                        l = loss.item()
                        print('Epoch: {}, Batch number: {}, Loss: {}'.format(e+1, t,l))
                        self.performance_history['iter'].append(step + n_iter)
                        self.performance_history['loss'].append(l)
                        self.EvalPerformance()

            print('Training finished')
            print()

        except KeyboardInterrupt:

            print('Exiting training...')
            print()

    def Loss(self,scores,target):
        """Loss hook: must be overridden (or assigned) before calling Train."""
        raise NotImplementedError('Loss(scores, target) must be provided before training')

    def EvalPerformance(self):
        """Optional evaluation hook called every time the loss is sampled; no-op by default."""
        pass

# Caso 1:

In [13]:
def generate_data_batches(train_dataset, test_dataset, # Train and test datasets
                          batch_size = 64, # Batch size
                          val_size = .02): # Fraction of the train samples held out for validation

    """
        Build the dataloaders used to iterate over batches of samples.

        The train dataset is randomly split into a train part and a validation
        part (relative size `val_size`); the test dataset is wrapped as is.
        Returns the (train, validation, test) dataloaders.

    """

    total = len(train_dataset)
    split_at = int((1 - val_size) * total)
    shuffled = torch.randperm(total)

    def _subset_loader(indices):
        # Dataloader over `train_dataset` restricted to (and shuffling) `indices`.
        return DataLoader(train_dataset,
                          batch_size=batch_size,
                          sampler=sampler.SubsetRandomSampler(indices))

    train_dataloader = _subset_loader(shuffled[:split_at])
    val_dataloader = _subset_loader(shuffled[split_at:])
    test_dataloader = DataLoader(test_dataset, batch_size=batch_size)

    return train_dataloader, val_dataloader, test_dataloader

class ToyDataset(Dataset):
    """Toy dataset of 1000 samples: inputs are the integers j..j+999 (as floats), targets are all 0."""

    def __init__(self,j):

        values = list(range(j, 1000 + j))
        # One feature per sample -> column tensor of shape (1000, 1)
        self.x_samples = torch.tensor(values, dtype=torch.float).view(-1, 1)
        # Every sample belongs to class 0
        self.y_samples = torch.zeros(len(values), dtype=torch.long)

    def __len__(self):
        return 1000

    def __getitem__(self,idx):
        sample = (self.x_samples[idx], self.y_samples[idx])
        return sample
    
class Model(nn.Module):
    """Minimal model: a single linear layer mapping one input feature to one output."""

    def __init__(self):

        super(Model, self).__init__()
        self.linear = nn.Linear(in_features=1, out_features=1)

    def forward(self,x):
        output = self.linear(x)
        return output
    
def Loss(scores,target):
    """Cross-entropy between `scores` and `target`, summed (not averaged) over the batch."""
    return nn.functional.cross_entropy(scores, target, reduction='sum')
    
# Case 1 demo: the dataloaders are built by the free function and the loss is
# attached to the trainer instance as a plain function.
train_dataset = ToyDataset(10)
test_dataset = ToyDataset(30)
model = Model()
train_dataloader, val_dataloader, test_dataloader = generate_data_batches(train_dataset, test_dataset, batch_size = 64, val_size = .02)
trainer = Trainer(train_dataloader, val_dataloader, test_dataloader)
# Instance-level assignment: Loss is called as Loss(scores, target), without self.
trainer.Loss = Loss
trainer.InitModel(model)
# With the default sample_loss_every=100 only iteration 0 is logged in this short run.
trainer.Train(epochs=1, algorithm='SGD', lr=1e-3)

Dispositivo seleccionado: cpu
Starting training...
Optimization method: SGD
Learning Rate: 0.001
Number of epochs: 1
Running on device (cpu)

Epoch: 1, Batch number: 0, Loss: 0.0
Training finished



# Caso 2:

In [14]:
class TrainValTestTrainer(Trainer):
    """Trainer that builds its own train / validation / test dataloaders from raw datasets."""

    def __init__(self, train_dataset, test_dataset, batch_size, val_size):

        loaders = self.generate_data_batches(train_dataset, test_dataset,batch_size,val_size)
        super().__init__(*loaders)

    def generate_data_batches(self, train_dataset, test_dataset, # Train and test datasets
                              batch_size = 64, # Batch size
                              val_size = .02): # Fraction of the train samples held out for validation

        """
            Build the dataloaders used to iterate over batches of samples.

            The train dataset is randomly split into a train part and a validation
            part (relative size `val_size`); the test dataset is wrapped as is.
            Returns the (train, validation, test) dataloaders.

        """

        total = len(train_dataset)
        split_at = int((1 - val_size) * total)
        shuffled = torch.randperm(total)

        def _subset_loader(indices):
            # Dataloader over `train_dataset` restricted to (and shuffling) `indices`.
            return DataLoader(train_dataset,
                              batch_size=batch_size,
                              sampler=sampler.SubsetRandomSampler(indices))

        train_dataloader = _subset_loader(shuffled[:split_at])
        val_dataloader = _subset_loader(shuffled[split_at:])
        test_dataloader = DataLoader(test_dataset, batch_size=batch_size)

        return train_dataloader, val_dataloader, test_dataloader

    def Loss(self,scores,target):
        """Cross-entropy between `scores` and `target`, summed over the batch."""
        return nn.functional.cross_entropy(scores, target, reduction='sum')
    
    
class ToyDataset(Dataset):
    """Toy dataset of 1000 samples: inputs are the integers j..j+999 (as floats), targets are all 0."""

    def __init__(self,j):

        values = list(range(j, 1000 + j))
        # One feature per sample -> column tensor of shape (1000, 1)
        self.x_samples = torch.tensor(values, dtype=torch.float).view(-1, 1)
        # Every sample belongs to class 0
        self.y_samples = torch.zeros(len(values), dtype=torch.long)

    def __len__(self):
        return 1000

    def __getitem__(self,idx):
        sample = (self.x_samples[idx], self.y_samples[idx])
        return sample
    
class Model(nn.Module):
    """Minimal model: a single linear layer mapping one input feature to one output."""

    def __init__(self):

        super(Model, self).__init__()
        self.linear = nn.Linear(in_features=1, out_features=1)

    def forward(self,x):
        output = self.linear(x)
        return output
    
    
# Case 2 demo: the trainer subclass builds the dataloaders itself and carries
# its own Loss method, so only the datasets and the model are supplied here.
train_dataset = ToyDataset(10)
test_dataset = ToyDataset(30)
model = Model()
trainer = TrainValTestTrainer(train_dataset, test_dataset, batch_size = 64, val_size = .02)
trainer.InitModel(model)
# With the default sample_loss_every=100 only iteration 0 is logged in this short run.
trainer.Train(epochs=1, algorithm='SGD', lr=1e-3)

Dispositivo seleccionado: cpu
Starting training...
Optimization method: SGD
Learning Rate: 0.001
Number of epochs: 1
Running on device (cpu)

Epoch: 1, Batch number: 0, Loss: 0.0
Training finished



# Caso 3:

In [15]:
class Word2VecSamples(Dataset):
    """
    Dataset of (context, word) index pairs extracted from a tokenized corpus.

    Each item is `(context_indices, word_index)`, where the context row has
    `2 * window_size` entries and is right-padded with `padding_idx`
    (one past the last vocabulary index).
    """

    unk_token = '<UNK>'

    def samples_generator(self, doc):
        """
        Yield `(word_index, context_indices)` for every in-vocabulary token of `doc`.

        The context is made of up to `window_size` tokens on each side of the word;
        out-of-vocabulary neighbours are dropped and words left without any
        context are skipped.
        """
        vocab = self.vocabulary
        radius = self.window_size
        for position, token in enumerate(doc):
            if token not in vocab:
                continue
            # Clamping the left edge at 0 and letting the right slice run past the
            # end of the document covers all the window/boundary combinations.
            left = doc[max(0, position - radius):position]
            right = doc[position + 1:position + radius + 1]
            neighbours = [vocab.token_to_index(tk)
                          for tk in itertools.chain(left, right)
                          if tk in vocab]
            if neighbours:
                yield (vocab.token_to_index(token), neighbours)


    def __init__(self, corpus, window_size=2, cutoff_freq=0):

        # Vocabulary built from the already tokenized corpus:
        self.vocabulary = Vocabulary.from_corpus(corpus,cutoff_freq=cutoff_freq)

        # Index used to pad short contexts, and window configuration:
        self.padding_idx = len(self.vocabulary)
        self.window_size = window_size

        full_width = 2 * window_size
        targets = []
        contexts = []
        for doc in corpus:
            for target_idx, context_idxs in self.samples_generator(doc):
                targets.append(target_idx)
                missing = full_width - len(context_idxs)
                # A non-positive `missing` adds no padding at all
                contexts.append(context_idxs + [self.padding_idx] * max(missing, 0))

        self.word_indeces = torch.tensor(targets,dtype=torch.long)
        self.context_indeces = torch.tensor(contexts,dtype=torch.long)

    def __getitem__(self,idx):
        context_row = self.context_indeces[idx,:]
        return context_row, self.word_indeces[idx]

    def __len__(self):
        return self.word_indeces.shape[0]
    

class Word2VecTrainer(Trainer):
    """Trainer that builds a Word2VecSamples dataset from a corpus and trains on it (no validation/test sets)."""

    def __init__(self,
                 corpus,                 # Training corpus (must be a list of lists of strings).
                 cutoff_freq=1,          # Discard words whose frequency is lower than or equal to this value.
                 window_size=2,          # Window size.
                 batch_size=64):         # Batch size.

        self.cutoff_freq = cutoff_freq
        self.window_size = window_size

        # Build the samples and a dataloader that visits them in random order:
        samples = Word2VecSamples(corpus, window_size=window_size, cutoff_freq=cutoff_freq)
        shuffled_idx = torch.randperm(len(samples))
        loader = DataLoader(samples,
                            batch_size=batch_size,
                            sampler=sampler.SubsetRandomSampler(shuffled_idx))

        self.vocab_size = len(samples.vocabulary)

        # Only a training dataloader is used in this case:
        super().__init__(loader, None, None)

    def Loss(self,scores,target):
        """Cross-entropy between `scores` and `target`, summed over the batch."""
        return nn.functional.cross_entropy(scores, target, reduction='sum')
        
import numpy as np        
import itertools

class Vocabulary(object):
    """
    Bidirectional mapping between tokens and integer indices, with token frequencies.

    Supports `len()`, `in` (by token), indexing by either an index or a token,
    and iteration over the tokens in index order.
    """

    def __init__(self, tokens_dict=None, frequencies_dict=None):
        """
        Args:
            tokens_dict: mapping index -> token (empty vocabulary when omitted).
            frequencies_dict: mapping index -> frequency (empty when omitted).
        """
        # None sentinels: a `{}` default would be one dict shared by every
        # instance created without arguments.
        self._idx_to_tk = {} if tokens_dict is None else tokens_dict
        self._tk_to_idx = {tk: idx for idx, tk in self._idx_to_tk.items()}
        self._idx_to_freq = {} if frequencies_dict is None else frequencies_dict
        self.max_idx = len(self)

    @classmethod
    def from_corpus(cls, corpus, cutoff_freq=0):
        """
        Build a vocabulary from a tokenized corpus (an iterable of token sequences).

        Tokens are indexed in sorted order; only tokens whose frequency is
        strictly greater than `cutoff_freq` are kept. Frequencies are stored
        as plain Python ints.
        """
        counts = {}
        for doc in corpus:
            for token in doc:
                counts[token] = counts.get(token, 0) + 1
        kept_tokens = [tk for tk in sorted(counts) if counts[tk] > cutoff_freq]
        tokens = dict(enumerate(kept_tokens))
        freqs = {idx: counts[tk] for idx, tk in tokens.items()}
        return cls(tokens, freqs)

    def index_to_token(self, index):
        """Return the token stored at `index` (KeyError if absent)."""
        return self._idx_to_tk[index]

    def token_to_index(self, token):
        """Return the index of `token` (KeyError if absent)."""
        return self._tk_to_idx[token]

    def get_freq(self, tk_or_idx):
        """
        Return the frequency of a token given by index (int) or by string.

        An unknown string yields 0; an unknown index raises KeyError, as does
        an argument that is neither int nor str.
        """
        if isinstance(tk_or_idx, int):
            freq = self._idx_to_freq[tk_or_idx]
        elif isinstance(tk_or_idx, str):
            freq = 0 if tk_or_idx not in self._tk_to_idx else self._idx_to_freq[self._tk_to_idx[tk_or_idx]]
        else:
            raise KeyError('{} must be either integer or string'.format(tk_or_idx))
        return freq

    def __str__(self):
        return "<Vocabulary(size={})>".format(len(self))

    def __len__(self):
        return len(self._idx_to_tk)

    def __getitem__(self,tk_or_idx):
        """Index -> token when given an int, token -> index when given a str."""
        if isinstance(tk_or_idx, int):
            return self.index_to_token(tk_or_idx)
        if isinstance(tk_or_idx, str):
            return self.token_to_index(tk_or_idx)
        raise KeyError('{} must be either integer or string'.format(tk_or_idx))

    def __iter__(self):
        """
        Iterate over the tokens in index order.

        Every call returns an independent iterator, so nested or concurrent
        loops over the same vocabulary do not interfere with each other.
        """
        # The cursor is still reset so that the legacy `iter(v); next(v)` usage keeps working.
        self.current = 0
        return (self._idx_to_tk[idx] for idx in range(self.max_idx))

    def __next__(self):
        """Legacy cursor-based iteration; requires a previous call to `iter()` on this object."""
        if self.current >= self.max_idx:
            raise StopIteration
        else:
            token = self._idx_to_tk[self.current]
            self.current += 1
            return token

    def __contains__(self,key):
        return key in self._tk_to_idx
        

class CBOWModel(nn.Module):
    """
    CBOW-style network: embed the context indices, average them along dim 1
    and project the result onto `vocab_size` scores (no bias).

    The embedding table has one extra row, at index `vocab_size`, used as padding.
    """

    def __init__(self, vocab_size, embedding_dim):
        super(CBOWModel, self).__init__()
        pad_row = vocab_size
        self.emb = nn.Embedding(num_embeddings=vocab_size + 1,
                                embedding_dim=embedding_dim,
                                padding_idx=pad_row)
        self.out = nn.Linear(in_features=embedding_dim,
                             out_features=vocab_size,
                             bias=False)

    def forward(self,x):
        context_vectors = self.emb(x)
        pooled = context_vectors.mean(dim=1)
        return self.out(pooled)

        
# Case 3 demo: tiny tokenized corpus (a list of documents, each a list of tokens).
corpus = [['w1', 'w2', 'w2', 'w3'], ['w3', 'w2', 'w1', 'w1', 'w1', 'w2', 'w1', 'w1'], ['w1', 'w1', 'w1']]
# Positional arguments are: corpus, cutoff_freq=0, window_size=2, batch_size=64.
trainer = Word2VecTrainer(corpus, 0, 2, 64)
# The model is sized from the vocabulary held by the trainer's dataset.
model = CBOWModel(len(trainer.train_dataloader.dataset.vocabulary),10)
trainer.InitModel(model)
trainer.Train(epochs=10, algorithm='SGD', lr=1e-3)

Dispositivo seleccionado: cpu
Starting training...
Optimization method: SGD
Learning Rate: 0.001
Number of epochs: 10
Running on device (cpu)

Epoch: 1, Batch number: 0, Loss: 18.593631744384766
Training finished



# Caso 4:

In [22]:
class Word2VecTrainer(Trainer):
    """
    Trainer for word2vec-style embeddings.

    Builds a Word2VecSamples dataset from a tokenized corpus and lets
    `InitModel` pick between the bundled 'CBOW' and 'SkipGram' models,
    together with the matching loss.
    """

    model_name = 'CBOW'

    def __init__(self,
                 corpus,                 # Training corpus (must be a list of lists of strings).
                 cutoff_freq=1,          # Discard words whose frequency is lower than or equal to this value.
                 window_size=2,          # Window size.
                 batch_size=64):         # Batch size.

        self.cutoff_freq = cutoff_freq
        self.window_size = window_size

        # Build the batches of samples:
        dataset = Word2VecSamples(corpus, window_size=window_size, cutoff_freq=cutoff_freq)
        samples_idx = torch.randperm(len(dataset))
        my_sampler = lambda indices: sampler.SubsetRandomSampler(indices)
        dataloader = DataLoader(dataset, batch_size=batch_size, sampler=my_sampler(samples_idx))

        self.vocab_size = len(dataset.vocabulary)

        # Only a training dataloader is used:
        super().__init__(dataloader, None, None)

    class CBOWModel(nn.Module):
        """Embed the input indices, average them along dim 1 and project onto `vocab_size` scores."""

        def __init__(self, vocab_size, embedding_dim):
            super().__init__()
            # One extra embedding row (index vocab_size) is reserved for padding.
            self.emb = nn.Embedding(vocab_size+1, embedding_dim, padding_idx=vocab_size)
            self.out = nn.Linear(embedding_dim, vocab_size, bias=False)

        def forward(self,x):
            embedding = self.emb(x).mean(dim=1)
            return self.out(embedding)

    class SkipGramModel(nn.Module):
        """Embed the input indices and project each embedding onto `vocab_size` scores."""

        def __init__(self, vocab_size, embedding_dim):
            # Zero-argument super(): the class name is not resolvable from here
            # because this class is nested inside another class body.
            super().__init__()
            self.emb = nn.Embedding(vocab_size+1, embedding_dim, padding_idx=vocab_size)
            self.out = nn.Linear(embedding_dim, vocab_size, bias=False)
            self.vocab_size = vocab_size

        def forward(self,x):
            return self.out(self.emb(x))


    def InitModel(self, model, embedding_dim, state_dict=None, device='cpu'):
        """
        Instantiate the selected model and hand it to the base InitModel.

        Args:
            model: 'CBOW' or 'SkipGram'.
            embedding_dim: size of the embedding vectors.
            state_dict: optional parameters to load into the model.
            device: device specification forwarded to the base class.

        Raises:
            ValueError: if `model` is not one of the supported names.
        """

        vocab_size = len(self.train_dataloader.dataset.vocabulary)
        if model == 'CBOW':
            network = self.CBOWModel(vocab_size,embedding_dim)
            self.Loss = self.CBOWLoss
        elif model == 'SkipGram':
            network = self.SkipGramModel(vocab_size,embedding_dim)
            self.Loss = self.SkipGramLoss
        else:
            # Previously the raw name was forwarded to the base class as if it were a module.
            raise ValueError("Unknown model {!r}; expected 'CBOW' or 'SkipGram'".format(model))
        self.model_name = model

        super().InitModel(network, state_dict, device)


    def CBOWLoss(self,scores,target):
        """Cross-entropy between `scores` and `target`, summed over the batch."""
        lf = nn.CrossEntropyLoss(reduction='sum')
        return lf(scores,target)

    def SkipGramLoss(self,scores,target):
        """
        Summed cross-entropy in which targets equal to the padding index are ignored.

        NOTE(review): the view/repeat below assumes `scores` holds one row of
        `vocab_size` scores per sample and `target` is 2-D (sample, context
        position) - confirm against the batches produced for the SkipGram model.
        """
        lf = nn.CrossEntropyLoss(ignore_index=self.vocab_size,reduction='sum')
        # The same scores are replicated for every context position of the target.
        scores = scores.view(-1,self.vocab_size,1).repeat(1,1,target.size(1))
        return lf(scores,target)

    def EvalPerformance(self):
        """Evaluation hook called while training; intentionally a no-op for now."""
        # The method previously had no body, which made the whole cell a syntax error.
        pass
        
    
    
# Case 4 demo: the trainer now instantiates the model itself from its name.
corpus = [['w1', 'w2', 'w2', 'w3'], ['w3', 'w2', 'w1', 'w1', 'w1', 'w2', 'w1', 'w1'], ['w1', 'w1', 'w1']]
# Positional arguments are: corpus, cutoff_freq=0, window_size=2, batch_size=64.
trainer = Word2VecTrainer(corpus, 0, 2, 64)
# 'CBOW' selects the model, 10 is the embedding dimension.
trainer.InitModel('CBOW',10)
trainer.Train(epochs=10, algorithm='SGD', lr=1e-3)

Dispositivo seleccionado: cpu
Starting training...
Optimization method: SGD
Learning Rate: 0.001
Number of epochs: 10
Running on device (cpu)

Epoch: 1, Batch number: 0, Loss: 18.626895904541016
Training finished



# Caso 5:

In [None]:
#################### UTILS ####################
import itertools
import numpy as np

class Vocabulary(object):
    """
    Bidirectional mapping between tokens and integer indices, with token frequencies.

    Supports `len()`, `in` (by token), indexing by either an index or a token,
    and iteration over the tokens in index order.
    """

    def __init__(self, tokens_dict=None, frequencies_dict=None):
        """
        Args:
            tokens_dict: mapping index -> token (empty vocabulary when omitted).
            frequencies_dict: mapping index -> frequency (empty when omitted).
        """
        # None sentinels: a `{}` default would be one dict shared by every
        # instance created without arguments.
        self._idx_to_tk = {} if tokens_dict is None else tokens_dict
        self._tk_to_idx = {tk: idx for idx, tk in self._idx_to_tk.items()}
        self._idx_to_freq = {} if frequencies_dict is None else frequencies_dict
        self.max_idx = len(self)

    @classmethod
    def from_corpus(cls, corpus, cutoff_freq=0):
        """
        Build a vocabulary from a tokenized corpus (an iterable of token sequences).

        Tokens are indexed in sorted order; only tokens whose frequency is
        strictly greater than `cutoff_freq` are kept. Frequencies are stored
        as plain Python ints.
        """
        counts = {}
        for doc in corpus:
            for token in doc:
                counts[token] = counts.get(token, 0) + 1
        kept_tokens = [tk for tk in sorted(counts) if counts[tk] > cutoff_freq]
        tokens = dict(enumerate(kept_tokens))
        freqs = {idx: counts[tk] for idx, tk in tokens.items()}
        return cls(tokens, freqs)

    def index_to_token(self, index):
        """Return the token stored at `index` (KeyError if absent)."""
        return self._idx_to_tk[index]

    def token_to_index(self, token):
        """Return the index of `token` (KeyError if absent)."""
        return self._tk_to_idx[token]

    def get_freq(self, tk_or_idx):
        """
        Return the frequency of a token given by index (int) or by string.

        An unknown string yields 0; an unknown index raises KeyError, as does
        an argument that is neither int nor str.
        """
        if isinstance(tk_or_idx, int):
            freq = self._idx_to_freq[tk_or_idx]
        elif isinstance(tk_or_idx, str):
            freq = 0 if tk_or_idx not in self._tk_to_idx else self._idx_to_freq[self._tk_to_idx[tk_or_idx]]
        else:
            raise KeyError('{} must be either integer or string'.format(tk_or_idx))
        return freq

    def __str__(self):
        return "<Vocabulary(size={})>".format(len(self))

    def __len__(self):
        return len(self._idx_to_tk)

    def __getitem__(self,tk_or_idx):
        """Index -> token when given an int, token -> index when given a str."""
        if isinstance(tk_or_idx, int):
            return self.index_to_token(tk_or_idx)
        if isinstance(tk_or_idx, str):
            return self.token_to_index(tk_or_idx)
        raise KeyError('{} must be either integer or string'.format(tk_or_idx))

    def __iter__(self):
        """
        Iterate over the tokens in index order.

        Every call returns an independent iterator, so nested or concurrent
        loops over the same vocabulary do not interfere with each other.
        """
        # The cursor is still reset so that the legacy `iter(v); next(v)` usage keeps working.
        self.current = 0
        return (self._idx_to_tk[idx] for idx in range(self.max_idx))

    def __next__(self):
        """Legacy cursor-based iteration; requires a previous call to `iter()` on this object."""
        if self.current >= self.max_idx:
            raise StopIteration
        else:
            token = self._idx_to_tk[self.current]
            self.current += 1
            return token

    def __contains__(self,key):
        return key in self._tk_to_idx

    
def samples_generator(doc, vocab, window_size):
    """
    Yield `(word_index, context_indices)` for every token of `doc` found in `vocab`.

    The context is made of up to `window_size` tokens on each side of the word;
    neighbours missing from `vocab` are dropped and words left without any
    context are skipped.
    """
    for position, token in enumerate(doc):
        if token not in vocab:
            continue
        # Clamping the left edge at 0 and letting the right slice run past the
        # end of the document covers all the window/boundary combinations.
        left = doc[max(0, position - window_size):position]
        right = doc[position + 1:position + window_size + 1]
        neighbours = [vocab.token_to_index(tk)
                      for tk in itertools.chain(left, right)
                      if tk in vocab]
        if neighbours:
            yield vocab.token_to_index(token), neighbours

In [None]:
class Samples(Dataset):
    """
    Dataset of (context, word) index pairs extracted from a tokenized corpus.

    Each item is `(context_indices, word_index)`, where the context row has
    `2 * window_size` entries and is right-padded with `padding_idx`
    (one past the last vocabulary index).
    """

    unk_token = '<UNK>'

    def __init__(self, corpus, window_size=2, cutoff_freq=0):

        # Vocabulary built from the already tokenized corpus:
        self.vocabulary = Vocabulary.from_corpus(corpus,cutoff_freq=cutoff_freq)

        # Index used to pad short contexts, and window configuration:
        self.padding_idx = len(self.vocabulary)
        self.window_size = window_size

        word_indeces = []
        word_contexts = []
        for doc in corpus:
            # The vocabulary built above is passed explicitly; the previous code
            # referenced an undefined name `vocab` here.
            gen = samples_generator(doc, self.vocabulary, window_size)
            for word_index, word_context in gen:
                word_indeces.append(word_index)
                padd_num = 2 * window_size - len(word_context)
                if padd_num > 0:
                    # Right-pad short contexts so that every row has the same width
                    word_contexts.append(word_context + [self.padding_idx for i in range(padd_num)])
                else:
                    word_contexts.append(word_context)

        self.word_indeces = torch.tensor(word_indeces,dtype=torch.long)
        self.context_indeces = torch.tensor(word_contexts,dtype=torch.long)

    def __getitem__(self,idx):
        return self.context_indeces[idx,:], self.word_indeces[idx]

    def __len__(self):
        return len(self.word_indeces)
    
    
class CBOWModel(nn.Module):
    """
    CBOW-style network: embed the context indices, average them along dim 1
    and project the result onto `vocab_size` scores (no bias).

    The embedding table has one extra row, at index `vocab_size`, used as padding.
    """

    def __init__(self, vocab_size, embedding_dim):
        super().__init__()
        pad_row = vocab_size
        self.emb = nn.Embedding(num_embeddings=vocab_size + 1,
                                embedding_dim=embedding_dim,
                                padding_idx=pad_row)
        self.out = nn.Linear(in_features=embedding_dim,
                             out_features=vocab_size,
                             bias=False)

    def forward(self,x):
        context_vectors = self.emb(x)
        pooled = context_vectors.mean(dim=1)
        return self.out(pooled)


    
class CBOWTrainer(Trainer):
    """
    Work-in-progress trainer for the CBOW model.

    NOTE(review): `samples` is forwarded to the base class in the position of
    the training dataloader - confirm whether a DataLoader should be built
    around it first.
    """

    def __init__(self, samples, model):

        self.model_cls = model
        self.samples_cls = samples

        # No validation or test dataloaders are used.
        super().__init__(samples, None, None, )

    # TODO: the remaining methods were never written; the dangling `def` that
    # was left here made the whole cell a syntax error and has been removed.

# ------------- Desde archivos --------------

In [5]:
from NLPUtils import *
import torch

import matplotlib.pyplot as plt

%load_ext autoreload
%autoreload 2

The autoreload extension is already loaded. To reload it, use:
  %reload_ext autoreload


In [12]:
# Same CBOW demo as above, run after the star-import from NLPUtils in the previous cell.
# NOTE(review): presumably Word2VecTrainer now resolves to the NLPUtils version - confirm.
corpus = [['w1', 'w2', 'w2', 'w3'], ['w3', 'w2', 'w1', 'w1', 'w1', 'w2', 'w1', 'w1'], ['w1', 'w1', 'w1']]
trainer = Word2VecTrainer(corpus, 0, 2, 64)
# 'CBOW' selects the model, 10 is the embedding dimension.
trainer.InitModel('CBOW',10)
trainer.Train(epochs=10, algorithm='SGD', lr=1e-3)

Dispositivo seleccionado: cpu
Starting training...
Optimization method: SGD
Learning Rate: 0.001
Number of epochs: 10
Running on device (cpu)

Epoch: 1, Batch number: 0, Loss: 15.119260787963867
Training finished



In [13]:
class ToyDataset(Dataset):
    """Toy dataset of 1000 samples: inputs are the integers j..j+999 (as floats), targets are all 0."""

    def __init__(self,j):

        values = list(range(j, 1000 + j))
        # One feature per sample -> column tensor of shape (1000, 1)
        self.x_samples = torch.tensor(values, dtype=torch.float).view(-1, 1)
        # Every sample belongs to class 0
        self.y_samples = torch.zeros(len(values), dtype=torch.long)

    def __len__(self):
        return 1000

    def __getitem__(self,idx):
        sample = (self.x_samples[idx], self.y_samples[idx])
        return sample
    
class Model(nn.Module):
    """Minimal model: a single linear layer mapping one input feature to one output."""

    def __init__(self):

        super(Model, self).__init__()
        self.linear = nn.Linear(in_features=1, out_features=1)

    def forward(self,x):
        output = self.linear(x)
        return output
    

# Toy run against the trainer loaded from files.
train_dataset = ToyDataset(10)
test_dataset = ToyDataset(30)
model = Model()
# NOTE(review): this call passes datasets plus batch_size/val_size, which does not
# match the Trainer.__init__(train_dataloader, val_dataloader, test_dataloader)
# defined earlier in this notebook; presumably it targets the Trainer star-imported
# from NLPUtils - confirm its signature.
trainer = Trainer(train_dataset, test_dataset, batch_size = 64, val_size = .02)
trainer.InitModel(model)
trainer.Train(epochs=1, algorithm='SGD', lr=1e-3)

IndentationError: expected an indented block (<ipython-input-13-5b7654691fcd>, line 35)