In [1]:
import torch
import torch.nn as nn
import torch.optim as optim
from torch.utils.data import sampler, Dataset, DataLoader

# Muestras:

In [None]:
class CBOWSamples(Dataset):
    """Dataset of (context indices, target index) pairs for CBOW training.

    A vocabulary is built from the tokenized corpus, and every in-vocabulary
    token becomes one sample whose context is the in-vocabulary tokens found
    up to ``window_size`` positions to its left and right in the same
    document. Contexts shorter than ``2 * window_size`` are right-padded with
    ``padding_idx`` (equal to ``len(vocabulary)``, i.e. one past the last
    index used here), so all contexts fit in one rectangular tensor.

    NOTE(review): ``Vocabulary`` is defined outside this cell; this class only
    relies on ``Vocabulary.from_corpus``, ``in``, ``len()`` and
    ``token_to_index`` — confirm against its definition.
    """

    unk_token = '<UNK>'

    def samples_generator(self, doc):
        """Yield ``(target_index, context_indices)`` for each usable token of ``doc``.

        Args:
            doc: one tokenized document; must support slicing (e.g. a list).

        Yields:
            A tuple ``(int, list[int])``: the vocabulary index of the centre
            token and the indices of its in-vocabulary neighbours, left side
            first. Tokens outside the vocabulary are never used as targets,
            and targets whose context ends up empty are skipped.
        """
        for t, token in enumerate(doc):
            if token not in self.vocabulary:
                continue

            # Clamp the left edge at 0: a negative start would make the slice
            # count from the end of the document. The right edge needs no
            # clamp because slicing past the end simply truncates.
            left = doc[max(0, t - self.window_size):t]
            right = doc[t + 1:t + self.window_size + 1]

            context_list = [
                self.vocabulary.token_to_index(tk)
                for side in (left, right)
                for tk in side
                if tk in self.vocabulary
            ]
            if context_list:
                yield (self.vocabulary.token_to_index(token), context_list)

    def __init__(self, corpus, window_size=2, cutoff_freq=0):
        """Build the vocabulary and materialize every sample as tensors.

        Args:
            corpus: iterable of tokenized documents. It is traversed twice
                (vocabulary, then samples), so it must be re-iterable.
            window_size: number of neighbours taken on each side of a token.
            cutoff_freq: forwarded to ``Vocabulary.from_corpus``.
        """
        # Build the vocabulary from the already-tokenized corpus.
        self.vocabulary = Vocabulary.from_corpus(corpus, cutoff_freq=cutoff_freq)

        # The padding index sits right after the last vocabulary index.
        self.padding_idx = len(self.vocabulary)
        self.window_size = window_size

        context_width = 2 * window_size
        word_indeces = []
        word_contexts = []
        for doc in corpus:
            for word_index, word_context in self.samples_generator(doc):
                word_indeces.append(word_index)
                # A context never exceeds ``context_width`` entries, so the
                # pad count is >= 0 (and ``[x] * 0`` adds nothing).
                padd_num = context_width - len(word_context)
                word_contexts.append(word_context + [self.padding_idx] * padd_num)

        self.word_indeces = torch.tensor(word_indeces, dtype=torch.long)
        # Force a 2-D layout so that an empty sample set still has shape
        # (0, context_width) instead of collapsing to a 1-D empty tensor.
        self.context_indeces = torch.tensor(
            word_contexts, dtype=torch.long
        ).reshape(len(word_indeces), context_width)

    def __getitem__(self, idx):
        """Return ``(context_indices, target_index)`` for sample ``idx``."""
        return self.context_indeces[idx, :], self.word_indeces[idx]

    def __len__(self):
        """Return the number of samples."""
        return len(self.word_indeces)
    


# Modelo:

In [None]:
class CBOWModel(nn.Module):
    """Continuous bag-of-words network: embed, average, project.

    The embedding table holds ``vocab_size + 1`` rows; the extra last row
    (index ``vocab_size``) is registered as the padding index. The output
    layer is a bias-free linear map from the embedding space to
    ``vocab_size`` scores.
    """

    def __init__(self, vocab_size, embedding_dim):
        """Create the embedding table and the output projection.

        Args:
            vocab_size: number of real tokens (padding row not included).
            embedding_dim: size of each embedding vector.
        """
        super().__init__()
        pad_index = vocab_size
        self.emb = nn.Embedding(
            num_embeddings=vocab_size + 1,
            embedding_dim=embedding_dim,
            padding_idx=pad_index,
        )
        self.out = nn.Linear(
            in_features=embedding_dim,
            out_features=vocab_size,
            bias=False,
        )

    def forward(self, x):
        """Return vocabulary scores for a batch of context index tensors.

        Args:
            x: integer index tensor; presumably shaped (batch, context) —
                the average below is taken over dimension 1.

        Returns:
            The output of the linear layer applied to the averaged embeddings.
        """
        context_vectors = self.emb(x)
        # The average runs over every position of dim 1, padded ones included.
        pooled = torch.mean(context_vectors, dim=1)
        scores = self.out(pooled)
        return scores

# Costo:

In [None]:
class CrossEntropyLoss(nn.CrossEntropyLoss):
    """Cross-entropy cost: ``nn.CrossEntropyLoss`` configured by keyword only."""

    def __init__(self, **options):
        # Hand every keyword option, untouched, to the parent constructor.
        super(CrossEntropyLoss, self).__init__(**options)

# Evaluación de la performance:

In [None]:
class CheckAccuracy(object):
    """Callable that measures a model's top-1 accuracy over batches of samples.

    Attributes (set in ``__init__``):
        samples: iterable yielding ``(inputs, targets)`` tensor pairs;
            presumably a ``DataLoader`` — the scoring code assumes a leading
            batch dimension.
        model: the module being evaluated.
        input_dtype / target_dtype: dtypes read from the first batch, used to
            cast every batch before scoring.
        device: where batches are sent; defaults to the device of the model's
            first parameter (CPU if it has none) and may be reassigned.
    """

    def __init__(self, samples, model):
        """Store the data and model, and record the dtypes and device to use.

        Raises:
            StopIteration: if ``samples`` yields nothing.
        """
        self.samples = samples
        self.model = model

        # Peek at one batch only; both dtypes come from the same pair.
        first_batch = next(iter(samples))
        self.input_dtype = first_batch[0].dtype
        self.target_dtype = first_batch[1].dtype

        # Evaluate on the device the model already lives on.
        first_param = next(model.parameters(), None)
        self.device = first_param.device if first_param is not None else torch.device('cpu')

    def __call__(self):
        """Score every batch in ``self.samples``, print the result and return it.

        The model is put in eval mode while scoring and switched to train
        mode afterwards, even if scoring raises.

        Returns:
            float: fraction of correctly predicted samples (0.0 when there
            are no samples).
        """
        num_correct = 0
        num_samples = 0

        self.model.eval()
        try:
            with torch.no_grad():
                for x, y in self.samples:
                    x = x.to(device=self.device, dtype=self.input_dtype)
                    y = y.to(device=self.device, dtype=self.target_dtype)

                    scores = self.model(x)
                    # The predicted class is the arg-max over dimension 1.
                    _, preds = scores.max(1)
                    # ``.item()`` keeps the counter a plain Python int.
                    num_correct += (preds == y).sum().item()
                    num_samples += preds.size(0)
        finally:
            self.model.train()

        accuracy = num_correct / num_samples if num_samples > 0 else 0.0

        print('Resultados:')
        print(f'{num_correct} / {num_samples} correctas ({100 * accuracy:.2f}%)')
        return accuracy

In [None]:
class Trainer(object):
    
    def __init__(self,samples,model,loss,evalperformance):
        