In [1]:
import torch
from NLPUtils import *

import nltk
nltk.download('brown', download_dir='/home/lestien/anaconda3/envs/TorchEnv/nltk_data')
from nltk.corpus import brown

import numpy as np
%matplotlib notebook
import matplotlib.pyplot as plt

%load_ext autoreload
%autoreload 2

[nltk_data] Downloading package brown to
[nltk_data]     /home/lestien/anaconda3/envs/TorchEnv/nltk_data...
[nltk_data]   Package brown is already up-to-date!


In [2]:
# Fetch the sentences of the Brown corpus for every category it defines.
corpus_unpreproceced = brown.sents(categories=brown.categories())
# Copy the result into a plain Python list before handing it to the trainer.
corpus = list(corpus_unpreproceced)

In [8]:
# Language model settings:
method = 'SkipGram'
window_size = 2
embedding_dim = 100
pretrained = None

# Iteration (training loop) settings:
batch_size = 128
epochs = 20
learning_rate = 5e-1
sample_loss_every = 1000
use_gpu = 1


# Train the word vectors; one keyword per line keeps the configuration readable.
embedding_layer, vocab, loss_history = SGDTrainWordVectors(
    corpus,
    pretrained_layer=pretrained,
    lm=method,
    window_size=window_size,
    batch_size=batch_size,
    embedding_dim=embedding_dim,
    use_gpu=use_gpu,
    epochs=epochs,
    learning_rate=learning_rate,
    sample_loss_every=sample_loss_every,
)

# Plot the sampled loss so the figure is readable on its own.
fig, ax = plt.subplots()
ax.plot(loss_history['iter'], loss_history['loss'],
        label='Learning Rate = {:.2g}'.format(learning_rate))
ax.set_xlabel('Iteration')
ax.set_ylabel('Loss')

# Trailing semicolon keeps the Legend repr out of the cell output.
ax.legend();

Starting training...
	Model used: SkipGram
    	Optimization method: Stochastic Gradient Descent
    	Learning Rate: 0.5
    	Number of epochs: 20
    	Number of batches: 9072
    	Number of samples per batch: 128

Epoch: 1, Batch number: 0, Loss: 11.134016990661621
Epoch: 1, Batch number: 1000, Loss: 8.285545349121094
Epoch: 1, Batch number: 2000, Loss: 7.948732852935791
Epoch: 1, Batch number: 3000, Loss: 7.775062084197998
Epoch: 1, Batch number: 4000, Loss: 7.927264213562012
Epoch: 1, Batch number: 5000, Loss: 7.601624965667725
Epoch: 1, Batch number: 6000, Loss: 7.883325099945068
Epoch: 1, Batch number: 7000, Loss: 7.565744400024414
Epoch: 1, Batch number: 8000, Loss: 7.396945953369141
Epoch: 1, Batch number: 9000, Loss: 7.226373672485352
Epoch: 2, Batch number: 928, Loss: 7.469085216522217
Epoch: 2, Batch number: 1928, Loss: 7.367478847503662
Epoch: 2, Batch number: 2928, Loss: 7.277441501617432
Epoch: 2, Batch number: 3928, Loss: 7.534557342529297
Epoch: 2, Batch number: 4928, Lo

Epoch: 17, Batch number: 1848, Loss: 6.957075119018555
Epoch: 17, Batch number: 2848, Loss: 6.674424648284912
Epoch: 17, Batch number: 3848, Loss: 6.596491813659668
Epoch: 17, Batch number: 4848, Loss: 6.870919227600098
Epoch: 17, Batch number: 5848, Loss: 6.744700908660889
Epoch: 17, Batch number: 6848, Loss: 6.956305027008057
Epoch: 17, Batch number: 7848, Loss: 6.616408824920654
Epoch: 17, Batch number: 8848, Loss: 6.778581619262695
Epoch: 18, Batch number: 776, Loss: 6.642391204833984
Epoch: 18, Batch number: 1776, Loss: 6.954563617706299
Epoch: 18, Batch number: 2776, Loss: 6.676386833190918
Epoch: 18, Batch number: 3776, Loss: 6.733015060424805
Epoch: 18, Batch number: 4776, Loss: 7.001162528991699
Epoch: 18, Batch number: 5776, Loss: 7.001280784606934
Epoch: 18, Batch number: 6776, Loss: 6.907720565795898
Epoch: 18, Batch number: 7776, Loss: 6.856136322021484
Epoch: 18, Batch number: 8776, Loss: 6.9572978019714355
Epoch: 19, Batch number: 704, Loss: 6.913273334503174
Epoch: 19, 

<IPython.core.display.Javascript object>

<matplotlib.legend.Legend at 0x7fe22414f358>

In [16]:
from torch.utils.data import Dataset


class BrownDataset(Dataset):
    """Dataset of tokenized sentences paired with integer labels.

    Each item is a fixed-length tensor of token indices, right-padded with
    ``padding_idx`` up to ``longest_sentence_size``, plus the sentence label.

    NOTE(review): every instance derives its own vocabulary, padding index and
    padded length from the dataframe it receives, so the two datasets returned
    by ``get_dataset`` do not share token indices. Confirm this is intended
    before evaluating on the test split a model trained on the train split.
    """

    def __init__(self, df):
        """Store the dataframe and derive the vocabulary and padding settings.

        Args:
            df: dataframe with a 'sentence' column (an iterable of tokens per
                row) and a 'label' column.
        """
        self.data = df
        # Presumably the integer labels index into this list -- TODO confirm.
        self.categories = ['adventure', 'belles_lettres', 'editorial', 'fiction',
                           'government', 'hobbies', 'humor', 'learned', 'lore',
                           'mystery', 'news', 'religion', 'reviews', 'romance',
                           'science_fiction']

        # Build the vocabulary from the already tokenized corpus.
        self.vocabulary = Vocabulary()
        for sentence in self.data['sentence']:
            for token in sentence:
                self.vocabulary.add_token(token)

        # The padding index is the first index past the vocabulary.
        self.padding_idx = len(self.vocabulary)
        self.longest_sentence_size = self.data['sentence'].str.len().max()

    def __getitem__(self, idx):
        """Return ``(sentence_vector, label)`` for the row at position ``idx``.

        ``idx`` may be a plain integer or a single-element tensor.
        """
        # isinstance also accepts Tensor subclasses, unlike an exact type check.
        if isinstance(idx, torch.Tensor):
            idx = idx.item()

        # Start from an all-padding vector and overwrite the leading positions.
        sentence_vector = torch.full((int(self.longest_sentence_size),),
                                     self.padding_idx, dtype=torch.long)
        for i, token in enumerate(self.data['sentence'].iloc[idx]):
            sentence_vector[i] = self.vocabulary.token_to_index(token)

        label = torch.tensor(self.data['label'].iloc[idx], dtype=torch.long)
        return sentence_vector, label

    def __len__(self):
        """Number of rows in the underlying dataframe."""
        return len(self.data)

    @staticmethod
    def _read_split(path):
        """Read one CSV file and split its 'sentence' column on '<SEP>'."""
        raw = pd.read_csv(path)
        return pd.DataFrame({'sentence': raw['sentence'].str.split('<SEP>'),
                             'label': raw['label']})

    @classmethod
    def get_dataset(cls, train_path='train.csv', test_path='test.csv'):
        """Build the train and test datasets from their CSV files.

        Args:
            train_path: CSV file holding the training split.
            test_path: CSV file holding the test split.

        Returns:
            Tuple ``(train_dataset, test_dataset)``.
        """
        train_dataframe = cls._read_split(train_path)
        test_dataframe = cls._read_split(test_path)

        return cls(train_dataframe), cls(test_dataframe)

    
def generate_data_batches(train_dataset, test_dataset, # Train and test datasets
                          batch_size = 64, # Batch size
                          val_size = .02): # Fraction of the training samples held out for validation

    """
    Build the dataloaders used to iterate over batches of samples.

    The training dataset is split at random into a training part and a
    validation part; the test dataset is wrapped as-is.
    Returns the train / validation / test dataloaders, in that order.
    """

    # Shuffle every training index once, then cut the permutation in two.
    total_samples = len(train_dataset)
    n_train = int((1 - val_size) * total_samples)
    shuffled = torch.randperm(total_samples)
    index_splits = (shuffled[:n_train], shuffled[n_train:])

    # Train and validation loaders both draw from train_dataset, each one
    # restricted to its own subset of indices.
    loaders = [
        DataLoader(train_dataset,
                   batch_size=batch_size,
                   sampler=sampler.SubsetRandomSampler(subset))
        for subset in index_splits
    ]

    # The test loader walks the whole test dataset without a sampler.
    loaders.append(DataLoader(test_dataset, batch_size=batch_size))

    train_dataloader, val_dataloader, test_dataloader = loaders
    return train_dataloader, val_dataloader, test_dataloader
    
    

    
# Batch size shared by the three dataloaders built below.
batch_size = 64
# Build the train and test datasets (get_dataset reads 'train.csv' and 'test.csv').
train_dataset, test_dataset = BrownDataset.get_dataset()
# Split the training data into train/validation loaders and wrap the test data.
train_dataloader, val_dataloader, test_dataloader = generate_data_batches(train_dataset,test_dataset,batch_size=batch_size)

(tensor([  381,   747,     6, 12012,   710,    85,    26,  1243,  1545,     8,
           145,   339,   335,  3470,    26,  1557,   710,    19,    73,    61,
           597,   533,    58,  5889,  7321,   105,     6,  1188,    12, 51576,
         51576, 51576, 51576, 51576, 51576, 51576, 51576, 51576, 51576, 51576,
         51576, 51576, 51576, 51576, 51576, 51576, 51576, 51576, 51576, 51576,
         51576, 51576, 51576, 51576, 51576, 51576, 51576, 51576, 51576, 51576,
         51576, 51576, 51576, 51576, 51576, 51576, 51576, 51576, 51576, 51576,
         51576, 51576, 51576, 51576, 51576, 51576, 51576, 51576, 51576, 51576,
         51576, 51576, 51576, 51576, 51576, 51576, 51576, 51576, 51576, 51576,
         51576, 51576, 51576, 51576, 51576, 51576, 51576, 51576, 51576, 51576,
         51576, 51576, 51576, 51576, 51576, 51576, 51576, 51576, 51576, 51576,
         51576, 51576, 51576, 51576, 51576, 51576, 51576, 51576, 51576, 51576,
         51576, 51576, 51576, 51576, 51576, 51576, 5

In [None]:
class Model(torch.nn.Module):
    """Averaged-embedding classifier: embed tokens, mean-pool, apply a linear layer.

    Args:
        vocab_size: number of real tokens; index ``vocab_size`` is reserved
            for padding, so the embedding table has ``vocab_size + 1`` rows.
        embedding_dim: size of each token embedding.
        num_classes: number of output scores. Defaults to ``vocab_size``,
            which matches the output size used before this parameter existed.
    """

    def __init__(self, vocab_size, embedding_dim, num_classes=None):
        super().__init__()
        if num_classes is None:
            num_classes = vocab_size
        self.emb = torch.nn.Embedding(vocab_size + 1, embedding_dim, padding_idx=vocab_size)
        self.out = torch.nn.Linear(embedding_dim, num_classes)

    def forward(self, x):
        """Score a batch of padded index sequences.

        Args:
            x: integer tensor of token indices; dimension 1 is the (padded)
                sequence dimension that gets pooled.

        Returns:
            Tensor of scores with one row per sequence.
        """
        # Average over real tokens only, so the result does not depend on how
        # much padding was appended to the sequence.
        token_mask = (x != self.emb.padding_idx).unsqueeze(-1)
        embedded = self.emb(x) * token_mask.to(self.emb.weight.dtype)
        # clamp avoids a division by zero for an all-padding sequence.
        lengths = token_mask.sum(dim=1).clamp(min=1).to(embedded.dtype)
        pooled = embedded.sum(dim=1) / lengths
        return self.out(pooled)

    def loss(self, scores, target):
        """Cross-entropy between ``scores`` and the integer ``target`` classes."""
        # Functional form: no loss module is rebuilt on every call.
        return torch.nn.functional.cross_entropy(scores, target)
    
# The embedding table is sized from the training vocabulary.
vocab_size = len(train_dataset.vocabulary)
# Model needs both sizes; embedding_dim is the value set in the
# language-model settings cell.
model = Model(vocab_size, embedding_dim)

In [None]:
# Training-loop settings for the sentence classifier:
epochs = 1
learning_rate = 1e-2
sample_loss_every = 100
check_on_train = False
use_gpu = 1

performance_history = SGDTrainModel(
    model,
    train_dataloader,
    val_dataloader,
    epochs=epochs,
    learning_rate=learning_rate,
    sample_loss_every=sample_loss_every,
    check_on_train=check_on_train,
    use_gpu=use_gpu,
)

# Plot the sampled loss; both series come from performance_history
# (the original misspelled the name on the y-axis and raised NameError).
fig, ax = plt.subplots()
ax.plot(performance_history['iter'], performance_history['loss'])
ax.set_xlabel('Iteration')
ax.set_ylabel('Loss');