# Manejo de datos en NLP

Todo esto está tomado de https://github.com/joosthub/PyTorchNLPBook, que es el repositorio de GitHub del libro que se usa en CS224n. El capítulo 3 incluye el ejemplo "Classifying Sentiment of Restaurant Reviews", que es la fuente de toda esta información.

## Ciclo de trabajo

Para hacer un modelo de NLP, hay que hacer varias cosas:

1. Preprocesar el texto para obtener un archivo .csv con todas las muestras de entrenamiento y de test.

2. Definir un vocabulario. Para eso, se suele crear un objeto con todos los handlers necesarios para convertir tokens en índices y viceversa.

In [1]:
import torch

In [2]:
from torch.utils.data import Dataset

class DatasetFicticio(Dataset):
    """Synthetic dataset of random token-id sequences paired with random labels.

    Every sample is a ``(text, class_idx)`` tuple: ``text`` is a 1-D integer
    tensor with ``seq_len`` values drawn from ``[0, vocab_size)`` and
    ``class_idx`` is a 0-dim integer tensor drawn from ``[0, num_classes)``.
    The defaults reproduce the original hard-coded configuration.

    Args:
        num_samples: Number of samples to generate.
        seq_len: Number of token ids per sample.
        vocab_size: Exclusive upper bound for the token ids.
        num_classes: Exclusive upper bound for the class indices.
        generator: Optional ``torch.Generator`` used for both draws, so the
            data can be made reproducible. ``None`` uses the global RNG.
    """

    def __init__(self, num_samples=120000, seq_len=10, vocab_size=120000,
                 num_classes=4, generator=None):
        # Texts are drawn before labels so that, with the defaults, the global
        # RNG is consumed in the same order as the original implementation.
        texts = torch.randint(0, vocab_size, (num_samples, seq_len), generator=generator)
        labels = torch.randint(0, num_classes, (num_samples,), generator=generator)

        # Kept as a list of (text, class_idx) tuples, one entry per sample.
        self.samples = list(zip(texts, labels))

    def __getitem__(self, idx):
        """Return the ``(text, class_idx)`` tuple stored at position ``idx``."""
        return self.samples[idx]

    def __len__(self):
        """Return the number of samples in the dataset."""
        return len(self.samples)

# Two independently generated synthetic datasets: one for training/validation
# and one for testing (the training one is created first).
train_dataset, test_dataset = DatasetFicticio(), DatasetFicticio()

In [3]:
def generate_data_batches(train_dataset, test_dataset, # Train and test datasets
                          batch_size = 64, # Batch size
                          val_size = .02): # Fraction of the training samples held out for validation
    """
    Build the train / validation / test dataloaders.

    The indices of ``train_dataset`` are shuffled once and split in two
    disjoint groups: the first ``int((1 - val_size) * len(train_dataset))``
    go to training and the remainder to validation. Both loaders read from
    ``train_dataset`` through a ``SubsetRandomSampler``, so each of them
    visits its own subset in a random order. The test loader iterates
    ``test_dataset`` sequentially.

    Args:
        train_dataset: Dataset that is split into training and validation.
        test_dataset: Dataset used only for testing.
        batch_size: Number of samples per batch in the three loaders.
        val_size: Fraction (between 0 and 1) of ``train_dataset`` used for
            validation.

    Returns:
        Tuple ``(train_dataloader, val_dataloader, test_dataloader)``.

    Raises:
        ValueError: If ``val_size`` is outside ``[0, 1]``.
    """
    # A fraction outside [0, 1] would turn the split point negative or larger
    # than the dataset and silently yield a meaningless split via slicing.
    if not 0.0 <= val_size <= 1.0:
        raise ValueError("val_size must be between 0 and 1, got %r" % (val_size,))

    # Randomly split the sample indices into train and validation:
    total_samples = len(train_dataset)
    num_train = int((1 - val_size) * total_samples)
    shuffled_idx = torch.randperm(total_samples)
    train_sampler = torch.utils.data.SubsetRandomSampler(shuffled_idx[:num_train])
    val_sampler = torch.utils.data.SubsetRandomSampler(shuffled_idx[num_train:])

    # Dataloader for the training samples:
    train_dataloader = torch.utils.data.DataLoader(train_dataset,
                                                   batch_size=batch_size,
                                                   sampler=train_sampler)

    # Dataloader for the validation samples (also drawn from train_dataset):
    val_dataloader = torch.utils.data.DataLoader(train_dataset,
                                                 batch_size=batch_size,
                                                 sampler=val_sampler)

    # Dataloader for the test samples:
    test_dataloader = torch.utils.data.DataLoader(test_dataset,
                                                  batch_size=batch_size)

    return train_dataloader, val_dataloader, test_dataloader

# Build the three dataloaders with the default batch size and validation fraction.
(train_dataloader,
 val_dataloader,
 test_dataloader) = generate_data_batches(train_dataset, test_dataset)

In [5]:
import torch.nn as nn

class MyModel(nn.Module):
    """Single linear layer that maps input features to class scores.

    Args:
        in_features: Size of each input vector (default 10).
        num_classes: Number of output scores per sample (default 5).
    """

    def __init__(self, in_features=10, num_classes=5):
        super(MyModel, self).__init__()
        self.linear = nn.Linear(in_features, num_classes)
        # Built once instead of on every loss() call; it holds no parameters,
        # so the model's state_dict is unaffected.
        self.criterion = nn.CrossEntropyLoss()

    def forward(self, x):
        """Return the unnormalised class scores for the batch ``x``."""
        return self.linear(x)

    def loss(self, scores, target):
        """Return the cross-entropy loss between ``scores`` and ``target``."""
        return self.criterion(scores, target)

model = MyModel()



import torch.optim as optim

def CheckAccuracy(loader, model, device, input_dtype, target_dtype):
    """Count how many samples of ``loader`` the model classifies correctly.

    The predicted class of each sample is the index of its highest score
    along dimension 1 of the model output.

    Note: the model is switched to evaluation mode and is left in that mode
    on return; callers that keep training must call ``model.train()`` again.

    Args:
        loader: Iterable yielding ``(x, y)`` batches.
        model: Module called as ``model(x)`` to obtain the scores.
        device: Device the batches are moved to before the forward pass.
        input_dtype: dtype the inputs are cast to.
        target_dtype: dtype the targets are cast to.

    Returns:
        Tuple ``(num_correct, num_samples)`` of Python ints. Both are 0 when
        ``loader`` yields no batches.
    """
    num_correct = 0
    num_samples = 0
    model.eval()
    with torch.no_grad():
        for x, y in loader:
            x = x.to(device=device, dtype=input_dtype)
            y = y.to(device=device, dtype=target_dtype)

            scores = model(x)
            _, preds = scores.max(1)
            # .item() keeps the running count a plain Python int, so the
            # return type is the same whether or not the loader was empty.
            num_correct += int((preds == y).sum().item())
            num_samples += preds.size(0)

        return num_correct, num_samples
        

def TrainModel(model, data, epochs=1, learning_rate=1e-2, sample_loss_every=100):
    """Train ``model`` with SGD and periodically evaluate it on validation data.

    Every ``sample_loss_every`` global iterations (counted across epochs) the
    validation accuracy is computed with ``CheckAccuracy``, recorded together
    with the loss of the current training batch, and printed. Pressing
    Ctrl-C stops the training, prints a final validation accuracy and still
    returns the history collected so far.

    Args:
        model: Module exposing ``forward`` and a ``loss(scores, target)`` method.
        data: Dict with the keys ``'input_dtype'``, ``'target_dtype'``,
            ``'device'``, ``'train_dataloader'`` and ``'val_dataloader'``.
        epochs: Number of passes over the training dataloader.
        learning_rate: Learning rate given to ``optim.SGD``.
        sample_loss_every: Number of global iterations between evaluations.

    Returns:
        Dict with the lists ``'iter'`` (global iteration index of each
        evaluation), ``'loss'`` (training-batch loss at that iteration) and
        ``'accuracy'`` (validation accuracy in [0, 1], or NaN when the
        validation loader yielded no samples).
    """
    input_dtype = data['input_dtype']
    target_dtype = data['target_dtype']
    device = data['device']
    train_dataloader = data['train_dataloader']
    val_dataloader = data['val_dataloader']

    performance_history = {'iter': [], 'loss': [], 'accuracy': []}

    model = model.to(device=device)
    optimizer = optim.SGD(model.parameters(), lr=learning_rate)
    # Number of batches per epoch; used to turn (epoch, batch) into a global step.
    batches_per_epoch = len(train_dataloader)

    try:

        for e in range(epochs):
            for t, (x, y) in enumerate(train_dataloader):
                # CheckAccuracy leaves the model in eval mode, so training
                # mode has to be re-enabled on every iteration.
                model.train()
                x = x.to(device=device, dtype=input_dtype)
                y = y.to(device=device, dtype=target_dtype)

                # Forward pass
                scores = model(x)

                # Backward pass
                loss = model.loss(scores, y)
                optimizer.zero_grad()
                loss.backward()
                optimizer.step()

                global_step = e * batches_per_epoch + t
                if global_step % sample_loss_every == 0:
                    num_correct, num_samples = CheckAccuracy(val_dataloader, model, device, input_dtype, target_dtype)
                    # Store the global step: the per-epoch index repeats in
                    # every epoch and would make the history ambiguous.
                    performance_history['iter'].append(global_step)
                    performance_history['loss'].append(loss.item())
                    # An empty validation loader would otherwise divide by zero.
                    accuracy = float(num_correct) / num_samples if num_samples else float('nan')
                    performance_history['accuracy'].append(accuracy)
                    print('Epoch: %d, Iteration: %d, Accuracy on validation dataset: %d/%d ' % (e, t, num_correct, num_samples))

    except KeyboardInterrupt:
        num_correct, num_samples = CheckAccuracy(val_dataloader, model, device, input_dtype, target_dtype)
        final_accuracy = float(num_correct) / num_samples if num_samples else float('nan')
        print('Exiting training...')
        print('Final accuracy on validation dataset: %.2f%%' % (100 * final_accuracy) )

    return performance_history


# How the training data is fetched and on which device it is placed:
use_gpu = True
device = torch.device('cuda' if torch.cuda.is_available() and use_gpu else 'cpu')

data = {
    'device': device,
    'input_dtype': torch.float,
    'target_dtype': torch.long,
    'train_dataloader': train_dataloader,
    'val_dataloader': val_dataloader,
}

# Model hyperparameters and related settings:
epochs = 10              # Number of epochs
sample_loss_every = 100  # Number of iterations between validation-accuracy evaluations
learning_rate = 1e-1     # Learning rate

# Training:
performance_history = TrainModel(model, data, epochs, learning_rate, sample_loss_every)

Epoch: 0, Iteration: 0, Accuracy on validation dataset: 617/2400 
Epoch: 0, Iteration: 100, Accuracy on validation dataset: 612/2400 
Epoch: 0, Iteration: 200, Accuracy on validation dataset: 616/2400 
Epoch: 0, Iteration: 300, Accuracy on validation dataset: 582/2400 
Epoch: 0, Iteration: 400, Accuracy on validation dataset: 616/2400 
Epoch: 0, Iteration: 500, Accuracy on validation dataset: 624/2400 
Epoch: 0, Iteration: 600, Accuracy on validation dataset: 608/2400 
Epoch: 0, Iteration: 700, Accuracy on validation dataset: 616/2400 
Epoch: 0, Iteration: 800, Accuracy on validation dataset: 578/2400 
Epoch: 0, Iteration: 900, Accuracy on validation dataset: 595/2400 
Epoch: 0, Iteration: 1000, Accuracy on validation dataset: 603/2400 
Epoch: 0, Iteration: 1100, Accuracy on validation dataset: 616/2400 
Epoch: 0, Iteration: 1200, Accuracy on validation dataset: 554/2400 
Epoch: 0, Iteration: 1300, Accuracy on validation dataset: 600/2400 
Epoch: 0, Iteration: 1400, Accuracy on validat

In [10]:
import torch
from torch.utils.data import Dataset
from TorchDataUtils import *

%load_ext autoreload
%autoreload 2

class DatasetFicticio(Dataset):
    """Synthetic dataset of random token-id sequences paired with random labels.

    Every sample is a ``(text, class_idx)`` tuple: ``text`` is a 1-D integer
    tensor with ``seq_len`` values drawn from ``[0, vocab_size)`` and
    ``class_idx`` is a 0-dim integer tensor drawn from ``[0, num_classes)``.
    The defaults reproduce the original hard-coded configuration.

    Args:
        num_samples: Number of samples to generate.
        seq_len: Number of token ids per sample.
        vocab_size: Exclusive upper bound for the token ids.
        num_classes: Exclusive upper bound for the class indices.
        generator: Optional ``torch.Generator`` used for both draws, so the
            data can be made reproducible. ``None`` uses the global RNG.
    """

    def __init__(self, num_samples=120000, seq_len=10, vocab_size=120000,
                 num_classes=4, generator=None):
        # Texts are drawn before labels so that, with the defaults, the global
        # RNG is consumed in the same order as the original implementation.
        texts = torch.randint(0, vocab_size, (num_samples, seq_len), generator=generator)
        labels = torch.randint(0, num_classes, (num_samples,), generator=generator)

        # Kept as a list of (text, class_idx) tuples, one entry per sample.
        self.samples = list(zip(texts, labels))

    def __getitem__(self, idx):
        """Return the ``(text, class_idx)`` tuple stored at position ``idx``."""
        return self.samples[idx]

    def __len__(self):
        """Return the number of samples in the dataset."""
        return len(self.samples)

# Two independently generated synthetic datasets: one for training/validation
# and one for testing (the training one is created first).
train_dataset, test_dataset = DatasetFicticio(), DatasetFicticio()

import torch.nn as nn

class MyModel(nn.Module):
    """Single linear layer that maps input features to class scores.

    Args:
        in_features: Size of each input vector (default 10).
        num_classes: Number of output scores per sample (default 5).
    """

    def __init__(self, in_features=10, num_classes=5):
        super(MyModel, self).__init__()
        self.linear = nn.Linear(in_features, num_classes)
        # Built once instead of on every loss() call; it holds no parameters,
        # so the model's state_dict is unaffected.
        self.criterion = nn.CrossEntropyLoss()

    def forward(self, x):
        """Return the unnormalised class scores for the batch ``x``."""
        return self.linear(x)

    def loss(self, scores, target):
        """Return the cross-entropy loss between ``scores`` and ``target``."""
        return self.criterion(scores, target)

model = MyModel()

The autoreload extension is already loaded. To reload it, use:
  %reload_ext autoreload


In [12]:
# Build the three dataloaders with the default batch size and validation fraction.
(train_dataloader,
 val_dataloader,
 test_dataloader) = generate_data_batches(train_dataset, test_dataset)

# How the training data is fetched and on which device it is placed:
use_gpu = True
device = torch.device('cuda' if torch.cuda.is_available() and use_gpu else 'cpu')

data = {
    'device': device,
    'input_dtype': torch.float,
    'target_dtype': torch.long,
    'train_dataloader': train_dataloader,
    'val_dataloader': val_dataloader,
}

# Model hyperparameters and related settings:
epochs = 10              # Number of epochs
sample_loss_every = 100  # Number of iterations between validation-accuracy evaluations
learning_rate = 1e-1     # Learning rate

# Training:
performance_history = TrainModel(model, data, epochs, learning_rate, sample_loss_every)

Epoch: 0, Batch number: 0
Accuracy on validation dataset: 614/2400 

Epoch: 0, Batch number: 100
Accuracy on validation dataset: 588/2400 

Epoch: 0, Batch number: 200
Accuracy on validation dataset: 575/2400 

Epoch: 0, Batch number: 300
Accuracy on validation dataset: 627/2400 

Epoch: 0, Batch number: 400
Accuracy on validation dataset: 627/2400 

Epoch: 0, Batch number: 500
Accuracy on validation dataset: 584/2400 

Epoch: 0, Batch number: 600
Accuracy on validation dataset: 581/2400 

Epoch: 0, Batch number: 700
Accuracy on validation dataset: 626/2400 

Epoch: 0, Batch number: 800
Accuracy on validation dataset: 589/2400 

Epoch: 0, Batch number: 900
Accuracy on validation dataset: 614/2400 

Epoch: 0, Batch number: 1000
Accuracy on validation dataset: 614/2400 

Epoch: 0, Batch number: 1100
Accuracy on validation dataset: 625/2400 

Epoch: 0, Batch number: 1200
Accuracy on validation dataset: 583/2400 

Epoch: 0, Batch number: 1300
Accuracy on validation dataset: 628/2400 

Epoc