# Resumen Parcial de NLP

Este resumen muestra una de las metodologías utilizadas actualmente para diseñar un algoritmo de NLP. En este caso, la tarea a resolver será la de clasificación de texto. 

Esta metodología consiste en lo siguiente: para una tarea determinada, primero se realiza una extracción del significado de los elementos que componen el texto y luego se diseña un modelo que toma como entrada dicha representación del significado. Este modelo es el que realiza la tarea propiamente dicha, pero no lo hace sobre los símbolos que componen el texto original, sino sobre una representación de los mismos en la que pueden verse algunas relaciones semánticas entre ellos. 

Por ejemplo, para el [corpus de texto *Brown*](https://www.nltk.org/book/ch02.html), que contiene textos de 15 categorías diferentes (news, editorial, reviews, religion, hobbies, lore, belles lettres, government, learned, fiction, mystery, science fiction, adventure, romance, humor), se buscará realizar un algoritmo que clasifique una frase en alguna de estas categorías. Este procedimiento se realiza de dos maneras distintas:

* Se entrena un clasificador de una capa (con salida Softmax) que tiene como entrada una frase, y como salida la probabilidad de cada una de las categorías mencionadas anteriormente.

* Se entrena el mismo clasificador que antes, con la diferencia de que la entrada del modelo no son las palabras que componen la frase, sino la representación del significado de cada una de esas palabras (*word embedding*).

**TODO: EXPLICAR UN POCO MEJOR QUE EL SIGNIFICADO SE EXTRAE CON UN MODELO DE LENGUAJE.**

In [1]:
# Importamos PyTorch
import torch

# Importamos la libraría de utils de NLP
import sys
sys.path.insert(1, '../')
from nlp_utils import *

%load_ext autoreload
%autoreload 2

[nltk_data] Downloading package brown to
[nltk_data]     /home/lestien/Documents/BecaNLP/Programs/nltk_data...
[nltk_data]   Package brown is already up-to-date!


## Primera forma (sin extracción del significado)

In [16]:
class BrownClassificationDataset(torch.utils.data.Dataset):
    """Sentence-classification dataset read from ``train.txt`` / ``test.txt``.

    Each file is a sequence of records of the form
    ``tok<TS>tok...<BEGINLABEL>label<ENDLABEL>``. Both files are always read:
    one supplies the samples served by this dataset, the other only
    contributes to the vocabulary and the category list. Both splits go
    through the same preprocessing and every index table is built from a
    sorted sequence, so a train instance and a test instance created from
    the same files agree on every word index and category index.

    Attributes:
        root: prefix that is concatenated with the two file names.
        samples: list of ``(tokens, label)`` pairs served by this dataset.
        vocabulary: set of every token found in either split plus ``'<NS>'``.
        word_to_index: token -> index, following the sorted vocabulary.
        index_to_word: inverse of ``word_to_index``.
        size_of_longest_sample: token count of the longest served sample;
            every item is padded to this length.
        categories: sorted list of the labels found in either split.
    """

    @staticmethod
    def _parse_samples(raw_text):
        """Split raw file contents into a list of ``(tokens, label)`` pairs."""
        records = raw_text.split('<ENDLABEL>')
        samples = []
        # Whatever follows the last <ENDLABEL> is trailing text, not a record.
        for record in records[:-1]:
            fields = record.split('<BEGINLABEL>')
            samples.append((fields[0].split('<TS>'), fields[1]))
        return samples

    def __init__(self, root, preprocessing, train=True):
        """Load both splits and build the vocabulary and index tables.

        Args:
            root: prefix prepended (by string concatenation) to
                ``'train.txt'`` and ``'test.txt'``.
            preprocessing: callable mapping a list of ``(tokens, label)``
                pairs to a list of the same form, or a falsy value to keep
                the raw tokens.
            train: serve the samples of ``train.txt`` when true, those of
                ``test.txt`` otherwise.
        """
        self.root = root

        # Context managers close the files even if reading fails.
        with open(root + 'train.txt', 'r') as file_train:
            text_train = file_train.read()
        with open(root + 'test.txt', 'r') as file_test:
            text_test = file_test.read()

        if train:
            own_text, other_text = text_train, text_test
        else:
            own_text, other_text = text_test, text_train

        samples = self._parse_samples(own_text)
        extended_samples = self._parse_samples(other_text)
        if preprocessing:
            samples = preprocessing(samples)
            # The other split must be normalised the same way; otherwise the
            # vocabulary depends on which split is being served.
            extended_samples = preprocessing(extended_samples)

        self.samples = samples
        all_samples = list(samples) + list(extended_samples)

        self.vocabulary = {word for sample in all_samples for word in sample[0]}
        self.vocabulary.add('<NS>')
        # Sorting makes the index assignment reproducible; set iteration
        # order for strings changes between interpreter runs.
        ordered_words = sorted(self.vocabulary)
        self.word_to_index = {w: idx for (idx, w) in enumerate(ordered_words)}
        self.index_to_word = dict(enumerate(ordered_words))

        self.size_of_longest_sample = max(
            (len(sample[0]) for sample in self.samples), default=0)
        self.categories = sorted({sample[1] for sample in all_samples})

    def __len__(self):
        """Return the number of served samples."""
        return len(self.samples)

    def __getitem__(self, idx):
        """Return ``(word_indices, category_index)`` for sample ``idx``.

        ``word_indices`` is a 1-D long tensor of length
        ``size_of_longest_sample``, right-padded with the index of
        ``'<NS>'``; ``category_index`` is a 0-dim long tensor.
        """
        sentence, category = self.samples[idx]
        indices = [self.word_to_index[word] for word in sentence]
        padding = self.size_of_longest_sample - len(indices)
        indices.extend([self.word_to_index['<NS>']] * padding)

        idx_sentence = torch.tensor(indices, dtype=torch.long)
        idx_category = torch.tensor(self.categories.index(category), dtype=torch.long)
        return idx_sentence, idx_category
    
    
# TODO: INVESTIGAR Y HACER UN BUEN PREPROCESAMIENTO!!
class PreprocessBrown(object):
    """Callable that normalises the tokens of ``(tokens, label)`` samples.

    For every sample the tokens are joined with spaces, lower-cased, newlines
    are turned into spaces, every character other than ``a``-``z`` and space
    is dropped, and the result is split on whitespace again.
    """

    def __call__(self, samples):
        """Return a new list of ``(clean_tokens, label)`` pairs.

        Args:
            samples: iterable of pairs whose first element is a list of
                string tokens and whose second element is the label.

        Returns:
            A list with one ``(clean_tokens, label)`` pair per input sample;
            labels are passed through untouched.
        """
        samples_processed = []
        for sample in samples:
            tokens, label = sample[0], sample[1]
            text = ' '.join(tokens).lower()
            # str.replace returns a new string; the result has to be kept,
            # otherwise the regex below deletes the newline and glues the
            # two neighbouring words together.
            text = text.replace('\n', ' ')
            text = re.sub('[^a-z ]+', '', text)
            # split() without arguments never yields empty strings.
            samples_processed.append((text.split(), label))
        return samples_processed
    

# One dataset object per role. Training and validation both read the
# training file; they are kept apart only by the index ranges given to the
# samplers further down.
train_dataset = BrownClassificationDataset('./', PreprocessBrown(), train=True)
val_dataset = BrownClassificationDataset('./', PreprocessBrown(), train=True)
test_dataset = BrownClassificationDataset('./', PreprocessBrown(), train=False)

from torch.utils.data import SubsetRandomSampler, DataLoader

val_size = .02    # fraction of the training file held out for validation
batch_size = 918

# The first NUM_TRAIN samples are used for training, the remaining NUM_VAL
# (the tail of the file) for validation.
NUM_TRAIN = int((1 - val_size) * len(train_dataset))
NUM_VAL = len(train_dataset) - NUM_TRAIN

train_sampler = SubsetRandomSampler(range(NUM_TRAIN))
val_sampler = SubsetRandomSampler(range(NUM_TRAIN, NUM_TRAIN + NUM_VAL))

train_dataloader = DataLoader(train_dataset, batch_size=batch_size, sampler=train_sampler)
val_dataloader = DataLoader(val_dataset, batch_size=batch_size, sampler=val_sampler)

In [17]:
import torch.optim as optim

def CheckAccuracy(loader, model, device, input_dtype, target_dtype):
    """Count how many samples of ``loader`` the model classifies correctly.

    The model is evaluated in eval mode with gradients disabled, and is put
    back into whichever mode (train or eval) it was in when the function was
    called, so calling this in the middle of a training loop does not
    silently leave the model in eval mode.

    Args:
        loader: iterable yielding ``(x, y)`` batches of tensors.
        model: callable module; ``model(x)`` must return per-class scores
            with the classes along dimension 1.
        device: device the batches are moved to.
        input_dtype: dtype the inputs are cast to.
        target_dtype: dtype the targets are cast to.

    Returns:
        ``(num_correct, num_samples)`` as plain Python ints; ``(0, 0)`` when
        the loader yields nothing.
    """
    num_correct = 0
    num_samples = 0
    was_training = model.training
    model.eval()  # set model to evaluation mode
    try:
        with torch.no_grad():
            for x, y in loader:
                x = x.to(device=device, dtype=input_dtype)  # move to device, e.g. GPU
                y = y.to(device=device, dtype=target_dtype)

                scores = model(x)
                _, preds = scores.max(1)
                # .item() keeps the counter a Python int instead of a tensor
                # living on the evaluation device.
                num_correct += (preds == y).sum().item()
                num_samples += preds.size(0)
    finally:
        model.train(was_training)

    return num_correct, num_samples
        

def TrainClassifier(model, data, epochs=1, learning_rate=1e-2, sample_loss_every=100, lm='CBOW'):
    """Train ``model`` with plain SGD, periodically reporting validation accuracy.

    Args:
        model: module that is callable on a batch and exposes a
            ``loss(scores, target)`` method returning a scalar tensor.
        data: dict with the keys ``'input_dtype'``, ``'target_dtype'``,
            ``'device'``, ``'train_dataloader'`` and ``'val_dataloader'``.
        epochs: number of passes over the training dataloader.
        learning_rate: learning rate handed to ``optim.SGD``.
        sample_loss_every: validation runs on every batch whose index within
            its epoch is a multiple of this value.
        lm: not used by this function; kept so existing calls keep working.

    Returns:
        Dict with three parallel lists: ``'iter'`` (batch index within its
        epoch), ``'loss'`` (training loss of that batch) and ``'accuracy'``
        (validation accuracy measured right after that batch).
    """
    input_dtype = data['input_dtype']
    target_dtype = data['target_dtype']
    device = data['device']
    train_dataloader = data['train_dataloader']
    val_dataloader = data['val_dataloader']

    performance_history = {'iter': [], 'loss': [], 'accuracy': []}

    model = model.to(device=device)
    model.train()
    optimizer = optim.SGD(model.parameters(), lr=learning_rate)
    for e in range(epochs):
        for t, (x, y) in enumerate(train_dataloader):
            x = x.to(device=device, dtype=input_dtype)
            y = y.to(device=device, dtype=target_dtype)

            scores = model(x)  # Forward pass
            loss = model.loss(scores, y)  # Loss of this batch

            optimizer.zero_grad()
            loss.backward()  # Backward pass
            optimizer.step()

            if t % sample_loss_every == 0:
                num_correct, num_samples = CheckAccuracy(val_dataloader, model, device, input_dtype, target_dtype)
                # Evaluation switches the model to eval mode; switch back so
                # the remaining batches are trained in train mode.
                model.train()
                performance_history['iter'].append(t)
                performance_history['loss'].append(loss.item())
                performance_history['accuracy'].append(float(num_correct) / num_samples)
                print('Epoch: %d, Iteration: %d, Accuracy: %d/%d, loss: %.4f' % (e, t, num_correct, num_samples, loss.item()))

    num_correct, num_samples = CheckAccuracy(val_dataloader, model, device, input_dtype, target_dtype)
    print('Final accuracy: %.2f%%' % (100 * float(num_correct) / num_samples) )

    return performance_history

In [18]:
import torch.nn as nn

class LinearSoftmaxClassifier(nn.Module):
    """Single-layer classifier over word indices.

    Every word index selects one row of an embedding table that has one
    column per category; the rows selected for a sentence are averaged into
    the per-category scores of that sentence.
    """

    def __init__(self, vocab_size, n_categories):
        """Create the ``vocab_size`` x ``n_categories`` score table."""
        super().__init__()
        self.linear = nn.Embedding(num_embeddings=vocab_size, embedding_dim=n_categories)

    def forward(self, sentence):
        """Return the scores of ``sentence`` averaged over dimension 1.

        Assumes ``sentence`` holds word indices laid out as
        (batch, position) -- TODO confirm against the dataloader.
        """
        per_word_scores = self.linear(sentence)
        return per_word_scores.mean(dim=1)

    def loss(self, scores, target):
        """Return the cross-entropy between ``scores`` and ``target``."""
        criterion = nn.CrossEntropyLoss()
        return criterion(scores, target)
    
# Size the classifier from the training dataset: one table row per
# vocabulary entry, one score column per category.
vocab_size, n_categories = len(train_dataset.vocabulary), len(train_dataset.categories)
model = LinearSoftmaxClassifier(vocab_size=vocab_size, n_categories=n_categories)

In [19]:
# How the training data is fed to the model: use the GPU when one is
# available, otherwise the CPU.
device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')

data = dict(
    device=device,
    input_dtype=torch.long,
    target_dtype=torch.long,
    train_dataloader=train_dataloader,
    val_dataloader=val_dataloader,
)

# Model hyperparameters and related settings:
epochs = 10             # number of epochs
sample_loss_every = 50  # iterations between accuracy measurements
learning_rate = 1e-6    # learning rate

# Training:
performance_history = TrainClassifier(
    model,
    data,
    epochs=epochs,
    learning_rate=learning_rate,
    sample_loss_every=sample_loss_every,
)

Epoch: 0, Iteration: 0, Accuracy: 0/918, loss: 3.4561
Epoch: 1, Iteration: 0, Accuracy: 0/918, loss: 3.4444
Epoch: 2, Iteration: 0, Accuracy: 0/918, loss: 3.4971
Epoch: 3, Iteration: 0, Accuracy: 0/918, loss: 3.4490
Epoch: 4, Iteration: 0, Accuracy: 0/918, loss: 3.4957
Epoch: 5, Iteration: 0, Accuracy: 0/918, loss: 3.4712
Epoch: 6, Iteration: 0, Accuracy: 0/918, loss: 3.4821
Epoch: 7, Iteration: 0, Accuracy: 0/918, loss: 3.4626
Epoch: 8, Iteration: 0, Accuracy: 0/918, loss: 3.4708
Epoch: 9, Iteration: 0, Accuracy: 0/918, loss: 3.4894
Final accuracy: 0.00%


## Segunda forma (con extracción del significado)

In [None]:
from TrainWordVectors import *

# How the training data is fed to the model: use the GPU when one is
# available, otherwise the CPU.
device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')

data = dict(
    device=device,
    input_dtype=torch.long,
    target_dtype=torch.long,
    train_dataloader=train_dataloader,
    val_dataloader=val_dataloader,
)

# Model hyperparameters and related settings:
epochs = 5               # number of epochs
sample_loss_every = 200  # iterations between accuracy measurements
learning_rate = 1e-2     # learning rate

# Training:
# NOTE(review): `model` and the dataloaders are whatever earlier cells left
# bound (the classifier and the classification batches) -- confirm that this
# is what TrainWord2Vec expects.
embeddings = TrainWord2Vec(model, data, epochs, learning_rate, sample_loss_every, lm='CBOW')

In [None]:
class BrownClassificationDataset(torch.utils.data.Dataset):
    """Sentence-classification dataset read from ``train.txt`` / ``test.txt``.

    Each file is a sequence of records of the form
    ``tok<TS>tok...<BEGINLABEL>label<ENDLABEL>``. Both files are always read:
    one supplies the samples served by this dataset, the other only
    contributes to the vocabulary and the category list. Both splits go
    through the same preprocessing and every index table is built from a
    sorted sequence, so a train instance and a test instance created from
    the same files agree on every word index and category index.

    Attributes:
        root: prefix that is concatenated with the two file names.
        samples: list of ``(tokens, label)`` pairs served by this dataset.
        vocabulary: set of every token found in either split plus ``'<NS>'``.
        word_to_index: token -> index, following the sorted vocabulary.
        index_to_word: inverse of ``word_to_index``.
        size_of_longest_sample: token count of the longest served sample;
            every item is padded to this length.
        categories: sorted list of the labels found in either split.
    """

    @staticmethod
    def _parse_samples(raw_text):
        """Split raw file contents into a list of ``(tokens, label)`` pairs."""
        records = raw_text.split('<ENDLABEL>')
        samples = []
        # Whatever follows the last <ENDLABEL> is trailing text, not a record.
        for record in records[:-1]:
            fields = record.split('<BEGINLABEL>')
            samples.append((fields[0].split('<TS>'), fields[1]))
        return samples

    def __init__(self, root, preprocessing, train=True):
        """Load both splits and build the vocabulary and index tables.

        Args:
            root: prefix prepended (by string concatenation) to
                ``'train.txt'`` and ``'test.txt'``.
            preprocessing: callable mapping a list of ``(tokens, label)``
                pairs to a list of the same form, or a falsy value to keep
                the raw tokens.
            train: serve the samples of ``train.txt`` when true, those of
                ``test.txt`` otherwise.
        """
        self.root = root

        # Context managers close the files even if reading fails.
        with open(root + 'train.txt', 'r') as file_train:
            text_train = file_train.read()
        with open(root + 'test.txt', 'r') as file_test:
            text_test = file_test.read()

        if train:
            own_text, other_text = text_train, text_test
        else:
            own_text, other_text = text_test, text_train

        samples = self._parse_samples(own_text)
        extended_samples = self._parse_samples(other_text)
        if preprocessing:
            samples = preprocessing(samples)
            # The other split must be normalised the same way; otherwise the
            # vocabulary depends on which split is being served.
            extended_samples = preprocessing(extended_samples)

        self.samples = samples
        all_samples = list(samples) + list(extended_samples)

        self.vocabulary = {word for sample in all_samples for word in sample[0]}
        self.vocabulary.add('<NS>')
        # Sorting makes the index assignment reproducible; set iteration
        # order for strings changes between interpreter runs.
        ordered_words = sorted(self.vocabulary)
        self.word_to_index = {w: idx for (idx, w) in enumerate(ordered_words)}
        self.index_to_word = dict(enumerate(ordered_words))

        self.size_of_longest_sample = max(
            (len(sample[0]) for sample in self.samples), default=0)
        self.categories = sorted({sample[1] for sample in all_samples})

    def __len__(self):
        """Return the number of served samples."""
        return len(self.samples)

    def __getitem__(self, idx):
        """Return ``(word_indices, category_index)`` for sample ``idx``.

        ``word_indices`` is a 1-D long tensor of length
        ``size_of_longest_sample``, right-padded with the index of
        ``'<NS>'``; ``category_index`` is a 0-dim long tensor.
        """
        sentence, category = self.samples[idx]
        indices = [self.word_to_index[word] for word in sentence]
        padding = self.size_of_longest_sample - len(indices)
        indices.extend([self.word_to_index['<NS>']] * padding)

        idx_sentence = torch.tensor(indices, dtype=torch.long)
        idx_category = torch.tensor(self.categories.index(category), dtype=torch.long)
        return idx_sentence, idx_category
    
    
# TODO: INVESTIGAR Y HACER UN BUEN PREPROCESAMIENTO!!
class PreprocessBrown(object):
    """Callable that normalises the tokens of ``(tokens, label)`` samples.

    For every sample the tokens are joined with spaces, lower-cased, newlines
    are turned into spaces, every character other than ``a``-``z`` and space
    is dropped, and the result is split on whitespace again.
    """

    def __call__(self, samples):
        """Return a new list of ``(clean_tokens, label)`` pairs.

        Args:
            samples: iterable of pairs whose first element is a list of
                string tokens and whose second element is the label.

        Returns:
            A list with one ``(clean_tokens, label)`` pair per input sample;
            labels are passed through untouched.
        """
        samples_processed = []
        for sample in samples:
            tokens, label = sample[0], sample[1]
            text = ' '.join(tokens).lower()
            # str.replace returns a new string; the result has to be kept,
            # otherwise the regex below deletes the newline and glues the
            # two neighbouring words together.
            text = text.replace('\n', ' ')
            text = re.sub('[^a-z ]+', '', text)
            # split() without arguments never yields empty strings.
            samples_processed.append((text.split(), label))
        return samples_processed
    

# One dataset object per role. Training and validation both read the
# training file; they are kept apart only by the index ranges given to the
# samplers further down.
train_dataset = BrownClassificationDataset('./', PreprocessBrown(), train=True)
val_dataset = BrownClassificationDataset('./', PreprocessBrown(), train=True)
test_dataset = BrownClassificationDataset('./', PreprocessBrown(), train=False)

from torch.utils.data import SubsetRandomSampler, DataLoader

val_size = .02    # fraction of the training file held out for validation
batch_size = 918

# The first NUM_TRAIN samples are used for training, the remaining NUM_VAL
# (the tail of the file) for validation.
NUM_TRAIN = int((1 - val_size) * len(train_dataset))
NUM_VAL = len(train_dataset) - NUM_TRAIN

train_sampler = SubsetRandomSampler(range(NUM_TRAIN))
val_sampler = SubsetRandomSampler(range(NUM_TRAIN, NUM_TRAIN + NUM_VAL))

train_dataloader = DataLoader(train_dataset, batch_size=batch_size, sampler=train_sampler)
val_dataloader = DataLoader(val_dataset, batch_size=batch_size, sampler=val_sampler)

In [None]:
import torch.optim as optim

def CheckAccuracy(loader, model, device, input_dtype, target_dtype):
    """Count how many samples of ``loader`` the model classifies correctly.

    The model is evaluated in eval mode with gradients disabled, and is put
    back into whichever mode (train or eval) it was in when the function was
    called, so calling this in the middle of a training loop does not
    silently leave the model in eval mode.

    Args:
        loader: iterable yielding ``(x, y)`` batches of tensors.
        model: callable module; ``model(x)`` must return per-class scores
            with the classes along dimension 1.
        device: device the batches are moved to.
        input_dtype: dtype the inputs are cast to.
        target_dtype: dtype the targets are cast to.

    Returns:
        ``(num_correct, num_samples)`` as plain Python ints; ``(0, 0)`` when
        the loader yields nothing.
    """
    num_correct = 0
    num_samples = 0
    was_training = model.training
    model.eval()  # set model to evaluation mode
    try:
        with torch.no_grad():
            for x, y in loader:
                x = x.to(device=device, dtype=input_dtype)  # move to device, e.g. GPU
                y = y.to(device=device, dtype=target_dtype)

                scores = model(x)
                _, preds = scores.max(1)
                # .item() keeps the counter a Python int instead of a tensor
                # living on the evaluation device.
                num_correct += (preds == y).sum().item()
                num_samples += preds.size(0)
    finally:
        model.train(was_training)

    return num_correct, num_samples
        

def TrainClassifier(model, data, epochs=1, learning_rate=1e-2, sample_loss_every=100, lm='CBOW'):
    """Train ``model`` with plain SGD, periodically reporting validation accuracy.

    Args:
        model: module that is callable on a batch and exposes a
            ``loss(scores, target)`` method returning a scalar tensor.
        data: dict with the keys ``'input_dtype'``, ``'target_dtype'``,
            ``'device'``, ``'train_dataloader'`` and ``'val_dataloader'``.
        epochs: number of passes over the training dataloader.
        learning_rate: learning rate handed to ``optim.SGD``.
        sample_loss_every: validation runs on every batch whose index within
            its epoch is a multiple of this value.
        lm: not used by this function; kept so existing calls keep working.

    Returns:
        Dict with three parallel lists: ``'iter'`` (batch index within its
        epoch), ``'loss'`` (training loss of that batch) and ``'accuracy'``
        (validation accuracy measured right after that batch).
    """
    input_dtype = data['input_dtype']
    target_dtype = data['target_dtype']
    device = data['device']
    train_dataloader = data['train_dataloader']
    val_dataloader = data['val_dataloader']

    performance_history = {'iter': [], 'loss': [], 'accuracy': []}

    model = model.to(device=device)
    model.train()
    optimizer = optim.SGD(model.parameters(), lr=learning_rate)
    for e in range(epochs):
        for t, (x, y) in enumerate(train_dataloader):
            x = x.to(device=device, dtype=input_dtype)
            y = y.to(device=device, dtype=target_dtype)

            scores = model(x)  # Forward pass
            loss = model.loss(scores, y)  # Loss of this batch

            optimizer.zero_grad()
            loss.backward()  # Backward pass
            optimizer.step()

            if t % sample_loss_every == 0:
                num_correct, num_samples = CheckAccuracy(val_dataloader, model, device, input_dtype, target_dtype)
                # Evaluation switches the model to eval mode; switch back so
                # the remaining batches are trained in train mode.
                model.train()
                performance_history['iter'].append(t)
                performance_history['loss'].append(loss.item())
                performance_history['accuracy'].append(float(num_correct) / num_samples)
                print('Epoch: %d, Iteration: %d, Accuracy: %d/%d, loss: %.4f' % (e, t, num_correct, num_samples, loss.item()))

    num_correct, num_samples = CheckAccuracy(val_dataloader, model, device, input_dtype, target_dtype)
    print('Final accuracy: %.2f%%' % (100 * float(num_correct) / num_samples) )

    return performance_history

In [None]:
import torch.nn as nn

class LinearSoftmaxClassifier(nn.Module):
    """Single-layer classifier over word indices.

    Every word index selects one row of an embedding table that has one
    column per category; the rows selected for a sentence are averaged into
    the per-category scores of that sentence.
    """

    def __init__(self, vocab_size, n_categories):
        """Create the ``vocab_size`` x ``n_categories`` score table."""
        super().__init__()
        self.linear = nn.Embedding(num_embeddings=vocab_size, embedding_dim=n_categories)

    def forward(self, sentence):
        """Return the scores of ``sentence`` averaged over dimension 1.

        Assumes ``sentence`` holds word indices laid out as
        (batch, position) -- TODO confirm against the dataloader.
        """
        per_word_scores = self.linear(sentence)
        return per_word_scores.mean(dim=1)

    def loss(self, scores, target):
        """Return the cross-entropy between ``scores`` and ``target``."""
        criterion = nn.CrossEntropyLoss()
        return criterion(scores, target)
    
# Size the classifier from the training dataset: one table row per
# vocabulary entry, one score column per category.
vocab_size, n_categories = len(train_dataset.vocabulary), len(train_dataset.categories)
model = LinearSoftmaxClassifier(vocab_size=vocab_size, n_categories=n_categories)

In [None]:
# How the training data is fed to the model: use the GPU when one is
# available, otherwise the CPU.
device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')

data = dict(
    device=device,
    input_dtype=torch.long,
    target_dtype=torch.long,
    train_dataloader=train_dataloader,
    val_dataloader=val_dataloader,
)

# Model hyperparameters and related settings:
epochs = 10             # number of epochs
sample_loss_every = 50  # iterations between accuracy measurements
learning_rate = 1e-6    # learning rate

# Training:
performance_history = TrainClassifier(
    model,
    data,
    epochs=epochs,
    learning_rate=learning_rate,
    sample_loss_every=sample_loss_every,
)