In [1]:
import os

import pandas as pd
import torch
from TorchDataUtils import *
from NLPDataUtils import *

%matplotlib notebook
import matplotlib.pyplot as plt

%load_ext autoreload
%autoreload 2

# Clasificación sin WordVectors

In [4]:
class AGNewsDataset(torch.utils.data.Dataset):
    """Map-style dataset over the titles of the AG News CSV files.

    Each item is the pair ``(vectorized_title, class_index)``. The vectorizer
    is not built here: assign an object exposing ``vectorize(tokens)`` to the
    ``vectorizer`` attribute before indexing the dataset.
    """

    def __init__(self, root='./AG_NEWS/', preprocess=lambda x: x, train=True):
        """Read one CSV split and preprocess its titles.

        Args:
            root: Directory containing ``train.csv`` and ``test.csv``. A
                trailing path separator is optional.
            preprocess: Callable applied to the ``title`` column (a pandas
                Series); its result is stored in ``self.data``.
            train: Read ``train.csv`` when true, ``test.csv`` otherwise.
        """
        target = 'train.csv' if train else 'test.csv'
        # Join the path instead of concatenating strings so that a root
        # without a trailing separator ('./AG_NEWS') still resolves correctly.
        df = pd.read_csv(os.path.join(root, target), header=None,
                         names=['class_idx', 'title', 'description'])

        # Labels: the stored class indices are shifted down by one
        # (presumably the CSV uses 1-based classes -- confirm).
        self.cls_indeces = torch.tensor(df['class_idx'].tolist(), dtype=torch.long) - 1

        # Series with the (preprocessed) input samples:
        self.data = preprocess(df['title'])

        # Set by the caller after construction; kept explicit so that a
        # missing vectorizer is reported clearly in __getitem__.
        self.vectorizer = None

    def __getitem__(self, idx):
        """Return ``(vectorized title, class index)`` for position ``idx``.

        Raises:
            RuntimeError: If no vectorizer has been assigned yet.
        """
        # Samplers may hand over the index as a tensor.
        if isinstance(idx, torch.Tensor):
            idx = idx.item()
        vectorizer = getattr(self, 'vectorizer', None)
        if vectorizer is None:
            raise RuntimeError('AGNewsDataset.vectorizer must be assigned before indexing the dataset')
        return vectorizer.vectorize(self.data.iloc[idx]), self.cls_indeces[idx]

    def __len__(self):
        """Number of samples (one per label)."""
        return len(self.cls_indeces)
    
    
def GetAGNewsDataset(root, preprocess, cutoff=25, batch_size=64):
    """Build the train/validation/test dataloaders for title classification.

    Args:
        root: Directory with the AG News CSV files.
        preprocess: Callable applied to the title column of each split.
        cutoff: Forwarded to ``Vectorizer`` (presumably a token-frequency
            threshold -- confirm against ``Vectorizer``).
        batch_size: Batch size forwarded to ``generate_data_batches``.

    Returns:
        Tuple ``(train_dataloader, val_dataloader, test_dataloader)``.
    """
    # Datasets: the vectorizer is built from the training split only and
    # shared with the test split.
    train_dataset = AGNewsDataset(root, preprocess=preprocess, train=True)
    train_dataset.vectorizer = Vectorizer([train_dataset.data], cutoff=cutoff)
    test_dataset = AGNewsDataset(root, preprocess=preprocess, train=False)
    test_dataset.vectorizer = train_dataset.vectorizer

    # Dataloaders:
    train_dataloader, val_dataloader, test_dataloader = generate_data_batches(
        train_dataset,
        test_dataset,
        batch_size=batch_size,
    )
    return train_dataloader, val_dataloader, test_dataloader

def preprocess(data):
    """Clean and tokenize a pandas Series of headline strings.

    Removes news-agency tags, a handful of stop words and some punctuation,
    then splits on whitespace, with a possessive ``'s`` emitted as its own
    token.

    Args:
        data: pandas Series of strings.

    Returns:
        pandas Series whose elements are lists of tokens. Leading or trailing
        whitespace in a cleaned string yields an empty-string token.
    """
    # Every pattern below is a regular expression. ``regex=True`` is passed
    # explicitly because pandas >= 2.0 treats the pattern as a literal string
    # by default, which would silently disable all of these substitutions.
    removal_patterns = [
        r'\(AP\)',
        r'\(Reuters\)',
        r'\(AFP\)',
        r'\(SPACE\.com\)',
        r'\ba\b',
        r'\bthe\b',
        r'\bis\b',
        r'\bof\b',
        r'\bto\b',
        r'[,:;\?\!\"]',
    ]
    df = data
    for pattern in removal_patterns:
        df = df.str.replace(pattern, '', regex=True)

    # Collapse whitespace runs into an explicit separator marker.
    df = df.str.replace(r'\s+', '<SEP>', regex=True)
    # Detach a possessive 's so it becomes a separate token (plain literal match).
    df = df.str.replace("'s<SEP>", "<SEP>'s<SEP>", regex=False)
    return df.str.split('<SEP>')

In [5]:
# Dataloaders for the title-classification task.
tc_train_dataloader, tc_val_dataloader, tc_test_dataloader = GetAGNewsDataset(
    root='./AG_NEWS/',
    preprocess=preprocess,
    cutoff=25,
)

import torch.nn as nn

class TextClassifier1(nn.Module):
    """Two stacked linear layers: an embedding projection followed by a class scorer."""

    def __init__(self, vocab_size, n_embeddings, n_classes):
        """Create the ``emb`` (vocab_size -> n_embeddings) and ``out`` (n_embeddings -> n_classes) layers."""
        super().__init__()
        self.emb = nn.Linear(vocab_size, n_embeddings)
        self.out = nn.Linear(n_embeddings, n_classes)

    def forward(self, x):
        """Return the class scores for input ``x``."""
        hidden = self.emb(x)
        return self.out(hidden)

    def loss(self, scores, target):
        """Cross-entropy between ``scores`` and the ``target`` class indices."""
        return nn.functional.cross_entropy(scores, target)

In [None]:
# Sample settings handed to SGDTrainModel:
data = {
    # Use the GPU only when one is present so the cell also runs on CPU-only machines.
    'use_gpu': torch.cuda.is_available(),
    'input_dtype': torch.float,  # dtype of the input samples
    'target_dtype': torch.long,  # dtype of the target samples
    'train_dataloader': tc_train_dataloader,  # Training dataloader
    'val_dataloader': tc_val_dataloader  # Validation dataloader
}

# Optimization settings:
epochs = 20  # Number of epochs
sample_loss_every = 500  # Number of iterations between accuracy checks
learning_rate = [1, 5e-1]  # Learning rates to sweep
check_on_train = False  # Whether to also report results on the training set

# Training:
performance_history = []
classification_models = []

vocab_size = len(tc_train_dataloader.dataset.vectorizer.vocabulary)
n_classes = 4
n_embeddings = 100
# Reference model: every run of the sweep starts from a copy of its weights.
ClassifierModel = TextClassifier1(vocab_size, n_embeddings, n_classes)

for lr in learning_rate:
    model = TextClassifier1(vocab_size, n_embeddings, n_classes)
    model.load_state_dict(ClassifierModel.state_dict())
    performance_history.append(SGDTrainModel(model,
                                             data,
                                             epochs,
                                             lr,
                                             sample_loss_every,
                                             check_on_train,
                                             verbose=True))
    classification_models.append(model)
    print('lr={:.2g} completed.'.format(lr))

Epoch: 0, Batch number: 0
Accuracy on validation dataset: 645/2400 (26.88%)

Epoch: 0, Batch number: 500
Accuracy on validation dataset: 1585/2400 (66.04%)

Epoch: 0, Batch number: 1000
Accuracy on validation dataset: 1914/2400 (79.75%)

Epoch: 0, Batch number: 1500
Accuracy on validation dataset: 1911/2400 (79.62%)

Epoch: 1, Batch number: 162
Accuracy on validation dataset: 1917/2400 (79.88%)

Epoch: 1, Batch number: 662
Accuracy on validation dataset: 1922/2400 (80.08%)

Epoch: 1, Batch number: 1162
Accuracy on validation dataset: 1897/2400 (79.04%)

Epoch: 1, Batch number: 1662
Accuracy on validation dataset: 1942/2400 (80.92%)

Epoch: 2, Batch number: 324
Accuracy on validation dataset: 1926/2400 (80.25%)

Epoch: 2, Batch number: 824
Accuracy on validation dataset: 1844/2400 (76.83%)

Epoch: 2, Batch number: 1324
Accuracy on validation dataset: 1940/2400 (80.83%)

Epoch: 2, Batch number: 1824
Accuracy on validation dataset: 1921/2400 (80.04%)

Epoch: 3, Batch number: 486
Accuracy 

# Entrenamiento de word vectors para clasificación

## Pretraining

In [2]:
class WordEmbeddingAGNewsDataset(torch.utils.data.Dataset):
    """Dataset of (context, word) pairs extracted from AG News titles.

    ``self.data`` is a DataFrame with one row per word occurrence: column
    ``word`` holds the centre word and ``context`` the list of surrounding
    tokens. An object exposing ``vectorize(tokens)`` and
    ``vocabulary.token_to_index(token)`` must be assigned to the
    ``vectorizer`` attribute before indexing the dataset.
    """

    def __init__(self, root='./AG_NEWS/', preprocess=lambda x: x, n_window=2, train=True):
        """Read one CSV split and build the word/context table.

        Args:
            root: Directory containing ``train.csv`` and ``test.csv``. A
                trailing path separator is optional.
            preprocess: Callable applied to the ``title`` column; every
                element of its result must be a list of tokens.
            n_window: Number of context positions taken on each side of a word.
            train: Read ``train.csv`` when true, ``test.csv`` otherwise.
        """
        target = 'train.csv' if train else 'test.csv'
        # Join the path instead of concatenating strings so that a root
        # without a trailing separator still resolves correctly.
        df = pd.read_csv(os.path.join(root, target), header=None,
                         names=['class_idx', 'title', 'description'])
        data = preprocess(df['title'])
        samples = data.apply(self._get_context, n_window=n_window)
        # Flatten the per-title lists of (word, context) into one table.
        self.data = pd.DataFrame([[word, context] for sample in samples for word, context in sample],
                                 columns=['word', 'context'])

        # Set by the caller after construction; kept explicit so that a
        # missing vectorizer is reported clearly in __getitem__.
        self.vectorizer = None

    def __getitem__(self, idx):
        """Return ``(vectorized context, index of the centre word)`` for row ``idx``.

        Raises:
            RuntimeError: If no vectorizer has been assigned yet.
        """
        # Samplers may hand over the index as a tensor.
        if isinstance(idx, torch.Tensor):
            idx = idx.item()
        vectorizer = getattr(self, 'vectorizer', None)
        if vectorizer is None:
            raise RuntimeError('WordEmbeddingAGNewsDataset.vectorizer must be assigned before indexing the dataset')

        context_vector = vectorizer.vectorize(self.data['context'].iloc[idx])
        word_index = vectorizer.vocabulary.token_to_index(self.data['word'].iloc[idx])
        return context_vector, word_index

    def _get_context(self, sentence, n_window):
        """Return ``[(word, context), ...]`` for every word of ``sentence``.

        ``sentence`` is a list of tokens. Each context lists the ``n_window``
        tokens before and after the word (the word itself excluded);
        positions that fall outside the sentence are filled with ``'<NS>'``.
        """
        no_sentence = '<NS>'
        samples = []
        for i, word in enumerate(sentence):
            first_context_word_index = max(0, i - n_window)
            last_context_word_index = min(i + n_window + 1, len(sentence))

            # Padding counts are the number of window positions clipped at each end.
            left_padding = [no_sentence] * (first_context_word_index - (i - n_window))
            right_padding = [no_sentence] * ((i + n_window + 1) - last_context_word_index)

            context = left_padding + \
                      sentence[first_context_word_index:i] + \
                      sentence[i + 1:last_context_word_index] + \
                      right_padding

            samples.append((word, context))

        return samples

    def __len__(self):
        """Number of (word, context) rows."""
        return len(self.data)
    
    
def GetAGNewsWordEmbeddingsDataset(root, preprocess, n_window=2, cutoff=25, batch_size=64):
    """Build the train/validation/test dataloaders for word-embedding training.

    Args:
        root: Directory with the AG News CSV files.
        preprocess: Callable applied to the title column of each split.
        n_window: Context positions taken on each side of a word.
        cutoff: Forwarded to ``Vectorizer`` (presumably a token-frequency
            threshold -- confirm against ``Vectorizer``).
        batch_size: Batch size forwarded to ``generate_data_batches``.

    Returns:
        Tuple ``(train_dataloader, val_dataloader, test_dataloader)``.
    """
    # Datasets: n_window is forwarded (it used to be ignored in favour of a
    # literal 2). The vectorizer is built from the training words only and
    # shared with the test split.
    train_dataset = WordEmbeddingAGNewsDataset(root, preprocess=preprocess, n_window=n_window, train=True)
    train_dataset.vectorizer = Vectorizer([train_dataset.data['word']], cutoff=cutoff)
    test_dataset = WordEmbeddingAGNewsDataset(root, preprocess=preprocess, n_window=n_window, train=False)
    test_dataset.vectorizer = train_dataset.vectorizer

    # Dataloaders:
    train_dataloader, val_dataloader, test_dataloader = generate_data_batches(
        train_dataset,
        test_dataset,
        batch_size=batch_size,
    )
    return train_dataloader, val_dataloader, test_dataloader


def preprocess(data):
    """Clean and tokenize a pandas Series of headline strings.

    Removes news-agency tags, a handful of stop words and some punctuation,
    then splits on whitespace, with a possessive ``'s`` emitted as its own
    token.

    Args:
        data: pandas Series of strings.

    Returns:
        pandas Series whose elements are lists of tokens. Leading or trailing
        whitespace in a cleaned string yields an empty-string token.
    """
    # Every pattern below is a regular expression. ``regex=True`` is passed
    # explicitly because pandas >= 2.0 treats the pattern as a literal string
    # by default, which would silently disable all of these substitutions.
    removal_patterns = [
        r'\(AP\)',
        r'\(Reuters\)',
        r'\(AFP\)',
        r'\(SPACE\.com\)',
        r'\ba\b',
        r'\bthe\b',
        r'\bis\b',
        r'\bof\b',
        r'\bto\b',
        r'[,:;\?\!\"]',
    ]
    df = data
    for pattern in removal_patterns:
        df = df.str.replace(pattern, '', regex=True)

    # Collapse whitespace runs into an explicit separator marker.
    df = df.str.replace(r'\s+', '<SEP>', regex=True)
    # Detach a possessive 's so it becomes a separate token (plain literal match).
    df = df.str.replace("'s<SEP>", "<SEP>'s<SEP>", regex=False)
    return df.str.split('<SEP>')


# Dataloaders for the word-embedding (context -> word) pretraining task.
we_train_dataloader, we_val_dataloader, we_test_dataloader = GetAGNewsWordEmbeddingsDataset(
    root='./AG_NEWS/',
    preprocess=preprocess,
    cutoff=25,
)

In [3]:
import torch.nn as nn

class Word2VecCBOW(nn.Module):
    """Two stacked linear layers mapping a vocabulary-sized input to vocabulary-sized scores."""

    def __init__(self, vocab_size, n_embeddings):
        """Create the ``emb`` (vocab_size -> n_embeddings) and ``out`` (n_embeddings -> vocab_size) layers."""
        super().__init__()
        self.emb = nn.Linear(vocab_size, n_embeddings)
        self.out = nn.Linear(n_embeddings, vocab_size)

    def forward(self, x):
        """Return the per-word scores for input ``x``."""
        hidden = self.emb(x)
        return self.out(hidden)

    def loss(self, scores, target):
        """Cross-entropy between ``scores`` and the ``target`` word indices."""
        return nn.functional.cross_entropy(scores, target)
    

# Vocabulary size, read from the vectorizer of the dataset behind we_train_dataloader.
vocab_size = len(we_train_dataloader.dataset.vectorizer.vocabulary)
# Output width of the embedding (first linear) layer.
n_embeddings = 100
# Reference model; the training cell copies its state into each model it trains.
EmbeddingModel = Word2VecCBOW(vocab_size, n_embeddings)

In [4]:
# Sample settings handed to SGDTrainModel:
data = {
    # Use the GPU only when one is present so the cell also runs on CPU-only machines.
    'use_gpu': torch.cuda.is_available(),
    'input_dtype': torch.float,  # dtype of the input samples
    'target_dtype': torch.long,  # dtype of the target samples
    'train_dataloader': we_train_dataloader,  # Training dataloader
    'val_dataloader': we_val_dataloader  # Validation dataloader
}

# Optimization settings:
epochs = 20  # Number of epochs
sample_loss_every = 5000  # Number of iterations between accuracy checks
learning_rate = [5e-1, 1e-1, 5e-2]  # Learning rates to sweep
check_on_train = False  # Whether to also report results on the training set

# Training:
models = []
performance_history = []
n_classes = 4  # Not used by this cell; kept because it is a notebook-level global
for lr in learning_rate:
    # Every run of the sweep starts from a copy of EmbeddingModel's weights.
    model = Word2VecCBOW(vocab_size, n_embeddings)
    model.load_state_dict(EmbeddingModel.state_dict())
    performance_history.append(SGDTrainModel(model,
                                             data,
                                             epochs,
                                             lr,
                                             sample_loss_every,
                                             check_on_train,
                                             verbose=True))
    models.append(model)
    print('lr={:.2g} completed.'.format(lr))

Epoch: 0, Batch number: 0
Accuracy on validation dataset: 4/15547 (0.03%)

Epoch: 0, Batch number: 5000
Accuracy on validation dataset: 364/15547 (2.34%)

Epoch: 0, Batch number: 10000
Accuracy on validation dataset: 667/15547 (4.29%)

Epoch: 1, Batch number: 3097
Accuracy on validation dataset: 782/15547 (5.03%)

Epoch: 1, Batch number: 8097
Accuracy on validation dataset: 898/15547 (5.78%)

Epoch: 2, Batch number: 1194
Accuracy on validation dataset: 927/15547 (5.96%)

Epoch: 2, Batch number: 6194
Accuracy on validation dataset: 965/15547 (6.21%)

Epoch: 2, Batch number: 11194
Accuracy on validation dataset: 1076/15547 (6.92%)

Epoch: 3, Batch number: 4291
Accuracy on validation dataset: 1137/15547 (7.31%)

Epoch: 3, Batch number: 9291
Accuracy on validation dataset: 1210/15547 (7.78%)

Epoch: 4, Batch number: 2388
Accuracy on validation dataset: 1216/15547 (7.82%)

Epoch: 4, Batch number: 7388
Accuracy on validation dataset: 1263/15547 (8.12%)

Epoch: 5, Batch number: 485
Accuracy o

Epoch: 2, Batch number: 1194
Accuracy on validation dataset: 362/15547 (2.33%)

Epoch: 2, Batch number: 6194
Accuracy on validation dataset: 372/15547 (2.39%)

Epoch: 2, Batch number: 11194
Accuracy on validation dataset: 362/15547 (2.33%)

Epoch: 3, Batch number: 4291
Accuracy on validation dataset: 399/15547 (2.57%)

Epoch: 3, Batch number: 9291
Accuracy on validation dataset: 457/15547 (2.94%)

Epoch: 4, Batch number: 2388
Accuracy on validation dataset: 494/15547 (3.18%)

Epoch: 4, Batch number: 7388
Accuracy on validation dataset: 496/15547 (3.19%)

Epoch: 5, Batch number: 485
Accuracy on validation dataset: 546/15547 (3.51%)

Epoch: 5, Batch number: 5485
Accuracy on validation dataset: 588/15547 (3.78%)

Epoch: 5, Batch number: 10485
Accuracy on validation dataset: 600/15547 (3.86%)

Epoch: 6, Batch number: 3582
Accuracy on validation dataset: 614/15547 (3.95%)

Epoch: 6, Batch number: 8582
Accuracy on validation dataset: 654/15547 (4.21%)

Epoch: 7, Batch number: 1679
Accuracy o

In [5]:
# Loss history of each run in the learning-rate sweep, one curve per learning rate.
fig, ax = plt.subplots()
for lr, ph in zip(learning_rate, performance_history):
    ax.plot(ph['iter'], ph['loss'], label='lr={:.2g}'.format(lr))

# Label the figure so it can be read on its own when the notebook is skimmed.
ax.set_xlabel('Iteration')
ax.set_ylabel('Loss')
ax.set_title('Word-embedding pretraining: loss per learning rate')
ax.legend();  # trailing semicolon keeps the Legend repr out of the cell output

<IPython.core.display.Javascript object>

<matplotlib.legend.Legend at 0x7fc809157908>

## Finetuning

In [6]:
class AGNewsDataset(torch.utils.data.Dataset):
    """Map-style dataset over the titles of the AG News CSV files.

    Each item is the pair ``(vectorized_title, class_index)``. The vectorizer
    is not built here: assign an object exposing ``vectorize(tokens)`` to the
    ``vectorizer`` attribute before indexing the dataset.
    """

    def __init__(self, root='./AG_NEWS/', preprocess=lambda x: x, train=True):
        """Read one CSV split and preprocess its titles.

        Args:
            root: Directory containing ``train.csv`` and ``test.csv``. A
                trailing path separator is optional.
            preprocess: Callable applied to the ``title`` column (a pandas
                Series); its result is stored in ``self.data``.
            train: Read ``train.csv`` when true, ``test.csv`` otherwise.
        """
        target = 'train.csv' if train else 'test.csv'
        # Join the path instead of concatenating strings so that a root
        # without a trailing separator ('./AG_NEWS') still resolves correctly.
        df = pd.read_csv(os.path.join(root, target), header=None,
                         names=['class_idx', 'title', 'description'])

        # Labels: the stored class indices are shifted down by one
        # (presumably the CSV uses 1-based classes -- confirm).
        self.cls_indeces = torch.tensor(df['class_idx'].tolist(), dtype=torch.long) - 1

        # Series with the (preprocessed) input samples:
        self.data = preprocess(df['title'])

        # Set by the caller after construction; kept explicit so that a
        # missing vectorizer is reported clearly in __getitem__.
        self.vectorizer = None

    def __getitem__(self, idx):
        """Return ``(vectorized title, class index)`` for position ``idx``.

        Raises:
            RuntimeError: If no vectorizer has been assigned yet.
        """
        # Samplers may hand over the index as a tensor.
        if isinstance(idx, torch.Tensor):
            idx = idx.item()
        vectorizer = getattr(self, 'vectorizer', None)
        if vectorizer is None:
            raise RuntimeError('AGNewsDataset.vectorizer must be assigned before indexing the dataset')
        return vectorizer.vectorize(self.data.iloc[idx]), self.cls_indeces[idx]

    def __len__(self):
        """Number of samples (one per label)."""
        return len(self.cls_indeces)
    
    
def GetAGNewsDataset(root, preprocess, cutoff=25, batch_size=64):
    """Build the train/validation/test dataloaders for title classification.

    Args:
        root: Directory with the AG News CSV files.
        preprocess: Callable applied to the title column of each split.
        cutoff: Forwarded to ``Vectorizer`` (presumably a token-frequency
            threshold -- confirm against ``Vectorizer``).
        batch_size: Batch size forwarded to ``generate_data_batches``.

    Returns:
        Tuple ``(train_dataloader, val_dataloader, test_dataloader)``.
    """
    # Datasets: the vectorizer is built from the training split only and
    # shared with the test split.
    train_dataset = AGNewsDataset(root, preprocess=preprocess, train=True)
    train_dataset.vectorizer = Vectorizer([train_dataset.data], cutoff=cutoff)
    test_dataset = AGNewsDataset(root, preprocess=preprocess, train=False)
    test_dataset.vectorizer = train_dataset.vectorizer

    # Dataloaders:
    train_dataloader, val_dataloader, test_dataloader = generate_data_batches(
        train_dataset,
        test_dataset,
        batch_size=batch_size,
    )
    return train_dataloader, val_dataloader, test_dataloader

In [7]:
# Dataloaders for the title-classification (fine-tuning) task.
tc_train_dataloader, tc_val_dataloader, tc_test_dataloader = GetAGNewsDataset(
    root='./AG_NEWS/',
    preprocess=preprocess,
    cutoff=25,
)

import torch.nn as nn

class TextClassifier(nn.Module):
    """Classifier whose first layer has the same dimensions as a given embedding layer.

    Only the dimensions of ``EmbeddingLayer`` are used here; its weights are
    not copied by the constructor.
    """

    def __init__(self, EmbeddingLayer, n_classes):
        """Create ``emb`` with the in/out features of ``EmbeddingLayer`` and ``out`` mapping to ``n_classes``."""
        super().__init__()
        in_features = EmbeddingLayer.in_features
        out_features = EmbeddingLayer.out_features
        self.emb = nn.Linear(in_features, out_features)
        self.out = nn.Linear(out_features, n_classes)

    def forward(self, x):
        """Return the class scores for input ``x``."""
        hidden = self.emb(x)
        return self.out(hidden)

    def loss(self, scores, target):
        """Cross-entropy between ``scores`` and the ``target`` class indices."""
        return nn.functional.cross_entropy(scores, target)

In [18]:
# Sample settings handed to SGDTrainModel:
data = {
    # Use the GPU only when one is present so the cell also runs on CPU-only machines.
    'use_gpu': torch.cuda.is_available(),
    'input_dtype': torch.float,  # dtype of the input samples
    'target_dtype': torch.long,  # dtype of the target samples
    'train_dataloader': tc_train_dataloader,  # Training dataloader
    'val_dataloader': tc_val_dataloader  # Validation dataloader
}

# Optimization settings:
epochs = 20  # Number of epochs
sample_loss_every = 500  # Number of iterations between accuracy checks
learning_rate = [1, 5e-1]  # Learning rates to sweep
check_on_train = False  # Whether to also report results on the training set

# Training:
performance_history = []
n_classes = 4
# First model of the pretraining sweep.
# NOTE(review): picked by position, not by measured performance -- confirm it is the best run.
BestEmbeddingModel = models[0]
classification_models = []

# Reference classifier: pretrained embedding weights plus a freshly initialised
# output layer. Every run of the sweep starts from a copy of this state.
# NOTE(review): assumes the classification vectorizer produces the same
# vocabulary (size and indices) as the one used for pretraining -- confirm.
ClassifierModel = TextClassifier(BestEmbeddingModel.emb, n_classes)
ClassifierModel.emb.load_state_dict(BestEmbeddingModel.emb.state_dict())

for lr in learning_rate:
    model = TextClassifier(BestEmbeddingModel.emb, n_classes)
    model.load_state_dict(ClassifierModel.state_dict())
    # Freeze the pretrained embedding layer; only the output layer is trained.
    for param in model.emb.parameters():
        param.requires_grad = False
    performance_history.append(SGDTrainModel(model,
                                             data,
                                             epochs,
                                             lr,
                                             sample_loss_every,
                                             check_on_train,
                                             verbose=True))
    classification_models.append(model)
    print('lr={:.2g} completed.'.format(lr))

Epoch: 0, Batch number: 0
Accuracy on validation dataset: 1420/2400 (59.17%)

Epoch: 0, Batch number: 500
Accuracy on validation dataset: 1768/2400 (73.67%)

Epoch: 0, Batch number: 1000
Accuracy on validation dataset: 1811/2400 (75.46%)

Epoch: 0, Batch number: 1500
Accuracy on validation dataset: 1800/2400 (75.00%)

Epoch: 1, Batch number: 162
Accuracy on validation dataset: 1812/2400 (75.50%)

Epoch: 1, Batch number: 662
Accuracy on validation dataset: 1819/2400 (75.79%)

Epoch: 1, Batch number: 1162
Accuracy on validation dataset: 1865/2400 (77.71%)

Epoch: 1, Batch number: 1662
Accuracy on validation dataset: 1870/2400 (77.92%)

Epoch: 2, Batch number: 324
Accuracy on validation dataset: 1853/2400 (77.21%)

Epoch: 2, Batch number: 824
Accuracy on validation dataset: 1844/2400 (76.83%)

Epoch: 2, Batch number: 1324
Accuracy on validation dataset: 1895/2400 (78.96%)

Epoch: 2, Batch number: 1824
Accuracy on validation dataset: 1911/2400 (79.62%)

Epoch: 3, Batch number: 486
Accuracy

NameError: name 'classification_model' is not defined

In [19]:
# Loss history of each run in the learning-rate sweep, one curve per learning rate.
fig, ax = plt.subplots()
for lr, ph in zip(learning_rate, performance_history):
    ax.plot(ph['iter'], ph['loss'], label='lr={:.2g}'.format(lr))

# Label the figure so it can be read on its own when the notebook is skimmed.
ax.set_xlabel('Iteration')
ax.set_ylabel('Loss')
ax.set_title('Classifier fine-tuning: loss per learning rate')
ax.legend();  # trailing semicolon keeps the Legend repr out of the cell output

<IPython.core.display.Javascript object>

<matplotlib.legend.Legend at 0x7fc718a82ef0>