In [2]:
import torch
import pandas as pd
from TorchDataUtils import *
from NLPDataUtils import *

%load_ext autoreload
%autoreload 2

The autoreload extension is already loaded. To reload it, use:
  %reload_ext autoreload


In [62]:
class AGNewsDataset(torch.utils.data.Dataset):
    """Dataset over the titles of an AG News CSV split.

    The CSV is read without a header row and its three columns are named
    ``class_idx``, ``title`` and ``description``. Only ``title`` is used as
    input; ``class_idx`` is shifted by -1 and stored as a ``torch.long`` tensor.

    A ``vectorizer`` object exposing ``vectorize(sample)`` must be assigned to
    the instance before it is indexed (``GetAGNewsDataset`` does this).

    Args:
        root: Directory containing ``train.csv`` / ``test.csv``. A trailing
            path separator is optional.
        preprocess: Callable applied once to the whole ``title`` Series; its
            result is stored in ``self.data`` and must support ``.iloc``.
        train: If True read ``train.csv``, otherwise ``test.csv``.
    """

    def __init__(self, root='./AG_NEWS/', preprocess=lambda x: x, train=True):
        # Local import so this cell does not depend on an import made elsewhere.
        import os

        target = 'train.csv' if train else 'test.csv'
        # os.path.join works whether or not `root` ends with a separator;
        # plain string concatenation produced e.g. './AG_NEWStrain.csv'.
        csv_path = os.path.join(root, target)
        df = pd.read_csv(csv_path, header=None, names=['class_idx', 'title', 'description'])

        # Labels: shifted by -1 relative to the values stored in the file.
        self.cls_indeces = torch.tensor(df['class_idx'].tolist(), dtype=torch.long) - 1

        # Input samples: the preprocessed title column.
        self.data = preprocess(df['title'])

        # Attached after construction; None marks "not configured yet".
        self.vectorizer = None

    def __getitem__(self, idx):
        """Return ``(vectorized_title, class_index)`` for position ``idx``.

        ``idx`` may be a Python int or a single-element tensor.

        Raises:
            AttributeError: if no vectorizer has been assigned yet.
        """
        # isinstance also accepts Tensor subclasses, unlike an exact type check.
        if isinstance(idx, torch.Tensor):
            idx = idx.item()
        if self.vectorizer is None:
            raise AttributeError(
                "AGNewsDataset.vectorizer is not set; assign a vectorizer before indexing the dataset"
            )
        return self.vectorizer.vectorize(self.data.iloc[idx]), self.cls_indeces[idx]

    def __len__(self):
        """Number of samples (one per label)."""
        return len(self.cls_indeces)
    
    
def GetAGNewsDataset(root, preprocess, cutoff=25, batch_size=64):
    """Build the train / validation / test dataloaders for AG News.

    The vectorizer is built from the training split only and then shared with
    the test split, so both splits are vectorized by the same object.

    Args:
        root: Directory holding ``train.csv`` and ``test.csv``.
        preprocess: Callable applied to the title Series of each split.
        cutoff: Forwarded to ``Vectorizer(..., cutoff=cutoff)``.
            NOTE(review): presumably a token-frequency threshold — confirm in NLPDataUtils.
        batch_size: Forwarded to ``generate_data_batches``; defaults to the
            previously hard-coded value of 64.

    Returns:
        Tuple ``(train_dataloader, val_dataloader, test_dataloader)``.
    """
    # Datasets:
    train_dataset = AGNewsDataset(root, preprocess=preprocess, train=True)
    train_dataset.vectorizer = Vectorizer(train_dataset.data, cutoff=cutoff)
    test_dataset = AGNewsDataset(root, preprocess=preprocess, train=False)
    # Reuse the vectorizer built from the training split.
    test_dataset.vectorizer = train_dataset.vectorizer

    # Dataloaders:
    train_dataloader, val_dataloader, test_dataloader = generate_data_batches(
        train_dataset,
        test_dataset,
        batch_size=batch_size,
    )
    return train_dataloader, val_dataloader, test_dataloader

# Prueba 1

* Cutoff: 25
* Preprocesamiento: sólo split
* Muestras de entrada: Title
* Modelo de clasificación: Layer lineal + CrossEntropy

In [18]:
def preprocess(data):
    """Tokenize each string of a pandas Series by splitting on single spaces.

    Returns a Series whose elements are lists of tokens.
    """
    return data.str.split(pat=' ')

# Build the three dataloaders for this experiment (cutoff = 25).
train_dataloader, val_dataloader, test_dataloader = GetAGNewsDataset(
    root='./AG_NEWS/',
    preprocess=preprocess,
    cutoff=25,
)

import torch.nn as nn

class TextClassifier(nn.Module):
    """Single linear layer mapping a ``vocab_size``-dimensional input to ``n_classes`` scores."""

    def __init__(self, vocab_size, n_classes):
        super().__init__()
        self.emb = nn.Linear(in_features=vocab_size, out_features=n_classes)

    def forward(self, x):
        """Return the class scores produced by the linear layer for ``x``."""
        scores = self.emb(x)
        return scores

    def loss(self, scores, target):
        """Cross-entropy between ``scores`` and the ``target`` class indices."""
        criterion = nn.CrossEntropyLoss()
        return criterion(scores, target)
    

# The vocabulary is reached through the dataloader's dataset: no `train_dataset`
# variable is defined at notebook scope, so referencing it fails on a fresh kernel.
vocab_size = len(train_dataloader.dataset.vectorizer.vocabulary)
n_classes = 4  # number of output scores of the classifier
model = TextClassifier(vocab_size, n_classes)

In [19]:
# Sample-handling parameters:
data = {
    # Whether to move the samples to the GPU
    'use_gpu': True,
    # Data type of the input samples
    'input_dtype': torch.float,
    # Data type of the target samples
    'target_dtype': torch.long,
    # Training dataloader
    'train_dataloader': train_dataloader,
    # Validation dataloader
    'val_dataloader': val_dataloader,
}

# Optimization parameters:
epochs = 10              # Number of epochs
sample_loss_every = 500  # Number of iterations between accuracy computations
learning_rate = 1e-5     # Learning rate
check_on_train = False   # Whether to also report results on the train set (disabled here)

# Training:
performance_history = SGDTrainModel(
    model,
    data,
    epochs,
    learning_rate,
    sample_loss_every,
    check_on_train,
)

Epoch: 0, Batch number: 0
Accuracy on validation dataset: 560/2400 (23.33%)

Epoch: 0, Batch number: 500
Accuracy on validation dataset: 564/2400 (23.50%)

Epoch: 0, Batch number: 1000
Accuracy on validation dataset: 572/2400 (23.83%)

Epoch: 0, Batch number: 1500
Accuracy on validation dataset: 581/2400 (24.21%)

Exiting training...
Final accuracy registered on validation dataset: 581/2400 (24.21%)


# Prueba 2

* Cutoff: 50
* Preprocesamiento: sólo split
* Muestras de entrada: Title
* Modelo de clasificación: Layer lineal + CrossEntropy

In [20]:
def preprocess(data):
    """Tokenize each string of a pandas Series by splitting on single spaces.

    Returns a Series whose elements are lists of tokens.
    """
    return data.str.split(pat=' ')

# Build the three dataloaders for this experiment (cutoff = 50).
train_dataloader, val_dataloader, test_dataloader = GetAGNewsDataset(
    root='./AG_NEWS/',
    preprocess=preprocess,
    cutoff=50,
)

import torch.nn as nn

class TextClassifier(nn.Module):
    """Single linear layer mapping a ``vocab_size``-dimensional input to ``n_classes`` scores."""

    def __init__(self, vocab_size, n_classes):
        super().__init__()
        self.emb = nn.Linear(in_features=vocab_size, out_features=n_classes)

    def forward(self, x):
        """Return the class scores produced by the linear layer for ``x``."""
        scores = self.emb(x)
        return scores

    def loss(self, scores, target):
        """Cross-entropy between ``scores`` and the ``target`` class indices."""
        criterion = nn.CrossEntropyLoss()
        return criterion(scores, target)
    

# The vocabulary is reached through the dataloader's dataset: no `train_dataset`
# variable is defined at notebook scope, so referencing it fails on a fresh kernel.
vocab_size = len(train_dataloader.dataset.vectorizer.vocabulary)
n_classes = 4  # number of output scores of the classifier
model = TextClassifier(vocab_size, n_classes)

In [23]:
# Sample-handling parameters:
data = {
    # Whether to move the samples to the GPU
    'use_gpu': True,
    # Data type of the input samples
    'input_dtype': torch.float,
    # Data type of the target samples
    'target_dtype': torch.long,
    # Training dataloader
    'train_dataloader': train_dataloader,
    # Validation dataloader
    'val_dataloader': val_dataloader,
}

# Optimization parameters:
epochs = 10              # Number of epochs
sample_loss_every = 500  # Number of iterations between accuracy computations
learning_rate = 1e-5     # Learning rate
check_on_train = False   # Whether to also report results on the train set (disabled here)

# Training:
performance_history = SGDTrainModel(
    model,
    data,
    epochs,
    learning_rate,
    sample_loss_every,
    check_on_train,
)

Epoch: 0, Batch number: 0
Accuracy on validation dataset: 1260/2400 (52.50%)

Epoch: 0, Batch number: 500
Accuracy on validation dataset: 1260/2400 (52.50%)

Epoch: 0, Batch number: 1000
Accuracy on validation dataset: 1260/2400 (52.50%)

Epoch: 0, Batch number: 1500
Accuracy on validation dataset: 1261/2400 (52.54%)

Epoch: 1, Batch number: 162
Accuracy on validation dataset: 1261/2400 (52.54%)

Epoch: 1, Batch number: 662
Accuracy on validation dataset: 1261/2400 (52.54%)

Epoch: 1, Batch number: 1162
Accuracy on validation dataset: 1261/2400 (52.54%)

Epoch: 1, Batch number: 1662
Accuracy on validation dataset: 1261/2400 (52.54%)

Epoch: 2, Batch number: 324
Accuracy on validation dataset: 1261/2400 (52.54%)

Epoch: 2, Batch number: 824
Accuracy on validation dataset: 1262/2400 (52.58%)

Epoch: 2, Batch number: 1324
Accuracy on validation dataset: 1261/2400 (52.54%)

Epoch: 2, Batch number: 1824
Accuracy on validation dataset: 1261/2400 (52.54%)

Epoch: 3, Batch number: 486
Accuracy

# Prueba 3

* Cutoff: 25
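* Preprocesamiento: se eliminan las etiquetas de agencia (`(AP)`, `(Reuters)`, `(AFP)`, `(SPACE.com)`), las palabras `a`, `the`, `is`, `of`, `to` y los signos `, : ; ? ! "`; luego se separa por espacios, dejando el posesivo `'s` como token propio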
* Muestras de entrada: Title
* Modelo de clasificación: Layer lineal + CrossEntropy

In [64]:
def preprocess(data):
    """Clean and tokenize a pandas Series of title strings.

    Steps, in order:
      1. Delete the agency tags ``(AP)``, ``(Reuters)``, ``(AFP)``, ``(SPACE.com)``.
      2. Delete the standalone words ``a``, ``the``, ``is``, ``of``, ``to``
         (matched case-sensitively, lowercase only).
      3. Delete the characters ``, : ; ? ! "``.
      4. Collapse every whitespace run into a ``<SEP>`` marker.
      5. Detach a possessive ``'s`` that precedes a marker into its own token.
      6. Split on ``<SEP>``.

    Args:
        data: pandas Series of strings.

    Returns:
        pandas Series whose elements are lists of tokens.
    """
    # Patterns that are removed outright, applied in this order.
    removal_patterns = (
        r'\(AP\)',
        r'\(Reuters\)',
        r'\(AFP\)',
        r'\(SPACE\.com\)',
        r'\ba\b',
        r'\bthe\b',
        r'\bis\b',
        r'\bof\b',
        r'\bto\b',
        r'[,:;\?\!\"]',
    )

    titles = data
    for pattern in removal_patterns:
        # regex=True must be explicit: pandas >= 2.0 defaults to literal
        # matching, which would silently leave every title untouched.
        titles = titles.str.replace(pattern, '', regex=True)

    titles = titles.str.replace(r'\s+', '<SEP>', regex=True)
    # Plain-text replacement; there are no regex metacharacters involved.
    titles = titles.str.replace("'s<SEP>", "<SEP>'s<SEP>", regex=False)
    return titles.str.split('<SEP>')

# Build the three dataloaders for this experiment (cutoff = 25).
train_dataloader, val_dataloader, test_dataloader = GetAGNewsDataset(
    root='./AG_NEWS/',
    preprocess=preprocess,
    cutoff=25,
)

import torch.nn as nn

class TextClassifier(nn.Module):
    """Single linear layer mapping a ``vocab_size``-dimensional input to ``n_classes`` scores."""

    def __init__(self, vocab_size, n_classes):
        super().__init__()
        self.emb = nn.Linear(in_features=vocab_size, out_features=n_classes)

    def forward(self, x):
        """Return the class scores produced by the linear layer for ``x``."""
        scores = self.emb(x)
        return scores

    def loss(self, scores, target):
        """Cross-entropy between ``scores`` and the ``target`` class indices."""
        criterion = nn.CrossEntropyLoss()
        return criterion(scores, target)
    

# Size the classifier from the vocabulary held by the training dataset's vectorizer.
n_classes = 4
vocab_size = len(train_dataloader.dataset.vectorizer.vocabulary)
model = TextClassifier(vocab_size, n_classes)

In [67]:
# Sample-handling parameters:
data = {
    # Whether to move the samples to the GPU
    'use_gpu': True,
    # Data type of the input samples
    'input_dtype': torch.float,
    # Data type of the target samples
    'target_dtype': torch.long,
    # Training dataloader
    'train_dataloader': train_dataloader,
    # Validation dataloader
    'val_dataloader': val_dataloader,
}

# Optimization parameters:
epochs = 10              # Number of epochs
sample_loss_every = 500  # Number of iterations between accuracy computations
learning_rate = 1e-5     # Learning rate
check_on_train = False   # Whether to also report results on the train set (disabled here)

# Training:
performance_history = SGDTrainModel(
    model,
    data,
    epochs,
    learning_rate,
    sample_loss_every,
    check_on_train,
)

Epoch: 0, Batch number: 0
Accuracy on validation dataset: 1298/2400 (54.08%)

Epoch: 0, Batch number: 500
Accuracy on validation dataset: 1297/2400 (54.04%)

Epoch: 0, Batch number: 1000
Accuracy on validation dataset: 1297/2400 (54.04%)

Epoch: 0, Batch number: 1500
Accuracy on validation dataset: 1297/2400 (54.04%)

Epoch: 1, Batch number: 162
Accuracy on validation dataset: 1298/2400 (54.08%)

Epoch: 1, Batch number: 662
Accuracy on validation dataset: 1297/2400 (54.04%)

Epoch: 1, Batch number: 1162
Accuracy on validation dataset: 1298/2400 (54.08%)

Epoch: 1, Batch number: 1662
Accuracy on validation dataset: 1298/2400 (54.08%)

Epoch: 2, Batch number: 324
Accuracy on validation dataset: 1298/2400 (54.08%)

Epoch: 2, Batch number: 824
Accuracy on validation dataset: 1299/2400 (54.12%)

Epoch: 2, Batch number: 1324
Accuracy on validation dataset: 1299/2400 (54.12%)

Epoch: 2, Batch number: 1824
Accuracy on validation dataset: 1299/2400 (54.12%)

Epoch: 3, Batch number: 486
Accuracy

# Prueba 4

* Cutoff: 75
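* Preprocesamiento: se eliminan las etiquetas de agencia (`(AP)`, `(Reuters)`, `(AFP)`, `(SPACE.com)`), las palabras `a`, `the`, `is`, `of`, `to` y los signos `, : ; ? ! "`; luego se separa por espacios, dejando el posesivo `'s` como token propio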
* Muestras de entrada: Title
* Modelo de clasificación: Layer lineal + CrossEntropy

In [68]:
def preprocess(data):
    """Clean and tokenize a pandas Series of title strings.

    Steps, in order:
      1. Delete the agency tags ``(AP)``, ``(Reuters)``, ``(AFP)``, ``(SPACE.com)``.
      2. Delete the standalone words ``a``, ``the``, ``is``, ``of``, ``to``
         (matched case-sensitively, lowercase only).
      3. Delete the characters ``, : ; ? ! "``.
      4. Collapse every whitespace run into a ``<SEP>`` marker.
      5. Detach a possessive ``'s`` that precedes a marker into its own token.
      6. Split on ``<SEP>``.

    Args:
        data: pandas Series of strings.

    Returns:
        pandas Series whose elements are lists of tokens.
    """
    # Patterns that are removed outright, applied in this order.
    removal_patterns = (
        r'\(AP\)',
        r'\(Reuters\)',
        r'\(AFP\)',
        r'\(SPACE\.com\)',
        r'\ba\b',
        r'\bthe\b',
        r'\bis\b',
        r'\bof\b',
        r'\bto\b',
        r'[,:;\?\!\"]',
    )

    titles = data
    for pattern in removal_patterns:
        # regex=True must be explicit: pandas >= 2.0 defaults to literal
        # matching, which would silently leave every title untouched.
        titles = titles.str.replace(pattern, '', regex=True)

    titles = titles.str.replace(r'\s+', '<SEP>', regex=True)
    # Plain-text replacement; there are no regex metacharacters involved.
    titles = titles.str.replace("'s<SEP>", "<SEP>'s<SEP>", regex=False)
    return titles.str.split('<SEP>')

# Build the three dataloaders for this experiment (cutoff = 75).
train_dataloader, val_dataloader, test_dataloader = GetAGNewsDataset(
    root='./AG_NEWS/',
    preprocess=preprocess,
    cutoff=75,
)

import torch.nn as nn

class TextClassifier(nn.Module):
    """Single linear layer mapping a ``vocab_size``-dimensional input to ``n_classes`` scores."""

    def __init__(self, vocab_size, n_classes):
        super().__init__()
        self.emb = nn.Linear(in_features=vocab_size, out_features=n_classes)

    def forward(self, x):
        """Return the class scores produced by the linear layer for ``x``."""
        scores = self.emb(x)
        return scores

    def loss(self, scores, target):
        """Cross-entropy between ``scores`` and the ``target`` class indices."""
        criterion = nn.CrossEntropyLoss()
        return criterion(scores, target)
    

# Size the classifier from the vocabulary held by the training dataset's vectorizer.
n_classes = 4
vocab_size = len(train_dataloader.dataset.vectorizer.vocabulary)
model = TextClassifier(vocab_size, n_classes)

In [71]:
# Sample-handling parameters:
data = {
    # Whether to move the samples to the GPU
    'use_gpu': True,
    # Data type of the input samples
    'input_dtype': torch.float,
    # Data type of the target samples
    'target_dtype': torch.long,
    # Training dataloader
    'train_dataloader': train_dataloader,
    # Validation dataloader
    'val_dataloader': val_dataloader,
}

# Optimization parameters:
epochs = 10              # Number of epochs
sample_loss_every = 500  # Number of iterations between accuracy computations
learning_rate = 1e-5     # Learning rate
check_on_train = False   # Whether to also report results on the train set (disabled here)

# Training:
performance_history = SGDTrainModel(
    model,
    data,
    epochs,
    learning_rate,
    sample_loss_every,
    check_on_train,
)

Epoch: 0, Batch number: 0
Accuracy on validation dataset: 1262/2400 (52.58%)

Epoch: 0, Batch number: 500
Accuracy on validation dataset: 1262/2400 (52.58%)

Epoch: 0, Batch number: 1000
Accuracy on validation dataset: 1262/2400 (52.58%)

Epoch: 0, Batch number: 1500
Accuracy on validation dataset: 1262/2400 (52.58%)

Epoch: 1, Batch number: 162
Accuracy on validation dataset: 1262/2400 (52.58%)

Epoch: 1, Batch number: 662
Accuracy on validation dataset: 1262/2400 (52.58%)

Epoch: 1, Batch number: 1162
Accuracy on validation dataset: 1262/2400 (52.58%)

Epoch: 1, Batch number: 1662
Accuracy on validation dataset: 1262/2400 (52.58%)

Epoch: 2, Batch number: 324
Accuracy on validation dataset: 1261/2400 (52.54%)

Epoch: 2, Batch number: 824
Accuracy on validation dataset: 1262/2400 (52.58%)

Epoch: 2, Batch number: 1324
Accuracy on validation dataset: 1262/2400 (52.58%)

Epoch: 2, Batch number: 1824
Accuracy on validation dataset: 1261/2400 (52.54%)

Epoch: 3, Batch number: 486
Accuracy

In [72]:
# Pick the GPU only when one is available and the run configuration asks for it.
if torch.cuda.is_available() and data['use_gpu']:
    device = torch.device('cuda:0')
else:
    device = torch.device('cpu')

# Score the trained model on the held-out test dataloader.
num_correct_val, num_samples_val = CheckAccuracy(
    test_dataloader,
    model,
    device,
    data['input_dtype'],
    data['target_dtype'],
)
print(
    'Accuracy on test dataset: {}/{} ({:.2f}%)'.format(
        num_correct_val,
        num_samples_val,
        100 * float(num_correct_val) / num_samples_val,
    )
)

Accuracy on test dataset: 4049/7600 (53.28%)
