In [15]:
import os
import pandas as pd
import re

import torch


class AGNewsDataset(object):
    """Map-style dataset over the AG News ``train.csv`` / ``test.csv`` files.

    On construction both CSV files are converted (once) into cached
    ``preprocessed_*.csv`` files in which each title is stored as a single
    string whose words are joined by ``token_sep``. A vocabulary is then built
    from the titles of *both* splits, and the split selected by ``train`` is
    exposed through ``__getitem__`` / ``__len__``.

    Attributes:
        root: Dataset directory, always ending in ``'/'``.
        train_path, test_path: Paths of the raw CSV files inside ``root``.
        vocabulary: ``AGNewsVocabulary`` built from train and test titles.
        n_tokens: Size of ``vocabulary`` (length of every input vector).
    """

    token_pad = '<PAD>'
    token_unk = '<UNK>'
    token_sep = '<TS>'  # separator written between the words of a title
    special_tokens = [token_pad, token_unk]

    def __init__(self, root, train=True):
        """Load (and, if needed, preprocess) the dataset found in ``root``.

        Args:
            root: Directory that contains ``train.csv`` and ``test.csv``.
            train: If true, expose the training split; otherwise the test split.

        Raises:
            IOError: If ``root``, ``train.csv`` or ``test.csv`` does not exist.
        """
        if not os.path.exists(root):
            raise IOError('Carpeta {} no encontrada'.format(root))
        self.root = root + '/' if root[-1] != '/' else root

        print('Buscando archivos train.csv test.csv...')
        self.train_path = '{}train.csv'.format(self.root)
        self.test_path = '{}test.csv'.format(self.root)
        if not os.path.exists(self.train_path):
            raise IOError('Archivo train.csv no encontrado en el directorio {}'.format(self.root))
        # The original re-tested train_path here, so a missing test.csv was
        # never reported; each file is now checked on its own.
        if not os.path.exists(self.test_path):
            raise IOError('Archivo test.csv no encontrado en el directorio {}'.format(self.root))

        print('Obteniendo el dataset...')
        train_data_df = self._read_data(self.train_path)
        test_data_df = self._read_data(self.test_path)

        print('Obteniendo el vocabulario...')
        # The vocabulary deliberately covers both splits so that every token
        # of the test set has an index.
        self.vocabulary = self._get_vocabulary(train_data_df)
        self.vocabulary = self._get_vocabulary(test_data_df, vocabulary=self.vocabulary)
        self.n_tokens = len(self.vocabulary)

        self._data = train_data_df if train else test_data_df

    def __getitem__(self, idx):
        """Return the ``(input, target)`` pair stored at position ``idx``.

        Args:
            idx: Integer position, or a tensor holding one.

        Returns:
            A tuple ``(text_one_hot, cls_idx)`` of ``torch.long`` tensors:
            ``text_one_hot`` has length ``n_tokens`` with a 1 at the index of
            every token present in the title (a multi-hot bag of words), and
            ``cls_idx`` is the zero-based class index (stored label minus 1).

        Raises:
            IndexError: If ``idx`` is outside the dataset.
        """
        index = idx.tolist() if isinstance(idx, torch.Tensor) else idx
        try:
            text, cls_idx = self._data.iloc[index, :]
        except IndexError:
            raise IndexError('{} exceeds index of dataset'.format(index))

        # Labels are stored 1-based in the CSV; shift them to start at 0.
        cls_idx = torch.tensor(cls_idx - 1, dtype=torch.long)

        tokens = self._string_to_tokens(text)
        text_idx = torch.tensor(
            [self.vocabulary.token_to_index(word) for word in tokens],
            dtype=torch.long,
        )
        text_one_hot = torch.zeros(self.n_tokens, dtype=torch.long)
        text_one_hot[text_idx] = 1

        return text_one_hot, cls_idx

    def __len__(self):
        """Return the number of samples in the selected split."""
        return len(self._data)

    def _read_data(self, filename):
        """Return the preprocessed version of ``filename`` as a DataFrame.

        The preprocessed file lives next to ``filename`` with the prefix
        ``preprocessed_`` and has the columns ``Title`` and ``Class label``.
        It is created on first use and reused afterwards.

        Args:
            filename: Path of a raw CSV file whose first column is the class
                label and whose second column is the title.

        Returns:
            ``pandas.DataFrame`` read from the preprocessed file.
        """
        directory, basename = os.path.split(filename)
        pp_filename = os.path.join(directory, 'preprocessed_' + basename)

        if not os.path.exists(pp_filename):
            # The original loop iterated over an undefined ``df``; the raw
            # file has to be loaded first.
            # NOTE(review): assumes the raw file has no header row (label in
            # column 0, title in column 1) — confirm against the actual files.
            raw_df = pd.read_csv(filename, header=None)
            if len(raw_df) > 0:
                try:
                    int(raw_df.iloc[0, 0])
                except (TypeError, ValueError):
                    # First row is not a numeric label: treat it as a header.
                    raw_df = raw_df.iloc[1:]

            # Write to a temporary name and rename at the end so that an
            # interrupted run never leaves a truncated cache behind.
            tmp_filename = pp_filename + '.tmp'
            with open(tmp_filename, 'w', encoding='utf-8') as pp_f:
                pp_f.write('Title,Class label\n')
                for label, title in zip(raw_df.iloc[:, 0], raw_df.iloc[:, 1]):
                    # Double quotes would clash with the CSV quoting below.
                    text = title.replace('"', "'").replace(' ', self.token_sep)
                    pp_f.write('"{0}",{1}\n'.format(text, int(label)))
            os.replace(tmp_filename, pp_filename)

        return pd.read_csv(pp_filename)

    def _get_vocabulary(self, df, vocabulary=None):
        """Add every token of the first column of ``df`` to a vocabulary.

        Args:
            df: DataFrame whose first column holds ``token_sep``-joined titles.
            vocabulary: Existing vocabulary to extend; a new
                ``AGNewsVocabulary`` is created when omitted.

        Returns:
            The (new or extended) vocabulary.
        """
        if vocabulary is None:
            vocabulary = AGNewsVocabulary()

        for token in self.special_tokens:
            idx = vocabulary.add_token(token)
            # Registering a special token must not count as an occurrence.
            vocabulary._idx_to_freq[idx] -= 1

        for title in df.iloc[:, 0]:
            for word in self._string_to_tokens(title):
                vocabulary.add_token(word)

        return vocabulary

    def _string_to_tokens(self, string):
        """Split a preprocessed title into its list of tokens."""
        return string.split(self.token_sep)




class AGNewsVocabulary(object):
    """Bidirectional token <-> index mapping that also counts occurrences.

    Indices are assigned consecutively, starting at 0, in the order in which
    new tokens are first added.
    """

    def __init__(self):
        """Create an empty vocabulary."""
        self._token_to_idx = {}
        self._idx_to_token = {}
        self._idx_to_freq = {}

    def add_token(self, token):
        """Register one occurrence of ``token`` and return its index.

        A token seen for the first time gets the next free index and a
        frequency of 1; otherwise its frequency is incremented.
        """
        index = self._token_to_idx.get(token)
        if index is None:
            index = len(self._token_to_idx)
            self._token_to_idx[token] = index
            self._idx_to_token[index] = token
            self._idx_to_freq[index] = 1
        else:
            self._idx_to_freq[index] += 1
        return index

    def index_to_token(self, index):
        """Return the token(s) stored at ``index``.

        Args:
            index: An integer, or a list of integers.

        Returns:
            A single token for an integer, a list of tokens for a list.

        Raises:
            NameError: If an index is not an integer.
            KeyError: If an index is not in the vocabulary.
        """
        if not isinstance(index, list):
            if not isinstance(index, int):
                raise NameError("'index' must be an integer or list of integers")
            return self._token_at(index)

        tokens = []
        for idx in index:
            if not isinstance(idx, int):
                raise NameError("{} is not an integer".format(idx))
            tokens.append(self._token_at(idx))
        return tokens

    def token_to_index(self, token):
        """Return the index (or indices) of ``token``.

        Args:
            token: A string, or a list of strings.

        Returns:
            A single index for a string, a list of indices for a list.

        Raises:
            NameError: If a token is not a string.
            KeyError: If a token is not in the vocabulary.
        """
        if not isinstance(token, list):
            return self._index_of(token)
        return [self._index_of(tk) for tk in token]

    def get_freq(self, token_or_index):
        """Return how many times the given token(s) or index(es) were added.

        Args:
            token_or_index: A token (``str``), an index (``int``), or a
                list/tuple mixing both.

        Returns:
            A single count for a scalar argument, a list of counts for a
            list/tuple argument.

        Raises:
            KeyError: If an item is unknown or is neither ``int`` nor ``str``.
        """
        # Strings have a length too, so "has len()" cannot be used to tell a
        # single token from a collection; test for list/tuple explicitly.
        is_sequence = isinstance(token_or_index, (list, tuple))
        items = token_or_index if is_sequence else [token_or_index]

        freqs = []
        for item in items:
            if isinstance(item, int):
                if item not in self._idx_to_token:
                    raise KeyError('the index {} exeeds the Vocabulary lenght'.format(item))
                freqs.append(self._idx_to_freq[item])
            elif isinstance(item, str):
                if item not in self._token_to_idx:
                    raise KeyError('the token {} is not in the Vocabulary'.format(item))
                freqs.append(self._idx_to_freq[self._token_to_idx[item]])
            else:
                # Only reached for unsupported types (the original raised
                # unconditionally at the end of every iteration).
                raise KeyError('{} must be either integer or string'.format(item))

        return freqs if is_sequence else freqs[0]

    def _token_at(self, index):
        """Return the token at an already type-checked integer ``index``."""
        if index not in self._idx_to_token:
            raise KeyError('the index {} exeeds the Vocabulary lenght'.format(index))
        return self._idx_to_token[index]

    def _index_of(self, token):
        """Return the index of a single ``token``, validating its type."""
        if not isinstance(token, str):
            raise NameError("'token' must be a string or list of strings")
        if token not in self._token_to_idx:
            raise KeyError('the token {} is not in the Vocabulary'.format(token))
        return self._token_to_idx[token]

    def __str__(self):
        return "<Vocabulary(size={})>".format(len(self))

    def __len__(self):
        """Return the number of distinct tokens."""
        return len(self._token_to_idx)

# Manejo de datos en NLP

Todo este material está basado en https://github.com/joosthub/PyTorchNLPBook, el repositorio de GitHub del libro que se usa en CS224n. La información proviene del ejemplo "Classifying Sentiment of Restaurant Reviews" del capítulo 3.

In [None]:
# Build the two splits from the ./AG_NEWS/ folder.
# NOTE(review): assuming the AGNewsDataset defined in this notebook is the one
# in scope, each constructor call reads both CSV files and builds the
# vocabulary from both, so that work is done twice here.
train_dataset = AGNewsDataset(root='./AG_NEWS/', train=True)
test_dataset = AGNewsDataset(root='./AG_NEWS/', train=False)

In [9]:
import torch
from TorchDataUtils import *

%load_ext autoreload
%autoreload 2

In [13]:
# Report the size of each split and the length of its first input tensor.
train_report = """Train Dataset:
Cantidad de muestras de entrenamiento: {}
Tamaño de cada muestra: {}
Categorías: ['World', 'Sports', 'Business', 'Sci/Tech']
"""
n_train_samples = len(train_dataset)
train_sample_size = len(train_dataset[0][0])
print(train_report.format(n_train_samples, train_sample_size))

test_report = """Test Dataset:
Cantidad de muestras de testeo: {}
Tamaño de cada muestra: {}
Categorías: ['World', 'Sports', 'Business', 'Sci/Tech']
"""
n_test_samples = len(test_dataset)
test_sample_size = len(test_dataset[0][0])
print(test_report.format(n_test_samples, test_sample_size))

vocabulary_size = len(train_dataset.vocabulary)
print("Tamaño del vocabulario: {}".format(vocabulary_size))
print("Se usan las palabras del train y del test")

# Wrap the datasets in batched loaders (train / validation / test).
data_loaders = generate_data_batches(train_dataset, test_dataset, batch_size=128)
train_dataloader, val_dataloader, test_dataloader = data_loaders

Train Dataset:
Cantidad de muestras de entrenamiento: 120000
Tamaño de cada muestra: 9
Categorías: ['World', 'Sports', 'Business', 'Sci/Tech']

Test Dataset:
Cantidad de muestras de testeo: 7600
Tamaño de cada muestra: 7
Categorías: ['World', 'Sports', 'Business', 'Sci/Tech']

Tamaño del vocabulario: 73916
Se usan las palabras del train y del test


In [16]:
# Inspect the first training sample: an (input tensor, class index) pair.
train_dataset[0]

(tensor([ 2,  3,  4,  5,  6,  7,  8,  9, 10]), tensor(2))

In [11]:
import torch.nn as nn

class TextClassifier(nn.Module):
    """Linear classifier over vocabulary-sized input vectors.

    A single fully connected layer maps an input of width ``vocab_size`` to
    ``n_classes`` unnormalized scores (logits).
    """

    def __init__(self, vocab_size, n_classes):
        """
        Args:
            vocab_size: Width of every input vector.
            n_classes: Number of output classes.
        """
        super(TextClassifier, self).__init__()
        self.emb = nn.Linear(vocab_size, n_classes)

    def forward(self, x):
        """Return the class scores for a batch ``x``.

        ``nn.Linear`` only accepts inputs of its own (floating point) dtype,
        so integer inputs such as ``torch.long`` indicator vectors are cast
        first; inputs that already match are passed through unchanged.
        """
        return self.emb(x.to(self.emb.weight.dtype))

    def loss(self, scores, target):
        """Return the mean cross-entropy between ``scores`` and ``target``.

        Args:
            scores: Logits as returned by ``forward``.
            target: Class indices (``torch.long``).
        """
        return nn.functional.cross_entropy(scores, target)
    
# The classifier's input width is the number of tokens in the vocabulary.
vocab_size = len(train_dataset.vocabulary)
# Four categories: World, Sports, Business, Sci/Tech.
n_classes = 4
model = TextClassifier(vocab_size, n_classes)

In [12]:
# Sample parameters:
data = {
    'use_gpu': True, # Whether to move the samples to the GPU
    'input_dtype': torch.long, # Data type of the input samples
    'target_dtype': torch.long, # Data type of the target samples
    'train_dataloader': train_dataloader, # Training dataset
    'val_dataloader': val_dataloader # Validation dataset
}

# Optimization parameters:
epochs = 10 # Number of epochs
sample_loss_every = 1 # Number of iterations between accuracy evaluations
learning_rate = 1e-1 # Learning rate
check_on_train = False # Whether to also show the results on the train set

# Training:
# NOTE(review): the recorded output of this cell is a RuntimeError about
# mismatched tensor sizes; presumably the batches were built from samples of
# different lengths — confirm after a clean Restart & Run All.
performance_history = SGDTrainModel(model, data, epochs, learning_rate, sample_loss_every, check_on_train)

RuntimeError: invalid argument 0: Sizes of tensors must match except in dimension 0. Got 6 and 5 in dimension 1 at /opt/conda/conda-bld/pytorch_1570910687650/work/aten/src/TH/generic/THTensor.cpp:689