# Manejo de los datos en NLP

In [1]:
import torch

In [2]:
!cat ../AG_NEWS/ag_news_csv/readme.txt

AG's News Topic Classification Dataset

Version 3, Updated 09/09/2015


ORIGIN

AG is a collection of more than 1 million news articles. News articles have been gathered from more than 2000  news sources by ComeToMyHead in more than 1 year of activity. ComeToMyHead is an academic news search engine which has been running since July, 2004. The dataset is provided by the academic comunity for research purposes in data mining (clustering, classification, etc), information retrieval (ranking, search, etc), xml, data compression, data streaming, and any other non-commercial activity. For more information, please refer to the link http://www.di.unipi.it/~gulli/AG_corpus_of_news_articles.html .

The AG's news topic classification dataset is constructed by Xiang Zhang (xiang.zhang@nyu.edu) from the dataset above. It is used as a text classification benchmark in the following paper: Xiang Zhang, Junbo Zhao, Yann LeCun. Character-level Convolutional Networks for Text Classification. Adv

In [3]:
!head ../AG_NEWS/ag_news_csv/train.csv

"3","Wall St. Bears Claw Back Into the Black (Reuters)","Reuters - Short-sellers, Wall Street's dwindling\band of ultra-cynics, are seeing green again."
"3","Carlyle Looks Toward Commercial Aerospace (Reuters)","Reuters - Private investment firm Carlyle Group,\which has a reputation for making well-timed and occasionally\controversial plays in the defense industry, has quietly placed\its bets on another part of the market."
"3","Oil and Economy Cloud Stocks' Outlook (Reuters)","Reuters - Soaring crude prices plus worries\about the economy and the outlook for earnings are expected to\hang over the stock market next week during the depth of the\summer doldrums."
"3","Iraq Halts Oil Exports from Main Southern Pipeline (Reuters)","Reuters - Authorities have halted oil export\flows from the main pipeline in southern Iraq after\intelligence showed a rebel militia could strike\infrastructure, an oil official said on Saturday."
"3","Oil prices soar to all-time record, posing new menace to 

In [4]:
from torch.utils.data import Dataset
import csv
import re

class AGNEWS(Dataset):
    """Map-style dataset over the news titles of the AG News CSV files.

    Every CSV row is unpacked into three fields (class id, title,
    description). Only the title is used: it is tokenised with
    ``preprocessing``, mapped to vocabulary indices and right-padded with
    the ``<PAD>`` index up to the longest title of the selected split.
    The label is the class id minus one (presumably ids in the file are
    1-based; confirm against the dataset readme).

    The selected split is parsed once in ``__init__`` and kept in memory, so
    ``__len__`` and ``__getitem__`` no longer re-scan the file on every call.

    Attributes:
        root_path: directory prefix; it is joined by plain string
            concatenation, so it must end with a path separator.
        data_filename: path of the CSV file backing this split.
        categories: class names read from ``classes.txt``.
        vocabulary: token -> number of occurrences in the titles of
            train.csv and test.csv. The special tokens ``<PAD>`` and
            ``<UNK>`` are seeded with 0 and 1 respectively.
        word_to_index / index_to_word: token <-> integer id mappings that
            follow the insertion order of ``vocabulary``.
        size_of_longest_sentence: largest token count of a title in this
            split; every returned sample has this length.
    """

    def _read_rows(self, filename):
        """Yield the non-empty rows of ``filename`` parsed as CSV."""
        # newline='' is what the csv module recommends so that line breaks
        # inside quoted fields are handled by the parser itself.
        with open(filename, 'r', newline='') as f:
            for row in csv.reader(f):
                if row:  # csv.reader yields [] for blank lines
                    yield row

    def _get_categories(self):
        """Return the class names listed one per line in ``classes.txt``."""
        with open(self.root_path + 'classes.txt', 'r') as f:
            # strip() instead of line[:-2]: in text mode the terminator is a
            # single '\n' (or absent on the last line), so slicing off two
            # characters also removed the last letter of every name.
            names = (line.strip() for line in f)
            return [name for name in names if name]

    def preprocessing(self, sentence):
        """Tokenise ``sentence`` by splitting it on whitespace."""
        return sentence.split()

    def _load_samples(self):
        """Parse the selected split into a list of (tokens, label) pairs."""
        return [
            (self.preprocessing(title), int(class_idx) - 1)
            for class_idx, title, _description in self._read_rows(self.data_filename)
        ]

    def _get_size_of_longest_sentence(self):
        """Return the largest number of tokens of a title in this split."""
        return max((len(tokens) for tokens, _label in self._samples), default=0)

    def _get_vocab(self):
        """Return a dict mapping each title token to its occurrence count.

        Both train.csv and test.csv are scanned, in that order. The special
        tokens come first in the dict so they receive indices 0 and 1.
        """
        special_tokens = ['<PAD>', '<UNK>']
        vocabulary = {token: i for i, token in enumerate(special_tokens)}

        for name in ('train.csv', 'test.csv'):
            rows = self._read_rows(self.root_path + name)
            for _class_idx, title, _description in rows:
                for word in self.preprocessing(title):
                    vocabulary[word] = vocabulary.get(word, 0) + 1

        return vocabulary

    def __init__(self, root_path, train=True):
        """Load categories, vocabulary and the samples of one split.

        Args:
            root_path: directory prefix holding classes.txt, train.csv and
                test.csv (must end with a path separator).
            train: read train.csv when true, test.csv otherwise.
        """
        # Root directory of the data:
        self.root_path = root_path

        # Split selection (training or testing):
        self.data_filename = root_path + ('train.csv' if train else 'test.csv')

        # Categories:
        self.categories = self._get_categories()

        # Vocabulary (holds the frequencies) and index mappings:
        self.vocabulary = self._get_vocab()
        self.word_to_index = {word: idx for idx, word in enumerate(self.vocabulary)}
        self.index_to_word = {idx: word for idx, word in enumerate(self.vocabulary)}

        # Samples of the selected split, parsed a single time:
        self._samples = self._load_samples()
        self.size_of_longest_sentence = self._get_size_of_longest_sentence()

    def __len__(self):
        """Return the number of samples in the selected split."""
        return len(self._samples)

    def __getitem__(self, idx):
        """Return ``(title, class_idx)`` for sample ``idx``.

        ``title`` is a 1-D long tensor of ``size_of_longest_sentence`` token
        indices (right-padded with the ``<PAD>`` index) and ``class_idx`` is
        a 0-d long tensor.

        Raises:
            TypeError: if ``idx`` is a slice.
            IndexError: if ``idx`` is out of range.
        """
        if isinstance(idx, slice):
            raise TypeError('AGNEWS indices must be integers, not slice')
        tokens, label = self._samples[idx]

        # Tokens missing from the vocabulary fall back to <UNK>.
        unk_index = self.word_to_index['<UNK>']
        title = torch.tensor([self.word_to_index.get(word, unk_index) for word in tokens],
                             dtype=torch.long)
        class_idx = torch.tensor(label, dtype=torch.long)

        title = torch.nn.functional.pad(title,
                                        pad=(0, self.size_of_longest_sentence - len(title)),
                                        mode='constant',
                                        value=self.word_to_index['<PAD>'])

        return title, class_idx
            
        
root_path = '../AG_NEWS/ag_news_csv/'
train_dataset = AGNEWS(root_path, train=True)
# The validation split is carved out of the training file by index, so the
# already-built training dataset is reused instead of parsing the CSV files
# and rebuilding an identical vocabulary a second time.
val_dataset = train_dataset
test_dataset = AGNEWS(root_path, train=False)

In [None]:
###############################################
# VARIANT: WHOLE CSV TEXT LOADED AT ONCE      #
###############################################

from torch.utils.data import Dataset
import csv
import re

class AGNEWS(Dataset):
    """AG News title dataset that loads the selected CSV split in one read.

    The whole file of the selected split is read into ``self.text`` and the
    samples are parsed from that in-memory text a single time, so
    ``__len__`` and ``__getitem__`` do not touch the disk. Every CSV row is
    unpacked into three fields (class id, title, description); only the
    title is used. The label is the class id minus one (presumably ids in
    the file are 1-based; confirm against the dataset readme).

    Attributes:
        root_path: directory prefix; it is joined by plain string
            concatenation, so it must end with a path separator.
        data_filename: path of the CSV file backing this split.
        text: raw contents of ``data_filename``.
        categories: class names read from ``classes.txt``.
        vocabulary: token -> number of occurrences in the titles of
            train.csv and test.csv. The special tokens ``<PAD>`` and
            ``<UNK>`` are seeded with 0 and 1 respectively.
        word_to_index / index_to_word: token <-> integer id mappings that
            follow the insertion order of ``vocabulary``.
        size_of_longest_sentence: largest token count of a title in this
            split; every returned sample has this length.
    """

    def _get_categories(self):
        """Return the class names listed one per line in ``classes.txt``."""
        with open(self.root_path + 'classes.txt', 'r') as f:
            # strip() instead of line[:-2]: in text mode the terminator is a
            # single '\n' (or absent on the last line), so slicing off two
            # characters also removed the last letter of every name.
            names = (line.strip() for line in f)
            return [name for name in names if name]

    def preprocessing(self, sentence):
        """Tokenise ``sentence`` by splitting it on whitespace."""
        return sentence.split()

    def _iter_rows(self, lines):
        """Yield the non-empty CSV rows parsed from an iterable of lines."""
        for row in csv.reader(lines):
            if row:  # csv.reader yields [] for blank lines
                yield row

    def _load_samples(self):
        """Parse ``self.text`` into a list of (tokens, label) pairs."""
        # Local import keeps this class self-contained. StringIO iterates the
        # text line by line exactly like the file object it was read from.
        import io

        rows = self._iter_rows(io.StringIO(self.text))
        return [
            (self.preprocessing(title), int(class_idx) - 1)
            for class_idx, title, _description in rows
        ]

    def _get_size_of_longest_sentence(self):
        """Return the largest number of tokens of a title in this split."""
        return max((len(tokens) for tokens, _label in self._samples), default=0)

    def _get_vocab(self):
        """Return a dict mapping each title token to its occurrence count.

        Both train.csv and test.csv are scanned, in that order. The special
        tokens come first in the dict so they receive indices 0 and 1.
        """
        special_tokens = ['<PAD>', '<UNK>']
        vocabulary = {token: i for i, token in enumerate(special_tokens)}

        for name in ('train.csv', 'test.csv'):
            with open(self.root_path + name, 'r') as f:
                for _class_idx, title, _description in self._iter_rows(f):
                    for word in self.preprocessing(title):
                        vocabulary[word] = vocabulary.get(word, 0) + 1

        return vocabulary

    def __init__(self, root_path, train=True):
        """Load the split text, categories, vocabulary and samples.

        Args:
            root_path: directory prefix holding classes.txt, train.csv and
                test.csv (must end with a path separator).
            train: read train.csv when true, test.csv otherwise.
        """
        # Root directory of the data:
        self.root_path = root_path

        # Split selection (training or testing):
        self.data_filename = root_path + ('train.csv' if train else 'test.csv')

        # Raw text of the selected split, read in one go:
        with open(self.data_filename, 'r') as f:
            self.text = f.read()

        # Categories:
        self.categories = self._get_categories()

        # Vocabulary (holds the frequencies) and index mappings:
        self.vocabulary = self._get_vocab()
        self.word_to_index = {word: idx for idx, word in enumerate(self.vocabulary)}
        self.index_to_word = {idx: word for idx, word in enumerate(self.vocabulary)}

        # Samples parsed once from the in-memory text:
        self._samples = self._load_samples()
        self.size_of_longest_sentence = self._get_size_of_longest_sentence()

    def __len__(self):
        """Return the number of samples in the selected split."""
        return len(self._samples)

    def __getitem__(self, idx):
        """Return ``(title, class_idx)`` for sample ``idx``.

        ``title`` is a 1-D long tensor of ``size_of_longest_sentence`` token
        indices (right-padded with the ``<PAD>`` index) and ``class_idx`` is
        a 0-d long tensor.

        Raises:
            TypeError: if ``idx`` is a slice.
            IndexError: if ``idx`` is out of range.
        """
        if isinstance(idx, slice):
            raise TypeError('AGNEWS indices must be integers, not slice')
        tokens, label = self._samples[idx]

        # Tokens missing from the vocabulary fall back to <UNK>.
        unk_index = self.word_to_index['<UNK>']
        title = torch.tensor([self.word_to_index.get(word, unk_index) for word in tokens],
                             dtype=torch.long)
        class_idx = torch.tensor(label, dtype=torch.long)

        title = torch.nn.functional.pad(title,
                                        pad=(0, self.size_of_longest_sentence - len(title)),
                                        mode='constant',
                                        value=self.word_to_index['<PAD>'])

        return title, class_idx
            
        
root_path = '../AG_NEWS/ag_news_csv/'
train_dataset = AGNEWS(root_path, train=True)
# The validation split is carved out of the training file by index, so the
# already-built training dataset is reused instead of reading the CSV files
# and rebuilding an identical vocabulary a second time.
val_dataset = train_dataset
test_dataset = AGNEWS(root_path, train=False)

In [5]:
# Quick sanity check: number of samples, number of vocabulary entries and
# the first (padded title indices, class index) pair of the training set.
print('Tamaño del dataset: ', len(train_dataset))
print('Tamaño del vocabulario: ', len(train_dataset.vocabulary))
print(train_dataset[0])

Tamaño del dataset:  120000
Tamaño del vocabulario:  73957
(tensor([ 2,  3,  4,  5,  6,  7,  8,  9, 10,  0,  0,  0,  0,  0,  0,  0,  0,  0,
         0]), tensor(2))


In [6]:
# Batch size and fraction of the training file held out for validation.
batch_size = 64
val_size = .02

# Number of samples used for training; the remainder goes to validation.
NUM_TRAIN = int((1 - val_size) * len(train_dataset))
NUM_VAL = len(train_dataset) - NUM_TRAIN


def sampler(start, end):
    """Return a sampler that draws the indices in [start, end) in random order."""
    index_range = range(start, end)
    return torch.utils.data.SubsetRandomSampler(index_range)


# Training loader: shuffled indices 0 .. NUM_TRAIN - 1.
train_dataloader = torch.utils.data.DataLoader(
    train_dataset,
    batch_size=batch_size,
    sampler=sampler(0, NUM_TRAIN),
)

# Validation loader: shuffled indices NUM_TRAIN .. NUM_TRAIN + NUM_VAL - 1.
val_dataloader = torch.utils.data.DataLoader(
    val_dataset,
    batch_size=batch_size,
    sampler=sampler(NUM_TRAIN, NUM_TRAIN + NUM_VAL),
)

# Test loader: sequential pass over the whole test split.
test_dataloader = torch.utils.data.DataLoader(
    test_dataset,
    batch_size=batch_size,
)

In [7]:
import torch.nn as nn

class EmbeddingSoftmaxClassifier(torch.nn.Module):
    """Classifier that averages token embeddings and applies a linear layer.

    Args:
        n_vectors: number of rows of the embedding table.
        embedding_dim: size of each embedding vector.
        n_classes: number of output scores per sample.
    """

    def __init__(self, n_vectors, embedding_dim, n_classes):
        super().__init__()
        # Creation order is kept (embedding first, then linear) so that
        # seeded parameter initialisation stays the same.
        self.emb = nn.Embedding(num_embeddings=n_vectors, embedding_dim=embedding_dim)
        self.linear = nn.Linear(in_features=embedding_dim, out_features=n_classes)

    def forward(self, x):
        """Return the class scores for a batch of index tensors.

        The embeddings are averaged along dimension 1 (assumed to be the
        token axis of a (batch, tokens) input; confirm with the caller).
        """
        embedded = self.emb(x)
        averaged = torch.mean(embedded, dim=1)
        return self.linear(averaged)

    def loss(self, scores, target):
        """Return the cross-entropy loss between ``scores`` and ``target``."""
        return nn.functional.cross_entropy(scores, target)
    
n_classes = len(train_dataset.categories) # Number of categories
n_vectors = len(train_dataset.vocabulary) # Number of vocabulary entries (rows of the embedding table)
embedding_dim = 50 # Size of each embedding vector
model = EmbeddingSoftmaxClassifier(n_vectors, embedding_dim, n_classes)

In [8]:
import torch.optim as optim

def CheckAccuracy(loader, model, device, input_dtype, target_dtype):
    """Count how many samples of ``loader`` the model classifies correctly.

    The model is switched to evaluation mode and left in that mode; callers
    that keep training must call ``model.train()`` again.

    Args:
        loader: iterable yielding ``(x, y)`` tensor batches.
        model: callable returning one row of class scores per sample.
        device: device the batches are moved to.
        input_dtype: dtype the inputs are cast to.
        target_dtype: dtype the targets are cast to.

    Returns:
        ``(num_correct, num_samples)`` as plain Python ints; ``(0, 0)`` when
        the loader yields no batches.
    """
    num_correct = 0
    num_samples = 0
    model.eval()
    with torch.no_grad():
        for x, y in loader:
            x = x.to(device=device, dtype=input_dtype)
            y = y.to(device=device, dtype=target_dtype)

            scores = model(x)
            _, preds = scores.max(1)
            # .item() keeps the counter a Python int, so the return type is
            # the same whether or not the loader produced any batch.
            num_correct += (preds == y).sum().item()
            num_samples += preds.size(0)

    return num_correct, num_samples
        

def TrainModel(model, data, epochs=1, learning_rate=1e-2, sample_loss_every=100):
    """Train ``model`` with SGD and periodically measure validation accuracy.

    Args:
        model: module exposing ``loss(scores, target)``.
        data: dict with the keys ``'input_dtype'``, ``'target_dtype'``,
            ``'device'``, ``'train_dataloader'`` and ``'val_dataloader'``.
        epochs: number of passes over the training dataloader.
        learning_rate: learning rate handed to ``optim.SGD``.
        sample_loss_every: validation accuracy is computed every this many
            global iterations.

    Returns:
        Dict with the lists ``'iter'`` (global iteration number), ``'loss'``
        (training loss of that iteration) and ``'accuracy'`` (validation
        accuracy measured at that iteration).
    """
    input_dtype = data['input_dtype']
    target_dtype = data['target_dtype']
    device = data['device']
    train_dataloader = data['train_dataloader']
    val_dataloader = data['val_dataloader']

    performance_history = {'iter': [], 'loss': [], 'accuracy': []}

    model = model.to(device=device)
    optimizer = optim.SGD(model.parameters(), lr=learning_rate)
    # len() of a dataloader is its number of batches, not the batch size.
    n_batches = len(train_dataloader)
    for e in range(epochs):
        for t, (x, y) in enumerate(train_dataloader):
            # CheckAccuracy leaves the model in eval mode, so training mode
            # is restored at every iteration.
            model.train()
            x = x.to(device=device, dtype=input_dtype)
            y = y.to(device=device, dtype=target_dtype)

            # Forward pass
            scores = model(x)

            # Backward pass
            loss = model.loss(scores, y)
            optimizer.zero_grad()
            loss.backward()
            optimizer.step()

            global_step = e * n_batches + t
            if global_step % sample_loss_every == 0:
                num_correct, num_samples = CheckAccuracy(val_dataloader, model, device, input_dtype, target_dtype)
                # Guard against an empty validation loader.
                accuracy = float(num_correct) / num_samples if num_samples else 0.0
                # Store the global step so entries stay unique across epochs.
                performance_history['iter'].append(global_step)
                performance_history['loss'].append(loss.item())
                performance_history['accuracy'].append(accuracy)
                print('Epoch: %d, Iteration: %d, Accuracy: %d/%d ' % (e, t, num_correct, num_samples))

    num_correct, num_samples = CheckAccuracy(val_dataloader, model, device, input_dtype, target_dtype)
    final_accuracy = float(num_correct) / num_samples if num_samples else 0.0
    print('Final accuracy: %.2f%%' % (100 * final_accuracy))

    return performance_history


# How the training data is fetched and where the computation runs:
use_gpu = True
device = torch.device('cuda' if torch.cuda.is_available() and use_gpu else 'cpu')

data = dict(
    device=device,
    input_dtype=torch.long,
    target_dtype=torch.long,
    train_dataloader=train_dataloader,
    val_dataloader=val_dataloader,
)

# Model hyperparameters and logging cadence:
epochs = 1  # Number of epochs
sample_loss_every = 1  # Number of iterations between progress reports
learning_rate = 1e-1  # Learning rate

# Training state, unpacked from the configuration dict:
input_dtype, target_dtype = data['input_dtype'], data['target_dtype']
device = data['device']
train_dataloader, val_dataloader = data['train_dataloader'], data['val_dataloader']

performance_history = {key: [] for key in ('iter', 'loss', 'accuracy')}

model = model.to(device=device)
optimizer = optim.SGD(model.parameters(), lr=learning_rate)

In [12]:
# Inline version of the training loop: one SGD step per batch, printing the
# in-epoch iteration and the epoch instead of evaluating on validation data.
n_batches = len(train_dataloader)  # number of batches per epoch
for e in range(epochs):
    for t, (x,y) in enumerate(train_dataloader):
        model.train()
        x = x.to(device=device, dtype=input_dtype)
        y = y.to(device=device, dtype=target_dtype)

        # Forward pass
        scores = model(x) 

        # Backward pass
        loss = model.loss(scores,y)                 
        optimizer.zero_grad()
        loss.backward()
        optimizer.step()

        # Report progress every `sample_loss_every` global iterations.
        if (e * n_batches + t) % sample_loss_every == 0:
            print('t = {}, epoch = {}'.format(t, e))

t = 0, epoch = 0
t = 1, epoch = 0
t = 2, epoch = 0
t = 3, epoch = 0
t = 4, epoch = 0
t = 5, epoch = 0
t = 6, epoch = 0
t = 7, epoch = 0
t = 8, epoch = 0
t = 9, epoch = 0
t = 10, epoch = 0
t = 11, epoch = 0
t = 12, epoch = 0
t = 13, epoch = 0


KeyboardInterrupt: 