# Manejo de los datos en NLP

In [16]:
import torch

In [1]:
# Print the README that ships with the dataset (IPython shell escape running `cat`).
!cat ../AG_NEWS/ag_news_csv/readme.txt

AG's News Topic Classification Dataset

Version 3, Updated 09/09/2015


ORIGIN

AG is a collection of more than 1 million news articles. News articles have been gathered from more than 2000  news sources by ComeToMyHead in more than 1 year of activity. ComeToMyHead is an academic news search engine which has been running since July, 2004. The dataset is provided by the academic comunity for research purposes in data mining (clustering, classification, etc), information retrieval (ranking, search, etc), xml, data compression, data streaming, and any other non-commercial activity. For more information, please refer to the link http://www.di.unipi.it/~gulli/AG_corpus_of_news_articles.html .

The AG's news topic classification dataset is constructed by Xiang Zhang (xiang.zhang@nyu.edu) from the dataset above. It is used as a text classification benchmark in the following paper: Xiang Zhang, Junbo Zhao, Yann LeCun. Character-level Convolutional Networks for Text Classification. Adv

In [15]:
!less ../AG_NEWS/ag_news_csv/train.csv

"3","Wall St. Bears Claw Back Into the Black (Reuters)","Reuters - Short-sellers, Wall Street's dwindling\band of ultra-cynics, are seeing green again."
"3","Carlyle Looks Toward Commercial Aerospace (Reuters)","Reuters - Private investment firm Carlyle Group,\which has a reputation for making well-timed and occasionally\controversial plays in the defense industry, has quietly placed\its bets on another part of the market."
"3","Oil and Economy Cloud Stocks' Outlook (Reuters)","Reuters - Soaring crude prices plus worries\about the economy and the outlook for earnings are expected to\hang over the stock market next week during the depth of the\summer doldrums."
"3","Iraq Halts Oil Exports from Main Southern Pipeline (Reuters)","Reuters - Authorities have halted oil export\flows from the main pipeline in southern Iraq after\intelligence showed a rebel militia could strike\infrastructure, an oil official said on Saturday."
"3","Oil prices soar to all-time record, posing new menace to US e

In [26]:
def file_generator(file_path):
    """Lazily yield the lines of a text file, one at a time.

    The file is opened in text read mode when the first line is requested and
    stays open until the generator is exhausted or closed, because the
    ``with`` block lives inside the generator body.

    Args:
        file_path: Path of the file to read.

    Yields:
        Each line of the file, including its trailing newline when present.
    """
    with open(file_path, mode='r') as handle:
        yield from handle

# Pull just the first raw line of the training CSV through the generator.
file_path = '../AG_NEWS/ag_news_csv/train.csv'
my_gen = file_generator(file_path)
first_line = next(my_gen)
print(first_line)

"3","Wall St. Bears Claw Back Into the Black (Reuters)","Reuters - Short-sellers, Wall Street's dwindling\band of ultra-cynics, are seeing green again."



In [44]:
# Scratch cell: a set drops duplicates (and ordering), and `in` on a dict
# tests membership among its keys.
l = ['a', 'a', 'b', 'c', 'd', 'e']
s = {*l}
print(l, s, sep='\n')
d = {'a': 3, 'b': 2}
if 'c' not in d.keys():
    print(d)

['a', 'a', 'b', 'c', 'd', 'e']
{'c', 'd', 'e', 'b', 'a'}
{'a': 3, 'b': 2}


In [68]:
from torch.utils.data import Dataset
import csv
import re

class AGNEWS(Dataset):
    """AG News titles as a map-style PyTorch dataset.

    Each item is a ``(title, class_idx)`` pair:

    * ``title`` is a 1-D ``torch.long`` tensor of vocabulary indices,
      right-padded with the ``<PAD>`` index up to the longest title of the
      selected split. The padding length is computed per split, so a train
      instance and a test instance may pad to different lengths.
    * ``class_idx`` is a 0-D ``torch.long`` tensor holding the label exactly
      as written in the first CSV column (no re-basing is applied).

    The selected split is parsed once in ``__init__`` and kept in memory, so
    ``len()`` and indexing never rescan the CSV file.

    Args:
        root_path: Directory prefix containing ``classes.txt``, ``train.csv``
            and ``test.csv``. It is concatenated as-is with the file names,
            so it must end with a path separator.
        train: Serve ``train.csv`` when true, ``test.csv`` otherwise.

    Attributes set by ``__init__``: ``root_path``, ``data_filename``,
    ``categories``, ``vocabulary`` (word -> frequency), ``word_to_index``,
    ``index_to_word`` and ``size_of_longest_sentence``.
    """

    @staticmethod
    def _read_rows(filename):
        """Yield ``(class_idx, title, description)`` string triples from a CSV file.

        Blank lines are skipped. A non-blank row that does not have exactly
        three columns raises ``ValueError`` from the unpacking below.
        """
        # newline='' lets the csv module handle line endings itself, including
        # newlines embedded in quoted fields.
        with open(filename, 'r', newline='', encoding='utf-8') as f:
            for row in csv.reader(f):
                if not row:  # csv.reader yields [] for blank lines
                    continue
                class_idx, title, description = row
                yield class_idx, title, description

    def _get_categories(self):
        """Return the class names listed one per line in ``classes.txt``."""
        with open(self.root_path + 'classes.txt', 'r', encoding='utf-8') as f:
            # strip() removes the line terminator whatever its form; slicing a
            # fixed number of characters would also eat the last letter of the
            # name when the terminator is a single '\n'.
            stripped = (line.strip() for line in f)
            return [name for name in stripped if name]

    def preprocessing(self, sentence):
        """Tokenize a sentence by splitting it on whitespace."""
        return sentence.split()

    def _load_samples(self):
        """Parse the selected split into ``(label, tokens)`` pairs."""
        return [
            (int(class_idx), self.preprocessing(title))
            for class_idx, title, _ in self._read_rows(self.data_filename)
        ]

    def _get_size_of_longest_sentence(self):
        """Return the token count of the longest title in the selected split.

        Returns 0 when the split has no rows.
        """
        return max((len(tokens) for _, tokens in self._samples), default=0)

    def _get_vocab(self):
        """Count how often each title token appears in train.csv and test.csv.

        Returns:
            A dict mapping word -> frequency. ``<PAD>`` and ``<UNK>`` are
            inserted first (with a count of 0) so that, given dict insertion
            order, they receive indices 0 and 1 when the vocabulary is
            enumerated.
        """
        special_tokens = ['<PAD>', '<UNK>']
        vocabulary = {token: 0 for token in special_tokens}

        for name in ('train.csv', 'test.csv'):
            for _, title, _ in self._read_rows(self.root_path + name):
                for word in self.preprocessing(title):
                    vocabulary[word] = vocabulary.get(word, 0) + 1

        return vocabulary

    def __init__(self, root_path, train=True):
        # Root directory of the data:
        self.root_path = root_path

        # Split selection (training or testing):
        self.data_filename = root_path + ('train.csv' if train else 'test.csv')

        # Class names:
        self.categories = self._get_categories()

        # Vocabulary (holds the frequencies) and the two index mappings; the
        # index of a word is its insertion position in the vocabulary.
        self.vocabulary = self._get_vocab()
        self.word_to_index = {word: idx for idx, word in enumerate(self.vocabulary)}
        self.index_to_word = {idx: word for word, idx in self.word_to_index.items()}

        # Parse the split once; __len__ and __getitem__ work from memory.
        self._samples = self._load_samples()
        self.size_of_longest_sentence = self._get_size_of_longest_sentence()

    def __len__(self):
        """Return the number of samples in the selected split."""
        return len(self._samples)

    def __getitem__(self, idx):
        """Return the padded title tensor and label tensor for sample ``idx``.

        Negative indices count from the end.

        Raises:
            IndexError: If ``idx`` is out of range.
        """
        class_idx, tokens = self._samples[idx]

        # Words missing from the vocabulary map to <UNK> instead of raising.
        unk_index = self.word_to_index['<UNK>']
        title = torch.tensor(
            [self.word_to_index.get(word, unk_index) for word in tokens],
            dtype=torch.long,
        )
        class_idx = torch.tensor(class_idx, dtype=torch.long)

        title = torch.nn.functional.pad(title,
                                        pad=(0, self.size_of_longest_sentence - len(title)),
                                        mode='constant',
                                        value=self.word_to_index['<PAD>'])

        return title, class_idx
            
        
# Build both splits up front; constructing an AGNEWS instance reads the CSV
# files under root_path, so the prefix must end with a path separator.
root_path = '../AG_NEWS/ag_news_csv/'
train_dataset = AGNEWS(root_path=root_path, train=True)
test_dataset = AGNEWS(root_path=root_path, train=False)

In [69]:
# Quick sanity check of the training split: sample count, vocabulary size
# and the first encoded (title, label) pair.
dataset_size = len(train_dataset)
vocab_size = len(train_dataset.vocabulary)
print('Tamaño del dataset: ', dataset_size)
print('Tamaño del vocabulario: ', vocab_size)
print(train_dataset[0])

Tamaño del dataset:  120000
Tamaño del vocabulario:  73957
(tensor([ 2,  3,  4,  5,  6,  7,  8,  9, 10,  0,  0,  0,  0,  0,  0,  0,  0,  0,
         0]), tensor(3))
