In [1]:
import torch
import pandas as pd
from torch.utils.data import Dataset,DataLoader
from transformers import AutoTokenizer

In [2]:
def load_data(path,type='JSON'):
    if type == 'JSON':
        return pd.read_json(path)
    else:
        return pd.read_csv(path)

In [16]:
content = load_data('data/cleaned_es_data.json')
labels = content['category'].tolist()
heads = content['headline'].tolist()
descriptions = content['description'].tolist()

In [9]:
print(type(labels))

<class 'list'>


In [10]:
def idx_init(data):
    label_set = set(data)
    label_dic = {}
    for idx,label in enumerate(label_set):
        label_dic[label] = idx
    return label_set,label_dic

In [11]:
itoc,ctoi = idx_init(labels)
for i in range(len(labels)):
    labels[i] = ctoi[labels[i]]
print(labels[:5])

[16, 13, 13, 13, 13]


In [23]:
def encode(tokenizer,contents1,contents2=None):
    if contents2:
        return tokenizer(contents1,contents2,truncation=True,return_tensors='pt')
    else:
        return tokenizer(contents1,truncation=True,return_tensors='pt')

In [17]:
model_name = "dccuchile/bert-base-spanish-wwm-cased"
encodings = encode(model_name,heads,descriptions)

Asking to truncate to max_length but no maximum length is provided and the model has no predefined maximum length. Default to no truncation.


Tokenizer initalized...


In [22]:
try:
    tokenizer = AutoTokenizer.from_pretrained(model_name)
except OSError:
    print('Can not load config for {}'.format(model_name))
else:
    print('Tokenizer initalized...')
    
for i in encodings['input_ids'][:3]:
    print(i.shape)
    print(tokenizer.decode(i))
    

Tokenizer initalized...
torch.Size([548])
[CLS] Hubo 2 disparos en masa en Texas la semana pasada, pero sólo 1 en la televisión [SEP] Dejó a su marido, él mató a sus hijos, sólo otro día en América. [SEP] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PA

In [12]:
class NewsCategoryDataset(Dataset):
    def __init__(self, path,tk=None,mode='train', balance=[0.7,0.15,0.15]):
        self.df = pd.read_json(path)
        train_num = int(len(self.df)*balance[0])
        val_num = int(len(self.df)*balance[1])
        test_num = int(len(self.df)*balance[2])
        #choose mode
        if mode == 'train':
            self.df = self.df[:train_num]
        elif mode == 'val':
            self.df = self.df[train_num:train_num + val_num]
        elif mode == 'test':
            self.df = self.df[-test_num:]
        # convert idx
        for idx in range(len(self.df))
        if tk:
            self.tokenizer = AutoTokenizer.from_pretrained(tk)
        


    def __len__(self):
        return len(self.df)
    
    def __getitem__(self,idx):
        if torch.is_tensor(idx):
            idx = idx.tolist()

        return {'label': self.df['category'][idx],
                'headline': self.df['headline'][idx],
                'description': self.df['description'][idx]}

In [13]:
ds = NewsCategoryDataset('data/cleaned_es_data.json')

In [14]:
print(ds[1])

{'label': 'ENTERTAINMENT', 'headline': 'Hugh Grant se casa por primera vez a la edad de 57 años', 'description': 'El actor y su larga novia Anna Eberstein ataron el nudo en una ceremonia civil.'}


In [15]:
dataloader = DataLoader(ds,batch_size=5,shuffle=True,num_workers=0)