In [1]:
import torch
import torch.nn as nn
import pandas as pd
from torch.utils.data import Dataset,DataLoader
from transformers import AutoTokenizer,AutoConfig,AutoModel,Trainer
from tqdm import tqdm,trange
from torch.autograd import Variable

In [2]:
MODEL_NAME = 'dccuchile/bert-base-spanish-wwm-cased'
LOG_DIR = "./news-classification"
MAX_SEQ_LENGTH = 512
BATCH_SIZE = 10
LEARN_RATE = 5e-5

In [3]:
def load_data(path,type='JSON'):
    if type == 'JSON':
        df = pd.read_json(path)
    else:
        df = pd.read_csv(path)
    labels = df['category'].tolist()
    heads = df['headline'].tolist()
    descriptions = df['description'].tolist()
    contents = [h+d for h,d in zip(heads,descriptions)]
    label_set = set(labels)
    label_dic = {}
    for idx,label in enumerate(label_set):
        label_dic[label] = idx
    print('Data loaded: ',len(labels))
    return labels,contents,label_set,label_dic

In [4]:
labels,contents,lset,ldic = load_data('data/cleaned_es_data.json')

Data loaded:  128850


In [5]:
def pre_encode_dic(model_name,contents,max_length=512):
    try:
        tokenizer = AutoTokenizer.from_pretrained(model_name)
    except OSError:
        print('OSError! can not load tokenizer!')
    else:
        print('Tokenizer loaded...')
        return tokenizer(contents,
                         add_special_tokens=True,
                         padding=True,
                         max_length=max_length,
                         truncation=True,
                         return_tensors='pt')

In [6]:
def pre_encode_list(model_name,contents,max_length=512):
    try:
        tokenizer = AutoTokenizer.from_pretrained(model_name)
    except OSError:
        print('OSError! can not load tokenizer!')
    else:
        print('Tokenizer loaded...')
        res = []
        for content in tqdm(contents):
            res.append(tokenizer(content,
                                 add_special_tokens=True,
                                 padding='max_length',
                                 max_length=max_length,
                                 truncation=True,
                                 return_tensors='pt'))
        return res

In [7]:
import time
# encode 1 87s
start = time.time()
encodings_list = pre_encode_list(MODEL_NAME,contents,MAX_SEQ_LENGTH)
print(time.time()-start)

# start = time.time()
# encodings_dic = pre_encode_dic(MODEL_NAME,contents,MAX_SEQ_LENGTH)
# print(time.time()-start)

  0%|▏                                                                          | 318/128850 [00:00<01:20, 1592.58it/s]

Tokenizer loaded...


100%|████████████████████████████████████████████████████████████████████████| 128850/128850 [01:30<00:00, 1423.71it/s]

93.75212502479553





In [136]:
print(encodings[0]['input_ids'])
print(len(encodings[1]['input_ids']))

tensor([[    4, 10735,  1115, 15063,  1036,  6770,  1036, 11173,  1030,  3165,
          9091,  1017,  1355,  1737,  1094,  1036,  1030,  5011, 30971, 30931,
          7482,  1013,  1079,  5245,  1017,  1611,  6847,  1013,  1287,  3339,
          1017,  1737,  1925,  1726,  1036,  4109,  1009,     5,     1,     1,
             1,     1,     1,     1,     1,     1,     1,     1,     1,     1,
             1,     1,     1,     1,     1,     1,     1,     1,     1,     1,
             1,     1,     1,     1,     1,     1,     1,     1,     1,     1,
             1,     1,     1,     1,     1,     1,     1,     1,     1,     1,
             1,     1,     1,     1,     1,     1,     1,     1,     1,     1,
             1,     1,     1,     1,     1,     1,     1,     1,     1,     1,
             1,     1,     1,     1,     1,     1,     1,     1,     1,     1,
             1,     1,     1,     1,     1,     1,     1,     1,     1,     1,
             1,     1,     1,     1,     1,     1,  

In [8]:
class NewsCategoryDataset(Dataset):
    def __init__(self, 
                 labels,
                 label_dic,
                 inputs,
                 mode='train',
                 balance=[0.7,0.15,0.15]):
        train_num = int(len(labels)*balance[0])
        val_num = int(len(labels)*balance[1])
        test_num = int(len(labels)*balance[2])
        #choose mode
        if mode == 'train':
            self.inputs = inputs[:train_num]
        elif mode == 'val':
            self.inputs = inputs[train_num:-test_num]
        elif mode == 'test':
            self.inputs = inputs[-test_num:]
        if labels is not None:
            self.label_dic = label_dic
            if mode == 'train':
                self.labels = labels[:train_num]
            elif mode == 'val':
                self.labels = labels[train_num:-test_num]
            elif mode == 'test':
                self.labels = labels[-test_num:]
                  

    def __len__(self):
        return len(self.inputs)
    
    def __getitem__(self,idx):
        if torch.is_tensor(idx):
            idx = idx.tolist()
        
        res = {'input_ids':self.inputs[idx]['input_ids'][0],
               'token_type_ids':self.inputs[idx]['token_type_ids'][0],
               'attention_mask':self.inputs[idx]['attention_mask'][0]}
        
        if self.labels is not None:
            y = self.labels[idx]
            y_encoded = torch.Tensor(
                [self.label_dic.get(y, -1)]
            ).long()
            res["targets"] = y_encoded
        
        return res

In [9]:
train_dataset = NewsCategoryDataset(labels,ldic,encodings_list)
print(len(train_dataset))
valid_dataset = NewsCategoryDataset(labels,ldic,encodings_list,'val')
print(len(valid_dataset))

90195
19328


In [10]:
train_val_loaders = {
    "train": DataLoader(dataset=train_dataset,
                        batch_size=BATCH_SIZE, 
                        shuffle=True),
    "valid": DataLoader(dataset=valid_dataset,
                        batch_size=BATCH_SIZE, 
                        shuffle=False)    
}

In [11]:
class BertForSequenceClassification(nn.Module):
    def __init__(self,model_name,num_classes=None):
        super().__init__()
        
        config = AutoConfig.from_pretrained(model_name,num_labels=num_classes)
        
        self.bert = AutoModel.from_pretrained(model_name,
                                              config=config)
        
        self.MLP = nn.Sequential(
            nn.Linear(config.hidden_size,config.hidden_size),
            nn.Linear(config.hidden_size,num_classes)
        )
        
        self.dropout = nn.Dropout(0.2)
        
    def forward(self,features,attention_mask=None,head_mask=None):
        
        assert attention_mask is not None,'attention_mask is none'
        bert_output = self.bert(features,
                               attention_mask=attention_mask,
                               head_mask=head_mask)
        
        hidden_state = bert_output[0]
        pool_output = hidden_state[:,0]
        pool_output = self.MLP[0](pool_output)
        pool_output = nn.ReLU()(pool_output)
        pool_output = self.dropout(pool_output)
        logits = self.MLP[1](pool_output)
        
        return logits
        

In [12]:
model = BertForSequenceClassification(MODEL_NAME,len(lset))

criterion = nn.CrossEntropyLoss()
optimizer = torch.optim.Adam(model.parameters(),lr=LEARN_RATE)
scheduler = torch.optim.lr_scheduler.ReduceLROnPlateau(optimizer)

In [13]:
from transformers import TrainingArguments

In [17]:
training_args = TrainingArguments(
    output_dir='./results',          # output directory
    num_train_epochs=3,              # total # of training epochs
    per_device_train_batch_size=16,  # batch size per device during training
    per_device_eval_batch_size=64,   # batch size for evaluation
    warmup_steps=500,                # number of warmup steps for learning rate scheduler
    weight_decay=0.01,               # strength of weight decay
    logging_dir='./logs',            # directory for storing logs
    load_best_model_at_end=True,
    label_names=['label']
)
trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=train_dataset,
    eval_dataset=valid_dataset
)

In [13]:
def train_classifier(model,epoch,iterator,loss,optimizer=None,
                     scheduler=None
                    ):
    loss = 0
    tokens = 0
    model.to('cuda')
    for i in trange(epoch):
        for batch,data in enumerate(iterator):
            features = Variable(data['input_ids']).to('cuda')
            print(features.shape)
            att_mask = Variable(data['attention_mask']).to('cuda')
            print(att_mask.shape)
            y = data['targets']
            print(y.shape)
            y_pred = model(features,att_mask)
            print(y_pred.shape)
            print(y.shape)
            if batch==3:
                break
            
            
            
            

In [14]:
train_classifier(model,3,train_val_loaders['train'],criterion,optimizer,scheduler)

  0%|                                                                                            | 0/3 [00:00<?, ?it/s]

torch.Size([10, 512])
torch.Size([10, 512])
torch.Size([10, 1])


  0%|                                                                                            | 0/3 [00:00<?, ?it/s]

torch.Size([10, 26])
torch.Size([10, 1])
torch.Size([10, 512])
torch.Size([10, 512])
torch.Size([10, 1])





RuntimeError: CUDA out of memory. Tried to allocate 16.00 MiB (GPU 0; 8.00 GiB total capacity; 6.20 GiB already allocated; 10.72 MiB free; 6.23 GiB reserved in total by PyTorch)

In [223]:
data = train_dataset[0]
f = data['input_ids']
am = data['attention_mask']
y = data['targets']
logits = model(f,am)

In [227]:
print(logits)
print(y)
print(logits.shape)
print(y.shape)

tensor([[-0.3122,  0.1587,  0.1271, -0.2139,  0.1143, -0.1041,  0.0060, -0.2339,
          0.0685,  0.3044,  0.0594, -0.1323,  0.1277, -0.4797,  0.2011, -0.1829,
          0.4356,  0.0585,  0.1889,  0.2406,  0.2503,  0.0744,  0.0250,  0.1773,
         -0.3891, -0.1906]], grad_fn=<AddmmBackward>)
tensor([4])
torch.Size([1, 26])
torch.Size([1])


In [228]:
logits[0]

tensor([-0.3122,  0.1587,  0.1271, -0.2139,  0.1143, -0.1041,  0.0060, -0.2339,
         0.0685,  0.3044,  0.0594, -0.1323,  0.1277, -0.4797,  0.2011, -0.1829,
         0.4356,  0.0585,  0.1889,  0.2406,  0.2503,  0.0744,  0.0250,  0.1773,
        -0.3891, -0.1906], grad_fn=<SelectBackward>)

In [230]:
loss = criterion(logits,y)
loss

tensor(3.1820, grad_fn=<NllLossBackward>)

In [17]:
del model
torch.cuda.empty_cache() 


NameError: name 'model' is not defined