In [18]:
import pandas as pd
import numpy as np
from transformers import BertModel, BertForSequenceClassification
from transformers import BertTokenizer
from transformers import AdamW
from torch import nn
import torch

In [39]:
# BertForSeqClassification
## define bert model, optmizer, loss func etc.
model = BertForSequenceClassification.from_pretrained("bert-base-uncased", num_labels = 672)
tokenizer = BertTokenizer.from_pretrained("bert-base-uncased")
# the inserted loss function for BertForSeqClassification, here we replace with BCELoss multi-label
loss_func = nn.BCEWithLogitsLoss()
# learning rate
lr = 0.01
# optimizer
no_decay = ['bias', 'LayerNorm.weight']
optimizer_grouped_parameters = [
    {'params': [p for n, p in model.named_parameters() if not any(nd in n for nd in no_decay)], 'weight_decay': 0.01},
    {'params': [p for n, p in model.named_parameters() if any(nd in n for nd in no_decay)], 'weight_decay': 0.0}
]
optimizer = AdamW(optimizer_grouped_parameters, lr=lr)

Some weights of the model checkpoint at bert-base-uncased were not used when initializing BertForSequenceClassification: ['cls.predictions.bias', 'cls.predictions.transform.dense.weight', 'cls.predictions.transform.dense.bias', 'cls.predictions.decoder.weight', 'cls.seq_relationship.weight', 'cls.seq_relationship.bias', 'cls.predictions.transform.LayerNorm.weight', 'cls.predictions.transform.LayerNorm.bias']
- This IS expected if you are initializing BertForSequenceClassification from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing BertForSequenceClassification from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).
Some weights of BertForSequenceClassification were not initialized from the model checkpoint at

In [20]:
# load data and process for train
data = pd.read_csv(r"data_with_labels.csv", encoding = "utf-8", error_bad_lines = False)
def load_data(data, i):
    text = data.iloc[i]["text"]
    labels = data.iloc[i][1:]
    return text, labels
def tokenize(tokenizer, text):
    tokens = tokenizer(text, padding = True, truncation = True, return_tensors = "pt")
    input_ids = tokens["input_ids"]
    attention_mask = tokens["attention_mask"]
    return input_ids, attention_mask

In [43]:
from tqdm import tqdm
# train
with tqdm(total = len(data)) as phbar:
    phbar.set_description("Processing:")
    for i in range(len(data)):
        text, labels = load_data(data, i)
        input_ids, attention_mask = tokenize(tokenizer, text)
        optimizer.zero_grad()
        labels = torch.tensor(labels).unsqueeze(0)
        outputs = model(input_ids, attention_mask)
        try:
            loss = loss_func(outputs.logits, labels.fl)
        except Exception as e:
            print(e)
            continue
        loss.backward()
        optimizer.step()
        phbar.update(1)

Processing::  14%|█▍        | 3053/21578 [3:03:51<18:35:36,  3.61s/it]  


KeyboardInterrupt: 

In [62]:
from sklearn.metrics import f1_score,recall_score, precision_score
# test
total = 0
correct = 0
recall_micro = 0
recall_macro = 0
precision_micro = 0
precision_macro = 0
micro_f1 = 0
macro_f1 = 0
with tqdm(total = 500) as phbar:
    phbar.set_description("Processing:")
    for i in range(len(data) - 500, len(data)):
        text, labels = load_data(data, i)
        input_ids, attention_mask = tokenize(tokenizer, text)
        labels = torch.tensor(labels).unsqueeze(0)
        # generate prediction
        outputs = model(input_ids, attention_mask=attention_mask)  # don't use internal CrossEntropyLoss
        prob = outputs.logits.sigmoid()   # Because BCEWithLogitsLoss has sigmoid
            
        # record processed data count
        total += (labels.size(0)*labels.size(1))

        # take the index of the highest prob as prediction output
        THRESHOLD = 0.7
        prediction = prob.detach().clone()
        prediction[prediction > THRESHOLD] = 1
        prediction[prediction <= THRESHOLD] = 0
        recall_micro += recall_score(prediction, labels, average="micro", zero_division=1)
        precision_micro += precision_score(prediction, labels, average="micro", zero_division=1)
        recall_macro += recall_score(prediction, labels, average="macro", zero_division=1)
        precision_macro += precision_score(prediction, labels, average="macro", zero_division=1)
        micro_f1 += f1_score(prediction, labels, average="micro", zero_division=1)
        macr0_f1 += f1_score(prediction, labels, average="macro",zero_division=1)
        correct += prediction.eq(labels).sum().item()
        phbar.update(1)
    
        

Processing:: 100%|██████████| 500/500 [02:47<00:00,  2.99it/s]


In [63]:
# print completed result
acc = 100.*correct/total
recall_micro = 100.*recall_micro/total
recall_macro = 100.*recall_macro/total
precision_micro = 100.*precision_micro/total
precision_macro = 100.*precision_macro/total
micro_f1 = 100.*micro_f1/total
macro_f1 = 100*macr0_f1/total

print('correct: %i  / total: %i / test_acc: %f / test_recall_micro: %f / test_recall_macro: %f / test_precision_micro: %f / test_precision_macro: %f / test_micro_f1: %f / test_macro_f1: %f' % (correct, total, acc, recall_micro, recall_macro, precision_micro, precision_macro, micro_f1, macro_f1))

correct: 335292  / total: 336000 / test_acc: 99.789286 / test_recall_micro: 0.088690 / test_recall_macro: 0.148720 / test_precision_micro: 0.077103 / test_precision_macro: 0.148585 / test_micro_f1: 0.065266 / test_macro_f1: 0.381079
