In [None]:
import numpy as np
import pandas as pd
from tqdm import tqdm
import gc
import transformers
import torch
from torch.utils.data import (
    Dataset, 
    DataLoader
)

import math

from transformers import  (
    BertPreTrainedModel, 
    XLMRobertaConfig,
    XLMRobertaTokenizer
)

from transformers.optimization import (
    AdamW, 
    get_linear_schedule_with_warmup
)

from scipy.special import softmax
from torch.nn import CrossEntropyLoss

from sklearn.metrics import (
    confusion_matrix,
    matthews_corrcoef,
    roc_curve,
    auc,
    average_precision_score,
)

from transformers.models.xlm_roberta.modeling_xlm_roberta import (
    XLMRobertaClassificationHead,
    XLMRobertaConfig,
    XLMRobertaModel
)

In [None]:
# --- Model selection ---
MODEL_NAME = 'xlm-roberta-base'
num_labels = 3

# Run on the GPU when PyTorch can see one, otherwise fall back to the CPU.
device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')

# --- Batching ---
train_batch_size = 8
test_batch_size = 8
gradient_accumulation_steps = 1

# --- Optimisation ---
num_train_epochs = 25
learning_rate = 1e-05
adam_epsilon = 1e-08
weight_decay = 0.0
warmup_ratio = 0.06  # fraction of the total optimisation steps spent warming up

In [18]:
class RobertaDataset(torch.utils.data.Dataset):
    """Map-style dataset that tokenizes pickled examples for classification.

    The file at ``dataset_path`` is read with ``pd.read_pickle`` and must
    provide a ``tokens`` column (an iterable of string tokens per row, joined
    with single spaces before tokenization) and a label column: ``category``
    when ``category=True``, otherwise ``stance``.

    Every item is padded, and truncated if necessary, to ``self.max_len``:
    the length of the longest encoded example, capped at the tokenizer's
    ``model_max_length`` when the tokenizer exposes one.

    Args:
        dataset_path: Path of the pickled DataFrame.
        tokenizer: Object providing ``encode`` and ``encode_plus``.
        category: Selects the ``category`` column as label instead of
            ``stance``.
    """

    def __init__(self, dataset_path, tokenizer, category=False):
        # NOTE(review): unpickling can execute arbitrary code; only load
        # files from a trusted source.
        dataset = pd.read_pickle(dataset_path)
        self.data = dataset['tokens'].apply(lambda x: ' '.join(x)).values
        self.tokenizer = tokenizer

        # One pass over the corpus to find the longest encoded sequence.
        longest = 0
        for text in tqdm(self.data):
            encoded_len = len(tokenizer.encode(text, add_special_tokens=True))
            longest = max(longest, encoded_len)

        # Never pad/truncate beyond what the tokenizer reports as the model
        # limit, so a single long example cannot overflow the encoder.
        limit = getattr(tokenizer, 'model_max_length', None)
        if limit is not None and longest > limit:
            longest = limit
        self.max_len = int(longest)

        label_column = 'category' if category else 'stance'
        self.targets = dataset[label_column]

    def __len__(self):
        return len(self.data)

    def __getitem__(self, item):
        """Return ``input_ids``, ``attention_mask`` and ``targets`` for row ``item``."""
        inputs = self.tokenizer.encode_plus(
            self.data[item],
            add_special_tokens=True,
            max_length=self.max_len,
            padding='max_length',
            truncation=True,
            return_attention_mask=True,
            return_tensors='pt',
        )
        # ``self.data`` is positional, so the label must be looked up by
        # position too; plain ``[]`` on a Series is label-based and breaks
        # when the DataFrame index is not 0..n-1.
        target = self.targets.iloc[item]
        return {
            'input_ids': inputs['input_ids'].flatten(),
            'attention_mask': inputs['attention_mask'].flatten(),
            'targets': torch.tensor(target, dtype=torch.long)
        }

In [None]:
# Define a sequence classifier: an XLM-RoBERTa encoder with a classification head on top
class XLMRobertaClassifier(BertPreTrainedModel):
    """XLM-RoBERTa encoder followed by a sequence-classification head.

    The encoder is registered as ``self.roberta`` because the published
    XLM-R checkpoints store their encoder weights under the ``roberta.`` key
    prefix. ``from_pretrained`` matches weights by parameter name, so any
    other attribute name leaves the encoder randomly initialised. The old
    attribute name ``xlm_roberta`` remains available as a read-only alias.

    Args:
        config: Model configuration; ``config.num_labels`` sets the number
            of output classes.
    """

    config_class = XLMRobertaConfig
    base_model_prefix = "roberta"

    def __init__(self, config):
        super(XLMRobertaClassifier, self).__init__(config)
        self.num_labels = config.num_labels
        self.roberta = XLMRobertaModel(config)
        self.classifier = XLMRobertaClassificationHead(config)
        # Standard Transformers hook: initialises weights that are not
        # loaded from a checkpoint (here, the classification head).
        self.post_init()

    @property
    def xlm_roberta(self):
        """Backward-compatible alias for the encoder module."""
        return self.roberta

    def forward(self, input_ids, attention_mask=None, labels=None):
        """Run the encoder and the classification head.

        Args:
            input_ids: Token ids passed to the encoder.
            attention_mask: Optional attention mask passed to the encoder.
            labels: Optional class indices. When given, the cross-entropy
                loss is computed and prepended to the returned tuple.

        Returns:
            ``(loss, logits, ...)`` when ``labels`` is given, otherwise
            ``(logits, ...)``; the trailing entries are whatever the encoder
            returned after its first two outputs.
        """
        encoder_outputs = self.roberta(input_ids, attention_mask=attention_mask)
        sequence_output = encoder_outputs[0]
        logits = self.classifier(sequence_output)

        outputs = (logits,) + tuple(encoder_outputs[2:])
        if labels is None:
            return outputs

        loss_fct = CrossEntropyLoss()
        loss = loss_fct(logits.view(-1, self.num_labels), labels.view(-1))
        return (loss,) + outputs  # loss, logits, (hidden_states), (attentions)

In [None]:
# Concrete classes used to build the configuration, the model and the tokenizer.
config_class, model_class, tokenizer_class = (
    XLMRobertaConfig,
    XLMRobertaClassifier,
    XLMRobertaTokenizer,
)

# Checkpoint configuration with the classifier sized to `num_labels`.
config = config_class.from_pretrained(MODEL_NAME, num_labels=num_labels)

# Model and tokenizer are both loaded from the same checkpoint name.
model = model_class.from_pretrained(MODEL_NAME, config=config)
tokenizer = tokenizer_class.from_pretrained(MODEL_NAME, do_lower_case=False)

In [19]:
# Build the PyTorch datasets and loaders.
# Batch sizes come from the configuration cell instead of a hard-coded 4:
# the evaluation loop slices its result buffers by `test_batch_size`, so the
# dev loader has to use that same value.
train_dataset = RobertaDataset('output/train_3_original.pkl', tokenizer)
train_loader = torch.utils.data.DataLoader(train_dataset, batch_size=train_batch_size, shuffle=True)
dev_dataset = RobertaDataset('output/dev_1_original.pkl', tokenizer)
# Evaluation does not benefit from shuffling; a fixed order keeps runs comparable.
dev_loader = torch.utils.data.DataLoader(dev_dataset, batch_size=test_batch_size, shuffle=False)

100%|██████████| 3000/3000 [00:00<00:00, 4555.39it/s]
100%|██████████| 1000/1000 [00:00<00:00, 4766.35it/s]


In [None]:
# Number of optimizer updates over the whole run.
t_total = len(train_loader) // gradient_accumulation_steps * num_train_epochs

custom_parameter_names = set()
no_decay = ["bias", "LayerNorm.weight"]

# Single pass over the parameters: anything whose name contains a `no_decay`
# marker goes to the group without weight decay, the rest to the decayed group.
_decayed_params = []
_undecayed_params = []
for _param_name, _param in model.named_parameters():
    if _param_name in custom_parameter_names:
        continue
    if any(marker in _param_name for marker in no_decay):
        _undecayed_params.append(_param)
    else:
        _decayed_params.append(_param)

optimizer_grouped_parameters = [
    {"params": _decayed_params, "weight_decay": weight_decay},
    {"params": _undecayed_params, "weight_decay": 0.0},
]

# Linear warmup over the first `warmup_ratio` share of the updates.
warmup_steps = math.ceil(t_total * warmup_ratio)
optimizer = AdamW(optimizer_grouped_parameters, lr=learning_rate, eps=adam_epsilon)
scheduler = get_linear_schedule_with_warmup(
    optimizer,
    num_warmup_steps=warmup_steps,
    num_training_steps=t_total,
)


In [None]:
def compute_metrics(preds, model_outputs, labels, eval_examples=None, multi_label=False):
    """Compute binary-classification metrics and collect misclassified examples.

    The metrics assume a two-class problem: the confusion matrix is
    restricted to labels ``[0, 1]`` and the ranking score is the softmax
    probability at index 1 of each row of ``model_outputs``.

    Args:
        preds: Predicted class ids, one per example.
        model_outputs: Raw per-class scores, one row per example.
        labels: Ground-truth class ids, same length as ``preds``.
        eval_examples: Optional sequence aligned with ``preds``; the entries
            whose prediction differs from the label are returned as
            ``wrong``. When omitted, ``wrong`` is an empty list.
        multi_label: Unused; kept for interface compatibility.

    Returns:
        ``(metrics, wrong)`` where ``metrics`` is a dict with the keys
        ``mcc``, ``tp``, ``tn``, ``fp``, ``fn``, ``auroc`` and ``auprc``.
    """
    # Accept plain lists as well as arrays so the element-wise comparison
    # below always yields one flag per example.
    preds = np.asarray(preds)
    labels = np.asarray(labels)
    assert len(preds) == len(labels)

    mismatched = labels != preds
    if eval_examples is None:
        wrong = []
    else:
        wrong = [
            example
            for example, is_wrong in zip(eval_examples, mismatched)
            if np.any(is_wrong)
        ]

    mcc = matthews_corrcoef(labels, preds)
    tn, fp, fn, tp = confusion_matrix(labels, preds, labels=[0, 1]).ravel()
    scores = np.array([softmax(element)[1] for element in model_outputs])
    fpr, tpr, thresholds = roc_curve(labels, scores)
    auroc = auc(fpr, tpr)
    auprc = average_precision_score(labels, scores)
    return (
        {"mcc": mcc, "tp": tp, "tn": tn, "fp": fp, "fn": fn, "auroc": auroc, "auprc": auprc},
        wrong,
    )

def print_confusion_matrix(result):
    """Print a 2x2 confusion matrix from the ``tn``/``fp``/``fn``/``tp`` entries of ``result``.

    Rows are the ground-truth class, columns the predicted class; each count
    is formatted as a 5-character integer.
    """

    def _emit_row(prefix, left_key, right_key):
        # One table row: ground-truth label followed by the two counts.
        left_cell = format(result[left_key], '5d')
        right_cell = format(result[right_key], '5d')
        print(prefix, left_cell, ' | ', right_cell)

    for header_line in (
        'confusion matrix:',
        '            predicted    ',
        '          0     |     1',
        '    ----------------------',
    ):
        print(header_line)
    _emit_row('   0 | ', 'tn', 'fp')
    print('gt -----------------------')
    _emit_row('   1 | ', 'fn', 'tp')
    print('---------------------------------------------------')

In [None]:
def f1_score(output, targets, n_classes=None):
    """Print per-class precision/recall/F1 and return the support-weighted F1.

    Args:
        output: Predicted class ids, one per example (list or array).
        targets: Ground-truth class ids, same length as ``output``.
        n_classes: Number of classes ``0 .. n_classes - 1`` to score.
            Defaults to the module-level ``num_labels``.

    Returns:
        The F1 score averaged over classes, each class weighted by its share
        of ``targets``. ``0.0`` when there are no examples.

    Raises:
        ValueError: If ``output`` and ``targets`` differ in length.
    """
    if n_classes is None:
        n_classes = num_labels

    predicted = np.asarray(output).ravel()
    actual = np.asarray(targets).ravel()
    if predicted.shape != actual.shape:
        raise ValueError(
            'output and targets must have the same length, got {} and {}'.format(
                predicted.size, actual.size
            )
        )

    precision = [0.0] * n_classes
    recall = [0.0] * n_classes
    f1 = [0.0] * n_classes
    support = [0] * n_classes

    for label in range(n_classes):
        is_predicted = predicted == label
        is_actual = actual == label
        tp = int(np.sum(is_predicted & is_actual))
        fp = int(np.sum(is_predicted & ~is_actual))
        fn = int(np.sum(~is_predicted & is_actual))
        support[label] = tp + fn
        # With no true positives precision, recall and F1 are all zero;
        # this also avoids dividing by zero for absent classes.
        if tp > 0:
            precision[label] = tp / (tp + fp)
            recall[label] = tp / (tp + fn)
            f1[label] = (
                2 * precision[label] * recall[label]
                / (precision[label] + recall[label])
            )

    # Report each class under its own header.
    for label in range(n_classes):
        print('class {}: '.format(label))
        print('precision: ', precision[label])
        print('recall: ', recall[label])
        print('f1 score: ', f1[label])

    total = actual.size
    if total == 0:
        f1_weighted = 0.0
    else:
        f1_weighted = sum(f1[label] * support[label] for label in range(n_classes)) / total
    print('weighted average f1 score: ', f1_weighted)
    return f1_weighted

In [None]:
model.to(device)

model.zero_grad()

for epoch in range(num_train_epochs):

    # ---- Training ----
    model.train()
    # Drop gradients left over from an incomplete accumulation window at the
    # end of the previous epoch (t_total is computed with floor division).
    model.zero_grad()
    epoch_loss = []

    for step, batch in enumerate(tqdm(train_loader), start=1):
        input_ids = batch['input_ids'].to(device)
        attention_mask = batch['attention_mask'].to(device)
        labels = batch['targets'].to(device)
        outputs = model(input_ids, attention_mask=attention_mask, labels=labels)
        loss = outputs[0]
        # Scale so that the accumulated gradient is the mean over the window.
        (loss / gradient_accumulation_steps).backward()
        if step % gradient_accumulation_steps == 0:
            optimizer.step()
            scheduler.step()
            model.zero_grad()
        epoch_loss.append(loss.item())

    # ---- Evaluation on the dev set at the end of the epoch ----
    model.eval()
    eval_loss = 0.0
    nb_eval_steps = 0
    preds = np.empty((len(dev_dataset), num_labels))
    out_label_ids = np.empty(len(dev_dataset), dtype=np.int64)
    offset = 0  # next free row in the result buffers

    for test_batch in dev_loader:
        with torch.no_grad():
            input_ids = test_batch['input_ids'].to(device)
            attention_mask = test_batch['attention_mask'].to(device)
            # The dataset yields its labels under the key 'targets'.
            labels = test_batch['targets'].to(device)
            outputs = model(input_ids, attention_mask=attention_mask, labels=labels)
            tmp_eval_loss, logits = outputs[:2]
            eval_loss += tmp_eval_loss.item()

        nb_eval_steps += 1
        # Slice by the real batch length so this works for any loader batch
        # size, including a shorter final batch.
        batch_len = logits.shape[0]
        preds[offset:offset + batch_len] = logits.detach().cpu().numpy()
        out_label_ids[offset:offset + batch_len] = test_batch['targets'].detach().cpu().numpy()
        offset += batch_len

    eval_loss = eval_loss / max(nb_eval_steps, 1)
    model_outputs = preds  # raw logits, kept for inspection after the loop
    preds = np.argmax(preds, axis=1)

    print('epoch',epoch,'Training avg loss',np.mean(epoch_loss))
    print('epoch',epoch,'Testing  avg loss',eval_loss)
    # f1_score(output, targets) prints the per-class and weighted F1 report.
    score = f1_score(preds, out_label_ids)
    print('---------------------------------------------------\n')