This script fine-tunes and evaluates DA-RoBERTa, DA-BERT, DA-BART, and DA-T5 on the BABE dataset by 5-fold cross-validation:

Required data to run this script:
- BABE.xlsx
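- the pretrained (domain-adapted) model checkpoint that should be evaluated, e.g. `Roberta.bin`, `BERT.bin`, `T5.bin`, or `BART.bin` (the link to the model download is missing from this text)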

In [None]:
!pip install transformers
!pip install openpyxl
import pandas as pd
import numpy as np
import torch
import torch.nn as nn
import json
import io
import sys
import random
import openpyxl
import matplotlib.pyplot as plt
from sklearn.model_selection import train_test_split,StratifiedKFold
from sklearn.metrics import roc_auc_score,f1_score,precision_score,recall_score,accuracy_score,confusion_matrix
import transformers
from transformers import AdamW,BertTokenizer,BertModel,RobertaTokenizer,RobertaModel,T5EncoderModel,T5Tokenizer,BartModel,BartTokenizer
from torch.utils.data import DataLoader,TensorDataset,RandomSampler

**Create model architecture** (uncomment the class of the model that should be evaluated)

**RoBERTa**

In [None]:
# # RobertaClass (commented-out template): pretrained "roberta-base" encoder plus a small classification head.
# # forward() takes the encoder's first output (output_1[0]), keeps the hidden state at sequence
# # position 0 (hidden_state[:, 0]), applies a 768->768 linear layer and dropout (p=0.2), and returns
# # the raw 2-way output of classifier1 (no softmax is applied here).
# class RobertaClass(torch.nn.Module):
#     def __init__(self):
#         super(RobertaClass, self).__init__()
#         self.roberta = RobertaModel.from_pretrained("roberta-base")
#         self.vocab_transform = torch.nn.Linear(768, 768)
#         self.dropout = torch.nn.Dropout(0.2)
#         self.classifier1 = torch.nn.Linear(768,2)

#     def forward(self, input_ids, attention_mask):
#         output_1 = self.roberta(input_ids=input_ids, attention_mask=attention_mask)
#         hidden_state = output_1[0]
#         pooler = hidden_state[:, 0]
#         pooler = self.vocab_transform(pooler)
#         pooler = self.dropout(pooler)
#         output = self.classifier1(pooler)

#         return output

**BERT**

In [None]:
# # BertClass (commented-out template): pretrained "bert-base-uncased" encoder plus a small classification head.
# # forward() takes the encoder's first output (output_1[0]), keeps the hidden state at sequence
# # position 0 (hidden_state[:, 0]), applies a 768->768 linear layer and dropout (p=0.2), and returns
# # the raw 2-way output of classifier1 (no softmax is applied here).
# class BertClass(torch.nn.Module):
#     def __init__(self):
#         super(BertClass, self).__init__()
#         self.bert = BertModel.from_pretrained("bert-base-uncased")
#         self.vocab_transform = torch.nn.Linear(768, 768)
#         self.dropout = torch.nn.Dropout(0.2)
#         self.classifier1 = torch.nn.Linear(768,2)

#     def forward(self, input_ids, attention_mask):
#         output_1 = self.bert(input_ids=input_ids, attention_mask=attention_mask)
#         hidden_state = output_1[0]
#         pooler = hidden_state[:, 0]
#         pooler = self.vocab_transform(pooler)
#         pooler = self.dropout(pooler)
#         output = self.classifier1(pooler)

#         return output

**T5**

In [None]:
#create model
# # T5Class (commented-out template): pretrained "t5-base" encoder (T5EncoderModel, no decoder) plus a
# # small classification head. forward() takes the encoder's first output (output_1[0]), keeps the
# # hidden state at sequence position 0 (hidden_state[:, 0]), applies a 768->768 linear layer and
# # dropout (p=0.2), and returns the raw 2-way output of classifier1 (no softmax is applied here).
# class T5Class(torch.nn.Module):
#     def __init__(self):
#         super(T5Class, self).__init__()
#         self.T5 = T5EncoderModel.from_pretrained("t5-base")
#         self.vocab_transform = torch.nn.Linear(768, 768)
#         self.dropout = torch.nn.Dropout(0.2)
#         self.classifier1 = nn.Linear(768,2)

#     def forward(self, input_ids, attention_mask):
#         output_1 = self.T5(input_ids=input_ids, attention_mask=attention_mask)
#         hidden_state = output_1[0]
#         pooler = hidden_state[:, 0]
#         pooler = self.vocab_transform(pooler)
#         pooler = self.dropout(pooler)
#         output = self.classifier1(pooler)

#         return output

**BART**

In [None]:
# #create model
# # BartClass (commented-out template): pretrained "facebook/bart-base" model (BartModel) plus a small
# # classification head. forward() takes the model's first output (output_1[0]), keeps the hidden
# # state at sequence position 0 (hidden_state[:, 0]), applies a 768->768 linear layer and dropout
# # (p=0.2), and returns the raw 2-way output of classifier1 (no softmax is applied here).
# class BartClass(torch.nn.Module):
#     def __init__(self):
#         super(BartClass, self).__init__()
#         self.bart = BartModel.from_pretrained("facebook/bart-base")
#         self.vocab_transform = torch.nn.Linear(768, 768)
#         self.dropout = torch.nn.Dropout(0.2)
#         self.classifier1 = nn.Linear(768,2)

#     def forward(self, input_ids, attention_mask):
#         output_1 = self.bart(input_ids=input_ids, attention_mask=attention_mask)
#         hidden_state = output_1[0]
#         pooler = hidden_state[:, 0]
#         pooler = self.vocab_transform(pooler)
#         pooler = self.dropout(pooler)
#         output = self.classifier1(pooler)

#         return output

**Connect to GPU**

In [None]:
# Pick the compute device once: fall back to the CPU when CUDA is not usable,
# otherwise use the GPU and report how many devices exist and the name of device 0.
if not torch.cuda.is_available():
    print('No GPU available, using the CPU instead.')
    device = torch.device("cpu")
else:
    device = torch.device("cuda")
    print(f'There are {torch.cuda.device_count()} GPU(s) available.')
    print('Device name:', torch.cuda.get_device_name(0))

**Load pre-trained domain-adapted weights/parameters for the model:** You might have to adapt the path pointing to the domain-adapted model

In [None]:
#load weights of pretrained news model: uncomment exactly ONE of the lines below so that
#`weight_dict` exists before the classifier parameters are inserted further down.
#map_location=device makes checkpoints that were saved on a GPU loadable on a CPU-only machine too
#(without it, torch.load tries to restore tensors onto the device they were saved from).
#weight_dict = torch.load('Roberta.bin', map_location=device)
#weight_dict = torch.load('BERT.bin', map_location=device)
#weight_dict = torch.load('T5.bin', map_location=device)
#weight_dict = torch.load('BART.bin', map_location=device)

#load saved classifier weights + classifier bias --> we use same parameters for the final classification of all models to achieve maximum comparability
classifier_weights = torch.load('../input/domainadaptivepretrainingjcdl/classifier.weights.pt', map_location=device)
classifier_bias = torch.load('../input/domainadaptivepretrainingjcdl/classifier.bias.pt', map_location=device)

#insert weights and bias into weight dict under the keys of the `classifier1` layer of the model classes,
#so that model.load_state_dict(weight_dict) initialises the classification head with them
weight_dict['classifier1.weight'] = classifier_weights
weight_dict['classifier1.bias'] = classifier_bias

**Load BABE Data:** You might have to adapt the path again

In [None]:
# Load BABE, drop sentences on which the annotators did not agree, and add a numeric 0/1 label.
df = pd.read_excel("BABE.xlsx")
# .copy() turns the filtered rows into an independent frame, so the column assignment below
# writes to `df` itself instead of to a slice of the unfiltered frame (avoids SettingWithCopyWarning).
df = df[df['label_bias']!= 'No agreement'].copy()
# Any label other than 'Biased' / 'Non-biased' is mapped to NaN by Series.map.
df['Label_bias_0-1'] = df['label_bias'].map({'Biased':1,'Non-biased':0})
df.head(3)

**Define Cross-Validation, Tokenizer, Batch Size, Epochs, Loss, and Seeds**

In [None]:
# Seed every random number generator used in this notebook (NumPy, PyTorch CPU,
# Python's `random`, all CUDA devices) with the same value.
SEED = 2018
for seed_fn in (np.random.seed, torch.manual_seed, random.seed, torch.cuda.manual_seed_all):
    seed_fn(SEED)

# cuDNN flags: request deterministic behaviour and switch off benchmark mode
torch.backends.cudnn.benchmark = False
torch.backends.cudnn.deterministic = True

# Stratified, shuffled cross-validation splitter.
# NOTE(review): the description at the top of this file speaks of 5-fold cross-validation, but
# n_splits is 3 here. The McNemar section relies on the test split of the LAST fold, so confirm
# which split the stored predictions belong to before changing this value.
kfold = StratifiedKFold(shuffle = True,n_splits = 3,random_state=2)

# Uncomment the tokenizer that matches the model being evaluated
#tokenizer = RobertaTokenizer.from_pretrained('roberta-base')
#tokenizer = BertTokenizer.from_pretrained('bert-base-uncased')
#tokenizer = T5Tokenizer.from_pretrained('t5-base')
#tokenizer = BartTokenizer.from_pretrained('facebook/bart-base')

# Loss applied to the 2-way model outputs in train() and validate()
cross_entropy = nn.CrossEntropyLoss()

batch_size = 32
epochs = 10 #upper bound only: train_validate_pred() stops early as soon as the validation loss stops decreasing

**Define functions for fine-tuning and validation**

In [None]:
def train(model):
    """Fine-tune `model` for one epoch and return the mean training loss per batch.

    Reads the module-level `train_dataloader`, `optim_dbert`, `cross_entropy`
    and `device`. Each batch is a (input ids, attention mask, labels) triple.
    """
    model.train()
    running_loss = 0

    for step_tensors in train_dataloader:
        # move the three tensors of this batch onto the compute device
        sent_id, mask, labels = (t.to(device) for t in step_tensors)

        optim_dbert.zero_grad()
        logits = model(sent_id, attention_mask=mask)
        batch_loss = cross_entropy(logits, labels)
        batch_loss.backward()
        optim_dbert.step()

        running_loss += batch_loss.item()

    return running_loss / len(train_dataloader)

In [None]:
def validate(model):
    """Return the mean loss per batch of `model` on `test_dataloader`.

    The model is put into eval mode and no gradients are tracked. Reads the
    module-level `test_dataloader`, `cross_entropy` and `device`.
    """
    model.eval()
    print("\n   Validating...")

    running_loss = 0
    with torch.no_grad():
        for held_out in test_dataloader:
            # move the three tensors of this batch onto the compute device
            sent_id, mask, labels = (t.to(device) for t in held_out)
            logits = model(sent_id, attention_mask=mask)
            running_loss += cross_entropy(logits, labels).item()

    return running_loss / len(test_dataloader)

In [None]:
#combine train and validate function: get train and validation loss for every cross-validation split, save best-performing model, and get predictions on the held-out test set to calculate evaluation metrics

def train_validate_pred(model):
    """Fine-tune `model` on the current fold with early stopping, then score it on the held-out fold.

    Per epoch the model is trained with train() and evaluated with validate().
    While the validation loss keeps decreasing, the weights are checkpointed to
    'saved_weights.pt'; the first epoch that does not improve ends training.
    The checkpoint is then reloaded, the whole held-out fold is predicted in a
    single forward pass, and one entry per metric is appended to the
    module-level result lists (`loss`, `acc`, `auc`, `micro_F1`,
    `macro_F1_weighted`, `binary_F1`, `precision`, `recall`, `conf_matrices`).

    Reads the module-level `epochs`, `device`, `test_seq`, `test_mask` and `test_y`.

    NOTE(review): validate() measures the loss on `test_dataloader`, i.e. early
    stopping is driven by the same fold the metrics are reported on.
    NOTE(review): `auc` is computed from hard 0/1 predictions, and
    `macro_F1_weighted` stores f1_score(average='weighted').
    """
    checkpoint_path = 'saved_weights.pt'
    best_valid_loss = float('inf')

    #for each epoch
    for epoch in range(epochs):

        print('\n   Epoch {} / {}'.format(epoch+1,epochs))

        #train model
        train_loss = train(model)

        #evaluate model
        valid_loss = validate(model)

        #save the best model
        if valid_loss < best_valid_loss:
            best_valid_loss = valid_loss
            torch.save(model.state_dict(), checkpoint_path)

        #otherwise stop training; `else` also covers a NaN validation loss, which would
        #fail both `<` and `>=` and otherwise keep training without ever saving
        else:
            print("\n Validation loss not decreased, Model of previous epoch saved")
            break

        print(f'\n    Training Loss: {train_loss:.3f}')
        print(f'    Validation Loss: {valid_loss:.3f}')

    #predict with the best checkpoint; eval mode is set explicitly so that dropout is
    #disabled regardless of which mode the loop above left the model in
    model.load_state_dict(torch.load(checkpoint_path))
    model.eval()
    with torch.no_grad():
        preds = model(test_seq.to(device), test_mask.to(device))
        preds = preds.detach().cpu().numpy()
    preds = np.argmax(preds, axis = 1)

    #save results
    loss.append(best_valid_loss)
    acc.append(accuracy_score(test_y,preds))
    auc.append(roc_auc_score(test_y,preds))
    micro_F1.append(f1_score(test_y,preds,average='micro'))
    macro_F1_weighted.append(f1_score(test_y,preds,average='weighted'))
    binary_F1.append(f1_score(test_y,preds,average='binary'))
    precision.append(precision_score(test_y,preds))
    recall.append(recall_score(test_y,preds))
    conf_matrices.append(confusion_matrix(test_y, preds))

In [None]:
#implement cross validation + train/validate/predict
# Per-fold result accumulators: train_validate_pred() appends one entry to each list per fold.
loss = []
acc = []
auc = []
micro_F1 = []
macro_F1_weighted = []
binary_F1 = []
precision = []
recall = []
conf_matrices = []

# NOTE: the names assigned inside this loop (train_dataloader, test_dataloader, test_seq, test_mask,
# test_y, optim_dbert) are module-level globals that train(), validate() and train_validate_pred()
# read directly, so they must keep exactly these names. `test_labels` of the LAST fold is reused by
# the McNemar section further down.
for fold, (train_index, test_index) in enumerate(kfold.split(df['text'], df['Label_bias_0-1'])):
    sys.stdout.write('\n \r Fold {} / {}\n'.format(fold+1,kfold.get_n_splits()))

    #divide data into folds (positional indices from the splitter, hence .iloc)
    train_text = df['text'].iloc[train_index]
    test_text = df['text'].iloc[test_index]
    train_labels = df['Label_bias_0-1'].iloc[train_index]
    test_labels = df['Label_bias_0-1'].iloc[test_index]

    #encode (requires one of the `tokenizer = ...` lines in the configuration cell to be uncommented)
    train_encodings = tokenizer(train_text.tolist(), truncation=True, padding=True)
    test_encodings = tokenizer(test_text.tolist(), truncation=True, padding=True)

    #convert input to tensors 
    train_seq = torch.tensor(train_encodings['input_ids'])
    train_mask = torch.tensor(train_encodings['attention_mask'])
    train_y = torch.tensor(train_labels.tolist())

    test_seq = torch.tensor(test_encodings['input_ids'])
    test_mask = torch.tensor(test_encodings['attention_mask'])
    test_y = torch.tensor(test_labels.tolist())

    # wrap tensors into one dataset
    train_data = TensorDataset(train_seq, train_mask, train_y)
    test_data = TensorDataset(test_seq, test_mask, test_y)

    #define dataloader
    # The held-out fold is batched in random order as well; validate() only averages the batch
    # losses and the final predictions are made on test_seq/test_mask directly, so the order of
    # the test batches does not enter the reported metrics.
    train_sampler = RandomSampler(train_data)
    test_sampler = RandomSampler(test_data)
    train_dataloader = DataLoader(train_data,sampler= train_sampler, batch_size=batch_size)
    test_dataloader = DataLoader(test_data,sampler = test_sampler, batch_size=batch_size)

    #create model instance with pre-trained weights and optimizer: insert respective model that is to be fine-tuned/evaluated
    #uncomment exactly one line; creating the instance inside the loop gives every fold a fresh model
    #that starts from the weights in weight_dict
#     model = BertClass()
#     model = RobertaClass()
#     model = BartClass()
#     model = T5Class()
    model.load_state_dict(weight_dict)
    model.to(device)
    optim_dbert = AdamW(model.parameters(), lr=1e-5)

    #call train/validate/predict function (appends this fold's results to the lists defined above)
    train_validate_pred(model)

In [None]:
#compute cross-validated performance metrics
def _fold_mean(values):
    """Return the arithmetic mean (sum divided by count) of the per-fold values."""
    return sum(values)/len(values)


# average every metric over the folds; `sd` is the standard deviation of the weighted F1 across folds
cv_loss = _fold_mean(loss)
cv_acc = _fold_mean(acc)
cv_auc = _fold_mean(auc)
cv_micro_f1 = _fold_mean(micro_F1)
cv_macro_f1 = _fold_mean(macro_F1_weighted)
sd = np.std(macro_F1_weighted)
cv_binary_f1 = _fold_mean(binary_F1)
cv_prec = _fold_mean(precision)
cv_recall = _fold_mean(recall)
# element-wise mean of the per-fold confusion matrices
cv_conf_matrix = np.mean(conf_matrices, axis=0)

# report each aggregate rounded to four decimals, in a fixed order
report = [
    ("CV Accuracy = {}", cv_acc),
    ("CV AUC = {}", cv_auc),
    ("CV Micro F1 = {}", cv_micro_f1),
    ("CV Macro F1 weighted = {}", cv_macro_f1),
    ("SD = {}", sd),
    ("CV Binary F1 = {}", cv_binary_f1),
    ("CV Precision = {}", cv_prec),
    ("CV Recall = {}", cv_recall),
    ("CV Loss = {}", cv_loss),
]
for template, value in report:
    print(template.format(round(value,4)))

In [None]:
#optionally save metrics in dict
#Roberta_DA_SG2_bs64_lr1e5_6ep = {"loss":cv_loss,"micro_f1":cv_micro_f1,"macro_f1":cv_macro_f1,"SD":sd,"binary_f1":cv_binary_f1,"prec":cv_prec,"recall":cv_recall}

#store metrics in json format
# with open('./Roberta_DA_SG2_bs64_lr1e5_6ep.json', 'w') as f:
#     json.dump(Roberta_DA_SG2_bs64_lr1e5_6ep, f)

**McNemar test for statistical significance based on last cv split**

In [None]:
from mlxtend.evaluate import mcnemar,mcnemar_table

In [None]:
#get predictions for model on test set. Insert the domain-adapted model you want to evaluate here. Predictions are provided in the repository and do not have to be computed separately
# with torch.no_grad():
#     preds_DA = model(test_seq.to(device), test_mask.to(device))
#     preds_DA = preds_DA.detach().cpu().numpy()
# preds_DA = np.argmax(preds_DA, axis = 1)

#optionally store predictions
#np.save("preds_T5_DA.npy",preds_DA)

In [None]:
# #get predictions for baseline model. Insert the baseline model you want to evaluate here.
# with torch.no_grad():
#     preds_noDA = model(test_seq.to(device), test_mask.to(device))
#     preds_noDA = preds_noDA.detach().cpu().numpy()
# preds_noDA = np.argmax(preds_noDA, axis = 1)

#optionally store predictions
# np.save("preds_T5_noDA.npy",preds_noDA)

In [None]:
# load predictions for baseline and domain-adapted model and get contingency table. Predictions are provided in the repository and do not have to be computed separately
preds_noDA = np.load("preds_T5_noDA.npy") #path might have to be adapted
preds_DA = np.load("preds_T5_DA.npy") #path might have to be adapted

# Pass the arrays that were just loaded (the previous code referenced the undefined names
# preds_roberta_noDA / preds_roberta_DA and failed with a NameError).
# `test_labels` holds the gold labels of the last cross-validation fold; the stored predictions
# must have been produced on that same split, in the same order.
tb = mcnemar_table(y_target=np.array(test_labels),
                   y_model1=preds_noDA,
                   y_model2=preds_DA)

print(tb)

In [None]:
#calculate McNemar test statistic from the contingency table `tb` (corrected=True is passed to mlxtend's mcnemar)
mcnemar_result = mcnemar(ary=tb, corrected=True)
chi2, p = mcnemar_result
for caption, statistic in (('chi-squared:', chi2), ('p-value:', p)):
    print(caption, statistic)