- reference: https://github.com/dh1105/Sentence-Entailment/blob/main/Sentence_Entailment_BERT.ipynb

In [1]:
!pip install transformers

Looking in indexes: https://pypi.org/simple, https://us-python.pkg.dev/colab-wheels/public/simple/
Collecting transformers
  Downloading transformers-4.25.1-py3-none-any.whl (5.8 MB)
[K     |████████████████████████████████| 5.8 MB 24.4 MB/s 
Collecting huggingface-hub<1.0,>=0.10.0
  Downloading huggingface_hub-0.11.1-py3-none-any.whl (182 kB)
[K     |████████████████████████████████| 182 kB 75.5 MB/s 
[?25hCollecting tokenizers!=0.11.3,<0.14,>=0.11.1
  Downloading tokenizers-0.13.2-cp38-cp38-manylinux_2_17_x86_64.manylinux2014_x86_64.whl (7.6 MB)
[K     |████████████████████████████████| 7.6 MB 70.8 MB/s 
Installing collected packages: tokenizers, huggingface-hub, transformers
Successfully installed huggingface-hub-0.11.1 tokenizers-0.13.2 transformers-4.25.1


In [2]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import torch
from torch.utils.data import Dataset, TensorDataset, DataLoader, SequentialSampler, RandomSampler
from torch.nn.utils.rnn import pad_sequence
import pickle
import os
from os import listdir
from os.path import isfile, join
from transformers import BertTokenizer
from transformers import BertForSequenceClassification, AdamW
from transformers import AutoTokenizer, AutoModelForSequenceClassification
from google.colab import drive
drive.mount('/content/drive')

Mounted at /content/drive


In [9]:
# change the model, 1 for deberta, 0 for bert
choice = 1
# whether or not to apply data cleaning action, 1 for apply, 0 for not apply
cleaning_choice = 1
# change path data where the data files
path_data = '/content/drive/My Drive/multi-evidence-nli-nlp-243-master/Data/'

In [10]:
# set the deviceto gpu if it exist
device = torch.device("cuda:0" if torch.cuda.is_available() else "cpu")
print(device)

cuda:0


In [11]:
import json
# get train.json and force it into single data type
f = open(path_data + 'train.json') 
data = json.load(f)
f.close()
df=pd.DataFrame.from_dict(data, orient='index')
df = df[df.Type == "Single"]

In [None]:

# traverse the CTR file to get all the CTRs
dir_path = path_data + 'CT json'
all_json_files = [join(dir_path, f) for f in listdir(dir_path) if isfile(join(dir_path, f)) and f.endswith(".json")]

ctr_list = []
for file_path in all_json_files:
    with open(file_path) as input_file:
        json_array = json.load(input_file)
        ctr_list.append(json_array)
ctr_df = pd.json_normalize(ctr_list)

In [None]:
#get test data by 
def get_train_val_test_data_split(df):
    test_ind = int(len(df)*0.85)
    val_ind = int(len(df)*0.7)            
        
    return df[:val_ind], df[val_ind:test_ind], df[test_ind:]

In [None]:
df.rename(columns = {'Label':'label'}, inplace = True)
# split the data into training, validation, and test dataframe
train_df, val_df, test_df = get_train_val_test_data_split(df)

In [None]:
# skip cleaning process if the choice is set to 0
if cleaning_choice == 0:
    def processing(df, ctr_df):
        # get all information neccessart for constructing data
        p_id = df['Primary_id']
        s_id = df['Section_id']
        labels = df['label']
        state = df['Statement']
        p_index = df['Primary_evidence_index']
        check_lst = ["intervention", "disease characteristics", "patient characteristics", "results", "adverse events"]
        ctr, hypothesis, label = [], [], []
        for p, s, l, st, p_i in zip(p_id, s_id, labels, state, p_index):
            # retreive the CTR when clinical trial ID is matched
            temp = ctr_df.loc[ctr_df['Clinical Trial ID'] == p].drop('Clinical Trial ID', inplace=False, axis=1)
            for lst in temp:
                app_str = ""
                for premises in temp[lst]:
                    for i, sen in enumerate(premises):
                        # get the premise and hypothesis
                        ctr.append(sen)
                        hypothesis.append(st)
                        # check if the sentence is non-neutral label
                        if s == lst and i in p_i:
                            label.append(l.lower())
                        else:
                            label.append("neutral")


        d = {'ctr': ctr, 'hypothesis':hypothesis, 'label': label}
        return pd.DataFrame(d)

    train_df = processing(train_df, ctr_df)
    val_df = processing(val_df, ctr_df)
    test_df = processing(test_df, ctr_df)
    p_id = df['Primary_id']
    
# doing cleaning process if the choice is set to 1    
else:  
    def processing(df, ctr_df):
        # get all information neccessart for constructing data
        p_id = df['Primary_id']
        s_id = df['Section_id']
        labels = df['label']
        state = df['Statement']
        p_index = df['Primary_evidence_index']
        ctr, hypothesis, label = [], [], []
        for p, s, l, st, p_i in zip(p_id, s_id, labels, state, p_index):
            # retreive the CTR when clinical trial ID is matched
            temp = ctr_df.loc[ctr_df['Clinical Trial ID'] == p].drop('Clinical Trial ID', inplace=False, axis=1)
            for lst in temp:
                app_str = ""
                for premises in temp[lst]:
                    for i, sen in enumerate(premises):
                        if sen.strip().endswith(":"):
                            app_str+=" "+sen
                            continue
                        elif sen != []:
                            ctr.append(app_str+sen)
                            hypothesis.append(st)
                            # check if the sentence is non-neutral label
                            if s == lst and i in p_i:
                                label.append(l.lower())
                            else:
                                label.append("neutral")
                            app_str = ""
        d = {'ctr': ctr, 'hypothesis':hypothesis, 'label': label}
        return pd.DataFrame(d)
    
    train_df = processing(train_df, ctr_df)
    val_df = processing(val_df, ctr_df)
    test_df = processing(test_df, ctr_df)
    p_id = df['Primary_id']

In [None]:
print(train_df['label'].value_counts())
print(val_df['label'].value_counts())
print(test_df['label'].value_counts())

neutral          34203
contradiction     2958
entailment        2883
Name: label, dtype: int64
neutral          7307
contradiction     639
entailment        565
Name: label, dtype: int64
neutral          7972
entailment        599
contradiction     535
Name: label, dtype: int64


In [None]:
# under sample to get same number of labels
random_state = 42
train_min = min(train_df['label'].value_counts())
print(train_min)
s0 = train_df.label[train_df.label.eq("neutral")].sample(train_min, random_state=random_state).index
s1 = train_df.label[train_df.label.eq("contradiction")].sample(train_min, random_state=random_state).index 
s2 = train_df.label[train_df.label.eq("entailment")].sample(train_min, random_state=random_state).index 
train_df = train_df.loc[s0.union(s1).union(s2)]

val_min = min(val_df['label'].value_counts())
s0 = val_df.label[val_df.label.eq("neutral")].sample(val_min, random_state=random_state).index
s1 = val_df.label[val_df.label.eq("contradiction")].sample(val_min, random_state=random_state).index 
s2 = val_df.label[val_df.label.eq("entailment")].sample(val_min, random_state=random_state).index 
val_df = val_df.loc[s0.union(s1).union(s2)]

2883


In [None]:
print(train_df['label'].value_counts())
print(val_df['label'].value_counts())
print(test_df['label'].value_counts())

neutral          2883
contradiction    2883
entailment       2883
Name: label, dtype: int64
neutral          565
entailment       565
contradiction    565
Name: label, dtype: int64
neutral          7972
entailment        599
contradiction     535
Name: label, dtype: int64


#### Prepare dataset

In [None]:
class MNLIDataBert(Dataset):
    # initialize the dataset
    def __init__(self, train_df, val_df, test_df):
        self.label_dict = {'entailment': 0, 'contradiction': 1, 'neutral': 2}

        self.train_df = train_df
        self.val_df = val_df
        self.test_df = test_df
        # if choice is 0, choose bert, otherwise choose deberta
        if choice == 0:
            self.tokenizer = BertTokenizer.from_pretrained('bert-base-uncased', do_lower_case=True)
        else:
            self.tokenizer = AutoTokenizer.from_pretrained('cross-encoder/nli-deberta-base')
        self.train_data = None
        self.val_data = None
        self.test_data = None
        self.init_data()
    
    # load data splits
    def init_data(self):
        self.train_data = self.load_data(self.train_df)
        self.val_data = self.load_data(self.val_df)
        self.test_data = self.load_data(self.test_df)

    def load_data(self, df):
        MAX_LEN = 512
        token_ids = []
        mask_ids = []
        seg_ids = []
        y = []

        premise_list = df['ctr'].to_list()
        hypothesis_list = df['hypothesis'].to_list()
        label_list = df['label'].to_list()

        for (premise, hypothesis, label) in zip(premise_list, hypothesis_list, label_list):
            # tokenize the premise and the hypothesis
            premise_id = self.tokenizer.encode(premise, add_special_tokens = False)
            hypothesis_id = self.tokenizer.encode(hypothesis, add_special_tokens = False)
            # apply cls and sep token to the premise and hypothesis
            pair_token_ids = [self.tokenizer.cls_token_id] + premise_id + [self.tokenizer.sep_token_id] + hypothesis_id + [self.tokenizer.sep_token_id]
            premise_len = len(premise_id)
            hypothesis_len = len(hypothesis_id)
            
            # get the mask id
            segment_ids = torch.tensor([0] * (premise_len + 2) + [1] * (hypothesis_len + 1))  # sentence 0 and sentence 1
            attention_mask_ids = torch.tensor([1] * (premise_len + hypothesis_len + 3))  # mask padded values
            
            token_ids.append(torch.tensor(pair_token_ids))
            seg_ids.append(segment_ids)
            mask_ids.append(attention_mask_ids)
            y.append(self.label_dict[label])
        
        # pad the sequence for the dataset
        token_ids = pad_sequence(token_ids, batch_first=True)
        mask_ids = pad_sequence(mask_ids, batch_first=True)
        seg_ids = pad_sequence(seg_ids, batch_first=True)
        y = torch.tensor(y)
        dataset = TensorDataset(token_ids, mask_ids, seg_ids, y)
        return dataset

    # get dataloaders for data splits
    def get_data_loaders(self, batch_size=32, shuffle=True):
        train_loader = DataLoader(
          self.train_data,
          shuffle=shuffle,
          batch_size=batch_size
        )

        val_loader = DataLoader(
          self.val_data,
          shuffle=shuffle,
          batch_size=batch_size
        )

        test_loader = DataLoader(
          self.test_data,
          shuffle=False,
          batch_size=batch_size
        )

        return train_loader, val_loader, test_loader

In [None]:
mnli_dataset = MNLIDataBert(train_df, val_df, test_df)

In [None]:
train_loader, val_loader, test_loader = mnli_dataset.get_data_loaders(batch_size=8)

In [None]:

# if choice is 0, choose bert, otherwise choose deberta
if choice == 0:
    model = BertForSequenceClassification.from_pretrained("bert-base-uncased", num_labels=3)
else:
    model = AutoModelForSequenceClassification.from_pretrained('cross-encoder/nli-deberta-base')
model.to(device)

DebertaForSequenceClassification(
  (deberta): DebertaModel(
    (embeddings): DebertaEmbeddings(
      (word_embeddings): Embedding(50265, 768, padding_idx=0)
      (LayerNorm): DebertaLayerNorm()
      (dropout): StableDropout()
    )
    (encoder): DebertaEncoder(
      (layer): ModuleList(
        (0): DebertaLayer(
          (attention): DebertaAttention(
            (self): DisentangledSelfAttention(
              (in_proj): Linear(in_features=768, out_features=2304, bias=False)
              (pos_dropout): StableDropout()
              (pos_proj): Linear(in_features=768, out_features=768, bias=False)
              (pos_q_proj): Linear(in_features=768, out_features=768, bias=True)
              (dropout): StableDropout()
            )
            (output): DebertaSelfOutput(
              (dense): Linear(in_features=768, out_features=768, bias=True)
              (LayerNorm): DebertaLayerNorm()
              (dropout): StableDropout()
            )
          )
          (intermed

In [None]:
param_optimizer = list(model.named_parameters())
# set weight decay value against bias, gamma, and beta
no_decay = ['bias', 'gamma', 'beta']
optimizer_grouped_parameters = [
    {'params': [p for n, p in param_optimizer if not any(nd in n for nd in no_decay)],
     'weight_decay_rate': 0.05},
    {'params': [p for n, p in param_optimizer if any(nd in n for nd in no_decay)],
     'weight_decay_rate': 0.0}
]

In [None]:
import torch
# This variable contains all of the hyperparemeter information our training loop needs
optimizer = torch.optim.AdamW(optimizer_grouped_parameters, lr=2e-6)


In [None]:
# calculating the accuracy for multi-label
def multi_acc(y_pred, y_test):
    acc = (torch.log_softmax(y_pred, dim=1).argmax(dim=1) == y_test).sum().float() / float(y_test.size(0))
    return acc

In [None]:
from tqdm import tqdm

In [None]:
import time
# set manual seed to 0 so that the result is reproducible
torch.manual_seed(0)
# set num of epoch
EPOCHS = 2

def train(model, train_loader, val_loader, optimizer):  
    total_step = len(train_loader)
    
    for epoch in range(EPOCHS):
        start = time.time()
        model.train()
        total_train_loss = 0
        total_train_acc  = 0
        # get the training batch
        with tqdm(train_loader, unit="batch") as tepoch:
            for batch_idx, (pair_token_ids, mask_ids, seg_ids, y) in enumerate(tepoch):
                tepoch.set_description(f"Epoch {epoch}")
                optimizer.zero_grad()
                pair_token_ids = pair_token_ids.to(device)
                mask_ids = mask_ids.to(device)
                seg_ids = seg_ids.to(device)
                labels = y.to(device)
                
                # get prediction logits and loss
                loss, prediction = model(pair_token_ids, 
                                  token_type_ids=seg_ids, 
                                  attention_mask=mask_ids, 
                                  labels=labels).values()
                # get final predictions
                pred = torch.log_softmax(prediction, dim=1).argmax(dim=1)
                # get multi label accuracy
                acc = multi_acc(prediction, labels)
                
                loss.backward()
                optimizer.step()

                total_train_loss += loss.item()
                total_train_acc  += acc.item()
        # calculate the traning accuracy and loss
        train_acc  = total_train_acc/len(train_loader)
        train_loss = total_train_loss/len(train_loader)
        
        model.eval()
        total_val_acc  = 0
        total_val_loss = 0
        
        # calculate the traning accuracy and loss
        with torch.no_grad():
            for batch_idx, (pair_token_ids, mask_ids, seg_ids, y) in enumerate(val_loader):
                optimizer.zero_grad()
                pair_token_ids = pair_token_ids.to(device)
                mask_ids = mask_ids.to(device)
                seg_ids = seg_ids.to(device)
                labels = y.to(device)

                # get prediction logits and loss
                loss, prediction = model(pair_token_ids, 
                                    token_type_ids=seg_ids, 
                                    attention_mask=mask_ids, 
                                    labels=labels).values()

                # get final accuracy
                acc = multi_acc(prediction, labels)

                total_val_loss += loss.item()
                total_val_acc  += acc.item()
                
        # calculate the traning accuracy and loss
        val_acc  = total_val_acc/len(val_loader)
        val_loss = total_val_loss/len(val_loader)
        end = time.time()
        hours, rem = divmod(end-start, 3600)
        minutes, seconds = divmod(rem, 60)

        print(f'Epoch {epoch+1}: train_loss: {train_loss:.4f} train_acc: {train_acc:.4f} | val_loss: {val_loss:.4f} val_acc: {val_acc:.4f}')
        print("{:0>2}:{:0>2}:{:05.2f}".format(int(hours),int(minutes),seconds))


In [None]:
train(model, train_loader, val_loader, optimizer)

  query_layer = query_layer / torch.tensor(scale, dtype=query_layer.dtype)
  p2c_att = torch.matmul(key_layer, torch.tensor(pos_query_layer.transpose(-1, -2), dtype=key_layer.dtype))
Epoch 0: 100%|██████████████████████████████████████████████████████████████████| 1082/1082 [04:01<00:00,  4.48batch/s]


Epoch 1: train_loss: 1.1042 train_acc: 0.4588 | val_loss: 1.0086 val_acc: 0.4425
00:04:16.08


Epoch 1: 100%|██████████████████████████████████████████████████████████████████| 1082/1082 [03:56<00:00,  4.57batch/s]


Epoch 2: train_loss: 0.7776 train_acc: 0.6415 | val_loss: 1.1270 val_acc: 0.4588
00:04:12.35


In [None]:
def get_predictions(model, test_loader, optimizer):
    all_predictions = []
    all_labels = []
    with torch.no_grad():
          for batch_idx, (pair_token_ids, mask_ids, seg_ids, y) in enumerate(test_loader):
            optimizer.zero_grad()
            pair_token_ids = pair_token_ids.to(device)
            mask_ids = mask_ids.to(device)
            seg_ids = seg_ids.to(device)
            labels = y.to(device)
            all_labels.append(labels)
            # get prediction logits and loss
            loss, prediction = model(pair_token_ids, 
                                token_type_ids=seg_ids, 
                                attention_mask=mask_ids, 
                                labels=labels).values()
            # get final predictions
            pred = torch.log_softmax(prediction, dim=1).argmax(dim=1).float()
            # save the predictions
            all_predictions.append(pred)
            
    return all_predictions, all_labels

In [None]:
all_test_predictions, all_test_labels = get_predictions(model, test_loader, optimizer)

In [None]:
# flatten the returned label
test_labels = [x.tolist() for x in all_test_labels]
test_labels = [j for sub in test_labels for j in sub]

In [None]:
len(all_test_labels)

1139

In [None]:
# flatten the predictions
test_predictions = [x.tolist() for x in all_test_predictions]
test_predictions = [j for sub in test_predictions for j in sub]

In [None]:
# regroup the final test
def get_final_ans(df, pred):
    idx = 0
    grouped_ans = []
    group = []
    while idx < len(df)-1:
        if df[idx:idx+1]['hypothesis'].item() == df[idx+1:idx+2]['hypothesis'].item():
            group.append(pred[idx])
        else:
            group.append(pred[idx])
            grouped_ans.append(group)
            group = []
        idx+=1
    return grouped_ans

final_grouping = get_final_ans(test_df, test_predictions)

In [None]:
import collections
import random
random.seed(0)
# get final prediction by grouping
def get_final_pred(grouping):
    ans = []
    ignore = {2.0}
    for i, g in enumerate(grouping):
        frequency = collections.Counter(x for x in g if x not in ignore).most_common(1)
        # if the model predict all neutral, we have to randomly 
        if len(frequency) == 0:
            frequency = random.randint(0,1)
        else:
            frequency = frequency[0][0]
        ans.append(frequency)
    return ans
prediction = get_final_pred(final_grouping)

In [None]:
# match hypothesis into a group
def get_hypothesis(df):
    idx = 0
    grouped_ans = []
    group = []
    while idx < len(df)-1:
        # if they are the same hypothesis, get them back into one
        if df[idx:idx+1]['hypothesis'].item() == df[idx+1:idx+2]['hypothesis'].item():
            idx+=1
            continue
        else:
            group.append(df[idx:idx+1]['hypothesis'].item())
        idx+=1
    return group

In [None]:
h_lst = get_hypothesis(test_df)

In [None]:
from sklearn.metrics import classification_report, confusion_matrix

label_dict = {'entailment': 0, 'contradiction': 1, 'neutral': 2}

true = []
for h in h_lst:
    true.append(label_dict[df[df['Statement'] == h]['label'].tolist()[0].lower()])


print(classification_report(true, prediction))

              precision    recall  f1-score   support

           0       0.48      0.29      0.36        84
           1       0.43      0.63      0.51        71

    accuracy                           0.45       155
   macro avg       0.45      0.46      0.43       155
weighted avg       0.46      0.45      0.43       155

