Fine Tune DistilBERT For  Text Pair Matching
Author: Nelson LIN (nelsonlin0321@outlook.com)

In [1]:
# import libaries
import torch
from torch import cuda
from torch.utils.data import Dataset,DataLoader

In [2]:
print(torch.__version__)

1.10.0+cu111


In [3]:
import os
import json
import random
import pandas as pd
import numpy as np
from sklearn.utils import shuffle
from sklearn import metrics
from collections import Counter

In [4]:
# 


In [5]:
from transformers import AutoTokenizer,DistilBertForSequenceClassification,default_data_collator,TrainingArguments,Trainer,EarlyStoppingCallback

In [6]:
device = "cuda" if cuda.is_available() else "cpu"

In [7]:
device

'cuda'

In [8]:
model_name =  "distilbert-base-uncased"

## 1) Import Data



In [9]:
from google.colab import drive
drive.mount('/content/drive')

Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).


In [10]:
# download from https://deepai.org/dataset/qnli
data_dir = "/content/drive/My Drive/Colab Notebooks/Data/QNLIv2/QNLI"

In [11]:
os.listdir(data_dir)


['train.tsv', 'test.tsv', 'dev.tsv']

In [12]:
def read_qnli_data(file_name):
    path = os.path.join(data_dir,file_name)
    with open(path) as f:
        text = f.readlines()

    header = text[0].strip().split("\t")
    lines = [line.strip().split("\t") for line in text[1:]]

    df = pd.DataFrame(lines,columns = header)
    return df

In [13]:
qnli_train_df = read_qnli_data("train.tsv")
qnli_dev_df = read_qnli_data("dev.tsv")


In [14]:
qnli_train_df['label'] = np.where(qnli_train_df['label']=='entailment',1,0)
qnli_dev_df['label'] = np.where(qnli_dev_df['label']=='entailment',1,0)

In [15]:
qnli_dev_df.head()

Unnamed: 0,index,question,sentence,label
0,0,What came into force after the new constitutio...,"As of that day, the new constitution heralding...",1
1,1,What is the first major city in the stream of ...,The most important tributaries in this area ar...,0
2,2,What is the minimum required if you want to te...,In most provinces a second Bachelor's Degree s...,0
3,3,How was Temüjin kept imprisoned by the Tayichi...,The Tayichi'ud enslaved Temüjin (reportedly wi...,1
4,4,"What did Herr Gott, dich loben wir become know...","He paraphrased the Te Deum as ""Herr Gott, dich...",0


## 2) Tokenization Features Engineering

In [16]:
max_length = 512

In [17]:
tokenizer = AutoTokenizer.from_pretrained(model_name)

In [18]:
question = qnli_train_df['question'].iloc[0]
sentence = qnli_train_df['sentence'].iloc[0]

In [19]:
inputs = tokenizer(
    text = question,
    text_pair = sentence,
    add_special_tokens = True,
    max_length = max_length,
    padding = "max_length",
    return_token_type_ids = False,
    truncation = True,
    )

In [20]:
inputs.keys()

dict_keys(['input_ids', 'attention_mask'])

In [21]:
class QNLIDataset(Dataset):
    def __init__(self,dataframe,tokenizer,max_length,device):
        self.len = len(dataframe)
        self.df = dataframe
        self.tokenizer = tokenizer
        self.max_length = max_length
        self.device = device
        self.question_list = self.df['question'].to_list()
        self.sentence_list = self.df['sentence'].to_list()
        self.label_list = self.df['label'].to_list()

    def __len__(self):
        return self.len
    
    def __getitem__(self,index):
        question = self.question_list[index]
        sentence = self.sentence_list[index]
        label = self.label_list[index]

        inputs = self.tokenizer(
            text = question,
            text_pair = sentence,
            add_special_tokens = True,
            max_length = max_length,
            padding = "max_length",
            return_token_type_ids = False,
            truncation = True,
            # return_tensors = 'pt',
            )
        
        inputs= {
            'input_ids':torch.tensor(inputs['input_ids']),
            'attention_mask':torch.tensor(inputs['attention_mask']),
            'labels':torch.tensor(label),
        }

        inputs = {k:v.to(self.device) for (k,v) in inputs.items()}

        return inputs

In [39]:
train_sample_size = 500
test_sample_size = 100

In [40]:
train_dataset = QNLIDataset(qnli_train_df[:train_sample_size],tokenizer,max_length,'cpu')
dev_dataset = QNLIDataset(qnli_dev_df[:test_sample_size],tokenizer,max_length,'cpu')

In [41]:
train_dataset[:2]

{'attention_mask': tensor([[1, 1, 1,  ..., 0, 0, 0],
         [1, 1, 1,  ..., 0, 0, 0]]),
 'input_ids': tensor([[ 101, 2043, 2106,  ...,    0,    0,    0],
         [ 101, 2029, 7421,  ...,    0,    0,    0]]),
 'labels': tensor([0, 0])}

##3) Fine Tune Model

In [24]:
model = DistilBertForSequenceClassification.from_pretrained(model_name)

Some weights of the model checkpoint at distilbert-base-uncased were not used when initializing DistilBertForSequenceClassification: ['vocab_transform.weight', 'vocab_transform.bias', 'vocab_layer_norm.weight', 'vocab_layer_norm.bias', 'vocab_projector.weight', 'vocab_projector.bias']
- This IS expected if you are initializing DistilBertForSequenceClassification from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing DistilBertForSequenceClassification from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).
Some weights of DistilBertForSequenceClassification were not initialized from the model checkpoint at distilbert-base-uncased and are newly initialized: ['pre_classifier.weight', 'pre_classifier.bias', 'classi

In [25]:
model = model.to(device)

In [26]:
# """test"""
# sample_data = train_dataset[:6]
# model(**sample_data)

In [27]:
def compute_metrics(inputs):
    pred,labels = inputs
    pred = np.argmax(pred,axis = 1)
    accuracy = metrics.accuracy_score(labels,pred)
    recall = metrics.recall_score(labels,pred)
    precision = metrics.precision_score(labels,pred)
    f1 = metrics.f1_score(labels,pred)

    return {"accuracy":accuracy, "recall":recall, "precision":precision, "recall":recall, f1:"f1"} 

In [28]:
model_save_path = "/content/drive/My Drive/Colab Notebooks/Models/QNLI"

In [29]:
batch_size = 16

In [42]:
args = TrainingArguments(
    
    model_save_path,
    evaluation_strategy = "steps",
    eval_steps = 200,
    logging_steps = 400,
    save_steps = 400,
    do_train = True,
    do_eval = True,
    learning_rate = 2e-5,
    per_device_train_batch_size = batch_size,
    per_device_eval_batch_size = batch_size,
    num_train_epochs = 3,
    seed = 0,
    load_best_model_at_end = True,
)


In [43]:
train_dataset[:3]

{'attention_mask': tensor([[1, 1, 1,  ..., 0, 0, 0],
         [1, 1, 1,  ..., 0, 0, 0],
         [1, 1, 1,  ..., 0, 0, 0]]),
 'input_ids': tensor([[ 101, 2043, 2106,  ...,    0,    0,    0],
         [ 101, 2029, 7421,  ...,    0,    0,    0],
         [ 101, 2054, 2048,  ...,    0,    0,    0]]),
 'labels': tensor([0, 0, 1])}

In [44]:
trainer  = Trainer(
    model = model,
    args = args,
    train_dataset =train_dataset,
    eval_dataset = dev_dataset ,
    compute_metrics = compute_metrics,
    callbacks = [EarlyStoppingCallback] 
)


In [45]:
trainer.train()


Step,Training Loss,Validation Loss


TrainOutput(global_step=96, training_loss=0.6301178534825643, metrics={'train_runtime': 138.0628, 'train_samples_per_second': 0.695, 'total_flos': 308528686080000.0, 'epoch': 3.0, 'init_mem_cpu_alloc_delta': 8192, 'init_mem_gpu_alloc_delta': 0, 'init_mem_cpu_peaked_delta': 0, 'init_mem_gpu_peaked_delta': 0, 'train_mem_cpu_alloc_delta': 12288, 'train_mem_gpu_alloc_delta': 535674880, 'train_mem_cpu_peaked_delta': 0, 'train_mem_gpu_peaked_delta': 6563778560})

## 4) Evaluate 

In [50]:
from tqdm import tqdm

In [68]:
def evaluate_with_batch(model,dataset,batch_size,device):
    dataset_loader = DataLoader(dataset,batch_size)

    model.eval()

    loss_list = []
    labels_list = []
    pred_list = []
    prob_list = []

    pbar = tqdm(total = len(dataset_loader),desc = "Model Evaluate")


    for sample in dataset_loader:
        sample = {k:v.to(device) for (k,v) in sample.items()}

        with torch.no_grad():
            batch_result = model(**sample)
        
        loss = batch_result.loss.item()
        loss_list.append(loss)

        labels = sample['labels']
        del sample

        labels  = labels.detach().cpu().numpy()
        labels_list.extend(labels)

        logits = batch_result.logits
        probs = torch.sigmoid(logits)
        
        pred = torch.argmax(probs,axis = 1)
        pred = pred.detach().cpu().numpy()
        pred_list.extend(pred)

        prob = probs[:,1]
        prob = prob.detach().cpu().numpy()
        prob_list.extend(prob)

        pbar.update(1)

    pbar.close()


    accuracy = metrics.accuracy_score(labels_list,pred_list)
    recall = metrics.recall_score(labels_list,pred_list)
    precision = metrics.precision_score(labels_list,pred_list)
    f1 = metrics.f1_score(labels_list,pred_list)
    fpr,tpr, threshold = metrics.roc_curve(labels_list,prob_list,pos_label=1)

    auc =metrics.auc(fpr,tpr)
    loss = np.mean(loss_list)

    return {"accuracy":accuracy, "recall":recall, "precision":precision, "recall":recall, "f1":f1,'auc':auc,'loss':loss} 

In [69]:
evaluate_result = evaluate_with_batch(model,dev_dataset,batch_size,device)

Model Evaluate: 100%|██████████| 7/7 [00:03<00:00,  2.09it/s]


In [70]:
evaluate_result

{'accuracy': 0.7,
 'auc': 0.7567676767676766,
 'f1': 0.6808510638297872,
 'loss': 0.5915191173553467,
 'precision': 0.6530612244897959,
 'recall': 0.7111111111111111}