Fine Tune DistilBERT For  Text Pair Matching
Author: Nelson LIN (nelsonlin0321@outlook.com)

In [1]:
# import libaries
import torch
from torch import cuda
from torch.utils.data import Dataset,DataLoader

In [2]:
print(torch.__version__)

1.10.0+cu111


In [3]:
import os
import json
import random
import pandas as pd
import numpy as np
from sklearn.utils import shuffle
from sklearn import metrics
from collections import Counter

In [17]:
!pip install transformers

Collecting transformers
  Downloading transformers-4.16.2-py3-none-any.whl (3.5 MB)
[K     |████████████████████████████████| 3.5 MB 12.4 MB/s 
[?25hCollecting tokenizers!=0.11.3,>=0.10.1
  Downloading tokenizers-0.11.4-cp37-cp37m-manylinux_2_12_x86_64.manylinux2010_x86_64.whl (6.8 MB)
[K     |████████████████████████████████| 6.8 MB 45.7 MB/s 
Collecting pyyaml>=5.1
  Downloading PyYAML-6.0-cp37-cp37m-manylinux_2_5_x86_64.manylinux1_x86_64.manylinux_2_12_x86_64.manylinux2010_x86_64.whl (596 kB)
[K     |████████████████████████████████| 596 kB 47.6 MB/s 
Collecting sacremoses
  Downloading sacremoses-0.0.47-py2.py3-none-any.whl (895 kB)
[K     |████████████████████████████████| 895 kB 49.3 MB/s 
Collecting huggingface-hub<1.0,>=0.1.0
  Downloading huggingface_hub-0.4.0-py3-none-any.whl (67 kB)
[K     |████████████████████████████████| 67 kB 5.3 MB/s 
Installing collected packages: pyyaml, tokenizers, sacremoses, huggingface-hub, transformers
  Attempting uninstall: pyyaml
    Fou

In [18]:
from transformers import AutoTokenizer,AlbertForSequenceClassification,TrainingArguments,Trainer,EarlyStoppingCallback

In [19]:
device = "cuda" if cuda.is_available() else "cpu"

In [20]:
device

'cuda'

In [21]:
model_name =  "albert-base-v2"

## 1) Import Data



In [22]:
from google.colab import drive
drive.mount('/content/drive')

Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).


In [23]:
# download from https://deepai.org/dataset/qnli
data_dir = "/content/drive/My Drive/Colab Notebooks/Data/QNLIv2/QNLI"

In [24]:
os.listdir(data_dir)

['train.tsv', 'test.tsv', 'dev.tsv']

In [25]:
def read_qnli_data(file_name):
    path = os.path.join(data_dir,file_name)
    with open(path) as f:
        text = f.readlines()

    header = text[0].strip().split("\t")
    lines = [line.strip().split("\t") for line in text[1:]]

    df = pd.DataFrame(lines,columns = header)
    return df

In [26]:
qnli_train_df = read_qnli_data("train.tsv")
qnli_dev_df = read_qnli_data("dev.tsv")


In [27]:
qnli_train_df['label'] = np.where(qnli_train_df['label']=='entailment',1,0)
qnli_dev_df['label'] = np.where(qnli_dev_df['label']=='entailment',1,0)

In [28]:
qnli_dev_df.head()

Unnamed: 0,index,question,sentence,label
0,0,What came into force after the new constitutio...,"As of that day, the new constitution heralding...",1
1,1,What is the first major city in the stream of ...,The most important tributaries in this area ar...,0
2,2,What is the minimum required if you want to te...,In most provinces a second Bachelor's Degree s...,0
3,3,How was Temüjin kept imprisoned by the Tayichi...,The Tayichi'ud enslaved Temüjin (reportedly wi...,1
4,4,"What did Herr Gott, dich loben wir become know...","He paraphrased the Te Deum as ""Herr Gott, dich...",0


## 2) Tokenization Features Engineering

In [29]:
max_length = 512

In [30]:
tokenizer = AutoTokenizer.from_pretrained(model_name)

Downloading:   0%|          | 0.00/684 [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/742k [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/1.25M [00:00<?, ?B/s]

In [31]:
question = qnli_train_df['question'].iloc[0]
sentence = qnli_train_df['sentence'].iloc[0]

In [32]:
inputs = tokenizer(
    text = question,
    text_pair = sentence,
    add_special_tokens = True,
    max_length = max_length,
    padding = "max_length",
    return_token_type_ids = False,
    truncation = True,
    )

In [33]:
inputs.keys()

dict_keys(['input_ids', 'attention_mask'])

In [34]:
class QNLIDataset(Dataset):
    def __init__(self,dataframe,tokenizer,max_length,device):
        self.len = len(dataframe)
        self.df = dataframe
        self.tokenizer = tokenizer
        self.max_length = max_length
        self.device = device
        self.question_list = self.df['question'].to_list()
        self.sentence_list = self.df['sentence'].to_list()
        self.label_list = self.df['label'].to_list()

    def __len__(self):
        return self.len
    
    def __getitem__(self,index):
        question = self.question_list[index]
        sentence = self.sentence_list[index]
        label = self.label_list[index]

        inputs = self.tokenizer(
            text = question,
            text_pair = sentence,
            add_special_tokens = True,
            max_length = max_length,
            padding = "max_length",
            return_token_type_ids = False,
            truncation = True,
            # return_tensors = 'pt',
            )
        
        inputs= {
            'input_ids':torch.tensor(inputs['input_ids']),
            'attention_mask':torch.tensor(inputs['attention_mask']),
            'labels':torch.tensor(label),
        }

        inputs = {k:v.to(self.device) for (k,v) in inputs.items()}

        return inputs

In [35]:
sample_ratio = 6

In [36]:
qnli_train_df = shuffle(qnli_train_df)
qnli_dev_df = shuffle(qnli_dev_df)

In [37]:
train_size = int(len(qnli_train_df)//sample_ratio)
dev_size = int(len(qnli_dev_df)//sample_ratio)

In [38]:
qnli_train_df = qnli_train_df[:train_size]
qnli_dev_df = qnli_dev_df[:dev_size]

In [39]:
train_dataset = QNLIDataset(qnli_train_df,tokenizer,max_length,'cpu')
dev_dataset = QNLIDataset(qnli_dev_df,tokenizer,max_length,'cpu')

In [40]:
train_dataset[:2]

{'attention_mask': tensor([[1, 1, 1,  ..., 0, 0, 0],
         [1, 1, 1,  ..., 0, 0, 0]]),
 'input_ids': tensor([[   2,   98, 7783,  ...,    0,    0,    0],
         [   2,  630, 1119,  ...,    0,    0,    0]]),
 'labels': tensor([0, 0])}

##3) Fine Tune Model

In [41]:
model = AlbertForSequenceClassification.from_pretrained(model_name)

Downloading:   0%|          | 0.00/45.2M [00:00<?, ?B/s]

Some weights of the model checkpoint at albert-base-v2 were not used when initializing AlbertForSequenceClassification: ['predictions.LayerNorm.bias', 'predictions.dense.weight', 'predictions.LayerNorm.weight', 'predictions.decoder.bias', 'predictions.decoder.weight', 'predictions.dense.bias', 'predictions.bias']
- This IS expected if you are initializing AlbertForSequenceClassification from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing AlbertForSequenceClassification from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).
Some weights of AlbertForSequenceClassification were not initialized from the model checkpoint at albert-base-v2 and are newly initialized: ['classifier.bias', 'classifier.weight']
You sho

In [42]:
model = model.to(device)

In [43]:
# """test"""
# sample_data = train_dataset[:6]
# model(**sample_data)

In [44]:
def compute_metrics(inputs):
    pred,labels = inputs
    pred = np.argmax(pred,axis = 1)
    accuracy = metrics.accuracy_score(labels,pred)
    recall = metrics.recall_score(labels,pred)
    precision = metrics.precision_score(labels,pred)
    f1 = metrics.f1_score(labels,pred)

    return {"accuracy":accuracy, "recall":recall, "precision":precision, "recall":recall, f1:"f1"} 

In [45]:
model_save_path = "/content/drive/My Drive/Colab Notebooks/Models/QNLI"

In [46]:
batch_size = 12

In [47]:
steps = len(train_dataset)//batch_size

In [48]:
evaluate_steps = steps//2

In [50]:
evaluate_steps = 700

In [51]:
# break

In [52]:
args = TrainingArguments(
    
    model_save_path,
    overwrite_output_dir = True,
    evaluation_strategy = "steps",
    eval_steps = evaluate_steps,
    logging_steps = evaluate_steps,
    save_steps = evaluate_steps,
    do_train = True,
    do_eval = True,
    learning_rate = 2e-5,
    per_device_train_batch_size = batch_size,
    per_device_eval_batch_size = batch_size,
    num_train_epochs = 2,
    seed = 0,
    load_best_model_at_end = True,
)


In [53]:
train_dataset[:3]

{'attention_mask': tensor([[1, 1, 1,  ..., 0, 0, 0],
         [1, 1, 1,  ..., 0, 0, 0],
         [1, 1, 1,  ..., 0, 0, 0]]),
 'input_ids': tensor([[   2,   98, 7783,  ...,    0,    0,    0],
         [   2,  630, 1119,  ...,    0,    0,    0],
         [   2,   98, 1062,  ...,    0,    0,    0]]),
 'labels': tensor([0, 0, 1])}

In [55]:
trainer  = Trainer(
    model = model,
    args = args,
    train_dataset =train_dataset,
    eval_dataset = dev_dataset,
    tokenizer = tokenizer,
)

In [56]:
trainer.train()

***** Running training *****
  Num examples = 17457
  Num Epochs = 2
  Instantaneous batch size per device = 12
  Total train batch size (w. parallel, distributed & accumulation) = 12
  Gradient Accumulation steps = 1
  Total optimization steps = 2910


Step,Training Loss,Validation Loss
700,0.4436,0.293181
1400,0.3804,0.254905
2100,0.2773,0.293418
2800,0.2715,0.261712


***** Running Evaluation *****
  Num examples = 910
  Batch size = 12
Saving model checkpoint to /content/drive/My Drive/Colab Notebooks/Models/QNLI/checkpoint-700
Configuration saved in /content/drive/My Drive/Colab Notebooks/Models/QNLI/checkpoint-700/config.json
Model weights saved in /content/drive/My Drive/Colab Notebooks/Models/QNLI/checkpoint-700/pytorch_model.bin
tokenizer config file saved in /content/drive/My Drive/Colab Notebooks/Models/QNLI/checkpoint-700/tokenizer_config.json
Special tokens file saved in /content/drive/My Drive/Colab Notebooks/Models/QNLI/checkpoint-700/special_tokens_map.json
***** Running Evaluation *****
  Num examples = 910
  Batch size = 12
Saving model checkpoint to /content/drive/My Drive/Colab Notebooks/Models/QNLI/checkpoint-1400
Configuration saved in /content/drive/My Drive/Colab Notebooks/Models/QNLI/checkpoint-1400/config.json
Model weights saved in /content/drive/My Drive/Colab Notebooks/Models/QNLI/checkpoint-1400/pytorch_model.bin
tokenizer

TrainOutput(global_step=2910, training_loss=0.33971586194644676, metrics={'train_runtime': 6438.3887, 'train_samples_per_second': 5.423, 'train_steps_per_second': 0.452, 'total_flos': 834378324848640.0, 'train_loss': 0.33971586194644676, 'epoch': 2.0})

## 4) Evaluate 

In [None]:
from tqdm import tqdm

In [None]:
def evaluate_with_batch(model,dataset,batch_size,device):
    dataset_loader = DataLoader(dataset,batch_size)

    model.eval()

    loss_list = []
    labels_list = []
    pred_list = []
    prob_list = []

    pbar = tqdm(total = len(dataset_loader),desc = "Model Evaluate")


    for sample in dataset_loader:
        sample = {k:v.to(device) for (k,v) in sample.items()}

        with torch.no_grad():
            batch_result = model(**sample)
        
        loss = batch_result.loss.item()
        loss_list.append(loss)

        labels = sample['labels']
        del sample

        labels  = labels.detach().cpu().numpy()
        labels_list.extend(labels)

        logits = batch_result.logits
        probs = torch.sigmoid(logits)
        
        pred = torch.argmax(probs,axis = 1)
        pred = pred.detach().cpu().numpy()
        pred_list.extend(pred)

        prob = probs[:,1]
        prob = prob.detach().cpu().numpy()
        prob_list.extend(prob)

        pbar.update(1)

    pbar.close()


    accuracy = metrics.accuracy_score(labels_list,pred_list)
    recall = metrics.recall_score(labels_list,pred_list)
    precision = metrics.precision_score(labels_list,pred_list)
    f1 = metrics.f1_score(labels_list,pred_list)
    fpr,tpr, threshold = metrics.roc_curve(labels_list,prob_list,pos_label=1)

    auc =metrics.auc(fpr,tpr)
    loss = np.mean(loss_list)

    return {"accuracy":accuracy, "recall":recall, "precision":precision, "recall":recall, "f1":f1,'auc':auc,'loss':loss} 

In [None]:
evaluate_result = evaluate_with_batch(model,dev_dataset,batch_size,device)

Model Evaluate: 100%|██████████| 76/76 [00:39<00:00,  1.91it/s]


In [None]:
evaluate_result

{'accuracy': 0.8681318681318682,
 'auc': 0.9430203836988413,
 'f1': 0.8604651162790699,
 'loss': 0.32897793854537766,
 'precision': 0.8872901678657075,
 'recall': 0.835214446952596}