Fine Tune DistilBERT For  Text Pair Matching
Author: Nelson LIN (nelsonlin0321@outlook.com)

In [1]:
# !pip install torch

In [2]:
# import libaries
import torch
from torch import cuda
from torch.utils.data import Dataset,DataLoader

In [3]:
print(torch.__version__)

1.9.0a0+gitd69c22d


In [4]:
import os
import json
import random
import pandas as pd
import numpy as np
from sklearn.utils import shuffle
from sklearn import metrics
from collections import Counter

In [5]:
# !pip install transformers

In [6]:
from transformers import AutoTokenizer,AlbertForSequenceClassification,TrainingArguments,Trainer,EarlyStoppingCallback

In [7]:
device = "cuda" if cuda.is_available() else "cpu"

In [8]:
device

'cuda'

In [9]:
model_name =  "albert-base-v2"

## 1) Import Data



In [10]:
# from google.colab import drive
# drive.mount('/content/drive')

In [11]:
# # download from https://deepai.org/dataset/qnli
# data_dir = "/content/drive/My Drive/Colab Notebooks/Data/QNLIv2/QNLI"

In [12]:
# os.listdir(data_dir)

In [13]:
# def read_qnli_data(file_name):
#     path = os.path.join(data_dir,file_name)
#     with open(path) as f:
#         text = f.readlines()

#     header = text[0].strip().split("\t")
#     lines = [line.strip().split("\t") for line in text[1:]]

#     df = pd.DataFrame(lines,columns = header)
#     return df

In [14]:
# qnli_train_df = read_qnli_data("train.tsv")
# qnli_dev_df = read_qnli_data("dev.tsv")


In [15]:
# qnli_train_df['label'] = np.where(qnli_train_df['label']=='entailment',1,0)
# qnli_dev_df['label'] = np.where(qnli_dev_df['label']=='entailment',1,0)

In [16]:
# qnli_dev_df.head()

In [17]:
def read_qnli_data(file_name, data_dir):
    path = os.path.join(data_dir, file_name)
    with open(path, encoding='utf-8-sig') as f:
        text = f.readlines()

    header = text[0].strip().split("\t")
    lines = [line.strip().split("\t") for line in text[1:]]

    df = pd.DataFrame(lines, columns=header)
    return df


def get_qnli_pandas_dataframe(data_dir):
    qnli_train_df = read_qnli_data("train.tsv",data_dir)
    qnli_dev_df = read_qnli_data("dev.tsv",data_dir)
    qnli_train_df['label'] = np.where(
        qnli_train_df['label'] == 'entailment', 1, 0)
    qnli_dev_df['label'] = np.where(qnli_dev_df['label'] == 'entailment', 1, 0)

    qnli_dev_df['question'] = qnli_dev_df['question'].apply(lambda x: x.strip())
    qnli_dev_df['sentence'] = qnli_dev_df['sentence'].apply(lambda x: x.strip()) 

    qnli_train_df['question'] = qnli_train_df['question'].apply(lambda x: x.strip())
    qnli_train_df['sentence'] = qnli_train_df['sentence'].apply(lambda x: x.strip()) 

    return qnli_dev_df, qnli_train_df


def read_document_to_list(document_path):
    with open(document_path, encoding='utf-8-sig') as f:
        document = f.readlines()
        sentence_list = [line.strip()
                         for line in document if len(line.strip()) != 0]
        return sentence_list


def read_document_dict(document_dir):

    document_dict = {}

    for document_file_name in os.listdir(document_dir):
        if document_file_name.endswith(".txt"):
            document_name = document_file_name.replace(
                ".txt", "").replace("_", " ")
            document_path = os.path.join(document_dir, document_file_name)
            document_dict[document_name] = read_document_to_list(document_path)

    return document_dict


def read_json(file_path):
    with open(file_path) as f:
        json_f = json.load(f)
    data = json_f['data']
    return data


def get_random_index(List):
    return random.sample(range(len(List)), 1)[0]


def load_data(data_path, load_impossible_answer=False):

    data = read_json(data_path)

    data_dict = {}
    title_list = []
    context_list = []
    question_list = []
    id_list = []
    answer_text_list = []
    answer_start_list = []
    is_impossible_list = []

    for paragraphs in data:
        title = paragraphs['title']
        context_qas_list = paragraphs['paragraphs']

        for context_qas in context_qas_list:
            context = context_qas['context']
            qas_list = context_qas['qas']

            for qas in qas_list:
                title_list.append(title)
                context_list.append(context)

                is_impossible = qas['is_impossible']
                is_impossible_list.append(is_impossible)

                id_ = qas['id']
                id_list.append(id_)
                question = qas['question']
                question_list.append(question)

                if not is_impossible:
                    answer_list = qas['answers']
                    idx = get_random_index(answer_list)
                    answer_text = answer_list[idx]['text']
                    answer_start = answer_list[idx]['answer_start']

                    answer_text_list.append(answer_text)
                    answer_start_list.append(answer_start)
                else:
                    if load_impossible_answer:
                        answer_list = qas['plausible_answers']
                        idx = get_random_index(answer_list)
                        answer_text = answer_list[idx]['text']
                        answer_start = answer_list[idx]['answer_start']
                        answer_text_list.append(answer_text)
                        answer_start_list.append(answer_start)
                    else:
                        answer_text_list.append("")
                        answer_start_list.append(-1)

    data_dict['id'] = id_list
    data_dict['title'] = title_list
    data_dict['context'] = context_list
    data_dict['question'] = question_list
    data_dict['answer_text'] = answer_text_list
    data_dict['answer_start'] = answer_start_list
    data_dict['is_impossible'] = is_impossible_list

    return data_dict


def get_squad_v2_pandas_dataframe(squad_v2_dir,include_impossible=False, load_impossible_answer=False):
    # download from https://rajpurkar.github.io/SQuAD-explorer/
    train_data_path = os.path.join(squad_v2_dir, "train-v2.0.json")
    dev_data_path = os.path.join(squad_v2_dir, 'dev-v2.0.json')

    train_data_dict = load_data(train_data_path, load_impossible_answer)
    dev_data_dict = load_data(dev_data_path, load_impossible_answer)

    train_data_df = pd.DataFrame(train_data_dict)
    dev_data_df = pd.DataFrame(dev_data_dict)

    if not include_impossible:
        train_data_df = train_data_df[train_data_df['is_impossible'] == False]
        dev_data_df = dev_data_df[dev_data_df['is_impossible'] == False]

    train_data_df['question'] = train_data_df['question'].apply(lambda x: x.strip())
    train_data_df['context'] = train_data_df['context'].apply(lambda x: x.strip()) 

    dev_data_df['question'] = dev_data_df['question'].apply(lambda x: x.strip())
    dev_data_df['context'] = dev_data_df['context'].apply(lambda x: x.strip()) 

    return train_data_df, dev_data_df


In [18]:
data_dir = "/data/QNLI"

In [19]:
os.listdir(data_dir)

['dev.tsv', 'test.tsv', 'train.tsv']

In [20]:
qnli_dev_df, qnli_train_df = get_qnli_pandas_dataframe(data_dir)

## 2) Tokenization Features Engineering

In [21]:
max_length = 512

In [22]:
tokenizer = AutoTokenizer.from_pretrained(model_name)

In [23]:
question = qnli_train_df['question'].iloc[0]
sentence = qnli_train_df['sentence'].iloc[0]

In [24]:
inputs = tokenizer(
    text = question,
    text_pair = sentence,
    add_special_tokens = True,
    max_length = max_length,
    padding = "max_length",
    return_token_type_ids = False,
    truncation = True,
    )

In [25]:
inputs.keys()

dict_keys(['input_ids', 'attention_mask'])

In [26]:
class QNLIDataset(Dataset):
    def __init__(self,dataframe,tokenizer,max_length,device):
        self.len = len(dataframe)
        self.df = dataframe
        self.tokenizer = tokenizer
        self.max_length = max_length
        self.device = device
        self.question_list = self.df['question'].to_list()
        self.sentence_list = self.df['sentence'].to_list()
        self.label_list = self.df['label'].to_list()

    def __len__(self):
        return self.len
    
    def __getitem__(self,index):
        question = self.question_list[index]
        sentence = self.sentence_list[index]
        label = self.label_list[index]

        inputs = self.tokenizer(
            text = question,
            text_pair = sentence,
            add_special_tokens = True,
            max_length = max_length,
            padding = "max_length",
            return_token_type_ids = False,
            truncation = True,
            # return_tensors = 'pt',
            )
        
        inputs= {
            'input_ids':torch.tensor(inputs['input_ids']),
            'attention_mask':torch.tensor(inputs['attention_mask']),
            'labels':torch.tensor(label),
        }

        inputs = {k:v.to(self.device) for (k,v) in inputs.items()}

        return inputs

In [27]:
sample_ratio = 2

In [28]:
qnli_train_df = shuffle(qnli_train_df)
qnli_dev_df = shuffle(qnli_dev_df)

In [29]:
train_size = int(len(qnli_train_df)//sample_ratio)
dev_size = int(len(qnli_dev_df)//sample_ratio)

In [30]:
qnli_train_df = qnli_train_df[:train_size]
qnli_dev_df = qnli_dev_df[:dev_size]

In [31]:
train_dataset = QNLIDataset(qnli_train_df,tokenizer,max_length,'cpu')
dev_dataset = QNLIDataset(qnli_dev_df,tokenizer,max_length,'cpu')

In [32]:
train_dataset[:2]

{'input_ids': tensor([[   2,   56, 3939,  ...,    0,    0,    0],
         [   2,  184,  175,  ...,    0,    0,    0]]),
 'attention_mask': tensor([[1, 1, 1,  ..., 0, 0, 0],
         [1, 1, 1,  ..., 0, 0, 0]]),
 'labels': tensor([0, 1])}

##3) Fine Tune Model

In [33]:
model = AlbertForSequenceClassification.from_pretrained(model_name)

Some weights of the model checkpoint at albert-base-v2 were not used when initializing AlbertForSequenceClassification: ['predictions.LayerNorm.bias', 'predictions.decoder.bias', 'predictions.dense.bias', 'predictions.dense.weight', 'predictions.LayerNorm.weight', 'predictions.bias', 'predictions.decoder.weight']
- This IS expected if you are initializing AlbertForSequenceClassification from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing AlbertForSequenceClassification from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).
Some weights of AlbertForSequenceClassification were not initialized from the model checkpoint at albert-base-v2 and are newly initialized: ['classifier.bias', 'classifier.weight']
You sho

In [34]:
model = model.to(device)

In [35]:
# """test"""
# sample_data = train_dataset[:6]
# model(**sample_data)

In [36]:
def compute_metrics(inputs):
    pred,labels = inputs
    pred = np.argmax(pred,axis = 1)
    accuracy = metrics.accuracy_score(labels,pred)
    recall = metrics.recall_score(labels,pred)
    precision = metrics.precision_score(labels,pred)
    f1 = metrics.f1_score(labels,pred)

    return {"accuracy":accuracy, "recall":recall, "precision":precision, "recall":recall, f1:"f1"} 

In [37]:
model_save_path = "./singel-albert-qnli"

In [38]:
batch_size = 24

In [39]:
steps = len(train_dataset)//batch_size

In [40]:
steps 

2182

In [41]:
evaluate_steps = steps//5

In [42]:
evaluate_steps

436

In [43]:
evaluate_steps = 300

In [44]:
# break

In [45]:
args = TrainingArguments(
    
    model_save_path,
    overwrite_output_dir = True,
    evaluation_strategy = "steps",
    eval_steps = evaluate_steps,
    logging_steps = evaluate_steps,
    save_steps = evaluate_steps,
    do_train = True,
    do_eval = True,
    learning_rate = 2e-5,
    per_device_train_batch_size = batch_size,
    per_device_eval_batch_size = batch_size,
    num_train_epochs = 2,
    seed = 0,
    load_best_model_at_end = True,
)


In [46]:
train_dataset[:3]

{'input_ids': tensor([[   2,   56, 3939,  ...,    0,    0,    0],
         [   2,  184,  175,  ...,    0,    0,    0],
         [   2,   98,   23,  ...,    0,    0,    0]]),
 'attention_mask': tensor([[1, 1, 1,  ..., 0, 0, 0],
         [1, 1, 1,  ..., 0, 0, 0],
         [1, 1, 1,  ..., 0, 0, 0]]),
 'labels': tensor([0, 1, 0])}

In [47]:
trainer  = Trainer(
    model = model,
    args = args,
    train_dataset =train_dataset,
    eval_dataset = dev_dataset,
    tokenizer = tokenizer,
)

In [48]:
trainer.train()

***** Running training *****
  Num examples = 52371
  Num Epochs = 2
  Instantaneous batch size per device = 24
  Total train batch size (w. parallel, distributed & accumulation) = 24
  Gradient Accumulation steps = 1
  Total optimization steps = 4366


Step,Training Loss,Validation Loss
300,0.5054,0.376926
600,0.3997,0.406909
900,0.3551,0.30757
1200,0.3319,0.292908
1500,0.3276,0.27506
1800,0.3185,0.278506
2100,0.2943,0.272521
2400,0.2342,0.333522
2700,0.2237,0.283174
3000,0.2006,0.273119


***** Running Evaluation *****
  Num examples = 2731
  Batch size = 24
Saving model checkpoint to ./singel-albert-qnli/checkpoint-300
Configuration saved in ./singel-albert-qnli/checkpoint-300/config.json
Model weights saved in ./singel-albert-qnli/checkpoint-300/pytorch_model.bin
tokenizer config file saved in ./singel-albert-qnli/checkpoint-300/tokenizer_config.json
Special tokens file saved in ./singel-albert-qnli/checkpoint-300/special_tokens_map.json
***** Running Evaluation *****
  Num examples = 2731
  Batch size = 24
Saving model checkpoint to ./singel-albert-qnli/checkpoint-600
Configuration saved in ./singel-albert-qnli/checkpoint-600/config.json
Model weights saved in ./singel-albert-qnli/checkpoint-600/pytorch_model.bin
tokenizer config file saved in ./singel-albert-qnli/checkpoint-600/tokenizer_config.json
Special tokens file saved in ./singel-albert-qnli/checkpoint-600/special_tokens_map.json
***** Running Evaluation *****
  Num examples = 2731
  Batch size = 24
Saving mo

TrainOutput(global_step=4366, training_loss=0.28115325513587214, metrics={'train_runtime': 2476.9921, 'train_samples_per_second': 42.286, 'train_steps_per_second': 1.763, 'total_flos': 2503134974545920.0, 'train_loss': 0.28115325513587214, 'epoch': 2.0})

## 4) Evaluate 

In [49]:
from tqdm import tqdm

In [50]:
def evaluate_with_batch(model,dataset,batch_size,device):
    dataset_loader = DataLoader(dataset,batch_size)

    model.eval()

    loss_list = []
    labels_list = []
    pred_list = []
    prob_list = []

    pbar = tqdm(total = len(dataset_loader),desc = "Model Evaluate")


    for sample in dataset_loader:
        sample = {k:v.to(device) for (k,v) in sample.items()}

        with torch.no_grad():
            batch_result = model(**sample)
        
        loss = batch_result.loss.item()
        loss_list.append(loss)

        labels = sample['labels']
        del sample

        labels  = labels.detach().cpu().numpy()
        labels_list.extend(labels)

        logits = batch_result.logits
        probs = torch.sigmoid(logits)
        
        pred = torch.argmax(probs,axis = 1)
        pred = pred.detach().cpu().numpy()
        pred_list.extend(pred)

        prob = probs[:,1]
        prob = prob.detach().cpu().numpy()
        prob_list.extend(prob)

        pbar.update(1)

    pbar.close()


    accuracy = metrics.accuracy_score(labels_list,pred_list)
    recall = metrics.recall_score(labels_list,pred_list)
    precision = metrics.precision_score(labels_list,pred_list)
    f1 = metrics.f1_score(labels_list,pred_list)
    fpr,tpr, threshold = metrics.roc_curve(labels_list,prob_list,pos_label=1)

    auc =metrics.auc(fpr,tpr)
    loss = np.mean(loss_list)

    return {"accuracy":accuracy, "recall":recall, "precision":precision, "recall":recall, "f1":f1,'auc':auc,'loss':loss} 

In [51]:
evaluate_result = evaluate_with_batch(model,dev_dataset,batch_size,device)

Model Evaluate: 100%|██████████| 114/114 [00:22<00:00,  4.96it/s]


In [52]:
evaluate_result

{'accuracy': 0.9077261076528744,
 'recall': 0.9090909090909091,
 'precision': 0.9077705156136529,
 'f1': 0.9084302325581395,
 'auc': 0.9637978010190399,
 'loss': 0.25596295470339164}