# Financial Question Matching

In [1]:
# Fetch the tutorial repository; later cells reference `utils`, `requirements.txt`
# and `data/` by relative path, presumably all provided by this checkout.
!git clone https://github.com/GitYCC/bert-minimal-tutorial.git

Cloning into 'bert-minimal-tutorial'...
remote: Enumerating objects: 88, done.[K
remote: Counting objects: 100% (88/88), done.[K
remote: Compressing objects: 100% (71/71), done.[K
remote: Total 88 (delta 44), reused 52 (delta 15), pack-reused 0[K
Unpacking objects: 100% (88/88), done.


In [1]:
# Work from inside the cloned repository so the relative paths used below resolve.
%cd bert-minimal-tutorial

/content/bert-minimal-tutorial


In [2]:
# Install the dependencies listed in requirements.txt (-q keeps pip's output quiet).
!pip install -q -r requirements.txt

In [3]:
import os
import json

import torch
import torch.optim as optim
from torch.utils.data import Dataset, DataLoader, random_split
from torch.nn.utils.rnn import pad_sequence
from transformers import BertTokenizer, BertForSequenceClassification
from tqdm.notebook import tqdm
import numpy as np

from utils import RunningAverage, tokenize_and_map

# Pretrained checkpoint name; the tokenizer and the classifier below are both
# loaded from it so that vocabulary and weights match.
MODEL_NAME = 'bert-base-chinese'

## Dataloader

In [4]:
# Parallel lists: text_pairs[k] is a (sentence1, sentence2) tuple and labels[k]
# is its integer label.
text_pairs = []
labels = []
# The file is read as JSON Lines (one JSON object per line). The encoding is
# stated explicitly because the sentences are Chinese and the platform default
# encoding is not guaranteed to be UTF-8.
with open('data/afqmc_train.json', encoding='utf-8') as fr:
    # Iterate the file lazily instead of materialising every line first.
    for line in fr:
        line = line.strip()
        if not line:
            # Tolerate blank lines (e.g. a trailing newline at the end of the file).
            continue
        row = json.loads(line)
        text_pairs.append((row['sentence1'], row['sentence2']))
        labels.append(int(row['label']))

In [5]:
# Peek at one arbitrary training example as a sanity check of the loaded data.
idx = 24000
sample_pair, sample_label = text_pairs[idx], labels[idx]
print('text pair:', sample_pair)
print('label:', sample_label)

text pair: ('怎样主动提升蚂蚁借呗的额度', '怎么才能提高借呗额度')
label: 1


In [6]:
class PairDataset(Dataset):
    """Dataset that encodes sentence pairs as single BERT input sequences.

    Each item is laid out as ``[CLS] tokens_1 [SEP] tokens_2 [SEP]`` and returned
    as the tuple ``(input_ids, token_type_ids, attention_mask)``, followed by the
    label tensor when ``for_train`` is true.

    Args:
        tokenizer: Object exposing ``tokenize(text)`` and
            ``convert_tokens_to_ids(tokens)``.
        text_pairs: Sequence of ``(text_1, text_2)`` string pairs.
        labels: Sequence of labels aligned with ``text_pairs``; required when
            ``for_train`` is true.
        max_len: Maximum length of the encoded sequence, special tokens included.
        for_train: Whether items (and collated batches) carry a label.

    Raises:
        ValueError: If ``for_train`` is true and ``labels`` is missing or its
            length differs from ``text_pairs``.
    """

    def __init__(self, tokenizer, text_pairs, labels=None, max_len=512, for_train=True):
        # Fail at construction time rather than with an opaque TypeError on the
        # first __getitem__ call.
        if for_train:
            if labels is None:
                raise ValueError('labels must be provided when for_train=True')
            if len(labels) != len(text_pairs):
                raise ValueError(
                    'labels and text_pairs must have the same length, got '
                    f'{len(labels)} and {len(text_pairs)}'
                )

        self.tokenizer = tokenizer
        self.max_len = max_len
        self.for_train = for_train

        self.text_pairs = text_pairs
        self.labels = labels

    def __getitem__(self, idx):
        """Encode the pair at ``idx`` into un-padded 1-D tensors."""
        text_1, text_2 = self.text_pairs[idx]

        text_1 = text_1.lower()
        text_2 = text_2.lower()

        tokens_1 = self.tokenizer.tokenize(text_1)
        tokens_2 = self.tokenizer.tokenize(text_2)

        tokens_1, tokens_2 = self._cut_tokens_pair(tokens_1, tokens_2)

        processed_tokens = ['[CLS]'] + tokens_1 + ['[SEP]'] + tokens_2 + ['[SEP]']

        input_ids = torch.tensor(self.tokenizer.convert_tokens_to_ids(processed_tokens))
        # Segment 0 covers [CLS] + tokens_1 + [SEP]; segment 1 covers tokens_2 + [SEP].
        token_type_ids = torch.tensor([0] * (2 + len(tokens_1)) + [1] * (1 + len(tokens_2)))
        # Every position is a real token here; padding is added later per batch.
        attention_mask = torch.tensor([1] * len(processed_tokens))

        outputs = (input_ids, token_type_ids, attention_mask)

        if self.for_train:
            label = torch.tensor(self.labels[idx])
            outputs += (label, )

        return outputs

    def _cut_tokens_pair(self, tokens_1, tokens_2):
        """Truncate the pair so it fits in ``max_len`` with the 3 special tokens.

        Tokens are dropped from the end of whichever sequence is currently
        longer (ties drop from ``tokens_2``). Cutting both sides by the same
        amount would leave a lopsided pair over the limit once the shorter side
        is exhausted, and would drop one token too many for an odd overflow.
        """
        # Room left for real tokens after [CLS] and the two [SEP]; clamped so a
        # tiny max_len cannot make the loop below run forever.
        budget = max(self.max_len - 3, 0)

        keep_1, keep_2 = len(tokens_1), len(tokens_2)
        while keep_1 + keep_2 > budget:
            if keep_1 > keep_2:
                keep_1 -= 1
            else:
                keep_2 -= 1

        return tokens_1[:keep_1], tokens_2[:keep_2]

    def __len__(self):
        """Return the number of text pairs."""
        return len(self.text_pairs)

    def create_mini_batch(self, samples):
        """Collate items from ``__getitem__`` into batch-first padded tensors.

        Intended to be passed as ``collate_fn`` to a ``DataLoader``.
        """
        outputs = list(zip(*samples))

        # Zero-pad every sequence in the batch to the same length.
        input_ids = pad_sequence(outputs[0], batch_first=True)
        token_type_ids = pad_sequence(outputs[1], batch_first=True)
        attention_mask = pad_sequence(outputs[2], batch_first=True)

        batch_output = (input_ids, token_type_ids, attention_mask)

        if self.for_train:
            labels = torch.stack(outputs[3])
            batch_output += (labels, )

        return batch_output

In [7]:
tokenizer = BertTokenizer.from_pretrained(MODEL_NAME)

dataset = PairDataset(tokenizer, text_pairs, labels)

# Fraction of the examples used for training; the remainder is held out for validation.
CUT_RATIO = 0.9
# Fixed seed so the train/validation split, and therefore the reported
# validation metrics, are reproducible across kernel restarts.
SPLIT_SEED = 42

train_size = int(CUT_RATIO * len(dataset))
valid_size = len(dataset) - train_size
train_dataset, valid_dataset = random_split(
    dataset,
    [train_size, valid_size],
    generator=torch.Generator().manual_seed(SPLIT_SEED),
)

In [8]:
batch_size = 32

# Both loaders collate with the dataset's own function, which pads each batch.
# Only the training loader reshuffles.
train_loader = DataLoader(
    train_dataset,
    batch_size=batch_size,
    shuffle=True,
    collate_fn=dataset.create_mini_batch,
)
valid_loader = DataLoader(
    valid_dataset,
    batch_size=batch_size,
    shuffle=False,
    collate_fn=dataset.create_mini_batch,
)

## Model

In [12]:
# Prefer the GPU when one is visible, otherwise fall back to the CPU.
device = torch.device('cuda') if torch.cuda.is_available() else torch.device('cpu')
print(f'device: {device}')

# Two output labels; return_dict=True makes the forward pass return an object
# with named fields such as `.loss` and `.logits`, which the cells below rely on.
model = BertForSequenceClassification.from_pretrained(MODEL_NAME, num_labels=2, return_dict=True)
model.to(device)

device: cuda


Some weights of the model checkpoint at bert-base-chinese were not used when initializing BertForSequenceClassification: ['cls.predictions.bias', 'cls.predictions.transform.dense.weight', 'cls.predictions.transform.dense.bias', 'cls.predictions.decoder.weight', 'cls.seq_relationship.weight', 'cls.seq_relationship.bias', 'cls.predictions.transform.LayerNorm.weight', 'cls.predictions.transform.LayerNorm.bias']
- This IS expected if you are initializing BertForSequenceClassification from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPretraining model).
- This IS NOT expected if you are initializing BertForSequenceClassification from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).
Some weights of BertForSequenceClassification were not initialized from the model checkpoint at

BertForSequenceClassification(
  (bert): BertModel(
    (embeddings): BertEmbeddings(
      (word_embeddings): Embedding(21128, 768, padding_idx=0)
      (position_embeddings): Embedding(512, 768)
      (token_type_embeddings): Embedding(2, 768)
      (LayerNorm): LayerNorm((768,), eps=1e-12, elementwise_affine=True)
      (dropout): Dropout(p=0.1, inplace=False)
    )
    (encoder): BertEncoder(
      (layer): ModuleList(
        (0): BertLayer(
          (attention): BertAttention(
            (self): BertSelfAttention(
              (query): Linear(in_features=768, out_features=768, bias=True)
              (key): Linear(in_features=768, out_features=768, bias=True)
              (value): Linear(in_features=768, out_features=768, bias=True)
              (dropout): Dropout(p=0.1, inplace=False)
            )
            (output): BertSelfOutput(
              (dense): Linear(in_features=768, out_features=768, bias=True)
              (LayerNorm): LayerNorm((768,), eps=1e-12, element

## Train

In [13]:
def train_batch(model, data, optimizer, device):
    """Run a single optimisation step on one batch.

    Args:
        model: Module called with ``input_ids``, ``token_type_ids``,
            ``attention_mask`` and ``labels`` whose output exposes ``.loss``.
        data: Iterable of four tensors in the order
            ``(input_ids, token_type_ids, attention_mask, labels)``.
        optimizer: Optimizer holding the model's parameters.
        device: Device the batch tensors are moved to.

    Returns:
        The batch loss as a Python float.
    """
    model.train()
    input_ids, token_type_ids, attention_mask, labels = (t.to(device) for t in data)

    # Clear gradients left over from the previous step before accumulating new ones.
    optimizer.zero_grad()

    loss = model(
        input_ids=input_ids,
        token_type_ids=token_type_ids,
        attention_mask=attention_mask,
        labels=labels,
    ).loss
    loss.backward()
    optimizer.step()

    return loss.item()


def evaluate(model, valid_loader, device):
    """Compute the running loss and accuracy of ``model`` over ``valid_loader``.

    The model is switched to eval mode and gradients are disabled for the pass.
    Each batch contributes its loss once to the loss averager and one
    correct/incorrect flag per example to the accuracy averager.

    Returns:
        ``(loss, accuracy)`` as reported by the two ``RunningAverage`` objects.
    """
    model.eval()

    loss_averager = RunningAverage()
    acc_averager = RunningAverage()

    with torch.no_grad():
        for batch in tqdm(valid_loader, desc='evaluate'):
            input_ids, token_type_ids, attention_mask, labels = (t.to(device) for t in batch)

            outputs = model(
                input_ids=input_ids,
                token_type_ids=token_type_ids,
                attention_mask=attention_mask,
                labels=labels,
            )
            loss_averager.add(outputs.loss.item())

            # Per-example hit/miss flags: predicted class versus gold label.
            predictions = outputs.logits.argmax(dim=-1)
            acc_averager.add_all(predictions.eq(labels).cpu().tolist())

    return loss_averager.get(), acc_averager.get()

In [14]:
# --- Training configuration -------------------------------------------------
lr = 0.00001
max_iter = 1000          # total number of optimisation steps
show_per_iter = 50       # print the running training loss every N steps
valid_per_iter = 250     # run validation every N steps
save_per_iter = 500      # save a checkpoint every N steps
save_checkpoint_dir = 'models/'
model_prefix = 'cn_question_matching_'

# A checkpoint is named after the validation loss measured at the same step,
# so every save step must also be a validation step.
assert save_per_iter % valid_per_iter == 0

optimizer = optim.Adam(model.parameters(), lr=lr)

i = 1
train_loss = RunningAverage()
model_paths = []

# Cycle over the training loader as many times as needed to reach max_iter steps.
keep_training = True
while keep_training:
    for batch in train_loader:
        batch_loss = train_batch(model, batch, optimizer, device)
        train_loss.add(batch_loss)

        if i % show_per_iter == 0:
            print('train [{}]: loss={}'.format(i, train_loss.get()))
            # Start a fresh averaging window for the next report.
            train_loss.flush()

        if i % valid_per_iter == 0:
            valid_loss, valid_acc = evaluate(model, valid_loader, device)
            print(f'valid: loss={valid_loss} acc={valid_acc}')

        if i % save_per_iter == 0:
            checkpoint_path = os.path.join(
                save_checkpoint_dir, model_prefix + f'loss{valid_loss:.5}/'
            )
            print(f'save model at {checkpoint_path}')
            model.save_pretrained(checkpoint_path)
            model_paths.append(checkpoint_path)

        if i == max_iter:
            keep_training = False
            break

        i += 1

train [50]: loss=0.628982640504837
train [100]: loss=0.6183788365125656
train [150]: loss=0.5729357522726058
train [200]: loss=0.5959760600328445
train [250]: loss=0.5800184053182602


HBox(children=(FloatProgress(value=0.0, description='evaluate', max=108.0, style=ProgressStyle(description_wid…


valid: loss=0.5631059253657306 acc=0.6945253348864299
train [300]: loss=0.5607518929243088
train [350]: loss=0.542183734178543
train [400]: loss=0.5640632432699203
train [450]: loss=0.5536403554677963
train [500]: loss=0.5548027700185776


HBox(children=(FloatProgress(value=0.0, description='evaluate', max=108.0, style=ProgressStyle(description_wid…


valid: loss=0.5343392913540205 acc=0.7009318578916716
save model at models/cn_question_matching_loss0.53434/
train [550]: loss=0.5368397849798202
train [600]: loss=0.5236724197864533
train [650]: loss=0.5177822095155716
train [700]: loss=0.5353243273496627
train [750]: loss=0.5458459061384201


HBox(children=(FloatProgress(value=0.0, description='evaluate', max=108.0, style=ProgressStyle(description_wid…


valid: loss=0.5275931852283301 acc=0.704717530576587
train [800]: loss=0.5295763194561005
train [850]: loss=0.5227465814352036
train [900]: loss=0.5314224392175675
train [950]: loss=0.5388805562257767
train [1000]: loss=0.49153187513351443


HBox(children=(FloatProgress(value=0.0, description='evaluate', max=108.0, style=ProgressStyle(description_wid…


valid: loss=0.5270194954894207 acc=0.704717530576587
save model at models/cn_question_matching_loss0.52702/


## Predict

In [16]:
# Reload the most recently saved checkpoint.
reload_checkpoint = model_paths[-1]

# A dedicated name is used so the training `text_pairs` defined earlier is not
# overwritten; re-running earlier cells after this one would otherwise build
# the training dataset from these two demo pairs.
pred_text_pairs = [
    ('我的蚂蚁借呗 为什么额度降了', '为何我蚂蚁借呗额度降低了'),
    ('花呗分期需要多少钱，才能分期', '花呗达到多少额度才能分期'),
]

# No labels at inference time, so items and batches carry only the three input tensors.
pred_dataset = PairDataset(tokenizer, pred_text_pairs, for_train=False)

pred_loader = DataLoader(
    dataset=pred_dataset,
    batch_size=batch_size,
    collate_fn=pred_dataset.create_mini_batch,
)

model = BertForSequenceClassification.from_pretrained(reload_checkpoint)
model.to(device)
# Make inference mode explicit (dropout off) instead of relying on the state
# the loader happens to return the model in.
model.eval()

pred_labels = []
with torch.no_grad():
    for data in tqdm(pred_loader, desc='predict'):
        input_ids, token_type_ids, attention_mask = [d.to(device) for d in data]

        outputs = model(
            input_ids=input_ids,
            token_type_ids=token_type_ids,
            attention_mask=attention_mask
        )

        # Predicted class is the index of the largest logit for each pair.
        pred_labels += outputs.logits.argmax(dim=-1).cpu().tolist()

print('predict result: ', list(zip(pred_text_pairs, pred_labels)))

HBox(children=(FloatProgress(value=0.0, description='predict', max=1.0, style=ProgressStyle(description_width=…


predict result:  [(('我的蚂蚁借呗 为什么额度降了', '为何我蚂蚁借呗额度降低了'), 1), (('花呗分期需要多少钱，才能分期', '花呗达到多少额度才能分期'), 1)]
