<a href="https://colab.research.google.com/github/HxyScotthuang/mwp-cl/blob/main/fit_CL.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [1]:
from google.colab import drive
drive.mount('/content/drive')

Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).


In [2]:
!pip install transformers
!pip install SentencePiece
!pip install datasets

Looking in indexes: https://pypi.org/simple, https://us-python.pkg.dev/colab-wheels/public/simple/
Looking in indexes: https://pypi.org/simple, https://us-python.pkg.dev/colab-wheels/public/simple/
Looking in indexes: https://pypi.org/simple, https://us-python.pkg.dev/colab-wheels/public/simple/


In [3]:
import os
os.chdir('/content/drive/MyDrive/Colab Notebooks/bert+ch/UniLM4MWP')
#os.chdir('/content/drive/MyDrive/bert+ch/UniLM4MWP')


Here we try to finetune the CL-data by loading the value of our CL pair.

In [4]:
from torch.optim import Adam
from datasets import load_dataset, load_metric
from transformers import TrainingArguments, Trainer, AutoTokenizer, AutoModel,BertGenerationEncoder,BertGenerationDecoder,EncoderDecoderModel,BertTokenizer
import numpy as np
import torch
import tqdm
import re


In [5]:
CL_model = AutoModel.from_pretrained("Math23k_CL")
embedding_matrix = CL_model.embeddings.word_embeddings.weight
#tokenizer = AutoTokenizer.from_pretrained("Math23k_CL")
print(np.shape(embedding_matrix))
#torch.Size([30522, 768])

torch.Size([21128, 768])


Now we have extracted the word embedding for our given vocabulary. So we can just focus on the finetune part

In [6]:
def is_equal(a, b):
    """考虑多次浮点计算会出现无法取整的情况，
       用round()对两个结果取整检查是否相等
    """
    a = round(float(a), 6)
    b = round(float(b), 6)
    return a == b

In [7]:
def remove_bucket(equation):
    l_buckets, buckets = [], []
    for i, c in enumerate(equation):
        if c == '(':
            l_buckets.append(i)
        elif c == ')':
            buckets.append((l_buckets.pop(), i))
    try:
      eval_equation = eval(equation)
    except:
      pass
      #print(equation)
      
    for l, r in buckets:
        new_equation = '%s %s %s' % (
            equation[:l], equation[l + 1:r], equation[r + 1:]
        )
        try:
            if is_equal(eval(new_equation.replace(' ', '')), eval_equation):
                equation = new_equation
        except:
            pass
    return equation.replace(' ', '')

In [8]:
import re
def preprocess (question, equation, answer): 
    # 处理 中括号
    equation = equation.replace('[', '(').replace(']', ')').replace('^','**')
    # \d+:表达一个或多个数字；\1表示向前引用
    # 例如1(2/3)先替换为(1+2/3)，再替换为1+(2/3)
    # 处理带分数
    question = re.sub('(\d+)\((\d+/\d+)\)', '(\\1+\\2)', question)
    equation = re.sub('(\d+)\((\d+/\d+)\)', '(\\1+\\2)', equation)
    answer = re.sub('(\d+)\((\d+/\d+)\)', '(\\1+\\2)', answer)
    equation = re.sub('(\d+)\(', '\\1+(', equation)
    answer = re.sub('(\d+)\(', '\\1+(', answer)
    # 分数去括号
    question = re.sub('\((\d+/\d+)\)', '\\1', question)
    # 处理百分数
    equation = re.sub('([\.\d]+)%', '(\\1/100)', equation)
    answer = re.sub('([\.\d]+)%', '(\\1/100)', answer)
    # 冒号转除号、剩余百分号处理
    equation = equation.replace(':', '/').replace('%', '/100')
    answer = answer.replace(':', '/').replace('%', '/100')
    if equation[:2] == 'x=':
        equation = equation[2:]
    
    return question, remove_bucket(equation), answer

        

In [9]:
device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')

#tokenizer = AutoTokenizer.from_pretrained("Math23k_CL")
tokenizer = BertTokenizer.from_pretrained("bert-base-chinese")

In [10]:
from torch.utils.data.dataset import Dataset
import json
from torch.utils.data import DataLoader
class MyDataSet(Dataset):
  def __init__(self, json_path,tokenizer,start = 0):
    self.json_path = json_path
    # MathQA_train.json
    self.tokenizer = tokenizer
    self.data = json.load(open(self.json_path, 'r'))[start:]
    
  def __getitem__(self, index):
    # This method should return only 1 sample and label 
    # (according to "index"), not the whole dataset
    # So probably something like this for you:
    question = self.data[index]["segmented_text"]
    
    label = self.data[index]["equation"]
    ans = self.data[index]["ans"]

    q,eq,ans = preprocess(question, label, ans)
    inputs = self.tokenizer(q,padding="max_length",
            max_length=64,
            truncation=True,
            add_special_tokens=True,
            return_tensors="pt")
    inputs_id = inputs.input_ids[0]
    attention_mask = inputs.attention_mask[0]
    label = self.tokenizer(eq, padding="max_length",
          max_length=64,
          truncation=True,
          add_special_tokens=True,
          return_tensors="pt").input_ids[0]
    label = torch.tensor([-100 if item == tokenizer.pad_token_id else item for item in label])
    outs = {"input_ids":inputs_id,"label":label,"attention_mask":attention_mask,"text":q,"eq":eq,"ans":ans}
    return outs

  def __len__(self):
    return len(self.data)

In [11]:
# loading the data
#train_data = MyDataSet('dataset/math23k/trainset.json')
#train_data = MyDataSet('dataset/math23k/trainset.json',tokenizer,start = 5000)
train_data = MyDataSet('dataset/math23k/trainset.json',tokenizer)

test_data = MyDataSet('dataset/math23k/testset.json',tokenizer)
val_data = MyDataSet('dataset/math23k/validset.json',tokenizer)

train_dataloader = DataLoader(
    train_data,
    batch_size=8
 )

test_dataloader = DataLoader(
    test_data,
    batch_size=8
 )

val_dataloader = DataLoader(
    val_data,
    batch_size=8
 )


In [12]:
train_data[1]

{'input_ids': tensor([ 101,  671, 7555, 2339, 4923, 8024, 4508,  510,  734,  697, 7339, 1394,
          976, 8114, 1921, 2130, 2768, 8024, 4385, 1762, 4508, 7339, 1296, 4324,
          976, 8125, 1921, 1400, 8024,  734, 7339, 1217, 1057, 8024,  697, 7339,
         1348, 1394,  976,  749, 8110, 1921, 8024, 6821, 3198, 4508, 7339, 6444,
         6624, 8024,  734, 7339, 5326, 5330,  976, 8115, 1921, 2798, 2130, 2768,
         6821, 7555, 2339,  102]),
 'label': tensor([ 101,  122,  120,  113,  113,  122,  118,  113, 8110,  116, 8115,  114,
          120, 8114,  114,  120,  113, 8125,  118, 8115,  114,  114,  102, -100,
         -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100,
         -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100,
         -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100,
         -100, -100, -100, -100]),
 'attention_mask': tensor([1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,

In [13]:
class My_model(torch.nn.Module):
  def __init__(self,bert,embedding_matrix):
    super(My_model, self).__init__()
    self.embedding = torch.nn.Embedding.from_pretrained(torch.tensor(embedding_matrix).float())
    self.embedding.weight.requires_grad = False
    self.bert_model = bert
  def forward(self,input_ids,decoder_input_ids,labels = None,return_dict=True):
    inputs_embeds = self.embedding(inputs_ids)
    outputs = self.bert_model(inputs_embeds=inputs_embeds,decoder_input_ids=decoder_input_ids, labels=labels,return_dict = return_dict)
    return outputs
  def generate(self,inputs_id):
    inputs_embeds = self.embedding(inputs_ids)
    return self.bert_model.generate(inputs_embeds = inputs_embeds)

In [14]:

encoder = BertGenerationEncoder.from_pretrained("bert-base-chinese", bos_token_id=101, eos_token_id=102)
# add cross attention layers and use BERT's cls token as BOS token and sep token as EOS token
decoder = BertGenerationDecoder.from_pretrained(
    "bert-base-chinese", add_cross_attention=True, is_decoder=True, bos_token_id=101, eos_token_id=102
)
bert_model = EncoderDecoderModel(encoder=encoder, decoder=decoder)
model = bert_model
#model = My_model(bert_model,embedding_matrix)


You are using a model of type bert to instantiate a model of type bert-generation. This is not supported for all configurations of models and can yield errors.
Some weights of the model checkpoint at bert-base-chinese were not used when initializing BertGenerationEncoder: ['cls.seq_relationship.bias', 'cls.predictions.transform.LayerNorm.bias', 'cls.predictions.decoder.weight', 'bert.pooler.dense.bias', 'cls.predictions.transform.dense.weight', 'cls.predictions.transform.LayerNorm.weight', 'cls.seq_relationship.weight', 'cls.predictions.transform.dense.bias', 'cls.predictions.bias', 'bert.embeddings.token_type_embeddings.weight', 'bert.pooler.dense.weight']
- This IS expected if you are initializing BertGenerationEncoder from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing BertGenerationEncoder from the checkpoint of a 

In [15]:
ContinueTrain = False
if ContinueTrain:
  out_path = "finetune_out/bert_model_baseline"

  model = torch.load(out_path)

In [16]:
optimizer = Adam(model.parameters(),lr=2e-5)
out_path = "/finetune_out/"
num_epoch = 50
model.config.decoder_start_token_id = tokenizer.cls_token_id
model.config.pad_token_id = tokenizer.pad_token_id
model.config.eos_token_id = tokenizer.sep_token_id
model.config.vocab_size = model.config.encoder.vocab_size
model.config.max_length = 500
model.config.min_length = 1
model.config.no_repeat_ngram_size = 3
model.config.early_stopping = True
model.config.length_penalty = 1.0
model.config.num_beams = 5
model.to(device)

EncoderDecoderModel(
  (encoder): BertGenerationEncoder(
    (embeddings): BertGenerationEmbeddings(
      (word_embeddings): Embedding(21128, 768, padding_idx=0)
      (position_embeddings): Embedding(512, 768)
      (LayerNorm): LayerNorm((768,), eps=1e-12, elementwise_affine=True)
      (dropout): Dropout(p=0.1, inplace=False)
    )
    (encoder): BertEncoder(
      (layer): ModuleList(
        (0): BertLayer(
          (attention): BertAttention(
            (self): BertSelfAttention(
              (query): Linear(in_features=768, out_features=768, bias=True)
              (key): Linear(in_features=768, out_features=768, bias=True)
              (value): Linear(in_features=768, out_features=768, bias=True)
              (dropout): Dropout(p=0.1, inplace=False)
            )
            (output): BertSelfOutput(
              (dense): Linear(in_features=768, out_features=768, bias=True)
              (LayerNorm): LayerNorm((768,), eps=1e-12, elementwise_affine=True)
              (d

In [17]:
'''
%%capture
!pip install git-python==1.0.3
'''

'\n%%capture\n!pip install git-python==1.0.3\n'

In [18]:

from transformers import Seq2SeqTrainer
from transformers import Seq2SeqTrainingArguments
Argument = Seq2SeqTrainingArguments(
    predict_with_generate=True,
    output_dir = "finetune_out/bert_model_baseline_test",
    evaluation_strategy="steps",
    eval_steps = 500,
    logging_strategy="steps",
    logging_steps = 500,
    save_strategy = "steps",
    save_steps = 100000,
    learning_rate = 2e-5,
    per_device_train_batch_size= 8,
    per_device_eval_batch_size=8,
    num_train_epochs= 70,
    load_best_model_at_end= True
)


In [19]:

trainer = Seq2SeqTrainer(
    model= model,
    # The training arguments.
    args=Argument,
    # The training dataset.
    train_dataset=train_data,
   
    eval_dataset=val_data,
    
    tokenizer=tokenizer
    
)


In [None]:
trainer.train()

***** Running training *****
  Num examples = 18529
  Num Epochs = 70
  Instantaneous batch size per device = 8
  Total train batch size (w. parallel, distributed & accumulation) = 8
  Gradient Accumulation steps = 1
  Total optimization steps = 162190
The following columns in the training set don't have a corresponding argument in `EncoderDecoderModel.forward` and have been ignored: text, ans, eq. If text, ans, eq are not expected by `EncoderDecoderModel.forward`,  you can safely ignore this message.


Step,Training Loss,Validation Loss
500,1.1203,0.248014
1000,0.2602,0.193364
1500,0.2064,0.170033
2000,0.1846,0.166626
2500,0.1771,0.163024
3000,0.1696,0.165784
3500,0.1623,0.156557
4000,0.1605,0.148463
4500,0.1579,0.152159
5000,0.1571,0.156872


***** Running Evaluation *****
  Num examples = 2316
  Batch size = 8
The following columns in the evaluation set don't have a corresponding argument in `EncoderDecoderModel.forward` and have been ignored: text, ans, eq. If text, ans, eq are not expected by `EncoderDecoderModel.forward`,  you can safely ignore this message.
***** Running Evaluation *****
  Num examples = 2316
  Batch size = 8
The following columns in the evaluation set don't have a corresponding argument in `EncoderDecoderModel.forward` and have been ignored: text, ans, eq. If text, ans, eq are not expected by `EncoderDecoderModel.forward`,  you can safely ignore this message.
***** Running Evaluation *****
  Num examples = 2316
  Batch size = 8
The following columns in the evaluation set don't have a corresponding argument in `EncoderDecoderModel.forward` and have been ignored: text, ans, eq. If text, ans, eq are not expected by `EncoderDecoderModel.forward`,  you can safely ignore this message.
***** Running Evaluati

Step,Training Loss,Validation Loss
500,1.1203,0.248014
1000,0.2602,0.193364
1500,0.2064,0.170033
2000,0.1846,0.166626
2500,0.1771,0.163024
3000,0.1696,0.165784
3500,0.1623,0.156557
4000,0.1605,0.148463
4500,0.1579,0.152159
5000,0.1571,0.156872


***** Running Evaluation *****
  Num examples = 2316
  Batch size = 8
The following columns in the evaluation set don't have a corresponding argument in `EncoderDecoderModel.forward` and have been ignored: text, ans, eq. If text, ans, eq are not expected by `EncoderDecoderModel.forward`,  you can safely ignore this message.
***** Running Evaluation *****
  Num examples = 2316
  Batch size = 8
The following columns in the evaluation set don't have a corresponding argument in `EncoderDecoderModel.forward` and have been ignored: text, ans, eq. If text, ans, eq are not expected by `EncoderDecoderModel.forward`,  you can safely ignore this message.
***** Running Evaluation *****
  Num examples = 2316
  Batch size = 8
The following columns in the evaluation set don't have a corresponding argument in `EncoderDecoderModel.forward` and have been ignored: text, ans, eq. If text, ans, eq are not expected by `EncoderDecoderModel.forward`,  you can safely ignore this message.
***** Running Evaluati

In [None]:
!nvidia-smi

In [None]:
'''
for epoch in range(num_epoch):  # loop over the dataset multiple times
    print("Start epoch ",epoch+1)
    running_loss = 0.0
    for k, data in enumerate(tqdm.tqdm(train_dataloader)):
        # zero the parameter gradients
        optimizer.zero_grad()

        inputs_ids = data["input_ids"].to(device)
        attention_mask = data["attention_mask"].to(device)
        labels = data["label"].to(device)
        ans = data["ans"]
        
        # get the inputs; data is a list of [inputs, labels]
        inputs, label, _,_, ans= data

        
        # turn tuple to list
        inputs = list(inputs)
        labels = list(label)
        ans = list(ans)
        
        # tokenize the input
        #inputs_ids = tokenizer(inputs, return_tensors="pt", padding=True, truncation=True).input_ids
        #labels = tokenizer(labels, return_tensors="pt", padding=True, truncation=True).input_ids  
        #inputs_ids = inputs_ids.to(device)
       # labels = label.to(device)


        # decoder_input_ids = tokenizer(labels[:-1], return_tensors="pt", padding=True, truncation=True).input_ids
        # labels = tokenizer(labels[1:], return_tensors="pt", padding=True, truncation=True).input_ids
            
        # forward + backward + optimize
        outputs = model(inputs_id=inputs_ids, decoder_inputs_id=labels)
        loss = outputs.loss
        
        outputs_check = model.generate(input_ids = inputs_ids,attention_mask = attention_mask)
        for i in range(len(outputs_check)):
            pred_eq = tokenizer.decode(outputs_check[i], skip_special_tokens=True)
            print(pred_eq)
            sum += 1
            try:
                if is_equal(eval(pred_eq),eval(ans[i])):
                  correct += 1
            except:
              pass
        
        loss.backward()
        
        optimizer.step()
        
        # print statistics
        running_loss += loss.item()
        
    print(f'[{epoch + 1}, {k + 1:5d}] loss: {running_loss / len(train_dataloader):.6f}')
    #print("Accuracy:"+ str(correct / sum))
    running_loss = 0.0
'''
print('Finished Training')

In [None]:
out_path = "finetune_out/bert_model_baseline"
torch.save(model, out_path)

After saving the model, we are free to get out the model here

In [None]:
out_path = "finetune_out/bert_model_baseline"

model = torch.load(out_path)

In [None]:
# inference
import torch.nn.functional as F  #for softmax function 
model = model.cuda()
input = tokenizer(
    "我 有 3 个 电脑，你 有 2 个 电脑，我们 一共 有 多少 个 电脑 ?", return_tensors="pt", padding=True, truncation=True
)# Batch size 1

inputs_ids = input.input_ids.to(device)
attention_mask = input.attention_mask.to(device)
outputs = model.generate(inputs_ids,attention_mask = attention_mask)
print(tokenizer.decode(outputs[0], skip_special_tokens=False))

In [None]:
#inference in batch
sum = 0
correct = 0
for k, data in enumerate(tqdm.tqdm(train_dataloader)):
    # get the inputs; data is a list of [inputs, labels]

    inputs_ids = data["input_ids"].to(device)
    attention_mask = data["attention_mask"].to(device)
    labels = data["label"].to(device)
    ans = data["ans"]

    '''
    # turn tuple to list
    inputs = list(inputs)
    label = list(labels)
    ans = list(ans)

    # tokenize the input
    input = tokenizer(inputs, return_tensors="pt", padding=True, truncation=True)
    inputs_ids = input.input_ids.to(device)
    '''
    #decoder_input = torch.tensor([[-100]]).to(device)
    #outputs = model(input_ids = inputs_ids, decoder_input_ids = decoder_input, return_dict=True)
    #outputs = outputs.logits.argmax(-1)
    outputs = model.generate(inputs_ids)
    for i in range(len(outputs)):
        pred_eq = tokenizer.decode(outputs[i], skip_special_tokens=True).replace(" ","")
        print(pred_eq)
        sum += 1
        try:
            if is_equal(eval(pred_eq),eval(ans[i])):
              correct += 1
        except:
            pass
          
print("Accuracy:"+ str(correct / sum))


Baseline log


Start epoch  65
100%|██████████| 1692/1692 [03:51<00:00,  7.32it/s]
[65,  1692] loss: 0.015328
Accuracy:0.7974720969768645
<br>
Start epoch  66
100%|██████████| 1692/1692 [03:54<00:00,  7.23it/s]
[66,  1692] loss: 0.008195<br>
Accuracy:0.8064158474388351
Start epoch  67
100%|██████████| 1692/1692 [03:49<00:00,  7.37it/s]
[67,  1692] loss: 0.006870<br>
Accuracy:0.8098898662133195
Start epoch  68
100%|██████████| 1692/1692 [04:04<00:00,  6.91it/s]
[68,  1692] loss: 0.018374<br>
Accuracy:0.7922980264616749
Start epoch  69
100%|██████████| 1692/1692 [03:53<00:00,  7.26it/s]
[69,  1692] loss: 0.007198<br>
Accuracy:0.8083376450587627
Start epoch  70
100%|██████████| 1692/1692 [03:51<00:00,  7.31it/s][70,  1692] loss: 0.009569
Accuracy:0.8074506615418731<br>
Finished Training



100%|██████████| 2317/2317 [01:36<00:00, 24.01it/s]Accuracy:0.4695727233491584

Baseline 5000 log

Start epoch  95
100%|██████████| 1692/1692 [04:53<00:00,  5.77it/s]
[95,  1692] loss: 0.014397<br>
Accuracy:0.7945894005469731
Start epoch  96
100%|██████████| 1692/1692 [04:53<00:00,  5.76it/s]
[96,  1692] loss: 0.017905<br>
Accuracy:0.7905240594278956
Start epoch  97
100%|██████████| 1692/1692 [04:53<00:00,  5.76it/s]
[97,  1692] loss: 0.014716<br>
Accuracy:0.7933328405647129
Start epoch  98
100%|██████████| 1692/1692 [04:54<00:00,  5.75it/s]
[98,  1692] loss: 0.027101<br>
Accuracy:0.7730800502623993
Start epoch  99
100%|██████████| 1692/1692 [04:52<00:00,  5.78it/s]
[99,  1692] loss: 0.027469<br>
Accuracy:0.7812107325005544
Start epoch  100
100%|██████████| 1692/1692 [04:53<00:00,  5.76it/s][100,  1692] loss: 0.014166
Accuracy:0.7976938428560869<br>
Finished Training<br>

100%|██████████| 2317/2317 [01:25<00:00, 27.02it/s]Accuracy:0.4695727233491584

CL 5000 log
Start epoch  95
100%|██████████| 1692/1692 [04:48<00:00,  5.86it/s]
[95,  1692] loss: 0.018977
Accuracy:0.7866804641880405 <br>
Start epoch  96
100%|██████████| 1692/1692 [04:48<00:00,  5.86it/s]
[96,  1692] loss: 0.020678
Accuracy:0.7894892453248578<br>
Start epoch  97
100%|██████████| 1692/1692 [04:48<00:00,  5.87it/s]
[97,  1692] loss: 0.019395
Accuracy:0.7870500406534112<br>
Start epoch  98
100%|██████████| 1692/1692 [04:47<00:00,  5.89it/s]
[98,  1692] loss: 0.012431
Accuracy:0.7951068075984922<br>
Start epoch  99
100%|██████████| 1692/1692 [04:48<00:00,  5.87it/s]
[99,  1692] loss: 0.022111
Accuracy:0.7860152265503733<br>
Start epoch  100
100%|██████████| 1692/1692 [04:46<00:00,  5.90it/s][100,  1692] loss: 0.012022
Accuracy:0.7986547416660507<br>
Finished Training

100%|██████████| 2317/2317 [01:25<00:00, 26.97it/s]Accuracy:0.47561501942166595