## 12.7.1 载入数据

In [1]:
# 数据导入到列表
academy_titles, job_titles = [], []

with open(r'academy_titles.txt', encoding='utf8') as f:
    for l in f:
        academy_titles.append(l.strip())

with open(r'job_titles.txt', encoding='utf8') as f:
    for l in f:
        job_titles.append(l.strip())

print(academy_titles[:5])
print(job_titles[:5])

['北师教育学，你我一起努力，让胜利酣畅淋漓。', '考博英语词汇', '出售人大新闻学院2015年考研权威资料', '【脑科院 郭桃梅课题组】科研助理招聘', '管理学院的同学帮帮忙呐～']
['【字节跳动内推】校招岗位全面开放，帮查进度！', '招聘兼职/ 笔试考务 /200-300 每人', '国企出版社招聘坐班兼职生', '【在线早教】教研实习生招聘', '【兼职】心理学公众号寻兼职写手']


In [2]:
data_list = []

for title in academy_titles:
    data_list.append((title, 0))

for title in job_titles:
    data_list.append((title, 1))

In [3]:
max_length = 0
for case in data_list:
    max_length = max(max_length, len(case[0])+2)
print(f"Max length of titles: {max_length}")

Max length of titles: 77


## 12.7.2 导入包和设置参数

In [4]:
import os
import time
import random
import torch
from torch import nn, LongTensor
from torch.optim import AdamW
import torch.nn.functional as F
from tqdm import tqdm

from transformers import get_linear_schedule_with_warmup
from transformers import BertTokenizer, BertForSequenceClassification

In [5]:
if torch.cuda.is_available():
    device = torch.device('cuda')
else:
    device = torch.device('cpu')

In [6]:
## Training parameters
max_train_epochs = 5
warmup_proportion = 0.05
gradient_accumulation_steps = 4
batch_size = 8
learning_rate = 2e-5
weight_decay = 0.01
max_grad_norm = 1.0

## Dataset parameters
training_set_split = 0.7
dataset_length = len(data_list)

## 12.7.3 定义DataPipe和DataLoader

In [7]:
tokenizer = BertTokenizer.from_pretrained('bert-base-chinese')

def text_to_token(text):
    return tokenizer.encode_plus(text, max_length=max_length, padding="max_length")

text_to_token("飞屋环游记")

{'input_ids': [101, 7607, 2238, 4384, 3952, 6381, 102, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0], 'token_type_ids': [0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0], 'attention_mask': [1, 1, 1, 1, 1, 1, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0]}

In [8]:
from torchdata.datapipes.map import SequenceWrapper

datapipe = SequenceWrapper(data_list).shuffle()
train_datapipe, test_datapipe = datapipe.random_split(
    total_length=dataset_length, 
    weights={"train": training_set_split, "test": 1-training_set_split}, 
    seed=0
)

In [9]:
next(iter(train_datapipe))

('【脑科院 郭桃梅课题组】科研助理招聘', 0)

In [10]:
def collate_batch(batch):
    r = tokenizer([b[0] for b in batch], padding=True)
    input_ids = LongTensor(r['input_ids'])
    attention_mask = LongTensor(r['attention_mask'])
    label = LongTensor([b[1] for b in batch])
    return input_ids, attention_mask, label

In [11]:
from torch.utils.data import DataLoader

train_loader = DataLoader(train_datapipe, batch_size=batch_size, collate_fn=collate_batch)
test_loader = DataLoader(test_datapipe, batch_size=batch_size, collate_fn=collate_batch)

In [12]:
next(iter(train_loader))

(tensor([[  101,  4343,  1767,  2875,   772,  1501,  6817,  5852,  8020,  6440,
           4923,  1403,  8021,  2141,   739,  4495,  1568,  8013,   102,     0,
              0,     0,     0],
         [  101,  2875,  5470,  8038,  2825,  3318,  3118,  2898,   113,  7270,
           3309,  3300,  3126,  8021,   102,     0,     0,     0,     0,     0,
              0,     0,     0],
         [  101,  8127,  5440,  4777,  1325,  1380,  2110,  7368,  4638,  2110,
           7270,  2110,  1995,   812,  1762,  1525,  7027,   102,     0,     0,
              0,     0,     0],
         [  101,  1266,  1920,  7032,  6084,   683,  4798,  7444,  6206,  5440,
           3124,  3780,   720,   102,     0,     0,     0,     0,     0,     0,
              0,     0,     0],
         [  101,  1160,  2365,  8024,  2347,  2875,  1168,   102,     0,     0,
              0,     0,     0,     0,     0,     0,     0,     0,     0,     0,
              0,     0,     0],
         [  101,  8232,  2399,   809,  1

## 12.7.4 定义评估函数

In [13]:
def get_score():
    y_true = []
    y_pred = []
    for step, batch in enumerate(tqdm(test_loader)):
        model.eval()            # turn to Evaluation Mode
        with torch.no_grad():
            input_ids, attention_mask = (b.to(device) for b in batch[:2])
        y_true += batch[2].numpy().tolist()
        logist = model(input_ids, attention_mask)[0]
        result = torch.argmax(logist, 1).cpu().numpy().tolist()
        y_pred += result
    correct = 0
    for i in range(len(y_true)):
        if y_true[i] == y_pred[i]:
            correct += 1
    accuracy = correct / len(y_pred)
    
    return accuracy

## 12.7.5 定义模型

In [14]:
model = BertForSequenceClassification.from_pretrained('bert-base-chinese')
model.to(device)

## Optimizer settings
no_decay = ['bias', 'LayerNorm.weight'] # No decay for bias and LayerNorm
param_optimizer = list(model.named_parameters())
optimizer_grouped_parameters = [
    {'params': [p for n, p in param_optimizer if not any(nd in n for nd in no_decay)], 'weight_decay': weight_decay},
    {'params': [p for n, p in param_optimizer if any(nd in n for nd in no_decay)], 'weight_decay': 0.0}
]
optimizer = AdamW(optimizer_grouped_parameters, lr=learning_rate)
print("Parameter Names:", [name for name, _ in param_optimizer if not any(nd in name for nd in no_decay)])

## Scheduler settings
total_steps = int(dataset_length * training_set_split) // gradient_accumulation_steps * max_train_epochs + 1
warmup_steps = int(warmup_proportion * total_steps)
scheduler = get_linear_schedule_with_warmup(optimizer, num_warmup_steps=warmup_steps, num_training_steps=total_steps)
print(f'Training(total) Steps: {total_steps}\nWarm-up Steps: {warmup_steps}')

Some weights of the model checkpoint at bert-base-chinese were not used when initializing BertForSequenceClassification: ['cls.predictions.transform.dense.weight', 'cls.seq_relationship.weight', 'cls.predictions.decoder.weight', 'cls.predictions.transform.LayerNorm.weight', 'cls.predictions.transform.dense.bias', 'cls.predictions.transform.LayerNorm.bias', 'cls.seq_relationship.bias', 'cls.predictions.bias']
- This IS expected if you are initializing BertForSequenceClassification from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing BertForSequenceClassification from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).
Some weights of BertForSequenceClassification were not initialized from the model checkpoint at

Parameter Names: ['bert.embeddings.word_embeddings.weight', 'bert.embeddings.position_embeddings.weight', 'bert.embeddings.token_type_embeddings.weight', 'bert.encoder.layer.0.attention.self.query.weight', 'bert.encoder.layer.0.attention.self.key.weight', 'bert.encoder.layer.0.attention.self.value.weight', 'bert.encoder.layer.0.attention.output.dense.weight', 'bert.encoder.layer.0.intermediate.dense.weight', 'bert.encoder.layer.0.output.dense.weight', 'bert.encoder.layer.1.attention.self.query.weight', 'bert.encoder.layer.1.attention.self.key.weight', 'bert.encoder.layer.1.attention.self.value.weight', 'bert.encoder.layer.1.attention.output.dense.weight', 'bert.encoder.layer.1.intermediate.dense.weight', 'bert.encoder.layer.1.output.dense.weight', 'bert.encoder.layer.2.attention.self.query.weight', 'bert.encoder.layer.2.attention.self.key.weight', 'bert.encoder.layer.2.attention.self.value.weight', 'bert.encoder.layer.2.attention.output.dense.weight', 'bert.encoder.layer.2.intermediate

In [15]:
for epoch in range(max_train_epochs):
    b_time = time.time()
    model.train()                 # turn to Training Mode
    for step, batch in enumerate(tqdm(train_loader)):
        input_ids, attention_mask, label = [b.to(device) for b in batch]
        loss = model(input_ids, attention_mask, labels=label)
        loss = loss[0]
        loss.backward()
        if (step + 1) % gradient_accumulation_steps == 0:
            optimizer.step()
            scheduler.step() 
            optimizer.zero_grad()
    print('Epoch = %d Epoch Mean Loss %.4f Time %.2f min' % (epoch+1, loss.item(), (time.time() - b_time)/60))
    print(get_score())

622it [04:43,  2.19it/s]


Epoch = 1 Epoch Mean Loss 0.0107 Time 4.73 min


267it [00:36,  7.39it/s]


0.9990623534927332


622it [04:55,  2.11it/s]


Epoch = 2 Epoch Mean Loss 0.0012 Time 4.92 min


267it [00:36,  7.33it/s]


1.0


622it [04:52,  2.12it/s]


Epoch = 3 Epoch Mean Loss 0.0014 Time 4.88 min


267it [00:35,  7.60it/s]


1.0


622it [04:52,  2.13it/s]


Epoch = 4 Epoch Mean Loss 0.0002 Time 4.88 min


267it [00:35,  7.50it/s]


1.0


622it [04:52,  2.13it/s]


Epoch = 5 Epoch Mean Loss 0.0009 Time 4.87 min


267it [00:38,  6.98it/s]

0.9990623534927332



