## 0. 参数设置

In [1]:
## Training parameters
max_train_epochs = 5
warmup_proportion = 0.1
gradient_accumulation_steps = 4
batch_size = 8
learning_rate = 2e-5
weight_decay = 0.01
max_grad_norm = 1.0

## Dataset parameters
training_set_split = 0.7
label_to_id = {'时尚': 0, '家居': 1, '教育': 2}
sample_ratio = 0.3

## 1. 读取数据集

In [2]:
import os
import time
import random
from tqdm import tqdm

import torch
from torch import nn, LongTensor
from torch.optim import AdamW
from torch.nn import functional as F
from torch.utils.data import DataLoader, Dataset

from transformers import get_linear_schedule_with_warmup
from transformers import BertTokenizer, BertForSequenceClassification

2023-05-13 15:03:13.522021: I tensorflow/core/platform/cpu_feature_guard.cc:193] This TensorFlow binary is optimized with oneAPI Deep Neural Network Library (oneDNN) to use the following CPU instructions in performance-critical operations:  AVX2 FMA
To enable them in other operations, rebuild TensorFlow with the appropriate compiler flags.
2023-05-13 15:03:14.712347: W tensorflow/compiler/xla/stream_executor/platform/default/dso_loader.cc:64] Could not load dynamic library 'libnvinfer.so.7'; dlerror: libnvinfer.so.7: cannot open shared object file: No such file or directory
2023-05-13 15:03:14.712457: W tensorflow/compiler/xla/stream_executor/platform/default/dso_loader.cc:64] Could not load dynamic library 'libnvinfer_plugin.so.7'; dlerror: libnvinfer_plugin.so.7: cannot open shared object file: No such file or directory


In [3]:
def get_file_label(categories, frac):
    _files, _labels = [], []
    for category in categories:
        dir_path = f'/THUCNews/{category}/'
        file_list = os.listdir(dir_path)
        file_list = random.sample(file_list, int(frac * len(file_list)))
        file_list = [dir_path + file for file in file_list]
        _files += file_list
        _labels += [category] * len(file_list)
    return _files, _labels

all_files, all_labels = get_file_label(categories=list(label_to_id.keys()), frac=sample_ratio)
dataset_length = len(all_labels)
data_list = [(all_files[idx], all_labels[idx]) for idx in range(dataset_length)]

In [4]:
from torchdata.datapipes.map import SequenceWrapper

datapipe = SequenceWrapper(data_list).shuffle()
train_datapipe, test_datapipe = datapipe.random_split(
    total_length=dataset_length, 
    weights={"train": training_set_split, "test": 1-training_set_split}, 
    seed=0
)

In [5]:
tokenizer = BertTokenizer.from_pretrained('bert-base-chinese')
tokenizer("飞屋环游记")

{'input_ids': [101, 7607, 2238, 4384, 3952, 6381, 102], 'token_type_ids': [0, 0, 0, 0, 0, 0, 0], 'attention_mask': [1, 1, 1, 1, 1, 1, 1]}

In [6]:
def get_text(file):
    text = []
    with open(file, encoding='utf8') as f:
        [text.append(line.strip()) for line in f if line.strip()]
    text = " ".join(text)
    return text


def collate_batch(batch):    
    r = tokenizer([get_text(b[0]) for b in batch], padding='max_length', max_length=512, truncation=True)
    input_ids = LongTensor(r['input_ids'])
    attention_mask = LongTensor(r['attention_mask'])
    label = LongTensor([label_to_id[b[1]] for b in batch])

    return input_ids, attention_mask, label

In [7]:
train_loader = DataLoader(train_datapipe, batch_size=batch_size, collate_fn=collate_batch)
test_loader = DataLoader(test_datapipe, batch_size=batch_size, collate_fn=collate_batch)

next(iter(train_loader))

(tensor([[ 101, 2990, 1184,  ..., 2349, 7770,  102],
         [ 101, 8166, 2399,  ...,  702, 6121,  102],
         [ 101, 3173, 3857,  ..., 3698, 1962,  102],
         ...,
         [ 101, 5299, 1745,  ...,    0,    0,    0],
         [ 101,  860, 7741,  ..., 6574, 7030,  102],
         [ 101, 7305, 4970,  ...,  679,  788,  102]]),
 tensor([[1, 1, 1,  ..., 1, 1, 1],
         [1, 1, 1,  ..., 1, 1, 1],
         [1, 1, 1,  ..., 1, 1, 1],
         ...,
         [1, 1, 1,  ..., 0, 0, 0],
         [1, 1, 1,  ..., 1, 1, 1],
         [1, 1, 1,  ..., 1, 1, 1]]),
 tensor([2, 1, 1, 2, 2, 0, 2, 1]))

In [8]:
def get_score():
    y_true = []
    y_pred = []
    for step, batch in enumerate(tqdm(test_loader)):
        model.eval()            # turn to Evaluation Mode
        with torch.no_grad():
            input_ids, attention_mask = (b.to(device) for b in batch[:2])
        y_true += batch[2].numpy().tolist()
        logist = model(input_ids, attention_mask)[0]
        result = torch.argmax(logist, 1).cpu().numpy().tolist()
        y_pred += result
    correct = 0
    for i in range(len(y_true)):
        if y_true[i] == y_pred[i]:
            correct += 1
    accuracy = correct / len(y_pred)
    
    return accuracy

In [9]:
if torch.cuda.is_available():
    device = torch.device('cuda')
else:
    device = torch.device('cpu')

In [10]:
## Set model
model = BertForSequenceClassification.from_pretrained('bert-base-chinese', num_labels=len(label_to_id))
model.to(device)
print(model.config)

## Optimizer settings
no_decay = ['bias', 'LayerNorm.weight'] # No decay for bias and LayerNorm
param_optimizer = list(model.named_parameters())
optimizer_grouped_parameters = [
    {'params': [p for n, p in param_optimizer if not any(nd in n for nd in no_decay)], 'weight_decay': weight_decay},
    {'params': [p for n, p in param_optimizer if any(nd in n for nd in no_decay)], 'weight_decay': 0.0}
]
optimizer = AdamW(optimizer_grouped_parameters, lr=learning_rate)
# print("Parameter Names:", [name for name, _ in param_optimizer if not any(nd in name for nd in no_decay)])

## Scheduler settings
total_steps = int(dataset_length * training_set_split) // gradient_accumulation_steps * max_train_epochs + 1
warmup_steps = int(warmup_proportion * total_steps)
scheduler = get_linear_schedule_with_warmup(optimizer, num_warmup_steps=warmup_steps, num_training_steps=total_steps)
print(f'Training(total) Steps: {total_steps}\nWarm-up Steps: {warmup_steps}')

Some weights of the model checkpoint at bert-base-chinese were not used when initializing BertForSequenceClassification: ['cls.predictions.transform.dense.weight', 'cls.predictions.transform.LayerNorm.bias', 'cls.seq_relationship.bias', 'cls.predictions.bias', 'cls.seq_relationship.weight', 'cls.predictions.transform.LayerNorm.weight', 'cls.predictions.decoder.weight', 'cls.predictions.transform.dense.bias']
- This IS expected if you are initializing BertForSequenceClassification from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing BertForSequenceClassification from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).
Some weights of BertForSequenceClassification were not initialized from the model checkpoint at

BertConfig {
  "_name_or_path": "bert-base-chinese",
  "architectures": [
    "BertForMaskedLM"
  ],
  "attention_probs_dropout_prob": 0.1,
  "classifier_dropout": null,
  "directionality": "bidi",
  "hidden_act": "gelu",
  "hidden_dropout_prob": 0.1,
  "hidden_size": 768,
  "id2label": {
    "0": "LABEL_0",
    "1": "LABEL_1",
    "2": "LABEL_2"
  },
  "initializer_range": 0.02,
  "intermediate_size": 3072,
  "label2id": {
    "LABEL_0": 0,
    "LABEL_1": 1,
    "LABEL_2": 2
  },
  "layer_norm_eps": 1e-12,
  "max_position_embeddings": 512,
  "model_type": "bert",
  "num_attention_heads": 12,
  "num_hidden_layers": 12,
  "pad_token_id": 0,
  "pooler_fc_size": 768,
  "pooler_num_attention_heads": 12,
  "pooler_num_fc_layers": 3,
  "pooler_size_per_head": 128,
  "pooler_type": "first_token_transform",
  "position_embedding_type": "absolute",
  "transformers_version": "4.27.2",
  "type_vocab_size": 2,
  "use_cache": true,
  "vocab_size": 21128
}

Training(total) Steps: 23066
Warm-up Steps

In [None]:
for epoch in range(max_train_epochs):
    b_time = time.time()
    model.train()                 # turn to Training Mode
    for step, batch in enumerate(tqdm(train_loader)):
        input_ids, attention_mask, label = [b.to(device) for b in batch]
        loss = model(input_ids, attention_mask, labels=label)
        loss = loss[0]
        loss.backward()
        if (step + 1) % gradient_accumulation_steps == 0:
            optimizer.step()
            scheduler.step() 
            optimizer.zero_grad()
    print('Epoch = %d Epoch Mean Loss %.4f Time %.2f min' % (epoch+1, loss.item(), (time.time() - b_time)/60))
    print(get_score())

2307it [19:15,  2.00it/s]


Epoch = 1 Epoch Mean Loss 0.0196 Time 19.26 min


989it [03:12,  5.13it/s]


0.9922882427307206


2307it [19:10,  2.01it/s]


Epoch = 2 Epoch Mean Loss 0.0031 Time 19.17 min


989it [03:11,  5.18it/s]


0.9932996207332491


2307it [19:13,  2.00it/s]


Epoch = 3 Epoch Mean Loss 0.0016 Time 19.22 min


989it [03:10,  5.20it/s]


0.9973451327433628


2307it [19:05,  2.01it/s]


Epoch = 4 Epoch Mean Loss 0.0030 Time 19.10 min


989it [03:09,  5.21it/s]


0.9987357774968394


2307it [19:09,  2.01it/s]


Epoch = 5 Epoch Mean Loss 0.0014 Time 19.16 min


599it [01:55,  5.27it/s]