In [1]:
import os
import sys
import warnings; warnings.filterwarnings("ignore")
import numpy as np
import pandas as pd
import torch as th
import torch.optim as optim
from torch import nn
from torch.utils.data import random_split
from datasets import (load_dataset, load_from_disk, Dataset)
from transformers import (AutoTokenizer, AutoModel, BertTokenizer, BertModel,
                          AutoModelForCausalLM, AutoModelForSequenceClassification,
                          BitsAndBytesConfig, TrainingArguments,
                          DataCollatorWithPadding, DataCollatorForLanguageModeling,
                          DataCollatorForSeq2Seq, DataCollatorForTokenClassification,
                          pipeline)

In [2]:
device = th.device("cuda" if th.cuda.is_available() else "cpu")
devive_cnt = th.cuda.device_count()
print(f"device = {device}; devive_cnt = {devive_cnt}")

device = cuda; devive_cnt = 1


In [3]:
path_project = os.getcwd()
path_data = os.path.join(os.path.dirname(path_project), "data")
path_model = os.path.join(os.path.dirname(path_project), "model")
path_output = os.path.join(os.path.dirname(path_project), "output")

## step-1: 载入数据源

In [4]:
filename = "axb/super_glue-test.arrow"

In [5]:
# 自定义 Dataset 类
class Dataset(th.utils.data.Dataset):
    def __init__(self, filename, data_type):
        super(Dataset, self).__init__()
        self.dataset = load_dataset(
            path=data_type,
            data_files=os.path.join(path_data, filename),
            split="all"
        )

    def __len__(self):
        return len(self.dataset)

    def __getitem__(self, i):
        sentence1 = self.dataset[i]["sentence1"]
        sentence2 = self.dataset[i]["sentence2"]
        label = self.dataset[i]["label"]
        return sentence1, sentence2, label

In [6]:
dataset = Dataset(filename, data_type="arrow")
dataset_train, dataset_test = random_split(dataset, lengths=[0.8, 0.2])

In [7]:
dataset_train[0]

('Notorious B.I.G. passed away.',
 'During Notorious B.I.G.\'s funeral procession through the streets of Brooklyn, someone interrupted the somber atmosphere by playing "Hyponotize" at full volume, which prompted the public to dance and sing along.',
 1)

## step-2: tokenizer

In [8]:
checkpoint = "bert-large-uncased"

In [9]:
tokenizer = AutoTokenizer.from_pretrained(
    pretrained_model_name_or_path=os.path.join(path_model, checkpoint),
    cache_dir=path_model,
    force_download=False,
    local_files_only=True,
    trust_remote_code=True
)

In [10]:
print(tokenizer.pad_token)
print(tokenizer.eos_token)

[PAD]
None


## step-3: 配置量化参数

In [11]:
config_bnb = BitsAndBytesConfig(
    load_in_8bit=True,
    # load_in_4bit=True,
    # bnb_4bit_quant_type="nf4",
    # bnb_4bit_compute_dtype=th.bfloat16,
    # bnb_4bit_use_double_quant=True
)

## step-4: 载入基础大模型

In [12]:
model_base = BertModel.from_pretrained(
    pretrained_model_name_or_path=os.path.join(path_model, checkpoint),
    cache_dir=path_model,
    force_download=False,
    local_files_only=True,
    trust_remote_code=True,
    # device_map="auto",
    # torch_dtype=th.float16,
    # quantization_config=config_bnb
)

The argument `trust_remote_code` is to be used with Auto classes. It has no effect here and is ignored.


In [13]:
for i, (name, parm) in enumerate(model_base.named_parameters()):
    print(f"{i}  name: {name};  shape: {parm.shape};  dtype: {parm.dtype};  device: {parm.device}")

0  name: embeddings.word_embeddings.weight;  shape: torch.Size([30522, 1024]);  dtype: torch.float32;  device: cpu
1  name: embeddings.position_embeddings.weight;  shape: torch.Size([512, 1024]);  dtype: torch.float32;  device: cpu
2  name: embeddings.token_type_embeddings.weight;  shape: torch.Size([2, 1024]);  dtype: torch.float32;  device: cpu
3  name: embeddings.LayerNorm.weight;  shape: torch.Size([1024]);  dtype: torch.float32;  device: cpu
4  name: embeddings.LayerNorm.bias;  shape: torch.Size([1024]);  dtype: torch.float32;  device: cpu
5  name: encoder.layer.0.attention.self.query.weight;  shape: torch.Size([1024, 1024]);  dtype: torch.float32;  device: cpu
6  name: encoder.layer.0.attention.self.query.bias;  shape: torch.Size([1024]);  dtype: torch.float32;  device: cpu
7  name: encoder.layer.0.attention.self.key.weight;  shape: torch.Size([1024, 1024]);  dtype: torch.float32;  device: cpu
8  name: encoder.layer.0.attention.self.key.bias;  shape: torch.Size([1024]);  dtype: t

In [14]:
for param in model_base.parameters():
    param.requires_grad_(False)

## step-5: 定义整理函数

In [15]:
def collate_fn(dataset):
    sents = [x[0:2] for x in dataset]
    labels = [x[2] for x in dataset]
    # max_length = max(len(x[0]) + len(x[1]) for x in sents) + 3

    # 编码
    inputs = tokenizer.batch_encode_plus(batch_text_or_text_pairs=sents,
                                         truncation=True,
                                         padding="max_length",
                                        #  max_length=max_length,
                                         max_length=512,
                                         add_special_tokens=True,
                                         return_token_type_ids=True,
                                         return_attention_mask=True,
                                         return_special_tokens_mask=True,
                                         return_tensors="pt",
                                         return_length=True)

    labels = th.LongTensor(labels)  # torch.int64
    return inputs, labels

## step-6: 配置模型参数

In [16]:
config_model = {
    "embedding_dim": 1024,
    "hidden_dim": 512,
    "dropout": 0.2,
    "epochs": 3,
    "batch_size": 64,
    # "gradient_steps": 1,
    "learning_rate": 0.001,
    "weight_decay": 0.01,
    # "max_seq_lenght": 512
}

## step-7: 模型训练

In [17]:
class Model(th.nn.Module):
    def __init__(self, model_base, config_model):
        super(Model, self).__init__()
        self.model_base = model_base
        self.embedding_dim = config_model.get("embedding_dim")
        self.hidden_dim = config_model.get("hidden_dim")
        self.dropout = config_model.get("dropout")

        self.lstm = th.nn.LSTM(input_size=self.embedding_dim,
                               hidden_size=self.hidden_dim,
                               num_layers=1,
                               dropout=self.dropout,
                               batch_first=True,
                               bidirectional=True)

        self.mlp = th.nn.Sequential(
            # layer-1
            th.nn.Linear(in_features=self.hidden_dim * 2, out_features=256),
            th.nn.ReLU(),
            th.nn.Dropout(p=self.dropout),
            th.nn.LayerNorm(normalized_shape=256),
            # layer-2
            th.nn.Linear(in_features=256, out_features=128),
            th.nn.ReLU(),
            th.nn.Dropout(p=self.dropout),
            th.nn.LayerNorm(normalized_shape=128),
            # layer-3
            th.nn.Linear(in_features=128, out_features=64),
            th.nn.ReLU(),
            th.nn.Dropout(p=self.dropout),
            th.nn.LayerNorm(normalized_shape=64),
            # layer-out
            th.nn.Linear(in_features=64, out_features=2)
        )

    def forward(self, inputs):
        # bert layer
        tokens = inputs["input_ids"]
        segments = inputs["token_type_ids"]
        valid_lens = inputs["attention_mask"]
        
        output_bert = self.model_base(
            input_ids=tokens,
            token_type_ids=segments,
            attention_mask=valid_lens
        ).last_hidden_state

        # lstm layer
        output_lstm, [ht, ct] = self.lstm(output_bert)

        # mlp_layer
        out_mlp = self.mlp(output_lstm[:, 0, :])
        return out_mlp

In [18]:
model_sft = Model(model_base, config_model).to(device)

In [19]:
trainable_params = 0
all_params = 0

for param in model_sft.parameters():
    if param.requires_grad:
        trainable_params += param.numel()
    all_params += param.numel()

print(f"trainable params: {trainable_params} || all params: {all_params} || trainable%: {100 * trainable_params / all_params:.4f}")

trainable params: 6604226 || all params: 341746114 || trainable%: 1.9325


In [20]:
opti = optim.AdamW(params=model_sft.parameters(), 
                   lr=config_model.get("learning_rate"), 
                   betas=(0.9, 0.999), 
                   eps=10**-8, 
                   weight_decay=config_model.get("weight_decay"))
objt = nn.CrossEntropyLoss(reduction="mean")
epochs = config_model.get("epochs")


In [21]:
loader_train = th.utils.data.DataLoader(dataset=dataset_train,
                                        batch_size=config_model.get("batch_size"),
                                        collate_fn=collate_fn,
                                        shuffle=True,
                                        drop_last=True)

In [22]:
loader_test = th.utils.data.DataLoader(dataset=dataset_test,
                                       batch_size=config_model.get("batch_size"),
                                       collate_fn=collate_fn,
                                       shuffle=False,
                                       drop_last=False)

In [23]:
for epoch in range(epochs):
    # train
    loss_train_tmp = 0
    model_sft.train()
    for (i, (inputs, labels)) in enumerate(loader_train):
        inputs = inputs.to(device)
        labels = labels.to(device)

        output_mlp = model_sft(inputs)
        loss = objt(output_mlp, labels)
        loss_train_tmp += loss.item()

        opti.zero_grad()
        loss.backward()
        opti.step()
    loss_train = loss_train_tmp / (i + 1)
    
    # test
    loss_test_tmp = 0
    model_sft.eval()
    for (i, (inputs, labels)) in enumerate(loader_test):
        inputs = inputs.to(device)
        labels = labels.to(device)
        
        output_mlp = model_sft(inputs)
        loss = objt(output_mlp, labels)
        loss_test_tmp += loss.item()
    loss_test = loss_test_tmp / (i + 1)
    
    print(f"epoch {epoch}  loss_train {loss_train:.4f}  loss_test {loss_test:.4f}")

epoch 0  loss_train 0.7302  loss_test 0.6561
epoch 1  loss_train 0.6971  loss_test 0.6740
epoch 2  loss_train 0.6897  loss_test 0.6545


## step-8: 模型推理

In [24]:
sent1 = "Missouri lawmakers are considering a boycott of companies that boycott Israel."
sent2 = "Missouri lawmakers are considering a government boycott of companies that boycott Israel."
sents = [(sent1, sent2)]

In [25]:
inputs = tokenizer.batch_encode_plus(batch_text_or_text_pairs=sents,
                                     truncation=True,
                                     padding="max_length",
                                     max_length=512,
                                     add_special_tokens=True,
                                     return_token_type_ids=True,
                                     return_attention_mask=True,
                                     return_special_tokens_mask=True,
                                     return_tensors="pt",
                                     return_length=True)
inputs = inputs.to(device)

In [26]:
model_sft.eval()
with th.inference_mode():
    out_mlp = model_sft(inputs)
    y_hat = th.softmax(out_mlp, dim=1)
    y_pred = th.argmax(y_hat, dim=1)

print(y_pred)

tensor([1], device='cuda:0')
