In [2]:
import os
import sys
import warnings; warnings.filterwarnings("ignore")
import numpy as np
import pandas as pd
import torch as th
import torch.optim as optim
from torch import nn
from torch.utils.data import random_split
from datasets import (load_dataset, load_from_disk, Dataset)
from transformers import (AutoTokenizer, AutoModel, BertTokenizer, BertModel,
                          AutoModelForCausalLM, AutoModelForSequenceClassification,
                          BitsAndBytesConfig, TrainingArguments,
                          DataCollatorWithPadding, DataCollatorForLanguageModeling,
                          DataCollatorForSeq2Seq, DataCollatorForTokenClassification,
                          Trainer)
from sklearn.metrics import (recall_score, precision_score, f1_score, confusion_matrix)

In [3]:
# Pick the compute device: CUDA when torch reports it as available, else CPU.
if th.cuda.is_available():
    device = th.device("cuda")
else:
    device = th.device("cpu")

# NOTE(review): `devive_cnt` is a misspelling of `device_cnt`; the name (and the
# printed label) is left unchanged so any existing reference keeps working.
devive_cnt = th.cuda.device_count()
print(f"device = {device}; devive_cnt = {devive_cnt}")

device = cuda; devive_cnt = 1


In [4]:
# The notebook's working directory is treated as the project directory; the
# data/, model/ and output/ folders are resolved as siblings of it (i.e. they
# live in the parent of the working directory).
path_project = os.getcwd()
path_data, path_model, path_output = (
    os.path.join(os.path.dirname(path_project), folder)
    for folder in ("data", "model", "output")
)

## step-1: 载入数据源

In [5]:
# Arrow file to load; it is joined onto path_data when the dataset is read.
filename = "axb/super_glue-test.arrow"

In [6]:
# Load the local Arrow file directly with load_dataset (split="all"), then
# split it 80/20 into train/test, stratified on the "label" column.
# shuffle=True with seed=0 keeps the split reproducible across runs.
dataset = load_dataset(
    path="arrow",
    data_files=os.path.join(path_data, filename),
    split="all",
).train_test_split(
    test_size=0.2,
    stratify_by_column="label",
    shuffle=True,
    seed=0,
)

In [7]:
# Display the split summary (split names, feature columns, row counts).
dataset

DatasetDict({
    train: Dataset({
        features: ['sentence1', 'sentence2', 'idx', 'label'],
        num_rows: 883
    })
    test: Dataset({
        features: ['sentence1', 'sentence2', 'idx', 'label'],
        num_rows: 221
    })
})

## step-2: tokenizer

In [8]:
# Checkpoint directory name; it is joined onto path_model when the tokenizer
# and the base model are loaded.
checkpoint = "bert-large-uncased"

In [9]:
# Load the tokenizer from the local checkpoint directory only
# (local_files_only=True, force_download=False).
# NOTE(review): trust_remote_code=True is presumably unnecessary for a stock
# BERT checkpoint — confirm, and consider dropping it if no custom code is used.
tokenizer = AutoTokenizer.from_pretrained(
    os.path.join(path_model, checkpoint),
    cache_dir=path_model,
    local_files_only=True,
    force_download=False,
    trust_remote_code=True,
)

In [10]:
# Inspect the padding and end-of-sequence tokens, one per line.
print(tokenizer.pad_token, tokenizer.eos_token, sep="\n")

[PAD]
None


## step-3: 配置量化参数

In [11]:
# Quantization settings: 8-bit loading is the active option; the 4-bit options
# below are kept as a commented-out alternative.
# NOTE(review): the model-loading cell has `quantization_config=config_bnb`
# commented out, so this object is currently not applied to the model.
config_bnb = BitsAndBytesConfig(
    load_in_8bit=True,
    # load_in_4bit=True,
    # bnb_4bit_quant_type="nf4",
    # bnb_4bit_compute_dtype=th.bfloat16,
    # bnb_4bit_use_double_quant=True
)

## step-4: 载入基础大模型

In [12]:
# Load the sequence-classification model from the local checkpoint directory
# only (local_files_only=True, force_download=False).
# The device placement, half-precision and quantization options are kept
# below as commented-out alternatives and are not active.
model_base = AutoModelForSequenceClassification.from_pretrained(
    os.path.join(path_model, checkpoint),
    cache_dir=path_model,
    local_files_only=True,
    force_download=False,
    trust_remote_code=True,
    # device_map="auto",
    # torch_dtype=th.float16,
    # quantization_config=config_bnb
)

Some weights of BertForSequenceClassification were not initialized from the model checkpoint at c:\my_project\MyGit\Machine-Learning-Column\model\bert-large-uncased and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


In [13]:
# List every named parameter with its position, shape, dtype and device.
for idx, (name, tensor) in enumerate(model_base.named_parameters()):
    print(f"{idx}  name: {name};  shape: {tensor.shape};  dtype: {tensor.dtype};  device: {tensor.device}")

0  name: bert.embeddings.word_embeddings.weight;  shape: torch.Size([30522, 1024]);  dtype: torch.float32;  device: cpu
1  name: bert.embeddings.position_embeddings.weight;  shape: torch.Size([512, 1024]);  dtype: torch.float32;  device: cpu
2  name: bert.embeddings.token_type_embeddings.weight;  shape: torch.Size([2, 1024]);  dtype: torch.float32;  device: cpu
3  name: bert.embeddings.LayerNorm.weight;  shape: torch.Size([1024]);  dtype: torch.float32;  device: cpu
4  name: bert.embeddings.LayerNorm.bias;  shape: torch.Size([1024]);  dtype: torch.float32;  device: cpu
5  name: bert.encoder.layer.0.attention.self.query.weight;  shape: torch.Size([1024, 1024]);  dtype: torch.float32;  device: cpu
6  name: bert.encoder.layer.0.attention.self.query.bias;  shape: torch.Size([1024]);  dtype: torch.float32;  device: cpu
7  name: bert.encoder.layer.0.attention.self.key.weight;  shape: torch.Size([1024, 1024]);  dtype: torch.float32;  device: cpu
8  name: bert.encoder.layer.0.attention.self.ke

In [14]:
# Freeze every parameter whose name starts with "bert"; parameters with any
# other name are left untouched.
for name, param in model_base.named_parameters():
    if not name.startswith("bert"):
        continue
    param.requires_grad_(False)

## step-5: 定义整理函数

In [15]:
# task: text classification
def tokenize_function(sample):
    """Tokenize sentence pairs and attach their labels.

    Args:
        sample: Mapping with "sentence1", "sentence2" and "label" entries.
            The call site maps it with batched=True, so each entry is
            presumably a list of values — confirm against the dataset.

    Returns:
        The tokenizer output (truncated to at most 512 tokens) with an
        additional "labels" entry copied from ``sample["label"]``.
    """
    encoded = tokenizer(
        text=sample["sentence1"],
        text_pair=sample["sentence2"],
        truncation=True,
        max_length=512,
    )
    encoded["labels"] = sample["label"]
    return encoded

In [None]:
# task: text generation
# def tokenize_function(sample):
#     inputs = tokenizer(text=sample["query"], max_length=512, truncation=True)
#     labels = tokenizer(text_target=sample["response"], max_length=128, truncation=True)
#     inputs["labels"] = labels["input_ids"]
#     return inputs

In [None]:
# task: text generation-glm
# def tokenize_function(sample):
#     inputs = tokenizer(text=sample["query"], max_length=512, truncation=True, padding="max_length", return_tensors="pt")
#     inputs = tokenizer.build_inputs_for_generation(inputs, target=sample["response"], max_gen_length=128, padding=True)
#     return inputs

In [16]:
# Tokenize both splits in batches and drop the original columns (taken from the
# train split's column names), leaving only what tokenize_function returns.
dataset_tokenized = dataset.map(
    function=tokenize_function,
    batched=True,
    remove_columns=dataset["train"].column_names,
)

In [17]:
# Display the tokenized splits to check the resulting feature columns.
dataset_tokenized

DatasetDict({
    train: Dataset({
        features: ['input_ids', 'token_type_ids', 'attention_mask', 'labels'],
        num_rows: 883
    })
    test: Dataset({
        features: ['input_ids', 'token_type_ids', 'attention_mask', 'labels'],
        num_rows: 221
    })
})

In [18]:
# Convenience handles for the two tokenized splits.
dataset_train_tokenized, dataset_test_tokenized = (
    dataset_tokenized["train"],
    dataset_tokenized["test"],
)

In [19]:
# Batch collator for the classification task.
# Alternatives kept for the other task types (inactive):
#   collate_fn = DataCollatorForLanguageModeling(tokenizer, mlm=False)
#   collate_fn = DataCollatorForSeq2Seq(tokenizer, padding=True)
#   collate_fn = DataCollatorForTokenClassification(tokenizer)
collate_fn = DataCollatorWithPadding(tokenizer=tokenizer)

## step-6: 配置模型参数

In [20]:
# Hyper-parameters gathered in one place. In the visible cells only "epochs",
# "batch_size", "gradient_steps", "learning_rate" and "weight_decay" are read
# (by the TrainingArguments cell).
# NOTE(review): the key "max_seq_lenght" is misspelled ("length"); it is kept
# verbatim because it is a runtime dictionary key.
# NOTE(review): the saved training log reports learning_rate 0.009 after epoch
# 1 of 10, which suggests that run started from 0.01 rather than the 0.001 set
# here — re-run the notebook to refresh the outputs.
config_model = dict(
    embedding_dim=1024,
    hidden_dim=512,
    dropout=0.2,
    epochs=10,
    batch_size=64,
    gradient_steps=1,
    learning_rate=0.001,
    weight_decay=0.01,
    max_seq_lenght=512,
)

## step-7: 模型训练

In [30]:
# Move the base model to the selected device and prepare it for training.
# NOTE(review): nn.Module.to() presumably returns the same module object, so
# model_sft would alias model_base rather than copy it — confirm.
# NOTE(review): this cell carries execution count In [30] between In [20] and
# In [22]; re-run the notebook top to bottom to confirm it works in order.
model_sft = model_base.to(device)
# Turn on gradient checkpointing on the model itself.
model_sft.gradient_checkpointing_enable() 
# Presumably needed so gradients flow through checkpointed blocks while the
# "bert" parameters are frozen — TODO confirm.
model_sft.enable_input_require_grads()
# Disable the config's cache flag for training.
model_sft.config.use_cache = False

In [22]:
# Count parameter elements: everything in the model vs. only the tensors that
# still require gradients, then report the trainable share as a percentage.
all_params = sum(param.numel() for param in model_sft.parameters())
trainable_params = sum(
    param.numel() for param in model_sft.parameters() if param.requires_grad
)

print(f"trainable params: {trainable_params} || all params: {all_params} || trainable%: {100 * trainable_params / all_params:.4f}")

trainable params: 2050 || all params: 335143938 || trainable%: 0.0006


In [23]:
def compute_metrics(eval_predict):
    """Compute the F1 score for one evaluation pass.

    Args:
        eval_predict: Pair ``(predictions, labels)`` as handed over by the
            Trainer. ``predictions`` holds per-class scores along the last
            axis. If it arrives as a tuple of arrays, the first element is
            used as the class scores.

    Returns:
        dict: ``{"f1": <score>}`` where the score comes from ``f1_score``
        called with its default arguments on the argmax class ids.
    """
    preds, labels = eval_predict
    # A tuple has no .argmax, so the original code raised AttributeError when
    # the model returned more than one output; take the first entry (assumed
    # to be the logits — verify if the model's output order is customised).
    if isinstance(preds, tuple):
        preds = preds[0]
    preds = preds.argmax(axis=-1)
    return {"f1": f1_score(labels, preds)}

In [24]:
# Training configuration; numeric hyper-parameters are read from config_model.
# NOTE(review): recent transformers releases reportedly rename
# `evaluation_strategy` to `eval_strategy` — confirm against the installed
# version before upgrading.
args_train = TrainingArguments(
    # Where checkpoints are written.
    output_dir=os.path.join(path_output, "model_sft"),
    # Schedule and batch sizes.
    num_train_epochs=config_model.get("epochs"),
    per_device_train_batch_size=config_model.get("batch_size"),
    per_device_eval_batch_size=config_model.get("batch_size"),
    gradient_accumulation_steps=config_model.get("gradient_steps"),
    gradient_checkpointing=True,
    # Optimizer.
    optim="adamw_torch",
    learning_rate=config_model.get("learning_rate"),
    weight_decay=config_model.get("weight_decay"),
    # Log, evaluate and checkpoint once per epoch; keep at most 3 checkpoints.
    logging_strategy="epoch",
    evaluation_strategy="epoch",
    save_strategy="epoch",
    save_total_limit=3,
    # Select the best checkpoint by the "f1" metric and reload it at the end.
    metric_for_best_model="f1",
    load_best_model_at_end=True,
)

In [25]:
# Wire the model, data, collator, metrics and training arguments together.
# The tokenized test split doubles as the evaluation set.
trainer = Trainer(
    model=model_sft,
    args=args_train,
    train_dataset=dataset_train_tokenized,
    eval_dataset=dataset_test_tokenized,
    data_collator=collate_fn,
    tokenizer=tokenizer,
    compute_metrics=compute_metrics,
)

In [26]:
# Run training; the value returned by trainer.train() is kept in res_train.
res_train = trainer.train()

  0%|          | 0/140 [00:00<?, ?it/s]

{'loss': 2.1506, 'grad_norm': 4.360788345336914, 'learning_rate': 0.009000000000000001, 'epoch': 1.0}


  0%|          | 0/4 [00:00<?, ?it/s]

{'eval_loss': 1.9693963527679443, 'eval_f1': 0.015384615384615384, 'eval_runtime': 2.1395, 'eval_samples_per_second': 103.298, 'eval_steps_per_second': 1.87, 'epoch': 1.0}
{'loss': 2.3663, 'grad_norm': 24.328449249267578, 'learning_rate': 0.008, 'epoch': 2.0}


  0%|          | 0/4 [00:00<?, ?it/s]

{'eval_loss': 0.7942917943000793, 'eval_f1': 0.7335243553008596, 'eval_runtime': 2.1391, 'eval_samples_per_second': 103.314, 'eval_steps_per_second': 1.87, 'epoch': 2.0}
{'loss': 1.0314, 'grad_norm': 3.5117805004119873, 'learning_rate': 0.006999999999999999, 'epoch': 3.0}


  0%|          | 0/4 [00:00<?, ?it/s]

{'eval_loss': 1.6912957429885864, 'eval_f1': 0.7335243553008596, 'eval_runtime': 2.1641, 'eval_samples_per_second': 102.119, 'eval_steps_per_second': 1.848, 'epoch': 3.0}
{'loss': 1.6823, 'grad_norm': 11.935359954833984, 'learning_rate': 0.006, 'epoch': 4.0}


  0%|          | 0/4 [00:00<?, ?it/s]

{'eval_loss': 0.703606367111206, 'eval_f1': 0.1875, 'eval_runtime': 2.1464, 'eval_samples_per_second': 102.963, 'eval_steps_per_second': 1.864, 'epoch': 4.0}
{'loss': 1.0164, 'grad_norm': 18.509540557861328, 'learning_rate': 0.005, 'epoch': 5.0}


  0%|          | 0/4 [00:00<?, ?it/s]

{'eval_loss': 0.6919796466827393, 'eval_f1': 0.7371428571428571, 'eval_runtime': 2.1584, 'eval_samples_per_second': 102.392, 'eval_steps_per_second': 1.853, 'epoch': 5.0}
{'loss': 1.413, 'grad_norm': 22.21833610534668, 'learning_rate': 0.004, 'epoch': 6.0}


  0%|          | 0/4 [00:00<?, ?it/s]

{'eval_loss': 1.2213325500488281, 'eval_f1': 0.015384615384615384, 'eval_runtime': 2.1858, 'eval_samples_per_second': 101.105, 'eval_steps_per_second': 1.83, 'epoch': 6.0}
{'loss': 1.0191, 'grad_norm': 7.280005931854248, 'learning_rate': 0.003, 'epoch': 7.0}


  0%|          | 0/4 [00:00<?, ?it/s]

{'eval_loss': 1.144714117050171, 'eval_f1': 0.7335243553008596, 'eval_runtime': 2.0962, 'eval_samples_per_second': 105.43, 'eval_steps_per_second': 1.908, 'epoch': 7.0}
{'loss': 0.8856, 'grad_norm': 7.667726039886475, 'learning_rate': 0.002, 'epoch': 8.0}


  0%|          | 0/4 [00:00<?, ?it/s]

{'eval_loss': 0.7121362686157227, 'eval_f1': 0.1232876712328767, 'eval_runtime': 2.0603, 'eval_samples_per_second': 107.265, 'eval_steps_per_second': 1.941, 'epoch': 8.0}
{'loss': 0.7652, 'grad_norm': 3.4028127193450928, 'learning_rate': 0.001, 'epoch': 9.0}


  0%|          | 0/4 [00:00<?, ?it/s]

{'eval_loss': 0.684851884841919, 'eval_f1': 0.7371428571428571, 'eval_runtime': 2.1015, 'eval_samples_per_second': 105.162, 'eval_steps_per_second': 1.903, 'epoch': 9.0}
{'loss': 0.6987, 'grad_norm': 4.860345363616943, 'learning_rate': 0.0, 'epoch': 10.0}


  0%|          | 0/4 [00:00<?, ?it/s]

{'eval_loss': 0.6815494894981384, 'eval_f1': 0.7371428571428571, 'eval_runtime': 2.1235, 'eval_samples_per_second': 104.072, 'eval_steps_per_second': 1.884, 'epoch': 10.0}
{'train_runtime': 119.7151, 'train_samples_per_second': 73.758, 'train_steps_per_second': 1.169, 'train_loss': 1.3028578553880965, 'epoch': 10.0}


## step-8: 模型推理

In [55]:
# One example sentence pair for a quick inference check; `sents` holds the
# same pair as a single-element list of tuples.
sent1, sent2 = (
    "Missouri lawmakers are considering a boycott of companies that boycott Israel.",
    "Missouri lawmakers are considering a government boycott of companies that boycott Israel.",
)
sents = [(sent1, sent2)]

In [56]:
# Encode the pair as PyTorch tensors (truncated to 512 tokens) and move the
# encoding to the selected device.
# Alternative using the list of pairs (inactive):
#   inputs = tokenizer(sents, max_length=512, truncation=True, return_tensors="pt")
inputs = tokenizer(
    sent1,
    sent2,
    max_length=512,
    truncation=True,
    return_tensors="pt",
).to(device)

In [57]:
# Put the model in eval mode and run one forward pass under inference_mode.
model_sft.eval()
with th.inference_mode():
    out_mlp = model_sft(**inputs)
    # Softmax over dim 1 turns the logits into per-class probabilities.
    y_hat = out_mlp.logits.softmax(dim=1)
    # Predicted class = index of the largest probability in each row.
    y_pred = y_hat.argmax(dim=1)

print(y_hat, y_pred, sep="\n")

tensor([[0.3943, 0.6057]], device='cuda:0')
tensor([1], device='cuda:0')
