# NLP的主要任务

## Token 分类任务

实体命名识别 （NER）：找出句子中的实体（如人物、地点或组织）。这可以通过为每个实体指定一个类别的标签，如果没有实体则会输出无实体的标签。

词性标注 （POS）：将句子中的每个单词标记为对应于特定的词性（如名词、动词、形容词等）。

分块（chunking）：找出属于同一实体的 tokens 这个任务（可以与词性标注或命名实体识别结合）可以被描述为将位于块开头的 token 赋予一个标签（通常是 “ B- ” （Begin），代表该token位于实体的开头），将位于块内的 tokens 赋予另一个标签（通常是 “ I- ”（inner）代表该token位于实体的内部），将不属于任何块的 tokens 赋予第三个标签（通常是 “ O ” （outer）代表该token不属于任何实体）

In [1]:
#CoNLL-2003 数据集
from datasets import load_dataset

raw_datasets = load_dataset("conll2003",trust_remote_code=True)

In [2]:
raw_datasets["train"].to_pandas().head()

Unnamed: 0,id,tokens,pos_tags,chunk_tags,ner_tags
0,0,"[EU, rejects, German, call, to, boycott, Briti...","[22, 42, 16, 21, 35, 37, 16, 21, 7]","[11, 21, 11, 12, 21, 22, 11, 12, 0]","[3, 0, 7, 0, 0, 0, 7, 0, 0]"
1,1,"[Peter, Blackburn]","[22, 22]","[11, 12]","[1, 2]"
2,2,"[BRUSSELS, 1996-08-22]","[22, 11]","[11, 12]","[5, 0]"
3,3,"[The, European, Commission, said, on, Thursday...","[12, 22, 22, 38, 15, 22, 28, 38, 15, 16, 21, 3...","[11, 12, 12, 21, 13, 11, 11, 21, 13, 11, 12, 1...","[0, 3, 4, 0, 0, 0, 0, 0, 0, 7, 0, 0, 0, 0, 0, ..."
4,4,"[Germany, 's, representative, to, the, Europea...","[22, 27, 21, 35, 12, 22, 22, 27, 16, 21, 22, 2...","[11, 11, 12, 13, 11, 12, 12, 11, 12, 12, 12, 1...","[5, 0, 0, 0, 0, 3, 4, 0, 0, 0, 1, 2, 0, 0, 0, ..."


In [3]:

for part,dataset in raw_datasets.items():
    dataset.to_json(f"./data/conll2003-{part}.jsonl")

Creating json from Arrow format:   0%|          | 0/15 [00:00<?, ?ba/s]

Creating json from Arrow format:   0%|          | 0/4 [00:00<?, ?ba/s]

Creating json from Arrow format:   0%|          | 0/4 [00:00<?, ?ba/s]

In [4]:
raw_datasets['train'].features

{'id': Value(dtype='string', id=None),
 'tokens': Sequence(feature=Value(dtype='string', id=None), length=-1, id=None),
 'pos_tags': Sequence(feature=ClassLabel(names=['"', "''", '#', '$', '(', ')', ',', '.', ':', '``', 'CC', 'CD', 'DT', 'EX', 'FW', 'IN', 'JJ', 'JJR', 'JJS', 'LS', 'MD', 'NN', 'NNP', 'NNPS', 'NNS', 'NN|SYM', 'PDT', 'POS', 'PRP', 'PRP$', 'RB', 'RBR', 'RBS', 'RP', 'SYM', 'TO', 'UH', 'VB', 'VBD', 'VBG', 'VBN', 'VBP', 'VBZ', 'WDT', 'WP', 'WP$', 'WRB'], id=None), length=-1, id=None),
 'chunk_tags': Sequence(feature=ClassLabel(names=['O', 'B-ADJP', 'I-ADJP', 'B-ADVP', 'I-ADVP', 'B-CONJP', 'I-CONJP', 'B-INTJ', 'I-INTJ', 'B-LST', 'I-LST', 'B-NP', 'I-NP', 'B-PP', 'I-PP', 'B-PRT', 'I-PRT', 'B-SBAR', 'I-SBAR', 'B-UCP', 'I-UCP', 'B-VP', 'I-VP'], id=None), length=-1, id=None),
 'ner_tags': Sequence(feature=ClassLabel(names=['O', 'B-PER', 'I-PER', 'B-ORG', 'I-ORG', 'B-LOC', 'I-LOC', 'B-MISC', 'I-MISC'], id=None), length=-1, id=None)}

In [3]:
ner_feature = raw_datasets["train"].features["ner_tags"]
ner_feature

Sequence(feature=ClassLabel(names=['O', 'B-PER', 'I-PER', 'B-ORG', 'I-ORG', 'B-LOC', 'I-LOC', 'B-MISC', 'I-MISC'], id=None), length=-1, id=None)

In [4]:
label_names = ner_feature.feature.names
label_names

['O', 'B-PER', 'I-PER', 'B-ORG', 'I-ORG', 'B-LOC', 'I-LOC', 'B-MISC', 'I-MISC']

In [5]:
words = raw_datasets["train"][0]["tokens"]
labels = raw_datasets["train"][0]["ner_tags"]
line1 = ""
line2 = ""
for word, label in zip(words, labels):
    full_label = label_names[label]
    max_length = max(len(word), len(full_label))
    line1 += word + " " * (max_length - len(word) + 1)
    line2 += full_label + " " * (max_length - len(full_label) + 1)

print(line1)
print(line2)

EU    rejects German call to boycott British lamb . 
B-ORG O       B-MISC O    O  O       B-MISC  O    O 


In [6]:
from transformers import AutoTokenizer

model_checkpoint = "bert-base-cased"
tokenizer = AutoTokenizer.from_pretrained(model_checkpoint)
inputs = tokenizer(raw_datasets["train"][0]["tokens"], is_split_into_words=True) # 文本已经是单词列表了
inputs.tokens() # tokenizer自动添加了两个符号，并且将lamb拆分了，对应的ner_tags也应该多三个

['[CLS]',
 'EU',
 'rejects',
 'German',
 'call',
 'to',
 'boycott',
 'British',
 'la',
 '##mb',
 '.',
 '[SEP]']

In [10]:
inputs.word_ids() #可以看到tokenized结果的单词属于原来单词的位置


[None, 0, 1, 2, 3, 4, 5, 6, 7, 7, 8, None]

In [7]:
def align_labels_with_tokens(labels, word_ids):
    new_labels = []
    current_word = None
    for word_id in word_ids:
        if word_id != current_word:
            # 新单词的开始!
            current_word = word_id
            # 特殊 tokens 的标签设置为 -100 。这是因为默认情况下， -100 会被我们的损失函数（交叉熵）忽略
            label = -100 if word_id is None else labels[word_id]
            new_labels.append(label)
        elif word_id is None:
            # 特殊的token
            new_labels.append(-100)
        else:
            # 与前一个 tokens 类型相同的单词
            label = labels[word_id]
            # 如果标签是 B-XXX 我们将其更改为 I-XXX
            if label % 2 == 1:
                label += 1
            new_labels.append(label)

    return new_labels

In [8]:
labels = raw_datasets['train'][0]['ner_tags']
inputs = tokenizer(raw_datasets['train'][0]['tokens'],is_split_into_words=True)
new_labels = align_labels_with_tokens(labels,inputs.word_ids())
print(labels)
new_labels

[3, 0, 7, 0, 0, 0, 7, 0, 0]


[-100, 3, 0, 7, 0, 0, 0, 7, 0, 0, 0, -100]

In [9]:
inputs = tokenizer(raw_datasets['train'][:2]['tokens'],is_split_into_words=True)
print(inputs.word_ids(0),inputs.tokens(0))
print(inputs.word_ids(1),inputs.tokens(1))

[None, 0, 1, 2, 3, 4, 5, 6, 7, 7, 8, None] ['[CLS]', 'EU', 'rejects', 'German', 'call', 'to', 'boycott', 'British', 'la', '##mb', '.', '[SEP]']
[None, 0, 1, None] ['[CLS]', 'Peter', 'Blackburn', '[SEP]']


In [10]:
# 将上面的tokenize和对其label的操作封装函数
def tokenize_and_align_labels(examples):
    tokenized_inputs = tokenizer(
        examples["tokens"], truncation=True, is_split_into_words=True
    )
    all_labels = examples["ner_tags"]
    new_labels = []
    for i, labels in enumerate(all_labels):
        word_ids = tokenized_inputs.word_ids(i)
        new_labels.append(align_labels_with_tokens(labels, word_ids))

    tokenized_inputs["labels"] = new_labels
    return tokenized_inputs

tokenized_datasets = raw_datasets.map(tokenize_and_align_labels,batched=True,remove_columns=raw_datasets['train'].column_names)

In [11]:
tokenized_datasets

DatasetDict({
    train: Dataset({
        features: ['input_ids', 'token_type_ids', 'attention_mask', 'labels'],
        num_rows: 14041
    })
    validation: Dataset({
        features: ['input_ids', 'token_type_ids', 'attention_mask', 'labels'],
        num_rows: 3250
    })
    test: Dataset({
        features: ['input_ids', 'token_type_ids', 'attention_mask', 'labels'],
        num_rows: 3453
    })
})

In [12]:
# 不能像 第三章 那样直接使用 DataCollatorWithPadding 因为那样只会填充输入（inputs ID、注意掩码和 tokens 类型 ID）。除了输入部分，在这里我们还需要对标签也使用与输入完全相同的方式填充
from transformers import DataCollatorForTokenClassification

data_collator = DataCollatorForTokenClassification(tokenizer=tokenizer)
sample = [tokenized_datasets["train"][i] for i in range(2)]
print(sample)
batch = data_collator(sample)
print(batch) # 可以看到labels也被填充了

[{'input_ids': [101, 7270, 22961, 1528, 1840, 1106, 21423, 1418, 2495, 12913, 119, 102], 'token_type_ids': [0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0], 'attention_mask': [1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1], 'labels': [-100, 3, 0, 7, 0, 0, 0, 7, 0, 0, 0, -100]}, {'input_ids': [101, 1943, 14428, 102], 'token_type_ids': [0, 0, 0, 0], 'attention_mask': [1, 1, 1, 1], 'labels': [-100, 1, 2, -100]}]
{'input_ids': tensor([[  101,  7270, 22961,  1528,  1840,  1106, 21423,  1418,  2495, 12913,
           119,   102],
        [  101,  1943, 14428,   102,     0,     0,     0,     0,     0,     0,
             0,     0]]), 'token_type_ids': tensor([[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0],
        [0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0]]), 'attention_mask': tensor([[1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1],
        [1, 1, 1, 1, 0, 0, 0, 0, 0, 0, 0, 0]]), 'labels': tensor([[-100,    3,    0,    7,    0,    0,    0,    7,    0,    0,    0, -100],
        [-100,    1,    2, -100, -100, -100, -100, -100, -100, -100, -10

In [13]:
# 对比一下填充前后的label
print(tokenized_datasets['train'][1]['labels'])
print(batch['labels'][1]) # 填充后的

[-100, 1, 2, -100]
tensor([-100,    1,    2, -100, -100, -100, -100, -100, -100, -100, -100, -100])


In [17]:
# 每个周期计算一次指标 需要定义compute_metrics函数，输入：预测值和标签数组，返回带有指标名称的评估结果字典
# 用于评估 Token 分类预测的经典框架是 seqeval 。要使用此指标，我们首先需要安装 seqeval 
# !pip install seqeval


In [14]:
import evaluate
metric = evaluate.load("seqeval")

In [15]:
labels = raw_datasets['train'][0]['ner_tags']
labels = [label_names[i] for i in labels]
print(labels) # 第一个样本的的label

prediction = labels.copy()
print(prediction)
prediction[2] = 'O' # 捏造出一个预测
metric.compute(predictions=[prediction],references=[labels]) #输入是预测列表（不是一个）和标签列表

['B-ORG', 'O', 'B-MISC', 'O', 'O', 'O', 'B-MISC', 'O', 'O']
['B-ORG', 'O', 'B-MISC', 'O', 'O', 'O', 'B-MISC', 'O', 'O']


{'MISC': {'precision': 1.0,
  'recall': 0.5,
  'f1': 0.6666666666666666,
  'number': 2},
 'ORG': {'precision': 1.0, 'recall': 1.0, 'f1': 1.0, 'number': 1},
 'overall_precision': 1.0,
 'overall_recall': 0.6666666666666666,
 'overall_f1': 0.8,
 'overall_accuracy': 0.8888888888888888}

In [16]:
# 通常模型预测结果是logit和真实标签结果labels，需要手动argmax logits
import numpy as np


def compute_metrics(eval_preds):
    logits, labels = eval_preds
    predictions = np.argmax(logits, axis=-1)

    # 删除忽略的索引(特殊 tokens )并转换为标签
    true_labels = [[label_names[l] for l in label if l != -100] for label in labels]
    true_predictions = [
        [label_names[p] for (p, l) in zip(prediction, label) if l != -100]
        for prediction, label in zip(predictions, labels)
    ]
    all_metrics = metric.compute(predictions=true_predictions, references=true_labels)
    return {
        "precision": all_metrics["overall_precision"],
        "recall": all_metrics["overall_recall"],
        "f1": all_metrics["overall_f1"],
        "accuracy": all_metrics["overall_accuracy"],
    }

In [17]:
# token分类问题，需要提供标签转换器
id2label = {str(i): label for i, label in enumerate(label_names)}
label2id = {v: k for k, v in id2label.items()}
print(id2label)
print(label2id)

{'0': 'O', '1': 'B-PER', '2': 'I-PER', '3': 'B-ORG', '4': 'I-ORG', '5': 'B-LOC', '6': 'I-LOC', '7': 'B-MISC', '8': 'I-MISC'}
{'O': '0', 'B-PER': '1', 'I-PER': '2', 'B-ORG': '3', 'I-ORG': '4', 'B-LOC': '5', 'I-LOC': '6', 'B-MISC': '7', 'I-MISC': '8'}


In [18]:
from transformers import AutoModelForTokenClassification

model = AutoModelForTokenClassification.from_pretrained(
    model_checkpoint,
    id2label=id2label,
    label2id=label2id,
)
model.config.num_labels # 查看模型的标签数量对不对

Some weights of BertForTokenClassification were not initialized from the model checkpoint at bert-base-cased and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


9

In [22]:
# 微调模型
# from huggingface_hub import notebook_login

# notebook_login()

from transformers import TrainingArguments

args = TrainingArguments(
    output_dir='./my_model/bert-finetuned-ner',
    evaluation_strategy='epoch',
    save_strategy='epoch',
    learning_rate=2e-5,
    num_train_epochs=3,
    weight_decay=0.01,
    push_to_hub=False, # 如果登录了，可以设置为True保存到 hf
)



In [None]:
from transformers import Trainer

# 使用trainer就不需要dataloader，只要把原来给dataloader的datacollator给trainer就行了

trainer = Trainer(
    model = model,
    args=args,
    train_dataset=tokenized_datasets['train'],
    eval_dataset=tokenized_datasets['validation'],
    data_collator=data_collator,
    compute_metrics=compute_metrics,
    tokenizer=tokenizer
)
model.num_parameters() # 110M的参数参与训练
trainer.train()

In [44]:
from transformers import AutoModelForTokenClassification
# 加载本地的微调好的模型
model = AutoModelForTokenClassification.from_pretrained('./my_model/bert-finetuned-ner/checkpoint-5268')
print(model)

BertForTokenClassification(
  (bert): BertModel(
    (embeddings): BertEmbeddings(
      (word_embeddings): Embedding(28996, 768, padding_idx=0)
      (position_embeddings): Embedding(512, 768)
      (token_type_embeddings): Embedding(2, 768)
      (LayerNorm): LayerNorm((768,), eps=1e-12, elementwise_affine=True)
      (dropout): Dropout(p=0.1, inplace=False)
    )
    (encoder): BertEncoder(
      (layer): ModuleList(
        (0-11): 12 x BertLayer(
          (attention): BertAttention(
            (self): BertSdpaSelfAttention(
              (query): Linear(in_features=768, out_features=768, bias=True)
              (key): Linear(in_features=768, out_features=768, bias=True)
              (value): Linear(in_features=768, out_features=768, bias=True)
              (dropout): Dropout(p=0.1, inplace=False)
            )
            (output): BertSelfOutput(
              (dense): Linear(in_features=768, out_features=768, bias=True)
              (LayerNorm): LayerNorm((768,), eps=1e-12

In [54]:
# 简单测试一下
from torch.utils.data import DataLoader
import torch

train_dataloader = DataLoader(
    tokenized_datasets['train'],shuffle=False,batch_size=8,collate_fn=data_collator
)
for batch in train_dataloader:
    break
# batch = {k:v.to(torch.device('cuda')) for k,v in batch.items()}
outputs = model(**batch)
# print(outputs)
print(outputs.logits.shape)
out = torch.argmax(outputs.logits,dim=-1)
print(out[0].tolist())
res = [label_names[p] for p in out[0].tolist()]
print(tokenizer.decode(batch['input_ids'][0]))
print(res)

torch.Size([8, 50, 9])
[0, 3, 0, 7, 0, 0, 0, 7, 0, 0, 0, 0, 0, 3, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 3, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 3, 3, 0, 0, 0, 0]
[CLS] EU rejects German call to boycott British lamb. [SEP] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD]
['O', 'B-ORG', 'O', 'B-MISC', 'O', 'O', 'O', 'B-MISC', 'O', 'O', 'O', 'O', 'O', 'B-ORG', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'B-ORG', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'B-ORG', 'B-ORG', 'O', 'O', 'O', 'O']


### pytorch微调模型

In [20]:
from torch.utils.data import DataLoader
from torch.optim import AdamW
from accelerate import Accelerator
from transformers import get_scheduler
from tqdm.auto import tqdm
import torch

# 1. 训练和验证数据
train_dataloader = DataLoader(
    tokenized_datasets["train"],
    shuffle=True,
    collate_fn=data_collator,
    batch_size=8,
)
eval_dataloader = DataLoader(
    tokenized_datasets["validation"], collate_fn=data_collator, batch_size=8
)

# 2. 模型
model = AutoModelForTokenClassification.from_pretrained(
    model_checkpoint,
    id2label=id2label,
    label2id=label2id,
)

# 3. 优化器
optimizer = AdamW(model.parameters(), lr=2e-5)

# 4. 加速器重定义 数据+模型+优化器
accelerator = Accelerator()
model, optimizer, train_dataloader, eval_dataloader = accelerator.prepare(
    model, optimizer, train_dataloader, eval_dataloader
)

# 5. 学习率调度器
num_train_epochs = 3
num_update_steps_per_epoch = len(train_dataloader)
num_training_steps = num_train_epochs * num_update_steps_per_epoch

lr_scheduler = get_scheduler(
    "linear",
    optimizer=optimizer,
    num_warmup_steps=0,
    num_training_steps=num_training_steps,
)

# 6. 评估函数（模型结果是logits） 返回转为文字的预测和标签
def postprocess(predictions, labels):  # 模型预测，真实标签，都是数字
    predictions = predictions.detach().cpu().clone().numpy()
    labels = labels.detach().cpu().clone().numpy()

    # 删除忽略的索引(特殊 tokens )并转换为标签
    true_labels = [[label_names[l] for l in label if l != -100] for label in labels]
    true_predictions = [
        [label_names[p] for (p, l) in zip(prediction, label) if l != -100]
        for prediction, label in zip(predictions, labels)
    ]
    return true_labels, true_predictions

# 7. 训练循环
progress_bar = tqdm(range(num_training_steps))

output_dir = './my_model/torch/bert=finetuned-ner'
for epoch in range(num_train_epochs):
    # 训练
    model.train()
    for batch in train_dataloader:
        outputs = model(**batch)
        loss = outputs.loss
        accelerator.backward(loss)

        optimizer.step()
        lr_scheduler.step()
        optimizer.zero_grad()
        progress_bar.update(1)

    # 评估
    model.eval()
    for batch in eval_dataloader:
        with torch.no_grad():
            outputs = model(**batch)

        predictions = outputs.logits.argmax(dim=-1)
        labels = batch["labels"]

        # 因为模型的数据分布在不同GPU，不同GPU上的batch长度不一样，调整为一样，填充模型的预测和标签后才能调用 gathere()
        predictions = accelerator.pad_across_processes(predictions, dim=1, pad_index=-100)
        labels = accelerator.pad_across_processes(labels, dim=1, pad_index=-100)

        predictions_gathered = accelerator.gather(predictions) # 收集各个GPU的计算结果
        labels_gathered = accelerator.gather(labels)

        true_predictions, true_labels = postprocess(predictions_gathered, labels_gathered)
        metric.add_batch(predictions=true_predictions, references=true_labels)

    results = metric.compute()
    print(
        f"epoch {epoch}:",
        {
            key: results[f"overall_{key}"]
            for key in ["precision", "recall", "f1", "accuracy"]
        },
    )

    # 保存并上传
    accelerator.wait_for_everyone()
    unwrapped_model = accelerator.unwrap_model(model) # 模型被包装到了分布式类里，这里直接获取原始模型
    unwrapped_model.save_pretrained(output_dir, save_function=accelerator.save)
    if accelerator.is_main_process:
        tokenizer.save_pretrained(output_dir)
        # repo.push_to_hub(
        #     commit_message=f"Training in progress epoch {epoch}", blocking=False
        # )


Some weights of BertForTokenClassification were not initialized from the model checkpoint at bert-base-cased and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


  0%|          | 0/5268 [00:00<?, ?it/s]

epoch 0: {'precision': 0.9363850555368562, 'recall': 0.9029535864978903, 'f1': 0.9193654990085922, 'accuracy': 0.9817360334373344}
epoch 1: {'precision': 0.9434533826994278, 'recall': 0.9223428759460349, 'f1': 0.9327787021630615, 'accuracy': 0.9846353093542121}
epoch 2: {'precision': 0.9473241332884551, 'recall': 0.9246057818659659, 'f1': 0.9358270989193683, 'accuracy': 0.9851798434096662}


### PEFT LORA模型注意点

1. 调用get_peft_model（） 后，使用的基础模型结构是会被修改的，添加 LoRA 适配器层。
2. LORA微调的时候，如果指定了target_modules=["query", "key", "value", "classifier"]也就是lora作用的模块，如果不指定，LORA的trainer保存的时候保存的模块参数和作用的模块lora参数不一致，导致加载模型失败（没有合并lora和基础模型情况下分开加载）
3. LORA微调后一定要保存模型，合并保存！merged_model = model.merge_and_unload()

In [None]:
# 如果用lora来做微调会快很多
from transformers import AutoModelForTokenClassification
# 加载本地的微调好的模型 加载本地模型之前，一定要
model = AutoModelForTokenClassification.from_pretrained('./my_model/bert-finetuned-ner/checkpoint-5268')
from peft import LoraConfig,get_peft_model,TaskType
config = LoraConfig(TaskType.TOKEN_CLS,lora_alpha=32,lora_dropout=0.1,r=8,)
model = get_peft_model(model=model,peft_config=config)

from transformers import Trainer,TrainingArguments

# model.to(torch.device('cuda'))
args = TrainingArguments(
    output_dir='./my_model/lora/bert-finetuned-ner',
    learning_rate=5e-5,
    eval_strategy='epoch',
    save_strategy='epoch',
    num_train_epochs=3,
    weight_decay=0.1,
    push_to_hub=False,
)
trainer = Trainer(
    model=model,
    args=args,
    train_dataset=tokenized_datasets['train'],
    eval_dataset=tokenized_datasets['validation'],
    data_collator=data_collator,
    compute_metrics=compute_metrics,
    tokenizer=tokenizer,
)
trainer.train()

In [49]:
model.print_trainable_parameters() # 训练参数降低到了0.3m

trainable params: 301,833 || all params: 108,028,434 || trainable%: 0.2794


In [58]:
# 测试一下效果
# lora模型微调的效果
metric = evaluate.load("seqeval")
valid_loader = DataLoader(tokenized_datasets['validation'],batch_size=32,shuffle=True,collate_fn=data_collator)
model.to(torch.device('cpu'))
model.eval()
for batch in valid_loader:
    with torch.no_grad():
        batch = {k:v.to(torch.device('cpu'))for k,v in batch.items()}
        print(compute_metrics((model(**batch).logits,batch['labels'])))
        break

{'precision': 0.8333333333333334, 'recall': 0.8490566037735849, 'f1': 0.8411214953271028, 'accuracy': 0.9620253164556962}


# 字符分类任务总结


数据格式关键：id,token,ner_tags
数据还要有feature.names

分类的数据tokenize后，对应的ner_tags也要扩展

因此：也要DataCollatorForTokenClassification，能再对token扩展的同时也对labels拓展

验证，可以用evaluate的库加载seqeval

微调模型可以将模型替换为lora模型

然后开始torch手动或者trainer自动微调
