In [1]:
from transformers import AutoTokenizer

# Load the tokenizer for the 'bert-base-cased' checkpoint.
tokenizer = AutoTokenizer.from_pretrained('bert-base-cased')

# Bare trailing expression so the notebook displays the tokenizer's repr.
tokenizer

BertTokenizerFast(name_or_path='bert-base-cased', vocab_size=28996, model_max_length=512, is_fast=True, padding_side='right', truncation_side='right', special_tokens={'unk_token': '[UNK]', 'sep_token': '[SEP]', 'pad_token': '[PAD]', 'cls_token': '[CLS]', 'mask_token': '[MASK]'})

In [15]:
from datasets import load_dataset
from datasets import load_from_disk

# Load the dataset.
# Option 1 (disabled): fetch GLUE/SST-2 over the network.
#datasets = load_dataset(path='glue', name='sst2')

# Option 2 (active): load the copy stored on local disk.
datasets = load_from_disk('./data/glue_sst2')


# Tokenization
def f(data):
    """Tokenize the ``'sentence'`` field of ``data`` with the notebook tokenizer.

    Sequences are truncated and padded to a fixed length of 30 tokens.

    Args:
        data: mapping with a ``'sentence'`` entry (a batch of sentences when
            used with ``map(..., batched=True)``).

    Returns:
        Whatever the module-level ``tokenizer`` returns for that input.
    """
    tokenize_kwargs = dict(padding='max_length', truncation=True, max_length=30)
    return tokenizer(data['sentence'], **tokenize_kwargs)


# Tokenize every split in batches of 1000 examples using 4 worker processes.
datasets = datasets.map(f, batched=True, batch_size=1000, num_proc=4)

# Work on small subsets only; the full dataset is too large to train on here.
# A fixed shuffle seed keeps the sampled subsets -- and therefore the metrics
# reported further down -- reproducible across kernel restarts.
dataset_train = datasets['train'].shuffle(seed=42).select(range(1000))
dataset_test = datasets['validation'].shuffle(seed=42).select(range(200))


Loading cached processed dataset at /home/mylady/code/python/DL-pytorch/apps/huggingface/data/glue_sst2/train/cache-a440844e75f8838e_*_of_00004.arrow
Loading cached processed dataset at /home/mylady/code/python/DL-pytorch/apps/huggingface/data/glue_sst2/validation/cache-efcadb6d9ecf7b0a_*_of_00004.arrow
Loading cached processed dataset at /home/mylady/code/python/DL-pytorch/apps/huggingface/data/glue_sst2/test/cache-b26b14726e876062_*_of_00004.arrow


In [None]:
# Drop the name bound to the full tokenized dataset; the cells below only use
# dataset_train and dataset_test.
del datasets


In [16]:


# Display the training subset's feature names and row count.
dataset_train

Dataset({
    features: ['sentence', 'label', 'idx', 'input_ids', 'token_type_ids', 'attention_mask'],
    num_rows: 1000
})

In [9]:
from transformers import AutoModelForSequenceClassification


# Load the pretrained 'bert-base-cased' weights into a sequence-classification
# model configured for 2 labels.
model = AutoModelForSequenceClassification.from_pretrained('bert-base-cased', num_labels=2)


# Print the total number of parameters, expressed in units of 10,000.
print(sum(param.nelement() for param in model.parameters()) / 10000)

Some weights of the model checkpoint at bert-base-cased were not used when initializing BertForSequenceClassification: ['cls.seq_relationship.weight', 'cls.predictions.transform.dense.weight', 'cls.predictions.bias', 'cls.seq_relationship.bias', 'cls.predictions.transform.LayerNorm.bias', 'cls.predictions.transform.dense.bias', 'cls.predictions.decoder.weight', 'cls.predictions.transform.LayerNorm.weight']
- This IS expected if you are initializing BertForSequenceClassification from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing BertForSequenceClassification from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).
Some weights of BertForSequenceClassification were not initialized from the model checkpoint at b

10831.181


In [10]:
import numpy as np
from datasets import load_metric
from transformers.trainer_utils import EvalPrediction


In [4]:
# Prerequisite (install once): pip install scikit-learn

# The bare string below is an unassigned literal used as a free-form note
# (shell commands for enabling a proxy and restarting JupyterLab); it is
# evaluated and discarded, so it has no runtime effect.
'''
开启代理:
export https_proxy=127.0.0.1:7890
export http_proxy=127.0.0.1:7890


重启服务:
jupyter-lab --allow-root --ip=0.0.0.0 --port=8888
'''
# Load the evaluation metric.
# This sometimes hangs because of network problems; retrying usually succeeds.
# NOTE(review): `load_metric` is deprecated in newer `datasets` releases in
# favour of the separate `evaluate` package -- confirm against the installed version.
metric = load_metric('accuracy')


# Offline alternative: load the metric script from a local cache path.
# metric = load_metric('/home/mylady/.cache/huggingface/datasets/downloads/accuracy.py')

In [5]:


# Metric callback handed to the Trainer
def compute_metrics(eval_pred):
    """Score a batch of model outputs with the module-level ``metric``.

    Args:
        eval_pred: pair ``(logits, labels)``; ``logits`` must support
            ``argmax(axis=1)`` (one row of class scores per example).

    Returns:
        The result of ``metric.compute`` for the arg-max class of each row
        compared against ``labels``.
    """
    scores, references = eval_pred
    predicted_classes = scores.argmax(axis=1)
    return metric.compute(predictions=predicted_classes, references=references)


# Smoke-test compute_metrics on mock outputs: the arg-max of every row is
# class 1 and every label is 1, so the expected accuracy is 1.0.
eval_pred = EvalPrediction(
    predictions=np.array([[0, 1], [2, 3], [4, 5], [6, 7]]),
    label_ids=np.array([1, 1, 1, 1]),
)

compute_metrics(eval_pred)

{'accuracy': 1.0}

In [12]:
from transformers import TrainingArguments, Trainer


# Training configuration.
# Every hyperparameter is passed to the constructor instead of being assigned
# as an attribute afterwards, so each value goes through TrainingArguments'
# own initialisation and validation rather than bypassing it.
args = TrainingArguments(
    output_dir='./output_dir',
    evaluation_strategy='epoch',  # run evaluation at the end of each epoch
    no_cuda=True,                 # keep training on the CPU
    num_train_epochs=1,
    learning_rate=1e-4,
    weight_decay=1e-2,
    per_device_train_batch_size=16,
    per_device_eval_batch_size=32,
)

In [17]:

# Build the Trainer from the model, the training arguments, the two data
# subsets and the metric callback.
trainer = Trainer(model=model,
                  args=args,
                  train_dataset=dataset_train,
                  eval_dataset=dataset_test,
                  compute_metrics=compute_metrics,
                 )

# Evaluate before any training to get a baseline on the evaluation subset.
trainer.evaluate()

{'eval_loss': 0.6868857741355896,
 'eval_accuracy': 0.535,
 'eval_runtime': 2.3954,
 'eval_samples_per_second': 83.494,
 'eval_steps_per_second': 2.922}

In [18]:
# Train (fine-tune) the model.
trainer.train()



Epoch,Training Loss,Validation Loss,Accuracy
1,No log,0.396098,0.825


TrainOutput(global_step=63, training_loss=0.5112935626317584, metrics={'train_runtime': 43.0955, 'train_samples_per_second': 23.204, 'train_steps_per_second': 1.462, 'total_flos': 15416663400000.0, 'train_loss': 0.5112935626317584, 'epoch': 1.0})

In [19]:
# Evaluate the model again, now that training has run.
trainer.evaluate()

{'eval_loss': 0.3960981070995331,
 'eval_accuracy': 0.825,
 'eval_runtime': 1.7252,
 'eval_samples_per_second': 115.931,
 'eval_steps_per_second': 4.058,
 'epoch': 1.0}

In [20]:
# Save the trained model.
# The target is the same './output_dir' directory that TrainingArguments
# was given as output_dir.
save_model_path = './output_dir'
trainer.save_model(output_dir=save_model_path)

## collate_fn 定义

In [22]:
import torch


def collate_fn(data):
    """Collate a list of tokenized examples into batched ``LongTensor``s.

    Args:
        data: sequence of dict-like examples, each providing ``'label'``,
            ``'input_ids'``, ``'token_type_ids'`` and ``'attention_mask'``.

    Returns:
        Tuple ``(label, input_ids, token_type_ids, attention_mask)`` of
        ``torch.LongTensor``s, with one entry per example in input order.
    """
    field_order = ('label', 'input_ids', 'token_type_ids', 'attention_mask')

    # Gather every column first, then convert, mirroring a two-phase
    # extract-then-tensorize flow.
    columns = {name: [example[name] for example in data] for name in field_order}
    return tuple(torch.LongTensor(columns[name]) for name in field_order)



In [23]:

# Data loader over the evaluation subset: shuffled batches of 4 built by
# collate_fn, with any incomplete final batch dropped.
loader_test = torch.utils.data.DataLoader(dataset=dataset_test,
                                          batch_size=4,
                                          collate_fn=collate_fn,
                                          shuffle=True,
                                          drop_last=True)

# Pull a single batch for inspection; test_v1 reads these tensors as globals.
# next(iter(...)) fetches the first batch directly, without a loop-and-break
# and without leaving an unused loop index behind.
label, input_ids, token_type_ids, attention_mask = next(iter(loader_test))

label, input_ids, token_type_ids, attention_mask

(tensor([1, 0, 1, 0]),
 tensor([[  101,  2276,  8144,  1104,  9163,   118,   118,  1105,  3254,  1643,
          17432,   118,   118,  1120, 12686,  9022, 15648,  1110,   170,  5871,
           8674,  1158,  3362,  1115,  4642,  1106,  1587,  1104,  1103,   102],
         [  101,  2257,   117,  4509,  1105, 12678, 14255,  4704, 25981,  1158,
            119,   102,     0,     0,     0,     0,     0,     0,     0,     0,
              0,     0,     0,     0,     0,     0,     0,     0,     0,     0],
         [  101,  1111,  1103,  1148,  1159,  1107,  1201,   117,  1260, 11437,
           2180, 11902,  1116,  1996, 15962,   117,  3229,  1272,  1119,   112,
            188,  1151, 14030,  1118,  1103,  3110,  1250,  1104,  1117,   102],
         [  101,  1104,  1736,   117,  1118,  1167,  7649, 12307,  1122,   112,
            188,  1253,  2385,  2213,   119,   102,     0,     0,     0,     0,
              0,     0,     0,     0,     0,     0,     0,     0,     0,     0]]),
 tensor([[0

In [24]:
import torch


# Test
def test_v1():
    """Reload the saved weights and measure accuracy on one batch.

    Reads the notebook-level globals ``model``, ``input_ids``,
    ``token_type_ids``, ``attention_mask`` and ``label``.

    Returns:
        float: fraction of examples in the batch whose predicted class
        equals its label.
    """
    # Load the saved parameters. map_location='cpu' lets the checkpoint load
    # regardless of the device it was saved from.
    # NOTE(review): newer transformers releases may write 'model.safetensors'
    # instead of 'pytorch_model.bin' -- confirm which file save_model produced.
    state_dict = torch.load('./output_dir/pytorch_model.bin', map_location='cpu')
    model.load_state_dict(state_dict)

    # Switch to inference mode (e.g. turns dropout off).
    model.eval()

    # Inference only: no_grad avoids building an autograd graph that would
    # never be used, saving memory and time.
    with torch.no_grad():
        out = model(input_ids=input_ids,
                    token_type_ids=token_type_ids,
                    attention_mask=attention_mask)

    # logits [4, 2] -> predicted class ids [4]
    predictions = out['logits'].argmax(dim=1)

    correct = (predictions == label).sum().item()

    return correct / len(label)

In [25]:

# Run the single-batch accuracy check.
test_v1()


0.75