In [11]:
import torch
import pandas as pd

Collecting transformers
  Downloading transformers-4.35.0-py3-none-any.whl.metadata (123 kB)
     -------------------------------------- 123.1/123.1 kB 1.4 MB/s eta 0:00:00
Collecting safetensors>=0.3.1 (from transformers)
  Downloading safetensors-0.4.0-cp39-none-win_amd64.whl.metadata (3.8 kB)
Downloading transformers-4.35.0-py3-none-any.whl (7.9 MB)
   ---------------------------------------- 7.9/7.9 MB 11.8 MB/s eta 0:00:00
Downloading safetensors-0.4.0-cp39-none-win_amd64.whl (277 kB)
   ---------------------------------------- 277.2/277.2 kB 8.6 MB/s eta 0:00:00
Installing collected packages: safetensors, transformers
Successfully installed safetensors-0.4.0 transformers-4.35.0
Note: you may need to restart the kernel to use updated packages.


In [12]:
from datasets import Dataset
from transformers import BertTokenizer
import os

In [13]:
# Load the raw review texts from a directory.
def load_data(base_path):
    """Read the first line of every regular file directly inside ``base_path``.

    Args:
        base_path: Directory whose files are read as UTF-8 text.

    Returns:
        A list of strings, one per file, ordered by file name. Each string is
        whatever ``readline()`` returns for that file (it keeps a trailing
        newline if the file has one, and is ``''`` for an empty file).
    """
    result = []
    # os.listdir() returns entries in arbitrary order; sorting keeps the
    # resulting list reproducible across runs and machines.
    for name in sorted(os.listdir(base_path)):
        full_path = os.path.join(base_path, name)
        # Skip sub-directories and anything else that is not a regular file,
        # since open() cannot read them.
        if not os.path.isfile(full_path):
            continue
        with open(full_path, 'r', encoding='utf-8') as f:
            # Only the first line is kept, as in the original implementation.
            result.append(f.readline())
    return result

In [14]:
# Build a datasets.Dataset from the raw 'pos' / 'neg' review folders.
def get_dataset(base_path):
    """Load the reviews under ``base_path/pos`` and ``base_path/neg`` as a Dataset.

    Args:
        base_path: Directory containing the ``pos`` and ``neg`` sub-directories.

    Returns:
        A ``datasets.Dataset`` with two columns: ``texts`` (positive reviews
        followed by negative ones) and ``labels`` (``[1., 0.]`` for every
        positive review, ``[0., 1.]`` for every negative review).
    """
    positive_reviews = load_data(os.path.join(base_path, 'pos'))
    negative_reviews = load_data(os.path.join(base_path, 'neg'))

    # One two-element float label per review, in the same order as the texts.
    # NOTE(review): float label vectors presumably make the classification head
    # use a multi-label style loss rather than plain cross-entropy - confirm
    # this is intended.
    one_hot_labels = [[1., 0.] for _ in positive_reviews]
    one_hot_labels += [[0., 1.] for _ in negative_reviews]

    return Dataset.from_dict({
        'texts': positive_reviews + negative_reviews,
        'labels': one_hot_labels,
    })

In [15]:
# Load the raw train and test splits.
# NOTE(review): these are absolute, machine-specific paths; they must be
# adjusted before the notebook can run on another machine.
train_dataset = get_dataset('C:/Users/18094/Desktop/project/aclImdb_v1/aclImdb/train/')
test_dataset = get_dataset('C:/Users/18094/Desktop/project/aclImdb_v1/aclImdb/test/')

KeyboardInterrupt: 

In [7]:
# Show the dataset summary (its feature names and number of rows).
print(train_dataset)

Dataset({
    features: ['texts', 'labels'],
    num_rows: 25000
})


In [16]:
# Local directory the tokenizer is loaded from. Despite its name, cache_dir is
# passed as the model path itself, not as a download-cache location.
cache_dir='C:/Users/18094/Desktop/project/transformersModels/bert-base-uncased2'
tokenizer = BertTokenizer.from_pretrained(cache_dir)

In [17]:
# Convert the raw texts into the tensors-to-be that the model accepts.
# Maximum sequence length, in tokens.
MAX_LENGTH = 512


def tokenize_batch(batch):
    """Tokenize the 'texts' column of a batch, truncating and padding to MAX_LENGTH."""
    return tokenizer(
        batch['texts'],
        truncation=True,        # cut sequences longer than MAX_LENGTH
        padding='max_length',   # pad shorter sequences up to MAX_LENGTH
        max_length=MAX_LENGTH,
    )


# batched=True hands the tokenizer a batch of examples per call; the same
# encoding is applied to both splits.
train_dataset = train_dataset.map(tokenize_batch, batched=True)
test_dataset = test_dataset.map(tokenize_batch, batched=True)


Map:   0%|          | 0/25000 [00:00<?, ? examples/s]

Map:   0%|          | 0/25000 [00:00<?, ? examples/s]

In [18]:
# Inspect the column schema after tokenization. A single print() is enough:
# the nested print(print(...)) also printed the inner call's return value, None.
print(train_dataset.features)

{'texts': Value(dtype='string', id=None), 'labels': Sequence(feature=Value(dtype='float64', id=None), length=-1, id=None), 'input_ids': Sequence(feature=Value(dtype='int32', id=None), length=-1, id=None), 'token_type_ids': Sequence(feature=Value(dtype='int8', id=None), length=-1, id=None), 'attention_mask': Sequence(feature=Value(dtype='int8', id=None), length=-1, id=None)}
None


In [20]:
# Save the tokenized datasets to local disk.
# NOTE(review): the recorded run of this cell raised PermissionError ("a dataset
# can't overwrite itself") - presumably train_dataset had been loaded from this
# same directory earlier in that kernel session; confirm, and save to a fresh
# path if so.
train_dataset.save_to_disk('C:/Users/18094/Desktop/project/data/train_dataset')
test_dataset.save_to_disk('C:/Users/18094/Desktop/project/data/test_dataset')

PermissionError: Tried to overwrite C:\Users\18094\Desktop\project\data\train_dataset but a dataset can't overwrite itself.

In [2]:
#训练模型
#导入必要的库
from transformers import BertForSequenceClassification, BertTokenizer, Trainer, TrainingArguments, BertConfig
import torch
from datasets import Dataset
import json
import os

In [3]:
# Load the pretrained BERT weights with a 2-way classification head.
# The recorded output shows classifier.weight / classifier.bias are newly
# initialized, so the head must be trained before it is used for predictions.
model = BertForSequenceClassification.from_pretrained('C:/Users/18094/Desktop/project/transformersModels/bert-base-uncased2', num_labels=2) # num_labels=2: two output classes (binary classification)

Some weights of BertForSequenceClassification were not initialized from the model checkpoint at C:/Users/18094/Desktop/project/transformersModels/bert-base-uncased2 and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


In [4]:
# Load the preprocessed (tokenized) datasets back from local disk.
train_dataset = Dataset.load_from_disk('C:/Users/18094/Desktop/project/data/train_dataset')
test_dataset = Dataset.load_from_disk('C:/Users/18094/Desktop/project/data/test_dataset')

In [5]:
# Freeze the BERT encoder: disable gradient computation for every parameter of
# the base model in one call, so only the remaining parameters are trained.
model.base_model.requires_grad_(False)

In [12]:
# Create the trainer
# Training hyper-parameters
training_args = TrainingArguments(
    output_dir='C:/Users/18094/Desktop/project/my_results',          # where results are written
    num_train_epochs=20,              # number of training epochs (passes over the training set)
    per_device_train_batch_size=64,  # training batch size per device
    per_device_eval_batch_size=64,   # evaluation batch size per device
    logging_dir='C:/Users/18094/Desktop/project/my_logs',            # where logs are stored
)

# Build the Trainer
trainer = Trainer(
    # Trainer places the model on the device selected by training_args, so the
    # hard-coded model.to('cuda') is unnecessary and would raise on a machine
    # without CUDA.
    model=model,                         # model to train
    args=training_args,                  # training arguments
    train_dataset=train_dataset,         # training dataset
    eval_dataset=test_dataset,           # evaluation dataset
)

In [None]:
# Start training
trainer.train()

# Evaluate the model on eval_dataset. A notebook cell only displays its last
# expression, so the returned metrics are captured and printed explicitly
# instead of being silently discarded.
eval_metrics = trainer.evaluate()
print(eval_metrics)

# Save the model; it is written to the configured output_dir
trainer.save_model()


Step,Training Loss
500,0.6793
1000,0.6617
1500,0.6483
2000,0.6385
2500,0.6304
3000,0.6224
3500,0.6175


In [1]:
# Load the fine-tuned model
# Directory that trainer.save_model() wrote to (the Trainer's output_dir).
output_dir = 'C:/Users/18094/Desktop/project/my_results'

# from_pretrained() reads config.json and the weight file from the directory.
# This avoids hard-coding 'pytorch_model.bin' (recent transformers releases may
# save 'model.safetensors' instead) and avoids torch.load() without
# map_location, which fails on a CPU-only machine for weights saved from GPU.
# The returned model is in evaluation mode.
model = BertForSequenceClassification.from_pretrained(output_dir)

# Kept for backward compatibility with code that expects a `config` variable.
config = model.config

True
1
11.6
