In [2]:
# 引入所需的库和模块
import numpy as np
import pandas as pd
from datasets import load_dataset
from transformers import AutoTokenizer, AutoModelForSequenceClassification, set_seed
import torch
from torch.optim import AdamW
from torch.optim.lr_scheduler import OneCycleLR
from torch.utils.data import DataLoader

# Hyperparameters
BATCH_SIZE = 32
EPOCHS = 50
MAX_LENGTH = 32  # token length every query is truncated / padded to when tokenizing
LR = 5e-5
WEIGHT_DECAY = 0.01  # weight-decay coefficient handed to the optimizer
TRAIN_SIZE = 0.7  # fraction of the labelled data used for training; the rest is held out

# Fix the random seed
set_seed(42)

# Load the CSV file as the labelled dataset
raw_dataset = load_dataset('csv', data_files='train_dataset.csv', split='train')
# Encode the 'label' column as integer class ids
raw_dataset = raw_dataset.class_encode_column('label')
# Load the tokenizer of the pretrained Chinese BERT model
tokenizer = AutoTokenizer.from_pretrained('bert-base-chinese')
# Tokenization helper applied to dataset batches via `Dataset.map`.
def tokenize_function(examples):
    """Encode the 'query' texts of a batch with the module-level tokenizer.

    Sequences longer than MAX_LENGTH are truncated and shorter ones are
    padded up to MAX_LENGTH, so every encoded example has the same length.

    Args:
        examples: A batch from the dataset; only its 'query' entry is read.

    Returns:
        Whatever the tokenizer call returns for the given texts.
    """
    texts = examples['query']
    encode_options = {
        'truncation': True,
        'padding': 'max_length',
        'max_length': MAX_LENGTH,
    }
    return tokenizer(texts, **encode_options)

# Tokenize the whole dataset in batches, drop the '#' (row number) column, then
# split it into training and validation subsets so training can be monitored.
tokenized_dataset = (
    raw_dataset
    .map(tokenize_function, batched=True)
    .remove_columns(['#'])
    .train_test_split(train_size=TRAIN_SIZE, seed=42)
)
train_dataset, valid_dataset = tokenized_dataset['train'], tokenized_dataset['test']

# Return these three columns as PyTorch tensors.
for _subset in (train_dataset, valid_dataset):
    _subset.set_format('torch', columns=['input_ids', 'attention_mask', 'label'])

# Batch loaders: shuffle the training data; validation order does not need shuffling.
train_dataloader = DataLoader(train_dataset, shuffle=True, batch_size=BATCH_SIZE)
valid_dataloader = DataLoader(valid_dataset, shuffle=False, batch_size=BATCH_SIZE)


# Model: pretrained Chinese BERT with one output per encoded label class.
model = AutoModelForSequenceClassification.from_pretrained(
    'bert-base-chinese',
    num_labels=train_dataset.features['label'].num_classes,
)
model.to('cuda')

# Optimizer.
optimizer = AdamW(model.parameters(), lr=LR, weight_decay=WEIGHT_DECAY)

# One-cycle learning-rate schedule: LR is the peak rate and the total number
# of scheduler steps is EPOCHS * (batches per epoch).
scheduler = OneCycleLR(
    optimizer,
    max_lr=LR,
    epochs=EPOCHS,
    steps_per_epoch=len(train_dataloader),
)


# Training loop.
#
# Each epoch runs one pass over the training batches (forward, backward,
# optimizer step, scheduler step), then measures accuracy on the validation
# set and checkpoints the model to 'output' whenever that accuracy is at
# least as good as the best seen so far.
best_acc = 0
# Move batches to whatever device the model already lives on instead of
# hard-coding 'cuda' for every tensor.
device = next(model.parameters()).device
for epoch in range(EPOCHS):
    total_loss = 0
    model.train()
    for batch in train_dataloader:
        input_ids = batch['input_ids'].to(device)
        attention_mask = batch['attention_mask'].to(device)
        labels = batch['label'].to(device)

        # Forward pass computes the loss; backward pass accumulates gradients.
        outputs = model(input_ids, attention_mask=attention_mask, labels=labels)
        loss = outputs.loss
        loss.backward()

        # Update parameters, advance the per-batch LR schedule, clear gradients.
        optimizer.step()
        scheduler.step()
        optimizer.zero_grad()

        # NOTE: this is the SUM of the batch losses over the epoch, not the mean.
        total_loss += loss.item()

    # Evaluation mode: compute validation accuracy without tracking gradients.
    model.eval()
    correct = 0
    seen = 0
    with torch.no_grad():
        for batch in valid_dataloader:
            input_ids = batch['input_ids'].to(device)
            attention_mask = batch['attention_mask'].to(device)
            labels = batch['label'].to(device)
            logits = model(input_ids, attention_mask=attention_mask).logits
            # Count hits per batch so predictions and labels are always aligned.
            correct += (logits.argmax(-1) == labels).sum().item()
            seen += labels.size(0)

    # Guard against an empty validation set.
    valid_acc = correct / seen if seen else 0.0
    print(f"Epoch: {epoch+1}, Train Loss: {total_loss:.4f}, Validation Accuracy: {valid_acc:.4f}")

    # Keep only the checkpoint that performs best on the validation set.
    # `>=` (rather than `>`) guarantees a checkpoint exists after the first
    # epoch even if accuracy is 0, and prefers the later model on ties.
    if valid_acc >= best_acc:
        # Record the new best; without this every epoch would overwrite the
        # checkpoint and 'output' would simply hold the last epoch's weights.
        best_acc = valid_acc
        model.save_pretrained('output')



# Inference on the test set.
# Load the test CSV and tokenize it with the same function used for training
# so the encoding matches; the 'id' column is kept aside for the submission.
test_dataset = load_dataset('csv', data_files='test_dataset.csv', split='train')
test_id = test_dataset['id']
test_dataset = test_dataset.map(
    tokenize_function,
    batched=True,
    remove_columns='id',
)
test_dataset.set_format('torch', columns=['input_ids', 'attention_mask'])
test_loader = DataLoader(test_dataset, shuffle=False, batch_size=BATCH_SIZE)

# Reload the weights that the training loop saved to 'output'.
model = AutoModelForSequenceClassification.from_pretrained('output')
model.to('cuda')

# Evaluation mode; gradients are not needed for prediction.
model.eval()
predictions = []
with torch.no_grad():
    for batch in test_loader:
        input_ids = batch['input_ids'].to('cuda')
        attention_mask = batch['attention_mask'].to('cuda')
        logits = model(input_ids, attention_mask=attention_mask).logits
        # The predicted class is the index of the largest logit.
        predictions.extend(logits.argmax(-1).cpu().numpy())

# Map the integer class ids back to the original label strings.
label_feature = train_dataset.features['label']
predicted_labels = label_feature.int2str(predictions)

# Write the submission CSV (one row per test id).
submission = pd.DataFrame({'id': test_id, 'label': predicted_labels})
submission.to_csv('submit_sample.csv', index=False)

Found cached dataset csv (/root/.cache/huggingface/datasets/csv/default-206d9704fbfa435d/0.0.0/6954658bab30a358235fa864b05cf819af0e179325c740e4bc853bcc7ec513e1)
Loading cached processed dataset at /root/.cache/huggingface/datasets/csv/default-206d9704fbfa435d/0.0.0/6954658bab30a358235fa864b05cf819af0e179325c740e4bc853bcc7ec513e1/cache-3d6225af378207a4.arrow
Loading cached processed dataset at /root/.cache/huggingface/datasets/csv/default-206d9704fbfa435d/0.0.0/6954658bab30a358235fa864b05cf819af0e179325c740e4bc853bcc7ec513e1/cache-9b0a6e9383ed3794.arrow
Loading cached split indices for dataset at /root/.cache/huggingface/datasets/csv/default-206d9704fbfa435d/0.0.0/6954658bab30a358235fa864b05cf819af0e179325c740e4bc853bcc7ec513e1/cache-20b85e459a398546.arrow and /root/.cache/huggingface/datasets/csv/default-206d9704fbfa435d/0.0.0/6954658bab30a358235fa864b05cf819af0e179325c740e4bc853bcc7ec513e1/cache-1b7448ecfb336b37.arrow
Some weights of the model checkpoint at bert-base-chinese were not 

Epoch: 1, Train Loss: 298.1447, Validation Accuracy: 0.0032
Epoch: 2, Train Loss: 295.5498, Validation Accuracy: 0.0032
Epoch: 3, Train Loss: 292.4140, Validation Accuracy: 0.0080
Epoch: 4, Train Loss: 286.9923, Validation Accuracy: 0.0032
Epoch: 5, Train Loss: 281.7360, Validation Accuracy: 0.0080
Epoch: 6, Train Loss: 272.5039, Validation Accuracy: 0.0288
Epoch: 7, Train Loss: 261.3970, Validation Accuracy: 0.0511
Epoch: 8, Train Loss: 248.9655, Validation Accuracy: 0.0911
Epoch: 9, Train Loss: 235.7528, Validation Accuracy: 0.1326
Epoch: 10, Train Loss: 222.2296, Validation Accuracy: 0.1581
Epoch: 11, Train Loss: 206.0690, Validation Accuracy: 0.2428
Epoch: 12, Train Loss: 189.6237, Validation Accuracy: 0.2604
Epoch: 13, Train Loss: 173.3506, Validation Accuracy: 0.3035
Epoch: 14, Train Loss: 156.1347, Validation Accuracy: 0.3227
Epoch: 15, Train Loss: 140.2641, Validation Accuracy: 0.3466
Epoch: 16, Train Loss: 124.2066, Validation Accuracy: 0.3754
Epoch: 17, Train Loss: 108.6831, 

Found cached dataset csv (/root/.cache/huggingface/datasets/csv/default-67caf77471a5a48e/0.0.0/6954658bab30a358235fa864b05cf819af0e179325c740e4bc853bcc7ec513e1)
Loading cached processed dataset at /root/.cache/huggingface/datasets/csv/default-67caf77471a5a48e/0.0.0/6954658bab30a358235fa864b05cf819af0e179325c740e4bc853bcc7ec513e1/cache-8cdda239e55d5b4c.arrow
