In [1]:
import numpy as np
import pandas as pd
from datasets import load_dataset
from transformers import AutoTokenizer, AutoModelForSequenceClassification, set_seed
import torch
from torch.optim import AdamW
from torch.optim.lr_scheduler import OneCycleLR
from torch.utils.data import DataLoader

# Hyperparameters shared by the data pipeline, the optimizer and the training loop.
BATCH_SIZE = 32  # batch size used by every DataLoader in this script
EPOCHS = 50  # number of passes over the training split (also sizes the LR schedule)
MAX_LENGTH = 35  # max sequence length; per the original note, most queries are no longer than 35
LR = 5e-5  # optimizer learning rate, also used as the scheduler's max_lr
TRAIN_SIZE = 0.8  # fraction of the data used for training (the rest is validation)
# Global seeding is currently disabled; only the train/validation split below is seeded.
# set_seed(42)



# Read the training CSV (the csv loader exposes it as the 'train' split) and
# integer-encode the 'label' column in a single chained call.
raw_dataset = load_dataset(
    'csv',
    data_files='train_dataset.csv',
    split='train',
).class_encode_column('label')

# Tokenizer that belongs to the pretrained Chinese BERT checkpoint.
tokenizer = AutoTokenizer.from_pretrained('bert-base-chinese')
def tokenize_function(examples):
    """Tokenize the ``query`` field of a batch of examples.

    Sequences longer than ``MAX_LENGTH`` are truncated and shorter ones are
    padded, so every encoded sequence has exactly ``MAX_LENGTH`` positions.

    Args:
        examples: Mapping with a ``'query'`` entry holding the text(s) to encode.

    Returns:
        The result of calling the module-level ``tokenizer`` on those texts.
    """
    encoding_options = dict(
        truncation=True,
        padding='max_length',
        max_length=MAX_LENGTH,
    )
    return tokenizer(examples['query'], **encoding_options)

# Tokenize the whole dataset in batches, then drop the '#' column
# (the row number, per the original author's note).
tokenized_dataset = raw_dataset.map(tokenize_function, batched=True).remove_columns(['#'])

# Hold out part of the data as a validation set so training progress can be monitored.
tokenized_dataset = tokenized_dataset.train_test_split(train_size=TRAIN_SIZE, seed=42)
train_dataset, valid_dataset = tokenized_dataset['train'], tokenized_dataset['test']

# Return these three columns in PyTorch format for both splits.
for _split in (train_dataset, valid_dataset):
    _split.set_format('torch', columns=['input_ids', 'attention_mask', 'label'])

# Only the training batches are shuffled; validation keeps a fixed order.
train_dataloader = DataLoader(train_dataset, batch_size=BATCH_SIZE, shuffle=True)
valid_dataloader = DataLoader(valid_dataset, batch_size=BATCH_SIZE, shuffle=False)



# Model: pretrained Chinese BERT with a classification head sized to the
# number of classes of the encoded 'label' feature.
label_feature = train_dataset.features['label']
model = AutoModelForSequenceClassification.from_pretrained(
    'bert-base-chinese',
    num_labels=label_feature.num_classes,
)
model = model.to('cuda')

# Optimizer over all model parameters.
optimizer = AdamW(model.parameters(), lr=LR)

# One-cycle learning-rate schedule: LR is the peak value, and the schedule
# length is EPOCHS times the number of training batches per epoch.
steps_per_epoch = len(train_dataloader)
scheduler = OneCycleLR(
    optimizer,
    max_lr=LR,
    epochs=EPOCHS,
    steps_per_epoch=steps_per_epoch,
)



# Training loop: fine-tune for EPOCHS epochs, measure validation accuracy after
# every epoch, and keep the best-scoring checkpoint in the 'output' directory.
best_acc = 0
# Move batches to whichever device the model already lives on instead of
# repeating a hard-coded device name.
device = next(model.parameters()).device
for epoch in range(EPOCHS):
    total_loss = 0
    model.train()
    for batch in train_dataloader:
        input_ids = batch['input_ids'].to(device)
        attention_mask = batch['attention_mask'].to(device)
        labels = batch['label'].to(device)

        # Forward pass with labels so the loss can be read from the outputs,
        # then backpropagate.
        outputs = model(input_ids, attention_mask=attention_mask, labels=labels)
        loss = outputs.loss
        loss.backward()

        # Update the weights, advance the per-step LR schedule, clear gradients.
        optimizer.step()
        scheduler.step()
        optimizer.zero_grad()

        # Summed (not averaged) over the batches of this epoch.
        total_loss += loss.item()

    # Evaluation mode: compute accuracy on the validation split.
    model.eval()
    valid_predictions = []
    valid_labels = []
    with torch.no_grad():
        for batch in valid_dataloader:
            input_ids = batch['input_ids'].to(device)
            attention_mask = batch['attention_mask'].to(device)
            outputs = model(input_ids, attention_mask=attention_mask)
            # Collect predictions together with the labels of the same batch so
            # the two sequences are aligned by construction.
            valid_predictions.extend(outputs.logits.argmax(-1).cpu().numpy())
            valid_labels.extend(batch['label'].numpy())

    valid_acc = np.mean(np.asarray(valid_predictions) == np.asarray(valid_labels))
    print(f"Epoch: {epoch+1}, Train Loss: {total_loss:.4f}, Validation Accuracy: {valid_acc:.4f}")

    # Save only when this epoch matches or beats the best accuracy so far.
    # best_acc must be updated here; otherwise the condition is always true and
    # the checkpoint is simply overwritten by the last epoch. '>=' (rather than
    # '>') guarantees a checkpoint exists even if accuracy never rises above 0.
    if valid_acc >= best_acc:
        best_acc = valid_acc
        model.save_pretrained('output')



# Inference on the test file.
# The test CSV goes through the same tokenize function as the training data so
# the encoding matches what the model was trained on.
test_dataset = load_dataset('csv', data_files='test_dataset.csv', split='train')
test_id = test_dataset['id']  # keep the ids before the column is dropped
test_dataset = test_dataset.map(tokenize_function, batched=True, remove_columns='id')
test_dataset.set_format('torch', columns=['input_ids', 'attention_mask'])
test_loader = DataLoader(test_dataset, batch_size=BATCH_SIZE, shuffle=False)

# Reload the checkpoint written to 'output' during training and switch to eval mode.
model = AutoModelForSequenceClassification.from_pretrained('output').to('cuda')
model.eval()

# Predict batch by batch with gradient tracking disabled.
predictions = []
with torch.no_grad():
    for batch in test_loader:
        logits = model(
            batch['input_ids'].to('cuda'),
            attention_mask=batch['attention_mask'].to('cuda'),
        ).logits
        predictions.extend(logits.argmax(-1).cpu().numpy())

# Map the predicted class indices back to their original label strings.
predicted_labels = train_dataset.features['label'].int2str(predictions)

# Write the submission file: one row per test id with its predicted label.
submission = pd.DataFrame({'id': test_id, 'label': predicted_labels})
submission.to_csv('submit_sample.csv', index=False)

Downloading and preparing dataset csv/default to /root/.cache/huggingface/datasets/csv/default-206d9704fbfa435d/0.0.0/6954658bab30a358235fa864b05cf819af0e179325c740e4bc853bcc7ec513e1...


Downloading data files:   0%|          | 0/1 [00:00<?, ?it/s]

Extracting data files:   0%|          | 0/1 [00:00<?, ?it/s]

Generating train split: 0 examples [00:00, ? examples/s]

Dataset csv downloaded and prepared to /root/.cache/huggingface/datasets/csv/default-206d9704fbfa435d/0.0.0/6954658bab30a358235fa864b05cf819af0e179325c740e4bc853bcc7ec513e1. Subsequent calls will reuse this data.


Casting to class labels:   0%|          | 0/2086 [00:00<?, ? examples/s]

Downloading (…)okenizer_config.json:   0%|          | 0.00/29.0 [00:00<?, ?B/s]

Downloading (…)lve/main/config.json:   0%|          | 0.00/624 [00:00<?, ?B/s]

Downloading (…)solve/main/vocab.txt:   0%|          | 0.00/110k [00:00<?, ?B/s]

Downloading (…)/main/tokenizer.json:   0%|          | 0.00/269k [00:00<?, ?B/s]

Map:   0%|          | 0/2086 [00:00<?, ? examples/s]

Downloading pytorch_model.bin:   0%|          | 0.00/412M [00:00<?, ?B/s]

Some weights of the model checkpoint at bert-base-chinese were not used when initializing BertForSequenceClassification: ['cls.seq_relationship.weight', 'cls.predictions.transform.LayerNorm.bias', 'cls.seq_relationship.bias', 'cls.predictions.transform.dense.bias', 'cls.predictions.transform.dense.weight', 'cls.predictions.bias', 'cls.predictions.decoder.weight', 'cls.predictions.transform.LayerNorm.weight']
- This IS expected if you are initializing BertForSequenceClassification from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing BertForSequenceClassification from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).
Some weights of BertForSequenceClassification were not initialized from the model checkpoint at

Epoch: 1, Train Loss: 343.7296, Validation Accuracy: 0.0024
Epoch: 2, Train Loss: 341.0904, Validation Accuracy: 0.0000
Epoch: 3, Train Loss: 338.1317, Validation Accuracy: 0.0048
Epoch: 4, Train Loss: 333.0221, Validation Accuracy: 0.0024
Epoch: 5, Train Loss: 326.4313, Validation Accuracy: 0.0120
Epoch: 6, Train Loss: 316.1377, Validation Accuracy: 0.0311
Epoch: 7, Train Loss: 301.0961, Validation Accuracy: 0.0766
Epoch: 8, Train Loss: 286.9303, Validation Accuracy: 0.1029
Epoch: 9, Train Loss: 269.3817, Validation Accuracy: 0.1699
Epoch: 10, Train Loss: 251.5050, Validation Accuracy: 0.2321
Epoch: 11, Train Loss: 233.1118, Validation Accuracy: 0.2943
Epoch: 12, Train Loss: 213.9955, Validation Accuracy: 0.3254
Epoch: 13, Train Loss: 194.3248, Validation Accuracy: 0.3421
Epoch: 14, Train Loss: 173.5190, Validation Accuracy: 0.3660
Epoch: 15, Train Loss: 152.7261, Validation Accuracy: 0.3971
Epoch: 16, Train Loss: 133.8985, Validation Accuracy: 0.4163
Epoch: 17, Train Loss: 115.6827, 

Downloading data files:   0%|          | 0/1 [00:00<?, ?it/s]

Extracting data files:   0%|          | 0/1 [00:00<?, ?it/s]

Generating train split: 0 examples [00:00, ? examples/s]

Dataset csv downloaded and prepared to /root/.cache/huggingface/datasets/csv/default-67caf77471a5a48e/0.0.0/6954658bab30a358235fa864b05cf819af0e179325c740e4bc853bcc7ec513e1. Subsequent calls will reuse this data.


Map:   0%|          | 0/1423 [00:00<?, ? examples/s]