In [1]:
import numpy as np
import pandas as pd
from datasets import load_dataset
from transformers import AutoTokenizer, AutoModelForSequenceClassification, set_seed
import torch
from torch.optim import AdamW
from torch.optim.lr_scheduler import OneCycleLR
from torch.utils.data import DataLoader
import nlpaug.augmenter.word as naw

# Hyperparameters
BATCH_SIZE = 32
EPOCHS = 70
MAX_LENGTH = 40  # queries are padded/truncated to at most 40 tokens by encode()
LEARNING_RATE = 5e-5
WEIGHT_DECAY = 0.01  # weight-decay coefficient passed to AdamW
MODEL_NAME = 'bert-base-chinese'

# Fix the random seed (transformers.set_seed) so runs are repeatable
set_seed(42)

# Load the CSV file as the training set
raw_dataset = load_dataset('csv', data_files='train_dataset.csv', split='train')

# Encode the 'label' column as integer class ids
raw_dataset = raw_dataset.class_encode_column('label')

# Load the tokenizer that belongs to the pretrained Chinese BERT checkpoint
tokenizer = AutoTokenizer.from_pretrained(MODEL_NAME)

# Tokenizer function: encodes the text of the 'query' column
def encode(examples):
    """Tokenize ``examples['query']`` to fixed-length model inputs.

    Args:
        examples: a mapping with a 'query' entry (a single string, or a list of
            strings when called through ``Dataset.map(..., batched=True)``).

    Returns:
        The tokenizer output for the queries, truncated and padded to
        ``MAX_LENGTH`` tokens.
    """
    queries = examples['query']
    return tokenizer(
        queries,
        truncation=True,
        padding='max_length',
        max_length=MAX_LENGTH,
    )

# Apply the tokenizer function to the whole dataset in batches and drop the '#' (row number) column
raw_dataset = raw_dataset.map(encode, batched=True, remove_columns='#')

# nlpaug ContextualWordEmbsAug: context-based word substitution, also backed by bert-base-chinese
augmenter = naw.ContextualWordEmbsAug(model_path='bert-base-chinese', action='substitute')

# Augmentation function: rewrites the 'query' text and keeps its encoding in sync
def augment_text(example):
    """Replace the query of one example with a contextually augmented variant.

    Args:
        example: a single dataset row containing at least a 'query' string.

    Returns:
        The same row with 'query' replaced by the augmented text and the
        tokenizer columns recomputed from that new text.
    """
    original_query = example['query']
    augmented = augmenter.augment(original_query)

    # Depending on the nlpaug version, augment() returns either a string or a
    # list of generated variants even for a single input. Unwrap the list so the
    # 'query' column keeps its plain-string type.
    if isinstance(augmented, (list, tuple)):
        augmented = augmented[0] if augmented else original_query
    example['query'] = augmented

    # raw_dataset was tokenized before augmentation, so the inherited
    # input_ids / attention_mask would still describe the original text.
    example.update(encode(example))
    return example

# Apply the augmentation to every example.
# NOTE(review): the training cells in this notebook read raw_dataset, not
# augmented_dataset, so the augmented rows are not used for training as written.
augmented_dataset = raw_dataset.map(augment_text)

Found cached dataset csv (/root/.cache/huggingface/datasets/csv/default-2de56802f1ede12a/0.0.0/6954658bab30a358235fa864b05cf819af0e179325c740e4bc853bcc7ec513e1)
Loading cached processed dataset at /root/.cache/huggingface/datasets/csv/default-2de56802f1ede12a/0.0.0/6954658bab30a358235fa864b05cf819af0e179325c740e4bc853bcc7ec513e1/cache-87761e9464147f5e.arrow
Loading cached processed dataset at /root/.cache/huggingface/datasets/csv/default-2de56802f1ede12a/0.0.0/6954658bab30a358235fa864b05cf819af0e179325c740e4bc853bcc7ec513e1/cache-08941be77ca52344.arrow


Map:   0%|          | 0/2659 [00:00<?, ? examples/s]

In [2]:
# No train/validation split: both names refer to the same dataset object, so the
# "validation" accuracy reported during training is measured on the training data.
train_dataset = raw_dataset
valid_dataset = raw_dataset

# Expose these three columns as PyTorch tensors
train_dataset.set_format('torch', columns=['input_ids', 'attention_mask', 'label'])
valid_dataset.set_format('torch', columns=['input_ids', 'attention_mask', 'label'])

# Build the DataLoaders; only the training loader shuffles
train_dataloader = DataLoader(train_dataset, batch_size=BATCH_SIZE, shuffle=True)
valid_dataloader = DataLoader(valid_dataset, batch_size=BATCH_SIZE, shuffle=False)

# Model
# Hugging Face sequence classifier with one output per label class
model = AutoModelForSequenceClassification.from_pretrained(MODEL_NAME, num_labels=train_dataset.features['label'].num_classes)
# Use the GPU when one is available and fall back to CPU otherwise, instead of
# failing in model.to() on a machine without CUDA.
device = torch.device('cuda') if torch.cuda.is_available() else torch.device('cpu')
model.to(device)
# Optimizer
optimizer = AdamW(model.parameters(), lr=LEARNING_RATE, weight_decay=WEIGHT_DECAY)
# LR scheduler: one-cycle schedule sized for EPOCHS * batches-per-epoch steps
scheduler = OneCycleLR(optimizer, max_lr=LEARNING_RATE, epochs=EPOCHS, steps_per_epoch=len(train_dataloader))


# Training
best_acc = 0
for epoch in range(EPOCHS):
    total_loss = 0
    model.train()
    for step, batch in enumerate(train_dataloader):
        input_ids = batch['input_ids'].to(device)
        attention_mask = batch['attention_mask'].to(device)
        labels = batch['label'].to(device)

        # Forward pass with token ids, attention mask and labels; the model
        # returns the loss, which is then back-propagated.
        outputs = model(input_ids, attention_mask=attention_mask, labels=labels)
        loss = outputs.loss
        loss.backward()

        # Update the parameters, advance the LR schedule, clear the gradients
        optimizer.step()
        scheduler.step()
        optimizer.zero_grad()

        # Accumulate the loss; the printed value is the sum over batches
        total_loss += loss.detach().item()

    # Evaluation: accuracy on the validation loader
    model.eval()
    valid_predictions = []
    for batch in valid_dataloader:
        input_ids = batch['input_ids'].to(device)
        attention_mask = batch['attention_mask'].to(device)
        # No gradients are needed for inference
        with torch.no_grad():
            outputs = model(input_ids, attention_mask=attention_mask)
        # Predicted class = index of the largest logit
        valid_predictions.extend(outputs.logits.argmax(-1).cpu().numpy())
    # Accuracy: explicit array-vs-array comparison of predictions and labels
    valid_acc = np.mean(np.asarray(valid_predictions) == valid_dataset['label'].numpy())
    print(f"Epoch: {epoch + 1}, Train Loss: {total_loss:.3f}, Validation Accuracy: {valid_acc:.4f}")

    # Keep only the checkpoint with the best validation accuracy so far.
    # best_acc has to be updated here; otherwise the condition is always true
    # and every epoch overwrites the saved model.
    if valid_acc >= best_acc:
        best_acc = valid_acc
        model.save_pretrained('output')


# Predict on the test set
# Load the test CSV and encode it with the same encode() used for training
test_dataset = load_dataset('csv', data_files='test_dataset.csv', split='train')
test_id = test_dataset['id']
test_dataset = test_dataset.map(encode, batched=True, remove_columns='id')
test_dataset.set_format('torch', columns=['input_ids', 'attention_mask'])
test_loader = DataLoader(test_dataset, batch_size=BATCH_SIZE, shuffle=False)

# Reload the weights written to 'output' by the training loop
model = AutoModelForSequenceClassification.from_pretrained('output')
model.to(device)

# Evaluation mode; collect the predicted class index for every test row
model.eval()
predictions = []
with torch.no_grad():
    for batch in test_loader:
        input_ids = batch['input_ids'].to(device)
        attention_mask = batch['attention_mask'].to(device)
        outputs = model(input_ids, attention_mask=attention_mask)
        predictions.extend(outputs.logits.argmax(-1).cpu().numpy())

# Map class indices back to the label strings
predicted_labels = train_dataset.features['label'].int2str(predictions)

# Store the predictions as a CSV file
submission = pd.DataFrame({'id': test_id, 'label': predicted_labels})
submission.to_csv('nlpaug.csv', index=False)

Some weights of the model checkpoint at bert-base-chinese were not used when initializing BertForSequenceClassification: ['cls.predictions.transform.dense.weight', 'cls.predictions.decoder.weight', 'cls.predictions.transform.LayerNorm.bias', 'cls.seq_relationship.weight', 'cls.predictions.transform.dense.bias', 'cls.seq_relationship.bias', 'cls.predictions.transform.LayerNorm.weight', 'cls.predictions.bias']
- This IS expected if you are initializing BertForSequenceClassification from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing BertForSequenceClassification from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).
Some weights of BertForSequenceClassification were not initialized from the model checkpoint at

Epoch: 1, Train Loss: 544.345, Validation Accuracy: 0.0008
Epoch: 2, Train Loss: 539.080, Validation Accuracy: 0.0019
Epoch: 3, Train Loss: 532.383, Validation Accuracy: 0.0079
Epoch: 4, Train Loss: 524.414, Validation Accuracy: 0.0192
Epoch: 5, Train Loss: 513.254, Validation Accuracy: 0.0560
Epoch: 6, Train Loss: 498.051, Validation Accuracy: 0.1489
Epoch: 7, Train Loss: 479.370, Validation Accuracy: 0.3076
Epoch: 8, Train Loss: 458.186, Validation Accuracy: 0.4874
Epoch: 9, Train Loss: 432.474, Validation Accuracy: 0.6709
Epoch: 10, Train Loss: 408.360, Validation Accuracy: 0.7439
Epoch: 11, Train Loss: 379.353, Validation Accuracy: 0.8315
Epoch: 12, Train Loss: 347.991, Validation Accuracy: 0.8876
Epoch: 13, Train Loss: 314.044, Validation Accuracy: 0.9094
Epoch: 14, Train Loss: 281.611, Validation Accuracy: 0.9361
Epoch: 15, Train Loss: 246.660, Validation Accuracy: 0.9515
Epoch: 16, Train Loss: 210.692, Validation Accuracy: 0.9673
Epoch: 17, Train Loss: 175.910, Validation Accura

Downloading data files:   0%|          | 0/1 [00:00<?, ?it/s]

Extracting data files:   0%|          | 0/1 [00:00<?, ?it/s]

Generating train split: 0 examples [00:00, ? examples/s]

Dataset csv downloaded and prepared to /root/.cache/huggingface/datasets/csv/default-67caf77471a5a48e/0.0.0/6954658bab30a358235fa864b05cf819af0e179325c740e4bc853bcc7ec513e1. Subsequent calls will reuse this data.


Map:   0%|          | 0/1423 [00:00<?, ? examples/s]

In [3]:
EPOCHS = 120  # raise the epoch count to 120; everything else stays the same

# Train and "validate" on the same, unsplit dataset
train_dataset = raw_dataset
valid_dataset = raw_dataset

# Expose the model inputs and the label as PyTorch tensors
train_dataset.set_format(
    'torch',
    columns=['input_ids', 'attention_mask', 'label'],
)
valid_dataset.set_format(
    'torch',
    columns=['input_ids', 'attention_mask', 'label'],
)

# Only the training loader shuffles
train_dataloader = DataLoader(train_dataset, shuffle=True, batch_size=BATCH_SIZE)
valid_dataloader = DataLoader(valid_dataset, shuffle=False, batch_size=BATCH_SIZE)

# Model: a fresh classifier head on top of the pretrained checkpoint
model = AutoModelForSequenceClassification.from_pretrained(
    MODEL_NAME,
    num_labels=train_dataset.features['label'].num_classes,
)
device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
model.to(device)

# Optimizer and one-cycle LR schedule sized for the new epoch count
optimizer = AdamW(
    model.parameters(),
    lr=LEARNING_RATE,
    weight_decay=WEIGHT_DECAY,
)
scheduler = OneCycleLR(
    optimizer,
    max_lr=LEARNING_RATE,
    epochs=EPOCHS,
    steps_per_epoch=len(train_dataloader),
)

# Training
best_acc = 0
for epoch in range(EPOCHS):
    total_loss = 0
    model.train()
    for step, batch in enumerate(train_dataloader):
        input_ids = batch['input_ids'].to(device)
        attention_mask = batch['attention_mask'].to(device)
        labels = batch['label'].to(device)

        # Forward pass returns the loss because labels are supplied
        outputs = model(input_ids, attention_mask=attention_mask, labels=labels)
        loss = outputs.loss
        loss.backward()

        # Update the parameters, advance the LR schedule, clear the gradients
        optimizer.step()
        scheduler.step()
        optimizer.zero_grad()

        # The printed training loss is the sum over batches
        total_loss += loss.detach().item()

    # Evaluation: accuracy on the validation loader
    model.eval()
    valid_predictions = []
    for batch in valid_dataloader:
        input_ids = batch['input_ids'].to(device)
        attention_mask = batch['attention_mask'].to(device)
        with torch.no_grad():
            outputs = model(input_ids, attention_mask=attention_mask)

        # Predicted class = index of the largest logit
        valid_predictions.extend(outputs.logits.argmax(-1).cpu().numpy())

    # Explicit array-vs-array comparison of predictions and labels
    valid_acc = np.mean(np.asarray(valid_predictions) == valid_dataset['label'].numpy())
    print(f"Epoch: {epoch + 1}, Train Loss: {total_loss:.3f}, Validation Accuracy: {valid_acc:.4f}")

    # Keep only the checkpoint with the best validation accuracy so far.
    # best_acc has to be updated here; otherwise the condition is always true
    # and every epoch overwrites the saved model.
    if valid_acc >= best_acc:
        best_acc = valid_acc
        model.save_pretrained('output')

# Prediction
# NOTE(review): this cell reads 'output' and writes 'nlpaug.csv', the same paths
# used by the 70-epoch cell, so it replaces that run's checkpoint and submission.
test_dataset = load_dataset('csv', data_files='test_dataset.csv', split='train')
test_id = test_dataset['id']
test_dataset = test_dataset.map(encode, batched=True, remove_columns='id')
test_dataset.set_format('torch', columns=['input_ids', 'attention_mask'])
test_loader = DataLoader(test_dataset, batch_size=BATCH_SIZE, shuffle=False)

# Reload the weights saved by the training loop
model = AutoModelForSequenceClassification.from_pretrained('output')
model.to(device)

# Evaluation mode; collect the predicted class index for every test row
model.eval()
predictions = []
with torch.no_grad():
    for batch in test_loader:
        input_ids = batch['input_ids'].to(device)
        attention_mask = batch['attention_mask'].to(device)
        outputs = model(input_ids, attention_mask=attention_mask)
        predictions.extend(outputs.logits.argmax(-1).cpu().numpy())

# Map class indices back to the label strings
predicted_labels = train_dataset.features['label'].int2str(predictions)

# Store the predictions as a CSV file
submission = pd.DataFrame({'id': test_id, 'label': predicted_labels})
submission.to_csv('nlpaug.csv', index=False)

Some weights of the model checkpoint at bert-base-chinese were not used when initializing BertForSequenceClassification: ['cls.predictions.transform.dense.weight', 'cls.predictions.decoder.weight', 'cls.predictions.transform.LayerNorm.bias', 'cls.seq_relationship.weight', 'cls.predictions.transform.dense.bias', 'cls.seq_relationship.bias', 'cls.predictions.transform.LayerNorm.weight', 'cls.predictions.bias']
- This IS expected if you are initializing BertForSequenceClassification from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing BertForSequenceClassification from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).
Some weights of BertForSequenceClassification were not initialized from the model checkpoint at

Epoch: 1, Train Loss: 543.099, Validation Accuracy: 0.0030
Epoch: 2, Train Loss: 539.380, Validation Accuracy: 0.0023
Epoch: 3, Train Loss: 533.444, Validation Accuracy: 0.0079
Epoch: 4, Train Loss: 528.144, Validation Accuracy: 0.0124
Epoch: 5, Train Loss: 520.337, Validation Accuracy: 0.0237
Epoch: 6, Train Loss: 511.517, Validation Accuracy: 0.0440
Epoch: 7, Train Loss: 502.885, Validation Accuracy: 0.0955
Epoch: 8, Train Loss: 491.150, Validation Accuracy: 0.1625
Epoch: 9, Train Loss: 478.098, Validation Accuracy: 0.2779
Epoch: 10, Train Loss: 462.987, Validation Accuracy: 0.4002
Epoch: 11, Train Loss: 447.406, Validation Accuracy: 0.5258
Epoch: 12, Train Loss: 429.293, Validation Accuracy: 0.6465
Epoch: 13, Train Loss: 410.384, Validation Accuracy: 0.7115
Epoch: 14, Train Loss: 390.526, Validation Accuracy: 0.8063
Epoch: 15, Train Loss: 368.068, Validation Accuracy: 0.8511
Epoch: 16, Train Loss: 346.565, Validation Accuracy: 0.8913
Epoch: 17, Train Loss: 322.325, Validation Accura

Found cached dataset csv (/root/.cache/huggingface/datasets/csv/default-67caf77471a5a48e/0.0.0/6954658bab30a358235fa864b05cf819af0e179325c740e4bc853bcc7ec513e1)


Map:   0%|          | 0/1423 [00:00<?, ? examples/s]