In [None]:
import torch
import pandas as pd
from transformers import BertTokenizer, BertForSequenceClassification
from torch.utils.data import DataLoader, TensorDataset
from sklearn.model_selection import train_test_split
from sklearn.metrics import classification_report
import re

# Use the GPU when CUDA is available; otherwise fall back to the CPU
if torch.cuda.is_available():
    device = torch.device("cuda")
else:
    device = torch.device("cpu")

In [None]:
# Token length that every sequence is padded / truncated to during preprocessing
MAX_LEN = 64

# Load the tokenizer and the 5-class sequence classifier from local directories.
# Only the model is placed on the selected device; the tokenizer is not moved.
tokenizer = BertTokenizer.from_pretrained("model/bret_base_chinese/tokenizer")
model = BertForSequenceClassification.from_pretrained(
    "model/bret_base_chinese/model",
    num_labels=5,
).to(device)

You are using a model of type ernie to instantiate a model of type bert. This is not supported for all configurations of models and can yield errors.
Some weights of BertForSequenceClassification were not initialized from the model checkpoint at model/ernie-1.0 and are newly initialized: ['encoder.layer.5.output.LayerNorm.bias', 'encoder.layer.10.output.LayerNorm.weight', 'encoder.layer.3.attention.self.key.bias', 'encoder.layer.6.attention.self.query.weight', 'embeddings.token_type_embeddings.weight', 'encoder.layer.5.output.dense.weight', 'encoder.layer.3.output.LayerNorm.bias', 'encoder.layer.7.attention.self.key.bias', 'encoder.layer.4.attention.output.dense.bias', 'encoder.layer.1.attention.output.LayerNorm.weight', 'encoder.layer.9.output.LayerNorm.weight', 'encoder.layer.2.attention.self.key.weight', 'encoder.layer.4.attention.self.query.weight', 'encoder.layer.11.intermediate.dense.bias', 'encoder.layer.6.attention.output.dense.bias', 'encoder.layer.9.attention.self.value.bias'

In [None]:
def preprocess_data(df, tokenizer, max_len, target_device=None):
    """Tokenize a (text, label) frame into fixed-length tensors.

    Args:
        df: DataFrame whose first column (by position) holds the text and
            whose second column holds the integer class label.
        tokenizer: Callable following the Hugging Face tokenizer ``__call__``
            convention: it receives a list of strings and returns a mapping
            containing ``'input_ids'`` and ``'attention_mask'`` tensors.
        max_len: Every sequence is padded / truncated to this many tokens.
        target_device: Device the returned tensors are moved to. When omitted,
            the notebook-level ``device`` is used, as before.

    Returns:
        A tuple ``(input_ids, attention_masks, labels)``. The first two are
        expected to have one row of ``max_len`` entries per input row
        (given ``padding='max_length'`` and ``truncation=True``); ``labels``
        is a 1-D ``torch.long`` tensor. An empty frame yields empty tensors
        instead of raising.
    """
    if target_device is None:
        # Fall back to the device selected at notebook start-up.
        target_device = device

    # Positional access works no matter how the columns are named.
    texts = df.iloc[:, 0].tolist()
    labels = torch.tensor(df.iloc[:, 1].tolist(), dtype=torch.long)

    if not texts:
        # Nothing to tokenize: return correctly shaped empty tensors rather
        # than handing an empty batch to the tokenizer.
        empty_ids = torch.empty((0, max_len), dtype=torch.long)
        empty_masks = torch.empty((0, max_len), dtype=torch.long)
        return (
            empty_ids.to(target_device),
            empty_masks.to(target_device),
            labels.to(target_device),
        )

    # A single batched call replaces the per-row encode_plus loop.
    encoded = tokenizer(
        texts,
        add_special_tokens=True,
        max_length=max_len,
        padding='max_length',
        truncation=True,
        return_tensors='pt',
    )

    input_ids = encoded['input_ids'].to(target_device)
    attention_masks = encoded['attention_mask'].to(target_device)
    labels = labels.to(target_device)

    return input_ids, attention_masks, labels

In [None]:
# Read the training and validation splits. The files have no header row,
# so the columns are addressed by the integer labels 0 (text) and 1 (label).
train_df = pd.read_table('input/train.txt', header=None)
dev_df = pd.read_table('input/dev.txt', header=None)

# Keep only the rows whose label column is not 5
train_df = train_df.loc[train_df[1].ne(5)]
dev_df = dev_df.loc[dev_df[1].ne(5)]

# Tokenize both splits into fixed-length tensors
train_input_ids, train_attention_masks, train_labels = preprocess_data(
    train_df, tokenizer, MAX_LEN
)
dev_input_ids, dev_attention_masks, dev_labels = preprocess_data(
    dev_df, tokenizer, MAX_LEN
)

# Batch sizes for the two loaders
train_batch_size, dev_batch_size = 128, 32

# Wrap the tensors in datasets; only the training loader shuffles
train_dataset = TensorDataset(train_input_ids, train_attention_masks, train_labels)
dev_dataset = TensorDataset(dev_input_ids, dev_attention_masks, dev_labels)

train_dataloader = DataLoader(train_dataset, shuffle=True, batch_size=train_batch_size)
dev_dataloader = DataLoader(dev_dataset, batch_size=dev_batch_size)

In [None]:
# Optimizer and loss function
# optimizer = torch.optim.AdamW(model.parameters(), lr=2e-5)
optimizer = torch.optim.Adam(model.parameters(), lr=1e-5)
# NOTE(review): loss_fn is not referenced below; the loop reads the loss
# from the model output because `labels` is passed to the forward call.
loss_fn = torch.nn.CrossEntropyLoss()

# Number of passes over the training data
num_epochs = 7

for epoch in range(num_epochs):
    # Training pass
    model.train()
    batch_losses = []

    for input_ids, attention_mask, labels in train_dataloader:
        optimizer.zero_grad()

        outputs = model(input_ids, attention_mask=attention_mask, labels=labels)
        loss = outputs.loss
        loss.backward()
        optimizer.step()
        batch_losses.append(loss.item())

    total_loss = sum(batch_losses)
    avg_loss = total_loss / len(train_dataloader)
    print(f"Epoch {epoch + 1}/{num_epochs}, Average Loss: {avg_loss:.4f}",end='\t')

    # Validation pass on the dev loader (gradients disabled)
    model.eval()
    val_preds, val_labels = [], []

    with torch.no_grad():
        for input_ids, attention_mask, labels in dev_dataloader:
            outputs = model(input_ids, attention_mask=attention_mask, labels=labels)
            logits = outputs.logits
            # Predicted class = index of the largest logit in each row
            val_preds += logits.argmax(dim=1).tolist()
            val_labels += labels.tolist()

    # Fraction of dev examples whose prediction matches the label
    correct_predictions = [int(p == t) for p, t in zip(val_preds, val_labels)]
    accuracy = sum(correct_predictions) / len(correct_predictions)
    print(f'Test_Accuracy: {accuracy * 100:.2f}%')

Epoch 1/7, Average Loss: 1.4475	Test_Accuracy: 61.02%
Epoch 2/7, Average Loss: 0.9417	Test_Accuracy: 70.66%
Epoch 3/7, Average Loss: 0.6711	Test_Accuracy: 78.28%
Epoch 4/7, Average Loss: 0.5483	Test_Accuracy: 79.34%
Epoch 5/7, Average Loss: 0.4781	Test_Accuracy: 82.84%
Epoch 6/7, Average Loss: 0.4343	Test_Accuracy: 83.58%
Epoch 7/7, Average Loss: 0.3832	Test_Accuracy: 83.58%


In [None]:
# Reload the dev file as the evaluation set. Unlike the training cell,
# no rows are filtered out here, so rows labelled 5 are kept.
test_df = pd.read_table('input/dev.txt', header=None)

# Tokenize; the label tensor is not needed for inference
test_input_ids, test_attention_masks, _ = preprocess_data(test_df, tokenizer, MAX_LEN)

# Dataset and loader (no shuffling, so predictions stay aligned with test_df rows)
test_batch_size = 64
test_dataset = TensorDataset(test_input_ids, test_attention_masks)
test_dataloader = DataLoader(test_dataset, batch_size=test_batch_size)

# Switch the model to evaluation mode
model.eval()

# One probability row per example
predictions = []

with torch.no_grad():
    for input_ids, attention_mask in test_dataloader:
        # preprocess_data already returns tensors on `device`, so these moves are no-ops
        input_ids = input_ids.to(device)
        attention_mask = attention_mask.to(device)

        outputs = model(input_ids, attention_mask=attention_mask)
        logits = outputs.logits
        # Turn the logits into a probability distribution over the classes
        probabilities = torch.softmax(logits, dim=1)
        predictions.extend(probabilities.tolist())

# For each example keep the most likely class and its probability
predicted_labels = []
predicted_probabilities = []
for probs in predictions:
    top_prob = max(probs)
    predicted_labels.append(probs.index(top_prob))
    predicted_probabilities.append(top_prob)

# Ground-truth labels are in column 1 of the frame
true_labels = test_df[1]

# Accuracy = share of examples whose predicted class equals the true label
correct_predictions = [int(p == t) for p, t in zip(predicted_labels, true_labels)]
accuracy = sum(correct_predictions) / len(correct_predictions)
print(f'Accuracy: {accuracy * 100:.2f}%')

Accuracy: 78.90%


In [None]:
# Sweep a confidence threshold over 0.00 .. 0.99. Any prediction whose top
# probability is below the threshold is relabelled as class 5, and the
# threshold giving the highest accuracy is kept (ties keep the lowest one).
# NOTE(review): the threshold is selected on the same data it is scored on.
true_labels = test_df[1]
best_predicted, best_threshold, best_acc = [], 0, 0

for i in range(100):
    threshold = i / 100

    # Relabel low-confidence predictions as 5; keep the rest unchanged
    predicted = []
    for label, prob in zip(predicted_labels, predicted_probabilities):
        if prob < threshold:
            predicted.append(5)
        else:
            predicted.append(label)

    correct_predictions = [int(p == t) for p, t in zip(predicted, true_labels)]
    accuracy = sum(correct_predictions) / len(correct_predictions)

    # Strict comparison: only a genuinely better accuracy replaces the best
    if accuracy > best_acc:
        best_acc, best_predicted, best_threshold = accuracy, predicted, threshold

print(best_threshold,best_acc)

0.48 0.79
