In [6]:
import torch
import pandas as pd
import torch.nn as nn
from transformers import BertTokenizer, BertForSequenceClassification
from torch.utils.data import DataLoader, TensorDataset
from transformers import AutoTokenizer, AutoModel, get_linear_schedule_with_warmup
from sklearn.model_selection import train_test_split
from sklearn.metrics import classification_report
import re

# 检查是否有可用的GPU
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")

In [7]:
# 加载BERT模型和tokenizer，并将它们移动到GPU上
model_name = 'model/bert-base-chinese/'
tokenizer = BertTokenizer.from_pretrained(model_name)
model = BertForSequenceClassification.from_pretrained(model_name,num_labels=6)

# 添加Dropout层
dropout_rate = 0.1
model.classifier.dropout = nn.Dropout(p=dropout_rate)

model.to(device)

Some weights of BertForSequenceClassification were not initialized from the model checkpoint at model/bert-base-chinese/ and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


BertForSequenceClassification(
  (bert): BertModel(
    (embeddings): BertEmbeddings(
      (word_embeddings): Embedding(21128, 768, padding_idx=0)
      (position_embeddings): Embedding(512, 768)
      (token_type_embeddings): Embedding(2, 768)
      (LayerNorm): LayerNorm((768,), eps=1e-12, elementwise_affine=True)
      (dropout): Dropout(p=0.1, inplace=False)
    )
    (encoder): BertEncoder(
      (layer): ModuleList(
        (0-11): 12 x BertLayer(
          (attention): BertAttention(
            (self): BertSelfAttention(
              (query): Linear(in_features=768, out_features=768, bias=True)
              (key): Linear(in_features=768, out_features=768, bias=True)
              (value): Linear(in_features=768, out_features=768, bias=True)
              (dropout): Dropout(p=0.1, inplace=False)
            )
            (output): BertSelfOutput(
              (dense): Linear(in_features=768, out_features=768, bias=True)
              (LayerNorm): LayerNorm((768,), eps=1e-12,

In [8]:
# 数据预处理
MAX_LEN = 64

def preprocess_data(df, tokenizer, max_len):
    input_ids = []
    attention_masks = []
    labels = []
    
    for index, row in df.iterrows():
        text = row[0]   
        label = row[1]
        
        inputs = tokenizer.encode_plus(
            text,
            add_special_tokens=True,
            max_length=max_len,
            padding='max_length',
            truncation=True,
            return_tensors='pt'
        )

        input_ids.append(inputs['input_ids'])
        attention_masks.append(inputs['attention_mask'])
        labels.append(label)

    input_ids = torch.cat(input_ids, dim=0).to(device)
    attention_masks = torch.cat(attention_masks, dim=0).to(device)
    labels = torch.tensor(labels, dtype=torch.long).to(device)

    return input_ids, attention_masks, labels

In [12]:
preprocess_data(dev_df, tokenizer, MAX_LEN)

(tensor([[ 101, 5131, 2228,  ...,    0,    0,    0],
         [ 101, 5131, 2228,  ...,    0,    0,    0],
         [ 101,  123, 1798,  ...,    0,    0,    0],
         ...,
         [ 101, 1969, 2027,  ...,    0,    0,    0],
         [ 101, 1159, 3309,  ...,    0,    0,    0],
         [ 101,  100, 1798,  ...,    0,    0,    0]], device='cuda:0'),
 tensor([[1, 1, 1,  ..., 0, 0, 0],
         [1, 1, 1,  ..., 0, 0, 0],
         [1, 1, 1,  ..., 0, 0, 0],
         ...,
         [1, 1, 1,  ..., 0, 0, 0],
         [1, 1, 1,  ..., 0, 0, 0],
         [1, 1, 1,  ..., 0, 0, 0]], device='cuda:0'),
 tensor([4, 3, 3, 2, 3, 5, 3, 1, 3, 3, 1, 2, 0, 4, 2, 2, 2, 0, 1, 2, 4, 3, 3, 3,
         3, 2, 3, 1, 3, 4, 1, 3, 0, 2, 3, 0, 3, 3, 1, 1, 5, 3, 0, 4, 2, 1, 2, 3,
         0, 2, 2, 4, 3, 4, 4, 1, 2, 1, 4, 0, 5, 5, 0, 3, 0, 1, 5, 0, 2, 2, 3, 1,
         3, 2, 1, 4, 2, 1, 5, 3, 2, 0, 2, 1, 1, 0, 1, 2, 5, 1, 4, 1, 0, 4, 3, 1,
         4, 2, 2, 2, 2, 4, 3, 3, 4, 0, 1, 1, 3, 1, 1, 1, 0, 1, 1, 0, 2, 5, 4, 1,
 

In [9]:
# 准备训练数据和验证数据
train_df = pd.read_table('input/train.txt',header=None)
dev_df = pd.read_table('input/dev.txt',header=None)

train_input_ids, train_attention_masks, train_labels = preprocess_data(train_df, tokenizer, MAX_LEN)
dev_input_ids, dev_attention_masks, dev_labels = preprocess_data(dev_df, tokenizer, MAX_LEN)

# 创建TensorDataset和DataLoader
train_dataset = TensorDataset(train_input_ids, train_attention_masks, train_labels)
dev_dataset = TensorDataset(dev_input_ids, dev_attention_masks, dev_labels)

train_batch_size = 16
dev_batch_size = 32

train_dataloader = DataLoader(train_dataset, batch_size=train_batch_size, shuffle=True)
dev_dataloader = DataLoader(dev_dataset, batch_size=dev_batch_size)

In [10]:
# 定义优化器和损失函数
optimizer = torch.optim.Adam(model.parameters(), lr=1e-5)
loss_fn = torch.nn.CrossEntropyLoss()

scheduler = get_linear_schedule_with_warmup(optimizer, num_warmup_steps=len(train_dataloader) * 1, num_training_steps=len(train_dataloader) * 10)


# 训练模型
num_epochs = 10

for epoch in range(num_epochs):
    model.train()
    total_loss = 0

    for batch in train_dataloader:
        input_ids, attention_mask, labels = batch
        optimizer.zero_grad()

        outputs = model(input_ids, attention_mask=attention_mask, labels=labels)
        loss = loss_fn(outputs.logits, labels)
        loss.backward()
        optimizer.step()
        scheduler.step()
        total_loss += loss.item()

    avg_loss = total_loss / len(train_dataloader)
    print(f"Epoch {epoch + 1}/{num_epochs}, Average Loss: {avg_loss:.4f}",end='\t')

    # 在验证集上评估模型性能
    model.eval()
    val_preds = []
    val_labels = []

    for batch in dev_dataloader:
        input_ids, attention_mask, labels = batch
        with torch.no_grad():
            outputs = model(input_ids, attention_mask=attention_mask, labels=labels)

        logits = outputs.logits  
        val_preds.extend(logits.argmax(dim=1).tolist())
        val_labels.extend(labels.tolist())

    correct_predictions = [1 if p == t else 0 for p, t in zip(val_preds, val_labels)]
    accuracy = sum(correct_predictions) / len(correct_predictions)
    print(f'Test_Accuracy: {accuracy * 100:.2f}%')


KeyboardInterrupt



In [None]:
test_df = pd.read_table('input/pred.txt', header=None)

test_df[1] = 0

test_input_ids, test_attention_masks, test_labels = preprocess_data(test_df, tokenizer,MAX_LEN)

batch_size = 16
test_dataset = TensorDataset(test_input_ids, test_attention_masks, test_labels)
test_loader = DataLoader(test_dataset, batch_size=batch_size, shuffle=False)

model.eval()
test_preds = []

with torch.no_grad():
    for batch in test_loader:
        input_ids, attention_mask, labels = batch
        input_ids, attention_mask = input_ids.to(device), attention_mask.to(device)

        outputs = model(input_ids, attention_mask=attention_mask)
        logits = outputs.logits
        preds = torch.argmax(logits, dim=1)
        test_preds.extend(preds.cpu().numpy())
        
test_df[1] = test_preds

In [None]:
# 获取测试数据的第一列
text = test_df[0]
label = test_df[1]

# 指定要保存的文件名
output_file = f"output/res{accuracy}.txt"

# 打开文件并将数据写入
with open(output_file, "w", encoding="utf-8") as file:
    for text,label in zip(text,label):
        file.write(str(text) + "\t")
        file.write(str(label) + "\n")