## RoBERTa 分类模型

In [None]:
## 加载文件及模型
import numpy as np
import pandas as pd
from dataclasses import dataclass, asdict
from typing import Optional

from loguru import logger

logger.add("out.log")
import numpy
import torch.cuda
from torch.utils.data import Dataset, DataLoader
from tqdm import tqdm
from transformers import BertForSequenceClassification, AutoTokenizer, AutoModel
from sklearn.metrics import classification_report

# Run on the GPU when CUDA is available, otherwise fall back to the CPU.
DEVICE = "cuda" if torch.cuda.is_available() else "cpu"
# Initial learning rate passed to the AdamW optimizer.
LR = 1e-5

加载模型

In [None]:
# Load the tokenizer and the sequence-classification model from a local checkpoint.
model_path = "/data/liyunhan/Model/chinese-roberta-wwm-ext"
tokenizer = AutoTokenizer.from_pretrained(model_path)
# NOTE(review): the checkpoint directory is named "roberta" but it is loaded
# through the BERT class — presumably the checkpoint uses the BERT
# architecture; confirm.
model = BertForSequenceClassification.from_pretrained(model_path,num_labels=2)
# Mind num_labels: keep it in sync with the number of label classes in the data.
model.to(DEVICE)
# The optimizer is created after the model has been moved to DEVICE.
optimizer = torch.optim.AdamW(model.parameters(), lr=LR)

定义数据集, 加载数据

In [None]:
# Dataset definition and loading
class DatasetClassify(Dataset):
    """Text-classification dataset backed by a CSV file.

    The CSV is read once in the constructor. Every row must provide a
    ``content`` column (the text) and a ``label`` column (the class).
    """

    def __init__(self, path):
        """Read the UTF-8 encoded CSV at *path* and keep its rows in memory."""
        frame = pd.read_csv(path, encoding="utf-8")
        # One dict per CSV row, keyed by column name.
        self.data_list = frame.to_dict(orient="records")

    def __len__(self):
        """Return the number of rows loaded from the CSV."""
        return len(self.data_list)

    def __getitem__(self, index):
        """Return the ``(content, label)`` pair stored at *index*.

        The text is returned unchanged; no separator token such as
        ``[SEP]`` is appended here.
        """
        row = self.data_list[index]
        return row["content"], row["label"]

def collator_fn(batch):
    """Collate ``(content, label)`` samples into model-ready tensors.

    Args:
        batch: Sequence of ``(content, label)`` pairs, one per sample.

    Returns:
        A ``(encoding, labels)`` tuple. ``encoding`` is the tokenizer output
        for the whole batch (padded to the longest sample, truncated to 256
        tokens) moved to ``DEVICE``; ``labels`` is a 1-D ``torch.long``
        tensor on ``DEVICE``.
    """
    # Split the columns directly instead of packing the batch into a single
    # numpy array: a mixed str/int array is coerced to strings, so labels
    # were round-tripped through text and a float label such as 1.0 could
    # not be parsed back into an int.
    contents, labels = zip(*batch)
    # str() keeps the stringification the numpy array used to apply to
    # non-string cells, so the tokenizer always receives text.
    texts = [str(text) for text in contents]
    encoding = tokenizer(
        texts,
        max_length=256,
        padding=True,
        truncation=True,
        return_tensors="pt",
    ).to(DEVICE)
    label_tensor = torch.tensor(
        [int(label) for label in labels], device=DEVICE, dtype=torch.long
    )
    return encoding, label_tensor

# Training batches are shuffled (shuffle=True); evaluation batches keep file order.
# NOTE(review): both loaders read the same CSV file, so the accuracy measured
# on dev_data_loader is computed on the training data — confirm whether a
# separate dev file was intended.
train_data_loader = DataLoader(DatasetClassify("/data/liyunhan/Model/data3.csv"), batch_size=32, shuffle=True,
                               collate_fn=collator_fn)
dev_data_loader = DataLoader(DatasetClassify("/data/liyunhan/Model/data3.csv"), batch_size=32, shuffle=False,
                             collate_fn=collator_fn)

# Evaluation routine
@torch.no_grad()
def eval():
    """Measure the accuracy of the global ``model`` on ``dev_data_loader``.

    Gradient tracking is disabled by the decorator. The function does not
    change the model's train/eval mode itself. The accuracy is written to
    the log before being returned.

    Returns:
        Fraction of samples whose arg-max logit equals the true label.
    """
    correct = 0
    seen = 0
    for inputs, targets in tqdm(dev_data_loader, position=0, leave=True):
        logits = model(**inputs, labels=targets).logits
        predictions = np.argmax(logits.detach().cpu().numpy(), axis=1)
        expected = targets.detach().cpu().numpy()
        correct += np.sum(expected == predictions)
        seen += len(predictions)
    accuracy = correct / seen
    logger.info("\n" + str(accuracy))
    return accuracy

## 开始训练

In [None]:
# Training loop
EPOCHS = 46
step = 0
accu_max = 0.0        # best evaluation accuracy seen so far
loss_total_min = 20   # lowest summed epoch loss seen so far
# Not consumed anywhere in this cell.
num_training_steps = len(train_data_loader) * EPOCHS
# from_pretrained() hands the model back in evaluation mode; switch to
# training mode up front so dropout is active from the very first epoch
# instead of only after the first evaluation.
model.train()
for epoch in range(EPOCHS):
    loss_total = 0.0
    for index, (item, label) in enumerate(tqdm(train_data_loader), start=1):
        # Global, 1-based count of optimizer steps taken so far.
        step = epoch * len(train_data_loader) + index
        output = model(labels=label, **item)
        loss = output.loss
        loss_total += loss.item()
        optimizer.zero_grad()
        loss.backward()
        optimizer.step()

    # Every 10th epoch: log the loss, evaluate, checkpoint and decay the LR.
    # NOTE(review): with EPOCHS = 46 the last evaluated epoch is 40, so
    # epochs 41-45 are trained but never evaluated or saved — confirm intent.
    if epoch % 10 == 0:
        logger.info(f"第{epoch}轮的损失为{loss_total}")

        model.eval()
        accu_score = eval()
        model.train()
        if accu_score > accu_max or loss_total < loss_total_min:
            # Keep each record as a true best: a save triggered only by a
            # lower loss must not drag the best accuracy down (or the reverse).
            accu_max = max(accu_max, accu_score)
            loss_total_min = min(loss_total_min, loss_total)
            # Saves the whole pickled model object, not just a state_dict.
            torch.save(model, "/data/liyunhan/Model/Model_saved/classify_model.pt")
            print("保存模型")
        if epoch > 0:
            LR = LR * 0.6
            # Rebinding LR alone does not reach the optimizer, which copied
            # the value at construction; write the decayed rate into every
            # parameter group so it takes effect.
            for param_group in optimizer.param_groups:
                param_group["lr"] = LR

## 测试模型

In [None]:
import torch
from transformers import AutoTokenizer
model_path = "/data/liyunhan/Model/chinese-roberta-wwm-ext"
# Resolve the device first so the checkpoint can be mapped onto it while loading.
DEVICE = "cuda" if torch.cuda.is_available() else "cpu"
# map_location lets a checkpoint saved from a GPU be loaded on a CPU-only
# machine; without it the CPU fallback above would never be reached.
# SECURITY: torch.load unpickles the file, which can run arbitrary code —
# only load checkpoints from a trusted source.
# NOTE(review): PyTorch releases that default torch.load to weights_only=True
# need weights_only=False to load a fully pickled model — confirm against the
# installed version.
classify_model = torch.load("/data/liyunhan/Model/Model_saved/classify_model.pt", map_location=DEVICE)
classify_model.to(DEVICE)
tokenizer = AutoTokenizer.from_pretrained(model_path)

def predict_single_sentence(sentence: str):
    """Classify one sentence with the loaded ``classify_model``.

    Args:
        sentence: Raw text to classify.

    Returns:
        Index of the highest-scoring class, as a Python scalar.
    """
    # Same tokenizer settings as the training collate function.
    encoded = tokenizer(
        sentence,
        return_tensors="pt",
        max_length=256,
        padding=True,
        truncation=True,
    )
    # The model and its inputs must live on the same device.
    model_inputs = {name: tensor.to(DEVICE) for name, tensor in encoded.items()}

    # Evaluation mode is set on every call before the forward pass.
    classify_model.eval()

    # Inference only: no gradients are needed.
    with torch.no_grad():
        logits = classify_model(**model_inputs).logits

    # torch.max along dim 1 yields (values, indices); the index is the class.
    class_index = torch.max(logits, 1)[1]
    return class_index.item()

使用测试集测试

In [None]:
import pandas as pd
# Score the saved classifier on every row of the CSV and print the accuracy.
# NOTE(review): this is the same file the training loader reads — confirm
# whether a held-out test file was intended.
df = pd.read_csv("/data/liyunhan/Model/data3.csv",encoding = 'utf-8')

n = 0          # number of correct predictions
id_list = []   # predicted class index for each row, in file order
# Walk the two columns in lockstep instead of indexing by position, and use a
# loop variable that does not shadow the builtin id().
for text, label in zip(df['content'], df['label']):
    predicted = predict_single_sentence(text)
    id_list.append(predicted)
    if label == predicted:
        n += 1
print("准确率为:",n/len(df['content']))

手动输入内容测试

In [None]:
# Manual smoke test: classify one hand-written sample and print the predicted class index.
# NOTE(review): the literal ends with a stray apostrophe before the closing
# quote — looks like a paste artifact; confirm whether it belongs to the input.
content = "\n4. 条款：原文中提到“双方均有权向甲方所在地人民法院起诉。”\n判断过程：原文明确约定了甲方所在地法院作为管辖机构，没有提及乙方所在地法院或仲裁机构。\n结论：存在该问题。'"
# NOTE(review): `id` shadows the builtin id(); kept because later cells may read it.
id = predict_single_sentence(content)
print(id)