In [1]:
import torch
import torch.nn as nn
from torch.utils.data import Dataset, DataLoader
from torch.optim import AdamW
from transformers import BertTokenizerFast, BertModel
from sklearn.model_selection import train_test_split
from seqeval.metrics import classification_report, f1_score, precision_score, recall_score
from seqeval.scheme import IOB2
from tqdm import tqdm
import numpy as np

def read_bio_file(file_path):
    """Read sentences and their tag sequences from a BIO-format text file.

    The file is expected to hold one token per line, with the token in the
    first whitespace-separated column and its tag in the second.  One or more
    blank (whitespace-only) lines separate sentences.  Any columns after the
    second are ignored, and non-blank lines with fewer than two columns are
    skipped.

    Args:
        file_path: Path of the file to read.

    Returns:
        A ``(sentences, labels)`` tuple of two parallel lists: ``sentences[i]``
        is the list of tokens of the i-th sentence and ``labels[i]`` is the
        list of tags for those tokens.

    Raises:
        OSError: If the file cannot be opened.
        UnicodeDecodeError: If the file is not valid UTF-8.
    """
    sentences = []
    labels = []
    # 'utf-8-sig' decodes plain UTF-8 exactly like 'utf-8' but also strips a
    # leading byte-order mark, which would otherwise be glued onto the first
    # token of the first sentence.
    with open(file_path, 'r', encoding='utf-8-sig') as f:
        tokens, tags = [], []
        for line in f:
            splits = line.split()
            if not splits:
                # Blank line: close the sentence collected so far, if any.
                if tokens:
                    sentences.append(tokens)
                    labels.append(tags)
                    tokens, tags = [], []
                continue
            if len(splits) >= 2:
                tokens.append(splits[0])
                tags.append(splits[1])
        # The last sentence may not be followed by a blank line.
        if tokens:
            sentences.append(tokens)
            labels.append(tags)
    return sentences, labels

  from .autonotebook import tqdm as notebook_tqdm


In [1]:
# --- Change this to your own file path ---
file_path = r"C:\Users\Administrator\Desktop\Project\bio_dataset_cleaned.txt"
sentences, labels = read_bio_file(file_path)

# Load the BERT tokenizer; encode_examples relies on its word_ids() output.
tokenizer = BertTokenizerFast.from_pretrained('bert-base-uncased')

# Build the tag <-> id lookup tables from every tag seen in the dataset.
# Sorting the unique tags gives each tag the same id on every run.
label_list = sorted({tag for tag_seq in labels for tag in tag_seq})
label2id = dict(zip(label_list, range(len(label_list))))
id2label = dict(enumerate(label_list))
num_labels = len(label_list)

def encode_examples(sentences, labels, max_length=128, ignore_label_id=None):
    """Tokenize pre-split sentences and align word-level tags to sub-word tokens.

    Uses the module-level ``tokenizer`` and ``label2id``.  Every sentence is
    tokenized on its own and padded/truncated to ``max_length`` tokens.  Only
    the first sub-word piece of each word receives that word's tag; every
    other position (positions with no source word, such as special tokens and
    padding, and the continuation pieces of a word) receives a filler label.

    Args:
        sentences: List of sentences, each a list of word strings.
        labels: List of tag sequences parallel to ``sentences``.
        max_length: Token length every encoded sentence is padded or
            truncated to.
        ignore_label_id: Label id written to positions that do not start a
            word.  ``None`` (the default) keeps the original behaviour of
            using ``label2id["O"]``.  Pass ``-100`` (the default
            ``ignore_index`` of ``torch.nn.CrossEntropyLoss``) to exclude
            those positions from the loss instead of teaching the model to
            predict "O" for them.

    Returns:
        A ``(input_ids, attention_masks, label_ids)`` tuple of three lists,
        each holding one 1-D tensor per sentence.

    Raises:
        KeyError: If a tag is missing from ``label2id``, or if "O" is missing
            while ``ignore_label_id`` is ``None``.
        IndexError: If a tag sequence is shorter than its sentence.
    """
    input_ids = []
    attention_masks = []
    label_ids = []

    for sent, label_seq in zip(sentences, labels):
        encoding = tokenizer(
            sent,
            is_split_into_words=True,
            truncation=True,
            padding='max_length',
            max_length=max_length,
            return_tensors='pt',
        )

        # Resolved per sentence rather than once up front, so that an empty
        # input never touches label2id (same as before this parameter existed).
        filler_id = label2id["O"] if ignore_label_id is None else ignore_label_id

        aligned_labels = []
        prev_word_idx = None
        for word_idx in encoding.word_ids(batch_index=0):
            # A position carries a real tag only when it opens a new word;
            # word_idx is None where the token maps to no source word.
            starts_word = word_idx is not None and word_idx != prev_word_idx
            if starts_word:
                aligned_labels.append(label2id[label_seq[word_idx]])
            else:
                aligned_labels.append(filler_id)
            prev_word_idx = word_idx

        # return_tensors='pt' adds a batch dimension of size 1; drop it.
        input_ids.append(encoding['input_ids'][0])
        attention_masks.append(encoding['attention_mask'][0])
        label_ids.append(torch.tensor(aligned_labels))

    return input_ids, attention_masks, label_ids

NameError: name 'read_bio_file' is not defined