In [31]:
def read_bio_file(file_path):
    """Parse a whitespace-separated BIO file into parallel token / tag lists.

    Each non-blank line contributes its first field as the token and its
    second field as the tag; lines with fewer than two fields are skipped.
    A blank (or whitespace-only) line closes the current sentence.

    Args:
        file_path: path of a UTF-8 encoded text file.

    Returns:
        (sentences, labels): two lists of equal length, where sentences[i]
        is a list of tokens and labels[i] the matching list of tags.
    """
    all_tokens, all_tags = [], []
    cur_tokens, cur_tags = [], []

    def flush():
        # Close the sentence being built, if it holds at least one token.
        nonlocal cur_tokens, cur_tags
        if cur_tokens:
            all_tokens.append(cur_tokens)
            all_tags.append(cur_tags)
            cur_tokens, cur_tags = [], []

    with open(file_path, 'r', encoding='utf-8') as handle:
        for raw_line in handle:
            fields = raw_line.split()
            if not fields:
                # Blank line: sentence boundary.
                flush()
                continue
            if len(fields) < 2:
                # Malformed line without a tag column: ignored.
                continue
            cur_tokens.append(fields[0])
            cur_tags.append(fields[1])

    # The file may end without a trailing blank line.
    flush()
    return all_tokens, all_tags

# Usage: read the cleaned BIO dataset into parallel token / tag lists.
# NOTE(review): absolute, machine-specific path — consider a configurable data directory.
sentences, labels = read_bio_file(r"C:\Users\Administrator\Desktop\Project\bio_dataset_cleaned.txt")




In [32]:
from transformers import BertTokenizerFast
import torch

# Load the BERT tokenizer
tokenizer = BertTokenizerFast.from_pretrained('bert-base-uncased')

# Build the label <-> id mappings from every tag present in the data;
# sorting makes the id assignment deterministic across runs.
label_list = sorted({tag for tag_seq in labels for tag in tag_seq})
label2id = dict(zip(label_list, range(len(label_list))))
id2label = dict(enumerate(label_list))
num_labels = len(label_list)

def encode_examples(sentences, labels, max_length=128):
    """Tokenize pre-split sentences and align word-level tags to sub-word tokens.

    Relies on the module-level ``tokenizer`` and ``label2id``.

    Every position receives a real label id (never -100): the first sub-word
    of a word carries that word's tag, while positions whose word id is None
    and later sub-words of the same word are labelled ``"O"``.

    Args:
        sentences: iterable of token lists.
        labels: iterable of tag lists, parallel to ``sentences``.
        max_length: length each sequence is padded / truncated to.

    Returns:
        (input_ids, attention_masks, label_ids): three lists holding one
        tensor per sentence.
    """
    all_input_ids, all_masks, all_label_ids = [], [], []

    for words, word_tags in zip(sentences, labels):
        # One list element per word; the tokenizer splits words into sub-words.
        encoded = tokenizer(
            words,
            is_split_into_words=True,
            truncation=True,
            padding='max_length',
            max_length=max_length,
            return_tensors='pt',
        )

        tag_ids = []
        previous = None
        for current in encoded.word_ids(batch_index=0):
            # Only the first sub-word of a word keeps the word's own tag;
            # everything else is labelled 'O' (used here instead of -100).
            starts_word = current is not None and current != previous
            tag = word_tags[current] if starts_word else "O"
            tag_ids.append(label2id[tag])
            previous = current

        all_input_ids.append(encoded['input_ids'][0])
        all_masks.append(encoded['attention_mask'][0])
        all_label_ids.append(torch.tensor(tag_ids))

    return all_input_ids, all_masks, all_label_ids


In [3]:
# Tokenize every sentence and align its labels (default max_length).
input_ids, attention_masks, label_ids = encode_examples(sentences, labels)


In [4]:
from torch.utils.data import Dataset

class NERDataset(Dataset):
    """Map-style dataset over three parallel, index-aligned sequences.

    Item ``i`` is a dict with keys ``input_ids``, ``attention_mask`` and
    ``labels`` taken from position ``i`` of the respective sequence.
    """

    def __init__(self, input_ids, attention_masks, label_ids):
        self.input_ids, self.attention_masks, self.label_ids = (
            input_ids,
            attention_masks,
            label_ids,
        )

    def __len__(self):
        # The dataset size is defined by input_ids alone.
        return len(self.input_ids)

    def __getitem__(self, idx):
        return dict(
            input_ids=self.input_ids[idx],
            attention_mask=self.attention_masks[idx],
            labels=self.label_ids[idx],
        )


In [5]:
from torch.utils.data import DataLoader

# Wrap the encoded tensors in a Dataset object
dataset = NERDataset(input_ids, attention_masks, label_ids)

# Build the DataLoader (batch_size is adjustable); shuffling is enabled for training
train_loader = DataLoader(dataset, batch_size=8, shuffle=True)


In [None]:
import torch.nn as nn
from transformers import BertModel
from torchcrf import CRF

class BERT_CRF(nn.Module):
    """BERT encoder followed by a linear emission layer and a CRF.

    Args:
        bert_model_name: name or path passed to ``BertModel.from_pretrained``.
        num_labels: number of tags scored by the classifier and the CRF.
    """

    def __init__(self, bert_model_name, num_labels):
        super().__init__()
        encoder = BertModel.from_pretrained(bert_model_name)
        self.bert = encoder
        self.dropout = nn.Dropout(0.1)
        self.classifier = nn.Linear(encoder.config.hidden_size, num_labels)
        # The CRF is created sequence-first (batch_first=False), so forward()
        # has to move the sequence axis to the front of every tensor it passes in.
        self.crf = CRF(num_labels, batch_first=False)

    def forward(self, input_ids, attention_mask, labels=None):
        """Return the CRF loss when ``labels`` is given, else the decoded tags.

        With labels: the negated value returned by the CRF layer using
        ``reduction='mean'``. Without labels: whatever ``self.crf.decode``
        returns for the masked emissions.
        """
        hidden = self.bert(
            input_ids=input_ids, attention_mask=attention_mask
        ).last_hidden_state

        # [batch, seq_len, num_labels] -> [seq_len, batch, num_labels]
        emissions = self.classifier(self.dropout(hidden)).permute(1, 0, 2)
        # [batch, seq_len] -> [seq_len, batch], as booleans
        crf_mask = attention_mask.permute(1, 0).bool()

        if labels is None:
            return self.crf.decode(emissions, mask=crf_mask)

        # Labels need the same sequence-first layout as the emissions.
        seq_first_labels = labels.permute(1, 0)
        return -self.crf(emissions, seq_first_labels, mask=crf_mask, reduction='mean')




In [16]:
import torch

# The number of labels is taken from label2id
num_labels = len(label2id)

# Use the GPU when one is available, otherwise fall back to the CPU
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
model = BERT_CRF('bert-base-uncased', num_labels)
model.to(device)



BERT_CRF(
  (bert): BertModel(
    (embeddings): BertEmbeddings(
      (word_embeddings): Embedding(30522, 768, padding_idx=0)
      (position_embeddings): Embedding(512, 768)
      (token_type_embeddings): Embedding(2, 768)
      (LayerNorm): LayerNorm((768,), eps=1e-12, elementwise_affine=True)
      (dropout): Dropout(p=0.1, inplace=False)
    )
    (encoder): BertEncoder(
      (layer): ModuleList(
        (0-11): 12 x BertLayer(
          (attention): BertAttention(
            (self): BertSdpaSelfAttention(
              (query): Linear(in_features=768, out_features=768, bias=True)
              (key): Linear(in_features=768, out_features=768, bias=True)
              (value): Linear(in_features=768, out_features=768, bias=True)
              (dropout): Dropout(p=0.1, inplace=False)
            )
            (output): BertSelfOutput(
              (dense): Linear(in_features=768, out_features=768, bias=True)
              (LayerNorm): LayerNorm((768,), eps=1e-12, elementwise_affi

In [9]:
from torch.optim import AdamW


# Set up the optimizer
optimizer = AdamW(model.parameters(), lr=2e-5)

# Number of training epochs
epochs = 3


In [10]:
from tqdm import tqdm

def train(model, dataloader, optimizer, device):
    """Run one training epoch and return the mean per-batch loss.

    Args:
        model: module that returns a loss tensor when called with
            ``input_ids``, ``attention_mask`` and ``labels`` keyword arguments.
        dataloader: iterable of batches; each batch is a mapping with the keys
            ``'input_ids'``, ``'attention_mask'`` and ``'labels'`` whose values
            support ``.to(device)``. It does not need to implement ``len()``.
        optimizer: optimizer already bound to the model's parameters.
        device: device the batch tensors are moved to.

    Returns:
        float: sum of the batch losses divided by the number of batches seen,
        or ``0.0`` when the dataloader yields no batch at all.
    """
    model.train()
    total_loss = 0.0
    num_batches = 0  # counted manually so iterables without __len__ work too

    for batch in tqdm(dataloader, desc="Training"):
        input_ids = batch['input_ids'].to(device)
        attention_mask = batch['attention_mask'].to(device)
        labels = batch['labels'].to(device)

        optimizer.zero_grad()
        # Reduce to a scalar once; .mean() is a no-op on an already-scalar loss
        # and collapses a per-replica / per-example loss vector otherwise.
        loss = model(input_ids=input_ids, attention_mask=attention_mask, labels=labels).mean()
        loss.backward()
        optimizer.step()

        total_loss += loss.item()
        num_batches += 1

    # Avoid ZeroDivisionError on an empty dataloader.
    if num_batches == 0:
        return 0.0
    return total_loss / num_batches


In [11]:
# Train for the configured number of epochs, printing the average loss of each.
for epoch in range(epochs):
    print(f"Epoch {epoch+1}/{epochs}")
    avg_loss = train(model, train_loader, optimizer, device)
    print(f"Average Loss: {avg_loss:.4f}")


Epoch 1/3


  return forward_call(*args, **kwargs)
Training: 100%|██████████| 95/95 [04:21<00:00,  2.76s/it]


Average Loss: 4.5374
Epoch 2/3


Training: 100%|██████████| 95/95 [04:36<00:00,  2.91s/it]


Average Loss: 2.3601
Epoch 3/3


Training: 100%|██████████| 95/95 [04:33<00:00,  2.88s/it]

Average Loss: 1.6064





In [30]:
def predict(model, dataloader, device):
    """Decode every batch and collect predicted and gold label-id sequences.

    Args:
        model: callable in eval mode that, given ``input_ids`` and
            ``attention_mask`` keyword arguments, returns one sequence of
            predicted label ids per example.
        dataloader: iterable of batches; each batch is a mapping with the keys
            ``"input_ids"``, ``"attention_mask"`` and ``"labels"``.
        device: device the model inputs are moved to.

    Returns:
        (all_preds, all_labels): two lists with one entry per example.
        ``all_preds[i]`` is the predicted id list as returned by the model;
        ``all_labels[i]`` holds the gold ids at the positions where the
        attention mask is non-zero.

    NOTE(review): the two lists line up position by position only if the model
    returns exactly one prediction per unmasked position (presumably what a
    masked CRF ``decode`` does) — confirm for other models.
    """
    model.eval()
    all_preds = []
    all_labels = []

    with torch.no_grad():
        for batch in dataloader:
            input_ids = batch["input_ids"].to(device)
            attention_mask = batch["attention_mask"].to(device)

            decoded = model(input_ids=input_ids, attention_mask=attention_mask)

            # Gold labels are only indexed on the CPU, so they never need to
            # be moved to the model's device.
            keep = attention_mask.bool().cpu()
            gold = batch["labels"].cpu()

            for pred_seq, gold_row, keep_row in zip(decoded, gold, keep):
                all_preds.append([int(tag_id) for tag_id in pred_seq])
                # Drop padded positions so gold matches the decoded length.
                all_labels.append(gold_row[keep_row].tolist())

    return all_preds, all_labels






In [29]:
print(len(train_loader))  # check that the loader actually contains data


95


In [24]:
from seqeval.metrics import classification_report, f1_score
from seqeval.scheme import IOB2

def evaluate(preds, trues, id2label):
    """Print a seqeval classification report and F1 score for id sequences.

    Both the report and the F1 score are computed in strict mode with the
    IOB2 scheme, so the two printed results are consistent with each other.

    Args:
        preds: list of predicted label-id sequences, one per example.
        trues: list of gold label-id sequences, parallel to ``preds``.
        id2label: mapping from label id to tag string.

    Raises:
        ValueError: if ``preds`` or ``trues`` is empty — there is nothing to
            score, and the metric functions would fail with an unrelated
            message.
        KeyError: if a sequence contains an id missing from ``id2label``.
    """
    if len(preds) == 0 or len(trues) == 0:
        raise ValueError(
            "evaluate() received no sequences: preds and trues must be non-empty"
        )

    # Convert the id sequences into tag-string sequences.
    preds_label = [[id2label[idx] for idx in seq] for seq in preds]
    trues_label = [[id2label[idx] for idx in seq] for seq in trues]

    print("📊 分类报告:")
    print(classification_report(trues_label, preds_label, mode='strict', scheme=IOB2))
    # Same mode/scheme as the report above; the defaults use a different matching mode.
    f1 = f1_score(trues_label, preds_label, mode='strict', scheme=IOB2)
    print(f"F1-score: {f1:.4f}")


In [25]:
# Assumes the evaluation is run on the training set
preds, trues = predict(model, train_loader, device)
evaluate(preds, trues, id2label)


ValueError: not enough values to unpack (expected 3, got 2)

In [1]:
import pickle

# NOTE(review): pickle.load can execute arbitrary code — only open files from a trusted source.
with open("ner_data.pkl", "rb") as f:
    ner_data = pickle.load(f)
print(type(ner_data))
print(ner_data.keys())  # if it is a dict
print(len(ner_data))    # if it is a list


<class 'dict'>
dict_keys(['input_ids', 'attention_mask', 'label_ids', 'tag2id', 'id2tag'])
5
