#BIO 포맷으로 변환

In [1]:
import json
from transformers import AutoTokenizer

# 1. Load the BioBERT tokenizer
tokenizer = AutoTokenizer.from_pretrained("dmis-lab/biobert-base-cased-v1.1")

# 2. File paths: input is read line by line as JSON, output is written as JSONL
input_path = "Database/NER/gpt4_ner_results.jsonl"
output_path = "Database/NER/ner_bio_format_results.jsonl"

# 3. Entity tag types to use; convert_to_bio iterates over them in this order
entity_labels = ["INGREDIENT", "SYMPTOM", "DOSAGE", "SENSITIVE_CONDITION", "PERSONAL_INFO"]

# 4. BIO tagging function
def convert_to_bio(text, entities, *, tokenize=None, entity_types=None):
    """Tokenize ``text`` and assign one BIO label to every token.

    Each entity phrase is tokenized with the same tokenizer as the text and
    located by exact token-sequence match. Only the first occurrence of a
    phrase is labeled. Entity types are processed in order, so a later match
    overwrites labels written by an earlier one when the spans overlap.

    Args:
        text: Sentence to tokenize and label.
        entities: Mapping of entity type -> list of phrases found in ``text``.
            Types missing from the mapping are treated as having no phrases.
        tokenize: Optional callable ``str -> list of tokens``. Defaults to the
            module-level ``tokenizer.tokenize``.
        entity_types: Optional iterable of entity type names to look up.
            Defaults to the module-level ``entity_labels``.

    Returns:
        ``{"tokens": [...], "labels": [...]}`` with both lists the same length.
    """
    if tokenize is None:
        tokenize = tokenizer.tokenize
    if entity_types is None:
        entity_types = entity_labels

    tokens = tokenize(text)
    labels = ['O'] * len(tokens)

    for ent_type in entity_types:
        for phrase in entities.get(ent_type, []):
            phrase_tokens = tokenize(phrase)
            span = len(phrase_tokens)
            # A phrase that tokenizes to nothing (e.g. "") would match the
            # empty slice at index 0 and mislabel the first token, so skip it.
            if span == 0:
                continue
            for start in range(len(tokens) - span + 1):
                if tokens[start:start + span] == phrase_tokens:
                    labels[start] = f'B-{ent_type}'
                    labels[start + 1:start + span] = [f'I-{ent_type}'] * (span - 1)
                    break  # label only the first occurrence

    return {"tokens": tokens, "labels": labels}

# 5. Read the input JSONL and convert every usable record
bio_data = []
with open(input_path, "r", encoding="utf-8") as f:
    for line in f:
        # Blank lines (e.g. a trailing newline) are not records; skip them
        # instead of reporting a spurious JSON parse error.
        if not line.strip():
            continue
        try:
            item = json.loads(line)
            # Records lacking either field are skipped silently.
            if "text" in item and "entities" in item:
                bio_entry = convert_to_bio(item["text"], item["entities"])
                bio_data.append(bio_entry)
        except Exception as e:
            # Best effort: report the bad line and continue with the rest.
            print(f"Error parsing line: {e}")

# 6. Save as BIO JSONL (one JSON object per line).
# The file is flushed when the `with` block closes it, so no per-line flush.
with open(output_path, "w", encoding="utf-8") as f:
    for item in bio_data:
        f.write(json.dumps(item, ensure_ascii=False) + "\n")

print(f"✅ BIO 포맷 변환 완료! 저장 경로: {output_path}")

✅ BIO 포맷 변환 완료! 저장 경로: Database/NER/ner_bio_format_results.jsonl


# finetuning


In [3]:
# Install seqeval; the training cell imports its metrics from seqeval.metrics
!pip install seqeval



In [1]:
!pip install torchcrf
# !pip install torch



In [3]:
# Print the path of the Python interpreter running this code
import sys
print(sys.executable)

/opt/anaconda3/bin/python


In [10]:
import sys
!{sys.executable} -m pip install torchcrf



In [2]:
import pip
print(pip.__version__)
!{sys.executable} -m pip list | grep torchcrf

24.2
zsh:1: parse error near `-m'


In [None]:
# 1. 라이브러리 임포트
import json
import torch
from sklearn.model_selection import train_test_split
from transformers import (
    AutoTokenizer, AutoModel, Trainer, TrainingArguments,
    DataCollatorForTokenClassification, EarlyStoppingCallback
)
from torch.utils.data import Dataset
from seqeval.metrics import classification_report, f1_score, precision_score, recall_score
from torch import nn
from torchcrf import CRF



ModuleNotFoundError: No module named 'torchcrf'

In [None]:
# 2. Load the BIO-formatted records, one JSON object per line
data = []
with open("Database/NER/ner_bio_format_results.jsonl", "r", encoding="utf-8") as f:
    for line in f:
        data.append(json.loads(line))

# 3. Label inventory and label -> id mapping.
# "O" is id 0; every B-* id is odd and its I-* partner is the next integer.
label_list = [
    "O",
    "B-INGREDIENT", "I-INGREDIENT",
    "B-DOSAGE", "I-DOSAGE",
    "B-SYMPTOM", "I-SYMPTOM",
    "B-SENSITIVE_CONDITION", "I-SENSITIVE_CONDITION",
    "B-PERSONAL_INFO", "I-PERSONAL_INFO",
]
label_to_id = dict(zip(label_list, range(len(label_list))))

# 4. Tokenizer
tokenizer = AutoTokenizer.from_pretrained("dmis-lab/biobert-base-cased-v1.1")

# 5. Train/test split (80/20, fixed seed)
train_data, test_data = train_test_split(data, random_state=42, test_size=0.2)

# 6. Encoding + label alignment
def encode_and_align_labels(data):
    """Encode token lists and expand their labels to one id per sub-token.

    Relies on the layout of the module-level ``label_list``: "O" is id 0,
    every ``B-*`` id is odd, and the matching ``I-*`` id is the next integer.

    NOTE(review): the stored tokens appear to be the output of
    ``tokenizer.tokenize`` from the BIO conversion step, so encoding them with
    ``is_split_into_words=True`` tokenizes them a second time. Confirm this is
    intended.

    Args:
        data: List of records, each with a ``'tokens'`` list and a parallel
            ``'labels'`` list of BIO tag strings.

    Returns:
        ``(encodings, labels_aligned)``: the padded and truncated tokenizer
        output, plus one list of label ids per record. Positions with no
        source word (special tokens, padding) are labeled -100.
    """
    tokens = [d['tokens'] for d in data]
    labels = [[label_to_id[tag] for tag in d['labels']] for d in data]
    # Offsets are not needed: alignment uses word_ids() only.
    encodings = tokenizer(tokens, is_split_into_words=True, padding=True, truncation=True)

    labels_aligned = []
    for i, label in enumerate(labels):
        previous_word_idx = None
        label_ids = []
        for word_idx in encodings.word_ids(batch_index=i):
            if word_idx is None:
                label_ids.append(-100)
            elif word_idx != previous_word_idx:
                # First sub-token of a word keeps the word's own label.
                label_ids.append(label[word_idx])
            else:
                # Continuation sub-token: B-X (odd id) becomes I-X (id + 1);
                # O and I-X (even ids) are carried over unchanged.
                tag_id = label[word_idx]
                label_ids.append(tag_id + 1 if tag_id % 2 == 1 else tag_id)
            previous_word_idx = word_idx
        labels_aligned.append(label_ids)
    return encodings, labels_aligned

# Encode both splits; each call pads to the longest sequence within its own split
train_encodings, train_labels = encode_and_align_labels(train_data)
test_encodings, test_labels = encode_and_align_labels(test_data)

# 7. Dataset definition
class NERDataset(Dataset):
    """Dataset pairing tokenizer encodings with their aligned label ids.

    ``encodings`` is a mapping of field name -> per-example sequences and
    ``labels`` is a list with one label-id sequence per example.
    """

    def __init__(self, encodings, labels):
        self.encodings = encodings
        self.labels = labels

    def __len__(self):
        # The number of examples is defined by the label list.
        return len(self.labels)

    def __getitem__(self, idx):
        # Build one example: every encoding field plus its labels, as tensors.
        sample = {}
        for field, column in self.encodings.items():
            sample[field] = torch.tensor(column[idx])
        sample['labels'] = torch.tensor(self.labels[idx])
        return sample

# Wrap the encoded splits so they can be passed to the Trainer
train_dataset = NERDataset(train_encodings, train_labels)
test_dataset = NERDataset(test_encodings, test_labels)

# 8. CRF model definition
class BioBERT_CRF(nn.Module):
    """Pretrained encoder + linear emission layer + CRF for token tagging.

    Args:
        model_name: Checkpoint name passed to ``AutoModel.from_pretrained``.
        num_labels: Number of tag classes (size of the emission layer / CRF).
    """

    def __init__(self, model_name, num_labels):
        super().__init__()
        self.bert = AutoModel.from_pretrained(model_name)
        self.dropout = nn.Dropout(0.1)
        self.classifier = nn.Linear(self.bert.config.hidden_size, num_labels)
        self.crf = CRF(num_labels, batch_first=True)

    def forward(self, input_ids, attention_mask=None, labels=None, token_type_ids=None):
        """Run the encoder and either score ``labels`` or decode tags.

        Args:
            input_ids: Token id tensor for the batch.
            attention_mask: Optional mask. When omitted, every position is
                treated as a real token.
            labels: Optional tag ids; ``-100`` marks positions without a label.
            token_type_ids: Optional segment ids. Accepted because the dataset
                yields every tokenizer field and all of them are passed to
                ``forward``; without this parameter the call raises TypeError.

        Returns:
            With ``labels``: ``{"loss": negative mean CRF log-likelihood,
            "logits": emission scores}``. Without: ``{"logits": decoded tag
            sequences}`` as returned by ``CRF.decode``.
        """
        if attention_mask is None:
            attention_mask = torch.ones_like(input_ids)
        outputs = self.bert(
            input_ids=input_ids,
            attention_mask=attention_mask,
            token_type_ids=token_type_ids,
        )
        sequence_output = self.dropout(outputs[0])
        emissions = self.classifier(sequence_output)
        mask = attention_mask.bool()
        if labels is not None:
            # The CRF indexes its transition tables with the tag ids, so the
            # -100 sentinel is out of range. Replace it with 0 (the id of "O"
            # in label_list): padding is excluded by `mask`, and attended
            # special tokens are scored as "O".
            tags = labels.masked_fill(labels == -100, 0)
            loss = -self.crf(emissions, tags, mask=mask, reduction='mean')
            return {"loss": loss, "logits": emissions}
        else:
            pred = self.crf.decode(emissions, mask=mask)
            return {"logits": pred}

# Instantiate the tagger with one output class per entry in label_list
model = BioBERT_CRF("dmis-lab/biobert-base-cased-v1.1", num_labels=len(label_list))

# 9. Evaluation metrics
def compute_metrics(p):
    """Compute entity-level precision, recall and F1 with seqeval.

    ``p`` unpacks into ``(predictions, labels)``. Predictions whose first
    element is already a Python int are used as tag ids directly; anything
    else is reduced with ``argmax`` over the last axis. Positions whose gold
    label is -100 are dropped from both sequences. Prints the
    classification report as a side effect.
    """
    predictions, labels = p
    already_decoded = isinstance(predictions[0][0], int)  # CRF decode output
    decoded_preds = predictions if already_decoded else predictions.argmax(axis=-1)

    true_labels = []
    true_preds = []
    for pred_row, gold_row in zip(decoded_preds, labels):
        # Keep only the positions that carry a real gold label.
        kept = [(p_i, l_i) for p_i, l_i in zip(pred_row, gold_row) if l_i != -100]
        true_preds.append([label_list[p_i] for p_i, _ in kept])
        true_labels.append([label_list[l_i] for _, l_i in kept])

    print("\n" + classification_report(true_labels, true_preds))

    scores = {}
    scores["precision"] = precision_score(true_labels, true_preds)
    scores["recall"] = recall_score(true_labels, true_preds)
    scores["f1"] = f1_score(true_labels, true_preds)
    return scores

# 10. Training configuration (with early stopping)
# Evaluation and checkpointing both run once per epoch, and the checkpoint
# with the highest "f1" from compute_metrics is reloaded when training ends.
# NOTE(review): recent transformers releases renamed `evaluation_strategy` to
# `eval_strategy` — confirm against the installed version.
training_args = TrainingArguments(
    output_dir="./biobert_ner_output",
    num_train_epochs=20,
    per_device_train_batch_size=8,
    per_device_eval_batch_size=8,
    evaluation_strategy="epoch",
    save_strategy="epoch",
    logging_dir="./logs",
    report_to="none",
    load_best_model_at_end=True,
    metric_for_best_model="f1",
    greater_is_better=True
)

# 11. Trainer definition + EarlyStopping
# The held-out test split is also the evaluation set that drives early
# stopping (patience of 3 evaluations) and best-model selection.
# NOTE(review): recent transformers releases deprecate `tokenizer=` in favor
# of `processing_class=` — confirm against the installed version.
trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=train_dataset,
    eval_dataset=test_dataset,
    tokenizer=tokenizer,
    data_collator=DataCollatorForTokenClassification(tokenizer),
    compute_metrics=compute_metrics,
    callbacks=[EarlyStoppingCallback(early_stopping_patience=3)]
)

# 12. Train and save
trainer.train()
trainer.save_model("./biobert_ner_model")
print("✅ 학습 및 모델 저장 완료")

# 13. Prediction post-processing example
predictions = trainer.predict(test_dataset).predictions
# Predictions that are already Python ints are used as tag ids; otherwise
# take the argmax over the last axis.
decoded_preds = predictions if isinstance(predictions[0][0], int) else predictions.argmax(axis=-1)

print("\n🎯 예측 결과 예시")
# Slicing caps the preview at three sentences without raising IndexError
# when the test split holds fewer than three.
for i, (pred_row, gold_row) in enumerate(zip(decoded_preds[:3], test_labels[:3])):
    print(f"\n[문장 {i+1}]")
    result = []
    for p_i, l_i in zip(pred_row, gold_row):
        # Skip positions without a gold label (special tokens, padding).
        if l_i != -100:
            result.append(f"PRED: {label_list[p_i]:<25} | TRUE: {label_list[l_i]}")
    print("\n".join(result))

ModuleNotFoundError: No module named 'torchcrf'

In [5]:
import json

def merge_tokens(tokens):
    """Join word-piece tokens back into a single whitespace-stripped string.

    A token starting with "##" is glued to the preceding text with the prefix
    removed. Any other token is preceded by a single space unless nothing has
    been accumulated yet.
    """
    pieces = []
    has_text = False  # True once the accumulated text is non-empty
    for tok in tokens:
        if tok.startswith("##"):
            fragment = tok[2:]
        elif has_text:
            fragment = " " + tok
        else:
            fragment = tok
        pieces.append(fragment)
        has_text = has_text or bool(fragment)
    return "".join(pieces).strip()

def recover_entities(tokens, labels):
    """Group BIO-labeled tokens into entity strings, keyed by entity type.

    A ``B-X`` label opens a span of type ``X``, and an ``I-X`` label extends
    an open span of that same type. Any other label closes the open span; an
    ``I-`` token that does not continue the open span is itself discarded.
    Each completed span is joined with ``merge_tokens``.

    Returns:
        Dict with the five entity-type keys, each mapping to the list of
        recovered entity strings in order of appearance.

    Raises:
        KeyError: If a span's type is not one of the five known keys.
    """
    entities = {
        name: []
        for name in (
            "INGREDIENT",
            "SYMPTOM",
            "DOSAGE",
            "SENSITIVE_CONDITION",
            "PERSONAL_INFO",
        )
    }
    pending = []         # tokens of the span currently being built
    pending_type = None  # entity type of that span

    for token, label in zip(tokens, labels):
        if label != "O" and not label.startswith("B-"):
            if label.startswith("I-") and label[2:] == pending_type:
                pending.append(token)
                continue
        # Every other label terminates whatever span is open.
        if pending:
            entities[pending_type].append(merge_tokens(pending))
        if label != "O" and label.startswith("B-"):
            pending, pending_type = [token], label[2:]
        else:
            pending, pending_type = [], None

    # Flush a span that runs to the end of the sequence.
    if pending:
        entities[pending_type].append(merge_tokens(pending))

    return entities

def convert_bio_json_to_rag_jsonl(input_path, output_path="Database/NER/metadata_converted.jsonl"):
    """Convert a BIO JSONL file into ``{"text", "meta"}`` JSONL records.

    Every input line must be a JSON object with ``"tokens"`` and ``"labels"``.
    ``text`` is the token list re-joined by ``merge_tokens``; ``meta`` holds
    the entities rebuilt by ``recover_entities``. All records are converted
    before the output file is opened. A confirmation message is printed once
    the file has been written.
    """
    converted = []
    with open(input_path, "r", encoding="utf-8") as src:
        for raw in src:
            record = json.loads(raw)
            toks = record["tokens"]
            tags = record["labels"]
            merged_text = merge_tokens(toks)
            converted.append({"text": merged_text, "meta": recover_entities(toks, tags)})

    with open(output_path, "w", encoding="utf-8") as dst:
        dst.writelines(
            json.dumps(row, ensure_ascii=False) + "\n" for row in converted
        )

    print(f"✅ 변환 완료: {output_path}")

In [6]:
# Convert the BIO file to text + entity metadata, written to the default output path
convert_bio_json_to_rag_jsonl("Database/NER/ner_bio_format_results.jsonl")

✅ 변환 완료: Database/NER/metadata_converted.jsonl
