In [8]:
# pip install transformers
%pip install python-dotenv openai pandas tqdm

Collecting python-dotenv
  Using cached python_dotenv-1.1.0-py3-none-any.whl.metadata (24 kB)
Collecting openai
  Downloading openai-1.82.0-py3-none-any.whl.metadata (25 kB)
Collecting anyio<5,>=3.5.0 (from openai)
  Using cached anyio-4.9.0-py3-none-any.whl.metadata (4.7 kB)
Collecting distro<2,>=1.7.0 (from openai)
  Using cached distro-1.9.0-py3-none-any.whl.metadata (6.8 kB)
Collecting httpx<1,>=0.23.0 (from openai)
  Using cached httpx-0.28.1-py3-none-any.whl.metadata (7.1 kB)
Collecting jiter<1,>=0.4.0 (from openai)
  Using cached jiter-0.10.0-cp311-cp311-macosx_11_0_arm64.whl.metadata (5.2 kB)
Collecting pydantic<3,>=1.9.0 (from openai)
  Downloading pydantic-2.11.5-py3-none-any.whl.metadata (67 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m67.2/67.2 kB[0m [31m4.0 MB/s[0m eta [36m0:00:00[0m
[?25hCollecting sniffio (from openai)
  Using cached sniffio-1.3.1-py3-none-any.whl.metadata (3.9 kB)
Collecting httpcore==1.* (from httpx<1,>=0.23.0->openai)
  Us

# Sentence classification

In [4]:
## 분류된 정답 데이터 기반 fine-tuning

In [2]:
import torch
from transformers import AutoTokenizer, AutoModelForSequenceClassification
from torch.utils.data import Dataset, DataLoader
from sklearn.model_selection import train_test_split
import numpy as np
import pandas as pd

# 1. Data preparation
class TextDataset(Dataset):
    """Map-style dataset that tokenizes one text per item for sequence classification.

    Each item is a dict with ``input_ids`` and ``attention_mask`` (the tokenizer's
    tensors flattened to 1-D) and ``labels`` (a scalar ``torch.long`` tensor).
    """

    def __init__(self, texts, labels, tokenizer, max_length=512):
        self.texts, self.labels = texts, labels
        self.tokenizer = tokenizer
        self.max_length = max_length

    def __len__(self):
        return len(self.texts)

    def __getitem__(self, idx):
        raw_text = str(self.texts[idx])
        target = self.labels[idx]

        # Ask the tokenizer to pad/truncate every sample to ``max_length`` tokens.
        tokenizer_kwargs = dict(
            add_special_tokens=True,
            max_length=self.max_length,
            padding='max_length',
            truncation=True,
            return_tensors='pt',
        )
        encoded = self.tokenizer(raw_text, **tokenizer_kwargs)

        # The tokenizer output is flattened so each field is a 1-D tensor.
        sample = {key: encoded[key].flatten() for key in ('input_ids', 'attention_mask')}
        sample['labels'] = torch.tensor(target, dtype=torch.long)
        return sample


In [3]:
# 2. Model setup
def setup_model(num_labels, model_name="dmis-lab/biobert-v1.1"):
    """Load a pretrained checkpoint with a sequence-classification head, plus its tokenizer.

    Args:
        num_labels: Number of output classes for the classification head.
        model_name: Hugging Face model id or local path. Defaults to the BioBERT
            checkpoint this function used before the parameter existed.

    Returns:
        Tuple ``(model, tokenizer)`` loaded from ``model_name``.
    """
    # NOTE(review): other cells in this notebook load "dmis-lab/biobert-base-cased-v1.1";
    # confirm which checkpoint id is intended and pass it explicitly if they should match.
    tokenizer = AutoTokenizer.from_pretrained(model_name)
    model = AutoModelForSequenceClassification.from_pretrained(
        model_name,
        num_labels=num_labels,
    )
    return model, tokenizer

In [33]:
from tqdm import tqdm
from sklearn.metrics import f1_score
from collections import defaultdict
import numpy as np

def train_model(model, train_loader, val_loader, device, num_epochs=10, label_names=None,
                save_path='/kaggle/working/best_biobert_model.pt'):
    """Fine-tune ``model`` and save the weights of the epoch with the best validation accuracy.

    Args:
        model: Model whose call accepts ``input_ids``, ``attention_mask`` and ``labels``
            and returns an object exposing ``.loss`` and ``.logits``.
        train_loader: Sized iterable of training batches (dicts holding ``input_ids``,
            ``attention_mask`` and ``labels`` tensors).
        val_loader: Sized iterable of validation batches in the same format.
        device: Device the model and every batch are moved to.
        num_epochs: Number of passes over ``train_loader``.
        label_names: Optional display names indexed by class id; classes without a
            name are printed as ``Class <id>``.
        save_path: File the best ``state_dict`` is written to. Defaults to the
            Kaggle working-directory path this function used before the parameter existed.

    Returns:
        None. Per-epoch metrics are printed and the best weights are saved to ``save_path``.
    """
    # The loss comes from the model itself (labels are passed to the forward call),
    # so no separate criterion is needed.
    optimizer = torch.optim.AdamW(model.parameters(), lr=2e-5)

    model.to(device)
    best_val_accuracy = 0.0  # best validation accuracy seen so far

    for epoch in range(num_epochs):
        # ---- training ----
        model.train()
        train_loss = 0
        train_correct = 0
        train_total = 0

        train_pbar = tqdm(train_loader, desc=f'Epoch {epoch+1}/{num_epochs} [Train]')
        for batch in train_pbar:
            input_ids = batch['input_ids'].to(device)
            attention_mask = batch['attention_mask'].to(device)
            labels = batch['labels'].to(device)

            optimizer.zero_grad()
            outputs = model(input_ids=input_ids, attention_mask=attention_mask, labels=labels)
            loss = outputs.loss
            loss.backward()
            optimizer.step()

            train_loss += loss.item()
            predictions = torch.argmax(outputs.logits, dim=1)
            train_correct += (predictions == labels).sum().item()
            train_total += labels.size(0)

            train_pbar.set_postfix({
                'loss': f'{loss.item():.4f}',
                'acc': f'{(train_correct/train_total)*100:.2f}%'
            })

        # ---- validation ----
        model.eval()
        val_loss = 0
        val_correct = 0
        val_total = 0
        all_preds = []
        all_labels = []

        correct_per_class = defaultdict(int)
        total_per_class = defaultdict(int)

        val_pbar = tqdm(val_loader, desc=f'Epoch {epoch+1}/{num_epochs} [Val]')
        with torch.no_grad():
            for batch in val_pbar:
                input_ids = batch['input_ids'].to(device)
                attention_mask = batch['attention_mask'].to(device)
                labels = batch['labels'].to(device)

                outputs = model(input_ids=input_ids, attention_mask=attention_mask, labels=labels)
                val_loss += outputs.loss.item()

                predictions = torch.argmax(outputs.logits, dim=1)
                val_correct += (predictions == labels).sum().item()
                val_total += labels.size(0)

                # Copy to host once per batch; plain Python ints serve as dict keys
                # and as input to the F1 computation.
                batch_preds = predictions.cpu().tolist()
                batch_labels = labels.cpu().tolist()
                all_preds.extend(batch_preds)
                all_labels.extend(batch_labels)

                for true_label, pred_label in zip(batch_labels, batch_preds):
                    total_per_class[true_label] += 1
                    if true_label == pred_label:
                        correct_per_class[true_label] += 1

                val_pbar.set_postfix({
                    'loss': f'{outputs.loss.item():.4f}',
                    'acc': f'{(val_correct/val_total)*100:.2f}%'
                })

        # ---- epoch summary (guarded so an empty loader does not divide by zero) ----
        train_accuracy = (train_correct / train_total) * 100 if train_total else 0.0
        val_accuracy = (val_correct / val_total) * 100 if val_total else 0.0
        val_f1 = f1_score(all_labels, all_preds, average='macro') if all_labels else 0.0

        print(f'\nEpoch {epoch+1} 결과:')
        print(f'Training Loss: {train_loss/max(len(train_loader), 1):.4f}, Accuracy: {train_accuracy:.2f}%')
        print(f'Validation Loss: {val_loss/max(len(val_loader), 1):.4f}, Accuracy: {val_accuracy:.2f}%, F1 Score: {val_f1:.4f}')

        # Per-class accuracy: report the class ids actually seen, so a class that is
        # absent from the validation split does not shift the ids that get printed.
        print("클래스별 Accuracy:")
        for class_id in sorted(total_per_class):
            correct = correct_per_class[class_id]
            total = total_per_class[class_id]
            acc = correct / total if total > 0 else 0.0
            label_str = label_names[class_id] if label_names and class_id < len(label_names) else f"Class {class_id}"
            print(f"{label_str}: {acc:.2%} ({correct}/{total})")

        # Save only when validation accuracy strictly improves.
        if val_accuracy > best_val_accuracy:
            best_val_accuracy = val_accuracy
            torch.save(model.state_dict(), save_path)
            print(f'모델 저장됨 (Validation Accuracy: {val_accuracy:.2f}%)')

In [34]:
# Read the merged classification data, keep only the text/label columns, and preview it.
merged_df = pd.read_csv('/kaggle/input/classify-pubmed/merged_data.csv')[['text', 'label']]
merged_df.head()

Unnamed: 0,text,label
0,"Patiromer, an oral potassium (K(+)) binder, ha...",0
1,"Selenium (p = 0.0001, OR 0.788, 95% CI 0.703-0...",1
2,"Potassium may protect against MM, while Calciu...",1
3,The guide outlines key dietary restrictions as...,0
4,Culturally relevant substitutions and preparat...,0


In [35]:
def main():
    """Split ``merged_df`` into train/validation sets and fine-tune the classifier.

    Reads the module-level ``merged_df`` (columns ``text`` and ``label``), builds the
    datasets and loaders, and hands them to ``train_model``.

    Raises:
        ValueError: If a label falls outside ``[0, num_labels)``.
    """
    # Seed torch's RNG so runs are repeatable (the split below is already seeded).
    torch.manual_seed(42)

    # Load the data; merged_df is the merged dataframe built in an earlier cell.
    texts = merged_df['text'].values
    labels = merged_df['label'].values

    # Train/validation split
    train_texts, val_texts, train_labels, val_labels = train_test_split(
        texts, labels, test_size=0.2, random_state=42
    )

    # Model and tokenizer
    num_labels = 5  # number of distinct label values
    # Fail early with a readable message instead of an opaque loss/indexing error later.
    if labels.min() < 0 or labels.max() >= num_labels:
        raise ValueError(
            f"labels must lie in [0, {num_labels}); got range [{labels.min()}, {labels.max()}]"
        )
    model, tokenizer = setup_model(num_labels)

    # Datasets
    train_dataset = TextDataset(train_texts, train_labels, tokenizer)
    val_dataset = TextDataset(val_texts, val_labels, tokenizer)

    # Data loaders
    train_loader = DataLoader(train_dataset, batch_size=16, shuffle=True)
    val_loader = DataLoader(val_dataset, batch_size=16)

    # Use the GPU when one is available
    device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')

    # Train
    train_model(model, train_loader, val_loader, device)

if __name__ == "__main__":
    main()

Some weights of BertForSequenceClassification were not initialized from the model checkpoint at dmis-lab/biobert-v1.1 and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.
Epoch 1/10 [Train]: 100%|██████████| 68/68 [01:09<00:00,  1.02s/it, loss=1.2044, acc=38.71%]
Epoch 1/10 [Val]: 100%|██████████| 17/17 [00:04<00:00,  3.85it/s, loss=1.1511, acc=57.72%]



Epoch 1 결과:
Training Loss: 1.4705, Accuracy: 38.71%
Validation Loss: 1.2100, Accuracy: 57.72%, F1 Score: 0.5226
클래스별 Accuracy:
Class 0: 35.00% (14/40)
Class 1: 87.04% (47/54)
Class 2: 93.85% (61/65)
Class 3: 41.38% (24/58)
Class 4: 20.00% (11/55)
모델 저장됨 (Validation Accuracy: 57.72%)


Epoch 2/10 [Train]: 100%|██████████| 68/68 [01:09<00:00,  1.03s/it, loss=0.8092, acc=71.98%]
Epoch 2/10 [Val]: 100%|██████████| 17/17 [00:04<00:00,  3.82it/s, loss=0.7155, acc=73.90%]



Epoch 2 결과:
Training Loss: 0.9007, Accuracy: 71.98%
Validation Loss: 0.7908, Accuracy: 73.90%, F1 Score: 0.7277
클래스별 Accuracy:
Class 0: 60.00% (24/40)
Class 1: 72.22% (39/54)
Class 2: 86.15% (56/65)
Class 3: 63.79% (37/58)
Class 4: 81.82% (45/55)
모델 저장됨 (Validation Accuracy: 73.90%)


Epoch 3/10 [Train]: 100%|██████████| 68/68 [01:10<00:00,  1.03s/it, loss=0.3431, acc=86.27%]
Epoch 3/10 [Val]: 100%|██████████| 17/17 [00:04<00:00,  3.83it/s, loss=0.5127, acc=76.47%]



Epoch 3 결과:
Training Loss: 0.4929, Accuracy: 86.27%
Validation Loss: 0.6757, Accuracy: 76.47%, F1 Score: 0.7561
클래스별 Accuracy:
Class 0: 80.00% (32/40)
Class 1: 87.04% (47/54)
Class 2: 84.62% (55/65)
Class 3: 53.45% (31/58)
Class 4: 78.18% (43/55)
모델 저장됨 (Validation Accuracy: 76.47%)


Epoch 4/10 [Train]: 100%|██████████| 68/68 [01:10<00:00,  1.03s/it, loss=0.0637, acc=94.10%]
Epoch 4/10 [Val]: 100%|██████████| 17/17 [00:04<00:00,  3.81it/s, loss=0.4344, acc=80.51%]



Epoch 4 결과:
Training Loss: 0.2501, Accuracy: 94.10%
Validation Loss: 0.6475, Accuracy: 80.51%, F1 Score: 0.7952
클래스별 Accuracy:
Class 0: 72.50% (29/40)
Class 1: 87.04% (47/54)
Class 2: 89.23% (58/65)
Class 3: 63.79% (37/58)
Class 4: 87.27% (48/55)
모델 저장됨 (Validation Accuracy: 80.51%)


Epoch 5/10 [Train]: 100%|██████████| 68/68 [01:10<00:00,  1.03s/it, loss=0.0571, acc=97.42%]
Epoch 5/10 [Val]: 100%|██████████| 17/17 [00:04<00:00,  3.82it/s, loss=0.6233, acc=81.99%]



Epoch 5 결과:
Training Loss: 0.1199, Accuracy: 97.42%
Validation Loss: 0.6465, Accuracy: 81.99%, F1 Score: 0.8100
클래스별 Accuracy:
Class 0: 72.50% (29/40)
Class 1: 92.59% (50/54)
Class 2: 86.15% (56/65)
Class 3: 65.52% (38/58)
Class 4: 90.91% (50/55)
모델 저장됨 (Validation Accuracy: 81.99%)


Epoch 6/10 [Train]: 100%|██████████| 68/68 [01:10<00:00,  1.03s/it, loss=0.0424, acc=99.35%]
Epoch 6/10 [Val]: 100%|██████████| 17/17 [00:04<00:00,  3.79it/s, loss=0.5711, acc=81.99%]



Epoch 6 결과:
Training Loss: 0.0497, Accuracy: 99.35%
Validation Loss: 0.7050, Accuracy: 81.99%, F1 Score: 0.8115
클래스별 Accuracy:
Class 0: 80.00% (32/40)
Class 1: 90.74% (49/54)
Class 2: 84.62% (55/65)
Class 3: 62.07% (36/58)
Class 4: 92.73% (51/55)


Epoch 7/10 [Train]: 100%|██████████| 68/68 [01:10<00:00,  1.03s/it, loss=0.0169, acc=99.82%]
Epoch 7/10 [Val]: 100%|██████████| 17/17 [00:04<00:00,  3.80it/s, loss=0.7734, acc=80.88%]



Epoch 7 결과:
Training Loss: 0.0286, Accuracy: 99.82%
Validation Loss: 0.7580, Accuracy: 80.88%, F1 Score: 0.8012
클래스별 Accuracy:
Class 0: 70.00% (28/40)
Class 1: 94.44% (51/54)
Class 2: 86.15% (56/65)
Class 3: 70.69% (41/58)
Class 4: 80.00% (44/55)


Epoch 8/10 [Train]: 100%|██████████| 68/68 [01:10<00:00,  1.03s/it, loss=0.0236, acc=99.63%]
Epoch 8/10 [Val]: 100%|██████████| 17/17 [00:04<00:00,  3.82it/s, loss=0.8827, acc=82.72%]



Epoch 8 결과:
Training Loss: 0.0228, Accuracy: 99.63%
Validation Loss: 0.7812, Accuracy: 82.72%, F1 Score: 0.8163
클래스별 Accuracy:
Class 0: 62.50% (25/40)
Class 1: 92.59% (50/54)
Class 2: 83.08% (54/65)
Class 3: 77.59% (45/58)
Class 4: 92.73% (51/55)
모델 저장됨 (Validation Accuracy: 82.72%)


Epoch 9/10 [Train]: 100%|██████████| 68/68 [01:10<00:00,  1.03s/it, loss=0.0105, acc=99.82%] 
Epoch 9/10 [Val]: 100%|██████████| 17/17 [00:04<00:00,  3.81it/s, loss=0.4008, acc=82.72%]



Epoch 9 결과:
Training Loss: 0.0141, Accuracy: 99.82%
Validation Loss: 0.7828, Accuracy: 82.72%, F1 Score: 0.8201
클래스별 Accuracy:
Class 0: 72.50% (29/40)
Class 1: 92.59% (50/54)
Class 2: 87.69% (57/65)
Class 3: 72.41% (42/58)
Class 4: 85.45% (47/55)


Epoch 10/10 [Train]: 100%|██████████| 68/68 [01:10<00:00,  1.03s/it, loss=0.0056, acc=99.91%] 
Epoch 10/10 [Val]: 100%|██████████| 17/17 [00:04<00:00,  3.78it/s, loss=0.4568, acc=83.09%]



Epoch 10 결과:
Training Loss: 0.0102, Accuracy: 99.91%
Validation Loss: 0.8066, Accuracy: 83.09%, F1 Score: 0.8219
클래스별 Accuracy:
Class 0: 80.00% (32/40)
Class 1: 94.44% (51/54)
Class 2: 86.15% (56/65)
Class 3: 62.07% (36/58)
Class 4: 92.73% (51/55)
모델 저장됨 (Validation Accuracy: 83.09%)


In [36]:
from transformers import AutoModelForSequenceClassification

# Example: BioBERT-based 5-class classifier
# Use the GPU when one is available instead of assuming CUDA exists.
device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
model = AutoModelForSequenceClassification.from_pretrained(
    'dmis-lab/biobert-base-cased-v1.1',
    num_labels=5  # adjust to the number of classes
)
# map_location lets a checkpoint that was saved on a GPU load on a CPU-only machine.
model.load_state_dict(torch.load('/kaggle/working/best_biobert_model.pt', map_location=device))
model.to(device)

Some weights of BertForSequenceClassification were not initialized from the model checkpoint at dmis-lab/biobert-base-cased-v1.1 and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


BertForSequenceClassification(
  (bert): BertModel(
    (embeddings): BertEmbeddings(
      (word_embeddings): Embedding(28996, 768, padding_idx=0)
      (position_embeddings): Embedding(512, 768)
      (token_type_embeddings): Embedding(2, 768)
      (LayerNorm): LayerNorm((768,), eps=1e-12, elementwise_affine=True)
      (dropout): Dropout(p=0.1, inplace=False)
    )
    (encoder): BertEncoder(
      (layer): ModuleList(
        (0-11): 12 x BertLayer(
          (attention): BertAttention(
            (self): BertSdpaSelfAttention(
              (query): Linear(in_features=768, out_features=768, bias=True)
              (key): Linear(in_features=768, out_features=768, bias=True)
              (value): Linear(in_features=768, out_features=768, bias=True)
              (dropout): Dropout(p=0.1, inplace=False)
            )
            (output): BertSelfOutput(
              (dense): Linear(in_features=768, out_features=768, bias=True)
              (LayerNorm): LayerNorm((768,), eps=1e

In [45]:
model.eval()

# 4. Example inference
from transformers import AutoTokenizer

tokenizer = AutoTokenizer.from_pretrained('dmis-lab/biobert-base-cased-v1.1')
# Place the inputs on whatever device the model lives on rather than hard-coding "cuda",
# and truncate to the 512-token length used for training.
inputs = tokenizer("Patients with malnutrition, alcoholism, inflammatory bowel disease, and malabsorption syndromes are at an increased risk of zinc deficiency.", return_tensors="pt", truncation=True, max_length=512).to(model.device)

with torch.no_grad():
    outputs = model(**inputs)
    probs = torch.softmax(outputs.logits, dim=1)
    pred_class = torch.argmax(probs, dim=1)
    print("예측 클래스:", pred_class.item())

예측 클래스: 3


## TODO: Sentence Classification용 데이터베이스 구축 필요

## Sentence Classification 실행 후 분류 결과 저장 (CSV)

In [None]:
from transformers import AutoTokenizer, AutoModelForSequenceClassification
import torch
import pandas as pd

# The training cell saves a bare state_dict with torch.save(), which is not a Hugging Face
# model directory, so from_pretrained() cannot read it. Rebuild the architecture from the
# base checkpoint and load the fine-tuned weights into it.
base_model_name = "dmis-lab/biobert-base-cased-v1.1"
model_path = "./best_biobert_model.pt"  # path to the saved weights (state_dict)
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")

tokenizer = AutoTokenizer.from_pretrained(base_model_name)
model = AutoModelForSequenceClassification.from_pretrained(base_model_name, num_labels=5)
# map_location lets a GPU-saved checkpoint load on a CPU-only machine.
model.load_state_dict(torch.load(model_path, map_location=device))
model.to(device)
model.eval()

def classify_sentence(sentence):
    """Return the predicted class index for a single sentence.

    Uses the module-level ``tokenizer`` and ``model``.

    Args:
        sentence: Text to classify. Non-string values (for example a NaN read
            from a CSV cell) are converted with ``str()`` first.

    Returns:
        int: Index of the highest-scoring class.
    """
    # Send the inputs to the model's own device rather than assuming CUDA is present.
    inputs = tokenizer(str(sentence), return_tensors="pt", truncation=True, padding=True).to(model.device)
    with torch.no_grad():
        outputs = model(**inputs)
        probs = torch.softmax(outputs.logits, dim=1)
        label = torch.argmax(probs, dim=1).item()
    return label

# Input CSV: 'sentences.csv' (column: 'text'); the predicted label is added as a new column.
df = pd.read_csv("sentences.csv")
df['label'] = [classify_sentence(sentence) for sentence in df['text']]
df.to_csv("classified_sentences.csv", index=False)

ModuleNotFoundError: No module named 'transformers'

# NER

## NER 정답 데이터 생성
# TODO: 각 문장에 id를 부여해서 관리해야 함
- sentence classification 정답 데이터 기반 생성

In [2]:
pip install "numpy<2.0.0"

Collecting numpy<2.0.0
  Downloading numpy-1.26.4-cp312-cp312-macosx_11_0_arm64.whl.metadata (61 kB)
Downloading numpy-1.26.4-cp312-cp312-macosx_11_0_arm64.whl (13.7 MB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m13.7/13.7 MB[0m [31m11.0 MB/s[0m eta [36m0:00:00[0m00:01[0m00:01[0m
[?25hInstalling collected packages: numpy
  Attempting uninstall: numpy
    Found existing installation: numpy 2.2.4
    Uninstalling numpy-2.2.4:
      Successfully uninstalled numpy-2.2.4
Successfully installed numpy-1.26.4
Note: you may need to restart the kernel to use updated packages.


In [4]:
pip show openai

Name: openai
Version: 1.61.1
Summary: The official Python library for the openai API
Home-page: https://github.com/openai/openai-python
Author: 
Author-email: OpenAI <support@openai.com>
License: 
Location: /opt/anaconda3/lib/python3.12/site-packages
Requires: anyio, distro, httpx, jiter, pydantic, sniffio, tqdm, typing-extensions
Required-by: 
Note: you may need to restart the kernel to use updated packages.


In [9]:
from openai import OpenAI
from dotenv import load_dotenv
import pandas as pd
import os
import json
from tqdm import tqdm

# File locations and the chat model used for entity extraction
INPUT_CSV_PATH = "Database/NER/sentence_classification_answer_not_100.csv"
OUTPUT_JSONL_PATH = "Database/NER/gpt4_ner_results_not_100.jsonl"
MODEL_NAME = "gpt-4-turbo"

# Read the API key from the environment (load_dotenv() is called first)
load_dotenv()
client = OpenAI(api_key=os.environ.get("OPENAI_API_KEY"))

# Prompt builder
def create_prompt(text):
    """Build the NER instruction prompt for one sentence.

    The prompt (written in Korean) lists the five entity types, shows the required
    JSON reply layout, and ends with ``text`` wrapped in double quotes. Leading and
    trailing whitespace is stripped from the result.
    """
    template = """
다음 문장에서 영양소 관련 엔티티를 추출해줘. 각 엔티티는 다음과 같은 유형 중 하나야:

- INGREDIENT: 구체적인 영양소 또는 화학 성분 이름 (예: "Vitamin A", "Potassium", "Iron"). 일반적인 물질이나 작용(ex. "fluid", "energy")은 제외.
- SYMPTOM: 명확히 정의된 질병명이나 의학적 증상만 포함해줘. "risk", "likelihood", "association" 같은 표현은 제외하고, "MM risk"와 같은 경우엔 "MM"만 추출해줘.
- DOSAGE: 수치 + 단위 조합으로 된 복용량만 포함 (예: "5000 IU/day", "10mg of iron").
- SENSITIVE_CONDITION: 임신, 수유, 약물복용, 노인, 어린이 등의 민감 조건.
- PERSONAL_INFO: 나이, 성별, 키, 몸무게 등 인구통계적 정보.

반드시 아래와 같은 JSON 형식으로만 응답해줘:

{{
  "text": "<원본 문장>",
  "entities": {{
    "INGREDIENT": [...],
    "SYMPTOM": [...],
    "DOSAGE": [...],
    "SENSITIVE_CONDITION": [...],
    "PERSONAL_INFO": [...]
  }}
}}

다음 문장:
\"{text}\"
"""
    # str.format turns the doubled braces into literal braces and fills in the sentence.
    return template.format(text=text).strip()

# GPT call
def extract_entities_with_gpt(text, label, model=MODEL_NAME):
    """Ask the chat model to extract entities from ``text`` and parse its JSON reply.

    Args:
        text: Sentence to analyse.
        label: Sentence-classification label; rows labelled ``0`` are skipped.
        model: Chat model name passed to the API.

    Returns:
        ``None`` when ``label == 0`` (no API call is made); otherwise the parsed JSON
        reply, or a dict with ``text`` and ``error`` (plus ``raw`` when the reply was
        received but could not be parsed as JSON).
    """
    # Skip the API call entirely for label 0.
    if label == 0:
        return None

    content = None  # bound up front so the JSON error handler can always reference it
    try:
        response = client.chat.completions.create(
            model=model,
            messages=[
                {
                    "role": "system",
                    "content": "너는 pubmed 데이터를 기반으로 한 의학 논문에서 정보를 구조화해주는 전문가야."
                },
                {
                    "role": "user",
                    "content": create_prompt(text)
                }
            ],
            temperature=0
        )
        content = response.choices[0].message.content.strip()

        # Chat models often wrap JSON in a Markdown code fence (``` or ```json);
        # remove the fence so json.loads sees only the JSON payload.
        payload = content
        if len(payload) >= 6 and payload.startswith("```") and payload.endswith("```"):
            payload = payload[3:-3]
            first_line, newline, rest = payload.partition("\n")
            # The first line after the opening fence is a language tag (or empty)
            # unless it already starts the JSON value.
            if newline and not first_line.lstrip().startswith(("{", "[")):
                payload = rest
            payload = payload.strip()
        return json.loads(payload)
    except json.JSONDecodeError as e:
        return {"text": text, "error": f"JSON 파싱 실패: {str(e)}", "raw": content}
    except Exception as e:
        # Best effort: API/network failures are recorded in the result, not raised.
        return {"text": text, "error": str(e)}

# Load the input CSV and make sure both required columns are present
df = pd.read_csv(INPUT_CSV_PATH)
if not {"text", "label"}.issubset(df.columns):
    raise ValueError("CSV에 'text' 또는 'label' 컬럼이 없습니다.")

# Write results to the JSONL file as they arrive, one JSON object per line
with open(OUTPUT_JSONL_PATH, "w", encoding="utf-8") as f:
    rows = zip(df["text"], df["label"])
    for text, label in tqdm(rows, total=len(df), desc="NER 처리 중"):
        result = extract_entities_with_gpt(text, label)
        if result is None:
            # Skipped rows (label 0) produce no output line.
            continue
        f.write(json.dumps(result, ensure_ascii=False) + "\n")
        f.flush()

NER 처리 중: 100%|██████████| 1133/1133 [55:21<00:00,  2.93s/it] 


- 단일 문장 테스팅용 코드

In [21]:
from openai import OpenAI
from dotenv import load_dotenv
import os
import json
from tqdm import tqdm

# 👉 Sentences used for the quick multi-sentence test
test_sentences = [
    "Selenium (p = 0.0001, OR 0.788, 95% CI 0.703-0.883) and Potassium (p = 0.045, OR 0.463, 95% CI 0.219-0.982) were significantly negatively associated with MM risk, suggesting a protective effect.",
    "The guide outlines key dietary restrictions associated with dialysis-specifically for phosphorus, potassium, sodium, and fluid intake-and presents alternatives using familiar Filipino foods.",
    "This panting causes respiratory hypocapnia, which increases the renal excretion of buffer molecules including sodium, potassium, and bicarbonate."
]

# Read the API key from the environment (load_dotenv() is called first)
load_dotenv()
client = OpenAI(api_key=os.environ.get("OPENAI_API_KEY"))

def create_prompt(text):
    """Build the NER instruction prompt for one sentence.

    The prompt (written in Korean) lists the five entity types, shows the required
    JSON reply layout, and ends with ``text`` wrapped in double quotes. The leading
    and trailing newlines of the template are kept in the result.
    """
    template = """
다음 문장에서 영양소 관련 엔티티를 추출해줘. 각 엔티티는 다음과 같은 유형 중 하나야:

- INGREDIENT: 구체적인 영양소 또는 화학 성분 이름 (예: "Vitamin A", "Potassium", "Iron"). 일반적인 물질이나 작용(ex. "fluid", "energy")은 제외.
- SYMPTOM: 명확히 정의된 질병명이나 의학적 증상만 포함해줘. "risk", "likelihood", "association" 같은 표현은 제외하고, "MM risk"와 같은 경우엔 "MM"만 추출해줘.
- DOSAGE: 수치 + 단위 조합으로 된 복용량만 포함 (예: "5000 IU/day", "10mg of iron").
- SENSITIVE_CONDITION: 임신, 수유, 약물복용, 노인, 어린이 등의 민감 조건.
- PERSONAL_INFO: 나이, 성별, 키, 몸무게 등 인구통계적 정보.

반드시 아래와 같은 JSON 형식으로만 응답해줘:

{{
  "text": "<원본 문장>",
  "entities": {{
    "INGREDIENT": [...],
    "SYMPTOM": [...],
    "DOSAGE": [...],
    "SENSITIVE_CONDITION": [...],
    "PERSONAL_INFO": [...]
  }}
}}

다음 문장:
\"{text}\"
"""
    # str.format turns the doubled braces into literal braces and fills in the sentence.
    return template.format(text=text)

# GPT call
def extract_entities_with_gpt(text, model="gpt-3.5-turbo-1106"):
    """Ask the chat model to extract entities from ``text`` and parse its JSON reply.

    Args:
        text: Sentence to analyse.
        model: Chat model name passed to the API (default ``gpt-3.5-turbo-1106``).

    Returns:
        The parsed JSON reply, or a dict with ``text`` and ``error`` (plus ``raw``
        when the reply was received but could not be parsed as JSON).
    """
    content = None  # bound up front so the JSON error handler can always reference it
    try:
        response = client.chat.completions.create(
            model=model,
            messages=[
                {
                    "role": "system",
                    "content": "너는 pubmed 데이터를 기반으로 한 의학 논문에서 정보를 구조화해주는 전문가야."
                },
                {
                    "role": "user",
                    "content": create_prompt(text)
                }
            ],
            temperature=0
        )

        content = response.choices[0].message.content.strip()

        # Chat models often wrap JSON in a Markdown code fence (``` or ```json);
        # remove the fence so json.loads sees only the JSON payload.
        payload = content
        if len(payload) >= 6 and payload.startswith("```") and payload.endswith("```"):
            payload = payload[3:-3]
            first_line, newline, rest = payload.partition("\n")
            # The first line after the opening fence is a language tag (or empty)
            # unless it already starts the JSON value.
            if newline and not first_line.lstrip().startswith(("{", "[")):
                payload = rest
            payload = payload.strip()

        # The reply may still not be valid JSON; that case is handled below.
        return json.loads(payload)

    except json.JSONDecodeError as e:
        return {"text": text, "error": f"JSON 파싱 실패: {str(e)}", "raw": content}

    except Exception as e:
        # Best effort: API/network failures are recorded in the result, not raised.
        return {"text": text, "error": str(e)}

# Run the extraction over every test sentence (tqdm shows progress)
results = [
    extract_entities_with_gpt(sentence)
    for sentence in tqdm(test_sentences, desc="GPT 엔티티 추출 중")
]

# Show the results, then save them as a single JSON array
print(json.dumps(results, indent=2, ensure_ascii=False))
with open("gpt4_ner_results.json", "w", encoding="utf-8") as f:
    json.dump(results, f, ensure_ascii=False, indent=2)

GPT 엔티티 추출 중: 100%|██████████| 3/3 [00:08<00:00,  2.82s/it]

[
  {
    "text": "Selenium (p = 0.0001, OR 0.788, 95% CI 0.703-0.883) and Potassium (p = 0.045, OR 0.463, 95% CI 0.219-0.982) were significantly negatively associated with MM risk, suggesting a protective effect.",
    "entities": {
      "INGREDIENT": [
        "Selenium",
        "Potassium"
      ],
      "SYMPTOM": [
        "MM risk"
      ],
      "DOSAGE": [],
      "SENSITIVE_CONDITION": [],
      "PERSONAL_INFO": []
    }
  },
  {
    "text": "The guide outlines key dietary restrictions associated with dialysis-specifically for phosphorus, potassium, sodium, and fluid intake-and presents alternatives using familiar Filipino foods.",
    "entities": {
      "INGREDIENT": [
        "phosphorus",
        "potassium",
        "sodium"
      ],
      "SYMPTOM": [
        "dialysis"
      ],
      "SENSITIVE_CONDITION": []
    }
  },
  {
    "text": "This panting causes respiratory hypocapnia, which increases the renal excretion of buffer molecules including sodium, potassium, and 




## BIO 포맷으로 변환

In [None]:
import json
from transformers import AutoTokenizer

# File paths
# NOTE(review): the extraction cell above writes "Database/NER/gpt4_ner_results_not_100.jsonl";
# confirm that this input path points at the intended file.
input_path = "Database/NER/gpt4_ner_results.jsonl"
output_path = "Database/NER/ner_bio_format_results.jsonl"

# Entity types that receive BIO tags
entity_labels = ["INGREDIENT", "SYMPTOM", "DOSAGE", "SENSITIVE_CONDITION", "PERSONAL_INFO"]

# BioBERT tokenizer
tokenizer = AutoTokenizer.from_pretrained("dmis-lab/biobert-base-cased-v1.1")

# 4. BIO tagging
def convert_to_bio(text, entities):
    """Tokenize ``text`` and assign BIO labels for the given entity phrases.

    Uses the module-level ``tokenizer`` and ``entity_labels``. For each phrase only
    the first matching token span is tagged.

    Args:
        text: Sentence to tokenize.
        entities: Mapping of entity type to a list of phrases; missing types,
            ``None`` values, and empty or non-string phrases are ignored.

    Returns:
        dict with ``tokens`` (the tokenizer's tokens for ``text``) and ``labels``
        (one ``O`` / ``B-<type>`` / ``I-<type>`` tag per token).
    """
    tokens = tokenizer.tokenize(text)
    labels = ['O'] * len(tokens)

    for ent_type in entity_labels:
        phrases = entities.get(ent_type) or []  # the model may return null for a type
        if isinstance(phrases, str):
            phrases = [phrases]  # a bare string means a single phrase, not characters

        for phrase in phrases:
            if not isinstance(phrase, str):
                continue
            phrase_tokens = tokenizer.tokenize(phrase)
            span = len(phrase_tokens)
            if span == 0:
                # An empty phrase would otherwise "match" at position 0 and tag the first token.
                continue

            for i in range(len(tokens) - span + 1):
                if tokens[i:i + span] != phrase_tokens:
                    continue
                end = i + span
                # Reject a match that stops in the middle of a word, i.e. when the
                # next token is a "##" continuation piece.
                if end < len(tokens) and tokens[end].startswith("##"):
                    continue
                labels[i] = f'B-{ent_type}'
                for j in range(i + 1, end):
                    labels[j] = f'I-{ent_type}'
                break  # tag only the first occurrence

    return {"tokens": tokens, "labels": labels}

# 5. Read the input JSONL and convert each usable record
bio_data = []
with open(input_path, "r", encoding="utf-8") as f:
    for raw_line in f:
        try:
            record = json.loads(raw_line)
            # Records without both keys (for example error records) are skipped.
            if "text" not in record or "entities" not in record:
                continue
            bio_data.append(convert_to_bio(record["text"], record["entities"]))
        except Exception as e:
            print(f"Error parsing line: {e}")

# 6. Save the BIO data as JSONL (one JSON object per line)
with open(output_path, "w", encoding="utf-8") as f:
    for bio_entry in bio_data:
        f.write(json.dumps(bio_entry, ensure_ascii=False) + "\n")
        f.flush()

print(f"✅ BIO 포맷 변환 완료! 저장 경로: {output_path}")

FileNotFoundError: [Errno 2] No such file or directory: 'ner_structured_results.jsonl'

- 테스팅용

In [22]:
import json
from transformers import AutoTokenizer

# Input and output file paths
input_path = "gpt4_ner_results.json"
output_path = "ner_bio_format.json"

# Entity types that receive BIO tags
entity_labels = ["INGREDIENT", "SYMPTOM", "DOSAGE", "SENSITIVE_CONDITION", "PERSONAL_INFO"]

# BioBERT tokenizer
tokenizer = AutoTokenizer.from_pretrained("dmis-lab/biobert-base-cased-v1.1")

# BIO conversion
def convert_to_bio(text, entities):
    """Tokenize ``text`` and assign BIO labels for the given entity phrases.

    Uses the module-level ``tokenizer`` and ``entity_labels``. For each phrase only
    the first matching token span is tagged.

    Args:
        text: Sentence to tokenize.
        entities: Mapping of entity type to a list of phrases; missing types,
            ``None`` values, and empty or non-string phrases are ignored.

    Returns:
        dict with ``tokens`` (the tokenizer's tokens for ``text``) and ``labels``
        (one ``O`` / ``B-<type>`` / ``I-<type>`` tag per token).
    """
    tokens = tokenizer.tokenize(text)
    labels = ['O'] * len(tokens)

    for ent_type in entity_labels:
        phrases = entities.get(ent_type) or []  # the model may return null for a type
        if isinstance(phrases, str):
            phrases = [phrases]  # a bare string means a single phrase, not characters

        for phrase in phrases:
            if not isinstance(phrase, str):
                continue
            phrase_tokens = tokenizer.tokenize(phrase)
            span = len(phrase_tokens)
            if span == 0:
                # An empty phrase would otherwise "match" at position 0 and tag the first token.
                continue

            for i in range(len(tokens) - span + 1):
                if tokens[i:i + span] != phrase_tokens:
                    continue
                end = i + span
                # Reject a match that stops in the middle of a word, i.e. when the
                # next token is a "##" continuation piece.
                if end < len(tokens) and tokens[end].startswith("##"):
                    continue
                labels[i] = f"B-{ent_type}"
                for j in range(i + 1, end):
                    labels[j] = f"I-{ent_type}"
                break  # tag only the first occurrence
    return {"tokens": tokens, "labels": labels}

# Load the JSON array of extraction results
with open(input_path, "r", encoding="utf-8") as f:
    data = json.load(f)

# Convert every record that has both "text" and "entities"
bio_data = [
    convert_to_bio(item["text"], item["entities"])
    for item in data
    if "text" in item and "entities" in item
]

# Save as one indented JSON array
with open(output_path, "w", encoding="utf-8") as f:
    json.dump(bio_data, f, indent=2, ensure_ascii=False)

print(f"✅ BIO 변환 완료! 저장 위치: {output_path}")

✅ BIO 변환 완료! 저장 위치: ner_bio_format.json


## finetuning

In [None]:
from transformers import AutoTokenizer, AutoModelForTokenClassification, Trainer, TrainingArguments, DataCollatorForTokenClassification
from torch.utils.data import Dataset
import json

class NERDataset(Dataset):
    """Dataset pairing pre-computed encodings with per-token label ids.

    ``encodings`` maps each field name to a per-example sequence; ``labels`` holds one
    label sequence per example. Items are dicts of tensors with an added ``labels`` entry.
    """
    # NOTE(review): this cell imports only ``Dataset`` from torch; ``torch`` itself must
    # already be imported by an earlier cell for __getitem__ to work.

    def __init__(self, encodings, labels):
        self.encodings, self.labels = encodings, labels

    def __len__(self):
        return len(self.labels)

    def __getitem__(self, idx):
        item = {}
        for field, values in self.encodings.items():
            item[field] = torch.tensor(values[idx])
        item['labels'] = torch.tensor(self.labels[idx])
        return item

# Load the data (BIO-format JSONL: one {"tokens": [...], "labels": [...]} object per line)
with open("ner_dataset.jsonl", "r", encoding="utf-8") as f:
    data = [json.loads(line) for line in f if line.strip()]

# Existing ids are kept; the tags the BIO conversion step actually emits
# (SENSITIVE_CONDITION / PERSONAL_INFO) are appended so they no longer raise KeyError.
label_list = ["O", "B-INGREDIENT", "I-INGREDIENT", "B-DOSAGE", "I-DOSAGE", "B-SYMPTOM", "I-SYMPTOM",
              "B-TARGET", "I-TARGET", "B-SENSITIVE", "I-SENSITIVE", "B-GENDER", "I-GENDER", "B-AGE_GROUP", "I-AGE_GROUP",
              "B-SENSITIVE_CONDITION", "I-SENSITIVE_CONDITION", "B-PERSONAL_INFO", "I-PERSONAL_INFO"]
label_to_id = {label: i for i, label in enumerate(label_list)}

# Report unexpected tags with a readable message instead of a bare KeyError.
unknown_tags = sorted({tag for d in data for tag in d['labels']} - set(label_list))
if unknown_tags:
    raise ValueError(f"Unknown NER tags in dataset: {unknown_tags}")

# Preprocessing
tokenizer = AutoTokenizer.from_pretrained("dmis-lab/biobert-base-cased-v1.1")
tokens = [d['tokens'] for d in data]

# The BIO conversion step stores the tokenizer's word pieces (including "##" pieces) with
# one tag per piece, so the pieces are mapped to ids directly. Re-tokenizing them with
# is_split_into_words=True would split them again and break the token/label alignment.
# NOTE(review): assumes ner_dataset.jsonl was produced by that conversion step - confirm.
IGNORE_INDEX = -100  # label value the loss ignores ([CLS], [SEP], padding)
max_pieces = min(tokenizer.model_max_length, 512) - 2  # leave room for [CLS] and [SEP]

input_ids = []
labels = []
for d in data:
    if len(d['tokens']) != len(d['labels']):
        raise ValueError("tokens and labels must have the same length in every record")
    piece_ids = tokenizer.convert_tokens_to_ids(d['tokens'][:max_pieces])
    tag_ids = [label_to_id[tag] for tag in d['labels'][:max_pieces]]
    input_ids.append([tokenizer.cls_token_id] + piece_ids + [tokenizer.sep_token_id])
    labels.append([IGNORE_INDEX] + tag_ids + [IGNORE_INDEX])

# Pad every example to the longest one so each item has the same length.
longest = max((len(ids) for ids in input_ids), default=0)
attention_mask = [[1] * len(ids) + [0] * (longest - len(ids)) for ids in input_ids]
labels = [tags + [IGNORE_INDEX] * (longest - len(tags)) for tags in labels]
input_ids = [ids + [tokenizer.pad_token_id] * (longest - len(ids)) for ids in input_ids]

encodings = {"input_ids": input_ids, "attention_mask": attention_mask}

dataset = NERDataset(encodings, labels)

# Model and training configuration
model = AutoModelForTokenClassification.from_pretrained(
    "dmis-lab/biobert-base-cased-v1.1",
    num_labels=len(label_list),
    # Store the tag names in the model config so saved checkpoints carry them.
    id2label={i: label for i, label in enumerate(label_list)},
    label2id={label: i for i, label in enumerate(label_list)},
)

data_collator = DataCollatorForTokenClassification(tokenizer)
training_args = TrainingArguments(
    output_dir="./biobert_ner_output",
    num_train_epochs=3,
    per_device_train_batch_size=8,
    per_device_eval_batch_size=8,
    # No evaluation strategy is passed: "no" is the default, and the
    # `evaluation_strategy` argument name is deprecated in newer transformers releases.
    save_strategy="epoch",
    logging_dir="./logs"
)

trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=dataset,
    tokenizer=tokenizer,
    data_collator=data_collator
)

trainer.train()
trainer.save_model("./biobert_ner_model")

# rag용 메타데이터 형식으로 재변환

In [23]:
import json

def merge_tokens(tokens):
    """Join tokens back into text.

    A token starting with ``##`` is glued onto the preceding text without a space;
    every other token is separated from what precedes it by a single space. The
    result is stripped of leading and trailing whitespace.
    """
    words = []
    for piece in tokens:
        if piece.startswith("##"):
            suffix = piece[2:]
            if words:
                words[-1] += suffix
            else:
                words.append(suffix)
        else:
            words.append(piece)
    return " ".join(words).strip()

def recover_entities(tokens, labels):
    """Rebuild entity phrases per type from parallel token and BIO-label sequences.

    Args:
        tokens: Tokens of one sentence.
        labels: One ``O`` / ``B-<type>`` / ``I-<type>`` tag per token.

    Returns:
        dict mapping entity type to the list of recovered phrases. The five standard
        types are always present; any other type found in ``labels`` is added as
        encountered instead of raising ``KeyError``.
    """
    entities = {
        "INGREDIENT": [],
        "SYMPTOM": [],
        "DOSAGE": [],
        "SENSITIVE_CONDITION": [],
        "PERSONAL_INFO": []
    }
    current_tokens = []
    current_type = None

    def flush():
        # Store the span collected so far, if there is one.
        if current_tokens:
            entities.setdefault(current_type, []).append(merge_tokens(current_tokens))

    for token, label in zip(tokens, labels):
        if label.startswith("B-"):
            # A new entity starts: close the open one and begin collecting.
            flush()
            current_type = label[2:]
            current_tokens = [token]
        elif label.startswith("I-") and current_type == label[2:]:
            current_tokens.append(token)
        else:
            # "O", or an I- tag that does not continue the open entity: close the
            # open span and drop this token.
            flush()
            current_tokens = []
            current_type = None

    flush()
    return entities

def convert_bio_json_to_rag_jsonl(input_path, output_path="rag_converted.jsonl"):
    """Convert a BIO-format JSON array into JSONL records for RAG.

    Each input item needs ``tokens`` and ``labels``. Each output line is a JSON object
    with ``text`` (the merged tokens) and ``meta`` (the recovered entities).

    Args:
        input_path: Path of the JSON file holding the list of BIO items.
        output_path: Path of the JSONL file to write.
    """
    with open(input_path, "r", encoding="utf-8") as src:
        bio_items = json.load(src)

    def to_record(item):
        # Build the text and the entity metadata from the same token sequence.
        item_tokens, item_labels = item["tokens"], item["labels"]
        return {
            "text": merge_tokens(item_tokens),
            "meta": recover_entities(item_tokens, item_labels),
        }

    records = [to_record(item) for item in bio_items]

    with open(output_path, "w", encoding="utf-8") as dst:
        dst.writelines(json.dumps(record, ensure_ascii=False) + "\n" for record in records)

    print(f"✅ 변환 완료: {output_path}")

In [24]:
convert_bio_json_to_rag_jsonl("ner_bio_format.json")

✅ 변환 완료: rag_converted.jsonl
