In [2]:
pip install transformers

Collecting transformers
  Downloading transformers-4.52.3-py3-none-any.whl.metadata (40 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m40.2/40.2 kB[0m [31m1.9 MB/s[0m eta [36m0:00:00[0m
[?25hCollecting filelock (from transformers)
  Using cached filelock-3.18.0-py3-none-any.whl.metadata (2.9 kB)
Collecting huggingface-hub<1.0,>=0.30.0 (from transformers)
  Downloading huggingface_hub-0.32.2-py3-none-any.whl.metadata (14 kB)
Collecting pyyaml>=5.1 (from transformers)
  Using cached PyYAML-6.0.2-cp311-cp311-macosx_11_0_arm64.whl.metadata (2.1 kB)
Collecting regex!=2019.12.17 (from transformers)
  Using cached regex-2024.11.6-cp311-cp311-macosx_11_0_arm64.whl.metadata (40 kB)
Collecting tokenizers<0.22,>=0.21 (from transformers)
  Using cached tokenizers-0.21.1-cp39-abi3-macosx_11_0_arm64.whl.metadata (6.8 kB)
Collecting safetensors>=0.4.3 (from transformers)
  Using cached safetensors-0.5.3-cp38-abi3-macosx_11_0_arm64.whl.metadata (3.8 kB)
Collecting tqdm>=4.27

# Sentence classification

In [4]:
## 분류된 정답 데이터 기반 fine-tuning

In [2]:
import torch
from transformers import AutoTokenizer, AutoModelForSequenceClassification
from torch.utils.data import Dataset, DataLoader
from sklearn.model_selection import train_test_split
import numpy as np
import pandas as pd

# 1. 데이터 준비
class TextDataset(Dataset):
    """Map-style dataset that tokenizes one text per item for sequence classification.

    Args:
        texts: Indexable collection of raw texts; each item is converted with ``str()``.
        labels: Indexable collection of labels, positionally aligned with ``texts``.
        tokenizer: Callable invoked as ``tokenizer(text, **options)`` that returns a
            mapping containing ``'input_ids'`` and ``'attention_mask'`` tensors.
        max_length: Length passed to the tokenizer for padding/truncation.
    """

    def __init__(self, texts, labels, tokenizer, max_length=512):
        self.texts, self.labels = texts, labels
        self.tokenizer = tokenizer
        self.max_length = max_length

    def __len__(self):
        """Return the number of texts."""
        return len(self.texts)

    def __getitem__(self, idx):
        """Tokenize the ``idx``-th text and return it together with its label.

        Returns:
            Dict with 1-D ``'input_ids'`` and ``'attention_mask'`` tensors and a
            scalar ``torch.long`` tensor under ``'labels'``.
        """
        text, label = str(self.texts[idx]), self.labels[idx]

        tokenizer_options = dict(
            add_special_tokens=True,
            max_length=self.max_length,
            padding='max_length',
            truncation=True,
            return_tensors='pt',
        )
        encoded = self.tokenizer(text, **tokenizer_options)

        # flatten() collapses each returned tensor to 1-D so that batching
        # stacks items along a fresh leading axis.
        sample = {
            field: encoded[field].flatten()
            for field in ('input_ids', 'attention_mask')
        }
        sample['labels'] = torch.tensor(label, dtype=torch.long)
        return sample


In [3]:
# 2. 모델 설정
def setup_model(num_labels, model_name="dmis-lab/biobert-v1.1"):
    """Load a pretrained checkpoint with a classification head, plus its tokenizer.

    Args:
        num_labels: Number of target classes, forwarded to ``from_pretrained``.
        model_name: Checkpoint identifier or path handed to both
            ``from_pretrained`` calls. Defaults to the BioBERT checkpoint this
            function previously hard-coded, so existing callers are unaffected.

    Returns:
        Tuple ``(model, tokenizer)``.
    """
    tokenizer = AutoTokenizer.from_pretrained(model_name)
    model = AutoModelForSequenceClassification.from_pretrained(
        model_name,
        num_labels=num_labels,
    )
    return model, tokenizer

In [33]:
from tqdm import tqdm
from sklearn.metrics import f1_score
from collections import defaultdict
import numpy as np

def train_model(model, train_loader, val_loader, device, num_epochs=10, label_names=None,
                save_path='/kaggle/working/best_biobert_model.pt'):
    """Fine-tune ``model`` and keep the weights with the best validation accuracy.

    Each epoch runs one optimisation pass over ``train_loader`` and one
    evaluation pass over ``val_loader``, prints loss / accuracy / macro-F1 and a
    per-class accuracy table, and writes ``model.state_dict()`` to ``save_path``
    whenever validation accuracy exceeds the best value seen so far.

    Args:
        model: Module called as ``model(input_ids=..., attention_mask=..., labels=...)``
            whose result exposes ``.loss`` and ``.logits``.
        train_loader: Iterable of batches (dicts with ``'input_ids'``,
            ``'attention_mask'`` and ``'labels'`` tensors) used for training.
        val_loader: Iterable of batches with the same keys, used for evaluation.
        device: Device the model and every batch are moved to.
        num_epochs: Number of passes over ``train_loader``.
        label_names: Optional sequence of display names indexed by class id;
            classes without a name are printed as ``"Class <id>"``.
        save_path: Destination file for the best ``state_dict``.

    Returns:
        None. Results are printed and the best weights are written to disk.
    """
    optimizer = torch.optim.AdamW(model.parameters(), lr=2e-5)

    model.to(device)
    best_val_accuracy = 0.0  # best validation accuracy (in percent) seen so far

    for epoch in range(num_epochs):
        # ---- training pass ----
        model.train()
        train_loss = 0.0
        train_correct = 0
        train_total = 0
        train_batches = 0

        train_pbar = tqdm(train_loader, desc=f'Epoch {epoch+1}/{num_epochs} [Train]')
        for batch in train_pbar:
            input_ids = batch['input_ids'].to(device)
            attention_mask = batch['attention_mask'].to(device)
            labels = batch['labels'].to(device)

            optimizer.zero_grad()
            # The loss is taken from the model output (labels are passed in),
            # so no separate criterion is needed.
            outputs = model(input_ids=input_ids, attention_mask=attention_mask, labels=labels)
            loss = outputs.loss
            loss.backward()
            optimizer.step()

            train_loss += loss.item()
            train_batches += 1
            predictions = torch.argmax(outputs.logits, dim=1)
            train_correct += (predictions == labels).sum().item()
            train_total += labels.size(0)

            train_pbar.set_postfix({
                'loss': f'{loss.item():.4f}',
                'acc': f'{(train_correct/train_total)*100:.2f}%'
            })

        # ---- validation pass ----
        model.eval()
        val_loss = 0.0
        val_correct = 0
        val_total = 0
        val_batches = 0
        all_preds = []
        all_labels = []

        correct_per_class = defaultdict(int)
        total_per_class = defaultdict(int)

        val_pbar = tqdm(val_loader, desc=f'Epoch {epoch+1}/{num_epochs} [Val]')
        with torch.no_grad():
            for batch in val_pbar:
                input_ids = batch['input_ids'].to(device)
                attention_mask = batch['attention_mask'].to(device)
                labels = batch['labels'].to(device)

                outputs = model(input_ids=input_ids, attention_mask=attention_mask, labels=labels)
                val_loss += outputs.loss.item()
                val_batches += 1

                predictions = torch.argmax(outputs.logits, dim=1)
                val_correct += (predictions == labels).sum().item()
                val_total += labels.size(0)

                # Plain Python ints keep the per-class dict keys and the
                # class-id loop below on the same type.
                true_ids = labels.cpu().tolist()
                pred_ids = predictions.cpu().tolist()
                all_labels.extend(true_ids)
                all_preds.extend(pred_ids)

                for true_label, pred_label in zip(true_ids, pred_ids):
                    total_per_class[true_label] += 1
                    if true_label == pred_label:
                        correct_per_class[true_label] += 1

                val_pbar.set_postfix({
                    'loss': f'{outputs.loss.item():.4f}',
                    'acc': f'{(val_correct/val_total)*100:.2f}%'
                })

        # ---- epoch summary (guards keep an empty loader from dividing by zero) ----
        train_accuracy = (train_correct / train_total) * 100 if train_total else 0.0
        val_accuracy = (val_correct / val_total) * 100 if val_total else 0.0
        val_f1 = f1_score(all_labels, all_preds, average='macro') if all_labels else 0.0

        print(f'\nEpoch {epoch+1} 결과:')
        print(f'Training Loss: {train_loss/max(train_batches, 1):.4f}, Accuracy: {train_accuracy:.2f}%')
        print(f'Validation Loss: {val_loss/max(val_batches, 1):.4f}, Accuracy: {val_accuracy:.2f}%, F1 Score: {val_f1:.4f}')

        # Per-class accuracy. Cover every class id up to the highest one seen
        # (in labels or predictions) rather than counting distinct labels:
        # counting mislabels the rows whenever a class is absent from the
        # validation split.
        print("클래스별 Accuracy:")
        num_classes = max(all_labels + all_preds, default=-1) + 1
        if label_names:
            num_classes = max(num_classes, len(label_names))
        for i in range(num_classes):
            acc = correct_per_class[i] / total_per_class[i] if total_per_class[i] > 0 else 0.0
            label_str = label_names[i] if label_names and i < len(label_names) else f"Class {i}"
            print(f"{label_str}: {acc:.2%} ({correct_per_class[i]}/{total_per_class[i]})")

        # Persist the weights whenever validation accuracy improves.
        if val_accuracy > best_val_accuracy:
            best_val_accuracy = val_accuracy
            torch.save(model.state_dict(), save_path)
            print(f'모델 저장됨 (Validation Accuracy: {val_accuracy:.2f}%)')

In [34]:
# Read the labelled sentences and keep only the two columns used for training.
merged_df = pd.read_csv('/kaggle/input/classify-pubmed/merged_data.csv').loc[:, ['text', 'label']]
merged_df.head()

Unnamed: 0,text,label
0,"Patiromer, an oral potassium (K(+)) binder, ha...",0
1,"Selenium (p = 0.0001, OR 0.788, 95% CI 0.703-0...",1
2,"Potassium may protect against MM, while Calciu...",1
3,The guide outlines key dietary restrictions as...,0
4,Culturally relevant substitutions and preparat...,0


In [35]:
def main(df=None):
    """Split the labelled data, build the model and loaders, and run training.

    Args:
        df: DataFrame with ``'text'`` and ``'label'`` columns. When omitted, the
            notebook-level ``merged_df`` loaded in an earlier cell is used.
    """
    if df is None:
        df = merged_df  # notebook global; must have been loaded beforehand
    texts = df['text'].values
    labels = df['label'].values

    # Hold out 20% of the rows for validation (fixed seed for repeatability).
    train_texts, val_texts, train_labels, val_labels = train_test_split(
        texts, labels, test_size=0.2, random_state=42
    )

    # Size the classification head from the data instead of a hard-coded count.
    # Uses max id + 1 so every label value is a valid class index.
    # NOTE(review): assumes labels are non-negative integer class ids — confirm.
    num_labels = int(labels.max()) + 1
    model, tokenizer = setup_model(num_labels)

    train_dataset = TextDataset(train_texts, train_labels, tokenizer)
    val_dataset = TextDataset(val_texts, val_labels, tokenizer)

    train_loader = DataLoader(train_dataset, batch_size=16, shuffle=True)
    val_loader = DataLoader(val_dataset, batch_size=16)

    # Use the GPU when one is available, otherwise fall back to the CPU.
    device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')

    train_model(model, train_loader, val_loader, device)

if __name__ == "__main__":
    main()

Some weights of BertForSequenceClassification were not initialized from the model checkpoint at dmis-lab/biobert-v1.1 and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.
Epoch 1/10 [Train]: 100%|██████████| 68/68 [01:09<00:00,  1.02s/it, loss=1.2044, acc=38.71%]
Epoch 1/10 [Val]: 100%|██████████| 17/17 [00:04<00:00,  3.85it/s, loss=1.1511, acc=57.72%]



Epoch 1 결과:
Training Loss: 1.4705, Accuracy: 38.71%
Validation Loss: 1.2100, Accuracy: 57.72%, F1 Score: 0.5226
클래스별 Accuracy:
Class 0: 35.00% (14/40)
Class 1: 87.04% (47/54)
Class 2: 93.85% (61/65)
Class 3: 41.38% (24/58)
Class 4: 20.00% (11/55)
모델 저장됨 (Validation Accuracy: 57.72%)


Epoch 2/10 [Train]: 100%|██████████| 68/68 [01:09<00:00,  1.03s/it, loss=0.8092, acc=71.98%]
Epoch 2/10 [Val]: 100%|██████████| 17/17 [00:04<00:00,  3.82it/s, loss=0.7155, acc=73.90%]



Epoch 2 결과:
Training Loss: 0.9007, Accuracy: 71.98%
Validation Loss: 0.7908, Accuracy: 73.90%, F1 Score: 0.7277
클래스별 Accuracy:
Class 0: 60.00% (24/40)
Class 1: 72.22% (39/54)
Class 2: 86.15% (56/65)
Class 3: 63.79% (37/58)
Class 4: 81.82% (45/55)
모델 저장됨 (Validation Accuracy: 73.90%)


Epoch 3/10 [Train]: 100%|██████████| 68/68 [01:10<00:00,  1.03s/it, loss=0.3431, acc=86.27%]
Epoch 3/10 [Val]: 100%|██████████| 17/17 [00:04<00:00,  3.83it/s, loss=0.5127, acc=76.47%]



Epoch 3 결과:
Training Loss: 0.4929, Accuracy: 86.27%
Validation Loss: 0.6757, Accuracy: 76.47%, F1 Score: 0.7561
클래스별 Accuracy:
Class 0: 80.00% (32/40)
Class 1: 87.04% (47/54)
Class 2: 84.62% (55/65)
Class 3: 53.45% (31/58)
Class 4: 78.18% (43/55)
모델 저장됨 (Validation Accuracy: 76.47%)


Epoch 4/10 [Train]: 100%|██████████| 68/68 [01:10<00:00,  1.03s/it, loss=0.0637, acc=94.10%]
Epoch 4/10 [Val]: 100%|██████████| 17/17 [00:04<00:00,  3.81it/s, loss=0.4344, acc=80.51%]



Epoch 4 결과:
Training Loss: 0.2501, Accuracy: 94.10%
Validation Loss: 0.6475, Accuracy: 80.51%, F1 Score: 0.7952
클래스별 Accuracy:
Class 0: 72.50% (29/40)
Class 1: 87.04% (47/54)
Class 2: 89.23% (58/65)
Class 3: 63.79% (37/58)
Class 4: 87.27% (48/55)
모델 저장됨 (Validation Accuracy: 80.51%)


Epoch 5/10 [Train]: 100%|██████████| 68/68 [01:10<00:00,  1.03s/it, loss=0.0571, acc=97.42%]
Epoch 5/10 [Val]: 100%|██████████| 17/17 [00:04<00:00,  3.82it/s, loss=0.6233, acc=81.99%]



Epoch 5 결과:
Training Loss: 0.1199, Accuracy: 97.42%
Validation Loss: 0.6465, Accuracy: 81.99%, F1 Score: 0.8100
클래스별 Accuracy:
Class 0: 72.50% (29/40)
Class 1: 92.59% (50/54)
Class 2: 86.15% (56/65)
Class 3: 65.52% (38/58)
Class 4: 90.91% (50/55)
모델 저장됨 (Validation Accuracy: 81.99%)


Epoch 6/10 [Train]: 100%|██████████| 68/68 [01:10<00:00,  1.03s/it, loss=0.0424, acc=99.35%]
Epoch 6/10 [Val]: 100%|██████████| 17/17 [00:04<00:00,  3.79it/s, loss=0.5711, acc=81.99%]



Epoch 6 결과:
Training Loss: 0.0497, Accuracy: 99.35%
Validation Loss: 0.7050, Accuracy: 81.99%, F1 Score: 0.8115
클래스별 Accuracy:
Class 0: 80.00% (32/40)
Class 1: 90.74% (49/54)
Class 2: 84.62% (55/65)
Class 3: 62.07% (36/58)
Class 4: 92.73% (51/55)


Epoch 7/10 [Train]: 100%|██████████| 68/68 [01:10<00:00,  1.03s/it, loss=0.0169, acc=99.82%]
Epoch 7/10 [Val]: 100%|██████████| 17/17 [00:04<00:00,  3.80it/s, loss=0.7734, acc=80.88%]



Epoch 7 결과:
Training Loss: 0.0286, Accuracy: 99.82%
Validation Loss: 0.7580, Accuracy: 80.88%, F1 Score: 0.8012
클래스별 Accuracy:
Class 0: 70.00% (28/40)
Class 1: 94.44% (51/54)
Class 2: 86.15% (56/65)
Class 3: 70.69% (41/58)
Class 4: 80.00% (44/55)


Epoch 8/10 [Train]: 100%|██████████| 68/68 [01:10<00:00,  1.03s/it, loss=0.0236, acc=99.63%]
Epoch 8/10 [Val]: 100%|██████████| 17/17 [00:04<00:00,  3.82it/s, loss=0.8827, acc=82.72%]



Epoch 8 결과:
Training Loss: 0.0228, Accuracy: 99.63%
Validation Loss: 0.7812, Accuracy: 82.72%, F1 Score: 0.8163
클래스별 Accuracy:
Class 0: 62.50% (25/40)
Class 1: 92.59% (50/54)
Class 2: 83.08% (54/65)
Class 3: 77.59% (45/58)
Class 4: 92.73% (51/55)
모델 저장됨 (Validation Accuracy: 82.72%)


Epoch 9/10 [Train]: 100%|██████████| 68/68 [01:10<00:00,  1.03s/it, loss=0.0105, acc=99.82%] 
Epoch 9/10 [Val]: 100%|██████████| 17/17 [00:04<00:00,  3.81it/s, loss=0.4008, acc=82.72%]



Epoch 9 결과:
Training Loss: 0.0141, Accuracy: 99.82%
Validation Loss: 0.7828, Accuracy: 82.72%, F1 Score: 0.8201
클래스별 Accuracy:
Class 0: 72.50% (29/40)
Class 1: 92.59% (50/54)
Class 2: 87.69% (57/65)
Class 3: 72.41% (42/58)
Class 4: 85.45% (47/55)


Epoch 10/10 [Train]: 100%|██████████| 68/68 [01:10<00:00,  1.03s/it, loss=0.0056, acc=99.91%] 
Epoch 10/10 [Val]: 100%|██████████| 17/17 [00:04<00:00,  3.78it/s, loss=0.4568, acc=83.09%]



Epoch 10 결과:
Training Loss: 0.0102, Accuracy: 99.91%
Validation Loss: 0.8066, Accuracy: 83.09%, F1 Score: 0.8219
클래스별 Accuracy:
Class 0: 80.00% (32/40)
Class 1: 94.44% (51/54)
Class 2: 86.15% (56/65)
Class 3: 62.07% (36/58)
Class 4: 92.73% (51/55)
모델 저장됨 (Validation Accuracy: 83.09%)


In [36]:
from transformers import AutoModelForSequenceClassification

# Example: BioBERT-based 5-class classifier restored from the saved weights.
# NOTE(review): training loaded "dmis-lab/biobert-v1.1" while this cell loads
# 'dmis-lab/biobert-base-cased-v1.1' — confirm the two checkpoints are interchangeable.
device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
model = AutoModelForSequenceClassification.from_pretrained(
    'dmis-lab/biobert-base-cased-v1.1',
    num_labels=5  # must match the number of classes the saved weights were trained with
)
# map_location lets weights saved from a GPU run be restored on a CPU-only machine.
model.load_state_dict(torch.load('/kaggle/working/best_biobert_model.pt', map_location=device))
model.to(device)

Some weights of BertForSequenceClassification were not initialized from the model checkpoint at dmis-lab/biobert-base-cased-v1.1 and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


BertForSequenceClassification(
  (bert): BertModel(
    (embeddings): BertEmbeddings(
      (word_embeddings): Embedding(28996, 768, padding_idx=0)
      (position_embeddings): Embedding(512, 768)
      (token_type_embeddings): Embedding(2, 768)
      (LayerNorm): LayerNorm((768,), eps=1e-12, elementwise_affine=True)
      (dropout): Dropout(p=0.1, inplace=False)
    )
    (encoder): BertEncoder(
      (layer): ModuleList(
        (0-11): 12 x BertLayer(
          (attention): BertAttention(
            (self): BertSdpaSelfAttention(
              (query): Linear(in_features=768, out_features=768, bias=True)
              (key): Linear(in_features=768, out_features=768, bias=True)
              (value): Linear(in_features=768, out_features=768, bias=True)
              (dropout): Dropout(p=0.1, inplace=False)
            )
            (output): BertSelfOutput(
              (dense): Linear(in_features=768, out_features=768, bias=True)
              (LayerNorm): LayerNorm((768,), eps=1e

In [45]:
model.eval()

# 4. Example inference
from transformers import AutoTokenizer

tokenizer = AutoTokenizer.from_pretrained('dmis-lab/biobert-base-cased-v1.1')
# Send the inputs to whichever device the model currently lives on instead of
# assuming CUDA, so the cell also runs when the model was loaded on the CPU.
model_device = next(model.parameters()).device
inputs = tokenizer("Patients with malnutrition, alcoholism, inflammatory bowel disease, and malabsorption syndromes are at an increased risk of zinc deficiency.", return_tensors="pt").to(model_device)

with torch.no_grad():
    outputs = model(**inputs)
    probs = torch.softmax(outputs.logits, dim=1)
    pred_class = torch.argmax(probs, dim=1)
    print("예측 클래스:", pred_class.item())

예측 클래스: 3


## Sentence Classification 실행 후 분류 결과 저장 (CSV)

In [1]:
from transformers import AutoTokenizer, AutoModelForSequenceClassification
import torch
import pandas as pd

# NOTE(review): no visible cell writes "./biobert_sentence_classifier" (training
# saves a state_dict to a different path) — confirm where this directory comes from.
model_path = "./biobert_sentence_classifier"
tokenizer = AutoTokenizer.from_pretrained(model_path)
# Fall back to the CPU when no GPU is available instead of failing on .to("cuda").
model = AutoModelForSequenceClassification.from_pretrained(model_path).to(
    "cuda" if torch.cuda.is_available() else "cpu"
)
model.eval()

def classify_sentence(sentence):
    """Return the predicted class id for a single sentence.

    Uses the notebook-level ``tokenizer`` and ``model``.

    Args:
        sentence: Text to classify.

    Returns:
        The index of the highest-scoring class as a Python ``int``.
    """
    # Follow the model's device rather than assuming CUDA, so the function
    # works wherever the model was placed.
    device = next(model.parameters()).device
    inputs = tokenizer(sentence, return_tensors="pt", truncation=True, padding=True).to(device)
    with torch.no_grad():
        logits = model(**inputs).logits
    # softmax is monotonic, so the argmax of the raw logits is the same class.
    return torch.argmax(logits, dim=1).item()

# Input CSV: 'sentences.csv' (column: 'text')
df = pd.read_csv("sentences.csv")
# Run the classifier on every row's text and store the predicted class id.
df['label'] = df['text'].map(classify_sentence)
df.to_csv("classified_sentences.csv", index=False)

ModuleNotFoundError: No module named 'transformers'

# NER

## NER 정답 데이터 생성

In [3]:
def create_prompt(text):
    """Build the prompt asking for BIO-format NER tags for one sentence.

    The prompt (written in Korean) lists the allowed entity tags, embeds
    ``text`` in double quotes, and shows the expected JSON result shape with
    ``"tokens"`` and ``"labels"`` arrays.

    Args:
        text: Sentence to be tagged; interpolated verbatim into the prompt.

    Returns:
        The prompt string.
    """
    # The literal JSON braces are doubled: single braces inside an f-string are
    # parsed as a replacement field and make the call fail at runtime.
    return f"""
다음 문장을 BIO 포맷으로 NER 태깅해줘. 가능한 태그는 다음과 같아:
- INGREDIENT, SYMPTOM, DOSAGE, TARGET, SENSITIVE, GENDER, AGE_GROUP

문장: "{text}"

결과 형식 (JSON):
{{
  "tokens": [...],
  "labels": [...]
}}
"""

## Fine-tuning

In [None]:




# ✅ 3. BioBERT로 NER fine-tuning 코드 (BIO 데이터 기반)

from transformers import AutoTokenizer, AutoModelForTokenClassification, Trainer, TrainingArguments, DataCollatorForTokenClassification
from torch.utils.data import Dataset
import json

class NERDataset(Dataset):
    """Dataset pairing tokenizer encodings with one label sequence per example.

    Args:
        encodings: Mapping from field name to a per-example indexable collection.
        labels: Per-example label sequences, positionally aligned with ``encodings``.
    """

    def __init__(self, encodings, labels):
        self.encodings, self.labels = encodings, labels

    def __len__(self):
        """Return the number of examples (one per label sequence)."""
        return len(self.labels)

    def __getitem__(self, idx):
        """Return every encoding field and the labels of example ``idx`` as tensors."""
        sample = {}
        for field, values in self.encodings.items():
            sample[field] = torch.tensor(values[idx])
        sample['labels'] = torch.tensor(self.labels[idx])
        return sample

# Load the data (BIO-format JSONL: one JSON object with "tokens" and "labels" per line).
# Explicit UTF-8 avoids platform-dependent decoding; blank lines are skipped.
with open("ner_dataset.jsonl", "r", encoding="utf-8") as f:
    data = [json.loads(line) for line in f if line.strip()]

label_list = ["O", "B-INGREDIENT", "I-INGREDIENT", "B-DOSAGE", "I-DOSAGE", "B-SYMPTOM", "I-SYMPTOM",
              "B-TARGET", "I-TARGET", "B-SENSITIVE", "I-SENSITIVE", "B-GENDER", "I-GENDER", "B-AGE_GROUP", "I-AGE_GROUP"]
label_to_id = {label: i for i, label in enumerate(label_list)}

# Preprocessing
tokenizer = AutoTokenizer.from_pretrained("dmis-lab/biobert-base-cased-v1.1")
tokens = [d['tokens'] for d in data]
word_labels = [[label_to_id[tag] for tag in d['labels']] for d in data]

# Each example must carry exactly one tag per word, otherwise alignment is undefined.
for example_idx, (words, tags) in enumerate(zip(tokens, word_labels)):
    if len(words) != len(tags):
        raise ValueError(
            f"Example {example_idx}: {len(words)} tokens but {len(tags)} labels"
        )

encodings = tokenizer(tokens, is_split_into_words=True, padding=True, truncation=True)

# The tokenizer splits words into sub-word pieces and adds special/padding
# tokens, so the word-level tags must be re-aligned to the encoded positions.
# Only the first piece of each word keeps the word's tag; special tokens,
# padding and continuation pieces get -100 so the loss ignores them.
labels = []
for example_idx, tags in enumerate(word_labels):
    previous_word = None
    aligned = []
    for word_idx in encodings.word_ids(batch_index=example_idx):
        if word_idx is None or word_idx == previous_word:
            aligned.append(-100)
        else:
            aligned.append(tags[word_idx])
        previous_word = word_idx
    labels.append(aligned)

dataset = NERDataset(encodings, labels)

# Model and training configuration
model = AutoModelForTokenClassification.from_pretrained("dmis-lab/biobert-base-cased-v1.1", num_labels=len(label_list))

data_collator = DataCollatorForTokenClassification(tokenizer)
# `evaluation_strategy` is omitted: newer transformers releases renamed it to
# `eval_strategy`, and its default is already "no", so leaving it out works
# across versions.
training_args = TrainingArguments(
    output_dir="./biobert_ner_output",
    num_train_epochs=3,
    per_device_train_batch_size=8,
    per_device_eval_batch_size=8,
    save_strategy="epoch",
    logging_dir="./logs"
)

trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=dataset,
    tokenizer=tokenizer,
    data_collator=data_collator
)

trainer.train()
trainer.save_model("./biobert_ner_model")