In [1]:
import torch
import torch.nn as nn
import torch.distributed as dist


import numpy as np
import pandas as pd
from pathlib import Path
from tqdm import tqdm

from datasets import load_dataset, load_from_disk
from transformers import (
    AutoConfig, AutoTokenizer, DataCollatorForLanguageModeling, 
    get_cosine_schedule_with_warmup
)
from sklearn.metrics import accuracy_score, precision_recall_fscore_support

from torch.utils.data import DataLoader
from torch.nn import CrossEntropyLoss
from transformers import Adafactor
import json
import os
import copy

  from .autonotebook import tqdm as notebook_tqdm


In [2]:
from vllm import LLM, SamplingParams
import pandas as pd
import torch
import torch._dynamo

# torch._dynamo.config.suppress_errors = True

2025-05-23 21:09:45,538	INFO util.py:154 -- Missing packages: ['ipywidgets']. Run `pip install -U ipywidgets`, then restart the notebook server for rich notebook output.


# QA Task

In [3]:

# LangChain의 프롬프트 템플릿 유지
qa_template = """
You are a medical professional proficient in both Korean and English. 
Your task is to carefully analyze a given medical question and its five answer choices, 
then provide the correct answer.
The answer choice will always follow the fixed format of Choice 1, Choice 2, Choice 3, Choice 4, Choice 5.
Return the response in the following format:
Choice answer

Now, analyze the following medical question and return the answer.

Question: {question}
Choice 1: {A}
Choice 2: {B}
Choice 3: {C}
Choice 4: {D}
Choice 5: {E}
Answer: 
"""

In [4]:
from datasets import load_from_disk,DatasetDict, Dataset

QA_dataset = load_from_disk("QA_dataset")

In [5]:
def format_qa_example_with_answer_text(example):
    # 보기 딕셔너리
    choices = {
        "Choice 1": example["A"],
        "Choice 2": example["B"],
        "Choice 3": example["C"],
        "Choice 4": example["D"],
        "Choice 5": example["E"]
    }

    # 정답 텍스트 붙이기
    choice_only = example["output"]  # 예: "Choice 2"
    full_answer = f"{choice_only}, {choices.get(choice_only, '')}"

    # 프롬프트 구성
    prompt = qa_template.format(
        question=example["input"],
        A=example["A"],
        B=example["B"],
        C=example["C"],
        D=example["D"],
        E=example["E"]
    )

    return {
        "input": prompt,
        "output": full_answer
    }


In [6]:
# train/test 각각 템플릿 + 정답 텍스트 포함 변환
formatted_train = QA_dataset["train"].map(format_qa_example_with_answer_text)
formatted_test = QA_dataset["test"].map(format_qa_example_with_answer_text)

# A~E는 필요 없으니 제거
formatted_QA_dataset = DatasetDict({
    "train": formatted_train.remove_columns(["A", "B", "C", "D", "E"]),
    "test": formatted_test.remove_columns(["A", "B", "C", "D", "E"]),
})


Map:   0%|          | 0/5991 [00:00<?, ? examples/s]

Map: 100%|██████████| 5991/5991 [00:00<00:00, 14693.53 examples/s]
Map: 100%|██████████| 1498/1498 [00:00<00:00, 14976.52 examples/s]


In [7]:
formatted_QA_dataset["train"][0]["input"]
# → "Choice 2, methyl mercaptan"


'\nYou are a medical professional proficient in both Korean and English. \nYour task is to carefully analyze a given medical question and its five answer choices, \nthen provide the correct answer.\nThe answer choice will always follow the fixed format of Choice 1, Choice 2, Choice 3, Choice 4, Choice 5.\nReturn the response in the following format:\nChoice answer\n\nNow, analyze the following medical question and return the answer.\n\nQuestion: 세균의 성장에 필요한 철 이온을 고갈시켜 영양면역 (nutritional immunity) 기능을 갖는 타액 내 단백질은?\nChoice 1: 스타테린(statherin)\nChoice 2: 락토페린(lactoferrin)\nChoice 3: 라이소자임(lysozyme)\nChoice 4: 트랜스페린(transferrin)\nChoice 5: 면역글로불린(immunoglobulin)\nAnswer: \n'

In [8]:
# formatted_QA_dataset.save_to_disk("formatted_medical_QA_with_answers")


In [9]:
formatted_QA_dataset

DatasetDict({
    train: Dataset({
        features: ['input', 'output'],
        num_rows: 5991
    })
    test: Dataset({
        features: ['input', 'output'],
        num_rows: 1498
    })
})

# Dialogue Task

In [10]:
template = """
You are a medical professional proficient in both Korean and English.
Your task is to carefully read the conversation between a Client and a Doctor, 
then predict the next response from the Doctor in a professional and empathetic manner.

Please respond in **Korean** only.

Now produce the next Doctor response to the Client’s last question.
Return your answer in Korean.

***Conversation:
{conversation}

***Answer:
{Answer}
"""


In [13]:
from datasets import load_from_disk,DatasetDict, Dataset

Dialogue_dataset = load_from_disk("Dialogue_data_all")

Dialogue_dataset['train']['input'][0], Dialogue_dataset['train']['output'][0]

Dialogue_dataset

DatasetDict({
    train: Dataset({
        features: ['input', 'output', 'all_dialogue'],
        num_rows: 9484
    })
    test: Dataset({
        features: ['input', 'output', 'all_dialogue'],
        num_rows: 2372
    })
})

In [14]:
# 템플릿 적용 함수
def apply_template(example):
    formatted_input = template.format(
        conversation=example["input"],
        Answer=example["output"]
    )
    return {
        "input": formatted_input,
        "output": example["output"]
    }

# train, test 각각 템플릿 적용
new_train = Dialogue_dataset["train"].map(apply_template)
new_test = Dialogue_dataset["test"].map(apply_template)

# 새 DatasetDict로 묶기
templated_dataset = DatasetDict({
    "train": new_train,
    "test": new_test
})



Map: 100%|██████████| 9484/9484 [00:00<00:00, 25362.15 examples/s]
Map: 100%|██████████| 2372/2372 [00:00<00:00, 27566.50 examples/s]


In [15]:
# 저장 (선택사항)
# templated_dataset.save_to_disk("templated_doctor_dialogue")

In [16]:
# source 컬럼 추가 함수
def add_source_column(dataset, source_label):
    return dataset.map(lambda example: {**example, "source": source_label})

In [17]:
from datasets import concatenate_datasets, load_dataset

# Dialogue
templated_train = add_source_column(templated_dataset["train"], "dialogue")
templated_test = add_source_column(templated_dataset["test"], "dialogue")

# QA
qa_train = add_source_column(formatted_QA_dataset["train"], "qa")
qa_test = add_source_column(formatted_QA_dataset["test"], "qa")

# 합치기
combined_train = concatenate_datasets([templated_train, qa_train])
combined_test = concatenate_datasets([templated_test, qa_test])

# 최종 DatasetDict
combined_dataset = DatasetDict({
    "train": combined_train,
    "test": combined_test
})


Map: 100%|██████████| 9484/9484 [00:00<00:00, 19899.58 examples/s]
Map: 100%|██████████| 2372/2372 [00:00<00:00, 21368.07 examples/s]
Map: 100%|██████████| 5991/5991 [00:00<00:00, 35841.18 examples/s]
Map: 100%|██████████| 1498/1498 [00:00<00:00, 34586.58 examples/s]


In [18]:
print(combined_dataset["train"][0]["source"])  # 'dialogue' 또는 'qa'


dialogue


In [None]:
# combined_dataset.save_to_disk("combined_dataset_short")

Saving the dataset (1/1 shards): 100%|██████████| 15475/15475 [00:00<00:00, 236551.36 examples/s]
Saving the dataset (1/1 shards): 100%|██████████| 3870/3870 [00:00<00:00, 338476.03 examples/s]
