# Preprocessing

- QA Task
    - QA Task Train / Test split
    - QA instruction prompt
    - 추가 전처리
    
- Dialogue Task
    - Dialogue Task Train / Test split
    - [(Input) : Client - Doctor - Client || (Output) : Doctor]  Dilaogue 데이터셋 분리
    - Dialogue Instruction Prompt
    - 추가 전처리

- Total Dataset
    - 이 2개의 데이터 셋을 concatenate, 추가 전처리 수행

In [1]:
import torch
import torch.nn as nn
import torch.distributed as dist


import numpy as np
import pandas as pd
from pathlib import Path
from tqdm import tqdm

from datasets import load_dataset, load_from_disk, concatenate_datasets
from transformers import (
    AutoConfig, AutoTokenizer, DataCollatorForLanguageModeling, 
    get_cosine_schedule_with_warmup
)
from sklearn.metrics import accuracy_score, precision_recall_fscore_support

from torch.utils.data import DataLoader
from torch.nn import CrossEntropyLoss
from transformers import Adafactor
import json
import os
import copy

  from .autonotebook import tqdm as notebook_tqdm


# Dialogue Task

- Train, Test 분리 및 Instruction Prompt 포함되게 만듦

In [2]:
from datasets import load_dataset

ds = load_dataset("squarelike/ko_medical_chat")

In [3]:
from datasets import Dataset

def extract_sft_pairs(dataset):
    new_data = {"input": [], "output": [], "all_dialogue": []}

    for example in dataset:
        conv = example["conversations"]  # 전체 메시지 리스트

        # 전체 대화를 텍스트로 합쳐놓기
        full_dialogue = ""
        for turn in conv:
            speaker = turn['from'].capitalize()
            message = turn['value']
            full_dialogue += f"{speaker}: {message}\n"

        for i in range(3, len(conv)):
            try:
                if (conv[i-3]['from'] == 'client' and
                    conv[i-2]['from'] == 'doctor' and
                    conv[i-1]['from'] == 'client' and
                    conv[i]['from'] == 'doctor'):

                    input_text = (
                        f"Client: {conv[i-3]['value']}\n"
                        f"Doctor: {conv[i-2]['value']}\n"
                        f"Client: {conv[i-1]['value']}"
                    )
                    output_text = f"Doctor: {conv[i]['value']}"

                    new_data["input"].append(input_text)
                    new_data["output"].append(output_text)
                    new_data["all_dialogue"].append(full_dialogue.strip())  # 전체 대화도 추가
            except (IndexError, KeyError):
                continue

    return Dataset.from_dict(new_data)


In [4]:
from datasets import DatasetDict

sft_dataset = extract_sft_pairs(ds['train'])
Dialogue_dataset = sft_dataset.train_test_split(test_size=0.2, seed=42)

### Template 붙이기

In [5]:
template = """
You are a medical professional proficient in both Korean and English.
Your task is to carefully read the conversation between a Client and a Doctor, 
then predict the next response from the Doctor in a professional and empathetic manner.

Please respond in **Korean** only.

Now produce the next Doctor response to the Client’s last question.
Return your answer in Korean.

***Conversation:
{conversation}

***Answer:
{Answer}
"""

In [7]:
# 템플릿 적용 함수
def apply_template(example):
    formatted_input = template.format(
        conversation=example["input"],
        Answer=example["output"]
    )
    return {
        "input": formatted_input,
        "output": example["output"]
    }

# train, test 각각 템플릿 적용
new_train = Dialogue_dataset["train"].map(apply_template)
new_test = Dialogue_dataset["test"].map(apply_template)

# 새 DatasetDict로 묶기
templated_dataset = DatasetDict({
    "train": new_train,
    "test": new_test
})



Map: 100%|██████████| 9484/9484 [00:00<00:00, 13363.85 examples/s]
Map: 100%|██████████| 2372/2372 [00:00<00:00, 16230.97 examples/s]


# QA Task

- Train, Test 분리
- Instruction Prompt 포함
- 일부 데이터 한국어로 변형

In [8]:
dentist = load_dataset("sean0042/KorMedMCQA", 'dentist') # 추가 전처리 필요
doctor = load_dataset("sean0042/KorMedMCQA", 'doctor') # 추가 전처리 필요
nurse = load_dataset("sean0042/KorMedMCQA", 'nurse') # 추가 전처리 필요
pharm = load_dataset("sean0042/KorMedMCQA", 'pharm') # 추가 전처리 필요


In [9]:
def concatenate_all(all_datasets):
    combined_dataset = concatenate_datasets([
        all_datasets["train"],
        all_datasets["dev"],
        all_datasets["test"],
        all_datasets["fewshot"]
    ])

    return combined_dataset

In [10]:
dentist_all = concatenate_all(dentist)
doctor_all = concatenate_all(doctor)
nurse_all = concatenate_all(nurse)
pharm_all = concatenate_all(pharm)

In [11]:
doctor_all

Dataset({
    features: ['subject', 'year', 'period', 'q_number', 'question', 'A', 'B', 'C', 'D', 'E', 'answer', 'cot'],
    num_rows: 2494
})

In [12]:
combined_dataset = concatenate_datasets([dentist_all, doctor_all, nurse_all, pharm_all])


### Train Test 분리

In [13]:
import pandas as pd
from sklearn.model_selection import train_test_split
from datasets import DatasetDict



split = combined_dataset.train_test_split(test_size=0.2, seed=42)

final_datasets = {
    "train": split["train"],
    "test": split["test"]
}

final_datasets = DatasetDict(final_datasets)


In [14]:
from datasets import DatasetDict

# 필요한 열만 유지
columns_to_keep = ['question', 'A', 'B', 'C', 'D', 'E', 'answer']

final_datasets = DatasetDict({
    split: dataset.remove_columns([col for col in dataset.column_names if col not in columns_to_keep])
    for split, dataset in final_datasets.items()
})

# 'answer' 컬럼을 'output'으로 이름 변경
QA_dataset = DatasetDict({
    split: dataset.rename_column("answer", "output")
    for split, dataset in final_datasets.items()
})


### Instruction Prompt 부여

In [15]:

# LangChain의 프롬프트 템플릿 유지
qa_template = """
You are a medical professional proficient in both Korean and English. 
Your task is to carefully analyze a given medical question and its five answer choices, 
then provide the correct answer.
The answer choice will always follow the fixed format of Choice 1, Choice 2, Choice 3, Choice 4, Choice 5.
Return the response in the following format:
Choice answer

Now, analyze the following medical question and return the answer.

Question: {question}
Choice 1: {A}
Choice 2: {B}
Choice 3: {C}
Choice 4: {D}
Choice 5: {E}
Answer: 
"""

In [16]:
def format_qa_example_with_answer_text(example):
    # 보기 딕셔너리
    choices = {
        "Choice 1": example["A"],
        "Choice 2": example["B"],
        "Choice 3": example["C"],
        "Choice 4": example["D"],
        "Choice 5": example["E"]
    }

    # 정답 텍스트 붙이기
    choice_only = example["output"]  # 예: "Choice 2"
    full_answer = f"{choice_only}, {choices.get(choice_only, '')}"

    # 프롬프트 구성
    prompt = qa_template.format(
        question=example["question"],
        A=example["A"],
        B=example["B"],
        C=example["C"],
        D=example["D"],
        E=example["E"]
    )

    return {
        "input": prompt,
        "output": full_answer
    }


In [17]:
# train/test 각각 템플릿 + 정답 텍스트 포함 변환
formatted_train = QA_dataset["train"].map(format_qa_example_with_answer_text)
formatted_test = QA_dataset["test"].map(format_qa_example_with_answer_text)

# A~E는 필요 없으니 제거
formatted_QA_dataset = DatasetDict({
    "train": formatted_train.remove_columns(["A", "B", "C", "D", "E"]),
    "test": formatted_test.remove_columns(["A", "B", "C", "D", "E"]),
})


In [18]:
formatted_QA_dataset

DatasetDict({
    train: Dataset({
        features: ['question', 'output', 'input'],
        num_rows: 5991
    })
    test: Dataset({
        features: ['question', 'output', 'input'],
        num_rows: 1498
    })
})

# Total Dataset

- Dialogue / QA Dataset을 한꺼번에 처리

In [19]:
# source 컬럼 추가 함수
def add_source_column(dataset, source_label):
    return dataset.map(lambda example: {**example, "source": source_label})

In [20]:
from datasets import concatenate_datasets, load_dataset

# Dialogue
templated_train = add_source_column(templated_dataset["train"], "dialogue")
templated_test = add_source_column(templated_dataset["test"], "dialogue")

# QA
qa_train = add_source_column(formatted_QA_dataset["train"], "qa")
qa_test = add_source_column(formatted_QA_dataset["test"], "qa")

# 합치기
combined_train = concatenate_datasets([templated_train, qa_train])
combined_test = concatenate_datasets([templated_test, qa_test])

# 최종 DatasetDict
combined_dataset = DatasetDict({
    "train": combined_train,
    "test": combined_test
})


Map: 100%|██████████| 9484/9484 [00:00<00:00, 19120.46 examples/s]
Map: 100%|██████████| 2372/2372 [00:00<00:00, 20536.76 examples/s]
Map: 100%|██████████| 5991/5991 [00:00<00:00, 23947.40 examples/s]
Map: 100%|██████████| 1498/1498 [00:00<00:00, 24267.47 examples/s]


### 추가 전처리

- Doctor, Client 의미 제거
- 몇몇 단어 한국어 변환 및 표현 명확화

In [21]:
import re

def transform_prompt(prompt: str) -> str:
    # 추가할 지침 문장
    role_instruction = "You will always respond as the doctor, and your answer must be written in Korean only.\n\n"

    # 1. '***Conversation:' 위치 찾기
    parts = prompt.split('***Conversation:')
    if len(parts) != 2:
        return prompt  # 변환할 수 없는 구조는 그대로 반환

    header, conversation = parts

    # 2. Client:, Doctor: 라벨 제거
    conversation_cleaned = re.sub(r'^(Client|Doctor):\s*', '', conversation, flags=re.MULTILINE)

    # 3. 역할 문장을 Conversation 위에 삽입
    transformed_prompt = header.strip() + "\n\n" + role_instruction + "***Conversation:" + conversation_cleaned

    return transformed_prompt

In [23]:
def truncate_after_answer(example):
    if example['source'] == 'dialogue':
        example['input'] = transform_prompt(example['input'])
        example['output'] = re.sub(r'^(Client|Doctor):\s*', '', example['output'], flags=re.MULTILINE)

    if example['source'] == 'qa':
        example['input'] = example['input'].replace('Choice','선택')
        example['input'] = example['input'].replace('Answer:','***Answer:').replace('Question:','***Question:' )
        example['input'] = example['input'].strip()
        example['output'] = example['output'].replace('Choice','선택').strip()
        
    if '***Answer:' in example['input']:
        # '***Answer:'까지만 남기고 뒷부분 삭제
        idx = example['input'].index('***Answer:') + len('***Answer:')
        example['input'] = example['input'][:idx].strip()
    return example


In [24]:

# 적용
train_cleaned = combined_dataset['train'].map(truncate_after_answer)
test_cleaned = combined_dataset['test'].map(truncate_after_answer)

# 다시 DatasetDict로
cleaned_dataset = DatasetDict({
    'train': train_cleaned,
    'test': test_cleaned
})


cleaned_dataset.save_to_disk("/nas_homes/projects/rapa/Dataset/cleaned_dataset_with_answer_tag_new") ## QA / Dialogue 데이터셋 모두 저장

Map: 100%|██████████| 15475/15475 [00:00<00:00, 15599.41 examples/s]
Map: 100%|██████████| 3870/3870 [00:00<00:00, 16011.54 examples/s]
Saving the dataset (1/1 shards): 100%|██████████| 15475/15475 [00:03<00:00, 4799.41 examples/s]
Saving the dataset (1/1 shards): 100%|██████████| 3870/3870 [00:00<00:00, 4777.46 examples/s]
