### 10. 고급 미세 튜닝 - 약품 분류하기

### 사전 준비
 * 구글 코랩 환경은 일정 시간이후에 초기화가 되기 때문에 두가지 작업을 매번 수행해야 함.
   * chatgpt.env 파일 생성이 필요.
     * 준비된 chatgpt.env를 내용을 변경하여 업로드 하거나 또는 API_KEY와 ORG_ID를 확인하여 생성한다.
   * pip install openai 설치
   * 캐글 데이터 셋 다운로드 후, 업로드
     * https://www.kaggle.com/datasets/saratchendra/medicine-recommendation 또는 https://www.kaggle.com/datasets/saratchendra/medicine-recommendation/download?datasetVersionNumber=1
     * 파일 이름 : 'Medicine_description.xlsx

### 학습 내용
 * 판다스를 이용한 데이터 포맷 변경
 * 미세 튜닝된 모델 테스트하기

In [4]:
!pip install openai



### 판다스를 이용한 데이터 포맷 변경

In [4]:
# 판다스 라이브러리 불러오기
import pandas as pd

# 처음 n개의 행 읽기
n = 2000
df = pd.read_excel('Medicine_description.xlsx', sheet_name='Sheet1', header=0, nrows=n)
df

Unnamed: 0,Drug_Name,Reason,Description
0,A CN Gel(Topical) 20gmA CN Soap 75gm,Acne,Mild to moderate acne (spots)
1,A Ret 0.05% Gel 20gmA Ret 0.1% Gel 20gmA Ret 0...,Acne,A RET 0.025% is a prescription medicine that i...
2,ACGEL CL NANO Gel 15gm,Acne,It is used to treat acne vulgaris in people 12...
3,ACGEL NANO Gel 15gm,Acne,It is used to treat acne vulgaris in people 12...
4,Acleen 1% Lotion 25ml,Acne,treat the most severe form of acne (nodular ac...
...,...,...,...
1995,Cilny 10mg Tablet 15'S,Angina,prevents angina and works by widening the blo...
1996,Cinwox 25mg Tablet 10'S,Angina,prevents angina and works by widening the blo...
1997,Coralan 5mg Tablet 14'S,Angina,prevents angina and works by widening the blo...
1998,Coralan 7.5mg Tablet 14'S,Angina,prevents angina and works by widening the blo...


In [5]:
# ‘Reason’ 열에서 고유한 값들 얻기
reasons = df["Reason"].unique()
print(reasons)

# 각 Reason에 번호 할당
reasons_dict = {reason : i for i, reason in enumerate(reasons)}
reasons_dict


['Acne' 'Adhd' 'Allergies' 'Alzheimer' 'Amoebiasis' 'Anaemia' 'Angina']


{'Acne': 0,
 'Adhd': 1,
 'Allergies': 2,
 'Alzheimer': 3,
 'Amoebiasis': 4,
 'Anaemia': 5,
 'Angina': 6}

In [6]:
# ‘Reason_Num'컬럼에 각 대응하는 인덱스 값으로 변경
df["Reason_Num"] = " " + df["Reason"].apply(lambda x : "" + str(reasons_dict[x]))
df

Unnamed: 0,Drug_Name,Reason,Description,Reason_Num
0,A CN Gel(Topical) 20gmA CN Soap 75gm,Acne,Mild to moderate acne (spots),0
1,A Ret 0.05% Gel 20gmA Ret 0.1% Gel 20gmA Ret 0...,Acne,A RET 0.025% is a prescription medicine that i...,0
2,ACGEL CL NANO Gel 15gm,Acne,It is used to treat acne vulgaris in people 12...,0
3,ACGEL NANO Gel 15gm,Acne,It is used to treat acne vulgaris in people 12...,0
4,Acleen 1% Lotion 25ml,Acne,treat the most severe form of acne (nodular ac...,0
...,...,...,...,...
1995,Cilny 10mg Tablet 15'S,Angina,prevents angina and works by widening the blo...,6
1996,Cinwox 25mg Tablet 10'S,Angina,prevents angina and works by widening the blo...,6
1997,Coralan 5mg Tablet 14'S,Angina,prevents angina and works by widening the blo...,6
1998,Coralan 7.5mg Tablet 14'S,Angina,prevents angina and works by widening the blo...,6


In [7]:
# 각 이름의 끝에 줄바꿈의 "\n"과 Malady 추가
df["Drug_Name"] = "Drug : " + df["Drug_Name"] + "\n" + "Malady:"
df["Drug_Name"]

0       Drug : A CN Gel(Topical) 20gmA CN Soap 75gm\nM...
1       Drug : A Ret 0.05% Gel 20gmA Ret 0.1% Gel 20gm...
2                  Drug : ACGEL CL NANO Gel 15gm\nMalady:
3                     Drug : ACGEL NANO Gel 15gm\nMalady:
4                   Drug : Acleen 1% Lotion 25ml\nMalady:
                              ...                        
1995               Drug : Cilny 10mg Tablet 15'S\nMalady:
1996              Drug : Cinwox 25mg Tablet 10'S\nMalady:
1997              Drug : Coralan 5mg Tablet 14'S\nMalady:
1998            Drug : Coralan 7.5mg Tablet 14'S\nMalady:
1999    Drug : Corflo 10mg Tablet 20'SCorflo 5mg Table...
Name: Drug_Name, Length: 2000, dtype: object

In [8]:
# ‘Reason', 'Description' 열 삭제하기’
df.drop(['Reason','Description'], axis=1, inplace=True)
df

Unnamed: 0,Drug_Name,Reason_Num
0,Drug : A CN Gel(Topical) 20gmA CN Soap 75gm\nM...,0
1,Drug : A Ret 0.05% Gel 20gmA Ret 0.1% Gel 20gm...,0
2,Drug : ACGEL CL NANO Gel 15gm\nMalady:,0
3,Drug : ACGEL NANO Gel 15gm\nMalady:,0
4,Drug : Acleen 1% Lotion 25ml\nMalady:,0
...,...,...
1995,Drug : Cilny 10mg Tablet 15'S\nMalady:,6
1996,Drug : Cinwox 25mg Tablet 10'S\nMalady:,6
1997,Drug : Coralan 5mg Tablet 14'S\nMalady:,6
1998,Drug : Coralan 7.5mg Tablet 14'S\nMalady:,6


### 샘플 JSON 데이터 셋
```
{"messages": [{"role": "system", "content": [시스템메시지] }, {"role": "user", "content": [입력]}, {"role": "assistant", "content": "AI답변"}]}
{"messages": [{"role": "system", "content": "너는 의료 전문가 챗봇이야."}, {"role": "user", "content": "A CN Gel(Topical) 20gmA CN Soap 75gm"}, {"role": "assistant", "content": "Acne"}]}

```

### jsonl파일로 준비

In [9]:
print(df.shape)
df.head()

(2000, 2)


Unnamed: 0,Drug_Name,Reason_Num
0,Drug : A CN Gel(Topical) 20gmA CN Soap 75gm\nM...,0
1,Drug : A Ret 0.05% Gel 20gmA Ret 0.1% Gel 20gm...,0
2,Drug : ACGEL CL NANO Gel 15gm\nMalady:,0
3,Drug : ACGEL NANO Gel 15gm\nMalady:,0
4,Drug : Acleen 1% Lotion 25ml\nMalady:,0


In [10]:
import json
from sklearn.model_selection import train_test_split

In [11]:
# 학습용 80%, 검증용 20%의 데이터로 준비
df_train, df_test = train_test_split(df, test_size=0.2, random_state=42)
print(df_train.shape, df_test.shape)

(1600, 2) (400, 2)


In [12]:
# JSONL 파일로 저장
with open('drug_malady_data_train.jsonl', 'w') as file:
    for index, row in df.iterrows():
        json_data = {
            "messages": [
                {"role": "system", "content": "너는 의료 전문가 챗봇이야."},
                {"role": "user", "content": row['Drug_Name']},
                {"role": "assistant", "content": str(row['Reason_Num'])}
            ]
        }
        file.write(json.dumps(json_data) + '\n')

with open('drug_malady_data_test.jsonl', 'w') as file:
    for index, row in df.iterrows():
        json_data = {
            "messages": [
                {"role": "system", "content": "너는 의료 전문가 챗봇이야."},
                {"role": "user", "content": row['Drug_Name']},
                {"role": "assistant", "content": str(row['Reason_Num'])}
            ]
        }
        file.write(json.dumps(json_data) + '\n')

### OpenAI API Key 초기 설정

In [7]:
# OpenAI API 키를 설정합니다.
import os
from openai import OpenAI

def init_api():
    with open("chatgpt.env") as env:
       for line in env:
           key, value = line.strip().split("=")
           os.environ[key] = value

init_api()

client = OpenAI(api_key  = os.environ.get("API_KEY"),
                organization  = os.environ.get("ORG_ID"))

In [8]:
from datetime import datetime
import pytz

def format_timestamp_to_kst(timestamp):
    """
    타임스탬프가 주어지면 KST (한국 시간대) 시간 문자열로 변환하고, None이면 None을 반환합니다.
    """
    if timestamp is not None:
        utc_time = datetime.utcfromtimestamp(timestamp)
        utc_time = utc_time.replace(tzinfo=pytz.UTC)
        kst_time = utc_time.astimezone(pytz.timezone("Asia/Seoul"))
        return kst_time.strftime('%Y-%m-%d %H:%M:%S')
    return None

### 파일 변환

In [15]:
# 파일 목록을 가져옵니다.
files = client.files.list()

# 기존에 업로드된 파일 중 동일한 이름의 파일을 찾아 삭제합니다.
for file in files.data:
  change_time = format_timestamp_to_kst(file.created_at)
  print(f"id : {file.id}, created_at : {change_time}, filename : {file.filename}")
  print(file)

id : file-E24rU3OEQQRqGd8Bj2GVhBJX, created_at : 2023-12-10 00:36:02, filename : step_metrics.csv
FileObject(id='file-E24rU3OEQQRqGd8Bj2GVhBJX', bytes=26204, created_at=1702136162, filename='step_metrics.csv', object='file', purpose='fine-tune-results', status='processed', status_details=None)
id : file-LsSQzOX8cuiUNqrioN85OGI4, created_at : 2023-12-09 23:43:38, filename : drug_malady_data_test.jsonl
FileObject(id='file-LsSQzOX8cuiUNqrioN85OGI4', bytes=484084, created_at=1702133018, filename='drug_malady_data_test.jsonl', object='file', purpose='fine-tune', status='processed', status_details=None)
id : file-mAtmFjKBy5y1Ut2hiRA6JpHe, created_at : 2023-12-09 23:43:37, filename : drug_malady_data_train.jsonl
FileObject(id='file-mAtmFjKBy5y1Ut2hiRA6JpHe', bytes=484084, created_at=1702133017, filename='drug_malady_data_train.jsonl', object='file', purpose='fine-tune', status='processed', status_details=None)
id : file-BPBHJKjuUW2x2Hr6kDqEbtxA, created_at : 2023-12-08 18:40:35, filename : st

### 파일 업로드

In [None]:
# 훈련 데이터 파일 업로드
train_file = client.files.create(
  file=open("drug_malady_data_train.jsonl", "rb"),
  purpose="fine-tune"
)

# 검증 데이터 파일 업로드
valid_file = client.files.create(
  file=open("drug_malady_data_test.jsonl", "rb"),
  purpose="fine-tune"
)

In [None]:
# Fine-tune 작업 시작 (예: 'davinci-002' 사용)
fine_tune_response = client.fine_tuning.jobs.create(
  training_file=train_file.id,
  validation_file=valid_file.id,
  model="gpt-3.5-turbo", # 또는 'babbage-002', 'gpt-3.5-turbo', 'davinci-002'로 변경 가능
  suffix="drug_malady_data"
)

In [None]:
print(fine_tune_response.id)

ftjob-zvzf8Zph1fG5b2PKfxqudqtf


In [9]:
### 현재 미세 조정 상태 확인
job_id = "ftjob-zvzf8Zph1fG5b2PKfxqudqtf"
job_status = client.fine_tuning.jobs.retrieve(job_id)

job_status

FineTuningJob(id='ftjob-zvzf8Zph1fG5b2PKfxqudqtf', created_at=1702133070, error=None, fine_tuned_model='ft:gpt-3.5-turbo-0613:personal:drug-malady-data:8Ttey3h4', finished_at=1702136160, hyperparameters=Hyperparameters(n_epochs=3, batch_size=4, learning_rate_multiplier=2), model='gpt-3.5-turbo-0613', object='fine_tuning.job', organization_id='org-VSSDsvSmnHA1izxVflKKw4HZ', result_files=['file-E24rU3OEQQRqGd8Bj2GVhBJX'], status='succeeded', trained_tokens=314130, training_file='file-mAtmFjKBy5y1Ut2hiRA6JpHe', validation_file='file-LsSQzOX8cuiUNqrioN85OGI4')

In [21]:
### 현재 미세 조정 상태 확인
job_id = "ftjob-zvzf8Zph1fG5b2PKfxqudqtf"

job_status = client.fine_tuning.jobs.retrieve(job_id)

created_time = format_timestamp_to_kst(job_status.created_at)
finished_time = format_timestamp_to_kst(job_status.finished_at)
print(f"created_at : {created_time}, finished_at : {finished_time}, status : {job_status.status}" )
print(f"fine_tuned_model : {job_status.fine_tuned_model}, model : {job_status.model},  result_files : {job_status.result_files}")
print(f"trained_tokens : {job_status.trained_tokens}, error : {job_status.error},  hyperparameters : {job_status.hyperparameters}")

print(job_status)

created_at : 2023-12-09 23:44:30, finished_at : 2023-12-10 00:36:00, status : succeeded
fine_tuned_model : ft:gpt-3.5-turbo-0613:personal:drug-malady-data:8Ttey3h4, model : gpt-3.5-turbo-0613,  result_files : ['file-E24rU3OEQQRqGd8Bj2GVhBJX']
trained_tokens : 314130, error : None,  hyperparameters : Hyperparameters(n_epochs=3, batch_size=4, learning_rate_multiplier=2)
FineTuningJob(id='ftjob-zvzf8Zph1fG5b2PKfxqudqtf', created_at=1702133070, error=None, fine_tuned_model='ft:gpt-3.5-turbo-0613:personal:drug-malady-data:8Ttey3h4', finished_at=1702136160, hyperparameters=Hyperparameters(n_epochs=3, batch_size=4, learning_rate_multiplier=2), model='gpt-3.5-turbo-0613', object='fine_tuning.job', organization_id='org-VSSDsvSmnHA1izxVflKKw4HZ', result_files=['file-E24rU3OEQQRqGd8Bj2GVhBJX'], status='succeeded', trained_tokens=314130, training_file='file-mAtmFjKBy5y1Ut2hiRA6JpHe', validation_file='file-LsSQzOX8cuiUNqrioN85OGI4')


In [22]:
### 현재 미세 조정 상태 확인
job_id = "ftjob-zvzf8Zph1fG5b2PKfxqudqtf"

job_list_events =client.fine_tuning.jobs.list_events(fine_tuning_job_id= job_id , limit=15)
for e in job_list_events.data:
  print(e)

FineTuningJobEvent(id='ftevent-X6KL7s8ymKcmrhuFbBpJGaTN', created_at=1702136164, level='info', message='The job has successfully completed', object='fine_tuning.job.event', data={}, type='message')
FineTuningJobEvent(id='ftevent-4NC52IdsHotDJOAl70DyUoB6', created_at=1702136161, level='info', message='New fine-tuned model created: ft:gpt-3.5-turbo-0613:personal:drug-malady-data:8Ttey3h4', object='fine_tuning.job.event', data={}, type='message')
FineTuningJobEvent(id='ftevent-udk9JJ3zZPfyjReL37TfIoqx', created_at=1702135955, level='info', message='Step 1401/1500: training loss=0.00, validation loss=0.00', object='fine_tuning.job.event', data={'step': 1401, 'train_loss': 4.76837158203125e-07, 'valid_loss': 4.76837158203125e-07, 'train_mean_token_accuracy': 1.0, 'valid_mean_token_accuracy': 0.5}, type='metrics')
FineTuningJobEvent(id='ftevent-70GCIP1IQPsNVy8urkr5Krru', created_at=1702135744, level='info', message='Step 1301/1500: training loss=0.33, validation loss=0.00', object='fine_tuni

In [None]:
### 현재 미세 조정 상태 확인
job_id = "ftjob-MGf34t0EqAywt8WJypnipJXb"
job_status = client.fine_tuning.jobs.retrieve(job_id)

print("객체 상태 전체 정보 : ", job_status)

객체 상태 전체 정보 :  FineTuningJob(id='ftjob-MGf34t0EqAywt8WJypnipJXb', created_at=1701855458, error=None, fine_tuned_model=None, finished_at=None, hyperparameters=Hyperparameters(n_epochs=3, batch_size=3, learning_rate_multiplier=2), model='davinci-002', object='fine_tuning.job', organization_id='org-VSSDsvSmnHA1izxVflKKw4HZ', result_files=[], status='running', trained_tokens=None, training_file='file-IPOA67JLwoRVXBRNDOZwFPq1', validation_file='file-0KYZqcFqA9c6f6boNGLWh5lQ')


### 전체 미세 조정 작업 리스트 확인

In [10]:
models = client.fine_tuning.jobs.list()

print("미세 조정 모델 리스트 개수 : ", len(models.data))

for m in models.data:
  created_time = format_timestamp_to_kst(m.created_at)
  finished_time = format_timestamp_to_kst(m.finished_at)

  print(f"id : {m.id}")
  print(f"created_at : {created_time}, finished_at : {finished_time}, status : {m.status}")
  print(f"fine_tuned_model : {m.fine_tuned_model}, model : {m.model},  result_files : {m.result_files}")
  print(f"trained_tokens : {m.trained_tokens}, error : {m.error},  hyperparameters : {m.hyperparameters}")
  print()


미세 조정 모델 리스트 개수 :  10
id : ftjob-zvzf8Zph1fG5b2PKfxqudqtf
created_at : 2023-12-09 23:44:30, finished_at : 2023-12-10 00:36:00, status : succeeded
fine_tuned_model : ft:gpt-3.5-turbo-0613:personal:drug-malady-data:8Ttey3h4, model : gpt-3.5-turbo-0613,  result_files : ['file-E24rU3OEQQRqGd8Bj2GVhBJX']
trained_tokens : 314130, error : None,  hyperparameters : Hyperparameters(n_epochs=3, batch_size=4, learning_rate_multiplier=2)

id : ftjob-7ZZdOZAgFhZmFb4qspk2XoWv
created_at : 2023-12-08 18:33:45, finished_at : 2023-12-08 18:40:33, status : succeeded
fine_tuned_model : ft:gpt-3.5-turbo-0613:personal::8TRdS2aQ, model : gpt-3.5-turbo-0613,  result_files : ['file-BPBHJKjuUW2x2Hr6kDqEbtxA']
trained_tokens : 6110, error : None,  hyperparameters : Hyperparameters(n_epochs=10, batch_size=1, learning_rate_multiplier=2)

id : ftjob-MGf34t0EqAywt8WJypnipJXb
created_at : 2023-12-06 18:37:38, finished_at : 2023-12-06 18:47:28, status : succeeded
fine_tuned_model : ft:davinci-002:personal:drug-malady-

### 미세 튜닝된 모델 테스트 하기

In [15]:
# 모델 ID 설정. 여기서는 사용자의 모델 ID로 변경해야 합니다.
fine_tune_model = "ft:gpt-3.5-turbo-0613:personal:drug-malady-data:8Ttey3h4"

# 각 클래스에서 하나의 약물을 선택합니다.
drugs = [
    "A CN Gel(Topical) 20gmA CN Soap 75gm" , # Class 0
    "Tomoxetin 18mg Capsule 10'S" , # Class 1
    "ABICET M Tablet 10's" , # Class 2
]

# 각 약물에 대한 약물 클래스를 반환합니다.
for drug_name in drugs:
    prompt = "Drug: {} \n Malady:" . format (drug_name)
    response = client.chat.completions.create(
      model=fine_tune_model,
      messages=[{"role": "user", "content": prompt} ],
      temperature = 1 , max_tokens = 2
    )


    # 약 이름에 대한 대답
    drug_class = response.choices[0].message.content

    # 결과는 0, 1, 2 중 하나
    print(drug_class)


 0
 1
 2


### 병명 반환 추가

In [38]:
# 모델 ID 설정. 여기서는 사용자의 모델 ID로 변경해야 합니다.
fine_tune_model = "ft:gpt-3.5-turbo-0613:personal:drug-malady-data:8Ttey3h4"

# 각 클래스에서 하나의 약물을 선택합니다.
drugs = [
    "'A CN Gel(Topical) 20gmA CN Soap 75gm'은 어디에 사용되나요?" , # Class 0
    "'Addnok Tablet 20'S'은 어디에 사용되나요?" , # Class 1
    "'ABICET M Tablet 10's'은 어디에 사용되나요?" , # Class 2
]


class_map = {
    0 : "Acne" ,
    1 : "Adhd" ,
    2 : "Allergies" ,
    # ...
}


# 각 약물에 대한 약물 클래스를 반환합니다.
for drug_name in drugs:
    prompt = "Drug: {} \n Malady:" . format (drug_name)
    response = client.chat.completions.create(
      model=fine_tune_model,
      messages=[{"role": "user", "content": prompt} ],
      temperature = 1 ,
      max_tokens = 2
    )

    # 생성된 텍스트를 출력합니다.
    response = response.choices[0].message.content

    try :
        print (drug_name + class_map[ int (response)] + "에 사용합니다")
    except :
        print (drug_name + " 어떤 경우에 사용하는지 잘 모르겠습니다." )


'A CN Gel(Topical) 20gmA CN Soap 75gm'은 어디에 사용되나요?Acne에 사용합니다
'Addnok Tablet 20'S'은 어디에 사용되나요?Allergies에 사용합니다
'ABICET M Tablet 10's'은 어디에 사용되나요?Allergies에 사용합니다
