## 1. csv 파일 원본에서 필요한 부분만 JSONL 파일 형식으로 저장(Write)

In [None]:
import csv
import json
import random

def read_conversation_log(file_path):
    """Load a conversation log CSV and group its turns by conversation ID.

    The first row is treated as a header and skipped. Every following
    non-blank row must have exactly three columns: conversation ID,
    speaker and message.

    Args:
        file_path: Path to the UTF-8 encoded CSV file.

    Returns:
        A dict mapping each conversation ID to a list of
        ``(speaker, message)`` tuples in file order. An empty file yields
        an empty dict.

    Raises:
        ValueError: If a non-blank row does not have exactly three columns.
    """
    conversations = {}

    # newline='' lets the csv module handle line endings itself, which keeps
    # newlines embedded in quoted messages intact.
    with open(file_path, 'r', encoding='utf-8', newline='') as file:
        reader = csv.reader(file)
        # The default stops an empty file from raising StopIteration.
        next(reader, None)  # Skip header
        for row in reader:
            # csv.reader returns [] for blank lines; they carry no turn.
            if not row:
                continue
            if len(row) != 3:
                raise ValueError(
                    f"{file_path}: line {reader.line_num}: expected 3 columns "
                    f"(conversation ID, speaker, message), got {len(row)}"
                )
            conversation_id, speaker, message = row
            # Grouping by key (instead of tracking the "current" ID) keeps all
            # turns of a conversation even when its rows are not contiguous;
            # previously a reappearing ID overwrote its earlier turns.
            conversations.setdefault(conversation_id, []).append((speaker, message))

    return conversations

def read_learning_data(file_path):
    """Load the learning-data CSV into a dict keyed by conversation ID.

    The ``'Conversation ID'`` column supplies the key. When the header has
    no such column, the first column is used instead and a warning is
    printed.

    Args:
        file_path: Path to the UTF-8 encoded CSV file with a header row.

    Returns:
        A dict mapping each conversation ID to its full row (a mapping of
        column name to cell value). If an ID occurs more than once, the last
        row wins. A file without a header row yields an empty dict.
    """
    data = {}
    with open(file_path, 'r', encoding='utf-8', newline='') as file:
        reader = csv.DictReader(file)
        headers = reader.fieldnames
        # fieldnames is None when the file is empty; nothing to read then.
        if not headers:
            return data

        conversation_id_column = 'Conversation ID'

        # Fall back to the first column when 'Conversation ID' is absent.
        if conversation_id_column not in headers:
            conversation_id_column = headers[0]
            print(f"Warning: 'Conversation ID' column not found. Using '{conversation_id_column}' instead.")

        for row in reader:
            # DictReader fills every header key (None for cells missing from
            # short rows), so this lookup cannot raise KeyError.
            conversation_id = row[conversation_id_column]
            # A row without an ID cannot be matched to any conversation.
            if not conversation_id:
                continue
            data[conversation_id] = row
    return data

def create_jsonl_entry(conversation, learning_data):
    """Build one chat-format fine-tuning example for a conversation.

    Args:
        conversation: Iterable of ``(speaker, message)`` tuples. Only the
            messages whose speaker is ``'Customer'`` are used.
        learning_data: Mapping for this conversation. The keys
            ``'Detected Lie'`` and ``'Detection Confidence'`` are read;
            missing, ``None`` or blank values count as "no lie" / ``0``.

    Returns:
        A dict with a ``"messages"`` list holding the system, user and
        assistant messages, in that order.
    """
    customer_messages = [msg for speaker, msg in conversation if speaker == 'Customer']
    customer_dialogue = ' '.join(customer_messages)

    # csv.DictReader yields None for cells missing from short rows and ''
    # for blank cells, so normalise both to the defaults used for absent keys.
    detected_lie = str(learning_data.get('Detected Lie') or 'None').strip() or 'None'
    confidence = str(learning_data.get('Detection Confidence') or '0').strip()
    # Accept values written with a percent sign ("85%"); the sign is added
    # back when the report text is formatted below.
    confidence = confidence.rstrip('%').strip() or '0'

    # int() rejected fractional values such as "85.5"; float() accepts them,
    # and anything unparseable is treated as zero confidence instead of
    # aborting the whole export.
    try:
        confidence_value = float(confidence)
    except ValueError:
        confidence_value = 0.0

    if detected_lie != 'None' and confidence_value > 0:
        is_lie = True
        analysis = f"The customer made a false statement: {detected_lie}. Confidence: {confidence}%"
    else:
        is_lie = False
        analysis = "No false statements detected in the customer's dialogue."

    user_content = f"Customer's statements: {customer_dialogue}"

    if is_lie:
        assistant_content = f"""Based on the analysis of the customer's statements, a potential false statement has been detected:

Detected False Statement: {detected_lie}
Confidence: {confidence}%

Analysis:
{analysis}

This discrepancy may indicate an attempt by the customer to misrepresent their situation. It is recommended that the underwriter investigate this matter further to ensure accurate risk assessment."""
    else:
        assistant_content = f"""Based on the analysis of the customer's statements, no false statements were detected.

Analysis:
{analysis}

The customer's statements appear to be consistent and truthful. However, it is always recommended to verify key information through standard underwriting procedures."""

    return {
        "messages": [
            {"role": "system", "content": "Your goal is to detect any false statements made by the insurance applicant during the underwriting conversation. After detecting these inconsistencies, you will draft an analytical report outlining the detected discrepancies and provide this report to the underwriter for further review. The report should be based on the dialogue input provided by the user."},
            {"role": "user", "content": user_content},
            {"role": "assistant", "content": assistant_content}
        ]
    }

def main():
    """Build train.jsonl from the conversation log and the learning data.

    Reads 'conversation_log.csv' and 'learning_data.csv', then writes one
    JSON object per line for every conversation whose ID also appears in the
    learning data. Conversations without learning data are left out.
    """
    conversation_map = read_conversation_log('conversation_log.csv')
    labels_by_id = read_learning_data('learning_data.csv')

    with open('train.jsonl', 'w', encoding='utf-8') as outfile:
        for conv_id, turns in conversation_map.items():
            if conv_id not in labels_by_id:
                continue
            record = create_jsonl_entry(turns, labels_by_id[conv_id])
            # ensure_ascii=False keeps non-ASCII text readable in the file.
            outfile.write(json.dumps(record, ensure_ascii=False) + '\n')


if __name__ == "__main__":
    main()

print("train.jsonl 생성 완료")

## 2. Fine-Tuning 수행

In [None]:
# Install the openai library
!pip install openai

In [None]:
# (1) Upload the locally stored training data, train.jsonl

import os

from openai import OpenAI

# Prefer the OPENAI_API_KEY environment variable so the secret does not have
# to be stored in the notebook; the placeholder below is only the fallback.
client = OpenAI(  # create the client instance
    api_key=os.environ.get("OPENAI_API_KEY", "API KEY REQUIRED"),
)

# Open in binary mode inside a context manager so the file handle is closed
# once the upload call returns (it was previously left open).
with open('train.jsonl', 'rb') as training_fp:
    file = client.files.create(
        file=training_fp,
        purpose='fine-tune',  # mark the upload as fine-tuning data
    )

print(file)

### 위 소스코드 실행 결과의 id는 `file.id`로 아래 소스코드의 training_file에 자동으로 전달된다. (세션이 끊겨 `file` 변수가 없는 경우에는 출력된 id를 training_file에 직접 작성해줘야 한다.)

In [None]:
# (2) Create the fine-tuning job

# Keep the returned job object so that its id can be passed to
# client.fine_tuning.jobs.retrieve() when checking the status.
job = client.fine_tuning.jobs.create(
    training_file=file.id,
    model='gpt-3.5-turbo-1106',  # base model to fine-tune
    suffix="underwriter-support-model",  # suffix added to the fine-tuned model name
    # Optional tuning values are passed inside `hyperparameters`, not as
    # top-level keyword arguments:
    # hyperparameters={"n_epochs": 3, "learning_rate_multiplier": 0.1},
)

job  # last expression of the cell, so the notebook still displays the job

### 위 소스코드 실행 결과의 id를 아래 소스코드의 retrieve 함수로 넘겨줘야 한다.
- 실행 결과에서, status = 'succeeded' 여부를 확인한다.
- OpenAI 콘솔의 Fine-Tuning UI를 통해, 모델 학습 현황을 확인하는 것도 가능하다.

In [None]:
# (3) Check whether training has finished (status)

client.fine_tuning.jobs.list()
# client.fine_tuning.jobs.list(limit = 10) # caps how many fine-tuning jobs a single API request returns

# The method is `retrieve`; the previous spelling `retrive` raised
# AttributeError. Replace the placeholder with the real "ftjob-..." id.
client.fine_tuning.jobs.retrieve("ftjob-파인튜닝 완료된 모델의 코드")

## 3. Fine-Tuned Model 직접 사용하기
- messages의 system 역할(role)에는 학습 시 지정했던 instruction(시스템 프롬프트)을 그대로 삽입한다.
- messages의 user 역할(role)에는 상담내용 예시를 입력한다. 파인튜닝된 ChatGPT(Underwriter)가 이를 입력받아 적절한 언더라이팅 솔루션을 제시한다.

<br><br>

### ⚠️주의사항 : 함수를 한 번 사용할 때마다 일정 토큰이 소비되므로 주의하여 실행

In [None]:
# System prompt for inference. It is the same text that create_jsonl_entry
# writes into the "system" message of every training example; it was
# previously referenced here without ever being defined (NameError).
instruction = "Your goal is to detect any false statements made by the insurance applicant during the underwriting conversation. After detecting these inconsistencies, you will draft an analytical report outlining the detected discrepancies and provide this report to the underwriter for further review. The report should be based on the dialogue input provided by the user."

question = input("고객(피보험자)의 담화내용(영어) 입력 : ")

response = client.chat.completions.create(
    # Name of the successfully fine-tuned model (replace the placeholder).
    model="ft:gpt-3.5-turbo-1106:personal::뒤에 뜨는 코드",
    messages=[
        {"role": "system", "content": instruction},
        {"role": "user", "content": question},
    ],
)

# message.content is optional in the API response; fall back to an empty
# string so .strip() cannot fail on None.
print((response.choices[0].message.content or "").strip())

---
---
---

# 일괄실행

In [None]:
# Install the openai library
!pip install openai

# 라이브러리 import
import csv
import json
import random
from openai import OpenAI

def read_conversation_log(file_path):
    """Load a conversation log CSV and group its turns by conversation ID.

    The first row is treated as a header and skipped. Every following
    non-blank row must have exactly three columns: conversation ID,
    speaker and message.

    Args:
        file_path: Path to the UTF-8 encoded CSV file.

    Returns:
        A dict mapping each conversation ID to a list of
        ``(speaker, message)`` tuples in file order. An empty file yields
        an empty dict.

    Raises:
        ValueError: If a non-blank row does not have exactly three columns.
    """
    conversations = {}

    # newline='' lets the csv module handle line endings itself, which keeps
    # newlines embedded in quoted messages intact.
    with open(file_path, 'r', encoding='utf-8', newline='') as file:
        reader = csv.reader(file)
        # The default stops an empty file from raising StopIteration.
        next(reader, None)  # Skip header
        for row in reader:
            # csv.reader returns [] for blank lines; they carry no turn.
            if not row:
                continue
            if len(row) != 3:
                raise ValueError(
                    f"{file_path}: line {reader.line_num}: expected 3 columns "
                    f"(conversation ID, speaker, message), got {len(row)}"
                )
            conversation_id, speaker, message = row
            # Grouping by key (instead of tracking the "current" ID) keeps all
            # turns of a conversation even when its rows are not contiguous;
            # previously a reappearing ID overwrote its earlier turns.
            conversations.setdefault(conversation_id, []).append((speaker, message))

    return conversations

def read_learning_data(file_path):
    """Load the learning-data CSV into a dict keyed by conversation ID.

    The ``'Conversation ID'`` column supplies the key. When the header has
    no such column, the first column is used instead and a warning is
    printed.

    Args:
        file_path: Path to the UTF-8 encoded CSV file with a header row.

    Returns:
        A dict mapping each conversation ID to its full row (a mapping of
        column name to cell value). If an ID occurs more than once, the last
        row wins. A file without a header row yields an empty dict.
    """
    data = {}
    with open(file_path, 'r', encoding='utf-8', newline='') as file:
        reader = csv.DictReader(file)
        headers = reader.fieldnames
        # fieldnames is None when the file is empty; nothing to read then.
        if not headers:
            return data

        conversation_id_column = 'Conversation ID'

        # Fall back to the first column when 'Conversation ID' is absent.
        if conversation_id_column not in headers:
            conversation_id_column = headers[0]
            print(f"Warning: 'Conversation ID' column not found. Using '{conversation_id_column}' instead.")

        for row in reader:
            # DictReader fills every header key (None for cells missing from
            # short rows), so this lookup cannot raise KeyError.
            conversation_id = row[conversation_id_column]
            # A row without an ID cannot be matched to any conversation.
            if not conversation_id:
                continue
            data[conversation_id] = row
    return data

def create_jsonl_entry(conversation, learning_data):
    """Build one chat-format fine-tuning example for a conversation.

    Args:
        conversation: Iterable of ``(speaker, message)`` tuples. Only the
            messages whose speaker is ``'Customer'`` are used.
        learning_data: Mapping for this conversation. The keys
            ``'Detected Lie'`` and ``'Detection Confidence'`` are read;
            missing, ``None`` or blank values count as "no lie" / ``0``.

    Returns:
        A dict with a ``"messages"`` list holding the system, user and
        assistant messages, in that order.
    """
    customer_messages = [msg for speaker, msg in conversation if speaker == 'Customer']
    customer_dialogue = ' '.join(customer_messages)

    # csv.DictReader yields None for cells missing from short rows and ''
    # for blank cells, so normalise both to the defaults used for absent keys.
    detected_lie = str(learning_data.get('Detected Lie') or 'None').strip() or 'None'
    confidence = str(learning_data.get('Detection Confidence') or '0').strip()
    # Accept values written with a percent sign ("85%"); the sign is added
    # back when the report text is formatted below.
    confidence = confidence.rstrip('%').strip() or '0'

    # int() rejected fractional values such as "85.5"; float() accepts them,
    # and anything unparseable is treated as zero confidence instead of
    # aborting the whole export.
    try:
        confidence_value = float(confidence)
    except ValueError:
        confidence_value = 0.0

    if detected_lie != 'None' and confidence_value > 0:
        is_lie = True
        analysis = f"The customer made a false statement: {detected_lie}. Confidence: {confidence}%"
    else:
        is_lie = False
        analysis = "No false statements detected in the customer's dialogue."

    user_content = f"Customer's statements: {customer_dialogue}"

    if is_lie:
        assistant_content = f"""Based on the analysis of the customer's statements, a potential false statement has been detected:

Detected False Statement: {detected_lie}
Confidence: {confidence}%

Analysis:
{analysis}

This discrepancy may indicate an attempt by the customer to misrepresent their situation. It is recommended that the underwriter investigate this matter further to ensure accurate risk assessment."""
    else:
        assistant_content = f"""Based on the analysis of the customer's statements, no false statements were detected.

Analysis:
{analysis}

The customer's statements appear to be consistent and truthful. However, it is always recommended to verify key information through standard underwriting procedures."""

    return {
        "messages": [
            {"role": "system", "content": "Your goal is to detect any false statements made by the insurance applicant during the underwriting conversation. After detecting these inconsistencies, you will draft an analytical report outlining the detected discrepancies and provide this report to the underwriter for further review. The report should be based on the dialogue input provided by the user."},
            {"role": "user", "content": user_content},
            {"role": "assistant", "content": assistant_content}
        ]
    }


# Build train.jsonl: one JSON object per line for every conversation whose ID
# also appears in the learning data.
conversations = read_conversation_log('conversation_log.csv')
learning_data = read_learning_data('learning_data.csv')

with open('train.jsonl', 'w', encoding='utf-8') as outfile:
    for conv_id, conversation in conversations.items():
        # Conversations without learning data are left out of the output.
        if conv_id not in learning_data:
            continue
        entry = create_jsonl_entry(conversation, learning_data[conv_id])
        # ensure_ascii=False keeps non-ASCII text readable in the file.
        outfile.write(json.dumps(entry, ensure_ascii=False) + '\n')
print("train.jsonl 생성 완료")

# (1) Upload the locally stored training data, train.jsonl
import os

# Prefer the OPENAI_API_KEY environment variable so the secret does not have
# to be stored in the notebook; the placeholder below is only the fallback.
client = OpenAI(  # create the client instance
    api_key=os.environ.get("OPENAI_API_KEY", "API KEY REQUIRED"),
)

# Open in binary mode inside a context manager so the file handle is closed
# once the upload call returns (it was previously left open).
with open('train.jsonl', 'rb') as training_fp:
    file = client.files.create(
        file=training_fp,
        purpose='fine-tune',  # mark the upload as fine-tuning data
    )
print(file)

# (2) Create the fine-tuning job

# Keep the returned job object so that its id can be passed to
# client.fine_tuning.jobs.retrieve() when checking the status.
job = client.fine_tuning.jobs.create(
    training_file=file.id,
    model='gpt-3.5-turbo-1106',  # base model to fine-tune
    suffix="underwriter-support-model",  # suffix added to the fine-tuned model name
    # Optional tuning values are passed inside `hyperparameters`, not as
    # top-level keyword arguments:
    # hyperparameters={"n_epochs": 3, "learning_rate_multiplier": 0.1},
)
# This call sits in the middle of the cell, so print the job explicitly to
# make its "ftjob-..." id visible.
print(job)

# (3) Check whether training has finished (status)

client.fine_tuning.jobs.list()
# client.fine_tuning.jobs.list(limit = 10) # caps how many fine-tuning jobs a single API request returns

# The method is `retrieve`; the previous spelling `retrive` raised
# AttributeError. Replace the placeholder with the real "ftjob-..." id.
client.fine_tuning.jobs.retrieve("ftjob-파인튜닝 완료된 모델의 코드")

In [None]:
# System prompt for inference. It is the same text that create_jsonl_entry
# writes into the "system" message of every training example; it was
# previously referenced here without ever being defined (NameError).
instruction = "Your goal is to detect any false statements made by the insurance applicant during the underwriting conversation. After detecting these inconsistencies, you will draft an analytical report outlining the detected discrepancies and provide this report to the underwriter for further review. The report should be based on the dialogue input provided by the user."

question = input("고객(피보험자)의 담화내용(영어) 입력 : ")

response = client.chat.completions.create(
    # Name of the successfully fine-tuned model (replace the placeholder).
    model="ft:gpt-3.5-turbo-1106:personal::뒤에 뜨는 코드",
    messages=[
        {"role": "system", "content": instruction},
        {"role": "user", "content": question},
    ],
)

# message.content is optional in the API response; fall back to an empty
# string so .strip() cannot fail on None.
print((response.choices[0].message.content or "").strip())