In [None]:
# Fine-tune Llama3.1 or OpenAI on the following customer support conversation data:
# https://huggingface.co/datasets/aibabyshark/insurance_customer_support_conversation

In [1]:
from datasets import load_dataset

data = load_dataset("aibabyshark/insurance_customer_support_conversation")

Downloading readme: 100%|██████████| 2.94k/2.94k [00:00<00:00, 38.7kB/s]
Downloading data: 100%|██████████| 177k/177k [00:00<00:00, 829kB/s] 
Generating train split: 100%|██████████| 100/100 [00:00<00:00, 805.86 examples/s]


In [None]:
data = data['train']
data 

Dataset({
    features: ['conversation', 'topic', 'idx', 'customer_tone', 'agent_tone', 'number_of_exchange', 'agent_name', 'month'],
    num_rows: 100
})

In [36]:
conversations = data["conversation"]
no_of_conversations = data["number_of_exchange"]

In [63]:
len(conversations)

100

In [66]:
import json

def create_finetuning_dataset(conversations, file_name):
    messages = []
    system_prompt = "You are a customer support agent who specializes in dealing with customer greivances and complaints. You always answer in a polite and professional manner. If you cannot find a solution to the customer's problem, you should escalate the issue to a higher authority."
    for conversation  in conversations:
        text = conversation
        text = text.replace("**", "")
        chunks = text.split("\n\n")
        if len (chunks) % 2 != 0:
            chunks.pop()
        for i in range(0,len(chunks),2) :
            customer_message = chunks[i].split(":")[1]
            assistant_message = chunks[i+1].split(":")[1]
            message = {"messages": [
                            {"role": "system", "content": f"{system_prompt}"},
                            {"role": "user", "content": f"{customer_message}"},
                            {"role": "assistant", "content": f"{assistant_message}"},
                        ]}
            messages.append(message)
    
    with open(file_name, 'w') as f:
        for message in messages:
            f.write(f"{json.dumps(message)}\n")
    print(f"Dataset saved to {file_name}")
    return file_name

In [67]:
training_file = create_finetuning_dataset(conversations[:75], "Train.jsonl")
testing_file = create_finetuning_dataset(conversations[75:], "Test.jsonl")

Dataset saved to Train.jsonl
Dataset saved to Test.jsonl


In [68]:
import json
from collections import defaultdict
from tiktoken import get_encoding

def validate_and_estimate_finetuning_data(file_path):
    # Setup
    format_errors = defaultdict(int)
    token_counts = []
    total_tokens = 0
    encoding = get_encoding("cl100k_base")  # For OpenAI models


    # Load the dataset
    with open(file_path, 'r', encoding='utf-8') as f:
        dataset = [json.loads(line) for line in f]

    for idx, ex in enumerate(dataset):
        if not isinstance(ex, dict):
            format_errors["data_type"] += 1
            continue

        messages = ex.get("messages", None)
        if not messages:
            format_errors["missing_messages_list"] += 1
            continue

        # Validate format
        conversation_tokens = 0
        assistant_message_found = False

        for message in messages:
            if "role" not in message or "content" not in message:
                format_errors["message_missing_key"] += 1
                continue

            if any(k not in ("role", "content", "name", "function_call", "weight") for k in message):
                format_errors["message_unrecognized_key"] += 1

            if message.get("role", None) not in ("system", "user", "assistant"):
                format_errors["unrecognized_role"] += 1

            content = message.get("content", None)
            function_call = message.get("function_call", None)

            if (not content and not function_call) or not isinstance(content, str):
                format_errors["missing_content"] += 1

            # Count tokens for each message
            try:
                message_tokens = len(encoding.encode(message.get("content", "")))
                conversation_tokens += message_tokens
            except Exception as e:
                format_errors["tokenization_error"] += 1

            if message.get("role") == "assistant":
                assistant_message_found = True

        if not assistant_message_found:
            format_errors["example_missing_assistant_message"] += 1

        token_counts.append(conversation_tokens)
        total_tokens += conversation_tokens

    # Output results
    return {
        "format_errors": dict(format_errors),
        "token_counts": token_counts,
        "total_tokens": total_tokens,
    }



In [69]:
import os
print(os.getcwd())
training_File_Path = os.path.join(os.getcwd(),"Train.jsonl")
validation_File_Path = os.path.join(os.getcwd(),"Test.jsonl")
print(training_File_Path)
print(validation_File_Path)

/home/kush_210/Vettura-genai/vettura-genai/Assignments/Capstone__Project_3
/home/kush_210/Vettura-genai/vettura-genai/Assignments/Capstone__Project_3/Train.jsonl
/home/kush_210/Vettura-genai/vettura-genai/Assignments/Capstone__Project_3/Test.jsonl


In [70]:
## Training data
result = validate_and_estimate_finetuning_data(training_File_Path)

# Print Results
print("Training Data")
print("Format Errors:", result["format_errors"])
print("Token Counts per Conversation:", result["token_counts"])
print("Total Tokens:", result["total_tokens"])

result = validate_and_estimate_finetuning_data(validation_File_Path)

## Test dataset
print("\n\nTest Data")
print("Format Errors:", result["format_errors"])
print("Token Counts per Conversation:", result["token_counts"])
print("Total Tokens:", result["total_tokens"])

Training Data
Format Errors: {}
Token Counts per Conversation: [105, 111, 106, 121, 111, 86, 134, 79, 112, 123, 127, 120, 122, 102, 105, 131, 115, 107, 126, 126, 92, 95, 106, 92, 99, 110, 106, 114, 110, 95, 111, 126, 127, 116, 138, 133, 123, 103, 121, 110, 113, 96, 112, 117, 132, 124, 118, 155, 161, 98, 117, 86, 104, 110, 97, 90, 95, 100, 100, 79, 98, 97, 92, 88, 115, 92, 105, 110, 107, 115, 93, 103, 94, 137, 179, 134, 124, 142, 125, 127, 123, 120, 96, 102, 117, 128, 122, 118, 110, 116, 133, 151, 121, 91, 112, 109, 110, 121, 116, 93, 97, 110, 110, 96, 108, 87, 121, 100, 106, 102, 112, 91, 100, 105, 114, 108, 99, 108, 91, 106, 105, 106, 99, 71, 130, 121, 144, 136, 116, 138, 91, 113, 112, 138, 131, 116, 113, 109, 131, 80, 113, 104, 107, 110, 104, 113, 108, 91, 105, 124, 109, 93, 99, 88, 111, 145, 91, 119, 111, 125, 111, 106, 117, 97, 121, 125, 101, 99, 111, 100, 96, 114, 98, 148, 182, 101, 119, 86, 111, 126, 132, 121, 136, 86, 101, 102, 94, 102, 96, 95, 97, 110, 111, 99, 103, 96, 99, 121

In [71]:

from dotenv import load_dotenv
import wandb
import os

# Load environment variables from a .env file
load_dotenv()

# Get the OpenAI API key
api_key = os.getenv("OPENAI_API_KEY")

print("OpenAI API Key loaded successfully.")

wandb.login()

OpenAI API Key loaded successfully.


[34m[1mwandb[0m: Currently logged in as: [33mkush2101999[0m ([33mdl_3[0m) to [32mhttps://api.wandb.ai[0m. Use [1m`wandb login --relogin`[0m to force relogin


True

In [72]:
from openai import OpenAI
## create a client
client = OpenAI(api_key=api_key)

# Function to check if a file already exists on OpenAI
def get_existing_file_id(filename):
    files = client.files.list()
    for file in files.data:
        if file.filename == filename:
            return file.id  # Return the existing file ID
    return None  # File does not exist

# Function to delete a file by ID
def delete_file(file_id):
    response = client.files.delete(file_id)
    return response.deleted

# Check and delete training file
file_name = os.path.basename(training_File_Path)
training_file_id = get_existing_file_id(file_name)
if training_file_id:
    print(f"Deleting existing training file: {training_File_Path}")
    delete_file(training_file_id)

# Check and delete validation file
file_name = os.path.basename(validation_File_Path)
validation_file_id = get_existing_file_id(file_name)
if validation_file_id:
    print(f"Deleting existing validation file: {validation_File_Path}")
    delete_file(validation_file_id)

# Upload the training file
training = client.files.create(
    file=open(training_File_Path, "rb"),
    purpose="fine-tune"
)
print(f"Training file uploaded: {training.id}")

# Upload the validation file
validation = client.files.create(
    file=open(validation_File_Path, "rb"),
    purpose="fine-tune"
)
print(f"Validation file uploaded: {validation.id}")

Deleting existing training file: /home/kush_210/Vettura-genai/vettura-genai/Assignments/Capstone__Project_3/Train.jsonl
Deleting existing validation file: /home/kush_210/Vettura-genai/vettura-genai/Assignments/Capstone__Project_3/Test.jsonl
Training file uploaded: file-G2tHjt5jCfoYCV1QCMEoMm
Validation file uploaded: file-Fh16mYLgHKHk85GhasKFbf


In [73]:
## List all the files to choose its id for fine tuning with it's data
files = client.files.list()
print(files.data)

[FileObject(id='file-Fh16mYLgHKHk85GhasKFbf', bytes=84957, created_at=1740159246, filename='Test.jsonl', object='file', purpose='fine-tune', status='processed', status_details=None, expires_at=None), FileObject(id='file-G2tHjt5jCfoYCV1QCMEoMm', bytes=272894, created_at=1740159246, filename='Train.jsonl', object='file', purpose='fine-tune', status='processed', status_details=None, expires_at=None), FileObject(id='file-Jfhb1UCJwAu4q8xdJ9uoQT', bytes=1608, created_at=1739733718, filename='step_metrics.csv', object='file', purpose='fine-tune-results', status='processed', status_details=None, expires_at=None), FileObject(id='file-CShKiZYdfURf8GUpAiQinw', bytes=2196, created_at=1739671084, filename='step_metrics.csv', object='file', purpose='fine-tune-results', status='processed', status_details=None, expires_at=None), FileObject(id='file-KHYEvwF6hcaACbiFmShezV', bytes=4503614, created_at=1739669937, filename='cat_dog_test.jsonl', object='file', purpose='fine-tune', status='processed', statu

In [74]:
## Paste the file id into the training_file parameter and choose the model and adjust the hyperparameters if you want to tune it
job = client.fine_tuning.jobs.create(
    training_file= training.id,
    validation_file=validation.id,
    model = "gpt-4o-mini-2024-07-18",
    method={
        "type": "supervised",
        "supervised": {
            "hyperparameters": {
                "n_epochs": 4,  # Number of epochs
                "batch_size": 20,  # Batch size
                "learning_rate_multiplier": 0.8,  # Learning rate scaling factor
            }
        }
    },
    integrations= [
        {
            "type": "wandb",
            "wandb": {
                "project": "insuarance customer support agent",
                "tags": ["bot", "customer support", "finetuning"]
            }
        }
    ]
)
print(job)

FineTuningJob(id='ftjob-7NhS02wQSlTX4vZPNsisFmSH', created_at=1740159428, error=Error(code=None, message=None, param=None), fine_tuned_model=None, finished_at=None, hyperparameters=Hyperparameters(batch_size=20, learning_rate_multiplier=0.8, n_epochs=4), model='gpt-4o-mini-2024-07-18', object='fine_tuning.job', organization_id='org-zWe7tdzyNIozBnwMRHtVvoQr', result_files=[], seed=940178511, status='validating_files', trained_tokens=None, training_file='file-G2tHjt5jCfoYCV1QCMEoMm', validation_file='file-Fh16mYLgHKHk85GhasKFbf', estimated_finish=None, integrations=[FineTuningJobWandbIntegrationObject(type='wandb', wandb=FineTuningJobWandbIntegration(project='insuarance customer support agent', entity=None, name=None, tags=None, run_id='ftjob-7NhS02wQSlTX4vZPNsisFmSH'))], method=Method(dpo=None, supervised=MethodSupervised(hyperparameters=MethodSupervisedHyperparameters(batch_size=20, learning_rate_multiplier=0.8, n_epochs=4)), type='supervised'), user_provided_suffix=None)


In [75]:
## Listing all the recent jobs
all_jobs = client.fine_tuning.jobs.list(limit=10).data
print(all_jobs)

[FineTuningJob(id='ftjob-7NhS02wQSlTX4vZPNsisFmSH', created_at=1740159428, error=Error(code=None, message=None, param=None), fine_tuned_model=None, finished_at=None, hyperparameters=Hyperparameters(batch_size=20, learning_rate_multiplier=0.8, n_epochs=4), model='gpt-4o-mini-2024-07-18', object='fine_tuning.job', organization_id='org-zWe7tdzyNIozBnwMRHtVvoQr', result_files=[], seed=940178511, status='validating_files', trained_tokens=None, training_file='file-G2tHjt5jCfoYCV1QCMEoMm', validation_file='file-Fh16mYLgHKHk85GhasKFbf', estimated_finish=None, integrations=[FineTuningJobWandbIntegrationObject(type='wandb', wandb=FineTuningJobWandbIntegration(project='insuarance customer support agent', entity=None, name=None, tags=None, run_id='ftjob-7NhS02wQSlTX4vZPNsisFmSH'))], method=Method(dpo=None, supervised=MethodSupervised(hyperparameters=MethodSupervisedHyperparameters(batch_size=20, learning_rate_multiplier=0.8, n_epochs=4)), type='supervised'), user_provided_suffix=None), FineTuningJ

In [76]:
## Prinint the recent job to get the fine-tuned model name
print(all_jobs[0])
print(client.fine_tuning.jobs.retrieve(all_jobs[0].id))

FineTuningJob(id='ftjob-7NhS02wQSlTX4vZPNsisFmSH', created_at=1740159428, error=Error(code=None, message=None, param=None), fine_tuned_model=None, finished_at=None, hyperparameters=Hyperparameters(batch_size=20, learning_rate_multiplier=0.8, n_epochs=4), model='gpt-4o-mini-2024-07-18', object='fine_tuning.job', organization_id='org-zWe7tdzyNIozBnwMRHtVvoQr', result_files=[], seed=940178511, status='validating_files', trained_tokens=None, training_file='file-G2tHjt5jCfoYCV1QCMEoMm', validation_file='file-Fh16mYLgHKHk85GhasKFbf', estimated_finish=None, integrations=[FineTuningJobWandbIntegrationObject(type='wandb', wandb=FineTuningJobWandbIntegration(project='insuarance customer support agent', entity=None, name=None, tags=None, run_id='ftjob-7NhS02wQSlTX4vZPNsisFmSH'))], method=Method(dpo=None, supervised=MethodSupervised(hyperparameters=MethodSupervisedHyperparameters(batch_size=20, learning_rate_multiplier=0.8, n_epochs=4)), type='supervised'), user_provided_suffix=None)
FineTuningJob

In [77]:
import time
import requests
checkpoints = None

# Function to get the latest accuracy and loss from checkpoints
def get_latest_accuracy(job_id, api_key):
    url = f"https://api.openai.com/v1/fine_tuning/jobs/{job_id}/checkpoints"
    headers = {"Authorization": f"Bearer {api_key}"}

    response = requests.get(url, headers=headers)
    checkpoints = response.json().get("data", [])

    if not checkpoints:
        return None, None  # Return None if no checkpoints are available

    # Find the latest checkpoint based on step_number
    latest_checkpoint = max(checkpoints, key=lambda c: c["step_number"])
    latest_accuracy = latest_checkpoint["metrics"]["full_valid_mean_token_accuracy"]
    latest_loss = latest_checkpoint["metrics"]["full_valid_loss"]
    return latest_accuracy, latest_loss

# Function to monitor fine-tuning job and print training/validation metrics
def monitor_finetuning_progress(job_id, api_key, check_interval=10):
    while True:
        try:
            # Retrieve the fine-tuning job status
            job_status = client.fine_tuning.jobs.retrieve(job_id)

            # Print basic job details
            print(f"Job ID: {job_status.id}")
            print(f"Status: {job_status.status}")

            # Check if the job has completed
            if job_status.status in ["succeeded", "failed"]:
                print(f"Fine-tuning job {job_status.status}.")
                model_id = job_status.fine_tuned_model
                result_file_id = job_status.result_files[0]
                return job_status, model_id, result_file_id
            
            # Retrieve and print the latest accuracy and loss
            latest_accuracy, latest_loss = get_latest_accuracy(job_id, api_key)
            if latest_accuracy is not None and latest_loss is not None:
                print(f"Latest Accuracy: {latest_accuracy:.3f}")
                print(f"Latest Loss: {latest_loss:.3f}")
            else:
                print("No checkpoints available yet.")
                
            # Wait before the next check
            print(f"Checking again in {check_interval} seconds...\n")
            time.sleep(check_interval)

        except Exception as e:
            print(f"An error occurred: {e}. Retrying in {check_interval} seconds...\n")
            time.sleep(check_interval)


# Replace `fine_tuning_job_id` with your actual job ID
fine_tuning_job_id = all_jobs[0].id
status, model_name, result_file_id = monitor_finetuning_progress(fine_tuning_job_id, api_key, 10)
print(f"Status: {status}")
print(f"Model Name: {model_name}")
print(f"Result file id: {result_file_id}")

Job ID: ftjob-7NhS02wQSlTX4vZPNsisFmSH
Status: validating_files
No checkpoints available yet.
Checking again in 10 seconds...

Job ID: ftjob-7NhS02wQSlTX4vZPNsisFmSH
Status: validating_files
No checkpoints available yet.
Checking again in 10 seconds...

Job ID: ftjob-7NhS02wQSlTX4vZPNsisFmSH
Status: validating_files
No checkpoints available yet.
Checking again in 10 seconds...

Job ID: ftjob-7NhS02wQSlTX4vZPNsisFmSH
Status: validating_files
No checkpoints available yet.
Checking again in 10 seconds...

Job ID: ftjob-7NhS02wQSlTX4vZPNsisFmSH
Status: validating_files
No checkpoints available yet.
Checking again in 10 seconds...

Job ID: ftjob-7NhS02wQSlTX4vZPNsisFmSH
Status: validating_files
No checkpoints available yet.
Checking again in 10 seconds...

Job ID: ftjob-7NhS02wQSlTX4vZPNsisFmSH
Status: validating_files
No checkpoints available yet.
Checking again in 10 seconds...

Job ID: ftjob-7NhS02wQSlTX4vZPNsisFmSH
Status: validating_files
No checkpoints available yet.
Checking again in

In [78]:
response = requests.get(
    f"https://api.openai.com/v1/fine_tuning/jobs/{all_jobs[0].id}/checkpoints",
    headers={"Authorization": f"Bearer {api_key}"}
)
checkpoints = response.json().get("data", [])
for checkpoint in checkpoints:
    print(checkpoint)

{'object': 'fine_tuning.job.checkpoint', 'id': 'ftckpt_gXpuojhm2fm1SlFh0MBzpWHs', 'created_at': 1740160183, 'fine_tuned_model_checkpoint': 'ft:gpt-4o-mini-2024-07-18:personal::B3RSLUDL', 'fine_tuning_job_id': 'ftjob-7NhS02wQSlTX4vZPNsisFmSH', 'metrics': {'step': 81}, 'step_number': 81}
{'object': 'fine_tuning.job.checkpoint', 'id': 'ftckpt_RpupL3VQ3LLUfucdtMB58teX', 'created_at': 1740160077, 'fine_tuned_model_checkpoint': 'ft:gpt-4o-mini-2024-07-18:personal::B3RSKWIa:ckpt-step-63', 'fine_tuning_job_id': 'ftjob-7NhS02wQSlTX4vZPNsisFmSH', 'metrics': {'step': 63}, 'step_number': 63}
{'object': 'fine_tuning.job.checkpoint', 'id': 'ftckpt_uPrmzWq0wjsmjvr7BqSSfYSN', 'created_at': 1740159961, 'fine_tuned_model_checkpoint': 'ft:gpt-4o-mini-2024-07-18:personal::B3RSK9Ya:ckpt-step-42', 'fine_tuning_job_id': 'ftjob-7NhS02wQSlTX4vZPNsisFmSH', 'metrics': {'step': 42}, 'step_number': 42}


In [79]:
import requests

def print_result_file_content(file_id, api_key):
    # API endpoint to retrieve file content
    url = f"https://api.openai.com/v1/files/{file_id}/content"
    headers = {"Authorization": f"Bearer {api_key}"}

    # Request the file content
    response = requests.get(url, headers=headers)
    
    if response.status_code == 200:
        # Print the contents of the file
        print("Result File Contents:")
        print(response.text)
    else:
        print(f"Failed to retrieve file content. Status Code: {response.status_code}")
        print(f"Error: {response.json()}")

# Print the result file content
print_result_file_content(result_file_id, api_key)


Result File Contents:
c3RlcCx0cmFpbl9sb3NzLHRyYWluX2FjY3VyYWN5LHZhbGlkX2xvc3MsdmFsaWRfbWVhbl90b2tlbl9hY2N1cmFjeSx0cmFpbl9tZWFuX3Jld2FyZCxmdWxsX3ZhbGlkYXRpb25fbWVhbl9yZXdhcmQKMSwzLjAzMDA0LDAuNTg5ODEsMi44NDA2MSwwLjU4ODEsLAoyLDMuMTU2MTgsMC41NjI5MSwyLjY1MzUyLDAuNTk5NzgsLAozLDMuMjU4ODUsMC41Mjc5MiwyLjc3NTg3LDAuNTk2NDEsLAo0LDIuODk5NzEsMC41NDU1NywyLjQyMzksMC42MTgxOCwsCjUsMi43Mjg1MSwwLjU3MzE5LDIuNzU5ODIsMC41NjkyMSwsCjYsMi45MzQ5NSwwLjU2ODk5LDIuNDUzNDIsMC41OTE2MiwsCjcsMi40NzA5OCwwLjU3OTMsMi4wNTE5OSwwLjYxNDgsLAo4LDIuMjI4NDIsMC42MDQ4NiwxLjc0MjE2LDAuNjMwODMsLAo5LDEuODYwMTMsMC42MTMwOSwxLjI2NTUyLDAuNzAxNzEsLAoxMCwxLjY0OTAyLDAuNjUxNDQsMS4zMjg2OCwwLjY1MTM1LCwKMTEsMS4zMDI3MiwwLjY4NTQ1LDEuMjAwMzYsMC42NDMsLAoxMiwxLjM3MTI5LDAuNjI5MSwxLjI3MTA2LDAuNjQ3NjYsLAoxMywxLjMwMjYzLDAuNjM5NzQsMS4wOTUwMSwwLjY4OTg3LCwKMTQsMS4zNzA0NywwLjYyNDE0LDEuMDUzNjIsMC42NzQ3OCwsCjE1LDEuMjUzMTMsMC42NTg4MSwxLjA5OTgsMC42ODA0NSwsCjE2LDEuMTMxNiwwLjY4MTMsMC45Nzc3NSwwLjY5NzI4LCwKMTcsMS4wMjc2MiwwLjcwMDM5LDEuMDIzOTYsMC42OTUsLAoxOCwxLjEyMzU0LD

In [80]:
from openai import OpenAI
import os
from dotenv import load_dotenv
load_dotenv()

client = OpenAI(api_key=os.getenv("OPENAI_API_KEY"))
## Inferencing the fine tuned model
def query(user_input,model_name):
  completion = client.chat.completions.create(
      model= model_name,
      messages=[
          {"role": "user", "content": user_input }
      ],
      temperature=0.7,
  )

  return completion.choices[0].message.content

In [82]:
queries = [
    "Hi, I have an issue with my insurance policy.",
    "I need help with a claim I submitted last week.",
    "I want to cancel my policy and get a refund.",
    "I need to update my contact information.",
    "I'm having trouble accessing my account online.",
    "I need to speak with a customer service representative.",
    "I want to know the status of my claim.",
    "I need to file a complaint about my claim.",
    "I have a question about my coverage options.",
    "I want to add a new driver to my policy.",
]
model = "ft:gpt-4o-mini-2024-07-18:personal::B3RSLUDL"

In [83]:
for q in queries:
    print(f"Query: {q}")
    print("---"*20)
    response = query(q, model)
    print(f" Response: {response}\n")
    print("==="*20)

Query: Hi, I have an issue with my insurance policy.
------------------------------------------------------------
 Response:  I'm here to help. What seems to be the problem with your policy?

Query: I need help with a claim I submitted last week.
------------------------------------------------------------
 Response:  Sure! Could you provide me with your claim number and a brief description of the issue you're experiencing?

Query: I want to cancel my policy and get a refund.
------------------------------------------------------------
 Response:  To assist you better, could you please provide your policy number and the reason for the cancellation?

Query: I need to update my contact information.
------------------------------------------------------------
 Response:  I can't assist with personal data updates. Please follow the provided link or contact customer service directly for help.

Query: I'm having trouble accessing my account online.
-------------------------------------------