<a href="https://colab.research.google.com/github/MariaMuu/snippets/blob/main/OpenAI_fine_tuning.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

# **Preparing the data for the fine-tuning process**

In [None]:
!pip install OpenAI

**Converting CSV to JSONL Format**

In [None]:
import csv
import json

def convert_csv_to_training_format(input_csv, output_file):
    """Convert a two-column CSV file into chat fine-tuning JSONL.

    The first row of ``input_csv`` is treated as a header and skipped. For
    each remaining row, column 0 becomes the user message and column 1
    becomes the assistant message. Every example is written to
    ``output_file`` as one JSON object per line, preceded by the shared
    system message.

    Args:
        input_csv: Path of the UTF-8 encoded CSV file to read.
        output_file: Path of the JSONL file to write (overwritten if present).

    Raises:
        IndexError: If a non-empty row has fewer than two columns.
    """
    system_message = {
        "role": "system",
        "content": "" #give the basic instruction that apply to every prompt
    }

    # newline='' lets the csv module itself handle line endings, including
    # newlines embedded inside quoted fields.
    with open(input_csv, 'r', encoding='utf-8', newline='') as csvfile, \
         open(output_file, 'w', encoding='utf-8') as outfile:
        reader = csv.reader(csvfile)
        next(reader, None)  # Skip header; an empty file yields no examples

        for row in reader:
            if not row:
                # csv.reader yields [] for blank lines; nothing to convert
                continue

            user_content = row[0] # change this: column holding the model input
            extracted_json = row[1]

            training_example = {
                "messages": [
                    system_message,
                    {"role": "user", "content": user_content},
                    {"role": "assistant", "content": extracted_json}
                ]
            }
            outfile.write(json.dumps(training_example) + '\n')

**Creating Training and Validation Sets**

In [None]:
# Prepare training data: read the training CSV and write it out as JSONL
convert_csv_to_training_format("medical-records.csv", "training_data.jsonl") # change: your training CSV / output path

# Prepare validation data: same conversion for the validation CSV
convert_csv_to_training_format("validation-medical-records.csv", "validation_data.jsonl") # change: your validation CSV / output path

# Setting Up and Starting the Fine-Tuning Process

**Initial Setup with OpenAI**

In [None]:
from openai import OpenAI
from time import sleep

# Initialize OpenAI client.
# The API key is read from the OPENAI_API_KEY environment variable so the
# secret never has to be written into the notebook. Set that variable before
# running this cell; the SDK reports an error if no key is available.
import os

client = OpenAI(api_key=os.environ.get("OPENAI_API_KEY"))

**Step 1: Uploading Training Files**

In [None]:
def upload_training_file(file_path):
    """Upload the file at ``file_path`` for fine-tuning and return its file id."""
    with open(file_path, "rb") as handle:
        uploaded = client.files.create(file=handle, purpose="fine-tune")
    # The upload has completed by this point, so the id can be read after
    # the file handle is closed.
    return uploaded.id

# Upload both training and validation files; each call returns the id of the uploaded file
training_file_id = upload_training_file("training_data.jsonl") # change: path to your training JSONL
validation_file_id = upload_training_file("validation_data.jsonl") # change: path to your validation JSONL

**Step 2: Creating a Fine-Tuning Job**

In [None]:
def create_fine_tuning_job(training_file_id, validation_file_id=None, model="gpt-4o-mini-2024-07-18"): # change model
    """Start a fine-tuning job for the given uploaded files and return the job id.

    Args:
        training_file_id: Id of the uploaded training file.
        validation_file_id: Id of the uploaded validation file, or None.
        model: Name of the base model to fine-tune.
    """
    job_params = {
        "training_file": training_file_id,
        "validation_file": validation_file_id,
        "model": model,
    }
    created_job = client.fine_tuning.jobs.create(**job_params)
    return created_job.id

# Start the fine-tuning job.
# The base model comes from create_fine_tuning_job's default `model`
# argument; pass model="..." here to fine-tune a different base model.
job_id = create_fine_tuning_job(training_file_id, validation_file_id)

**Step 3: Monitoring Training Progress**

In [None]:
def monitor_job(job_id, poll_interval=30):
    """Poll a fine-tuning job until it reaches a terminal status.

    On every poll the current status is printed; while the job is still in
    progress, the messages of up to five listed events are printed as well.

    Args:
        job_id: Id of the fine-tuning job to watch.
        poll_interval: Seconds to wait between polls (default 30).

    Returns:
        The job object as retrieved on the poll where its status was
        "succeeded", "failed" or "cancelled".
    """
    # "cancelled" is terminal too; without it a cancelled job would be
    # polled forever.
    terminal_statuses = ("succeeded", "failed", "cancelled")

    while True:
        job = client.fine_tuning.jobs.retrieve(job_id)
        print(f"Status: {job.status}")

        if job.status in terminal_statuses:
            return job

        # List latest events
        events = client.fine_tuning.jobs.list_events(
            fine_tuning_job_id=job_id,
            limit=5
        )
        for event in events.data:
            print(f"Event: {event.message}")

        sleep(poll_interval)  # Wait before checking again

# Block until the job finishes, then report the outcome
job = monitor_job(job_id)
if job.status != "succeeded":
    print("Fine-tuning failed.")
else:
    # Keep the resulting model id for the testing cells
    fine_tuned_model = job.fine_tuned_model
    print(f"Fine-tuned model ID: {fine_tuned_model}")

# **Testing and Using Your Fine-Tuned Model**

**Making Predictions with Your Model**

In [None]:
def test_model(model_id, test_input):
    """Send ``test_input`` to the model ``model_id`` and return the reply message.

    The request consists of a fixed system instruction followed by
    ``test_input`` as the user message; the message of the first returned
    choice is handed back to the caller.
    """
    system_prompt = {
        "role": "system",
        "content": "Extract Details from medical report",
    }
    user_prompt = {"role": "user", "content": test_input}

    response = client.chat.completions.create(
        model=model_id,
        messages=[system_prompt, user_prompt],
    )
    first_choice = response.choices[0]
    return first_choice.message

**Let's try it with a new medical report:**

In [None]:
# Test input
test_report = """Marcus Wong, a 19-year-old male, presents with severe acne
on face and upper back present for 1 year. Multiple inflammatory papules
and nodules noted on examination. Previous trials of over-the-counter
treatments ineffective. Started on Isotretinoin 40mg daily with monthly
liver function monitoring."""

# Get prediction
result = test_model(fine_tuned_model, test_report)

# Parse the JSON response
import json

try:
    extracted_data = json.loads(result.content)
except (json.JSONDecodeError, TypeError):
    # The model is not guaranteed to answer with valid JSON (or with any
    # text at all); show the raw reply instead of only a traceback.
    extracted_data = None
    print("Model response is not valid JSON; raw output:")
    print(result.content)
else:
    print(json.dumps(extracted_data, indent=2))