In [None]:
## Install the necessary libraries
%%capture
!pip install -q tiktoken openai

# Preparing a Dataset for Fine-Tuning

This guide outlines the steps to prepare datasets for **Supervised Fine-Tuning (SFT)** and **Direct Preference Optimization (DPO)**. Both methods require well-structured datasets in JSONL format.

---

## Supervised Fine-Tuning (SFT)

Supervised Fine-Tuning requires a dataset containing demonstration examples of the desired behavior. Each example should consist of a **conversation** formatted like the Chat Completions API.

### Dataset Format
Each line in the dataset should be a JSON object with a `messages` list representing one conversation. Each message in that list contains the following keys:
- `role`: The role of the speaker (`system`, `user`, or `assistant`).
- `content`: The text of the message.

### Example
```json
{"messages": [{"role": "system", "content": "You are a helpful assistant."}, {"role": "user", "content": "What's the weather like today?"}, {"role": "assistant", "content": "Today is sunny with a high of 75°F."}]}
{"messages": [{"role": "system", "content": "You are a helpful assistant."}, {"role": "user", "content": "Can you tell me a joke?"}, {"role": "assistant", "content": "Why don't scientists trust atoms? Because they make up everything!"}]}
{"messages": [{"role": "system", "content": "You are a helpful assistant."}, {"role": "user", "content": "How do I bake a cake?"}, {"role": "assistant", "content": "To bake a cake, you'll need flour, sugar, eggs, butter, and baking powder. Mix them, pour the batter into a pan, and bake at 350°F for 30 minutes."}]}
```

# Preparing a Dataset for Direct Preference Optimization (DPO)

Direct Preference Optimization (DPO) fine-tuning requires a dataset containing examples of **prompts** paired with a **preferred output** (ideal response) and a **non-preferred output** (suboptimal response). The model learns to prioritize the preferred output during training.

---

## Dataset Format

Each line in the dataset should be a single JSON object (the file as a whole is JSONL) with the following structure:

### Keys:
1. **`input`**:
   - Contains the context or conversation leading to the model’s response.
   - Should follow the Chat Completions API format:
     - `messages`: A list of messages forming the conversation.
       - Each message includes:
         - `role`: Role of the speaker (`system`, `user`, `assistant`).
         - `content`: Text content of the message.
     - Optional fields:
       - `tools`: Tools available for the model to use.
       - `parallel_tool_calls`: Boolean to indicate if tools can be used concurrently.

2. **`preferred_output`**:
   - The ideal assistant response for the given input.
   - Follows the same message format as the Chat Completions API.

3. **`non_preferred_output`**:
   - A suboptimal response that demonstrates behavior you want the model to avoid.
   - Follows the same message format as the Chat Completions API.

---

## Example Dataset

```jsonl
{
  "input": {
    "messages": [
      {
        "role": "user",
        "content": "Hello, can you tell me how cold San Francisco is today?"
      }
    ],
    "tools": [],
    "parallel_tool_calls": true
  },
  "preferred_output": [
    {
      "role": "assistant",
      "content": "Today in San Francisco, it is not quite cold as expected. Morning clouds will give way to sunshine, with a high near 68°F (20°C) and a low around 57°F (14°C)."
    }
  ],
  "non_preferred_output": [
    {
      "role": "assistant",
      "content": "It is not particularly cold in San Francisco today."
    }
  ]
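}
```

*Note: the example above is pretty-printed for readability. In the actual `.jsonl` file, each record must be written on a single line.*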


In [None]:
from openai import OpenAI
from google.colab import userdata


In [None]:
import json
import tiktoken # for token counting
import numpy as np
from collections import defaultdict

In [None]:
import json
from collections import defaultdict
from tiktoken import get_encoding

def validate_and_estimate_finetuning_data(file_path, tokenizer=None):
    """Validate a chat-format JSONL fine-tuning file and count its content tokens.

    The file is read line by line; every non-blank line is expected to be a JSON
    object with a non-empty ``messages`` list of ``{"role", "content", ...}`` dicts.
    Problems are tallied per category instead of aborting the run.

    Args:
        file_path: Path to the UTF-8 encoded JSONL file.
        tokenizer: Optional object exposing ``encode(text)`` that returns a
            sequence of tokens. Defaults to tiktoken's ``cl100k_base`` encoding.
            NOTE(review): gpt-4o-family models are reported to use
            ``o200k_base``; confirm with ``tiktoken.encoding_for_model`` and pass
            that encoding here if the estimate has to match the target model.

    Returns:
        dict with three keys:
            ``format_errors``: mapping of error category to number of occurrences.
            ``token_counts``: tokens counted per conversation (only the string
                ``content`` of well-formed messages is counted).
            ``total_tokens``: sum of ``token_counts``.

    Raises:
        OSError: If the file cannot be opened.
    """
    if tokenizer is None:
        # Resolved lazily so callers can supply their own tokenizer instead.
        tokenizer = get_encoding("cl100k_base")

    allowed_keys = ("role", "content", "name", "function_call", "weight")
    allowed_roles = ("system", "user", "assistant")

    format_errors = defaultdict(int)
    token_counts = []

    # Stream the file instead of materialising every record up front.
    with open(file_path, "r", encoding="utf-8") as f:
        for line in f:
            # Blank lines (e.g. a trailing empty line) are not records.
            if not line.strip():
                continue

            try:
                ex = json.loads(line)
            except json.JSONDecodeError:
                # Report malformed lines instead of aborting the whole validation.
                format_errors["invalid_json"] += 1
                continue

            if not isinstance(ex, dict):
                format_errors["data_type"] += 1
                continue

            messages = ex.get("messages")
            if not messages or not isinstance(messages, list):
                format_errors["missing_messages_list"] += 1
                continue

            conversation_tokens = 0
            assistant_message_found = False

            for message in messages:
                if not isinstance(message, dict):
                    # A non-dict entry has no keys to inspect; flag it and move on.
                    format_errors["message_data_type"] += 1
                    continue

                if "role" not in message or "content" not in message:
                    format_errors["message_missing_key"] += 1
                    continue

                if any(k not in allowed_keys for k in message):
                    format_errors["message_unrecognized_key"] += 1

                role = message["role"]
                if role not in allowed_roles:
                    format_errors["unrecognized_role"] += 1

                content = message["content"]
                function_call = message.get("function_call")

                if (not content and not function_call) or not isinstance(content, str):
                    format_errors["missing_content"] += 1

                # Only string content can be tokenized; non-string content has
                # already been reported as missing_content above.
                if isinstance(content, str):
                    try:
                        conversation_tokens += len(tokenizer.encode(content))
                    except Exception:
                        # The tokenizer may reject some inputs; count and continue.
                        format_errors["tokenization_error"] += 1

                if role == "assistant":
                    assistant_message_found = True

            if not assistant_message_found:
                format_errors["example_missing_assistant_message"] += 1

            token_counts.append(conversation_tokens)

    return {
        "format_errors": dict(format_errors),
        "token_counts": token_counts,
        "total_tokens": sum(token_counts),
    }



In [None]:
## Validate the training file first, then the validation file, printing one report each.
# NOTE: the second report is headed "Test Data" even though it reads the validation split.
for heading, file_path in (
    ("Training Data", "/content/train_sample.jsonl"),
    ("\n\nTest Data", "/content/validation_sample.jsonl"),
):
    result = validate_and_estimate_finetuning_data(file_path)

    # Print Results
    print(heading)
    for label, key in (
        ("Format Errors:", "format_errors"),
        ("Token Counts per Conversation:", "token_counts"),
        ("Total Tokens:", "total_tokens"),
    ):
        print(label, result[key])


Training Data
Format Errors: {}
Token Counts per Conversation: [69, 80, 61, 60, 46, 132, 87, 64, 73, 63, 85, 59, 57, 67, 61, 61, 62, 57, 345, 165, 101, 59, 85, 59, 72, 93, 230, 61, 152, 105, 94, 81, 60, 75, 49, 60, 54, 57, 91, 96, 105, 55, 94, 53, 47, 51, 67, 67, 50, 47, 58, 62, 60, 383, 119, 75, 97, 63, 60, 63, 62, 115, 67, 151, 72, 57, 69, 51, 171, 116, 71, 73, 71, 405, 166, 71, 87, 70, 59, 60, 320, 194, 74, 42, 68, 45, 169, 73, 78, 51, 63, 275, 75, 83, 73, 114, 65, 54, 43, 140]
Total Tokens: 9327


Test Data
Format Errors: {}
Token Counts per Conversation: [53, 120, 68, 89, 150, 50, 63, 40, 90, 370, 94, 70, 75, 60, 216, 90, 117, 66, 73, 136, 134, 62, 53, 54, 55, 51, 94, 478, 79, 72, 109, 73, 74, 73, 44, 56, 75, 221, 63, 77, 60, 57, 44, 46, 130, 80, 406, 233, 56, 50, 52, 70, 573, 47, 625, 52, 64, 38, 57, 94]
Total Tokens: 6921


In [None]:
## create a client
client = OpenAI(api_key=userdata.get("OPENAI_API_KEY"))

## upload the file objects with the purpose set to fine-tuning
# Context managers close the local file handles as soon as each upload returns.
with open("/content/train_sample.jsonl", "rb") as training_fh:
    training = client.files.create(
        file=training_fh,
        purpose="fine-tune",
    )

with open("/content/validation_sample.jsonl", "rb") as validation_fh:
    validation = client.files.create(
        file=validation_fh,
        purpose="fine-tune",
    )

In [None]:
## List the files stored in the account so the IDs of the fine-tuning data files can be looked up
files = client.files.list()

In [None]:
# Show the file objects from the listing (id, filename, purpose, status, ...)
files.data

[FileObject(id='file-V2EegwinGZqvEUjP9nN3Sr', bytes=40960, created_at=1737951653, filename='validation_sample.jsonl', object='file', purpose='fine-tune', status='processed', status_details=None),
 FileObject(id='file-QfxYpkRzz5SjrMyb6Hj5zM', bytes=58035, created_at=1737951631, filename='train_sample.jsonl', object='file', purpose='fine-tune', status='processed', status_details=None),
 FileObject(id='file-5JvUDttZaSirAxLCjZ1Mig', bytes=26955, created_at=1736498794, filename='po_dpo1.jsonl', object='file', purpose='fine-tune', status='processed', status_details=None),
 FileObject(id='file-9ULPgcXg6o5wcSimoiXjvT', bytes=2328, created_at=1736439185, filename='step_metrics.csv', object='file', purpose='fine-tune-results', status='processed', status_details=None),
 FileObject(id='file-Mr7DQ4JHjGhoU3dVDAVZgg', bytes=26955, created_at=1736437615, filename='po_dpo1.jsonl', object='file', purpose='fine-tune', status='processed', status_details=None),
 FileObject(id='file-4MhhJYDixqhMimiwWkqkTa',

In [None]:
## Start the fine-tuning job; choose the model and adjust the hyperparameters if you want to tune them.
# The file IDs come from the upload cell instead of being pasted by hand, so a fresh
# "Restart & Run All" always trains on the files it has just uploaded.
job = client.fine_tuning.jobs.create(
    training_file=training.id,
    validation_file=validation.id,
    model="gpt-4o-mini-2024-07-18",
    method={
        "type": "supervised",
        "supervised": {
            "hyperparameters": {"n_epochs": 2},
        },
    },
)

In [None]:
# Display the job object returned by the create call (status, hyperparameters, file IDs, ...)
job

FineTuningJob(id='ftjob-3EVrblg2OD7p0sN7OeMz7fBO', created_at=1737953249, error=Error(code=None, message=None, param=None), fine_tuned_model=None, finished_at=None, hyperparameters=Hyperparameters(batch_size='auto', learning_rate_multiplier='auto', n_epochs=2), model='gpt-4o-mini-2024-07-18', object='fine_tuning.job', organization_id='org-zlIVXJ18RvnGhemNfGP9NDlz', result_files=[], seed=674802057, status='validating_files', trained_tokens=None, training_file='file-QfxYpkRzz5SjrMyb6Hj5zM', validation_file='file-V2EegwinGZqvEUjP9nN3Sr', estimated_finish=None, integrations=[], method=Method(dpo=None, supervised=MethodSupervised(hyperparameters=MethodSupervisedHyperparameters(batch_size='auto', learning_rate_multiplier='auto', n_epochs=2)), type='supervised'), user_provided_suffix=None)

In [None]:
## List the recent fine-tuning jobs (at most 10) and keep the job objects
all_jobs = client.fine_tuning.jobs.list(limit=10).data


In [None]:
## Show the finish timestamp of the first listed job; it is None until that job has finished
all_jobs[0].finished_at

1737953877

In [None]:
import time
import openai

# Function to check the status of a fine-tuning job
def wait_for_finetuning_completion(job_id, check_interval=45, max_consecutive_errors=None):
    """Poll a fine-tuning job until it has stopped running and return it.

    Args:
        job_id: ID of the fine-tuning job to poll.
        check_interval: Seconds to sleep between polls and after a failed poll.
        max_consecutive_errors: If set, the last exception is re-raised once this
            many polls in a row have failed. ``None`` (the default) keeps retrying
            indefinitely, which is what this function did before.

    Returns:
        The job object from the last ``retrieve`` call, once ``finished_at`` is
        set or the status is one of ``succeeded``, ``failed`` or ``cancelled``.

    Raises:
        Exception: Whatever ``retrieve`` raised, but only when
            ``max_consecutive_errors`` is set and has been reached.
    """
    terminal_statuses = ("succeeded", "failed", "cancelled")
    consecutive_errors = 0

    while True:
        try:
            job_status = client.fine_tuning.jobs.retrieve(job_id)
        except Exception as e:
            consecutive_errors += 1
            if max_consecutive_errors is not None and consecutive_errors >= max_consecutive_errors:
                raise
            print(f"An error occurred: {e}. Retrying in {check_interval} seconds...")
            time.sleep(check_interval)
            continue

        consecutive_errors = 0
        status = getattr(job_status, "status", None)

        # Also stop on a terminal status, so a failed or cancelled job is never
        # polled forever or reported as a success.
        if job_status.finished_at is not None or status in terminal_statuses:
            if status in ("failed", "cancelled"):
                print(f"Fine-tuning job ended with status '{status}'.")
            else:
                print("Fine-tuning job completed.")
            return job_status

        # Report the interval actually in use rather than a fixed "45".
        print(f"Fine-tuning job not completed. Checking again in {check_interval} seconds...")
        time.sleep(check_interval)



In [None]:

# Wait on the job created earlier in this notebook rather than on a pasted ID,
# so a fresh run tracks the job it has just started.
fine_tuning_job_id = job.id
completed_job = wait_for_finetuning_completion(fine_tuning_job_id)

print("Fine-tuning job details:")
print(completed_job)

Fine-tuning job not completed. Checking again in 45 seconds...
Fine-tuning job not completed. Checking again in 45 seconds...
Fine-tuning job not completed. Checking again in 45 seconds...
Fine-tuning job not completed. Checking again in 45 seconds...
Fine-tuning job not completed. Checking again in 45 seconds...
Fine-tuning job not completed. Checking again in 45 seconds...
Fine-tuning job not completed. Checking again in 45 seconds...
Fine-tuning job not completed. Checking again in 45 seconds...
Fine-tuning job not completed. Checking again in 45 seconds...
Fine-tuning job not completed. Checking again in 45 seconds...
Fine-tuning job not completed. Checking again in 45 seconds...
Fine-tuning job not completed. Checking again in 45 seconds...
Fine-tuning job not completed. Checking again in 45 seconds...
Fine-tuning job completed.
Fine-tuning job details:
FineTuningJob(id='ftjob-3EVrblg2OD7p0sN7OeMz7fBO', created_at=1737953249, error=Error(code=None, message=None, param=None), fine_

In [None]:
import requests

def get_fine_tuning_checkpoints(job_id, timeout=30):
    """Fetch the checkpoint listing of a fine-tuning job over the REST API.

    Args:
        job_id: ID of the fine-tuning job whose checkpoints are requested.
        timeout: Seconds to wait for the server before giving up. Without a
            timeout, ``requests`` can block indefinitely on a stalled connection.

    Returns:
        The decoded JSON response as a dict on success, or ``{"error": <message>}``
        if the request failed (connection problem, timeout, or a 4xx/5xx status).
    """
    url = f"https://api.openai.com/v1/fine_tuning/jobs/{job_id}/checkpoints"
    headers = {
        "Authorization": f"Bearer {userdata.get('OPENAI_API_KEY')}"
    }

    try:
        response = requests.get(url, headers=headers, timeout=timeout)
        response.raise_for_status()  # Raise HTTPError for bad responses (4xx and 5xx)
        return response.json()
    except requests.exceptions.RequestException as e:
        # Best-effort helper: hand the failure back to the caller as data.
        return {"error": str(e)}

# Use the job that was just waited on instead of a pasted ID, so the checkpoints
# always belong to the job this run created.
job_id = completed_job.id

checkpoints = get_fine_tuning_checkpoints(job_id)

# Print the result
print(checkpoints)


{'object': 'list', 'data': [{'object': 'fine_tuning.job.checkpoint', 'id': 'ftckpt_2POvCj2gEPaN2CmL7xwrxPUu', 'created_at': 1737953812, 'fine_tuned_model_checkpoint': 'ft:gpt-4o-mini-2024-07-18:personal::AuBU7NDH', 'fine_tuning_job_id': 'ftjob-3EVrblg2OD7p0sN7OeMz7fBO', 'metrics': {'step': 200}, 'step_number': 200}, {'object': 'fine_tuning.job.checkpoint', 'id': 'ftckpt_Mc7b6rYTMu7lhrXp0A2cYkpB', 'created_at': 1737953609, 'fine_tuned_model_checkpoint': 'ft:gpt-4o-mini-2024-07-18:personal::AuBU7zl0:ckpt-step-100', 'fine_tuning_job_id': 'ftjob-3EVrblg2OD7p0sN7OeMz7fBO', 'metrics': {'step': 100}, 'step_number': 100}], 'has_more': False, 'first_id': 'ftckpt_2POvCj2gEPaN2CmL7xwrxPUu', 'last_id': 'ftckpt_Mc7b6rYTMu7lhrXp0A2cYkpB'}


In [None]:
 def print_checkpoints(checkpoints):
     """Print a readable summary of a fine-tuning checkpoint listing.

     Args:
         checkpoints: Dict with a ``data`` list of checkpoint dicts (each holding
             ``id``, ``created_at``, ``fine_tuned_model_checkpoint``,
             ``fine_tuning_job_id``, ``metrics`` and ``step_number``) and optional
             ``has_more`` / ``first_id`` / ``last_id`` keys. A dict carrying an
             ``error`` key and no data is reported as a failed retrieval.

     Returns:
         None. The summary is written to standard output.
     """
     entries = checkpoints.get("data")
     if not entries:
         if "error" in checkpoints:
             # Show the failure instead of implying the job simply has no checkpoints.
             print(f"Could not retrieve checkpoints: {checkpoints['error']}")
         else:
             print("No checkpoints found.")
         return

     print("Fine-Tuning Checkpoints:\n")
     for checkpoint in entries:
         print(f"Checkpoint ID: {checkpoint['id']}")
         print(f"  Created At: {checkpoint['created_at']}")
         print(f"  Fine-Tuned Model Checkpoint: {checkpoint['fine_tuned_model_checkpoint']}")
         print(f"  Fine-Tuning Job ID: {checkpoint['fine_tuning_job_id']}")
         print("  Metrics:")
         for key, value in checkpoint['metrics'].items():
             print(f"    {key}: {value}")
         print(f"  Step Number: {checkpoint['step_number']}")
         print("-" * 40)

     print(f"Has More: {checkpoints.get('has_more', False)}")
     print(f"First Checkpoint ID: {checkpoints.get('first_id')}")
     print(f"Last Checkpoint ID: {checkpoints.get('last_id')}")

# Pretty-print the checkpoint listing fetched earlier
print_checkpoints(checkpoints)

Fine-Tuning Checkpoints:

Checkpoint ID: ftckpt_2POvCj2gEPaN2CmL7xwrxPUu
  Created At: 1737953812
  Fine-Tuned Model Checkpoint: ft:gpt-4o-mini-2024-07-18:personal::AuBU7NDH
  Fine-Tuning Job ID: ftjob-3EVrblg2OD7p0sN7OeMz7fBO
  Metrics:
    step: 200
  Step Number: 200
----------------------------------------
Checkpoint ID: ftckpt_Mc7b6rYTMu7lhrXp0A2cYkpB
  Created At: 1737953609
  Fine-Tuned Model Checkpoint: ft:gpt-4o-mini-2024-07-18:personal::AuBU7zl0:ckpt-step-100
  Fine-Tuning Job ID: ftjob-3EVrblg2OD7p0sN7OeMz7fBO
  Metrics:
    step: 100
  Step Number: 100
----------------------------------------
Has More: False
First Checkpoint ID: ftckpt_2POvCj2gEPaN2CmL7xwrxPUu
Last Checkpoint ID: ftckpt_Mc7b6rYTMu7lhrXp0A2cYkpB


In [None]:
# Inspect the raw checkpoint response dictionary
checkpoints

{'object': 'list',
 'data': [{'object': 'fine_tuning.job.checkpoint',
   'id': 'ftckpt_2POvCj2gEPaN2CmL7xwrxPUu',
   'created_at': 1737953812,
   'fine_tuned_model_checkpoint': 'ft:gpt-4o-mini-2024-07-18:personal::AuBU7NDH',
   'fine_tuning_job_id': 'ftjob-3EVrblg2OD7p0sN7OeMz7fBO',
   'metrics': {'step': 200},
   'step_number': 200},
  {'object': 'fine_tuning.job.checkpoint',
   'id': 'ftckpt_Mc7b6rYTMu7lhrXp0A2cYkpB',
   'created_at': 1737953609,
   'fine_tuned_model_checkpoint': 'ft:gpt-4o-mini-2024-07-18:personal::AuBU7zl0:ckpt-step-100',
   'fine_tuning_job_id': 'ftjob-3EVrblg2OD7p0sN7OeMz7fBO',
   'metrics': {'step': 100},
   'step_number': 100}],
 'has_more': False,
 'first_id': 'ftckpt_2POvCj2gEPaN2CmL7xwrxPUu',
 'last_id': 'ftckpt_Mc7b6rYTMu7lhrXp0A2cYkpB'}

In [None]:
## Inferencing the fine tuned model
def query(
    user_input,
    model="ft:gpt-4o-mini-2024-07-18:personal::AuBU7NDH",
    system_prompt="You are spirit of Aurobindo answer the user queries in his style.",
):
    """Send one user message to a fine-tuned chat model and return its reply text.

    Args:
        user_input: Text of the user message.
        model: Model identifier to call. Defaults to the fine-tuned model ID that
            was previously hard-coded here (the step-200 checkpoint in the
            checkpoint listing).
        system_prompt: System message sent ahead of the user message. The default
            is kept byte-for-byte as originally written.

    Returns:
        The ``content`` of the first choice's message in the completion.
    """
    completion = client.chat.completions.create(
        model=model,
        messages=[
            {"role": "system", "content": system_prompt},
            {"role": "user", "content": user_input},
        ],
    )

    return completion.choices[0].message.content

In [None]:
# Ask the fine-tuned model a sample question and print its reply
response = query("Swami can you explain what to do when facing adversities ?")
print(response)

Whenever you meet with an obstacle or a difficulty, don’t get discouraged, don’t lose heart, but draw back into your inner conscious poise, your quietude, your strength, and your certitude of victory; take refuge there and look at the obscurity, the conflict, the difficulty with the eyes of the spirit; and see what it is trying to tell you.
