In [1]:
pip install openai requests tiktoken numpy

Note: you may need to restart the kernel to use updated packages.


In [29]:
# Run preliminary checks

import json

# Load the training set
with open('training_set.jsonl', 'r', encoding='utf-8') as f:
    training_dataset = [json.loads(line) for line in f]

# Training dataset stats
print("Number of examples in training set:", len(training_dataset))
print("First example in training set:")
for message in training_dataset[0]["messages"]:
    print(message)

# Load the validation set
with open('validation_set.jsonl', 'r', encoding='utf-8') as f:
    validation_dataset = [json.loads(line) for line in f]

# Validation dataset stats
print("\nNumber of examples in validation set:", len(validation_dataset))
print("First example in validation set:")
for message in validation_dataset[0]["messages"]:
    print(message)

Number of examples in training set: 21
First example in training set:
{'role': 'system', 'content': 'Clippy is a factual chatbot that assist about the Azure Security Policy.'}
{'role': 'user', 'content': 'Write a condition to deny access if the field name does not match the patterns contoso?????? and contoso-???-##.'}
{'role': 'assistant', 'content': 'If all of the field is name does not match contoso?????? and does not match contoso-???-##, then deny.'}

Number of examples in validation set: 21
First example in validation set:
{'role': 'system', 'content': 'Clippy is a factual chatbot that assist about the Azure Security Policy.'}
{'role': 'user', 'content': 'Design a rule that triggers denial if the name field deviates from both contoso?????? and contoso-???-## patterns.'}
{'role': 'assistant', 'content': 'If all of the field is name does not match contoso?????? and does not match contoso-???-##, then deny.'}


In [30]:
# Validate token counts

import json
import tiktoken
import numpy as np
from collections import defaultdict

encoding = tiktoken.get_encoding("cl100k_base") # default encoding used by gpt-4, turbo, and text-embedding-ada-002 models

def num_tokens_from_messages(messages, tokens_per_message=3, tokens_per_name=1):
    num_tokens = 0
    for message in messages:
        num_tokens += tokens_per_message
        for key, value in message.items():
            num_tokens += len(encoding.encode(value))
            if key == "name":
                num_tokens += tokens_per_name
    num_tokens += 3
    return num_tokens

def num_assistant_tokens_from_messages(messages):
    num_tokens = 0
    for message in messages:
        if message["role"] == "assistant":
            num_tokens += len(encoding.encode(message["content"]))
    return num_tokens

def print_distribution(values, name):
    print(f"\n#### Distribution of {name}:")
    print(f"min / max: {min(values)}, {max(values)}")
    print(f"mean / median: {np.mean(values)}, {np.median(values)}")
    print(f"p5 / p95: {np.quantile(values, 0.1)}, {np.quantile(values, 0.9)}")

files = ['training_set.jsonl', 'validation_set.jsonl']

for file in files:
    print(f"Processing file: {file}")
    with open(file, 'r', encoding='utf-8') as f:
        dataset = [json.loads(line) for line in f]

    total_tokens = []
    assistant_tokens = []

    for ex in dataset:
        messages = ex.get("messages", {})
        total_tokens.append(num_tokens_from_messages(messages))
        assistant_tokens.append(num_assistant_tokens_from_messages(messages))

    print_distribution(total_tokens, "total tokens")
    print_distribution(assistant_tokens, "assistant tokens")
    print('*' * 50)

Processing file: training_set.jsonl

#### Distribution of total tokens:
min / max: 85, 150
mean / median: 108.14285714285714, 101.0
p5 / p95: 87.0, 139.0

#### Distribution of assistant tokens:
min / max: 28, 59
mean / median: 42.904761904761905, 43.0
p5 / p95: 35.0, 52.0
**************************************************
Processing file: validation_set.jsonl

#### Distribution of total tokens:
min / max: 85, 159
mean / median: 106.57142857142857, 101.0
p5 / p95: 86.0, 139.0

#### Distribution of assistant tokens:
min / max: 28, 59
mean / median: 42.904761904761905, 43.0
p5 / p95: 35.0, 52.0
**************************************************


In [31]:
# Upload fine-tuning files

import os
from openai import AzureOpenAI

client = AzureOpenAI(
  azure_endpoint = os.getenv("AZURE_OPENAI_ENDPOINT"),
  api_key = os.getenv("AZURE_OPENAI_API_KEY"),
  api_version = "2024-02-01"  # This API version or later is required to access fine-tuning for turbo/babbage-002/davinci-002
)

training_file_name = 'training_set.jsonl'
validation_file_name = 'validation_set.jsonl'

# Upload the training and validation dataset files to Azure OpenAI with the SDK.

training_response = client.files.create(
    file = open(training_file_name, "rb"), purpose="fine-tune"
)
training_file_id = training_response.id

validation_response = client.files.create(
    file = open(validation_file_name, "rb"), purpose="fine-tune"
)
validation_file_id = validation_response.id

print("Training file ID:", training_file_id)
print("Validation file ID:", validation_file_id)

Training file ID: file-2f5538adef3f41729111089d4fd79868
Validation file ID: file-8cf6aa563aa44f6e9e7c4215853f6258


In [53]:
# Submit fine-tuning training job

response = client.fine_tuning.jobs.create(
    training_file = training_file_id,
    validation_file = validation_file_id,
    model = "gpt-35-turbo-0613", # Enter base model name. Note that in Azure OpenAI the model name contains dashes and cannot contain dot/period characters.
)

job_id = response.id

# You can use the job ID to monitor the status of the fine-tuning job.
# The fine-tuning job will take some time to start and complete.

print("Job ID:", response.id)
print("Status:", response.status)
print(response.model_dump_json(indent=2))

Job ID: ftjob-7822acddcb504aadaa0fff72ccee5063
Status: pending
{
  "id": "ftjob-7822acddcb504aadaa0fff72ccee5063",
  "created_at": 1714752077,
  "error": null,
  "fine_tuned_model": null,
  "finished_at": null,
  "hyperparameters": {
    "n_epochs": -1,
    "batch_size": -1,
    "learning_rate_multiplier": 1
  },
  "model": "gpt-35-turbo-0613",
  "object": "fine_tuning.job",
  "organization_id": null,
  "result_files": null,
  "status": "pending",
  "trained_tokens": null,
  "training_file": "file-2f5538adef3f41729111089d4fd79868",
  "validation_file": "file-8cf6aa563aa44f6e9e7c4215853f6258"
}


In [54]:
# Track training status

from IPython.display import clear_output
import time

start_time = time.time()

# Get the status of our fine-tuning job.
response = client.fine_tuning.jobs.retrieve(job_id)

status = response.status

# If the job isn't done yet, poll it every 10 seconds.
while status not in ["succeeded", "failed"]:
    time.sleep(10)

    response = client.fine_tuning.jobs.retrieve(job_id)
    print(response.model_dump_json(indent=2))
    print("Elapsed time: {} minutes {} seconds".format(int((time.time() - start_time) // 60), int((time.time() - start_time) % 60)))
    status = response.status
    print(f'Status: {status}')
    clear_output(wait=True)

print(f'Fine-tuning job {job_id} finished with status: {status}')

# List all fine-tuning jobs for this resource.
print('Checking other fine-tune jobs for this resource.')
response = client.fine_tuning.jobs.list()
print(f'Found {len(response.data)} fine-tune jobs.')

Fine-tuning job ftjob-7822acddcb504aadaa0fff72ccee5063 finished with status: succeeded
Checking other fine-tune jobs for this resource.
Found 8 fine-tune jobs.


In [55]:
# Retrieve fine_tuned_model name

response = client.fine_tuning.jobs.retrieve(job_id)

print(response.model_dump_json(indent=2))
fine_tuned_model = response.fine_tuned_model

{
  "id": "ftjob-7822acddcb504aadaa0fff72ccee5063",
  "created_at": 1714752077,
  "error": null,
  "fine_tuned_model": "gpt-35-turbo-0613.ft-7822acddcb504aadaa0fff72ccee5063",
  "finished_at": 1714754111,
  "hyperparameters": {
    "n_epochs": 4,
    "batch_size": 1,
    "learning_rate_multiplier": 1
  },
  "model": "gpt-35-turbo-0613",
  "object": "fine_tuning.job",
  "organization_id": null,
  "result_files": [
    "file-ad472f0ed7384fd48192bd7de2519662"
  ],
  "status": "succeeded",
  "trained_tokens": 9084,
  "training_file": "file-2f5538adef3f41729111089d4fd79868",
  "validation_file": "file-8cf6aa563aa44f6e9e7c4215853f6258"
}


In [10]:
# Use the deployed customized model

import os
from openai import AzureOpenAI

client = AzureOpenAI(
  azure_endpoint = os.getenv("AZURE_OPENAI_ENDPOINT"),
  api_key = os.getenv("AZURE_OPENAI_API_KEY"),
  api_version = "2024-02-01"
)

response = client.chat.completions.create(
    model = "Testfinetunev2", # model = "Custom deployment name you chose for your fine-tuning model"
    messages = [
        {"role": "system", "content": "You are a helpful assistant."},
        {"role": "user", "content": "what will the effect if field name doesn't match with contoso? please give result in json format"},
    ]
)

print(response.choices[0].message.content)

If the field name does not match with "contoso", the effect will be "deny".
