In [1]:
# !pip install openai

In [2]:
import json
import os
import random
from time import sleep

import pandas as pd
from openai import OpenAI


In [3]:
# Read the API key from the environment rather than embedding it in the notebook.
# SECURITY: a literal secret key used to live in this cell. Anything that was ever
# saved or shared with that key must be treated as leaked; revoke/rotate it.
# When OPENAI_API_KEY is unset this is None (the OpenAI client is then expected to
# report the missing credential when it is constructed; confirm against the SDK docs).
GPT_FINE_TUNE_KEY = os.environ.get("OPENAI_API_KEY")

In [4]:
# Shared OpenAI client; every helper below talks to the API through this object
client = OpenAI(api_key=GPT_FINE_TUNE_KEY)

In [5]:
# Process 1
def convert_json_to_training_format(file_path, system_content, output_file, user_input, assistant_input):
    """Convert a JSON dataset into a chat-format JSON Lines file.

    Each row of the input becomes one line of the form
    ``{"messages": [system, user, assistant]}``.

    Args:
        file_path: Path of the JSON file, loaded with ``pd.read_json``.
        system_content: Text used as the system message of every example.
        output_file: Path of the JSON Lines file to write (overwritten).
        user_input: Column whose value becomes the user message.
        assistant_input: Column whose value becomes the assistant message.

    Returns:
        A human-readable confirmation string naming ``output_file``.
    """
    df = pd.read_json(file_path)
    jsonl_data = [
        {
            "messages": [
                {"role": "system", "content": system_content},
                {"role": "user", "content": row[user_input]},
                {"role": "assistant", "content": row[assistant_input]},
            ]
        }
        for _, row in df.iterrows()
    ]

    # Write exactly once, after every row has been collected. Writing inside the
    # row loop re-created the whole file per row and produced no file at all for
    # an empty dataset.
    with open(output_file, "w", encoding="utf-8") as f:
        for item in jsonl_data:
            # ensure_ascii=False keeps non-ASCII text (e.g. Tamil) readable in the file
            f.write(json.dumps(item, ensure_ascii=False) + "\n")
    return f"JSON Lines file saved to {output_file}"

In [6]:
# Process 2
def upload_training_file(file_path):
    """Send a local file to OpenAI with purpose "fine-tune" and return its file ID."""
    with open(file_path, "rb") as handle:
        uploaded = client.files.create(file=handle, purpose="fine-tune")
    return uploaded.id

In [7]:
def split_train_test_data(input_file, train_file, test_file):
    """Shuffle a JSON Lines file and split it 80/20 into train and test files.

    Args:
        input_file: Path of the JSON Lines file to read (one JSON object per line).
        train_file: Path that receives the first 80% of the shuffled records.
        test_file: Path that receives the remaining records.

    Returns:
        A human-readable confirmation string naming both output paths.
    """
    with open(input_file, "r", encoding="utf-8") as f:
        # Blank or whitespace-only lines (e.g. a trailing empty line) carry no
        # record and would make json.loads raise, so they are skipped.
        data = [json.loads(line) for line in f if line.strip()]

    df = pd.DataFrame(data)
    # Fixed random_state keeps the shuffle, and therefore the split, reproducible.
    df = df.sample(frac=1, random_state=42).reset_index(drop=True)

    train_size = int(len(df) * 0.8)
    partitions = (
        (train_file, df.iloc[:train_size]),
        (test_file, df.iloc[train_size:]),
    )
    for path, part in partitions:
        with open(path, "w", encoding="utf-8") as f:
            for _, row in part.iterrows():
                f.write(json.dumps(row.to_dict(), ensure_ascii=False) + "\n")

    return f"Training data & Testing data saved to {train_file, test_file}"

In [8]:
def create_fine_tuning_job(training_file_id, validation_file_id=None, model="gpt-3.5-turbo-1106",
                           n_epochs=3, batch_size=1, learning_rate_multiplier=1):
    """Create a fine-tuning job and return its ID.

    Args:
        training_file_id: ID of the uploaded training file.
        validation_file_id: ID of the uploaded validation file, or None.
        model: Name of the base model to fine-tune.
        n_epochs: Value sent as the ``n_epochs`` hyperparameter.
        batch_size: Value sent as the ``batch_size`` hyperparameter.
        learning_rate_multiplier: Value sent as the ``learning_rate_multiplier``
            hyperparameter.

    Returns:
        The ``id`` attribute of the job object returned by the API.
    """
    # The defaults reproduce the values this notebook previously hard-coded.
    hyperparameters = {
        'n_epochs': n_epochs,
        'batch_size': batch_size,
        'learning_rate_multiplier': learning_rate_multiplier,
    }
    response = client.fine_tuning.jobs.create(
        training_file=training_file_id,
        validation_file=validation_file_id,
        model=model,
        hyperparameters=hyperparameters,
    )
    return response.id

In [None]:
# def create_fine_tuning_job(training_file_id, validation_file_id=None, model="gpt-3.5-turbo-1106"):
#     """Create a fine-tuning job"""
#     response = client.fine_tuning.jobs.create(
#         training_file=training_file_id,
#         validation_file=validation_file_id,
#         model=model
#     )
#     return response.id

In [9]:
def monitor_job(job_id, poll_interval=30):
    """Poll a fine-tuning job until it finishes, printing status and recent events.

    Args:
        job_id: ID of the fine-tuning job to watch.
        poll_interval: Seconds to wait between polls (default 30).

    Returns:
        The job object from the poll in which a terminal status was observed.
    """
    # "cancelled" must be treated as terminal too; without it a cancelled job
    # would keep this loop polling forever.
    terminal_statuses = ("succeeded", "failed", "cancelled")

    while True:
        job = client.fine_tuning.jobs.retrieve(job_id)
        print(f"Status: {job.status}")

        if job.status in terminal_statuses:
            return job

        # Show the 5 events returned for this job on each poll
        events = client.fine_tuning.jobs.list_events(
            fine_tuning_job_id=job_id,
            limit=5
        )
        for event in events.data:
            print(f"Event: {event.message}")

        sleep(poll_interval)

In [10]:
def test_model(model_id, test_input):
    """Send `test_input` to `model_id` as a chat request and return the first reply message."""
    system_turn = {
        "role": "system",
        "content": "Generate small tamil stories according to the heading",
    }
    user_turn = {"role": "user", "content": test_input}
    response = client.chat.completions.create(
        model=model_id,
        messages=[system_turn, user_turn],
    )
    first_choice = response.choices[0]
    return first_choice.message

In [11]:
# Dataset locations, system prompt, and the column names read from the source JSON
_dataset_dir = "C:/Users/abira/OneDrive/Desktop/rp_dataset"
input_json_path = f"{_dataset_dir}/tamil_stories.json"
output_jsonl_file = f'{_dataset_dir}/tamil_stories.jsonl'
system_content_message = "Generate small tamil stories according to the heading"
user_input_colum_name, assistant_input_colum_name = "input", "output"

In [12]:
# Build the chat-format JSONL file from the source JSON dataset
convert_json_to_training_format(
    file_path=input_json_path,
    system_content=system_content_message,
    output_file=output_jsonl_file,
    user_input=user_input_colum_name,
    assistant_input=assistant_input_colum_name,
)

'JSON Lines file saved to C:/Users/abira/OneDrive/Desktop/rp_dataset/tamil_stories.jsonl'

In [13]:
# Output paths for the two halves of the train/test split
_split_dir = "C:/Users/abira/OneDrive/Desktop/rp_dataset"
train_file, test_file = f"{_split_dir}/train.jsonl", f"{_split_dir}/test.jsonl"

In [14]:
# Shuffle the converted dataset and write the train/test files
split_train_test_data(
    input_file=output_jsonl_file,
    train_file=train_file,
    test_file=test_file,
)

"Training data & Testing data saved to ('C:/Users/abira/OneDrive/Desktop/rp_dataset/train.jsonl', 'C:/Users/abira/OneDrive/Desktop/rp_dataset/test.jsonl')"

In [15]:
# Upload both splits; the returned IDs are what the fine-tuning job is created from
training_file_id = upload_training_file(file_path=train_file)
validation_file_id = upload_training_file(file_path=test_file)

In [16]:
# Display the client object's repr (inspection only)
client

<openai.OpenAI at 0x206bf01b4d0>

In [17]:
# NOTE(review): these literals overwrite the IDs assigned from upload_training_file(...)
# earlier in the notebook, so the job created below uses these specific file IDs rather
# than the result of a fresh upload. Confirm this pinning is intended before re-running.
training_file_id = 'file-XRdu68N5Fi73v6FcFkVPvD'
validation_file_id = 'file-EJGeoTAyaUdVURPNtFdmbB'

In [18]:
# Start the fine-tuning job with the function's default model
job_id = create_fine_tuning_job(
    training_file_id=training_file_id,
    validation_file_id=validation_file_id,
)

In [19]:
#fine_tuned_model = ''

In [20]:
# Display the ID returned by create_fine_tuning_job
job_id

'ftjob-j0O2jTjKI3uuARWvVoyRut2I'

In [21]:
# Wait for monitor_job to return, then keep the model ID if the job succeeded
job = monitor_job(job_id)
if job.status != "succeeded":
    print("Fine-tuning failed.")
else:
    fine_tuned_model = job.fine_tuned_model
    print(f"Fine-tuned model ID: {fine_tuned_model}")

Status: validating_files
Event: Validating training file: file-XRdu68N5Fi73v6FcFkVPvD and validation file: file-EJGeoTAyaUdVURPNtFdmbB
Event: Created fine-tuning job: ftjob-j0O2jTjKI3uuARWvVoyRut2I
Status: running
Event: Fine-tuning job started
Event: Files validated, moving job to queued state
Event: Validating training file: file-XRdu68N5Fi73v6FcFkVPvD and validation file: file-EJGeoTAyaUdVURPNtFdmbB
Event: Created fine-tuning job: ftjob-j0O2jTjKI3uuARWvVoyRut2I
Status: running
Event: Fine-tuning job started
Event: Files validated, moving job to queued state
Event: Validating training file: file-XRdu68N5Fi73v6FcFkVPvD and validation file: file-EJGeoTAyaUdVURPNtFdmbB
Event: Created fine-tuning job: ftjob-j0O2jTjKI3uuARWvVoyRut2I
Status: running
Event: Fine-tuning job started
Event: Files validated, moving job to queued state
Event: Validating training file: file-XRdu68N5Fi73v6FcFkVPvD and validation file: file-EJGeoTAyaUdVURPNtFdmbB
Event: Created fine-tuning job: ftjob-j0O2jTjKI3uuAR

Status: running
Event: Step 684/840: training loss=0.38
Event: Step 683/840: training loss=0.14
Event: Step 682/840: training loss=0.34
Event: Step 681/840: training loss=0.18
Event: Step 680/840: training loss=0.36, validation loss=0.59
Status: running
Event: Step 704/840: training loss=0.21
Event: Step 703/840: training loss=0.41
Event: Step 702/840: training loss=0.22
Event: Step 701/840: training loss=0.32
Event: Step 700/840: training loss=0.27, validation loss=0.15
Status: running
Event: Step 728/840: training loss=0.45
Event: Step 727/840: training loss=0.31
Event: Step 726/840: training loss=0.28
Event: Step 725/840: training loss=0.21
Event: Step 724/840: training loss=0.29
Status: running
Event: Step 748/840: training loss=0.22
Event: Step 747/840: training loss=0.24
Event: Step 746/840: training loss=0.44
Event: Step 745/840: training loss=0.29
Event: Step 744/840: training loss=0.28
Status: running
Event: Step 772/840: training loss=0.42
Event: Step 771/840: training loss=0

In [31]:
# Model to query, pinned as a literal ID; this replaces any value assigned to
# fine_tuned_model by the monitoring cell.
fine_tuned_model="ft:gpt-3.5-turbo-1106:personal::AYaBgUIZ"
# Story heading that is passed to test_model as the user message
test_report="கதையின் பெயர்:சிங்கம்"

In [32]:
# Query the fine-tuned model with the heading defined above
result = test_model(model_id=fine_tuned_model, test_input=test_report)

In [33]:
# Display the value returned by test_model
result

ChatCompletionMessage(content='ஒரு சிங்கம் தனது பாராளுக்கு தூண்டடித்ததும், கடினமான வெற்றியை பெற்றது. ஒருவருக்கு முன்னாள் நேரங்கள் மிகுந்ந, புதியவை வெற்றியை ஈட்டும்!', refusal=None, role='assistant', audio=None, function_call=None, tool_calls=None)

In [30]:
def test_model(model_id, test_input):
    """Send `test_input` to `model_id` as a chat request and return the whole completion.

    This re-defines `test_model`; unlike the definition earlier in the notebook,
    which returns only the first choice's message, this one returns the full
    completion object.
    """
    system_turn = {
        "role": "system",
        "content": "Generate small tamil stories according to the heading",
    }
    user_turn = {"role": "user", "content": test_input}
    response = client.chat.completions.create(
        model=model_id,
        messages=[system_turn, user_turn],
    )
    return response

In [31]:
# Model to query, pinned as a literal ID (a different ID from the one used in the
# earlier test cell).
fine_tuned_model="ft:gpt-3.5-turbo-1106:personal::AYZFFeyK"
# Story heading that is passed to test_model as the user message
test_report="கதையின் பெயர்: நட்பு மற்றும் வாழ்வு"

In [32]:
# Query the fine-tuned model with the heading defined above
result = test_model(model_id=fine_tuned_model, test_input=test_report)

In [33]:
# Display the value returned by test_model
result

ChatCompletion(id='chatcmpl-AYZR2APJO4XWyQwnZi6qL1yA9Jfv8', choices=[Choice(finish_reason='stop', index=0, logprobs=None, message=ChatCompletionMessage(content='கதையின் பெயர்: நட்பு மற்றும் வாழ்வு', refusal=None, role='assistant', audio=None, function_call=None, tool_calls=None))], created=1732802728, model='ft:gpt-3.5-turbo-1106:personal::AYZFFeyK', object='chat.completion', service_tier=None, system_fingerprint=None, usage=CompletionUsage(completion_tokens=47, prompt_tokens=66, total_tokens=113, completion_tokens_details=CompletionTokensDetails(accepted_prediction_tokens=0, audio_tokens=0, reasoning_tokens=0, rejected_prediction_tokens=0), prompt_tokens_details=PromptTokensDetails(audio_tokens=0, cached_tokens=0)))