In [30]:
from datasets import load_dataset
import openai
import os
from dotenv import load_dotenv
import json


In [31]:
load_dotenv()

True

In [32]:
OPEN_AI_API_KEY = os.getenv("OPENAI_API_KEY")

In [33]:
openai.api_key = OPEN_AI_API_KEY

In [16]:
dataset = load_dataset("mbpp")

In [19]:
dataset

DatasetDict({
    train: Dataset({
        features: ['task_id', 'text', 'code', 'test_list', 'test_setup_code', 'challenge_test_list'],
        num_rows: 374
    })
    test: Dataset({
        features: ['task_id', 'text', 'code', 'test_list', 'test_setup_code', 'challenge_test_list'],
        num_rows: 500
    })
    validation: Dataset({
        features: ['task_id', 'text', 'code', 'test_list', 'test_setup_code', 'challenge_test_list'],
        num_rows: 90
    })
    prompt: Dataset({
        features: ['task_id', 'text', 'code', 'test_list', 'test_setup_code', 'challenge_test_list'],
        num_rows: 10
    })
})

In [14]:
print(dataset["train"][0]['code'])

class Pair(object): 
	def __init__(self, a, b): 
		self.a = a 
		self.b = b 
def max_chain_length(arr, n): 
	max = 0
	mcl = [1 for i in range(n)] 
	for i in range(1, n): 
		for j in range(0, i): 
			if (arr[i].a > arr[j].b and
				mcl[i] < mcl[j] + 1): 
				mcl[i] = mcl[j] + 1
	for i in range(n): 
		if (max < mcl[i]): 
			max = mcl[i] 
	return max


In [34]:
prompt_model = """
    ### Context:
    If additional context is provided, use it to enhance your response.
    
    ### User's questions:
    {query}

    ### Python version: 
    {python_version}
    
    ** Instructions **
    - If user asks you to generate code and by using context you cannot do it, then generate it on your own
    - If user doesn't ask to generate code and the context does not contain answer for query say "I don't have sufficient knowledge to answer this question".
    - If the user's question does not specify Python, rephrase it internally as a Python-related question before answering.
    - If there is a code in your output explain this code to the user step by step
    - Do not answer any other question than about python programming language
    - If topic is complex provide summary at the end of your answer
    - Do not make up any information
    - Provide consise and structured answer
    """

In [35]:
python_versions=["3.10","3.11","3.12","3.13"]

In [50]:
formatted_data = []


for python_version in python_versions:
    for example in dataset["train"]:
        formatted_data.append({
            "messages": [
                {"role": "system", "content": prompt_model.format(query=example["text"], python_version=python_version).strip()},
                {"role": "user", "content": example["text"]},
                {"role": "assistant", "content": example["code"]}
            ]
        })

with open("mbpp_finetune.jsonl", "w") as f:
    for entry in formatted_data:
        f.write(json.dumps(entry) + "\n")

In [51]:
validation_data = []
for python_version in python_versions:
    for example in dataset["validation"]:
        validation_data.append({
            "messages": [
                {"role": "system", "content": prompt_model.format(query=example["text"], python_version=python_version).strip()},
                {"role": "user", "content": example["text"]},
                {"role": "assistant", "content": example["code"]}
            ]
        })

with open("mbpp_finetune_val.jsonl", "w") as f:
    for entry in validation_data:
        f.write(json.dumps(entry) + "\n")

In [52]:
print(validation_data[0])

{'messages': [{'role': 'system', 'content': '### Context:\n    If additional context is provided, use it to enhance your response.\n    \n    ### User\'s questions:\n    Write a python function to find minimum sum of factors of a given number.\n\n    ### Python version: \n    3.10\n    \n    ** Instructions **\n    - If user asks you to generate code and by using context you cannot do it, then generate it on your own\n    - If user doesn\'t ask to generate code and the context does not contain answer for query say "I don\'t have sufficient knowledge to answer this question".\n    - If the user\'s question does not specify Python, rephrase it internally as a Python-related question before answering.\n    - If there is a code in your output explain this code to the user step by step\n    - Do not answer any other question than about python programming language\n    - If topic is complex provide summary at the end of your answer\n    - Do not make up any information\n    - Provide consise

In [54]:
train_file = openai.files.create(
  file=open("mbpp_finetune.jsonl", "rb"),
  purpose="fine-tune"
)

valid_file = openai.files.create(
  file=open("mbpp_finetune_val.jsonl", "rb"),
  purpose="fine-tune"
)


In [55]:
print(f"Training file Info: {train_file}")
print(f"Validation file Info: {valid_file}")

Training file Info: FileObject(id='file-TLoZTPyqSHzprWM2vPEdLH', bytes=2054116, created_at=1741724696, filename='mbpp_finetune.jsonl', object='file', purpose='fine-tune', status='processed', expires_at=None, status_details=None)
Validation file Info: FileObject(id='file-LT8NjQrKWvr2N9NWMyHMJZ', bytes=495784, created_at=1741724697, filename='mbpp_finetune_val.jsonl', object='file', purpose='fine-tune', status='processed', expires_at=None, status_details=None)


In [59]:
model = openai.fine_tuning.jobs.create(
  training_file=train_file.id, 
  validation_file=valid_file.id,
  model="gpt-4o-mini-2024-07-18", 
  hyperparameters={
    "n_epochs": 3,
	"batch_size": 3,
	"learning_rate_multiplier": 0.3
  }
)
job_id = model.id
status = model.status

In [60]:
print(f'Fine-tuning model with jobID: {job_id}.')
print(f"Training Response: {model}")
print(f"Training Status: {status}")

Fine-tuning model with jobID: ftjob-nAHqCOHrlROLsh4fzkDAvyb1.
Training Response: FineTuningJob(id='ftjob-nAHqCOHrlROLsh4fzkDAvyb1', created_at=1741724879, error=Error(code=None, message=None, param=None), fine_tuned_model=None, finished_at=None, hyperparameters=Hyperparameters(batch_size=3, learning_rate_multiplier=0.3, n_epochs=3), model='gpt-4o-mini-2024-07-18', object='fine_tuning.job', organization_id='org-bgSJ5OvwR04OVBpr61Q2jq7K', result_files=[], seed=338309469, status='validating_files', trained_tokens=None, training_file='file-TLoZTPyqSHzprWM2vPEdLH', validation_file='file-LT8NjQrKWvr2N9NWMyHMJZ', estimated_finish=None, integrations=[], metadata=None, method=Method(dpo=None, supervised=MethodSupervised(hyperparameters=MethodSupervisedHyperparameters(batch_size=3, learning_rate_multiplier=0.3, n_epochs=3)), type='supervised'), user_provided_suffix=None)
Training Status: validating_files


In [61]:
openai.fine_tuning.jobs.retrieve(job_id)

FineTuningJob(id='ftjob-nAHqCOHrlROLsh4fzkDAvyb1', created_at=1741724879, error=Error(code=None, message=None, param=None), fine_tuned_model=None, finished_at=None, hyperparameters=Hyperparameters(batch_size=3, learning_rate_multiplier=0.3, n_epochs=3), model='gpt-4o-mini-2024-07-18', object='fine_tuning.job', organization_id='org-bgSJ5OvwR04OVBpr61Q2jq7K', result_files=[], seed=338309469, status='validating_files', trained_tokens=None, training_file='file-TLoZTPyqSHzprWM2vPEdLH', validation_file='file-LT8NjQrKWvr2N9NWMyHMJZ', estimated_finish=None, integrations=[], metadata=None, method=Method(dpo=None, supervised=MethodSupervised(hyperparameters=MethodSupervisedHyperparameters(batch_size=3, learning_rate_multiplier=0.3, n_epochs=3)), type='supervised'), user_provided_suffix=None)

In [62]:
openai.fine_tuning.jobs.cancel(job_id)

BadRequestError: Error code: 400 - {'error': {'message': 'Job has already completed: ftjob-nAHqCOHrlROLsh4fzkDAvyb1', 'type': 'invalid_request_error', 'param': 'fine_tune_id', 'code': 'invalid_cancel'}}

In [63]:
result = openai.fine_tuning.jobs.list()

In [64]:
fine_tuned_model = result.data[0].fine_tuned_model

In [65]:
print(fine_tuned_model)

ft:gpt-4o-mini-2024-07-18:personal::BA15GnWM


In [69]:
completion = openai.chat.completions.create(
  model = fine_tuned_model,
  messages=[
    {"role": "system", "content": "You are python expert and you provide answer only based on given context."},
    {"role": "user", "content": "generate code to print"}
  ]
)
print(completion.choices[0].message.content)

Please provide more context or specify what you want to print so I can assist you appropriately.
