In [73]:
from datasets import load_dataset
import openai
import os
from dotenv import load_dotenv
import json
import chromadb
from peft import LoraConfig, get_peft_model



In [2]:
load_dotenv()

True

In [3]:
OPEN_AI_API_KEY = os.getenv("OPENAI_API_KEY")

In [4]:
openai.api_key = OPEN_AI_API_KEY

In [13]:
dataset = load_dataset("mbpp")

In [14]:
dataset

DatasetDict({
    train: Dataset({
        features: ['task_id', 'text', 'code', 'test_list', 'test_setup_code', 'challenge_test_list'],
        num_rows: 374
    })
    test: Dataset({
        features: ['task_id', 'text', 'code', 'test_list', 'test_setup_code', 'challenge_test_list'],
        num_rows: 500
    })
    validation: Dataset({
        features: ['task_id', 'text', 'code', 'test_list', 'test_setup_code', 'challenge_test_list'],
        num_rows: 90
    })
    prompt: Dataset({
        features: ['task_id', 'text', 'code', 'test_list', 'test_setup_code', 'challenge_test_list'],
        num_rows: 10
    })
})

In [15]:
print(dataset["train"][0]['code'])

class Pair(object): 
	def __init__(self, a, b): 
		self.a = a 
		self.b = b 
def max_chain_length(arr, n): 
	max = 0
	mcl = [1 for i in range(n)] 
	for i in range(1, n): 
		for j in range(0, i): 
			if (arr[i].a > arr[j].b and
				mcl[i] < mcl[j] + 1): 
				mcl[i] = mcl[j] + 1
	for i in range(n): 
		if (max < mcl[i]): 
			max = mcl[i] 
	return max


In [16]:
print(dataset["train"][0])

{'task_id': 601, 'text': 'Write a function to find the longest chain which can be formed from the given set of pairs.', 'code': 'class Pair(object): \r\n\tdef __init__(self, a, b): \r\n\t\tself.a = a \r\n\t\tself.b = b \r\ndef max_chain_length(arr, n): \r\n\tmax = 0\r\n\tmcl = [1 for i in range(n)] \r\n\tfor i in range(1, n): \r\n\t\tfor j in range(0, i): \r\n\t\t\tif (arr[i].a > arr[j].b and\r\n\t\t\t\tmcl[i] < mcl[j] + 1): \r\n\t\t\t\tmcl[i] = mcl[j] + 1\r\n\tfor i in range(n): \r\n\t\tif (max < mcl[i]): \r\n\t\t\tmax = mcl[i] \r\n\treturn max', 'test_list': ['assert max_chain_length([Pair(5, 24), Pair(15, 25),Pair(27, 40), Pair(50, 60)], 4) == 3', 'assert max_chain_length([Pair(1, 2), Pair(3, 4),Pair(5, 6), Pair(7, 8)], 4) == 4', 'assert max_chain_length([Pair(19, 10), Pair(11, 12),Pair(13, 14), Pair(15, 16), Pair(31, 54)], 5) == 5'], 'test_setup_code': '', 'challenge_test_list': []}


In [106]:
prompt_model = """
    ### Context:
    {context}
    
    ### User's questions:
    {query}

    ### Python version: 
    {python_version}
    
    ** Instructions **
    - If user asks you to generate code and by using context you cannot do it, then generate it on your own
    - If user doesn't ask to generate code and the context does not contain answer for query say "I don't have sufficient knowledge to answer this question".
    - If the user's question does not specify Python, rephrase it internally as a Python-related question before answering.
    - If there is a code in your output explain this code to the user step by step
    - Do not answer any other question than about python programming language
    - If topic is complex provide summary at the end of your answer
    - Do not make up any information
    - Provide consise and structured answer
    """

In [22]:
python_versions=["3.10","3.11","3.12","3.13"]

In [23]:
CHROMA_DB_PATH = "./../data_collecting/chroma_db"
client = chromadb.PersistentClient(path=CHROMA_DB_PATH)
collection = client.get_or_create_collection(name="python_data")

In [27]:
def get_openai_embedding(text):
    response = openai.embeddings.create(
        input=[text],
        model="text-embedding-ada-002"
    )
    return response.data[0].embedding

def retrieve_documents(query, python_version, top_k=7):
    query_embedding = get_openai_embedding(query)

    results = collection.query(
        query_embeddings=[query_embedding],
        n_results=top_k,
        where={"version": python_version}
    )

    return results["documents"][0] if "documents" in results and results["documents"] else []

def format_query(user_query: str):
    if "python" not in user_query.lower():
        return f"{user_query} in Python"
    return user_query

In [109]:
print(len(dataset["train"]))

374


In [111]:
retrieved_context_train = []

for i, example in enumerate(dataset["train"]):
    query = format_query(example["text"])
    retrieved_docs = retrieve_documents(query, python_version, 2)
    retrieved_docs = "\n\n".join(retrieved_docs)
    retrieved_context_train.append(retrieved_docs)

    if i % 20 == 0:
        print(i)

0
20
40
60
80
100
120
140
160
180
200
220
240
260
280
300
320
340
360


In [114]:
retrieved_context_val = []

for i, example in enumerate(dataset["validation"]):
    query = format_query(example["text"])
    retrieved_docs = retrieve_documents(query, python_version, 2)
    retrieved_docs = "\n\n".join(retrieved_docs)
    retrieved_context_val.append(retrieved_docs)

    if i % 20 == 0:
        print(i)

0
20
40
60
80


In [115]:
formatted_data = []


for python_version in python_versions:
    for example, context in zip(dataset["train"],retrieved_context_train):
        formatted_data.append({
            "messages": [
                {"role": "system", "content": prompt_model.format(context=context, query=example["text"], python_version=python_version).strip()},
                {"role": "user", "content": example["text"]},
                {"role": "assistant", "content": example["code"]}
            ]
        })
    print(python_version)

# with open("mbpp_finetune.jsonl", "w") as f:
#     for entry in formatted_data:
#         f.write(json.dumps(entry) + "\n")

3.10
3.11
3.12
3.13


In [117]:
validation_data = []
for python_version in python_versions:
    for example, context in zip(dataset["validation"],retrieved_context_val):
        validation_data.append({
            "messages": [
                {"role": "system", "content": prompt_model.format(context=context, query=example["text"], python_version=python_version).strip()},
                {"role": "user", "content": example["text"]},
                {"role": "assistant", "content": example["code"]}
            ]
        })

# with open("mbpp_finetune_val.jsonl", "w") as f:
#     for entry in validation_data:
#         f.write(json.dumps(entry) + "\n")

In [118]:
with open("mbpp_finetune.jsonl", "w") as f:
    for entry in formatted_data:
        f.write(json.dumps(entry) + "\n")

with open("mbpp_finetune_val.jsonl", "w") as f:
    for entry in validation_data:
        f.write(json.dumps(entry) + "\n")

In [54]:
train_file = openai.files.create(
  file=open("mbpp_finetune.jsonl", "rb"),
  purpose="fine-tune"
)

valid_file = openai.files.create(
  file=open("mbpp_finetune_val.jsonl", "rb"),
  purpose="fine-tune"
)


In [55]:
print(f"Training file Info: {train_file}")
print(f"Validation file Info: {valid_file}")

Training file Info: FileObject(id='file-TLoZTPyqSHzprWM2vPEdLH', bytes=2054116, created_at=1741724696, filename='mbpp_finetune.jsonl', object='file', purpose='fine-tune', status='processed', expires_at=None, status_details=None)
Validation file Info: FileObject(id='file-LT8NjQrKWvr2N9NWMyHMJZ', bytes=495784, created_at=1741724697, filename='mbpp_finetune_val.jsonl', object='file', purpose='fine-tune', status='processed', expires_at=None, status_details=None)


In [59]:
model = openai.fine_tuning.jobs.create(
  training_file=train_file.id, 
  validation_file=valid_file.id,
  model="gpt-4o-mini-2024-07-18", 
  hyperparameters={
    "n_epochs": 3,
	"batch_size": 3,
	"learning_rate_multiplier": 0.3
  }
)
job_id = model.id
status = model.status

In [60]:
print(f'Fine-tuning model with jobID: {job_id}.')
print(f"Training Response: {model}")
print(f"Training Status: {status}")

Fine-tuning model with jobID: ftjob-nAHqCOHrlROLsh4fzkDAvyb1.
Training Response: FineTuningJob(id='ftjob-nAHqCOHrlROLsh4fzkDAvyb1', created_at=1741724879, error=Error(code=None, message=None, param=None), fine_tuned_model=None, finished_at=None, hyperparameters=Hyperparameters(batch_size=3, learning_rate_multiplier=0.3, n_epochs=3), model='gpt-4o-mini-2024-07-18', object='fine_tuning.job', organization_id='org-bgSJ5OvwR04OVBpr61Q2jq7K', result_files=[], seed=338309469, status='validating_files', trained_tokens=None, training_file='file-TLoZTPyqSHzprWM2vPEdLH', validation_file='file-LT8NjQrKWvr2N9NWMyHMJZ', estimated_finish=None, integrations=[], metadata=None, method=Method(dpo=None, supervised=MethodSupervised(hyperparameters=MethodSupervisedHyperparameters(batch_size=3, learning_rate_multiplier=0.3, n_epochs=3)), type='supervised'), user_provided_suffix=None)
Training Status: validating_files


In [61]:
openai.fine_tuning.jobs.retrieve(job_id)

FineTuningJob(id='ftjob-nAHqCOHrlROLsh4fzkDAvyb1', created_at=1741724879, error=Error(code=None, message=None, param=None), fine_tuned_model=None, finished_at=None, hyperparameters=Hyperparameters(batch_size=3, learning_rate_multiplier=0.3, n_epochs=3), model='gpt-4o-mini-2024-07-18', object='fine_tuning.job', organization_id='org-bgSJ5OvwR04OVBpr61Q2jq7K', result_files=[], seed=338309469, status='validating_files', trained_tokens=None, training_file='file-TLoZTPyqSHzprWM2vPEdLH', validation_file='file-LT8NjQrKWvr2N9NWMyHMJZ', estimated_finish=None, integrations=[], metadata=None, method=Method(dpo=None, supervised=MethodSupervised(hyperparameters=MethodSupervisedHyperparameters(batch_size=3, learning_rate_multiplier=0.3, n_epochs=3)), type='supervised'), user_provided_suffix=None)

In [62]:
openai.fine_tuning.jobs.cancel(job_id)

BadRequestError: Error code: 400 - {'error': {'message': 'Job has already completed: ftjob-nAHqCOHrlROLsh4fzkDAvyb1', 'type': 'invalid_request_error', 'param': 'fine_tune_id', 'code': 'invalid_cancel'}}

In [63]:
result = openai.fine_tuning.jobs.list()

In [64]:
fine_tuned_model = result.data[0].fine_tuned_model

In [65]:
print(fine_tuned_model)

ft:gpt-4o-mini-2024-07-18:personal::BA15GnWM


In [69]:
completion = openai.chat.completions.create(
  model = fine_tuned_model,
  messages=[
    {"role": "system", "content": "You are python expert and you provide answer only based on given context."},
    {"role": "user", "content": "generate code to print"}
  ]
)
print(completion.choices[0].message.content)

Please provide more context or specify what you want to print so I can assist you appropriately.


### Fine Tuning gemma

In [119]:
from transformers import AutoModelForCausalLM, AutoTokenizer, BitsAndBytesConfig
import torch
from transformers import pipeline


In [120]:
print("CUDA Available:", torch.cuda.is_available())

CUDA Available: True


In [121]:
model_name = "TinyLlama/TinyLlama-1.1B-Chat-v1.0"

bnb_config = BitsAndBytesConfig(
    load_in_4bit=True, 
    bnb_4bit_compute_dtype=torch.float16,  
    bnb_4bit_use_double_quant=True
)


In [123]:
model = AutoModelForCausalLM.from_pretrained(
    model_name,
    quantization_config=bnb_config,
    device_map="auto",
)

tokenizer = AutoTokenizer.from_pretrained(model_name)

ValueError: Some modules are dispatched on the CPU or the disk. Make sure you have enough GPU RAM to fit the quantized model. If you want to dispatch the model on the CPU or the disk while keeping these modules in 32-bit, you need to set `llm_int8_enable_fp32_cpu_offload=True` and pass a custom `device_map` to `from_pretrained`. Check https://huggingface.co/docs/transformers/main/en/main_classes/quantization#offload-between-cpu-and-gpu for more details. 

In [69]:
tokenizer.pad_token = tokenizer.eos_token

In [64]:
prompt_model_copy = """
    ### User's questions:
    {query}

    ### Python version: 
    {python_version}
    
    ** Instructions **
    - If user asks you to generate code and by using context you cannot do it, then generate it on your own
    - If there is a code in your output explain this code to the user step by step
    - Do not answer any other question than about python programming language
    - If topic is complex provide summary at the end of your answer
    """


In [68]:

def generate_response(prompt, max_new_tokens=300):
    prompt = prompt_model.format(context="",query = prompt,python_version = "3.10")
    inputs = tokenizer(prompt, return_tensors="pt").to("cuda")

    with torch.no_grad():
        output = model.generate(
            **inputs,
            max_new_tokens=max_new_tokens,
            do_sample=True, 
            temperature=0.7, 
            top_p=0.9,
            pad_token_id=tokenizer.eos_token_id  
        )
    return tokenizer.decode(output[0], skip_special_tokens=True)

print(generate_response("How can i open file in Python?"))


    ### Context:
    
    
    ### User's questions:
    How can i open file in Python?

    ### Python version: 
    3.10
    
    ** Instructions **
    - If user asks you to generate code and by using context you cannot do it, then generate it on your own
    - If user doesn't ask to generate code and the context does not contain answer for query say "I don't have sufficient knowledge to answer this question".
    - If the user's question does not specify Python, rephrase it internally as a Python-related question before answering.
    - If there is a code in your output explain this code to the user step by step
    - Do not answer any other question than about python programming language
    - If topic is complex provide summary at the end of your answer
    - Do not make up any information
    - Provide consise and structured answer
    
    ### Answer:
    To open a file in Python, you can use the `open()` function. Here's an example:
    
    ```python
    with open('filename.

In [75]:
lora_config = LoraConfig(
    r=2,
    lora_alpha=32,
    target_modules=["q_proj", "v_proj"],
    lora_dropout=0.05,
    bias="none", 
    task_type="CAUSAL_LM"
)

In [76]:
model = get_peft_model(model, lora_config)
model.print_trainable_parameters()

trainable params: 281,600 || all params: 1,100,329,984 || trainable%: 0.0256


In [104]:
sample_text = "\n".join([f"{msg['role'].capitalize()}: {msg['content']}" for msg in train_data[0]["messages"]])

In [None]:
tokenized = tokenizer(sample_text)
print(f"Number of tokens: {len(tokenized['input_ids'])}")

In [None]:
train_data = load_dataset("json", data_files="mbpp_finetune.jsonl")["train"]
valid_data = load_dataset("json", data_files="mbpp_finetune_val.jsonl")["train"]

In [92]:
# print(train_data[0]["messages"][0]["content"])

In [100]:
def tokenize_function(example):
    messages = example["messages"]

    if isinstance(messages, list) and isinstance(messages[0], list):
        messages = messages[0]

    full_text = "\n".join([f"{msg['role'].capitalize()}: {msg['content']}" for msg in messages])

    return tokenizer(full_text, truncation=True, padding="max_length", max_length=1496)

tokenized_train = train_data.map(tokenize_function, batched=True)
tokenized_valid = valid_data.map(tokenize_function, batched=True)

Map:   0%|          | 0/1496 [00:00<?, ? examples/s]

ArrowInvalid: Column 1 named input_ids expected length 1000 but got length 1496