## Evaluate GPT-3.5-TURBO on embedded code dataset

Author: Matej Vadovic

Year: 2024

In [None]:
from openai import OpenAI
import os
import regex as re
from traintools.lib import DatasetSingleton, evaluate_predictions, remove_comments_from_output
from dotenv import load_dotenv
load_dotenv()
import json

# Build the OpenAI client with the key read from the OPENAI_API_KEY environment
# variable (load_dotenv() above has already been called). Passing api_key
# explicitly mirrors the client's default and could be omitted.
client = OpenAI(api_key=os.getenv("OPENAI_API_KEY"))

## Prepare inputs for model

In [None]:
# Load prepared dataset dictionary.
# Later cells index it by column name ('prompt', 'code') and by row position
# (test_dataset[i]['signature']).
test_dataset = DatasetSingleton.get_test_dataset()

In [None]:
def prepare_inputs(data):
    """Wrap one prompt in a single user-role chat message.

    Args:
        data: Text interpolated into the instruction; the instruction wording
            presents it as the docstring and signature of the function to
            complete.

    Returns:
        A dict with "role" set to "user" and "content" holding the instruction
        text built around ``data``.
    """
    instruction = (
        'Complete the implementation of the function with the following docstring and signature:\n'
        f'{data}\n'
        'Remember, you only need to fill in the body of the function, without altering the provided signature or adding any additional comments.'
    )
    return {"role": "user", "content": instruction}

In [None]:
# One chat message per prompt, limited to the first 2000 prompts
messages = list(map(prepare_inputs, test_dataset['prompt'][:2000]))

## Get outputs from model

In [None]:
# List to store responses.
# The request loop below appends to this list, so re-run this cell before
# re-running the loop; otherwise the new responses are added after the old ones.
responses = []

In [None]:
# Send each prompt as its own single-message conversation and keep the text of
# the first returned choice.
for message in messages:
    response = client.chat.completions.create(
        model="gpt-3.5-turbo",
        messages=[message],
    )
    reply_text = response.choices[0].message.content
    responses.append(reply_text)

## Save outputs to file

In [None]:
# Shallow copy of the raw responses, taken before they are cleaned further down
responses_copy = list(responses)

In [None]:
# Persist the raw responses as a JSON array
with open('chatgpt-responses.json', 'w') as out_file:
    out_file.write(json.dumps(responses))

## Clean outputs for evaluation
- Remove extra comments before and after the code snippet
- Remove signatures, since they are pre-written and therefore not evaluated
- Remove comments inside the code snippet


In [None]:
# Reload the saved responses so cleaning can start from the file on disk
with open('chatgpt-responses.json', 'r') as in_file:
    responses = json.loads(in_file.read())

In [None]:
# Matches the contents of a fenced block opened with ```c, up to the next ```.
# DOTALL lets '.' span newlines; this replaces the former `(?:.|\s)` alternation,
# whose two branches both match spaces/tabs and could backtrack exponentially on
# a response that opens a fence but never closes it.
# NOTE(review): the lookbehind also fires on fences such as ```cpp, which leaves
# the rest of the tag ("pp") at the start of the extracted text -- confirm
# whether such responses occur in practice.
c_fence = re.compile(r"(?<=```c).*?(?=```)", flags=re.DOTALL)

for i, response_text in enumerate(responses):
    # Keep only the text inside the first ```c fence; a response without such a
    # fence is left unchanged.
    fenced = c_fence.search(response_text)
    if fenced is not None:
        responses[i] = fenced.group(0)

# Remove signatures from responses (they are pre-written, so not evaluated) and
# trim the surrounding whitespace left behind.
predictions = [
    r.replace(test_dataset[i]['signature'], '').strip()
    for i, r in enumerate(responses)
]

# Remove comments from the predictions
predictions = [remove_comments_from_output(p) for p in predictions]

## Evaluate outputs

In [None]:
# Reference code snippets: take the first 2000 entries of the 'code' column and
# wrap each one in its own single-element list.
reference_snippets = test_dataset['code'][:2000]
references = [[snippet] for snippet in reference_snippets]

results = evaluate_predictions(
    predictions,
    references,
    'GPT-3.5-turbo',
    DatasetSingleton.get_name(),
    'GPT_CAUSAL',
    False,
)

print(results)