In [None]:
!pip install openai

In [None]:
import openai
import os
import json
import time
from sklearn.metrics import classification_report, accuracy_score, recall_score, f1_score, precision_score

# Set API Key
# The key is read from the OPENAI_API_KEY environment variable so that no
# secret is ever stored in the notebook itself. Export it before running.
openai.api_key = os.environ.get("OPENAI_API_KEY")
if not openai.api_key:
    # Fail early with a clear message instead of on the first API request.
    raise RuntimeError(
        "OPENAI_API_KEY is not set; export it in the environment before running."
    )

# Read the .jsonl file (one JSON object per line)
# Use test_data.jsonl from GPT-3.5 fine-tuning code
# The encoding is explicit so the file is decoded the same way on every
# platform; blank lines are skipped because json.loads rejects empty input.
with open('test_data.jsonl', 'r', encoding='utf-8') as file:
    data = [json.loads(line) for line in file if line.strip()]

# Lists to store actual and predicted values
actual = []
predicted = []
# Examples that were successfully scored, index-aligned with the two lists above
evaluated = []

# NOTE: openai.ChatCompletion is the pre-1.0 SDK interface; this code needs
# an openai package older than 1.0 to run.
# Go through each example and predict
for example in data:
    # Extract messages without the assistant's message (the expected label)
    messages = example["messages"][:-1]

    try:
        response = openai.ChatCompletion.create(
            model="gpt-3.5-turbo",
            messages=messages,
            temperature=0
        )
        # Strip surrounding whitespace so a reply such as "True\n" still matches
        predicted_response = response["choices"][0]["message"]["content"].strip().lower()

        # Treat any reply that is not exactly 'true' or 'false' as 'false'
        if predicted_response not in ['true', 'false']:
            predicted_response = 'false'

        actual_response = example["messages"][-1]["content"].strip().lower()
    except Exception as e:
        # Best-effort evaluation: report the failure and leave this example unscored
        print("Error for example:", messages)
        print(e)
    else:
        # Record all three together, only once nothing can fail any more, so
        # the lists never get out of step with each other.
        predicted.append(predicted_response)
        actual.append(actual_response)
        evaluated.append(example)

    time.sleep(1)  # Wait for 1 second before the next request

# Failed examples were skipped above; drop them from data as well so that
# zip(actual, predicted, data) pairs every label with its own example.
data = evaluated

# Metrics
# "true" is the positive class for the binary scores below.
if actual:
    print("Accuracy:", accuracy_score(actual, predicted))
    print("F1-score:", f1_score(actual, predicted, pos_label="true"))
    print("Precision:", precision_score(actual, predicted, pos_label="true"))
    print("Recall:", recall_score(actual, predicted, pos_label="true"))
    # labels is passed explicitly so the report always has both rows; without
    # it, target_names must match the number of classes actually present and
    # the call fails when only one class occurs.
    print(classification_report(
        actual,
        predicted,
        labels=["false", "true"],
        target_names=["false", "true"],
    ))
else:
    # Every request failed (or the data file was empty): nothing to score.
    print("No examples were evaluated; skipping metrics.")

# Displaying examples
# Bucket every example by whether the prediction equals the expected label.
buckets = {True: [], False: []}
for truth, guess, sample in zip(actual, predicted, data):
    buckets[truth == guess].append(sample)

correct = buckets[True]
incorrect = buckets[False]

# Print each group under its own heading, one example per line.
for heading, group in (
    ("\nExamples GPT-3.5 got right:", correct),
    ("\nExamples GPT-3.5 got wrong:", incorrect),
):
    print(heading)
    for sample in group:
        print(sample)
