In [17]:
# Set up OpenAI API call function

import dotenv
import os
import time
import openai
from openai import OpenAI

# Set up the OpenAI API key.
# load_dotenv() reads a local .env file (if present) into the process
# environment, so the key is never hard-coded in the notebook.
dotenv.load_dotenv()
openai.api_key = os.getenv("OPENAI_GPT4_API_KEY")
openai_client = OpenAI(api_key=openai.api_key)

# Model identifiers used in this notebook.
# "text-embedding-ada-003" is not a published OpenAI model name; the ada
# embedding model is "text-embedding-ada-002".
EMBEDDING_MODEL = "text-embedding-ada-002"
GPT_MODEL = "gpt-4-0125-preview"

# for bulk openai message, no stream
def chat_openai(prompt="Tell me to ask you a prompt", system_prompt="You are a helpful assistant.", model=GPT_MODEL, chat_history=None, stop_sequences=None, max_tokens_to_sample=2000, temperature=0, **kwargs):
    """Send one non-streaming chat completion request and return the reply text.

    Args:
        prompt: Content of the final user message.
        system_prompt: Content of the leading system message.
        model: Model name passed to the API.
        chat_history: Optional earlier messages, inserted between the system
            message and the new user message. Assumed to already be in the
            chat-completions ``{"role": ..., "content": ...}`` form -- TODO
            confirm against callers. ``None`` or an empty list means no history.
        stop_sequences: Optional stop strings. ``None`` or an empty list is
            sent as ``None`` because the API rejects an empty list.
        max_tokens_to_sample: Sent to the API as ``max_tokens``.
        temperature: Sampling temperature.
        **kwargs: Extra request fields forwarded unchanged; they override the
            fields built here when the names collide.

    Returns:
        The ``content`` of the first choice's message.
    """
    raw_request = {
        "model": model,
        "temperature": temperature,
        "max_tokens": max_tokens_to_sample,
        "stop": stop_sequences or None,  # API doesn't like empty list
        **kwargs
    }

    # Previously chat_history was accepted but never sent, so prior turns were
    # silently dropped. Copy it so the caller's list is never mutated.
    history = list(chat_history) if chat_history else []
    messages = [
        {"role": "system", "content": system_prompt},
        *history,
        {"role": "user", "content": prompt},
    ]
    response = openai_client.chat.completions.create(**{"messages": messages, **raw_request})

    completion = response.choices[0].message.content

    return completion

In [18]:
# Evaluation
import random

# Accuracy
def multiply(a, b):
    """Return the product of ``a`` and ``b``."""
    product = a * b
    return product

def evaluate_pred_true(pred, a, b):
    """Ask the chat model whether ``pred`` matches the true product of ``a`` and ``b``.

    The prediction is free-form text, so the comparison is delegated to the
    model, which is told to answer only 'Y' or 'N'.

    Args:
        pred: The predicted answer text to judge.
        a: First operand.
        b: Second operand.

    Returns:
        True if the judge's reply is an affirmative ('Y' / 'Yes'), else False.
    """
    prompt = f"Is the predicted answer the same as the true answer? Respond only with 'Y' or 'N'. Predicted answer: {pred}. True answer: {multiply(a, b)}."
    res = chat_openai(prompt)

    # Logging
    print(f"Prompt: {prompt} \n\n Response: {res}\n\n")

    # Normalize before comparing: an exact match on 'y' would count a correct
    # answer as wrong if the reply carries whitespace, quotes or a trailing
    # period ("Y\n", "'Y'", "Y."), and would crash on a None reply.
    verdict = (res or "").strip().strip(".'\"").strip().lower()
    return verdict in ("y", "yes")

def evaluate(i, num_samples=10, prompt='''To multiply two numbers, start by multiplying the rightmost digit of the multiplicand by each digit of the multiplier, writing down the products and carrying over any remainders. Repeat this process for each digit of the multiplicand, and then add up all the partial products to obtain the final result.\n'''):
    """Estimate the chat model's accuracy on i-digit by i-digit multiplication.

    For each sample, two random i-digit integers are drawn, the model is asked
    for their product, and the answer is judged by ``evaluate_pred_true``.

    Args:
        i: Number of digits for both operands; must be at least 1.
        num_samples: Number of random problems to evaluate; must be at least 1.
        prompt: Text prepended to every question. The default is a
            natural-language description of long multiplication; pass ``""``
            to ask the bare question.

    Returns:
        The fraction of samples judged correct, as a float in [0, 1].

    Raises:
        ValueError: If ``i`` or ``num_samples`` is less than 1.
    """
    # Ensure i is positive and at least 1
    if i < 1:
        raise ValueError("i must be at least 1")

    # Without this check, num_samples == 0 hits a ZeroDivisionError at the
    # final division and a negative value silently returns -0.0.
    if num_samples < 1:
        raise ValueError("num_samples must be at least 1")

    # Bounds of the i-digit range, e.g. i=3 -> [100, 999]
    lower_bound = 10**(i-1)
    upper_bound = 10**i - 1

    num_correct = 0
    for _ in range(num_samples):
        # Generate random integers a and b within the specified bounds
        a = random.randint(lower_bound, upper_bound)
        b = random.randint(lower_bound, upper_bound)

        # Predict the product of a and b
        pred = chat_openai(f'''{prompt}Question: what's {a} times {b}? Answer''')

        # Evaluate the predicted answer
        if evaluate_pred_true(pred, a, b):
            num_correct += 1

    return num_correct / num_samples

# TODO: Local errors -- how this metric is computed is not clear from the code; ask the authors.

In [19]:
# Accuracy with the default explanatory prompt, for 1- through 5-digit operands.
few_shot_acc = [evaluate(num_digits) for num_digits in range(1, 6)]

print(few_shot_acc)

Prompt: Is the predicted answer the same as the true answer? Respond only with 'Y' or 'N'. Predicted answer: To find the product of 7 times 6, you can simply multiply the two numbers directly:

7 × 6 = 42

So, the answer is 42.. True answer: 42. 

 Response: Y


Prompt: Is the predicted answer the same as the true answer? Respond only with 'Y' or 'N'. Predicted answer: To multiply 8 by 3, you simply multiply the two numbers directly since they are both single digits:

8 times 3 equals 24.. True answer: 24. 

 Response: Y


Prompt: Is the predicted answer the same as the true answer? Respond only with 'Y' or 'N'. Predicted answer: To multiply 8 by 5, you simply multiply these two numbers directly since they are single digits:

\[ 8 \times 5 = 40 \]

So, the answer is 40.. True answer: 40. 

 Response: Y


Prompt: Is the predicted answer the same as the true answer? Respond only with 'Y' or 'N'. Predicted answer: The answer is 1.. True answer: 1. 

 Response: Y


Prompt: Is the predicted

In [None]:
# Accuracy with no prompt prefix (bare question), for 1- through 5-digit operands.
zero_shot_acc = [evaluate(num_digits, prompt="") for num_digits in range(1, 6)]

print(zero_shot_acc)

[1.0, 0.3, 0.0, 0.0, 0.0]
