CS 520: Exercise 1  
Author: Ben Wei  

For this exercise, I chose to use Gemini and ChatGPT. This code accesses both LLMs through API keys. Because the code is uploaded to GitHub and I'd like my keys to remain private, you must supply your own keys for the code to run. The results are also posted separately in the repository.

In [None]:
## Initialization
# Sets the API credentials, loads the `evalplus/humanevalplus` dataset and
# creates one client object per LLM provider.
# Statement order matters: the Gemini key is exported before genai.Client()
# is constructed, and `datasets` is installed before it is imported.

import os

# Paste your Gemini API Key here
# NOTE(review): genai.Client() below is called without arguments, so it
# presumably picks the key up from this environment variable -- confirm
# against the google-genai documentation.
os.environ['GEMINI_API_KEY'] = 'YOUR-KEY-HERE'

# Paste your ChatGPT API Key here
# This value is passed explicitly to the OpenAI client below.
# Do not commit a real key to the repository.
OPENAI_KEY = "YOUR-KEY-HERE"

# Load the dataset
# `!pip ...` is an IPython/Jupyter shell escape, so this cell only runs
# inside a notebook, not as a plain Python script.
!pip install datasets
from datasets import load_dataset
dataset = load_dataset("evalplus/humanevalplus")

# Initialize LLM APIs
from google import genai
from openai import OpenAI

# `client` is used for the Gemini calls and `GPT_client` for the GPT calls.
client = genai.Client()
GPT_client = OpenAI(api_key=OPENAI_KEY)

In [None]:
## Generate responses for each prompt from each model
# Fills two nested lists, both indexed as [problem][prompting strategy]:
#   responses     -> raw text returned by Gemini
#   GPT_responses -> raw text returned by GPT
responses = []
GPT_responses = []

# Index of the first dataset problem to use; problems offset..offset+9 are run.
offset = 40

# Use Self-Planning and Self-Edit
prefix0 = "Please solve the following coding problem and format your code with markdown.\n"

prefix1 = "For the following functions, write numbered steps before implementing them.\n\n"
prefix2 = "After writing the code, check for errors and fix them if found.\n\n"

prefixes = [prefix1, prefix2]

# One generation per prompting strategy. Derived from `prefixes` (instead of
# a separate literal) so the loop bound can never exceed the number of
# available prefixes.
k = len(prefixes)

for i in range(10):
    # The task text is the same for every strategy, so look it up once.
    task_prompt = dataset["test"][i + offset]["prompt"]

    # Register the per-problem rows before filling them so that answers
    # collected so far are kept if a later API call raises.
    gemini_answers = []
    gpt_answers = []
    responses.append(gemini_answers)
    GPT_responses.append(gpt_answers)

    for j in range(k):
        # Both models receive exactly the same prompt text.
        full_prompt = prefix0 + prefixes[j] + task_prompt

        # Prompt Gemini
        response = client.models.generate_content(
            model="gemini-2.0-flash",
            contents=full_prompt,
        )
        gemini_answers.append(response.text)

        # Prompt GPT
        gpt_response = GPT_client.responses.create(
            model="gpt-4",
            input=full_prompt,
        )
        gpt_answers.append(gpt_response.output_text)

In [None]:
## Test responses for each prompt
import re


def _passes_tests(response_text, problem):
    """Return True if the code in a model response passes a problem's tests.

    The first ```python fenced block in ``response_text`` is executed in a
    fresh namespace, followed by the problem's test code; the ``check``
    function defined by that test code is then called with the function
    named by ``problem["entry_point"]``.

    Args:
        response_text: Raw text returned by the model.
        problem: Dataset row providing the "test" and "entry_point" fields.

    Returns:
        True when everything runs without raising; False when the response
        has no ```python block or when anything raises an ``Exception``
        (including a failed assertion inside ``check``).

    SECURITY: this runs model-generated code with ``exec`` and the full
    privileges of the notebook process. Only run it in a disposable or
    sandboxed environment.
    """
    namespace = {}
    try:
        match = re.search(r"```python(.*?)```", response_text, re.DOTALL)
        if match is None:
            # No fenced Python block to run: count as a failed attempt.
            return False
        exec(match.group(1), namespace)
        exec(problem["test"], namespace)
        namespace["check"](namespace[problem["entry_point"]])
    except Exception:
        # Generated code can fail in arbitrary ways, and a failing test is
        # reported as an AssertionError, so any Exception means "not passed".
        return False
    return True


# Pass/fail tables indexed as [problem][prompting strategy].
results = []
GPT_results = []

for i in range(10):
    # Looked up outside the helper so that a wrong index or a missing
    # response raises here instead of being recorded as a test failure.
    problem = dataset["test"][i + offset]

    gemini_row = []
    gpt_row = []
    results.append(gemini_row)
    GPT_results.append(gpt_row)

    for j in range(k):
        # Evaluate Gemini Responses
        gemini_row.append(_passes_tests(responses[i][j], problem))

        # Evaluate ChatGPT Responses
        gpt_row.append(_passes_tests(GPT_responses[i][j], problem))


In [None]:
# Print Results
# Gemini's pass/fail table first, then GPT's.
for outcome_table in (results, GPT_results):
    print(outcome_table)

In [None]:
# Optional: Print Prompts
# Shows the exact text sent to the models: instruction prefix, strategy
# prefix and the dataset task, for every problem / strategy combination.
for problem_idx in range(10):
    for strategy_idx in range(k):
        task_text = dataset["test"][problem_idx + offset]["prompt"]
        full_text = prefix0 + prefixes[strategy_idx] + task_text
        print(full_text)

In [None]:
# Optional: Print Responses
# For each problem and strategy, dump Gemini's raw answer followed by GPT's.
for problem_idx in range(10):
    for strategy_idx in range(k):
        gemini_text = responses[problem_idx][strategy_idx]
        print(gemini_text)
        gpt_text = GPT_responses[problem_idx][strategy_idx]
        print(gpt_text)

In [None]:
# Analysis of HumanEval/46
# Scratch cell for manual inspection. Uncomment a line below to print the
# task prompt, its test code, or one of the stored model answers for dataset
# problem 46 (stored at index 46-offset of the response lists; the second
# index selects the prompting strategy).
import traceback

#print(dataset["test"][46]["prompt"])
#print(dataset["test"][46]["test"])

#print(responses[46-offset][0])
#print(responses[46-offset][1])
#print(GPT_responses[46-offset][0])
#print(GPT_responses[46-offset][1])

In [None]:
## Results of HumanEval 46 without typo
# Re-asks both models with a hand-edited copy of the task prompt (per the
# heading, a typo from the dataset prompt is corrected) and grades the new
# answers against the unchanged tests of dataset problem 46.
import re


def _passes_tests(response_text, problem):
    """Return True if the code in a model response passes a problem's tests.

    The first ```python fenced block in ``response_text`` is executed in a
    fresh namespace, followed by the problem's test code; the ``check``
    function defined by that test code is then called with the function
    named by ``problem["entry_point"]``.

    Args:
        response_text: Raw text returned by the model.
        problem: Dataset row providing the "test" and "entry_point" fields.

    Returns:
        True when everything runs without raising; False when the response
        has no ```python block or when anything raises an ``Exception``
        (including a failed assertion inside ``check``).

    SECURITY: this runs model-generated code with ``exec`` and the full
    privileges of the notebook process. Only run it in a disposable or
    sandboxed environment.
    """
    namespace = {}
    try:
        match = re.search(r"```python(.*?)```", response_text, re.DOTALL)
        if match is None:
            # No fenced Python block to run: count as a failed attempt.
            return False
        exec(match.group(1), namespace)
        exec(problem["test"], namespace)
        namespace["check"](namespace[problem["entry_point"]])
    except Exception:
        # Generated code can fail in arbitrary ways, and a failing test is
        # reported as an AssertionError, so any Exception means "not passed".
        return False
    return True


prompt = '''def fib4(n: int):
    """The Fib4 number sequence is a sequence similar to the Fibbonacci sequence that's defined as follows:
    fib4(0) -> 0
    fib4(1) -> 0
    fib4(2) -> 2
    fib4(3) -> 0
    fib4(n) -> fib4(n-1) + fib4(n-2) + fib4(n-3) + fib4(n-4).
    Please write a function to efficiently compute the n-th element of the fib4 number sequence.  Do not use recursion.
    >>> fib4(5)
    4
    >>> fib4(6)
    8
    >>> fib4(7)
    14
    """
'''

# Raw model answers, one entry per prompting strategy.
new46_responses = []
new46_gpt_responses = []

for j in range(k):
    # Both models receive exactly the same prompt text.
    full_prompt = prefix0 + prefixes[j] + prompt

    # Prompt Gemini
    response = client.models.generate_content(
        model="gemini-2.0-flash",
        contents=full_prompt,
    )
    new46_responses.append(response.text)

    # Prompt GPT
    gpt_response = GPT_client.responses.create(
        model="gpt-4",
        input=full_prompt,
    )
    new46_gpt_responses.append(gpt_response.output_text)

# Pass/fail flags, one entry per prompting strategy.
new46_results = []
new46_gpt_results = []

# Tests and entry point still come from the dataset; only the prompt changed.
problem_46 = dataset["test"][46]

for j in range(k):
    # Evaluate Gemini Responses
    new46_results.append(_passes_tests(new46_responses[j], problem_46))

    # Evaluate ChatGPT Responses
    new46_gpt_results.append(_passes_tests(new46_gpt_responses[j], problem_46))

print(new46_results)
print(new46_gpt_results)

In [None]:
# Analysis of HumanEval/48
# Scratch cell for manual inspection. It prints the task prompt of dataset
# problem 48; uncomment a line below to also print its test code or one of
# the stored model answers (stored at index 48-offset of the response lists;
# the second index selects the prompting strategy).
import traceback

print(dataset["test"][48]["prompt"])
#print(dataset["test"][48]["test"])

#print(responses[48-offset][0])
#print(responses[48-offset][1])
#print(GPT_responses[48-offset][0])
#print(GPT_responses[48-offset][1])

In [None]:
## Results of HumanEval 48 when adding a hint about whitespace
# Re-asks both models for the palindrome task with one extra instruction
# line (`newprefix`) and grades the answers against the tests of dataset
# problem 48.
# NOTE(review): this heading used to quote the hint as 'Consider all possible
# inputs.', but the text actually sent is the whitespace hint in `newprefix`
# below -- confirm which wording the reported results were produced with.
import re


def _passes_tests(response_text, problem):
    """Return True if the code in a model response passes a problem's tests.

    The first ```python fenced block in ``response_text`` is executed in a
    fresh namespace, followed by the problem's test code; the ``check``
    function defined by that test code is then called with the function
    named by ``problem["entry_point"]``.

    Args:
        response_text: Raw text returned by the model.
        problem: Dataset row providing the "test" and "entry_point" fields.

    Returns:
        True when everything runs without raising; False when the response
        has no ```python block or when anything raises an ``Exception``
        (including a failed assertion inside ``check``).

    SECURITY: this runs model-generated code with ``exec`` and the full
    privileges of the notebook process. Only run it in a disposable or
    sandboxed environment.
    """
    namespace = {}
    try:
        match = re.search(r"```python(.*?)```", response_text, re.DOTALL)
        if match is None:
            # No fenced Python block to run: count as a failed attempt.
            return False
        exec(match.group(1), namespace)
        exec(problem["test"], namespace)
        namespace["check"](namespace[problem["entry_point"]])
    except Exception:
        # Generated code can fail in arbitrary ways, and a failing test is
        # reported as an AssertionError, so any Exception means "not passed".
        return False
    return True


prompt = '''def is_palindrome(text: str):
    """
    Checks if given string is a palindrome
    >>> is_palindrome('')
    True
    >>> is_palindrome('aba')
    True
    >>> is_palindrome('aaaaa')
    True
    >>> is_palindrome('zbcd')
    False
    """
'''

# Extra instruction inserted between the strategy prefix and the task.
newprefix = "Consider whitespace when evaluating palindromes.\n"

# Raw model answers, one entry per prompting strategy.
new48_responses = []
new48_gpt_responses = []

for j in range(k):
    # Both models receive exactly the same prompt text.
    full_prompt = prefix0 + prefixes[j] + newprefix + prompt

    # Prompt Gemini
    response = client.models.generate_content(
        model="gemini-2.0-flash",
        contents=full_prompt,
    )
    new48_responses.append(response.text)

    # Prompt GPT
    gpt_response = GPT_client.responses.create(
        model="gpt-4",
        input=full_prompt,
    )
    new48_gpt_responses.append(gpt_response.output_text)

# Pass/fail flags, one entry per prompting strategy.
new48_results = []
new48_gpt_results = []

# Tests and entry point still come from the dataset; only the prompt changed.
problem_48 = dataset["test"][48]

for j in range(k):
    # Evaluate Gemini Responses
    new48_results.append(_passes_tests(new48_responses[j], problem_48))

    # Evaluate ChatGPT Responses
    new48_gpt_results.append(_passes_tests(new48_gpt_responses[j], problem_48))

print(new48_results)
print(new48_gpt_results)

In [None]:
# Innovation / Experimentation
# Repeats the generate-and-test pipeline with an extra persona line at the
# start of `prefix0`. This cell reassigns `responses`, `GPT_responses`,
# `results` and `GPT_results`, so the values from any earlier run are replaced.
import re


def _passes_tests(response_text, problem):
    """Return True if the code in a model response passes a problem's tests.

    The first ```python fenced block in ``response_text`` is executed in a
    fresh namespace, followed by the problem's test code; the ``check``
    function defined by that test code is then called with the function
    named by ``problem["entry_point"]``.

    Args:
        response_text: Raw text returned by the model.
        problem: Dataset row providing the "test" and "entry_point" fields.

    Returns:
        True when everything runs without raising; False when the response
        has no ```python block or when anything raises an ``Exception``
        (including a failed assertion inside ``check``).

    SECURITY: this runs model-generated code with ``exec`` and the full
    privileges of the notebook process. Only run it in a disposable or
    sandboxed environment.
    """
    namespace = {}
    try:
        match = re.search(r"```python(.*?)```", response_text, re.DOTALL)
        if match is None:
            # No fenced Python block to run: count as a failed attempt.
            return False
        exec(match.group(1), namespace)
        exec(problem["test"], namespace)
        namespace["check"](namespace[problem["entry_point"]])
    except Exception:
        # Generated code can fail in arbitrary ways, and a failing test is
        # reported as an AssertionError, so any Exception means "not passed".
        return False
    return True


## Generate responses for each prompt from each model
# Both lists are indexed as [problem][prompting strategy].
responses = []
GPT_responses = []

# Index of the first dataset problem to use; problems offset..offset+9 are run.
offset = 40

# Use Self-Planning and Self-Edit
prefix0 = "You are an Expert Python Programmer.\nPlease solve the following coding problem and format your code with markdown.\n"

prefix1 = "For the following functions, write numbered steps before implementing them.\n\n"
prefix2 = "After writing the code, check for errors and fix them if found.\n\n"

prefixes = [prefix1, prefix2]

# One generation per prompting strategy. Derived from `prefixes` (instead of
# a separate literal) so the loop bound can never exceed the number of
# available prefixes.
k = len(prefixes)

for i in range(10):
    # The task text is the same for every strategy, so look it up once.
    task_prompt = dataset["test"][i + offset]["prompt"]

    # Register the per-problem rows before filling them so that answers
    # collected so far are kept if a later API call raises.
    gemini_answers = []
    gpt_answers = []
    responses.append(gemini_answers)
    GPT_responses.append(gpt_answers)

    for j in range(k):
        # Both models receive exactly the same prompt text.
        full_prompt = prefix0 + prefixes[j] + task_prompt

        # Prompt Gemini
        response = client.models.generate_content(
            model="gemini-2.0-flash",
            contents=full_prompt,
        )
        gemini_answers.append(response.text)

        # Prompt GPT
        gpt_response = GPT_client.responses.create(
            model="gpt-4",
            input=full_prompt,
        )
        gpt_answers.append(gpt_response.output_text)

## Test responses for each prompt
# Pass/fail tables indexed as [problem][prompting strategy].
results = []
GPT_results = []

for i in range(10):
    # Looked up outside the helper so that a wrong index or a missing
    # response raises here instead of being recorded as a test failure.
    problem = dataset["test"][i + offset]

    gemini_row = []
    gpt_row = []
    results.append(gemini_row)
    GPT_results.append(gpt_row)

    for j in range(k):
        # Evaluate Gemini Responses
        gemini_row.append(_passes_tests(responses[i][j], problem))

        # Evaluate ChatGPT Responses
        gpt_row.append(_passes_tests(GPT_responses[i][j], problem))

# Print Results
print(results)
print(GPT_results)
