In [None]:
# Project Overview

This project benchmarks **AI models for quantum computing code generation**, specifically **OpenAI’s GPT-4o** (`gpt-4o`) and **Anthropic’s Claude 3.5 Sonnet** (`claude-3-5-sonnet-20240620`). The models are prompted to generate quantum computing code with beginner-friendly explanations. The project measures performance metrics such as **latency, token usage, and throughput**, and presents results in structured JSON and Markdown tables for easy comparison.

## Key Features

- Generates **quantum computing code with explanations** for educational purposes  
- Measures **inference performance**: latency, total tokens, tokens per second, and prompt vs completion tokens  
- Produces **structured JSON output** for analysis or visualization  
- Displays benchmarking results in **Markdown tables** for readability  

## Tech Stack

- **Models**: GPT-4o (`gpt-4o`), Claude 3.5 Sonnet (`claude-3-5-sonnet-20240620`)  
- **Language**: Python  
- **Libraries**: OpenAI, Anthropic, time, json, IPython.display, tabulate  

## Purpose

Compare LLM performance across models, highlighting latency, token usage, and throughput in a structured benchmark.


In [None]:
import os
from dotenv import load_dotenv
from openai import OpenAI
import google.generativeai
import anthropic
from IPython.display import Markdown, display, update_display
import gradio as gr



In [None]:
# Load variables from a local .env file; override=True lets the file's values
# replace any that are already present in the process environment.
load_dotenv(override=True)
openai_api_key = os.getenv('OPENAI_API_KEY')
# The variable name must be spelled ANTHROPIC_API_KEY; a misspelled name makes
# the check below report the key as missing even when it is configured.
anthropic_api_key = os.getenv('ANTHROPIC_API_KEY')

# Sanity check only: show a short prefix so the full secret never lands in the
# notebook output.
if openai_api_key:
    print(f"OpenAI API Key exists and begins {openai_api_key[:8]}")
else:
    print("OpenAI API Key not set")
if anthropic_api_key:
    print(f"Anthropic API Key exists and begins {anthropic_api_key[:7]}")
else:
    print("Anthropic API Key not set")
    
    



In [77]:
# Model identifiers used for both code generation and benchmarking.
# Alternative model choices (disabled):
# OPENAI_MODEL = "gpt-4.1-mini"
# CLAUDE_MODEL = "claude-3-5-haiku-latest"
OPENAI_MODEL = "gpt-4o"
CLAUDE_MODEL = "claude-3-5-sonnet-20240620"

# API clients, constructed without explicit arguments; the SDKs are expected to
# pick up credentials from the environment populated by load_dotenv().
claude = anthropic.Anthropic()
openai = OpenAI()

In [None]:
# System prompt shared by both providers. The sentences are joined with a
# single space so they do not run together ("Developer.You write ...") in the
# text that is sent to the models.
system_message = " ".join([
    "You are a Quantum Computing PHD Professor and Developer.",
    "You write quantum computing code and teach it to middle school students in an easy and digestible way.",
    "Assume your students have never heard or know anything about quantum computing.",
    "Every part of your code has comments and explains clearly what it is doing.",
])

In [None]:

def user_prompt_for():
    """Return the user prompt sent to every model.

    The prompt consists of two sentences separated by a single space so that
    they do not run together in the request text.
    """
    return (
        "Write quantum computing code and explain it in simple terms, using examples. "
        "Add comments to each part of the code so it is very clear and easy to understand."
    )
    

In [None]:

def messages_for():
    """Build the chat message list: the system prompt followed by the user prompt."""
    system_turn = {"role": "system", "content": system_message}
    user_turn = {"role": "user", "content": user_prompt_for()}
    return [system_turn, user_turn]

In [None]:
import json

def generate_openai_code():
    """Send the shared prompt to the OpenAI chat completions endpoint.

    Returns:
        The response object from ``openai.chat.completions.create`` unchanged,
        so callers can read both the generated text and the usage figures.
    """
    request = {
        "model": OPENAI_MODEL,
        "messages": messages_for(),
    }
    return openai.chat.completions.create(**request)

In [None]:
import json

def generate_claude_code():
    """Send the shared prompt to the Anthropic messages endpoint.

    The system prompt is passed through the dedicated ``system`` argument and
    the reply is capped at 2000 output tokens.

    Returns:
        The response object from ``claude.messages.create`` unchanged.
    """
    user_turn = {"role": "user", "content": user_prompt_for()}
    request = {
        "model": CLAUDE_MODEL,
        "max_tokens": 2000,
        "system": system_message,
        "messages": [user_turn],
    }
    return claude.messages.create(**request)

In [None]:
def display_code_markdown(model_name):
    """Generate code with the requested model and render the reply as Markdown.

    Args:
        model_name: A string containing either ``OPENAI_MODEL`` or
            ``CLAUDE_MODEL``. Any other value results in no request and no
            output.
    """
    if OPENAI_MODEL in model_name:
        generated = generate_openai_code().choices[0].message.content
    elif CLAUDE_MODEL in model_name:
        generated = generate_claude_code().content[0].text
    else:
        return
    display(Markdown(generated))
    

In [None]:
# display_code_markdown requires a model name; render the OpenAI model's answer here.
display_code_markdown(OPENAI_MODEL)

In [None]:
# Render the Claude model's generated code and explanation as Markdown.
display_code_markdown(CLAUDE_MODEL)

In [None]:
import time

def _token_usage(usage):
    """Normalize a provider usage object to (prompt, completion, total) counts.

    OpenAI chat completions expose ``prompt_tokens`` / ``completion_tokens`` /
    ``total_tokens``, while Anthropic messages expose ``input_tokens`` /
    ``output_tokens`` and no total. Missing fields count as 0, and a missing
    total is computed as prompt + completion.
    """
    if usage is None:
        return 0, 0, 0

    def first_present(*names):
        # Return the first attribute that exists and is not None.
        for name in names:
            value = getattr(usage, name, None)
            if value is not None:
                return value
        return 0

    prompt_tokens = first_present("prompt_tokens", "input_tokens")
    completion_tokens = first_present("completion_tokens", "output_tokens")
    total_tokens = getattr(usage, "total_tokens", None)
    if total_tokens is None:
        total_tokens = prompt_tokens + completion_tokens
    return prompt_tokens, completion_tokens, total_tokens


def benchmark_model(model_name):
    """Run one generation request against a model and collect timing/token metrics.

    Args:
        model_name: A string containing either ``OPENAI_MODEL`` or
            ``CLAUDE_MODEL``; it selects which provider is called.

    Returns:
        dict with the keys ``model_name``, ``latency_seconds``,
        ``total_tokens``, ``tokens_per_second`` (total tokens divided by
        latency), ``prompt_tokens`` and ``completion_tokens``.

    Raises:
        ValueError: if ``model_name`` matches neither configured model.
    """
    # Pick the generator that actually talks to the requested provider.
    if OPENAI_MODEL in model_name:
        model_name = OPENAI_MODEL
        generate = generate_openai_code
    elif CLAUDE_MODEL in model_name:
        model_name = CLAUDE_MODEL
        generate = generate_claude_code
    else:
        raise ValueError(f"Unknown model: {model_name!r}")

    # Time only the request itself; perf_counter is monotonic, so the measured
    # interval cannot be skewed by wall-clock adjustments.
    start_time = time.perf_counter()
    response = generate()
    latency_seconds = time.perf_counter() - start_time

    prompt_tokens, completion_tokens, total_tokens = _token_usage(
        getattr(response, "usage", None)
    )

    tokens_per_second = total_tokens / latency_seconds if latency_seconds > 0 else 0

    # Structured output
    return {
        "model_name": model_name,
        "latency_seconds": latency_seconds,
        "total_tokens": total_tokens,
        "tokens_per_second": tokens_per_second,
        "prompt_tokens": prompt_tokens,
        "completion_tokens": completion_tokens,
    }

In [None]:
# Single benchmark run for the OpenAI model; the returned metrics dict is the cell output.
benchmark_model(OPENAI_MODEL)

In [None]:
# Single benchmark run for the Claude model; the returned metrics dict is the cell output.
benchmark_model(CLAUDE_MODEL)

In [None]:
!pip install tabulate

In [None]:
from IPython.display import display, Markdown
from tabulate import tabulate 
def display_benchmark_table(models):
    """Benchmark every model in ``models`` and render the metrics as a Markdown table.

    Args:
        models: Iterable of model names accepted by ``benchmark_model``. Each
            entry triggers one benchmark run.
    """
    rows = [benchmark_model(name) for name in models]
    rendered = tabulate(rows, headers="keys", tablefmt="github")
    display(Markdown(rendered))


In [78]:
# Benchmark both models (one fresh request each) and show the metrics as a Markdown table.
display_benchmark_table([OPENAI_MODEL, CLAUDE_MODEL])

| model_name                 |   latency_seconds |   total_tokens |   tokens_per_second |   prompt_tokens |   completion_tokens |
|----------------------------|-------------------|----------------|---------------------|-----------------|---------------------|
| gpt-4o                     |           12.4664 |            854 |             68.5043 |             101 |                 753 |
| claude-3-5-sonnet-20240620 |           13.7089 |            954 |             69.59   |             101 |                 853 |

In [79]:
def display_benchmark_json(models):
    """Benchmark every model in ``models`` and render the metrics as a fenced JSON block.

    Args:
        models: Iterable of model names accepted by ``benchmark_model``. Each
            entry triggers one benchmark run.
    """
    results = []
    for model in models:
        results.append(benchmark_model(model))
    payload = json.dumps(results, indent=2)
    fenced = "```json\n" + payload + "\n```"
    display(Markdown(fenced))

In [80]:
# Benchmark both models again (new requests, so figures differ from the table run) and show them as JSON.
display_benchmark_json([OPENAI_MODEL, CLAUDE_MODEL])

```json
[
  {
    "model_name": "gpt-4o",
    "latency_seconds": 17.660048246383667,
    "total_tokens": 863,
    "tokens_per_second": 48.86736366514291,
    "prompt_tokens": 101,
    "completion_tokens": 762
  },
  {
    "model_name": "claude-3-5-sonnet-20240620",
    "latency_seconds": 14.692603349685669,
    "total_tokens": 902,
    "tokens_per_second": 61.3914347602188,
    "prompt_tokens": 101,
    "completion_tokens": 801
  }
]
```