In [1]:
# --- Imports and environment setup ---

import os
import time
import subprocess
from typing import Dict, Tuple, List, Optional, Any

from dotenv import load_dotenv
from litellm import completion, completion_cost  # LLM call + cost helper


In [2]:
# --- Load API keys from .env (keep .env out of version control) ---

load_dotenv(override=True)

# Each value is None when the corresponding variable is not set.
openai_api_key = os.environ.get("OPENAI_API_KEY")
anthropic_api_key = os.environ.get("ANTHROPIC_API_KEY")
gemini_api_key = os.environ.get("GEMINI_API_KEY")

# Fail fast when no provider key at all is configured.
if not (openai_api_key or anthropic_api_key or gemini_api_key):
    raise RuntimeError(
        "No API keys found. Please set at least one of: "
        "OPENAI_API_KEY, ANTHROPIC_API_KEY, GEMINI_API_KEY."
    )



In [3]:
# --- Model configuration and system prompt for code translation ---

# Short model key -> model identifier passed to the completion call.
models = dict(
    gpt="gpt-5",
    claude="anthropic/claude-opus-4-5-20251101",
    gemini="gemini/gemini-2.5-pro",
)

# System prompt: an intro paragraph followed by two bulleted sections.
# Every bullet is rendered as "• <text>\n"; sections are separated by a blank line.
system_message = "".join([
    "You are an AI code generator. Convert Python code into highly optimized C++ that "
    "compiles successfully with Clang on macOS (Apple Silicon).\n\n",
    "=== HARD RULES ===\n",
    *(
        f"• {rule}\n"
        for rule in (
            "Output ONLY valid C++17 source code. No markdown, no backticks, no prose.",
            "DO NOT use '#include <bits/stdc++.h>'. Use portable headers like "
            "<iostream>, <iomanip>, <cmath>, <vector>, <algorithm>.",
            "DO NOT use OpenMP (#include <omp.h> or '#pragma omp ...').",
            "DO NOT output shell commands or lines like 'Compile with: ...'.",
        )
    ),
    "\n",
    "=== REQUIREMENTS ===\n",
    *(
        f"• {requirement}\n"
        for requirement in (
            "Code must compile with: clang++ -std=c++17 -Ofast -mcpu=native -flto=thin -DNDEBUG",
            "Numeric results must match the Python code.",
            "Use fast loops, appropriate floating point types, and only standard headers.",
        )
    ),
])


In [4]:
# --- Prompt construction helpers (Python → C++) ---

def user_prompt_for(python_code: str) -> str:
    """
    Build the user-turn prompt: fixed translation instructions, a blank line,
    then the Python source to convert.
    """
    instructions = " ".join([
        "Rewrite this Python code in C++ with the fastest possible implementation "
        "that produces identical output in the least time.",
        "Respond only with valid C++ code; do not include any natural language "
        "instructions, shell commands, or lines like 'Compile with ...'.",
        "Use comments sparingly and only inside the C++ file.",
        "Pay attention to number types to ensure no int overflows.",
        "Remember to #include all necessary C++ packages such as <iomanip>.",
    ])
    return f"{instructions}\n\n{python_code}"


def messages_for(python_code: str) -> List[Dict[str, str]]:
    """
    Assemble the two-message chat payload: the shared system prompt followed
    by the user prompt generated for `python_code`.
    """
    system_turn = {"role": "system", "content": system_message}
    user_turn = {"role": "user", "content": user_prompt_for(python_code)}
    return [system_turn, user_turn]


def reasoning_effort_for(model_key: str) -> Optional[str]:
    """
    Return the reasoning-effort setting for a model key.

    "gpt" and "claude" get "low"; any other key (e.g. "gemini") gets None,
    meaning the caller should not pass the option at all.
    """
    low_effort_keys = ("gpt", "claude")
    return "low" if model_key in low_effort_keys else None


In [5]:
# --- Shared results store + file helper for generated C++ ---

# One summary dict per attempted model; run_experiment appends to it and
# print_results_table reads it.
results: List[Dict[str, Any]] = list()

def write_output(cpp_code: str, filename: str) -> None:
    """
    Sanitize LLM-generated C++ and write it to `filename` (UTF-8, trailing newline).

    Clean-up steps, applied line by line:
      - drop markdown fence lines regardless of language tag (```cpp, ```c++, ```)
        and strip any fence marker left inside a line
      - replace any '#include <bits/stdc++.h>' with portable standard headers
      - remove OpenMP includes/pragmas
      - remove 'Compile with ...' lines and any line mentioning clang++

    Args:
        cpp_code: Raw model output. None or empty is treated as an empty source,
            so the caller sees an ordinary compile failure instead of a crash here.
        filename: Destination path; an existing file is overwritten.
    """
    # Headers substituted for the GCC-only umbrella header. The first five match
    # the list given in the system prompt; the rest cover what benchmark-style
    # code commonly needs (timing, fixed-width ints, strings, ...).
    portable_headers = (
        "iostream", "iomanip", "cmath", "vector", "algorithm",
        "chrono", "cstdint", "cstdio", "cstdlib", "limits", "numeric", "string",
    )
    fence = "```"

    lines_out = []
    for line in (cpp_code or "").splitlines():
        stripped = line.strip()

        # 1) Markdown fences: drop the whole fence line so the language tag
        #    (e.g. "c++") is not left behind as a bogus source line.
        if stripped.startswith(fence):
            continue
        if fence in line:
            line = line.replace(fence, "")
            stripped = line.strip()

        # 2) Replace the GCC-only header (with or without a space after #include)
        if stripped.startswith("#") and "bits/stdc++.h" in stripped:
            lines_out.extend(f"#include <{header}>" for header in portable_headers)
            continue

        # 3) Drop OpenMP
        if "<omp.h>" in stripped:
            continue
        if stripped.startswith("#pragma omp"):
            continue

        # 4) Drop human instructions / compile hints
        if stripped.startswith("Compile with:") or "clang++" in stripped:
            continue

        lines_out.append(line)

    # Explicit encoding: the output must not depend on the platform default.
    with open(filename, "w", encoding="utf-8") as f:
        f.write("\n".join(lines_out) + "\n")


In [6]:
# --- LLM call: return C++ code + cost (USD) + latency (seconds) ---

def optimize_llm(python_code: str, model_key: str) -> Tuple[str, float, float]:
    """
    Ask one model to translate `python_code` to C++.

    Args:
        python_code: Python source to translate.
        model_key: Key into the `models` mapping (e.g. "gpt", "claude", "gemini").

    Returns:
        (cpp_code, cost_usd, latency_seconds), where latency is the wall-clock
        duration of the completion call only.

    Raises:
        ValueError: `model_key` is not in `models`.
        RuntimeError: the model returned no text content.
        Any exception raised by the completion or cost helpers is propagated.
    """
    if model_key not in models:
        raise ValueError(f"Unknown model key: {model_key}")

    # Only pass reasoning_effort when one is configured for this model.
    kwargs: Dict[str, object] = {}
    effort = reasoning_effort_for(model_key)
    if effort is not None:
        kwargs["reasoning_effort"] = effort

    start = time.perf_counter()
    resp = completion(
        model=models[model_key],
        messages=messages_for(python_code),
        **kwargs,
    )
    latency = time.perf_counter() - start

    cpp_code = resp["choices"][0]["message"]["content"]
    # Content can be missing or blank; surface that as a generation failure
    # here rather than letting it crash (or silently write an empty file) later.
    if cpp_code is None or not cpp_code.strip():
        raise RuntimeError(f"Model '{model_key}' returned an empty response")

    cost = completion_cost(completion_response=resp)

    return cpp_code, cost, latency



In [7]:
# --- Baseline: run the original Python code and time it ---

def run_python_and_time(code: str) -> float:
    """
    Execute the Python snippet and return wall-clock execution time in seconds.
    The snippet itself may also print its own timing info.

    The snippet runs in a fresh globals dict with `__name__` set to "__main__",
    so code guarded by `if __name__ == "__main__":` actually executes instead
    of being skipped (which would yield a meaningless near-zero baseline).

    NOTE: this uses exec(), which runs arbitrary code in the current process.
    Only pass snippets you trust.
    """
    globals_dict = {"__builtins__": __builtins__, "__name__": "__main__"}
    start = time.perf_counter()
    exec(code, globals_dict)
    return time.perf_counter() - start

In [8]:
# --- C++ compile and run commands (clang++ on main.cpp) ---

# Base clang++ invocation. run_experiment overwrites index 1 (source file) and
# index 3 (output binary) per model, so keep those two positions stable.
#
# OpenMP is intentionally NOT enabled: the system prompt forbids it and
# write_output strips OpenMP includes/pragmas, so passing
# "-Xpreprocessor -fopenmp" would only define _OPENMP (without linking an
# OpenMP runtime) and could break code that guards on that macro.
compile_command = [
    "clang++", "main.cpp", "-o", "main",
    "-std=c++17",
    "-Ofast",
    "-mcpu=native",
    "-flto=thin",
    "-fvisibility=hidden",
    "-DNDEBUG",
]

# Runs the default binary name used in compile_command above.
run_command = ["./main"]


In [9]:
# --- Single-model experiment: Python baseline → LLM → C++ → compile → run ---

def _record_result(
    model_key: str,
    status: str,
    cpp_time: Optional[float] = None,
    speedup: Optional[float] = None,
    cost: Optional[float] = None,
    latency: Optional[float] = None,
    error: Optional[str] = None,
) -> None:
    """Append one summary row (fixed key set) to the global `results` list."""
    results.append({
        "model": model_key,
        "status": status,
        "cpp_time": cpp_time,
        "speedup": speedup,
        "cost": cost,
        "latency": latency,
        "error": error,
    })


def run_experiment(
    model_key: str,
    python_code: str,
    repeats: int = 1,
    python_time: Optional[float] = None,
) -> None:
    """
    For a single model:
      1) Use provided python_time if given, otherwise measure it.
      2) Use LLM to generate C++ from Python.
      3) Write <model_key>.cpp and compile it into ./<model_key>.
      4) Run that freshly built binary 'repeats' times.
      5) Append a summary entry into the global `results` list with:
         - model, status, cpp_time, speedup, cost, latency, error (if any).

    Status is one of "llm_failed", "compile_failed", "run_failed" or "ok".
    For "ok", cpp_time/speedup describe the fastest successful run, measured
    as wall-clock time around the subprocess call.

    Args:
        model_key: Key into `models`; also used as the .cpp and binary name.
        python_code: Python source to translate and (optionally) time.
        repeats: Number of times to run the compiled binary.
        python_time: Pre-measured Python baseline in seconds, or None to
            measure it here.
    """
    # 1) Python baseline (measure here only if not provided)
    if python_time is None:
        python_time = run_python_and_time(python_code)
        print(f"Python baseline execution time: {python_time:.6f} seconds\n")

    # 2) Generate C++ via LLM
    try:
        cpp_code, cost_usd, llm_latency = optimize_llm(
            python_code=python_code,
            model_key=model_key,
        )
    except Exception as e:
        print(f"LLM generation failed for model '{model_key}': {e}")
        _record_result(model_key, "llm_failed", error=str(e)[:200])
        return

    # 3) Write and compile the C++ under the model's own name
    cpp_filename = f"{model_key}.cpp"
    write_output(cpp_code, filename=cpp_filename)

    # compile_command layout: [compiler, <source>, "-o", <binary>, flags...]
    compile_cmd = compile_command.copy()
    compile_cmd[1] = cpp_filename     # replace 'main.cpp'
    compile_cmd[3] = model_key        # output binary: gpt, claude, gemini

    try:
        compile_proc = subprocess.run(
            compile_cmd,
            check=False,
            text=True,
            capture_output=True,
        )
        compile_error = compile_proc.stderr if compile_proc.returncode != 0 else None
    except OSError as e:
        # e.g. the compiler executable is not installed / not on PATH
        compile_error = f"Could not launch compiler: {e}"

    if compile_error is not None:
        print(f"Compilation failed for model '{model_key}':")
        print(compile_error)
        _record_result(
            model_key,
            "compile_failed",
            cost=cost_usd,
            latency=llm_latency,
            error=compile_error[:200],
        )
        return

    # 4) Run the binary that was just built (possibly multiple times).
    #    It must be ./<model_key>, not the global run_command (./main):
    #    running ./main would time a stale or missing binary.
    binary_cmd = [f"./{model_key}"]

    best_cpp_time: Optional[float] = None
    best_speedup: Optional[float] = None

    for i in range(repeats):
        start_cpp = time.perf_counter()
        run_proc = subprocess.run(
            binary_cmd,
            check=False,
            text=True,
            capture_output=True,
        )
        cpp_time = time.perf_counter() - start_cpp

        if run_proc.returncode != 0:
            print(f"Run {i + 1} failed for model '{model_key}':")
            print(run_proc.stderr)
            continue

        improvement = (python_time / cpp_time) if cpp_time > 0 else float("inf")

        # Print per-run detail
        print(f"Run {i + 1} output:")
        print(f'Model: "{model_key}"')
        print(run_proc.stdout.strip())
        print(f"Improvement: {improvement:.2f}x faster than Python")
        print(f"Cost (LLM generation): ${cost_usd:.6f}")
        print(f"LLM generation latency: {llm_latency:.3f} seconds\n")

        # Track best run
        if best_cpp_time is None or cpp_time < best_cpp_time:
            best_cpp_time = cpp_time
            best_speedup = improvement

    # 5) Record result summary
    if best_cpp_time is None:
        # No run succeeded
        _record_result(
            model_key,
            "run_failed",
            cost=cost_usd,
            latency=llm_latency,
            error="Runtime failure in all runs",
        )
    else:
        _record_result(
            model_key,
            "ok",
            cpp_time=best_cpp_time,
            speedup=best_speedup,
            cost=cost_usd,
            latency=llm_latency,
        )

def print_results_table() -> None:
    """
    Render the global `results` list as a fixed-width, left-aligned text table.

    Missing numeric fields (None) are shown as "N/A". Prints a short notice
    and returns when there is nothing to show.
    """
    if not results:
        print("\nNo results to summarize.")
        return

    column_widths = (10, 15, 12, 14, 12, 10)
    column_titles = ("Model", "Status", "Cost($)", "Latency(s)", "Cpp(s)", "Speedup(x)")

    def number_or_na(value, spec):
        # Format a numeric cell, falling back to "N/A" for absent values.
        return "N/A" if value is None else format(value, spec)

    def render_row(cells):
        # Left-align each cell in its column and concatenate.
        return "".join(
            format(text, f"<{width}") for text, width in zip(cells, column_widths)
        )

    print("\n=== Summary Table ===")
    print(render_row(column_titles))
    print("-" * 70)

    for entry in results:
        row = (
            entry.get("model", ""),
            entry.get("status", ""),
            number_or_na(entry.get("cost", None), ".6f"),
            number_or_na(entry.get("latency", None), ".3f"),
            number_or_na(entry.get("cpp_time", None), ".6f"),
            number_or_na(entry.get("speedup", None), ".2f"),
        )
        print(render_row(row))

In [10]:
# --- Multi-model driver: run one model or all models ---

def run_for_model_or_all(model_key: str, python_code: str, repeats: int = 1) -> None:
    """
    Entry point for an experiment.

    - model_key == 'all': time the Python baseline once, then generate,
      compile and run C++ for every key in `models`. An unexpected exception
      for one model is recorded and does not stop the remaining models.
    - any other model_key: run just that model; run_experiment measures the
      Python baseline itself.

    `results` is cleared first, and a summary table is printed at the end.
    """
    results.clear()  # drop rows from any previous experiment

    if model_key != "all":
        # Single model: exceptions propagate to the caller.
        print(f"=== Model: {model_key} ===")
        run_experiment(
            model_key=model_key,
            python_code=python_code,
            repeats=repeats,
        )
        print_results_table()
        return

    # Shared baseline so every model is compared against the same Python time.
    baseline_seconds = run_python_and_time(python_code)
    print(f"Python baseline execution time: {baseline_seconds:.6f} seconds\n")

    for name in models:
        print(f"=== Model: {name} ===")
        try:
            run_experiment(
                model_key=name,
                python_code=python_code,
                repeats=repeats,
                python_time=baseline_seconds,
            )
        except Exception as exc:
            print(f"Unexpected error for model '{name}': {exc}")
            results.append({
                "model": name,
                "status": "unexpected_error",
                "cpp_time": None,
                "speedup": None,
                "cost": None,
                "latency": None,
                "error": str(exc)[:200],
            })

    # Summary for all attempted models
    print_results_table()


In [11]:
# --- Example Python code: π approximation via series ---

# Benchmark snippet, kept as a source string because the pipeline both executes
# it for the Python baseline and embeds it verbatim in the LLM prompt.
# The snippet accumulates an alternating series of 1/j terms over 100,000,000
# iterations, multiplies the result by 4, and prints it to 12 decimals together
# with its own elapsed time.
pi = """
import time

def calculate(iterations, param1, param2):
    result = 1.0
    for i in range(1, iterations+1):
        j = i * param1 - param2
        result -= (1/j)
        j = i * param1 + param2
        result += (1/j)
    return result

start_time = time.time()
result = calculate(100_000_000, 4, 1) * 4
end_time = time.time()

print(f"Result: {result:.12f}")
print(f"Execution Time: {(end_time - start_time):.6f} seconds")
"""

In [12]:
# --- Run the full pipeline: choose one model or 'all' ---

# Example 1: every configured model, one timed run of each binary
run_for_model_or_all(model_key="all", python_code=pi, repeats=1)

# Example 2: only GPT, once
# run_for_model_or_all("gpt", pi, repeats=1)

# Example 3: only Claude or only Gemini
# run_for_model_or_all("claude", pi, repeats=1)
# run_for_model_or_all("gemini", pi, repeats=1)

Result: 3.141592658589
Execution Time: 21.725370 seconds
Python baseline execution time: 21.714268 seconds

=== Model: gpt ===
Run 1 output:
Model: "gpt"
Result: 3.141592659092
Execution Time: 0.051358 seconds
Improvement: 355.79x faster than Python
Cost (LLM generation): $0.012874
LLM generation latency: 22.992 seconds

=== Model: claude ===
Run 1 output:
Model: "claude"
Result: 3.141592659092
Execution Time: 0.051650 seconds
Improvement: 386.14x faster than Python
Cost (LLM generation): $0.034185
LLM generation latency: 15.259 seconds

=== Model: gemini ===
Run 1 output:
Model: "gemini"
Result: 3.141592659092
Execution Time: 0.052302 seconds
Improvement: 383.22x faster than Python
Cost (LLM generation): $0.077374
LLM generation latency: 56.276 seconds


=== Summary Table ===
Model     Status         Cost($)     Latency(s)    Cpp(s)      Speedup(x)
----------------------------------------------------------------------
gpt       ok             0.012874    22.992        0.061031    355.