In [7]:
import os
import subprocess
from pathlib import Path
import json
import nest_asyncio
import shutil
import logfire
import csv
import re
import time
import random
from tqdm import tqdm
import pandas as pd
from great_tables import GT, style, loc
import matplotlib.pyplot as plt
import numpy as np

from pydantic_ai import Agent
from pydantic_ai.models.openai import OpenAIModel
from pydantic_ai.models.anthropic import AnthropicModel
from pydantic_ai.models.gemini import GeminiModel
from pydantic_ai.models.groq import GroqModel

from dotenv import load_dotenv

# Configuration
# NOTE(review): logfire.configure() runs before the .env file below is loaded, so
# any Logfire settings kept in llm_blue/.env would not be visible to it yet —
# confirm this ordering is intentional.
logfire.configure()
# Presumably required so that agent.run_sync() can be called from inside the
# notebook's already-running event loop — TODO confirm.
nest_asyncio.apply()
# Loads the provider API keys that are read later through os.getenv
# (OPENAI_API_KEY, ANTHROPIC_API_KEY, GEMINI_API_KEY, GROQ_API_KEY).
# override=True is passed so values from the file win over variables that are
# already set in the environment.
load_dotenv(override=True, dotenv_path='llm_blue/.env')

True

[1mLogfire[0m project URL: ]8;id=783885;https://logfire-us.pydantic.dev/prayash/hpc4llm\[4;36mhttps://logfire-us.pydantic.dev/prayash/hpc4llm[0m]8;;\


In [8]:
# Set CUDA paths
CUDA_HOME = Path("/usr/local/cuda-12.4")


def _prepend_env_path(var_name: str, entry: str) -> None:
    """Put `entry` first in the path-list environment variable `var_name`.

    Re-running this cell does not stack duplicate copies of `entry`, and an
    unset or empty variable does not end up with a dangling separator (an
    empty component makes the current directory part of the search path).
    """
    current = os.environ.get(var_name, "")
    remaining = [part for part in current.split(os.pathsep) if part and part != entry]
    os.environ[var_name] = os.pathsep.join([entry, *remaining])


_prepend_env_path("PATH", str(CUDA_HOME / "bin"))
_prepend_env_path("LD_LIBRARY_PATH", str(CUDA_HOME / "lib64"))

# Check for NVCC availability (only reported here; compilation failures are
# recorded per kernel later on)
nvcc_path = shutil.which("nvcc")
if nvcc_path is None:
    print("[ERROR] nvcc not found in PATH. Please ensure that nvcc is installed and its directory is added to the PATH environment variable.")
else:
    print("nvcc found at:", nvcc_path)

# Directory layout for seed generation
BASE_DIR = Path("llm_blue/data")
SEED_DIR = BASE_DIR / "2_seed_generation"                     # generated kernels, wrappers, binaries
SEEDS_RESULTS_DIR = BASE_DIR / "results" / "2_seed_generation"  # per-model JSON summaries
PROMPTS_DIR = BASE_DIR / "prompts" / "2_seed_generation"        # prompts sent to each model

# Input sizes (number of int elements) every kernel is benchmarked with
INPUT_SIZES = [1024, 1000000, 1000000000]
# Number of kernels requested from each model
NUM_SEEDS = 5

# Create directories
for directory in (SEED_DIR, SEEDS_RESULTS_DIR, PROMPTS_DIR):
    directory.mkdir(parents=True, exist_ok=True)

nvcc found at: /usr/local/cuda-12.4/bin/nvcc


In [9]:
# Define models to use.
# Each key is the label used for directory/file names and in the saved results,
# so it must match the model identifier that is actually requested.
models = {
    # OpenAI models
    "gpt-4o-mini": Agent(model=OpenAIModel("gpt-4o-mini", api_key=os.getenv("OPENAI_API_KEY"))),
    "o1-mini": Agent(model=OpenAIModel("o1-mini", api_key=os.getenv("OPENAI_API_KEY"))),
    # Fixed: this entry used to request "o1-mini", so everything stored under
    # the "o3-mini" label was really produced by o1-mini.
    "o3-mini": Agent(model=OpenAIModel("o3-mini", api_key=os.getenv("OPENAI_API_KEY"))),

    # Anthropic models
    "claude-3-7-sonnet-latest": Agent(model=AnthropicModel("claude-3-7-sonnet-latest", api_key=os.getenv("ANTHROPIC_API_KEY"))),
    "claude-3-5-sonnet-latest": Agent(model=AnthropicModel("claude-3-5-sonnet-latest", api_key=os.getenv("ANTHROPIC_API_KEY"))),
    "claude-3-5-haiku-latest": Agent(model=AnthropicModel("claude-3-5-haiku-latest", api_key=os.getenv("ANTHROPIC_API_KEY"))),

    # Gemini models
    "gemini-1.5-flash": Agent(model=GeminiModel("gemini-1.5-flash", api_key=os.getenv("GEMINI_API_KEY"))),

    # Opensource models
    "llama-3.3-70b-versatile": Agent(model=GroqModel("llama-3.3-70b-versatile", api_key=os.getenv("GROQ_API_KEY"))),
    "qwen-2.5-32b": Agent(model=GroqModel("qwen-2.5-32b", api_key=os.getenv("GROQ_API_KEY"))),
}

# Create the <model>/seed_<i>/size_<n> directory tree. parents=True builds the
# intermediate levels in one call and does not depend on SEED_DIR existing yet.
for model_name in models:
    for seed_idx in range(NUM_SEEDS):
        for size in INPUT_SIZES:
            size_dir = SEED_DIR / model_name / f"seed_{seed_idx}" / f"size_{size}"
            size_dir.mkdir(parents=True, exist_ok=True)

print(f"Created directory structure for {len(models)} models with {NUM_SEEDS} seeds each")
print(f"Models: {', '.join(models.keys())}")
# Cell 5

Created directory structure for 9 models with 5 seeds each
Models: gpt-4o-mini, o1-mini, o3-mini, claude-3-7-sonnet-latest, claude-3-5-sonnet-latest, claude-3-5-haiku-latest, gemini-1.5-flash, llama-3.3-70b-versatile, qwen-2.5-32b


In [10]:
def extract_cuda_code(text: str) -> str:
    """Extract CUDA code from an LLM response.

    Any ``<think>...</think>`` section is removed first. The first fenced code
    block is then returned, preferring fences tagged ``cuda``, then
    ``cpp``/``c++``, then ``c``, then any other fence. If the response has no
    fenced block, the whole (cleaned) text is returned.

    Args:
        text: Raw response text from the model.

    Returns:
        The extracted code with surrounding whitespace stripped.
    """
    flags = re.DOTALL | re.IGNORECASE

    # Remove any thinking sections
    text_cleaned = re.sub(r'<think>.*?</think>', '', text, flags=flags)

    # Look for code blocks in this order of preference. Each language tag must
    # be followed by whitespace so that "c" no longer matches the start of
    # "c++", "cu" or "csharp" and leaks the rest of the tag into the code.
    patterns = [
        r'```cuda\s(.*?)```',
        r'```(?:cpp|c\+\+)\s(.*?)```',
        r'```c\s(.*?)```',
        # Any fence: drop an arbitrary language tag on the opening line.
        r'```(?:[^\s`]*\n)?(.*?)```',
    ]

    for pattern in patterns:
        match = re.search(pattern, text_cleaned, flags=flags)
        if match:
            return match.group(1).strip()

    # If no code blocks found, use the whole text
    return text_cleaned.strip()

def create_cuda_wrapper(kernel_code: str, kernel_name: str) -> str:
    """Wrap an LLM-provided reduction kernel in a complete CUDA program.

    The generated program takes the array size as its first command-line
    argument (default 1024), fills the input with ones, launches the kernel
    once with 256 threads per block, adds up the per-block partial sums on the
    host, and prints the ``Sum:``, ``Kernel Execution Time:`` and
    ``Result verification:`` lines.

    Compared with the earlier template, the host code now zeroes the partial
    sum buffer before the launch, reports kernel launch/execution errors on
    stderr, and destroys the timing events it creates.

    Args:
        kernel_code: Source of the kernel, inserted verbatim.
        kernel_name: Name of the ``__global__`` function to launch.

    Returns:
        The full ``.cu`` source as a string.
    """
    # Doubled braces are literal C++ braces; single braces are f-string fields.
    wrapper_code = f"""
#include <cuda_runtime.h>
#include <iostream>
#include <cstdlib>

// The kernel provided by the LLM
{kernel_code}

// Host function to perform reduction
int sumArray(int* h_input, int size) {{
    int *d_input, *d_temp;
    
    // Allocate device memory
    cudaMalloc((void**)&d_input, size * sizeof(int));
    
    // The size of d_temp is based on the number of blocks we'll launch
    int threadsPerBlock = 256;
    int blocksPerGrid = (size + threadsPerBlock - 1) / threadsPerBlock;
    cudaMalloc((void**)&d_temp, blocksPerGrid * sizeof(int));
    
    // Start from zeroed partial sums so that blocks which never write a
    // result (or a failed launch) do not contribute uninitialized memory
    cudaMemset(d_temp, 0, blocksPerGrid * sizeof(int));
    
    // Copy input data to device
    cudaMemcpy(d_input, h_input, size * sizeof(int), cudaMemcpyHostToDevice);
    
    // Create CUDA events for timing
    cudaEvent_t start, stop;
    cudaEventCreate(&start);
    cudaEventCreate(&stop);
    
    // Launch kernel with the actual size parameter
    cudaEventRecord(start);
    {kernel_name}<<<blocksPerGrid, threadsPerBlock, threadsPerBlock * sizeof(int)>>>(d_input, d_temp, size);
    cudaEventRecord(stop);
    
    // Wait for kernel to finish and surface launch/execution failures
    cudaError_t launchErr = cudaGetLastError();
    cudaError_t syncErr = cudaDeviceSynchronize();
    if (launchErr != cudaSuccess || syncErr != cudaSuccess) {{
        std::cerr << "CUDA error: "
                  << cudaGetErrorString(launchErr != cudaSuccess ? launchErr : syncErr)
                  << std::endl;
    }}
    
    // Calculate elapsed time
    float milliseconds = 0;
    cudaEventElapsedTime(&milliseconds, start, stop);
    
    // Copy the block results back to host
    int* h_temp = new int[blocksPerGrid];
    cudaMemcpy(h_temp, d_temp, blocksPerGrid * sizeof(int), cudaMemcpyDeviceToHost);
    
    // Finalize the reduction on CPU (sum the block results)
    int sum = 0;
    for (int i = 0; i < blocksPerGrid; i++) {{
        sum += h_temp[i];
    }}
    
    // Print results
    std::cout << "Sum: " << sum << std::endl;
    std::cout << "Kernel Execution Time: " << milliseconds << " ms" << std::endl;
    
    // Clean up
    cudaEventDestroy(start);
    cudaEventDestroy(stop);
    cudaFree(d_input);
    cudaFree(d_temp);
    delete[] h_temp;
    
    return sum;
}}

int main(int argc, char** argv) {{
    int size = 1024;  // Default size
    
    if (argc > 1) {{
        size = atoi(argv[1]);
    }}
    
    // Print size for verification
    std::cout << "Running CUDA Reduction for size: " << size << std::endl;
    
    // Allocate and initialize host array
    int* h_input = new int[size];
    for (int i = 0; i < size; i++) {{
        h_input[i] = 1;  // Set all elements to 1 for easy verification
    }}
    
    // Run the reduction and get the sum
    int result = sumArray(h_input, size);
    
    // Verify result (should equal the array size since all elements are 1)
    bool correct = (result == size);
    std::cout << "Result verification: " << (correct ? "PASSED" : "FAILED") << std::endl;
    
    // Clean up
    delete[] h_input;
    
    return 0;
}}
"""
    return wrapper_code

def compile_and_run(cuda_file: Path, size: int, output_dir: Path) -> dict:
    """Compile a CUDA file with nvcc and run the binary for one input size.

    The binary's stdout (and stderr, if any) is written to
    ``output_dir / "output.txt"`` and then parsed for the reported sum,
    kernel time and verification line.

    Args:
        cuda_file: Path of the ``.cu`` source; the binary is written next to
            it with a ``.out`` suffix.
        size: Number of elements, passed to the binary as its only argument.
        output_dir: Directory that receives ``output.txt``.

    Returns:
        A dict with ``compilation_success``, ``run_success``,
        ``execution_time_ms``, ``sum_value`` and ``is_correct``; plus
        ``verification`` when the binary printed it and ``error`` when
        compilation failed, the binary exited with a non-zero status, or an
        exception (including a timeout) occurred. Exceptions are captured in
        the dict rather than raised.
    """
    results = {
        "compilation_success": False,
        "run_success": False,
        "execution_time_ms": None,
        "sum_value": None,
        "is_correct": False
    }

    # Compile
    binary_path = cuda_file.with_suffix(".out")
    compile_cmd = ["nvcc", "-O3", str(cuda_file), "-o", str(binary_path), "-std=c++11"]

    try:
        compile_result = subprocess.run(compile_cmd, capture_output=True, text=True, timeout=60)

        if compile_result.returncode != 0:
            results["error"] = compile_result.stderr
            return results

        results["compilation_success"] = True

        # Run
        run_cmd = [str(binary_path), str(size)]
        run_result = subprocess.run(run_cmd, capture_output=True, text=True, timeout=300)

        # Save output
        output_file = output_dir / "output.txt"
        with open(output_file, "w") as f:
            f.write(run_result.stdout)
            if run_result.stderr:
                f.write("\n\nSTDERR:\n")
                f.write(run_result.stderr)

        # A crashed or aborted binary (non-zero exit status) is not a
        # successful run, even if it printed something before dying.
        if run_result.returncode == 0:
            results["run_success"] = True
        else:
            detail = run_result.stderr.strip()
            results["error"] = f"Binary exited with code {run_result.returncode}" + (
                f": {detail}" if detail else ""
            )

        # Extract results. The time is printed by std::cout and may use
        # scientific notation for very small values (e.g. "1e-05").
        time_match = re.search(
            r'Kernel Execution Time: ([-+]?(?:\d+\.?\d*|\.\d+)(?:[eE][-+]?\d+)?) ms',
            run_result.stdout,
        )
        if time_match:
            results["execution_time_ms"] = float(time_match.group(1))

        # The sum is a signed int and can be negative for a broken kernel.
        sum_match = re.search(r'Sum: (-?\d+)', run_result.stdout)
        if sum_match:
            results["sum_value"] = int(sum_match.group(1))
            results["is_correct"] = (results["sum_value"] == size)

        # Check for verification result
        verify_match = re.search(r'Result verification: (PASSED|FAILED)', run_result.stdout)
        if verify_match:
            results["verification"] = verify_match.group(1)

    except Exception as e:
        # Covers a missing nvcc, timeouts and I/O problems; compilation_success
        # tells the caller which stage had been reached.
        results["run_success"] = False
        results["error"] = str(e)

    return results

def evaluate_kernel(kernel_code: str, model_name: str, seed_idx: int) -> dict:
    """Evaluate a kernel implementation across different input sizes.

    For every entry of INPUT_SIZES the wrapped program is written to
    ``SEED_DIR/<model>/seed_<idx>/size_<n>/`` and passed to compile_and_run.

    Args:
        kernel_code: Kernel source as extracted from the model response.
        model_name: Label of the model; used in directory and file names.
        seed_idx: Index of the seed; used in directory and file names.

    Returns:
        A dict with ``model``, ``seed``, ``kernel_code``, ``kernel_name`` and
        ``sizes`` (input size -> compile_and_run result). If no
        ``__global__ void <name>`` declaration is found, a dict with
        ``model``, ``seed`` and ``error`` is returned instead, matching the
        shape of the error dict produced when generation itself fails.
    """
    # Extract kernel name (first __global__ void function in the code)
    kernel_match = re.search(r'__global__\s+void\s+(\w+)', kernel_code)
    if not kernel_match:
        return {
            "model": model_name,
            "seed": seed_idx,
            "error": "Could not extract kernel name"
        }

    kernel_name = kernel_match.group(1)

    # Create results dict
    evaluation = {
        "model": model_name,
        "seed": seed_idx,
        "kernel_code": kernel_code,
        "kernel_name": kernel_name,
        "sizes": {}
    }

    # The wrapper source does not depend on the input size (the size is a
    # runtime argument), so it is built once and written into each size dir.
    wrapper_code = create_cuda_wrapper(kernel_code, kernel_name)

    # Evaluate for each size
    for size in INPUT_SIZES:
        size_dir = SEED_DIR / model_name / f"seed_{seed_idx}" / f"size_{size}"
        size_dir.mkdir(parents=True, exist_ok=True)

        # Save wrapper code
        cuda_file = size_dir / f"{model_name}_seed{seed_idx}_size{size}.cu"
        with open(cuda_file, "w") as f:
            f.write(wrapper_code)

        # Compile and run
        evaluation["sizes"][size] = compile_and_run(cuda_file, size, size_dir)

    return evaluation


In [11]:
def generate_prompt_for_kernel(previous_kernels=None, previous_performance=None) -> str:
    """
    Generate a prompt for kernel generation, including context from previous implementations.

    Up to the two most recent previous kernels are included. Each kernel is
    paired with the performance entry at the same index, so the two lists may
    differ in length and previous_performance may be omitted entirely.

    Args:
        previous_kernels: List of previously generated kernel codes
        previous_performance: Performance data from previous kernels, as a
            list of ``{size: result_dict}`` mappings aligned with
            previous_kernels. Only sizes whose result has ``is_correct`` set
            are listed in the prompt.

    Returns:
        The complete prompt text.
    """
    base_prompt = (
        "You are an expert in high-performance CUDA programming. Generate a CUDA kernel function "
        "that performs a sum reduction on an array of integers.\n\n"
        "Implement ONLY the kernel function with this exact signature:\n"
        "__global__ void sumReduction(int *input, int *output, int size)\n\n"
        "The kernel should:\n"
        "- Take an input array of integers, an output array to store block results, and the size of the input array\n"
        "- Use shared memory appropriately sized with extern __shared__\n"
        "- Handle array boundaries correctly using the 'size' parameter\n"
        "- Use tree-based reduction for high performance\n"
        "- Use synchronization appropriately\n"
        "- Aim for the best performance across all input sizes (1K to 1B elements)\n\n"
    )

    # Add context from previous implementations if available
    if previous_kernels and len(previous_kernels) > 0:
        base_prompt += "Here are previous kernel implementations with their performance metrics:\n\n"

        perf_history = list(previous_performance or [])
        # Include up to 2 previous implementations as context
        first_idx = max(len(previous_kernels) - 2, 0)

        for i, kernel in enumerate(previous_kernels[first_idx:]):
            base_prompt += f"Implementation {i+1}:\n```cuda\n{kernel}\n```\n"

            # Look the performance up by the kernel's own index instead of
            # zipping two independent tail slices, which mis-pairs entries
            # when the lists have different lengths.
            perf_idx = first_idx + i
            perf = perf_history[perf_idx] if perf_idx < len(perf_history) else None

            # Include performance information if available
            if perf:
                base_prompt += "Performance:\n"
                for size, data in perf.items():
                    if data.get("is_correct", False):
                        base_prompt += f"- Size {size}: {data.get('execution_time_ms', 'N/A')} ms\n"

            base_prompt += "\n"

        base_prompt += (
            "IMPORTANT: Analyze the strengths and weaknesses of the previous implementations before designing your approach.\n\n"
            "Consider implementing a different strategy such as but not limited to:\n"
            "- Bank-conflict-free memory access patterns\n"
            "- Sequential addressing vs. strided addressing\n"
            "- Warp-level primitives like __shfl_down_sync() for warp-level reductions\n"
            "- Loop unrolling for the reduction phase\n"
            "- Early exit strategies to reduce unnecessary work\n"
            "- Minimizing divergent execution paths\n\n"
            "Your goal is to create an implementation that performs better than previous ones, especially for large input sizes (1B elements).\n\n"
        )

    base_prompt += (
        "First, briefly explain (in comments) your optimization strategy and why you believe it will be effective.\n\n"
        "The wrapper code will:\n"
        "- Call your kernel with blocks and threads: sumReduction<<<blocksPerGrid, threadsPerBlock, threadsPerBlock * sizeof(int)>>>(d_input, d_output, size)\n"
        "- Handle the final reduction across blocks\n\n"
        "Output ONLY the kernel function, starting with __global__ void sumReduction"
    )
    return base_prompt

def generate_and_evaluate_seed(model_name: str, agent, seed_idx: int, prompt=None) -> dict:
    """Generate and evaluate a single seed from a model.

    The raw response, the extracted kernel and the evaluation are written to
    ``SEED_DIR/<model>/seed_<idx>/`` as ``raw_response.txt``,
    ``kernel_code.cu`` and ``evaluation.json``.

    Args:
        model_name: Label of the model; used for directory names and results.
        agent: Object exposing ``run_sync(prompt)``.
        seed_idx: Index of the seed being generated.
        prompt: Prompt to send; the default prompt is generated when None.

    Returns:
        The evaluation dict from evaluate_kernel, or a dict with ``model``,
        ``seed`` and ``error`` if anything raised. Exceptions are reported by
        printing and are not propagated.
    """
    try:
        # Generate kernel code
        print(f"Generating seed {seed_idx} for {model_name}...")

        # Use provided prompt or generate default one
        if prompt is None:
            prompt = generate_prompt_for_kernel()

        # Generate code
        response = agent.run_sync(prompt)

        # The run result exposes its text as `.output` in newer pydantic-ai
        # releases and as `.data` in older ones; accept either, and always end
        # up with a str so the regex extraction and file writes cannot fail
        # on a result object.
        if isinstance(response, str):
            code_text = response
        else:
            payload = getattr(response, "output", None)
            if payload is None:
                payload = getattr(response, "data", response)
            code_text = payload if isinstance(payload, str) else str(payload)

        # Extract kernel code
        kernel_code = extract_cuda_code(code_text)

        # Save the raw response and extracted kernel
        seed_dir = SEED_DIR / model_name / f"seed_{seed_idx}"
        seed_dir.mkdir(exist_ok=True, parents=True)

        with open(seed_dir / "raw_response.txt", "w") as f:
            f.write(code_text)

        with open(seed_dir / "kernel_code.cu", "w") as f:
            f.write(kernel_code)

        # Evaluate the kernel
        evaluation = evaluate_kernel(kernel_code, model_name, seed_idx)

        # Save evaluation
        with open(seed_dir / "evaluation.json", "w") as f:
            json.dump(evaluation, f, indent=2)

        return evaluation

    except Exception as e:
        # Deliberately best-effort: one failing seed must not stop the sweep.
        print(f"Error generating/evaluating seed {seed_idx} for {model_name}: {e}")
        return {
            "model": model_name,
            "seed": seed_idx,
            "error": str(e)
        }
    
def generate_multiple_seeds(model_name, agent, num_seeds=5):
    """Generate and evaluate multiple seeds for a single model.

    Seeds are generated one after another; each prompt carries the kernels
    (and their measurements) produced by the earlier seeds of the same model.
    Every prompt is saved under ``PROMPTS_DIR/<model>/`` and the list of all
    evaluations is saved to ``SEEDS_RESULTS_DIR/<model>_all_seeds.json``.

    Args:
        model_name: Label of the model; used for file and directory names.
        agent: Agent passed through to generate_and_evaluate_seed.
        num_seeds: Number of kernels to request.

    Returns:
        List with one evaluation dict per seed, including failed ones.
    """
    print(f"\n{'='*50}\nGenerating {num_seeds} seeds for {model_name}\n{'='*50}")

    results = []
    previous_kernels = []
    previous_performance = []

    for seed_idx in range(num_seeds):
        print(f"\nGenerating seed {seed_idx+1}/{num_seeds}...")

        # Generate prompt with context from previous implementations
        prompt = generate_prompt_for_kernel(previous_kernels, previous_performance)

        # Save prompt
        prompt_dir = PROMPTS_DIR / model_name
        prompt_dir.mkdir(parents=True, exist_ok=True)
        with open(prompt_dir / f"seed_{seed_idx}_prompt.txt", "w") as f:
            f.write(prompt)

        # Generate and evaluate seed
        evaluation = generate_and_evaluate_seed(model_name, agent, seed_idx, prompt)
        results.append(evaluation)

        # Only seeds that actually produced a kernel become context for later
        # prompts; a failed request would otherwise show up as an empty
        # "previous implementation" and push a real one out of the window.
        kernel_code = evaluation.get("kernel_code")
        if kernel_code:
            previous_kernels.append(kernel_code)
            previous_performance.append(evaluation.get("sizes", {}))

        # Delay to avoid rate limiting
        time.sleep(3)

    # Save summary of all seeds
    with open(SEEDS_RESULTS_DIR / f"{model_name}_all_seeds.json", "w") as f:
        json.dump(results, f, indent=2)

    return results

In [None]:
# Evaluations collected per model label (model name -> list of seed evaluations)
results_by_model = {}

# Loop through all models
for model_name, agent in models.items():
    print(f"\n{'='*80}\nRunning seed generation for model: {model_name}\n{'='*80}")

    try:
        # Generate seeds for this model. NUM_SEEDS is the same constant that
        # sized the seed_<i> directory tree, so the two cannot drift apart.
        model_seeds = generate_multiple_seeds(model_name, agent, NUM_SEEDS)
        results_by_model[model_name] = model_seeds

        # Save model results
        results_path = SEEDS_RESULTS_DIR / f"{model_name}_all_seeds.json"
        with open(results_path, "w") as f:
            json.dump(model_seeds, f, indent=2)

        print(f"Results saved to {results_path}")

    except Exception as e:
        # Keep going with the remaining models; the failure is only reported.
        print(f"Error running model {model_name}: {e}")


Running seed generation for model: gpt-4o-mini

Generating 5 seeds for gpt-4o-mini

Generating seed 1/5...
Generating seed 0 for gpt-4o-mini...
01:14:10.020 agent run prompt=You are an expert in high-performance CUDA programming. Genera...he kernel function, starting with __global__ void sumReduction
01:14:10.039   preparing model request params run_step=1
01:14:10.040   model request
01:14:18.644   handle model response

Generating seed 2/5...
Generating seed 1 for gpt-4o-mini...
01:14:25.457 agent run prompt=You are an expert in high-performance CUDA programming. Genera...he kernel function, starting with __global__ void sumReduction
01:14:25.458   preparing model request params run_step=1
01:14:25.459   model request
01:14:34.934   handle model response

Generating seed 3/5...
Generating seed 2 for gpt-4o-mini...
01:14:42.058 agent run prompt=You are an expert in high-performance CUDA programming. Genera...he kernel function, starting with __global__ void sumReduction
01:14:42.060 