In [1]:
!pip install openai tiktoken pandas numpy tqdm scikit-learn

import time
import os
import json
import pandas as pd
import numpy as np
from tqdm import tqdm
from sklearn.metrics import accuracy_score
import tiktoken
from openai import OpenAI



MODEL_CONFIG = {
    "gpt-4o-mini": {
        "input_cost": 0.15 / 1_000_000,
        "output_cost": 0.60 / 1_000_000
    },
    "gpt-4o": {
        "input_cost": 5.00 / 1_000_000,
        "output_cost": 15.00 / 1_000_000
    },
    "gpt-3.5-turbo": {
        "input_cost": 0.50 / 1_000_000,
        "output_cost": 1.50 / 1_000_000
    }
}

def count_tokens(text, model):
    enc = tiktoken.encoding_for_model(model)
    return len(enc.encode(text))

evaluation_data = [
    {"question": "What is aspirin used for?", "answer": "pain relief"},
    {"question": "What is diabetes?", "answer": "high blood sugar"},
    {"question": "What does MRI stand for?", "answer": "magnetic resonance imaging"},
    {"question": "What is hypertension?", "answer": "high blood pressure"},
    {"question": "What is chemotherapy?", "answer": "cancer treatment"},
]

def run_llm(model, prompt):
    start = time.time()

    response = client.chat.completions.create(
        model=model,
        messages=[{"role": "user", "content": prompt}],
        temperature=0
    )

    latency = time.time() - start
    output = response.choices[0].message.content.strip()

    input_tokens = count_tokens(prompt, model)
    output_tokens = count_tokens(output, model)

    cost = (
        input_tokens * MODEL_CONFIG[model]["input_cost"]
        + output_tokens * MODEL_CONFIG[model]["output_cost"]
    )

    return output, latency, cost

def evaluate_accuracy(preds, refs):
    preds = [p.lower() for p in preds]
    refs = [r.lower() for r in refs]
    return accuracy_score(refs, preds)

def run_experiment(model_name):
    predictions, latencies, costs = [], [], []

    for item in tqdm(evaluation_data):
        output, latency, cost = run_llm(model_name, item["question"])

        predictions.append(output)
        latencies.append(latency)
        costs.append(cost)

    accuracy = evaluate_accuracy(
        predictions,
        [x["answer"] for x in evaluation_data]
    )

    return {
        "model": model_name,
        "accuracy": accuracy,
        "avg_latency_sec": np.mean(latencies),
        "avg_cost_usd": np.mean(costs),
        "total_cost_usd": np.sum(costs)
    }

results = []

for model in MODEL_CONFIG.keys():
    print(f"\nüîç Running evaluation for: {model}")
    metrics = run_experiment(model)
    results.append(metrics)

df = pd.DataFrame(results)
df

df.to_csv("llm_experiment_logs.csv", index=False)
print("‚úÖ Experiment logs saved")

best_model = df.sort_values(
    by=["accuracy", "avg_cost_usd", "avg_latency_sec"],
    ascending=[False, True, True]
).iloc[0]

print("\nüèÜ BEST MODEL SELECTED")
print(best_model)


BASELINE_ACCURACY = 0.7

for _, row in df.iterrows():
    if row["accuracy"] < BASELINE_ACCURACY:
        print(f"‚ö†Ô∏è Regression detected in model: {row['model']}")





üîç Running evaluation for: gpt-4o-mini


  0%|          | 0/5 [00:05<?, ?it/s]


RateLimitError: Error code: 429 - {'error': {'message': 'You exceeded your current quota, please check your plan and billing details. For more information on this error, read the docs: https://platform.openai.com/docs/guides/error-codes/api-errors.', 'type': 'insufficient_quota', 'param': None, 'code': 'insufficient_quota'}}

In [2]:
from openai import RateLimitError
import random

def run_llm(model, prompt, max_retries=3):
    for attempt in range(max_retries):
        try:
            start = time.time()

            response = client.chat.completions.create(
                model=model,
                messages=[{"role": "user", "content": prompt}],
                temperature=0
            )

            latency = time.time() - start
            output = response.choices[0].message.content.strip()

            input_tokens = count_tokens(prompt, model)
            output_tokens = count_tokens(output, model)

            cost = (
                input_tokens * MODEL_CONFIG[model]["input_cost"]
                + output_tokens * MODEL_CONFIG[model]["output_cost"]
            )


            time.sleep(1)

            return output, latency, cost

        except RateLimitError:
            wait_time = 2 ** attempt + random.random()
            print(f"‚è≥ Rate limit hit. Retrying in {wait_time:.1f}s...")
            time.sleep(wait_time)


    return "ERROR", 0, 0


In [3]:
JUDGE_PROMPT = """
You are an expert evaluator.

Question:
{question}

Ground Truth Answer:
{reference}

Model Answer:
{prediction}

Score the model answer from 0 to 5:
0 = completely wrong
5 = perfectly correct

Return ONLY a number between 0 and 5.
"""


In [4]:
def judge_answer(question, reference, prediction, model="gpt-4o-mini"):
    prompt = JUDGE_PROMPT.format(
        question=question,
        reference=reference,
        prediction=prediction
    )

    try:
        response = client.chat.completions.create(
            model=model,
            messages=[{"role": "user", "content": prompt}],
            temperature=0
        )
        score = float(response.choices[0].message.content.strip())
        time.sleep(1)
        return score
    except:
        return 0.0


In [None]:
judge_scores = []

for model in df["model"]:
    print(f"\nüßë‚Äç‚öñÔ∏è Judging answers for {model}")
    scores = []

    for item in evaluation_data:
        pred, _, _ = run_llm(model, item["question"])
        score = judge_answer(
            item["question"],
            item["answer"],
            pred
        )
        scores.append(score)

    judge_scores.append(np.mean(scores))


In [6]:
judge_scores = []


assert "model" in df.columns, "df me 'model' column missing hai"

for model in df["model"].tolist():
    print(f"\nüßë‚Äç‚öñÔ∏è Judging answers for {model}")
    scores = []

    for item in evaluation_data:
        try:

            pred, _, _ = run_llm(model, item["question"])


            if item["answer"].lower() in pred.lower():
                score = 5.0
            elif pred.strip() != "":
                score = 3.0
            else:
                score = 0.0

            scores.append(score)

        except Exception as e:
            print("‚ö†Ô∏è Error:", e)
            scores.append(0.0)

    judge_scores.append(float(np.mean(scores)))

print("\n‚úÖ Judge scores calculated:", judge_scores)


NameError: name 'df' is not defined

In [7]:
results = []

for model in MODEL_CONFIG.keys():
    print(f"\nüîç Running evaluation for: {model}")
    metrics = run_experiment(model)
    results.append(metrics)

df = pd.DataFrame(results)
df



üîç Running evaluation for: gpt-4o-mini


  0%|          | 0/5 [00:00<?, ?it/s]

‚è≥ Rate limit hit. Retrying in 1.1s...
‚è≥ Rate limit hit. Retrying in 2.5s...
‚è≥ Rate limit hit. Retrying in 4.9s...


 20%|‚ñà‚ñà        | 1/5 [00:15<01:01, 15.29s/it]

‚è≥ Rate limit hit. Retrying in 1.6s...
‚è≥ Rate limit hit. Retrying in 2.2s...
‚è≥ Rate limit hit. Retrying in 4.1s...


 40%|‚ñà‚ñà‚ñà‚ñà      | 2/5 [00:28<00:42, 14.11s/it]

‚è≥ Rate limit hit. Retrying in 1.7s...
‚è≥ Rate limit hit. Retrying in 2.4s...
‚è≥ Rate limit hit. Retrying in 4.7s...


 60%|‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà    | 3/5 [00:42<00:28, 14.04s/it]

‚è≥ Rate limit hit. Retrying in 1.8s...
‚è≥ Rate limit hit. Retrying in 2.5s...
‚è≥ Rate limit hit. Retrying in 4.7s...


 80%|‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà  | 4/5 [00:56<00:13, 13.87s/it]

‚è≥ Rate limit hit. Retrying in 1.7s...
‚è≥ Rate limit hit. Retrying in 2.0s...
‚è≥ Rate limit hit. Retrying in 4.1s...


100%|‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà| 5/5 [01:08<00:00, 13.71s/it]



üîç Running evaluation for: gpt-4o


  0%|          | 0/5 [00:00<?, ?it/s]

‚è≥ Rate limit hit. Retrying in 1.3s...
‚è≥ Rate limit hit. Retrying in 2.0s...
‚è≥ Rate limit hit. Retrying in 4.5s...


 20%|‚ñà‚ñà        | 1/5 [00:12<00:48, 12.23s/it]

‚è≥ Rate limit hit. Retrying in 1.9s...
‚è≥ Rate limit hit. Retrying in 2.0s...
‚è≥ Rate limit hit. Retrying in 4.9s...


 40%|‚ñà‚ñà‚ñà‚ñà      | 2/5 [00:25<00:38, 12.92s/it]

‚è≥ Rate limit hit. Retrying in 1.8s...
‚è≥ Rate limit hit. Retrying in 2.1s...
‚è≥ Rate limit hit. Retrying in 4.2s...


 60%|‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà    | 3/5 [00:38<00:25, 12.78s/it]

‚è≥ Rate limit hit. Retrying in 1.9s...
‚è≥ Rate limit hit. Retrying in 2.7s...
‚è≥ Rate limit hit. Retrying in 4.1s...


 80%|‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà  | 4/5 [00:51<00:12, 12.99s/it]

‚è≥ Rate limit hit. Retrying in 1.0s...
‚è≥ Rate limit hit. Retrying in 2.4s...
‚è≥ Rate limit hit. Retrying in 4.1s...


100%|‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà| 5/5 [01:03<00:00, 12.73s/it]



üîç Running evaluation for: gpt-3.5-turbo


  0%|          | 0/5 [00:00<?, ?it/s]

‚è≥ Rate limit hit. Retrying in 1.9s...
‚è≥ Rate limit hit. Retrying in 2.9s...
‚è≥ Rate limit hit. Retrying in 4.0s...


 20%|‚ñà‚ñà        | 1/5 [00:13<00:52, 13.23s/it]

‚è≥ Rate limit hit. Retrying in 1.2s...
‚è≥ Rate limit hit. Retrying in 2.5s...
‚è≥ Rate limit hit. Retrying in 4.1s...


 40%|‚ñà‚ñà‚ñà‚ñà      | 2/5 [00:25<00:37, 12.55s/it]

‚è≥ Rate limit hit. Retrying in 1.7s...
‚è≥ Rate limit hit. Retrying in 2.5s...
‚è≥ Rate limit hit. Retrying in 4.0s...


 60%|‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà    | 3/5 [00:38<00:25, 12.73s/it]

‚è≥ Rate limit hit. Retrying in 1.6s...
‚è≥ Rate limit hit. Retrying in 2.6s...
‚è≥ Rate limit hit. Retrying in 4.2s...


 80%|‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà  | 4/5 [00:51<00:12, 12.89s/it]

‚è≥ Rate limit hit. Retrying in 1.1s...
‚è≥ Rate limit hit. Retrying in 2.6s...
‚è≥ Rate limit hit. Retrying in 4.8s...


100%|‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà| 5/5 [01:04<00:00, 12.86s/it]


Unnamed: 0,model,accuracy,avg_latency_sec,avg_cost_usd,total_cost_usd
0,gpt-4o-mini,0.0,0.0,0.0,0
1,gpt-4o,0.0,0.0,0.0,0
2,gpt-3.5-turbo,0.0,0.0,0.0,0


In [8]:
print(df.columns)


Index(['model', 'accuracy', 'avg_latency_sec', 'avg_cost_usd',
       'total_cost_usd'],
      dtype='object')


In [9]:
judge_scores = []

for model in df["model"].tolist():
    print(f"\nüßë‚Äç‚öñÔ∏è Judging answers for {model}")
    scores = []

    for item in evaluation_data:
        pred, _, _ = run_llm(model, item["question"])


        if item["answer"].lower() in pred.lower():
            score = 5.0
        elif pred.strip():
            score = 3.0
        else:
            score = 0.0

        scores.append(score)

    judge_scores.append(float(np.mean(scores)))

print("‚úÖ Judge scores:", judge_scores)



üßë‚Äç‚öñÔ∏è Judging answers for gpt-4o-mini
‚è≥ Rate limit hit. Retrying in 1.6s...
‚è≥ Rate limit hit. Retrying in 2.9s...
‚è≥ Rate limit hit. Retrying in 4.6s...
‚è≥ Rate limit hit. Retrying in 1.5s...
‚è≥ Rate limit hit. Retrying in 2.3s...
‚è≥ Rate limit hit. Retrying in 4.5s...
‚è≥ Rate limit hit. Retrying in 1.3s...
‚è≥ Rate limit hit. Retrying in 2.3s...
‚è≥ Rate limit hit. Retrying in 4.2s...
‚è≥ Rate limit hit. Retrying in 1.9s...
‚è≥ Rate limit hit. Retrying in 2.5s...
‚è≥ Rate limit hit. Retrying in 4.2s...
‚è≥ Rate limit hit. Retrying in 1.5s...
‚è≥ Rate limit hit. Retrying in 2.1s...
‚è≥ Rate limit hit. Retrying in 4.6s...

üßë‚Äç‚öñÔ∏è Judging answers for gpt-4o
‚è≥ Rate limit hit. Retrying in 1.4s...
‚è≥ Rate limit hit. Retrying in 2.4s...
‚è≥ Rate limit hit. Retrying in 4.5s...
‚è≥ Rate limit hit. Retrying in 2.0s...
‚è≥ Rate limit hit. Retrying in 2.2s...
‚è≥ Rate limit hit. Retrying in 4.6s...
‚è≥ Rate limit hit. Retrying in 1.8s...
‚è≥ Rate limit hit. Retrying in

In [10]:
!pip install fastapi uvicorn nest-asyncio




In [11]:
from fastapi import FastAPI
from pydantic import BaseModel
import nest_asyncio
import uvicorn

nest_asyncio.apply()


In [12]:
class EvalRequest(BaseModel):
    question: str
    ground_truth: str
    model: str = "gpt-4o-mini"

class EvalResponse(BaseModel):
    model: str
    prediction: str
    latency_sec: float
    cost_usd: float
    judge_score: float


In [13]:
app = FastAPI(title="LLM Evaluation & Cost Optimization API")

@app.post("/evaluate", response_model=EvalResponse)
def evaluate_llm(req: EvalRequest):

    prediction, latency, cost = run_llm(req.model, req.question)

    if req.ground_truth.lower() in prediction.lower():
        judge_score = 5.0
    elif prediction.strip():
        judge_score = 3.0
    else:
        judge_score = 0.0

    return EvalResponse(
        model=req.model,
        prediction=prediction,
        latency_sec=round(latency, 3),
        cost_usd=round(cost, 6),
        judge_score=judge_score
    )


In [15]:
import threading

def run():
    uvicorn.run(app, host="0.0.0.0", port=8000)

thread = threading.Thread(target=run)
thread.start()


In [16]:
import requests
import time

time.sleep(2)  # server start hone do

url = "http://localhost:8000/evaluate"

payload = {
    "question": "What is aspirin used for?",
    "ground_truth": "pain relief",
    "model": "gpt-4o-mini"
}

response = requests.post(url, json=payload)
print(response.json())


‚è≥ Rate limit hit. Retrying in 1.5s...
‚è≥ Rate limit hit. Retrying in 2.3s...
‚è≥ Rate limit hit. Retrying in 4.1s...
INFO:     127.0.0.1:42380 - "POST /evaluate HTTP/1.1" 200 OK
{'model': 'gpt-4o-mini', 'prediction': 'ERROR', 'latency_sec': 0.0, 'cost_usd': 0.0, 'judge_score': 3.0}
