In [4]:
# System prompt sent as the "system" message by run_prompt. It states the discount
# rules, the 35% stacking cap, and the JSON-only response schema the model must follow.
SYSTEM_PROMPT = """
You are a discount calculation assistant. I will provide a customer profile and you must calculate their discount percentage and explain your reasoning.

Discount rules:
- Age 65+ OR student status: 15% discount
- Annual income < $30,000: 20% discount  
- Premium member for 2+ years: 10% discount
- New customer (< 6 months): 5% discount

Rules can stack up to a maximum of 35% discount.

Respond in JSON format only:
{
  "discount_percentage": number,
  "reason": "clear explanation of which rules apply and calculations",
  "applied_rules": ["list", "of", "applied", "rule", "names"]
}
"""

In [5]:
import os

from dotenv import load_dotenv
from openai import AsyncOpenAI
import asyncio
# Load environment variables
load_dotenv()


True

In [6]:
# Model used by run_prompt when the caller does not pass an explicit `model`.
DEFAULT_MODEL = "gpt-4.1-nano-2025-04-14"

def get_client() -> AsyncOpenAI:
    """Build an AsyncOpenAI client from the OPENAI_API_KEY environment variable.

    The key is read at call time rather than at import time, so merely defining
    this function never fails when the variable is absent.

    Returns:
        A new AsyncOpenAI client configured with the key from the environment.

    Raises:
        RuntimeError: If OPENAI_API_KEY is unset or empty.
    """
    key = os.getenv("OPENAI_API_KEY")
    if key:
        return AsyncOpenAI(api_key=key)
    raise RuntimeError(
        "OPENAI_API_KEY is not set. Please export it before running prompts."
    )


async def run_prompt(prompt: str, model: str = DEFAULT_MODEL) -> str:
    """Send `prompt` together with SYSTEM_PROMPT to a chat model and return the reply text.

    Args:
        prompt: Content of the user message (in this notebook, a customer profile).
        model: Model name passed to the chat completions endpoint.

    Returns:
        The assistant message content with surrounding whitespace stripped. JSON
        object mode is requested, but the text is returned unparsed.

    Raises:
        RuntimeError: If OPENAI_API_KEY is not set (raised by get_client), or if
            the completion carries no text content.
    """
    client = get_client()
    completion = await client.chat.completions.create(
        model=model,
        response_format={"type": "json_object"},
        messages=[
            {"role": "system", "content": SYSTEM_PROMPT},
            {"role": "user", "content": prompt},
        ],
    )
    choice = completion.choices[0]
    content = choice.message.content
    # `content` is nullable in the API (e.g. on a refusal); fail with a clear
    # message instead of an AttributeError from calling .strip() on None.
    if content is None:
        raise RuntimeError(
            f"Model {model!r} returned no text content "
            f"(finish_reason={choice.finish_reason!r})."
        )
    return content.strip()


async def main():
    """Print the system prompt and a sample profile, then print the model's reply.

    Runs a single hard-coded customer profile through run_prompt using
    DEFAULT_MODEL and writes everything to stdout.
    """
    profile = """
    Customer Profile:
    - Name: Sarah Johnson
    - Age: 67
    - Student: No
    - Annual Income: $45,000
    - Premium Member: Yes, for 3 years
    - Account Age: 3 years
    """
    print("=== System Prompt ===")
    print(SYSTEM_PROMPT)

    print("\n=== Customer Profile ===")
    print(profile)

    # The header is printed before the request is awaited, as a progress marker.
    print(f"\n=== Running Prompt with default model {DEFAULT_MODEL} ===")
    reply = await run_prompt(profile, model=DEFAULT_MODEL)
    print(reply)


    

In [5]:
# Test with a sample customer profile
customer_profile = """
Customer Profile:
- Name: Sarah Johnson
- Age: 67
- Student: No
- Annual Income: $45,000
- Premium Member: Yes, for 3 years
- Account Age: 3 years
"""

result = await run_prompt(customer_profile)
print(result)

{
  "discount_percentage": 25,
  "reason": "Sarah is eligible for a 15% discount due to age being over 65. Additionally, her premium membership duration of over 2 years grants a 10% discount. These discounts stack for a total of 25%. No other rules apply as she is not a student, does not have income below $30,000, and is not a new customer.",
  "applied_rules": ["Age 65+", "Premium member for 2+ years"]
}


In [10]:
# NOTE(review): scratch expression; its value is not assigned, so no other cell
# can depend on it. Looks like leftover debugging and is a candidate for deletion.
os.path.dirname(os.path.abspath("/"))

'/'

In [11]:
from ragas.dataset import Dataset

def load_dataset():
    """Load the discount benchmark dataset, downloading the CSV on first use.

    The CSV is expected at ``<cwd>/datasets/discount_benchmark.csv``. If it is
    missing, it is fetched from GitHub into a temporary ``.part`` file and moved
    into place only after the download completes, so an interrupted download
    never leaves a truncated CSV that later runs would treat as valid.

    Returns:
        The object returned by ``Dataset.load`` for the ``local/csv`` backend
        rooted at the current working directory.

    Raises:
        urllib.error.URLError: If the download fails (propagated from urlretrieve).
        OSError: If the datasets directory or file cannot be written.
    """
    import urllib.request

    dataset_url = (
        "https://raw.githubusercontent.com/vibrantlabsai/ragas/main/"
        "examples/ragas_examples/benchmark_llm/datasets/discount_benchmark.csv"
    )
    # Get the current working directory (notebook's directory)
    current_dir = os.getcwd()
    dataset_path = os.path.join(current_dir, "datasets", "discount_benchmark.csv")

    # Download dataset from GitHub if it doesn't exist locally
    if not os.path.exists(dataset_path):
        os.makedirs(os.path.dirname(dataset_path), exist_ok=True)
        partial_path = dataset_path + ".part"
        try:
            urllib.request.urlretrieve(dataset_url, partial_path)
            # Same-directory rename: the final path appears only once complete.
            os.replace(partial_path, dataset_path)
        finally:
            # After a successful replace the .part file is gone; this only
            # cleans up after a failed or interrupted download.
            if os.path.exists(partial_path):
                os.remove(partial_path)

    return Dataset.load(name="discount_benchmark", backend="local/csv", root_dir=current_dir)

In [None]:
import json
from ragas.metrics.discrete import discrete_metric
from ragas.metrics.result import MetricResult

@discrete_metric(name="discount_accuracy", allowed_values=["correct", "incorrect"])
def discount_accuracy(prediction: str, expected_discount):
    """Check whether the predicted discount matches the expected one.

    Args:
        prediction: Raw model response; expected to be a JSON object containing
            a ``discount_percentage`` field.
        expected_discount: Ground-truth discount; converted with ``int()``.

    Returns:
        MetricResult with value "correct" when ``discount_percentage`` equals the
        expected integer, otherwise "incorrect" with an explanatory reason. A
        prediction that is not valid JSON, or is not a JSON object, is scored
        "incorrect" instead of raising, so one malformed response cannot abort
        the scoring of a whole experiment.

    Raises:
        ValueError / TypeError: If ``expected_discount`` cannot be converted to int.
    """
    expected_discount_int = int(expected_discount)

    try:
        parsed_json = json.loads(prediction)
    except (TypeError, ValueError) as exc:
        # json.JSONDecodeError is a ValueError; TypeError covers a None prediction.
        return MetricResult(
            value="incorrect",
            reason=f"Expected discount={expected_discount_int}%; prediction is not valid JSON ({exc})"
        )

    if not isinstance(parsed_json, dict):
        # Valid JSON such as a list or bare number has no .get().
        return MetricResult(
            value="incorrect",
            reason=(
                f"Expected discount={expected_discount_int}%; "
                f"prediction is a JSON {type(parsed_json).__name__}, not an object"
            )
        )

    predicted_discount = parsed_json.get("discount_percentage")

    if predicted_discount == expected_discount_int:
        return MetricResult(
            value="correct", 
            reason=f"Correctly calculated discount={expected_discount_int}%"
        )
    else:
        return MetricResult(
            value="incorrect",
            reason=f"Expected discount={expected_discount_int}%; Got discount={predicted_discount}%"
        )

In [3]:
from ragas import experiment

@experiment()
async def benchmark_experiment(row, model_name: str):
    """Run one dataset row through `model_name` and score the answer.

    Args:
        row: Mapping with at least "customer_profile" and "expected_discount".
        model_name: Model passed through to run_prompt.

    Returns:
        A dict holding every field of `row` plus "model", "response",
        "predicted_discount", "score" and "score_reason".
    """
    raw_response = await run_prompt(row["customer_profile"], model=model_name)

    # Best-effort extraction of the numeric answer for the output table; any
    # parsing problem simply leaves the column empty.
    try:
        discount = json.loads(raw_response).get('discount_percentage')
    except Exception:
        discount = None

    # The metric receives the raw response text and does its own parsing.
    verdict = discount_accuracy.score(
        prediction=raw_response,
        expected_discount=row["expected_discount"],
    )

    experiment_fields = {
        "model": model_name,
        "response": raw_response,
        "predicted_discount": discount,
        "score": verdict.value,
        "score_reason": verdict.reason,
    }
    return {**row, **experiment_fields}

In [12]:

# Load dataset
dataset = load_dataset()
print(f"Dataset loaded with {len(dataset)} samples")

# Run baseline experiment
baseline_results = await benchmark_experiment.arun(
    dataset,
    name="gpt-4.1-nano-2025-04-14",
    model_name="gpt-4.1-nano-2025-04-14"
)

# Calculate and display accuracy
baseline_accuracy = sum(1 for r in baseline_results if r["score"] == "correct") / len(baseline_results)
print(f"Baseline Accuracy: {baseline_accuracy:.2%}")

# Run candidate experiment
candidate_results = await benchmark_experiment.arun(
    dataset,
    name="gpt-5-nano-2025-08-07",
    model_name="gpt-5-nano-2025-08-07"
)

# Calculate and display accuracy
candidate_accuracy = sum(1 for r in candidate_results if r["score"] == "correct") / len(candidate_results)
print(f"Candidate Accuracy: {candidate_accuracy:.2%}")

Dataset loaded with 10 samples


Running experiment: 100%|██████████| 10/10 [00:03<00:00,  2.77it/s]


Baseline Accuracy: 50.00%


Running experiment: 100%|██████████| 10/10 [00:13<00:00,  1.37s/it]

Candidate Accuracy: 100.00%





In [18]:
import pandas as pd
from typing import List, Optional
import datetime

def _unique_experiment_name(name: str, taken: List[str]) -> str:
    """Return `name`, or `name_<k>` (smallest k >= 2) if `name` is already in `taken`."""
    if name not in taken:
        return name
    suffix = 2
    while f"{name}_{suffix}" in taken:
        suffix += 1
    return f"{name}_{suffix}"


def compare_inputs_to_output(
    inputs: List[str], output_path: Optional[str] = None
) -> str:
    """Compare multiple experiment CSVs and write a combined CSV.

    - Requires 'id' column in all inputs; uses it as the alignment key
    - Builds output with id + canonical columns + per-experiment response/score/reason columns
    - Experiments are named after the first value of each input's 'model' column;
      a repeated name gets a numeric suffix ("name_2", ...) so that two runs of
      the same model do not overwrite each other's columns
    - Prints a per-experiment accuracy summary

    Args:
        inputs: Paths of at least two experiment CSV files.
        output_path: Destination CSV. None or blank generates a timestamped name
            under ``<cwd>/experiments``; a relative path is placed under that
            directory; an absolute path is used as given. Missing parent
            directories are created.

    Returns:
        The full output path.

    Raises:
        ValueError: Fewer than two inputs, an input with no rows, missing
            required columns, duplicate ids within an input, or inputs whose id
            sets differ.
    """
    if not inputs or len(inputs) < 2:
        raise ValueError("At least two input CSV files are required for comparison")

    canonical_cols = ["customer_profile", "description", "expected_discount"]
    result_cols = ["response", "score", "score_reason"]

    # Load all inputs
    dataframes = []
    experiment_names: List[str] = []
    for path in inputs:
        df = pd.read_csv(path)
        if "model" not in df.columns:
            raise ValueError(f"Missing 'model' column in {path}")
        if df.empty:
            # Without rows there is no model name to read and nothing to compare.
            raise ValueError(f"Input {path} contains no rows")
        exp_name = _unique_experiment_name(str(df["model"].iloc[0]), experiment_names)
        experiment_names.append(exp_name)
        dataframes.append(df)

    base_df = dataframes[0]

    # Require 'id' in all inputs
    if not all("id" in df.columns for df in dataframes):
        raise ValueError(
            "All input CSVs must contain an 'id' column to align rows. Re-run experiments after adding 'id' to your dataset."
        )

    # Validate duplicates and matching sets of IDs (compared as strings so that
    # integer and string ids from different files still line up)
    key_sets = []
    for idx, df in enumerate(dataframes):
        keys = df["id"].astype(str)
        if keys.duplicated().any():
            dupes = keys[keys.duplicated()].head(3).tolist()
            raise ValueError(
                f"Input {inputs[idx]} contains duplicate id values. Examples: {dupes}"
            )
        key_sets.append(set(keys.tolist()))

    base_keys = key_sets[0]
    for i, ks in enumerate(key_sets[1:], start=1):
        if ks != base_keys:
            missing_in_other = list(base_keys - ks)[:5]
            missing_in_base = list(ks - base_keys)[:5]
            raise ValueError(
                "Inputs do not contain the same set of IDs.\n"
                f"- Missing in file {i + 1}: {missing_in_other}\n"
                f"- Extra in file {i + 1}: {missing_in_base}"
            )

    # Validate canonical columns exist in base
    missing = [c for c in canonical_cols if c not in base_df.columns]
    if missing:
        raise ValueError(f"First CSV missing required columns: {missing}")

    # Validate per-experiment columns up front, before any output is built
    for df in dataframes:
        for col in result_cols:
            if col not in df.columns:
                raise ValueError(
                    f"Column '{col}' not found in one input. Please provide per-row '{col}'."
                )

    # Build combined on base order using 'id' as alignment key
    base_ids_str = base_df["id"].astype(str)
    combined = base_df[["id"] + canonical_cols].copy()

    # Append per-experiment outputs by aligned ID
    for df, exp_name in zip(dataframes, experiment_names):
        by_id = df.assign(id=df["id"].astype(str)).set_index("id")
        for col in result_cols:
            combined[f"{exp_name}_{col}"] = base_ids_str.map(by_id[col])

    # Determine output path
    experiments_dir = os.path.join(os.getcwd(), "experiments")
    if output_path is None or output_path.strip() == "":
        run_id = datetime.datetime.now().strftime("%Y%m%d-%H%M%S")
        output_path = os.path.join(experiments_dir, f"{run_id}-comparison.csv")
    elif not os.path.isabs(output_path):
        # If relative path, place under experiments dir
        output_path = os.path.join(experiments_dir, output_path)

    # Create the directory the file actually lands in; this also covers
    # absolute paths and relative paths containing sub-directories.
    parent_dir = os.path.dirname(output_path)
    if parent_dir:
        os.makedirs(parent_dir, exist_ok=True)

    # Sort by id for user-friendly reading
    combined = combined.sort_values(by="id").reset_index(drop=True)
    combined.to_csv(output_path, index=False)

    # Print per-experiment accuracy summary ('score' was validated above)
    for df, exp_name in zip(dataframes, experiment_names):
        acc = (df["score"] == "correct").mean()
        print(f"{exp_name} Accuracy: {acc:.2%}")

    return output_path

In [19]:


# Compare the two experiment results
# Update these paths to match your actual experiment output files
experiment_csvs = [
    "experiments/gpt-4.1-nano-2025-04-14.csv",
    "experiments/gpt-5-nano-2025-08-07.csv",
]
output_path = compare_inputs_to_output(inputs=experiment_csvs)

print(f"Comparison saved to: {output_path}")

gpt-4.1-nano-2025-04-14 Accuracy: 50.00%
gpt-5-nano-2025-08-07 Accuracy: 100.00%
Comparison saved to: /Users/ksulahian/ML_journey/random_code/ragas/experiments/20251212-160802-comparison.csv


In [21]:
# Reload the combined CSV written by compare_inputs_to_output and preview its first rows.
# NOTE(review): `result` was bound earlier to the raw JSON string returned by
# run_prompt; this rebinding changes its type to a DataFrame.
result = pd.read_csv(output_path)
result.head()

Unnamed: 0,id,customer_profile,description,expected_discount,gpt-4.1-nano-2025-04-14_response,gpt-4.1-nano-2025-04-14_score,gpt-4.1-nano-2025-04-14_score_reason,gpt-5-nano-2025-08-07_response,gpt-5-nano-2025-08-07_score,gpt-5-nano-2025-08-07_score_reason
0,1,Martha is a 70-year-old retiree who enjoys gar...,Senior only,15,"{\n ""discount_percentage"": 15,\n ""reason"": ""...",correct,Correctly calculated discount=15%,"{\n ""discount_percentage"": 15,\n ""reason"": ""...",correct,Correctly calculated discount=15%
1,2,"Arjun, aged 19, is a full-time computer-scienc...",Student only,15,"{\n ""discount_percentage"": 0,\n ""reason"": ""N...",incorrect,Expected discount=15%; Got discount=0%,"{\n ""discount_percentage"": 15,\n ""reason"": ""...",correct,Correctly calculated discount=15%
2,3,"Cynthia, a 40-year-old freelance artist, earns...",Low income only,20,"{\n ""discount_percentage"": 20,\n ""reason"": ""...",correct,Correctly calculated discount=20%,"{\n ""discount_percentage"": 20,\n ""reason"": ""...",correct,Correctly calculated discount=20%
3,4,"Mr. Ocampo is 68, lives on social security of ...","Senior, low income, new customer (capped)",35,"{\n ""discount_percentage"": 35,\n ""reason"": ""...",correct,Correctly calculated discount=35%,"{\n ""discount_percentage"": 35,\n ""reason"": ""...",correct,Correctly calculated discount=35%
4,5,Hannah is a 24-year-old postgraduate student d...,"Student, low income, premium 3 yrs (capped)",35,"{\n ""discount_percentage"": 35,\n ""reason"": ""...",correct,Correctly calculated discount=35%,"{\n ""discount_percentage"": 35,\n ""reason"": ""...",correct,Correctly calculated discount=35%
