In [None]:
import json
import pandas as pd
from datasets import Dataset
from itertools import product
from tqdm.auto import tqdm

from ragas import evaluate
from ragas.metrics import faithfulness, answer_relevancy, context_precision

from search import SurfSpotRetriever
from forecast import get_weekend_forecast
from report_generator import SurfReportGenerator

with open("ground_truth_rag_surf.json", "r", encoding="utf-8") as f:
    all_gt = json.load(f)

forecast = get_weekend_forecast()

generation_models = ["gpt-4o", "o3-mini"]
temperatures      = [0.3, 0.7]
top_ks            = [3, 10]

all_summaries = []

for gen_model, temperature, top_k in tqdm(
    product(generation_models, temperatures, top_ks),
    total=len(generation_models)*len(temperatures)*len(top_ks),
    desc="Param configs"
):
    
    retriever = SurfSpotRetriever()
    generator = SurfReportGenerator(
        spots=[],  # placeholder, we'll fill per‐query
        forecast=forecast,
        generation_model=gen_model,
        temperature=temperature
    )

    
    rows = []
    for item in tqdm(all_gt, desc="Queries", leave=False):
        q  = item["Query"]
        gt = item["Expected Answer"]
        
        spots = retriever.retrieve_spots(
            user_query=q,
            preferred_direction=item["Direction of Wave"],
            preferred_bottom=item["Type of Bottom"],
            top_k=top_k
        )
        contexts = [s["description"] for s in spots]
        
        answer = generator.generate_report(q)

        rows.append({
            "question":           q,
            "ground_truth":       gt,
            "answer":             answer,
            "retrieved_contexts": contexts
        })

    # Build a HuggingFace Dataset for RAGAS
    df = pd.DataFrame(rows)
    ds = Dataset.from_pandas(df)

    # RAGAS evaluation on the three selected metrics
    results = evaluate(
        ds,
        metrics=[faithfulness, answer_relevancy, context_precision],
        show_progress=False  
    )

    # Aggregate each metric’s mean score
    summary = {
        "model":            gen_model,
        "temperature":      temperature,
        "top_k":            top_k,
        "faithfulness":     sum(results["faithfulness"]) / len(results["faithfulness"]),
        "answer_relevancy": sum(results["answer_relevancy"]) / len(results["answer_relevancy"]),
        "context_precision":sum(results["context_precision"]) / len(results["context_precision"])
    }
    all_summaries.append(summary)


results_df = pd.DataFrame(all_summaries)


print("## Final Evaluation Summary")
print(results_df)


Param configs:   0%|          | 0/8 [00:00<?, ?it/s]

Queries:   0%|          | 0/59 [00:00<?, ?it/s]

Queries:   0%|          | 0/59 [00:00<?, ?it/s]

Queries:   0%|          | 0/59 [00:00<?, ?it/s]

Queries:   0%|          | 0/59 [00:00<?, ?it/s]

Queries:   0%|          | 0/59 [00:00<?, ?it/s]

Queries:   0%|          | 0/59 [00:00<?, ?it/s]

Queries:   0%|          | 0/59 [00:00<?, ?it/s]

Queries:   0%|          | 0/59 [00:00<?, ?it/s]

## Final Evaluation Summary
     model  temperature  top_k  faithfulness  answer_relevancy  \
0   gpt-4o          0.3      3      0.257175          0.801051   
1   gpt-4o          0.3     10      0.272369          0.815170   
2   gpt-4o          0.7      3      0.295014          0.831306   
3   gpt-4o          0.7     10      0.270825          0.784918   
4  o3-mini          0.3      3      0.365034          0.844974   
5  o3-mini          0.3     10      0.405670          0.796703   
6  o3-mini          0.7      3      0.485295          0.827477   
7  o3-mini          0.7     10      0.436700          0.783967   

   context_precision  
0           0.454802  
1           0.452448  
2           0.460452  
3           0.451507  
4           0.454802  
5           0.446798  
6           0.454802  
7           0.453390  


In [2]:
results_df.to_pickle("full_eval_summary.pkl")