# 0. Install dependencies

In [1]:
pip install ragas langchain-openai

Collecting ragas
  Downloading ragas-0.4.3-py3-none-any.whl.metadata (23 kB)
Collecting langchain-openai
  Downloading langchain_openai-1.1.7-py3-none-any.whl.metadata (2.6 kB)
Collecting appdirs (from ragas)
  Downloading appdirs-1.4.4-py2.py3-none-any.whl.metadata (9.0 kB)
Collecting diskcache>=5.6.3 (from ragas)
  Downloading diskcache-5.6.3-py3-none-any.whl.metadata (20 kB)
Collecting instructor (from ragas)
  Downloading instructor-1.14.4-py3-none-any.whl.metadata (12 kB)
Collecting scikit-network (from ragas)
  Downloading scikit_network-0.33.5-cp312-cp312-manylinux_2_17_x86_64.manylinux2014_x86_64.whl.metadata (4.5 kB)
Collecting langchain-community (from ragas)
  Downloading langchain_community-0.4.1-py3-none-any.whl.metadata (3.0 kB)
Collecting jiter<1,>=0.10.0 (from openai>=1.0.0->ragas)
  Downloading jiter-0.11.1-cp312-cp312-manylinux_2_17_x86_64.manylinux2014_x86_64.whl.metadata (5.2 kB)
Collecting pre-commit>=4.3.0 (from instructor->ragas)
  Downloading pre_commit-4.5.1-py

# 1. Imports & API key

In [2]:
import os
import pandas as pd
from datasets import Dataset

from ragas import evaluate
## Three RAGAS metrics
from ragas.metrics import (
    answer_correctness,
    faithfulness,
    context_precision,
)

from ragas.run_config import RunConfig

from langchain_openai import ChatOpenAI



All support for the `google.generativeai` package has ended. It will no longer be receiving 
updates or bug fixes. Please switch to the `google.genai` package as soon as possible.
See README for more details:

https://github.com/google-gemini/deprecated-generative-ai-python/blob/main/README.md

  loader.exec_module(module)
  from ragas.metrics import (
  from ragas.metrics import (
  from ragas.metrics import (


# 2. Load GPT-4.1 nano as the judge

In [3]:
# Colab-only: pull the key from the Colab secrets panel instead of hardcoding it.
from google.colab import userdata

# Export the key as an environment variable; presumably the ChatOpenAI client
# created in the next cell reads it from there (no key is passed explicitly).
os.environ["OPENAI_API_KEY"] = userdata.get('OPENAI_API_KEY')

In [4]:
# Judge model handed to every `evaluate(..., llm=judge_llm)` call in this notebook.
# NOTE(review): `timeout` and `request_timeout` look like two spellings of the same
# client setting (both are 120 here) — confirm against the ChatOpenAI reference
# before dropping either one.
judge_llm = ChatOpenAI(
    model="gpt-4.1-nano",
    temperature=0.0,
    max_tokens=2048,     # safer for structured output
    timeout=120,        # per API call
    request_timeout=120 # explicit safeguard (LangChain quirk)
)

# 3. Reused Functions (Load dataset, Summary result)

In [6]:
from datasets import Dataset
import pandas as pd

def load_ragas_dataset(
    file_path,
    model_tag: str,
    use_context: bool = False,
):
    """Read an evaluation workbook and shape it into a RAGAS-ready dataset.

    The workbook must contain ``user_input``, ``ground_truth`` and the two
    per-model answer columns ``<model_tag>_answer_disease`` and
    ``<model_tag>_answer_reasoning``. The two answer parts are merged into a
    single ``response`` of the form ``"Diagnosis: ...\\nReason: ..."``.

    Args:
        file_path: Workbook location, passed straight to ``pd.read_excel``.
        model_tag: Column prefix identifying the model whose answers are scored.
        use_context: When True, ``<model_tag>_context`` is required and each
            non-empty cell becomes a one-element ``retrieved_contexts`` list.
            When False every row gets an empty context list.

    Returns:
        The result of ``Dataset.from_pandas`` over the columns ``user_input``,
        ``response``, ``reference`` and ``retrieved_contexts``.

    Raises:
        KeyError: If a required column (or the context column when
            ``use_context`` is True) is missing from the workbook.
    """
    df = pd.read_excel(file_path)

    disease_col = f"{model_tag}_answer_disease"
    reasoning_col = f"{model_tag}_answer_reasoning"
    context_col = f"{model_tag}_context"

    required = ["user_input", "ground_truth", disease_col, reasoning_col]
    for col in required:
        if col not in df.columns:
            raise KeyError(f"❌ Missing column: {col}")

    if use_context and context_col not in df.columns:
        raise KeyError(f"❌ Missing context column: {context_col}")

    def as_text(value, strip=False):
        # Empty spreadsheet cells arrive as NaN/None. Calling str() on them would
        # put the literal text "nan"/"None" into what the judge model reads.
        if pd.isna(value):
            return ""
        text = str(value)
        return text.strip() if strip else text

    # Built with a plain comprehension (not df.apply(axis=1)) so a workbook with
    # zero data rows still yields an empty, well-formed column.
    df["response"] = [
        f"Diagnosis: {as_text(disease, strip=True)}\n"
        f"Reason: {as_text(reasoning, strip=True)}"
        for disease, reasoning in zip(df[disease_col], df[reasoning_col])
    ]

    df = df.rename(columns={"ground_truth": "reference"})

    if use_context:
        def normalize_context(x):
            # One context string per row, wrapped in a list; blank/NaN -> no context.
            x = str(x).strip()
            return [x] if x and x.lower() != "nan" else []
        df["retrieved_contexts"] = df[context_col].apply(normalize_context)
    else:
        df["retrieved_contexts"] = [[] for _ in range(len(df))]

    # Force plain strings for the remaining text fields, mapping missing cells to "".
    for col in ["user_input", "reference"]:
        df[col] = [as_text(value) for value in df[col]]

    return Dataset.from_pandas(
        df[["user_input", "response", "reference", "retrieved_contexts"]]
    )


# 4. Running RAGAS

In [None]:
# Workbook holding the questions, ground truth and per-model answers; passed to
# load_ragas_dataset in the "Gwen finetune" section below.
DATASET_FILE = "ragas_evaluation_dataset.xlsx"
# NOTE(review): SUMMARY_FILE is not referenced by any cell visible in this notebook.
SUMMARY_FILE = "summary_metrics.xlsx"

### 4.1 Gwen base

In [None]:
# Build the RAGAS dataset for the base model. use_context=True means the workbook
# must also contain a `gwen_base_context` column (load_ragas_dataset raises KeyError
# otherwise).
dataset_gwen_base = load_ragas_dataset(
    file_path="ragas_evaluation_dataset.xlsx",
    model_tag="gwen_base",
    use_context=True,
)
print("Rows:", len(dataset_gwen_base))
# Show the first record as a quick sanity check of the reshaped columns.
dataset_gwen_base[0]

In [None]:
# Sequential execution (max_workers=1) with retries and tenacity retry logging on.
run_config = RunConfig(
    timeout=300,        # per sample
    max_retries=5,
    max_workers=1,
    log_tenacity=True
)

# Score the base model's answers against the reference with the judge LLM.
result = evaluate(
    dataset=dataset_gwen_base,
    metrics=[answer_correctness],
    llm=judge_llm,
    run_config=run_config,
)

# `result` is reassigned by the later evaluate() cells, so keep this run's
# per-sample scores under their own name and on disk, mirroring how the other
# runs in this notebook save theirs.
df_scores_gwen_base = result.to_pandas()
df_scores_gwen_base.to_csv("gwen_base_answer_correctness.csv", index=False)
print("✅ Saved: gwen_base_answer_correctness.csv")

### 4.2 Gwen finetune

In [None]:
# Same run settings as the base-model evaluation: sequential, retried, retry-logged.
run_config = RunConfig(
    timeout=300,        # per sample
    max_retries=5,
    max_workers=1,
    log_tenacity=True
)

# The fine-tuned model is scored without retrieved context, so every row gets an
# empty `retrieved_contexts` list.
dataset = load_ragas_dataset(
    DATASET_FILE,
    model_tag="gwen_finetune",
    use_context=False
)

# Only answer_correctness is computed for this run; this overwrites `result`
# from any earlier evaluate() cell.
result = evaluate(
    dataset=dataset,
    metrics=[answer_correctness],
    llm=judge_llm,
    run_config=run_config,
)

Evaluating:   0%|          | 0/33 [00:00<?, ?it/s]

In [None]:
# Per-sample scores of the most recent evaluate() call as a DataFrame.
df_scores = result.to_pandas()
df_scores.head()

Unnamed: 0,user_input,retrieved_contexts,response,reference,answer_correctness
0,"A 53-year-old woman presented with fever, coug...",[],Diagnosis: Malaria\nReason: The patient presen...,The diagnosis is enteric fever caused by Salmo...,0.212907
1,A 26-year-old woman presents one week after re...,[],Diagnosis: Malaria\nReason: The patient's symp...,"The diagnosis is malaria, caused by a Plasmodi...",0.556483
2,A 72-year-old man who lives in the south of Sp...,[],Diagnosis: Acquired Immunodeficiency Syndrome ...,"The diagnosis is visceral leishmaniasis, likel...",0.212084
3,A 24-year-old man presents after a 3-month bac...,[],Diagnosis: Giardiasis\nReason: The presence of...,"The final diagnosis is giardiasis, caused by t...",0.532174
4,A 60-year-old man from Ohio with a history of ...,[],Diagnosis: Severe respiratory failure due to i...,"The diagnosis is histoplasmosis, caused by the...",0.63553


In [None]:
# Persist the per-sample scores so they survive `result`/`df_scores` being
# overwritten by later cells.
df_scores.to_csv("gwen_finetune_answer_correctness.csv", index=False)
print("✅ Saved: gwen_finetune_answer_correctness.csv")

✅ Saved: gwen_finetune_answer_correctness.csv


### 4.3 Gemini + RAG

In [7]:
# Gemini answers live in their own workbook; use_context=True requires a
# `gemini_context` column and wraps each row's context string in a list.
dataset_gemini = load_ragas_dataset(
    file_path="ragas_evaluation_dataset_gemini.xlsx",
    model_tag="gemini",
    use_context=True,
)
print("Rows:", len(dataset_gemini))
# Show the first record as a quick sanity check of the reshaped columns.
dataset_gemini[0]

Rows: 33


{'user_input': 'A 53-year-old woman presented with fever, cough, and malaise after returning from a visit to Lahore. On examination, her temperature was 38°C and she had a rash on her upper chest. A chest X-ray showed patchy basal consolidation and a full blood count revealed a relative lymphocytosis. Malaria films were negative. Blood cultures were drawn and later grew gram-negative bacilli.',
 'response': 'Diagnosis: Typhoid fever\nReason: The diagnosis is based on the clinical presentation of fever and rash following travel to an endemic area (Pakistan), supported by the definitive laboratory finding of gram-negative bacilli in the blood culture.',
 'reference': "The diagnosis is enteric fever caused by Salmonella typhi. This is supported by the patient's presentation with a febrile illness, rash, and relative lymphocytosis after returning from an endemic area (Lahore). The diagnosis was confirmed by the isolation of Salmonella typhi from blood cultures.",
 'retrieved_contexts': ['R

In [8]:
# Same run settings as the other evaluations: sequential, retried, retry-logged.
run_config = RunConfig(
    timeout=300,        # per sample
    max_retries=5,
    max_workers=1,
    log_tenacity=True
)

# Three metrics over the Gemini + RAG dataset (the recorded progress bar shows
# 99 jobs for the 33 rows). This overwrites `result` from earlier evaluate() cells.
result = evaluate(
    dataset=dataset_gemini,
    metrics=[
        answer_correctness,
        faithfulness,
        context_precision
    ],
    llm=judge_llm,
    run_config=run_config,
)

Evaluating:   0%|          | 0/99 [00:00<?, ?it/s]

ERROR:ragas.executor:Exception raised in Job[48]: OutputParserException(Failed to parse StringIO from completion {"TP": [{"statement": "The diagnosis is Cutaneous Leishmaniasis.", "reason": "This statement is directly supported by the ground truth, which explicitly states that the diagnosis is Cutaneous Leishmaniasis."}, {"statement": "The patient's presentation includes a painless papule that evolved slowly over years.", "reason": "This matches the ground truth description of a painless, slowly expanding papule that became ulcerated over several years."}, {"statement": "The papule developed into a chronic, non-healing ulcer.", "reason": "This is supported by the ground truth indicating the lesion became ulcerated and failed to heal over years."}], "FP": [{"statement": "The reason for this diagnosis is that the patient's origin is from an endemic region, which is Peru.", "reason": "While the ground truth mentions Peru as an endemic area, it does not explicitly state that the patient's 

In [9]:
# Keep the Gemini per-sample scores under their own name and on disk; the summary
# cell in section 5.3 reads `df_score_gemini`.
# NOTE(review): the recorded run logged an OutputParserException for one job and the
# summary later counts 32 (not 33) answer_correctness scores, so a failed job
# presumably shows up as NaN here — confirm.
df_score_gemini = result.to_pandas()
df_score_gemini.to_csv("df_score_gemini_scores.csv", index=False)

print("✅ Saved: df_score_gemini_scores.csv")
df_score_gemini.head()

✅ Saved: df_score_gemini_scores.csv


Unnamed: 0,user_input,retrieved_contexts,response,reference,answer_correctness,faithfulness,context_precision
0,"A 53-year-old woman presented with fever, coug...",[Reference cases (supporting evidence only):\n...,Diagnosis: Typhoid fever\nReason: The diagnosi...,The diagnosis is enteric fever caused by Salmo...,0.399963,0.0,1.0
1,A 26-year-old woman presents one week after re...,[Reference cases (supporting evidence only):\n...,Diagnosis: Malaria (Plasmodium falciparum)\nRe...,"The diagnosis is malaria, caused by a Plasmodi...",0.717428,0.0,1.0
2,A 72-year-old man who lives in the south of Sp...,[Reference cases (supporting evidence only):\n...,Diagnosis: Visceral Leishmaniasis\nReason: The...,"The diagnosis is visceral leishmaniasis, likel...",0.880538,0.166667,1.0
3,A 24-year-old man presents after a 3-month bac...,[Reference cases (supporting evidence only):\n...,Diagnosis: Giardiasis\nReason: The patient's s...,"The final diagnosis is giardiasis, caused by t...",0.735727,1.0,1.0
4,A 60-year-old man from Ohio with a history of ...,[Reference cases (supporting evidence only):\n...,Diagnosis: Disseminated Histoplasmosis\nReason...,"The diagnosis is histoplasmosis, caused by the...",0.762972,0.0,0.0


# 5. Report Result

In [10]:
import numpy as np
import pandas as pd

def summarize_metric(scores, model_name, metric_name, n_boot=1000, seed=42):
    """Summarise one metric's per-sample scores as a single-row DataFrame.

    Missing scores (NaN or None) are dropped before anything is computed.

    Args:
        scores: Sequence / array / Series of numeric scores; may contain NaN/None.
        model_name: Label stored in the ``model_name`` column.
        metric_name: Label stored in the ``metric`` column.
        n_boot: Number of bootstrap resamples used for the confidence interval.
        seed: Seed for the bootstrap random generator, so results are repeatable.

    Returns:
        DataFrame with one row and the columns ``model_name``, ``metric``,
        ``num_samples``, ``mean``, ``std`` (sample std, ddof=1), ``ci_95_lower``
        and ``ci_95_upper`` (2.5th / 97.5th percentile of the bootstrap means).
        With no valid scores all statistics are NaN; with a single valid score
        ``std`` is NaN.

    Raises:
        ValueError: If ``n_boot`` is smaller than 1.
    """
    if n_boot < 1:
        raise ValueError("n_boot must be at least 1")

    # dtype=float turns None into NaN, so object-typed input no longer breaks isnan.
    values = np.asarray(scores, dtype=float)
    values = values[~np.isnan(values)]
    N = int(values.size)

    nan = float("nan")
    if N == 0:
        # Nothing to summarise: report NaN explicitly instead of letting numpy
        # emit "mean of empty slice" warnings.
        mean = std = ci_low = ci_high = nan
    else:
        mean = values.mean()
        # The sample standard deviation is undefined for a single observation.
        std = values.std(ddof=1) if N > 1 else nan

        rng = np.random.default_rng(seed)
        # One draw per resample, in a loop, so a given seed keeps producing the
        # same interval as before.
        boot_means = [
            rng.choice(values, size=N, replace=True).mean()
            for _ in range(n_boot)
        ]
        ci_low, ci_high = np.percentile(boot_means, [2.5, 97.5])

    return pd.DataFrame([{
        "model_name": model_name,
        "metric": metric_name,
        "num_samples": N,
        "mean": mean,
        "std": std,
        "ci_95_lower": ci_low,
        "ci_95_upper": ci_high,
    }])


### 5.1 Gwen base

In [None]:
# NOTE(review): `result` is reassigned by every evaluate() cell above. On a
# top-to-bottom run it holds the Gemini evaluation by the time this cell executes,
# so confirm it is the gwen_base result before trusting the "gwen_base" label.
df_scores = result.to_pandas()

# One-row summary (mean, std, bootstrap 95% CI) of answer_correctness.
report = summarize_metric(
    df_scores["answer_correctness"],
    model_name="gwen_base",
    metric_name="answer_correctness"
)

report.to_excel("ragas_answer_correctness_report.xlsx", index=False)

### 5.2 Gwen finetune

In [None]:
# Read the fine-tuned model's per-sample scores back from the CSV written in
# section 4.2 rather than from `result`: `result` is overwritten by the Gemini
# evaluation, so on a top-to-bottom run it no longer holds this model's scores.
df_scores = pd.read_csv("gwen_finetune_answer_correctness.csv")

# One-row summary (mean, std, bootstrap 95% CI) of answer_correctness.
report = summarize_metric(
    df_scores["answer_correctness"],
    model_name="gwen_finetune",
    metric_name="answer_correctness"
)

# Model-specific file name so this report cannot overwrite another model's report.
report.to_excel("gwen_finetune_answer_correctness_report.xlsx", index=False)

### 5.3 Gemini + RAG

In [11]:
# Collect one summary row per metric for the Gemini + RAG run.
summaries = []

for metric in ["answer_correctness", "faithfulness", "context_precision"]:
    summaries.append(
        summarize_metric(
            scores=df_score_gemini[metric],
            model_name="gemini",
            metric_name=metric
        )
    )

# Stack the single-row frames into one table; this overwrites any earlier `summary_df`.
summary_df = pd.concat(summaries, ignore_index=True)
summary_df

Unnamed: 0,model_name,metric,num_samples,mean,std,ci_95_lower,ci_95_upper
0,gemini,answer_correctness,32,0.595933,0.1658,0.533011,0.652245
1,gemini,faithfulness,33,0.284885,0.316649,0.177771,0.399239
2,gemini,context_precision,33,0.787879,0.415149,0.636364,0.909091


In [12]:
# Persist the Gemini summary table built in the previous cell.
summary_df.to_csv("gemini_summary.csv", index=False)

In [None]:
import numpy as np

# NOTE(review): `df_scores_rag` is not assigned in any cell visible in this notebook,
# so this cell raises NameError on a fresh kernel unless it is defined elsewhere.
# Unlike the other summaries, mean/std are computed inline here and no bootstrap
# confidence interval is reported.
summary = []

for metric in ["answer_correctness", "faithfulness", "context_precision"]:
    # Drop missing scores before computing the statistics.
    scores = df_scores_rag[metric].dropna().values
    summary.append({
        "model_name": "gwen_rag",
        "metric": metric,
        "num_samples": len(scores),
        "mean": scores.mean(),
        "std": scores.std(ddof=1),
    })

# Overwrites `summary_df` from the previous section, then saves it as CSV and Excel.
summary_df = pd.DataFrame(summary)
summary_df.to_csv("gwen_rag_ragas_summary.csv", index=False)
summary_df.to_excel("gwen_rag_ragas_summary.xlsx", index=False)

# 6. Evaluate an untidy dataset

In [None]:
# NOTE(review): this rebinds DATASET_FILE to the "_1" workbook, but the load cell
# further down passes "ragas_evaluation_dataset_ft_rag_4.xlsx" as a literal, so this
# constant appears unused in this section — confirm which file is intended.
DATASET_FILE = "ragas_evaluation_dataset_ft_rag_1.xlsx"
# NOTE(review): SUMMARY_FILE is not referenced by any cell visible in this notebook.
SUMMARY_FILE = "summary_metrics.xlsx"

In [None]:
from datasets import Dataset
import pandas as pd
from pathlib import Path

def load_ragas_dataset(
    file_path,
    model_tag: str = "gwen_rag",  # Default to the tag in your file
    use_context: bool = True,
):
    """Load a single-answer-column evaluation file as a RAGAS-ready dataset.

    This redefines ``load_ragas_dataset``: once this cell runs it replaces the
    definition earlier in the notebook, which expects separate
    ``<model_tag>_answer_disease`` / ``<model_tag>_answer_reasoning`` columns.
    Here the answer is taken as-is from ``<model_tag>_answer``.

    Args:
        file_path: Location of the data file. It is read as Excel first and, if
            that fails to parse, as CSV, regardless of the file extension.
        model_tag: Column prefix identifying the model whose answers are scored.
        use_context: When True, ``<model_tag>_context`` is required and each
            non-empty cell becomes a one-element ``retrieved_contexts`` list.
            When False every row gets an empty context list.

    Returns:
        The result of ``Dataset.from_pandas`` over the columns ``user_input``,
        ``response``, ``reference`` and ``retrieved_contexts``.

    Raises:
        FileNotFoundError: If ``file_path`` does not exist.
        ImportError: If the Excel reader's engine is not installed.
        KeyError: If a required column is missing.
    """
    # -----------------------------
    # 1. LOAD DATA
    # -----------------------------
    # Try Excel first, then CSV, regardless of extension. A missing file or a
    # missing Excel engine is not a format problem, so those errors are raised
    # as they are instead of being replaced by a misleading CSV parse error.
    try:
        df = pd.read_excel(file_path)
    except (FileNotFoundError, ImportError):
        raise
    except Exception:
        df = pd.read_csv(file_path)

    # -----------------------------
    # 2. DEFINE COLUMNS
    # -----------------------------
    answer_col = f"{model_tag}_answer"
    context_col = f"{model_tag}_context"

    # Check for required columns
    required = ["user_input", "ground_truth", answer_col]
    if use_context:
        required.append(context_col)

    for col in required:
        if col not in df.columns:
            raise KeyError(f"❌ Missing column: '{col}' in {file_path}")

    # -----------------------------
    # 3. TRANSFORM
    # -----------------------------
    def as_text(value, strip=False):
        # Empty cells arrive as NaN/None. Calling str() on them would put the
        # literal text "nan"/"None" into what the judge model reads.
        if pd.isna(value):
            return ""
        text = str(value)
        return text.strip() if strip else text

    # Rename ground_truth -> reference
    df = df.rename(columns={"ground_truth": "reference"})

    # Map answer -> response
    # (No reconstruction needed, just clean the text)
    df["response"] = [as_text(value, strip=True) for value in df[answer_col]]

    # Map context -> retrieved_contexts
    if use_context:
        def normalize_context(x):
            x = str(x).strip()
            # Ragas expects a list of strings.
            # We wrap the context string in a list [x].
            return [x] if x and x.lower() != "nan" else []

        df["retrieved_contexts"] = df[context_col].apply(normalize_context)
    else:
        df["retrieved_contexts"] = [[] for _ in range(len(df))]

    # Ensure the remaining text fields are plain strings (missing cells -> "")
    for col in ["user_input", "reference"]:
        df[col] = [as_text(value) for value in df[col]]

    # -----------------------------
    # 4. RETURN DATASET
    # -----------------------------
    return Dataset.from_pandas(
        df[["user_input", "response", "reference", "retrieved_contexts"]]
    )


In [None]:
# Relies on the load_ragas_dataset redefined in the cell above (single
# `<model_tag>_answer` column); `use_context` is left at that definition's default.
ds = load_ragas_dataset(
    file_path="ragas_evaluation_dataset_ft_rag_4.xlsx",
    model_tag="gwen_ft_rag_4"
)
# Print the first record as a quick sanity check of the reshaped columns.
print(ds[0])

{'user_input': 'A 53-year-old woman presented with fever, cough, and malaise after returning from a visit to Lahore. On examination, her temperature was 38°C and she had a rash on her upper chest. A chest X-ray showed patchy basal consolidation and a full blood count revealed a relative lymphocytosis. Malaria films were negative. Blood cultures were drawn and later grew gram-negative bacilli.', 'response': 'Predicted disease: Bartonellosis\nReasoning: The clinical images and symptoms, such as the presence of a neck lesion, fever, jaundice, and altered mental status, are consistent with Bartonellosis.', 'reference': "The diagnosis is enteric fever caused by Salmonella typhi. This is supported by the patient's presentation with a febrile illness, rash, and relative lymphocytosis after returning from an endemic area (Lahore). The diagnosis was confirmed by the isolation of Salmonella typhi from blood cultures.", 'retrieved_contexts': ['[{"case_id": "21---A-35-Year-Old-American-Man-With-Fa

In [None]:
# Same run settings as the other evaluations: sequential, retried, retry-logged.
run_config = RunConfig(
    timeout=300,        # per sample
    max_retries=5,
    max_workers=1,
    log_tenacity=True
)

# Three metrics over the fine-tuned + RAG dataset `ds`; this overwrites `result`
# from earlier evaluate() cells.
result = evaluate(
    dataset=ds,
    metrics=[
        answer_correctness,
        faithfulness,
        context_precision
    ],
    llm=judge_llm,
    run_config=run_config,
)

Evaluating:   0%|          | 0/99 [00:00<?, ?it/s]

In [None]:
# Keep this run's per-sample scores under their own name and on disk so they
# survive `result` being overwritten.
df_scores_ft_rag = result.to_pandas()
df_scores_ft_rag.to_csv("gwen_ft_rag_4_scores.csv", index=False)

print("✅ Saved: gwen_ft_rag_4_scores.csv")
df_scores_ft_rag.head()

✅ Saved: gwen_ft_rag_4_scores.csv


Unnamed: 0,user_input,retrieved_contexts,response,reference,answer_correctness,faithfulness,context_precision
0,"A 53-year-old woman presented with fever, coug...","[[{""case_id"": ""21---A-35-Year-Old-American-Man...",Predicted disease: Bartonellosis\nReasoning: T...,The diagnosis is enteric fever caused by Salmo...,0.211857,0.142857,0.0
1,A 26-year-old woman presents one week after re...,"[[{""case_id"": ""94---A-20-Year-Old-Woman-from-t...",Human monkeypox,"The diagnosis is malaria, caused by a Plasmodi...",0.197292,0.6,0.0
2,A 72-year-old man who lives in the south of Sp...,"[[{""case_id"": ""46---A-45-Year-Old-Man-from-Sri...",Predicted disease: Bartonellosis\nReasoning: T...,"The diagnosis is visceral leishmaniasis, likel...",0.378366,0.0,0.0
3,A 24-year-old man presents after a 3-month bac...,"[[{""case_id"": ""49---A-33-Year-Old-Male-Travell...",Giardiasis,"The final diagnosis is giardiasis, caused by t...",0.822742,1.0,1.0
4,A 60-year-old man from Ohio with a history of ...,"[[{""case_id"": ""46---A-45-Year-Old-Man-from-Sri...",The most likely diagnosis is Paragonimiasis. T...,"The diagnosis is histoplasmosis, caused by the...",0.198239,0.0,0.0


In [None]:
# Collect one summary row per metric for the gwen_ft_rag_4 run.
summaries = []

for metric in ["answer_correctness", "faithfulness", "context_precision"]:
    summaries.append(
        summarize_metric(
            scores=df_scores_ft_rag[metric],
            model_name="gwen_ft_rag_4",
            metric_name=metric
        )
    )

# Stack the single-row frames into one table; this overwrites any earlier `summary_df`.
summary_df = pd.concat(summaries, ignore_index=True)
summary_df

Unnamed: 0,model_name,metric,num_samples,mean,std,ci_95_lower,ci_95_upper
0,gwen_ft_rag_4,answer_correctness,33,0.349892,0.190624,0.288622,0.418912
1,gwen_ft_rag_4,faithfulness,33,0.211917,0.280434,0.12751,0.313737
2,gwen_ft_rag_4,context_precision,33,0.242424,0.435194,0.121212,0.393939


In [None]:
# Persist the gwen_ft_rag_4 summary table built in the previous cell.
summary_df.to_csv("gwen_ft_rag_4_summary.csv", index=False)