In [1]:
!pip install bert_score

Collecting bert_score
  Downloading bert_score-0.3.13-py3-none-any.whl.metadata (15 kB)
Collecting nvidia-cuda-nvrtc-cu12==12.4.127 (from torch>=1.0.0->bert_score)
  Downloading nvidia_cuda_nvrtc_cu12-12.4.127-py3-none-manylinux2014_x86_64.whl.metadata (1.5 kB)
Collecting nvidia-cuda-runtime-cu12==12.4.127 (from torch>=1.0.0->bert_score)
  Downloading nvidia_cuda_runtime_cu12-12.4.127-py3-none-manylinux2014_x86_64.whl.metadata (1.5 kB)
Collecting nvidia-cuda-cupti-cu12==12.4.127 (from torch>=1.0.0->bert_score)
  Downloading nvidia_cuda_cupti_cu12-12.4.127-py3-none-manylinux2014_x86_64.whl.metadata (1.6 kB)
Collecting nvidia-cudnn-cu12==9.1.0.70 (from torch>=1.0.0->bert_score)
  Downloading nvidia_cudnn_cu12-9.1.0.70-py3-none-manylinux2014_x86_64.whl.metadata (1.6 kB)
Collecting nvidia-cublas-cu12==12.4.5.8 (from torch>=1.0.0->bert_score)
  Downloading nvidia_cublas_cu12-12.4.5.8-py3-none-manylinux2014_x86_64.whl.metadata (1.5 kB)
Collecting nvidia-cufft-cu12==11.2.1.3 (from torch>=1.0.

In [6]:
import pandas as pd
from bert_score import score as bert_score_calc
import torch

# --- Configuration ---
concatenated_csv_path = "/content/agentic_rag_vqa_results.csv"

PREDICTED_ANSWER_COL = 'Predicted Answer'
TRUE_ANSWER_COL = 'True Answer'
GENERATED_RATIONALE_COL = 'Generated Rationale'
TRUE_SOLUTION_COL = 'True Solution'
# --- End Configuration ---

def calculate_metrics_for_subset(results_list, subset_name=""):
    """
    Calculates and prints accuracy and BERTScore for a given subset of results.
    """
    print(f"\n" + "="*20 + f" Performance Metrics ({subset_name}) " + "="*20)

    if not results_list:
        print(f"No valid results found for {subset_name}.")
        return float('nan'), float('nan'), float('nan'), float('nan'), float('nan'), float('nan'), float('nan')

    valid_for_answer_metrics = [
        r for r in results_list
        if (r and isinstance(r.get(PREDICTED_ANSWER_COL), str) and
            isinstance(r.get(TRUE_ANSWER_COL), str) and
            not r[PREDICTED_ANSWER_COL].startswith("error:"))
    ]

    valid_for_rationale_metrics = [
        r for r in results_list
        if (r and isinstance(r.get(GENERATED_RATIONALE_COL), str) and
            isinstance(r.get(TRUE_SOLUTION_COL), str) and
            not r.get(GENERATED_RATIONALE_COL, "").startswith("error:"))
    ]

    num_total_subset = len(results_list)
    num_valid_for_answer = len(valid_for_answer_metrics)
    num_valid_for_rationale = len(valid_for_rationale_metrics)

    print(f"Total Cases in Subset: {num_total_subset}")
    print(f"Valid for Answer Metrics: {num_valid_for_answer}")
    print(f"Valid for Rationale Metrics: {num_valid_for_rationale}")

    accuracy_pa_ta = float('nan')
    bert_f1_pa_ta, bert_p_pa_ta, bert_r_pa_ta = float('nan'), float('nan'), float('nan')
    bert_f1_gr_ts, bert_p_gr_ts, bert_r_gr_ts = float('nan'), float('nan'), float('nan')

    if num_valid_for_answer > 0:
        accuracy_pa_ta = sum(
            (r[TRUE_ANSWER_COL] in r[PREDICTED_ANSWER_COL] or r[PREDICTED_ANSWER_COL] in r[TRUE_ANSWER_COL])
            for r in valid_for_answer_metrics
        ) / num_valid_for_answer
        print(f"\nApproximate Accuracy (True Answer vs. Predicted Answer): {accuracy_pa_ta:.2%}")

        predictions_pa = [r[PREDICTED_ANSWER_COL] for r in valid_for_answer_metrics]
        references_ta = [r[TRUE_ANSWER_COL] for r in valid_for_answer_metrics]
        valid_pairs_bs_pa_ta = [
            (p, ref) for p, ref in zip(predictions_pa, references_ta)
            if p and p.strip() and ref and ref.strip() and ref.lower() != 'n/a'
        ]
        if valid_pairs_bs_pa_ta:
            valid_preds_bs = [pair[0] for pair in valid_pairs_bs_pa_ta]
            valid_refs_bs = [[pair[1]] for pair in valid_pairs_bs_pa_ta]
            try:
                device = "cuda" if torch.cuda.is_available() else "cpu"
                P, R, F1 = bert_score_calc(
                    valid_preds_bs, valid_refs_bs, lang="en", model_type='bert-base-uncased', verbose=False, device=device
                )
                bert_f1_pa_ta = F1.mean().item()
                bert_p_pa_ta = P.mean().item()
                bert_r_pa_ta = R.mean().item()
                print(f"\nBERTScore (True Answer vs. Predicted Answer):")
                print(f"  F1: {bert_f1_pa_ta:.4f}, Precision: {bert_p_pa_ta:.4f}, Recall: {bert_r_pa_ta:.4f}")
            except Exception as e:
                print(f"Error calculating BERTScore for (True Answer vs. Predicted Answer): {e}")
        else:
            print("\nBERTScore (True Answer vs. Predicted Answer): No valid non-empty pairs.")
    else:
        print("\nNo valid data for (True Answer vs. Predicted Answer) Accuracy or BERTScore.")

    if num_valid_for_rationale > 0:
        predictions_gr = [r[GENERATED_RATIONALE_COL] for r in valid_for_rationale_metrics]
        references_ts = [r[TRUE_SOLUTION_COL] for r in valid_for_rationale_metrics]
        valid_pairs_bs_gr_ts = [
            (p, ref) for p, ref in zip(predictions_gr, references_ts)
            if p and p.strip() and ref and ref.strip() and ref.lower() != 'n/a'
        ]
        if valid_pairs_bs_gr_ts:
            valid_preds_bs = [pair[0] for pair in valid_pairs_bs_gr_ts]
            valid_refs_bs = [[pair[1]] for pair in valid_pairs_bs_gr_ts]
            try:
                device = "cuda" if torch.cuda.is_available() else "cpu"
                P, R, F1 = bert_score_calc(
                    valid_preds_bs, valid_refs_bs, lang="en", model_type='bert-base-uncased', verbose=False, device=device
                )
                bert_f1_gr_ts = F1.mean().item()
                bert_p_gr_ts = P.mean().item()
                bert_r_gr_ts = R.mean().item()
                print(f"\nBERTScore (Generated Rationale vs. True Solution):")
                print(f"  F1: {bert_f1_gr_ts:.4f}, Precision: {bert_p_gr_ts:.4f}, Recall: {bert_r_gr_ts:.4f}")
            except Exception as e:
                print(f"Error calculating BERTScore for (Generated Rationale vs. True Solution): {e}")
        else:
            print("\nBERTScore (Generated Rationale vs. True Solution): No valid non-empty pairs.")
    else:
        print("\nNo valid data for (Generated Rationale vs. True Solution) BERTScore.")

    return (accuracy_pa_ta,
            bert_f1_pa_ta, bert_p_pa_ta, bert_r_pa_ta,
            bert_f1_gr_ts, bert_p_gr_ts, bert_r_gr_ts)

# --- Main Script Execution ---
try:
    full_results_df = pd.read_csv(concatenated_csv_path)
    print(f"Successfully loaded {len(full_results_df)} records from the concatenated CSV file.")
except Exception as e:
    print(f"Error loading concatenated CSV file: {e}")
    raise SystemExit("Exiting due to error loading concatenated results.")

# Check for necessary columns
required_cols = [PREDICTED_ANSWER_COL, TRUE_ANSWER_COL, GENERATED_RATIONALE_COL, TRUE_SOLUTION_COL]
missing_cols = [col for col in required_cols if col not in full_results_df.columns]
if missing_cols:
    print(f"Error: The CSV file is missing the following required columns: {', '.join(missing_cols)}")
    print("Please ensure these columns exist or update the 'Configuration' section in the script with the correct names from your CSV.")
    raise SystemExit("Exiting due to missing columns.")

full_results_list = full_results_df.to_dict(orient='records')

# --- Infer Question Type and Perform Initial Validation ---
processed_results_with_type = []
skipped_count = 0
for r_orig in full_results_list:
    r = r_orig.copy()

    # Ensure key fields exist and convert to string, handling None
    for col_key in [PREDICTED_ANSWER_COL, TRUE_ANSWER_COL, GENERATED_RATIONALE_COL, TRUE_SOLUTION_COL]:
        if r.get(col_key) is None:
            r[col_key] = "" # Convert None to empty string
        else:
            r[col_key] = str(r.get(col_key, "")) # Convert to string, default to empty if key somehow missing after check

    # Validate Predicted Answer
    if r[PREDICTED_ANSWER_COL].startswith("error:"):
        skipped_count += 1
        continue

    # Validate True Answer for type inference
    true_answer_for_inference = r[TRUE_ANSWER_COL].strip().lower()
    if not true_answer_for_inference: # Skip if True Answer is empty after stripping
        # print(f"Warning: Record skipped. Empty '{TRUE_ANSWER_COL}' for type inference: {r_orig}")
        skipped_count += 1
        continue

    # Infer question type
    if true_answer_for_inference in ['yes', 'no']:
        r['_inferred_question_type'] = 'closed-ended'
    else:
        r['_inferred_question_type'] = 'open-ended'

    processed_results_with_type.append(r)

num_total_records = len(full_results_list)
num_processed = len(processed_results_with_type)
num_skipped = num_total_records - num_processed

print("\n" + "="*30 + " Overall Data Overview " + "="*30)
print(f"Total Records Loaded: {num_total_records}")
print(f"Records Processed (after initial validation and type inference): {num_processed}")
print(f"Records Skipped (e.g., error in Predicted Answer or empty True Answer): {num_skipped}")

# --- Separate by Inferred Question Type ---
closed_ended_results = [
    r for r in processed_results_with_type if r.get('_inferred_question_type') == 'closed-ended'
]
open_ended_results = [
    r for r in processed_results_with_type if r.get('_inferred_question_type') == 'open-ended'
]

print(f"\nNumber of Closed-Ended Questions (Inferred): {len(closed_ended_results)}")
print(f"Number of Open-Ended Questions (Inferred): {len(open_ended_results)}")

# --- Calculate metrics for Closed-Ended Questions ---
(acc_pa_ta_closed,
 bs_f1_pa_ta_closed, bs_p_pa_ta_closed, bs_r_pa_ta_closed,
 bs_f1_gr_ts_closed, bs_p_gr_ts_closed, bs_r_gr_ts_closed) = calculate_metrics_for_subset(
    closed_ended_results, "MODIFIED Agentic RAG - Closed-Ended (Inferred)"
)

# --- Calculate metrics for Open-Ended Questions ---
(acc_pa_ta_open,
 bs_f1_pa_ta_open, bs_p_pa_ta_open, bs_r_pa_ta_open,
 bs_f1_gr_ts_open, bs_p_gr_ts_open, bs_r_gr_ts_open) = calculate_metrics_for_subset(
    open_ended_results, "MODIFIED Agentic RAG - Open-Ended (Inferred)"
)

# --- Final Summary ---
print("\n\n" + "="*30 + " FINAL PERFORMANCE SUMMARY (MODIFIED Agentic RAG) " + "="*30)

print("\n--- Closed-Ended Questions (Inferred) ---")
if closed_ended_results:
    print(f"  Accuracy (True Answer vs. Pred. Answer): {acc_pa_ta_closed:.2%}" if not pd.isna(acc_pa_ta_closed) else "N/A")
    print(f"  BERTScore F1 (True Answer vs. Pred. Answer): {bs_f1_pa_ta_closed:.4f}" if not pd.isna(bs_f1_pa_ta_closed) else "N/A")
    print(f"  BERTScore F1 (Gen. Rationale vs. True Solution): {bs_f1_gr_ts_closed:.4f}" if not pd.isna(bs_f1_gr_ts_closed) else "N/A")
else:
    print("  No valid closed-ended questions processed or found.")

print("\n--- Open-Ended Questions (Inferred) ---")
if open_ended_results:
    print(f"  Accuracy (True Answer vs. Pred. Answer): {acc_pa_ta_open:.2%}" if not pd.isna(acc_pa_ta_open) else "N/A")
    print(f"  BERTScore F1 (True Answer vs. Pred. Answer): {bs_f1_pa_ta_open:.4f}" if not pd.isna(bs_f1_pa_ta_open) else "N/A")
    print(f"  BERTScore F1 (Gen. Rationale vs. True Solution): {bs_f1_gr_ts_open:.4f}" if not pd.isna(bs_f1_gr_ts_open) else "N/A")
else:
    print("  No valid open-ended questions processed or found.")

print("\n" + "="*70)

Successfully loaded 442 records from the concatenated CSV file.

Total Records Loaded: 442
Records Processed (after initial validation and type inference): 442
Records Skipped (e.g., error in Predicted Answer or empty True Answer): 0

Number of Closed-Ended Questions (Inferred): 270
Number of Open-Ended Questions (Inferred): 172

Total Cases in Subset: 270
Valid for Answer Metrics: 270
Valid for Rationale Metrics: 270

Approximate Accuracy (True Answer vs. Predicted Answer): 62.59%


The secret `HF_TOKEN` does not exist in your Colab secrets.
To authenticate with the Hugging Face Hub, create a token in your settings tab (https://huggingface.co/settings/tokens), set it as secret in your Google Colab and restart your session.
You will be able to reuse this secret in all of your notebooks.
Please note that authentication is recommended but still optional to access public models or datasets.


tokenizer_config.json:   0%|          | 0.00/48.0 [00:00<?, ?B/s]

config.json:   0%|          | 0.00/570 [00:00<?, ?B/s]

vocab.txt:   0%|          | 0.00/232k [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/466k [00:00<?, ?B/s]

Xet Storage is enabled for this repo, but the 'hf_xet' package is not installed. Falling back to regular HTTP download. For better performance, install the package with: `pip install huggingface_hub[hf_xet]` or `pip install hf_xet`


model.safetensors:   0%|          | 0.00/440M [00:00<?, ?B/s]


BERTScore (True Answer vs. Predicted Answer):
  F1: 0.8525, Precision: 0.8488, Recall: 0.8573

BERTScore (Generated Rationale vs. True Solution):
  F1: 0.6143, Precision: 0.6414, Recall: 0.5922

Total Cases in Subset: 172
Valid for Answer Metrics: 172
Valid for Rationale Metrics: 172

Approximate Accuracy (True Answer vs. Predicted Answer): 25.00%

BERTScore (True Answer vs. Predicted Answer):
  F1: 0.5161, Precision: 0.4871, Recall: 0.5648

BERTScore (Generated Rationale vs. True Solution):
  F1: 0.5872, Precision: 0.6181, Recall: 0.5620



--- Closed-Ended Questions (Inferred) ---
  Accuracy (True Answer vs. Pred. Answer): 62.59%
  BERTScore F1 (True Answer vs. Pred. Answer): 0.8525
  BERTScore F1 (Gen. Rationale vs. True Solution): 0.6143

--- Open-Ended Questions (Inferred) ---
  Accuracy (True Answer vs. Pred. Answer): 25.00%
  BERTScore F1 (True Answer vs. Pred. Answer): 0.5161
  BERTScore F1 (Gen. Rationale vs. True Solution): 0.5872

