# In [None]:
import os
import glob
import random
# from openai import AzureOpenAI # Assuming 'client' is already initialized
import time # Import time for delays between API calls
import json # To save comparison results
import math # For entropy and log in PMI
from collections import Counter # For entropy and PMI counts
import sacrebleu # For BLEU
from bert_score import score as bert_score_fn # For BERTScore
import numpy as np # For averaging
# from scipy.stats import rel_entr # For KL Divergence (relative entropy) - Requires SciPy >= 0.15.0
from scipy.stats import entropy # Alternative for KL divergence calculation, available in older SciPy
from sklearn.feature_extraction.text import TfidfVectorizer # For classifier features
from sklearn.linear_model import LogisticRegression # Simple classifier for demonstration
from sklearn.model_selection import train_test_split # For splitting data
from sklearn.metrics import roc_auc_score, average_precision_score # For AUC and AUPRC
from sklearn.pipeline import Pipeline # To chain vectorizer and classifier

# --- Configuration ---
# This cell assumes that AZURE_OPENAI_ENDPOINT, AZURE_OPENAI_API_KEY,
# AZURE_OPENAI_DEPLOYMENT_NAME, API_VERSION and the `client` AzureOpenAI object
# were created by earlier cells; none of them is defined here.

# Directory holding the original pseudonymized Markdown files (pseudo_*.md).
# It may differ from the PDF input directory if the pseudonymized files were
# saved elsewhere.
# NOTE(review): both directory paths below are relative and use Windows-style
# backslashes; on POSIX systems a backslash is not a path separator, so these
# values presumably only resolve as intended on Windows - confirm before
# running elsewhere.
PSEUDO_MD_DIRECTORY_PATH_COMPARE = r".\Lagerugpijn\pseudonymized-epds" # Assuming pseudo_*.md are here

# Directory holding the generated synthetic Markdown files.
SYNTHETIC_MD_DIRECTORY_PATH = r".\Lagerugpijn\synthetic_epds"

# Upper bound on the number of synthetic files sent to GPT-4 for pairwise
# comparison; the main block caps it at the number of synthetic files loaded.
NUM_COMPARISON_PAIRS_TO_EVALUATE = 5 # Example: Compare 5 synthetic files using GPT-4

# JSON file receiving the benchmark metrics and GPT-4 comparison results.
# It is placed in the directory that contains the synthetic files directory.
COMPARISON_RESULTS_FILE = os.path.join(os.path.dirname(SYNTHETIC_MD_DIRECTORY_PATH), "comparison_results.json")

# Minimum number of occurrences a bigram needs to be included in the average PMI.
PMI_MIN_BIGRAM_FREQ = 3

# Parameters for the pseudo-vs-synthetic classifier evaluation.
CLASSIFIER_TEST_SIZE = 0.3 # Fraction (not percentage) of the documents held out for testing
CLASSIFIER_RANDOM_STATE = 42 # Seed for the train/test split and the classifier (reproducibility)
CLASSIFIER_MAX_FEATURES = 1000 # Vocabulary size cap for the TF-IDF vectorizer


# --- Helper Function to Load File Content ---
def load_file_content(filepath):
    """Return the full text of the UTF-8 encoded file at ``filepath``.

    Any error raised while opening or reading the file is reported on stdout
    and signalled to the caller by returning ``None`` instead of raising.
    """
    try:
        with open(filepath, mode='r', encoding='utf-8') as handle:
            text = handle.read()
    except Exception as e:
        print(f"Error reading file {filepath}: {e}")
        return None
    return text

# --- Benchmark Calculation Functions ---

def calculate_entropy(text, unit='char'):
    """Return the Shannon entropy, in bits, of the token distribution of ``text``.

    Args:
        text: Input string. Empty (falsy) input yields 0.0 before ``unit`` is
            even looked at.
        unit: 'char' treats every character as a token; 'word' uses the
            lower-cased, whitespace-separated words.

    Returns:
        The entropy as a float; 0.0 when the text produces no tokens.

    Raises:
        ValueError: If ``text`` is non-empty and ``unit`` is neither 'char'
            nor 'word'.
    """
    if not text:
        return 0.0

    if unit not in ('char', 'word'):
        raise ValueError("Unit must be 'char' or 'word'")
    symbols = list(text) if unit == 'char' else text.lower().split()
    if not symbols:
        return 0.0

    n_symbols = len(symbols)
    bits = 0.0
    # log2 gives the result in bits; terms are accumulated one by one.
    for occurrences in Counter(symbols).values():
        prob = occurrences / n_symbols
        bits -= prob * math.log2(prob)
    return bits

def calculate_avg_bigram_pmi(text, min_freq=3):
    """
    Return the mean Pointwise Mutual Information (PMI, base 2) over all word
    bigrams of ``text`` that occur at least ``min_freq`` times.

    Words are the lower-cased, whitespace-separated tokens of the text. Both
    the unigram and the bigram probabilities are normalised by the total
    number of words. Returns 0.0 when the text is empty, has fewer than two
    words, or has no bigram reaching ``min_freq``.
    """
    if not text:
        return 0.0

    tokens = text.lower().split()
    n_tokens = len(tokens)
    if n_tokens < 2:
        return 0.0

    unigram_freq = Counter(tokens)
    # Pairing the token list with itself shifted by one yields every adjacent pair.
    bigram_freq = Counter(zip(tokens, tokens[1:]))

    scores = []
    for (left, right), pair_freq in bigram_freq.items():
        if pair_freq < min_freq:
            continue
        # All counts here are >= 1 and n_tokens >= 2, so every probability is
        # strictly positive and the logarithm is always defined.
        p_left = unigram_freq[left] / n_tokens
        p_right = unigram_freq[right] / n_tokens
        p_pair = pair_freq / n_tokens
        # PMI = log2( P(w1, w2) / (P(w1) * P(w2)) )
        scores.append(math.log2(p_pair / (p_left * p_right)))

    if not scores:
        return 0.0
    return np.mean(scores)


def calculate_kl_divergence(text1, text2, unit='word'):
    """
    Calculates the Jensen-Shannon Divergence (JSD), in bits, between the
    token distributions (chars or words) of two texts.

    JSD(P, Q) = 0.5 * KL(P || M) + 0.5 * KL(Q || M), where M = 0.5 * (P + Q).
    It is symmetric and bounded: 0.0 for identical distributions and 1.0 for
    distributions with no token in common (because base-2 logs are used).

    Args:
        text1: First text.
        text2: Second text.
        unit: 'char' to compare character distributions, 'word' to compare
            distributions of lower-cased, whitespace-separated words.

    Returns:
        The JSD as a float, or NaN when either text is empty or yields no tokens.

    Raises:
        ValueError: If ``unit`` is neither 'char' nor 'word'.
    """
    if not text1 or not text2:
        return np.nan # Cannot compute divergence with empty text

    if unit == 'char':
        tokens1 = list(text1)
        tokens2 = list(text2)
    elif unit == 'word':
        # Simple word tokenization and lowercase
        tokens1 = text1.lower().split()
        tokens2 = text2.lower().split()
    else:
        raise ValueError("Unit must be 'char' or 'word'")

    if not tokens1 or not tokens2:
        return np.nan

    counts1 = Counter(tokens1)
    counts2 = Counter(tokens2)

    # Sorted union vocabulary: a fixed order keeps the floating-point sums
    # reproducible between runs (set iteration order of strings is not).
    vocab = sorted(set(counts1) | set(counts2))

    # Counter returns 0 for tokens missing from one of the texts.
    p1 = np.array([counts1[token] for token in vocab], dtype=float)
    p2 = np.array([counts2[token] for token in vocab], dtype=float)
    p1 = p1 / p1.sum()
    p2 = p2 / p2.sum()

    # Mixture distribution. It is strictly positive on the whole vocabulary,
    # so both KL terms below are finite and no smoothing is required.
    # (Averaging KL(P || Q) and KL(Q || P) directly is NOT the JSD: it is
    # unbounded and its value is driven by whatever smoothing constant is used.)
    mixture = 0.5 * (p1 + p2)

    # scipy.stats.entropy(pk, qk) computes KL(pk || qk); base=2 gives bits.
    kl_p_m = entropy(p1, qk=mixture, base=2)
    kl_q_m = entropy(p2, qk=mixture, base=2)
    jsd = 0.5 * (kl_p_m + kl_q_m)

    # Guard against a tiny negative value caused by floating-point rounding.
    return float(max(jsd, 0.0))


def calculate_corpus_bleu(synthetic_contents, pseudo_contents_list):
    """Calculates the corpus BLEU score of the synthetic texts, using every
    pseudonymized text as a reference for every synthetic text.

    Args:
        synthetic_contents: List of synthetic documents (the hypotheses).
        pseudo_contents_list: List of pseudonymized documents (the references).

    Returns:
        The BLEU score as a float, or NaN when either list is empty or
        sacrebleu raises.
    """
    if not synthetic_contents or not pseudo_contents_list:
        return np.nan # Use NaN for scores if input is empty

    # sacrebleu takes references as a list of *reference streams*: stream k
    # holds the k-th reference for each hypothesis, so every stream must be
    # exactly as long as the hypothesis list. To make all pseudonymized
    # documents references for each synthetic document, build one stream per
    # pseudonymized document that repeats it once per hypothesis.
    # (The transposed layout only "works" when both lists happen to have the
    # same length, and then compares hypothesis j with reference j alone.)
    n_hypotheses = len(synthetic_contents)
    reference_streams = [[reference] * n_hypotheses for reference in pseudo_contents_list]

    try:
        bleu = sacrebleu.corpus_bleu(synthetic_contents, reference_streams)
        return bleu.score # Return the BLEU score (float)
    except Exception as e:
        print(f"Error calculating BLEU: {e}")
        return np.nan # Return NaN in case of error


def calculate_corpus_bertscore(synthetic_contents, pseudo_contents_list, lang='nl'):
    """Return the corpus-mean BERTScore (precision, recall, F1) of the synthetic texts.

    Every synthetic text is scored with the complete list of pseudonymized
    texts supplied as its references; the per-text scores are then averaged.

    Args:
        synthetic_contents: List of synthetic documents (the candidates).
        pseudo_contents_list: List of pseudonymized documents (the references).
        lang: Language code forwarded to bert_score.

    Returns:
        A (precision, recall, f1) tuple of floats, or (nan, nan, nan) when
        either list is empty or the scoring call raises.
    """
    nan_scores = (np.nan, np.nan, np.nan)
    if not (synthetic_contents and pseudo_contents_list):
        return nan_scores

    # Scoring is computationally heavy, so failures are reported and mapped
    # to NaN scores rather than aborting the evaluation.
    try:
        # One reference list per candidate; each candidate receives the same
        # full set of pseudonymized documents.
        per_candidate_refs = [pseudo_contents_list] * len(synthetic_contents)

        print("  Running BERT Score...")
        # verbose=False keeps the progress output of bert_score quiet.
        precision, recall, f1 = bert_score_fn(
            synthetic_contents,
            per_candidate_refs,
            lang=lang,
            verbose=False,
        )

        # The scores come back as tensors; reduce each to its scalar mean.
        return tuple(scores.mean().item() for scores in (precision, recall, f1))
    except Exception as e:
        print(f"Error calculating BERTScore: {e}")
        return nan_scores

def evaluate_classifier_performance(pseudo_contents, synthetic_contents, test_size=0.3, random_state=42, max_features=1000):
    """
    Trains a TF-IDF + logistic-regression classifier to distinguish between
    pseudonymized and synthetic documents and reports AUC and AUPRC on a
    held-out, stratified test split.

    The harder the two sources are to tell apart, the better the synthetic
    data: an AUC close to 0.5 means the classifier does no better than chance.

    Args:
        pseudo_contents: List of pseudonymized documents (label 0).
        synthetic_contents: List of synthetic documents (label 1).
        test_size: Fraction of all documents held out for testing.
        random_state: Seed for the split and the classifier.
        max_features: Vocabulary size cap for the TF-IDF vectorizer.

    Returns:
        An (auc, auprc) tuple of floats, or (nan, nan) when there is too
        little data or any step of the evaluation raises.
    """
    if not pseudo_contents or not synthetic_contents:
        print("  Skipping Classifier Performance: Insufficient data.")
        return np.nan, np.nan

    # A stratified split needs at least two documents per class (one for
    # training, one for testing); with fewer, train_test_split raises.
    if min(len(pseudo_contents), len(synthetic_contents)) < 2:
        print("  Skipping Classifier Performance: Need at least two samples from each class.")
        return np.nan, np.nan

    # Create labels: 0 for pseudo, 1 for synthetic
    X = list(pseudo_contents) + list(synthetic_contents)
    y = [0] * len(pseudo_contents) + [1] * len(synthetic_contents)

    try:
        # The split is inside the try block as well: it can still raise for
        # unusual test_size values, and that must not abort the whole report.
        X_train, X_test, y_train, y_test = train_test_split(
            X, y, test_size=test_size, random_state=random_state, stratify=y
        )

        # Create a pipeline: TF-IDF Vectorizer + Logistic Regression
        model_pipeline = Pipeline([
            ('tfidf', TfidfVectorizer(max_features=max_features)),
            ('classifier', LogisticRegression(random_state=random_state, solver='liblinear')) # Use liblinear for smaller datasets
        ])

        print("  Training classifier to distinguish pseudo vs synthetic...")
        model_pipeline.fit(X_train, y_train)

        # Probability of belonging to the synthetic class (label 1)
        y_pred_proba = model_pipeline.predict_proba(X_test)[:, 1]

        auc_score = roc_auc_score(y_test, y_pred_proba)
        auprc_score = average_precision_score(y_test, y_pred_proba)

        print(f"  Classifier Performance calculated on {len(X_test)} test samples.")
        return auc_score, auprc_score

    except Exception as e:
        print(f"Error during Classifier Performance evaluation: {e}")
        return np.nan, np.nan


# --- Function to Compare Documents using Azure OpenAI GPT-4 (Reuse) ---
def compare_docs_with_gpt4(client, pseudo_content, synthetic_content, pseudo_filename, synthetic_filename):
    """
    Ask GPT-4 to judge how similar a synthetic dossier is to a pseudonymized
    one in structure, style, clinical patterns and realism.

    Args:
        client: Chat-completions client; ``client.chat.completions.create`` is called.
        pseudo_content: Text of the pseudonymized dossier.
        synthetic_content: Text of the synthetic dossier.
        pseudo_filename: Name of the pseudonymized file (used in prompt and result).
        synthetic_filename: Name of the synthetic file (used in prompt and result).

    Returns:
        A dict with "pseudo_file", "synthetic_file", "status", "description"
        and "rating". "status" is "Skipped" when either content is empty,
        "Failed" when the API call or the response handling raises (both of
        these also carry a "reason"), and "Success" otherwise.
    """
    file_pair = {"pseudo_file": pseudo_filename, "synthetic_file": synthetic_filename}

    if not (pseudo_content and synthetic_content):
        return {**file_pair,
                "status": "Skipped", "reason": "Failed to load file content",
                "description": "N/A", "rating": "N/A"}

    # Instructions for the model; both prompts are in Dutch, like the dossiers.
    system_message = """Je bent een expert in het beoordelen van klinische documentatie en analyseert de gelijkenis in structuur, schrijfstijl en inhoudelijke patronen tussen paren van Nederlandse fysiotherapeutische patiëntdossiers."""

    user_message = f"""Beoordeel de gelijkenis tussen de twee onderstaande fysiotherapeutische patiëntdossiers. Dossier 1 is een gepseudonimiseerd voorbeeld uit de praktijk ('{pseudo_filename}'). Dossier 2 is een synthetisch gegenereerd dossier ('{synthetic_filename}').

Focus je beoordeling op de volgende aspecten:
- **Structuur:** Komen de belangrijke secties overeen (anamnese, ICF-diagnose, doelen, behandelplan, SOEP-notities)? Is de algehele opbouw vergelijkbaar?
- **Schrijfstijl en toon:** Komt het taalgebruik, de formaliteit en de professionele toon overeen met realistische fysiotherapeutische verslaglegging in Nederland? Worden afkortingen (indien aanwezig) realistisch gebruikt en uitgebreid (indien nodig)?
- **Klinische patronen en realisme:** Zijn de beschreven klachten, diagnoses, behandelinterventies en het verloop van de behandeling (in de SOEP-notities) klinisch plausibel en realistisch voor patiënten met lage rugpijn? Is de variatie in de voortgangsnotities (aantal sessies, beschreven progressie, setbacks, aanpassingen) realistisch en gevarieerd, vergelijkbaar met de voorbeelden?
- **Adherentie aan format:** Volgen de voortgangsnotities consistent het SOEP-formaat (Subjectief, Objectief, Evaluatie, Plan)?

**Belangrijk:** Vergelijk **NIET** de specifieke persoonsgegevens of de exacte inhoudelijke details van de klacht, doelen, scores (zoals specifieke NRS-waardes of PSK-scores), behandelingen of data, aangezien Dossier 2 een **nieuw, verzonnen geval** is en geen kopie van Dossier 1 of een ander voorbeeld. Beoordeel de gelijkenis op het niveau van het **sjabloon, de opbouw, de stijl, het detailniveau en de realistische weergave** van een fysiotherapeutisch proces voor lage rugpijn.

Presenteer de twee dossiers:

--- GEPSEUDONIMISEERD VOORBEELD DOSSIER: {pseudo_filename} ---
{pseudo_content.strip()}
--- EINDE GEPSEUDONIMISEERD VOORBEELD ---

--- SYNTHETISCH GEGENEREERD DOSSIER: {synthetic_filename} ---
{synthetic_content.strip()}
--- EINDE SYNTHETISCH GEGENEREERD DOSSIER ---

Geef nu je beoordeling. Begin met een beschrijving van de belangrijkste overeenkomsten en verschillen gebaseerd op de bovengenoemde criteria. Eindig op a new line with a summarizing judgment on the overall degree of similarity on a scale of 'Laag', 'Matig', or 'Hoog', followed by a brief explanation of why you reached this judgment.

Formaat van de output:
[Beschrijving van overeenkomsten en verschillen]
Oordeel: [Laag/Matig/Hoog] - [Korte toelichting]
"""

    print(f"  Comparing '{synthetic_filename}' with '{pseudo_filename}' using GPT-4...")

    try:
        conversation = [
            {"role": "system", "content": system_message},
            {"role": "user", "content": user_message},
        ]
        response = client.chat.completions.create(
            model=AZURE_OPENAI_DEPLOYMENT_NAME, # GPT-4 deployment name, defined in an earlier cell
            messages=conversation,
            temperature=0.1, # Low temperature for consistent analysis
            max_tokens=2000 # Room for the written analysis
        )
        comparison_text = response.choices[0].message.content.strip()

        # Everything before the first "Oordeel: " line is the description,
        # everything after it is the rating (including its explanation).
        before, marker, after = comparison_text.partition("\nOordeel: ")
        if marker:
            description, rating = before.strip(), after.strip()
        else:
            description, rating = comparison_text, "N/A"

        print(f"  Comparison successful. Rating: {rating}")

        return {**file_pair,
                "status": "Success", "description": description, "rating": rating}

    except Exception as e:
        print(f"Error calling Azure OpenAI API for comparison: {e}")
        return {**file_pair,
                "status": "Failed", "reason": str(e),
                "description": "N/A", "rating": "N/A"}


# --- Main Execution Logic for Evaluation ---
if __name__ == "__main__":
    # This cell expects two names created by earlier notebook cells: the
    # AzureOpenAI `client` object and AZURE_OPENAI_DEPLOYMENT_NAME.

    print("\n--- Starting Synthetic Data Evaluation ---")

    # Get lists of available files. They are sorted because glob returns
    # files in arbitrary, OS-dependent order; sorting makes "the first N
    # synthetic files" (used for the GPT-4 comparisons) reproducible.
    pseudo_files = sorted(glob.glob(os.path.join(PSEUDO_MD_DIRECTORY_PATH_COMPARE, "pseudo_*.md")))
    synthetic_files = sorted(glob.glob(os.path.join(SYNTHETIC_MD_DIRECTORY_PATH, "synthetic_patient_*.md")))

    # Missing files are reported but do not stop the run: the report and the
    # JSON output are still produced with whatever could be evaluated.
    if not pseudo_files:
        print(f"Error: No pseudonymized markdown files found in '{PSEUDO_MD_DIRECTORY_PATH_COMPARE}'. Cannot perform evaluation.")

    if not synthetic_files:
        print(f"Error: No synthetic markdown files found in '{SYNTHETIC_MD_DIRECTORY_PATH}'. Cannot perform evaluation.")

    print(f"Found {len(pseudo_files)} pseudonymized files.")
    print(f"Found {len(synthetic_files)} synthetic files.")

    # Load every file exactly once and remember which path each text came
    # from, so the pairwise comparison below can reuse the loaded text.
    print("\nLoading all file contents for benchmark calculations...")
    pseudo_content_by_path = {}
    for f in pseudo_files:
        content = load_file_content(f)
        if content is not None:
            pseudo_content_by_path[f] = content
        else:
            print(f"Warning: Could not load content for file {f}")

    synthetic_content_by_path = {}
    for f in synthetic_files:
        content = load_file_content(f)
        if content is not None:
            synthetic_content_by_path[f] = content
        else:
            print(f"Warning: Could not load content for file {f}")

    all_pseudo_contents = list(pseudo_content_by_path.values())
    all_synthetic_contents = list(synthetic_content_by_path.values())

    if not all_pseudo_contents or not all_synthetic_contents:
        print("\nInsufficient data loaded for comprehensive benchmarks. Skipping some calculations.")
        skip_benchmarks = True
    else:
        skip_benchmarks = False
        print(f"Loaded content for {len(all_pseudo_contents)} pseudonymized files.")
        print(f"Loaded content for {len(all_synthetic_contents)} synthetic files.")


    # --- Calculate Benchmarks ---
    benchmark_results = {}

    if not skip_benchmarks:
        # From here on both content lists are guaranteed to be non-empty.
        print("\nCalculating quantitative benchmark metrics...")

        # Concatenate all content for corpus-level metrics
        corpus_pseudo_text = "\n".join(all_pseudo_contents)
        corpus_synthetic_text = "\n".join(all_synthetic_contents)

        # Shannon's Entropy (character level)
        entropy_pseudo = calculate_entropy(corpus_pseudo_text, unit='char')
        entropy_synthetic = calculate_entropy(corpus_synthetic_text, unit='char')
        benchmark_results['shannon_entropy_char'] = {'pseudonymized': entropy_pseudo, 'synthetic': entropy_synthetic}

        # Shannon's Entropy (word level)
        entropy_pseudo_word = calculate_entropy(corpus_pseudo_text, unit='word')
        entropy_synthetic_word = calculate_entropy(corpus_synthetic_text, unit='word')
        benchmark_results['shannon_entropy_word'] = {'pseudonymized': entropy_pseudo_word, 'synthetic': entropy_synthetic_word}

        # Average Document Length (Character Count) as a simple proxy
        avg_len_pseudo = np.mean([len(c) for c in all_pseudo_contents])
        avg_len_synthetic = np.mean([len(c) for c in all_synthetic_contents])
        benchmark_results['avg_doc_length_chars'] = {'pseudonymized': avg_len_pseudo, 'synthetic': avg_len_synthetic}

        # Average Bigram Pointwise Mutual Information (PMI)
        avg_pmi_pseudo = calculate_avg_bigram_pmi(corpus_pseudo_text, min_freq=PMI_MIN_BIGRAM_FREQ)
        avg_pmi_synthetic = calculate_avg_bigram_pmi(corpus_synthetic_text, min_freq=PMI_MIN_BIGRAM_FREQ)
        benchmark_results['avg_bigram_pmi'] = {'pseudonymized': avg_pmi_pseudo, 'synthetic': avg_pmi_synthetic}

        # Divergence between the word distributions of the two corpora
        jsd_word = calculate_kl_divergence(corpus_pseudo_text, corpus_synthetic_text, unit='word')
        benchmark_results['jsd_word'] = jsd_word

        # BLEU Score (Synthetic vs Pseudonymized References)
        bleu_score = calculate_corpus_bleu(all_synthetic_contents, all_pseudo_contents)
        benchmark_results['bleu_score'] = bleu_score

        # BERT Score (Synthetic vs Pseudonymized References)
        print("Calculating BERT Score (this may take some time)...")
        bert_p, bert_r, bert_f1 = calculate_corpus_bertscore(all_synthetic_contents, all_pseudo_contents, lang='nl')
        benchmark_results['bert_score'] = {'precision': bert_p, 'recall': bert_r, 'f1': bert_f1}

        # Classifier Performance (AUC/AUPRC)
        # The stratified train/test split needs at least two documents of
        # each kind; with fewer it raises, so skip the metric instead.
        if len(all_pseudo_contents) >= 2 and len(all_synthetic_contents) >= 2:
            print("\nCalculating Classifier Performance (AUC/AUPRC)...")
            auc_score, auprc_score = evaluate_classifier_performance(
                all_pseudo_contents,
                all_synthetic_contents,
                test_size=CLASSIFIER_TEST_SIZE,
                random_state=CLASSIFIER_RANDOM_STATE,
                max_features=CLASSIFIER_MAX_FEATURES
            )
            benchmark_results['classifier_performance'] = {'auc': auc_score, 'auprc': auprc_score}
        else:
            benchmark_results['classifier_performance'] = {'auc': np.nan, 'auprc': np.nan}
            print("Skipping Classifier Performance calculation due to insufficient data.")


        # --- Add placeholder/note for Informational Accuracy ---
        benchmark_results['informational_accuracy_note'] = "A standard, generalizable metric for 'Informational Accuracy' between synthetic and real clinical text is not straightforward without a specific definition or clinical ontology. Aspects of information capture and clinical plausibility are covered qualitatively by the GPT-4 comparison and partially by BERTScore (semantic similarity) and length comparison."


    # --- Perform Pairwise GPT-4 Comparison ---
    # Only files whose content was loaded successfully take part.
    loaded_pseudo_filepaths = list(pseudo_content_by_path)
    loaded_synthetic_filepaths = list(synthetic_content_by_path)

    actual_num_comparisons = min(NUM_COMPARISON_PAIRS_TO_EVALUATE, len(loaded_synthetic_filepaths))

    comparison_results_gpt4 = [] # Initialize the list even if no comparisons run

    # Looked up through globals() so that a session without a client skips
    # this section instead of raising NameError and losing the benchmark
    # results computed above before they are reported and saved.
    gpt4_client = globals().get("client")

    if actual_num_comparisons > 0 and loaded_pseudo_filepaths and gpt4_client is not None:
        print(f"\nInitiating {actual_num_comparisons} pairwise GPT-4 comparisons.")

        # Select synthetic files to compare (the first N in sorted order).
        # For random sampling instead:
        #   synthetic_files_for_gpt4 = random.sample(loaded_synthetic_filepaths, actual_num_comparisons)
        synthetic_files_for_gpt4 = loaded_synthetic_filepaths[:actual_num_comparisons]

        for comparison_index, synthetic_filepath in enumerate(synthetic_files_for_gpt4, start=1):
            synthetic_filename = os.path.basename(synthetic_filepath)

            # Randomly select one pseudonymized file for this comparison
            pseudo_filepath = random.choice(loaded_pseudo_filepaths)
            pseudo_filename = os.path.basename(pseudo_filepath)

            print(f"\n--- GPT-4 Comparison {comparison_index} of {actual_num_comparisons} ---")
            print(f"  Synthetic: {synthetic_filename}")
            print(f"  Pseudonymized (randomly selected): {pseudo_filename}")

            # Reuse the text loaded earlier instead of reading the files again
            synthetic_content = synthetic_content_by_path[synthetic_filepath]
            pseudo_content = pseudo_content_by_path[pseudo_filepath]

            # Perform the comparison using GPT-4
            result = compare_docs_with_gpt4(gpt4_client, pseudo_content, synthetic_content, pseudo_filename, synthetic_filename)
            comparison_results_gpt4.append(result)

            # Pause between API calls to manage rate limits (not needed after the last one)
            if comparison_index < actual_num_comparisons:
                time.sleep(2) # Adjust delay as needed


        print("\n--- Pairwise GPT-4 Comparison Complete ---")
        print("\nSummary of Pairwise GPT-4 Results:")

        for i, result in enumerate(comparison_results_gpt4):
            print(f"\nComparison Pair {i+1}:")
            print(f"  Synthetic: {result['synthetic_file']}")
            print(f"  Pseudonymized: {result['pseudo_file']}")
            if result['status'] == 'Success':
                print(f"  Status: Success")
                print(f"  Rating: {result['rating']}")
                # print(f"  Description:\n{result['description']}") # Uncomment to print full description
            else:
                print(f"  Status: {result['status']}")
                print(f"  Reason: {result['reason']}")
    elif not loaded_pseudo_filepaths:
        print("\nSkipping pairwise GPT-4 comparisons: No pseudonymized files loaded successfully.")
    elif actual_num_comparisons <= 0:
        print("\nSkipping pairwise GPT-4 comparisons as no synthetic files were loaded successfully or requested.")
    else:
        print("\nSkipping pairwise GPT-4 comparisons: no 'client' object is defined in this session.")


    # --- Final Report ---
    print("\n--- Comprehensive Evaluation Report ---")

    # Report Benchmarks
    print("\nQuantitative Benchmark Metrics:")
    if 'shannon_entropy_char' in benchmark_results:
        print(f"  Shannon Entropy (Character):")
        print(f"    Pseudonymized Corpus: {benchmark_results['shannon_entropy_char']['pseudonymized']:.4f}")
        print(f"    Synthetic Corpus:     {benchmark_results['shannon_entropy_char']['synthetic']:.4f}")
    if 'shannon_entropy_word' in benchmark_results:
        print(f"  Shannon Entropy (Word):")
        print(f"    Pseudonymized Corpus: {benchmark_results['shannon_entropy_word']['pseudonymized']:.4f}")
        print(f"    Synthetic Corpus:     {benchmark_results['shannon_entropy_word']['synthetic']:.4f}")
    if 'avg_doc_length_chars' in benchmark_results:
        print(f"  Average Document Length (Characters):")
        print(f"    Pseudonymized Files: {benchmark_results['avg_doc_length_chars']['pseudonymized']:.2f}")
        print(f"    Synthetic Files:     {benchmark_results['avg_doc_length_chars']['synthetic']:.2f}")
    if 'avg_bigram_pmi' in benchmark_results:
        print(f"  Average Bigram Pointwise Mutual Information (PMI, min_freq={PMI_MIN_BIGRAM_FREQ}):")
        print(f"    Pseudonymized Corpus: {benchmark_results['avg_bigram_pmi']['pseudonymized']:.4f}")
        print(f"    Synthetic Corpus:     {benchmark_results['avg_bigram_pmi']['synthetic']:.4f}")
    if 'jsd_word' in benchmark_results and not np.isnan(benchmark_results['jsd_word']):
        print(f"  Jensen-Shannon Divergence (Word Distribution): {benchmark_results['jsd_word']:.4f}")
    else:
        print(f"  Jensen-Shannon Divergence (Word Distribution): N/A (Insufficient data)")
    if 'bleu_score' in benchmark_results and not np.isnan(benchmark_results['bleu_score']):
        print(f"  BLEU Score (Synthetic vs All Pseudonymized): {benchmark_results['bleu_score']:.4f}")
    else:
        print(f"  BLEU Score (Synthetic vs All Pseudonymized): N/A (Insufficient data)")
    if 'bert_score' in benchmark_results and not np.isnan(benchmark_results['bert_score']['f1']):
        print(f"  BERT Score (Synthetic vs All Pseudonymized):")
        print(f"    Precision: {benchmark_results['bert_score']['precision']:.4f}")
        print(f"    Recall:    {benchmark_results['bert_score']['recall']:.4f}")
        print(f"    F1:        {benchmark_results['bert_score']['f1']:.4f}")
    else:
        print(f"  BERT Score (Synthetic vs All Pseudonymized): N/A (Insufficient data)")

    if 'classifier_performance' in benchmark_results and not np.isnan(benchmark_results['classifier_performance']['auc']):
        print(f"\n  Classifier Performance (Distinguishing Pseudo vs Synthetic):")
        print(f"    AUC:   {benchmark_results['classifier_performance']['auc']:.4f}")
        print(f"    AUPRC: {benchmark_results['classifier_performance']['auprc']:.4f}")
        print(f"    (Lower scores indicate better synthetic data - harder to distinguish)")
    else:
        print(f"\n  Classifier Performance (Distinguishing Pseudo vs Synthetic): N/A (Insufficient data)")


    if 'informational_accuracy_note' in benchmark_results:
        print(f"\n  Note on Informational Accuracy:")
        print(f"    {benchmark_results['informational_accuracy_note']}")


    # Report GPT-4 Summary
    print("\nQualitative Pairwise GPT-4 Comparison Summary:")
    if comparison_results_gpt4:
        # The stored rating is the text after "Oordeel: ", i.e. the label
        # followed by its explanation ("Hoog - ..."), possibly wrapped in
        # markdown emphasis or brackets. Reduce it to the bare label before
        # counting; an exact match against the label would never succeed.
        valid_labels = ('Laag', 'Matig', 'Hoog')
        ratings = []
        for r in comparison_results_gpt4:
            if r['status'] != 'Success':
                continue
            cleaned = str(r['rating']).lstrip(" *_[(\"'").lower()
            for label in valid_labels:
                remainder = cleaned[len(label):]
                # The label must be a whole word, not the prefix of a longer one.
                if cleaned.startswith(label.lower()) and not remainder[:1].isalpha():
                    ratings.append(label)
                    break

        if ratings:
            rating_counts = Counter(ratings)
            print(f"  Overall GPT-4 Ratings Distribution: {rating_counts}")
        else:
            print("  No successful GPT-4 comparisons yielded a Laag/Matig/Hoog rating.")

        # You could add code here to summarize the descriptions if needed

    else:
        print("  No pairwise GPT-4 comparisons were performed or successful.")


    # Save all results (benchmarks + GPT-4 summaries)
    full_results = {
        "benchmarks": benchmark_results,
        "pairwise_gpt4_comparisons": comparison_results_gpt4
    }
    try:
        # Create the output directory if it doesn't exist. The directory part
        # is empty when the results file lives in the current directory, and
        # os.makedirs('') would raise, so only create it when there is one.
        results_dir = os.path.dirname(COMPARISON_RESULTS_FILE)
        if results_dir:
            os.makedirs(results_dir, exist_ok=True)
        # NOTE: missing metrics are stored as NaN, which json.dump writes as
        # the non-standard token NaN (readable by Python's json module).
        with open(COMPARISON_RESULTS_FILE, 'w', encoding='utf-8') as f:
            json.dump(full_results, f, indent=4, ensure_ascii=False)
        print(f"\nFull evaluation report saved to: {COMPARISON_RESULTS_FILE}")
    except Exception as e:
        print(f"Error saving full results to JSON: {e}")


print("\nEvaluation process complete.")
