In [None]:
!pip install google-generativeai pandas tqdm




In [None]:
import pandas as pd
import re
from tqdm.auto import tqdm
import google.generativeai as genai
from google.colab import files

import os

# Gemini credentials: read the key from the environment (GEMINI_API_KEY, then
# GOOGLE_API_KEY) so the secret is never pasted into, or saved with, the notebook.
# Falls back to an empty string, which is what the placeholder used to pass.
genai.configure(
    api_key=os.environ.get("GEMINI_API_KEY") or os.environ.get("GOOGLE_API_KEY", "")
)

MODEL_NAME = "gemini-2.5-flash"   # model used for every noise-injection request
BATCH_SIZE = 20                   # sentences sent per request (one API call per 20 rows)

# Negation tokens whose per-sentence counts must be unchanged after noise injection.
NEGATION_WORDS = ["no","not","never","without","doesn't","don't","cant","can't"]

In [None]:
def meaning_safe(original: str, noisy: str) -> bool:
    """Return True when ``noisy`` keeps the same negation words as ``original``.

    Both texts are lower-cased, then every entry of ``NEGATION_WORDS`` is counted
    as a whole word (regex word boundaries) in each text. The pair is considered
    safe only if all of those counts are equal.
    """
    reference, candidate = original.lower(), noisy.lower()

    def occurrences(word: str, text: str) -> int:
        # Whole-word matches only, so "no" does not match inside "now" or "not".
        return len(re.findall(rf"\b{word}\b", text))

    return all(
        occurrences(word, reference) == occurrences(word, candidate)
        for word in NEGATION_WORDS
    )


In [None]:
def build_batch_prompt(batch):
    """Build the noise-injection prompt for one batch of sentences.

    Every item of ``batch`` is placed on its own line, prefixed with its 1-based
    position ("1. ...", "2. ..."), underneath a fixed instruction header that
    lists the allowed and forbidden kinds of noise.
    """
    entries = []
    for position, sentence in enumerate(batch, start=1):
        entries.append(f"{position}. {sentence}")
    numbered = "\n".join(entries)

    return f"""
You are a noise generator. Add ONLY surface-level noise to each sentence below.

Allowed noise:
- typos
- repeated letters
- character swaps
- spacing/punctuation errors
- mild filler words ("uh", "erm")

Forbidden:
- paraphrasing
- synonyms
- rewording
- adding or removing symptoms/behaviours
- flipping meaning or negation ("now" vs "not")
- adding details
- summarising

Return output as a numbered list (1., 2., 3., ...).

Input sentences:
{numbered}
"""


In [None]:
def gemini_generate_noisy_batch(batch):
    """Ask Gemini for a noisy version of every sentence in ``batch``.

    Sends one request built by ``build_batch_prompt`` and parses the numbered
    list in the reply.

    Returns:
        A list with exactly ``len(batch)`` strings, aligned with ``batch``.
        Positions for which the reply contains no usable item hold the original
        sentence, so callers never receive a short or shifted list.
    """
    prompt = build_batch_prompt(batch)

    model = genai.GenerativeModel(MODEL_NAME)

    response = model.generate_content(prompt)

    return _align_numbered_reply(response.text, batch)


def _align_numbered_reply(reply_text, batch):
    """Map the "N. text" lines of a model reply onto the positions of ``batch``."""
    aligned = list(batch)  # default for every slot: the untouched original
    filled = set()

    # Blank lines carry no item; dropping them up front keeps them from
    # shifting later items into the wrong slot.
    content_lines = [line.strip() for line in reply_text.splitlines() if line.strip()]

    for line in content_lines:
        item = re.match(r"(\d+)\s*[.)]\s*(.*)", line)
        if item is None:
            continue  # preamble / commentary line without a list number
        slot = int(item.group(1)) - 1
        text = item.group(2).strip()
        # The first occurrence of a number wins; out-of-range numbers are ignored.
        if 0 <= slot < len(aligned) and slot not in filled and text:
            aligned[slot] = text
            filled.add(slot)

    # The model ignored the numbering but answered line-for-line: use positions.
    if not filled and len(content_lines) == len(batch):
        return content_lines

    return aligned


In [None]:
def gemini_generate_noisy_safe_batch(batch):
    """Generate noisy sentences for ``batch`` and keep only meaning-safe ones.

    Returns:
        A list with exactly ``len(batch)`` strings. Position ``i`` holds the
        noisy sentence when one was produced, is non-blank and passes
        ``meaning_safe``; otherwise it holds the original sentence.
    """
    output = gemini_generate_noisy_batch(batch)

    final = []
    # Iterate over the batch itself (not zip) so a short model output cannot
    # silently drop rows and break the alignment with the source column.
    for position, original in enumerate(batch):
        noisy = output[position] if position < len(output) else ""
        if noisy and noisy.strip() and meaning_safe(original, noisy):
            final.append(noisy)
        else:
            final.append(original)  # fallback
    return final


In [None]:
print("üìÅ Upload your CSV file")
uploaded = files.upload()

# A cancelled upload dialog yields no entries; fail with a clear message
# instead of an IndexError on the lookup below.
if not uploaded:
    raise ValueError("No file uploaded.")

file_name = list(uploaded.keys())[0]

df = pd.read_csv(file_name)
print("Columns:", df.columns.tolist())

# auto-detect likely text column: the one whose values, rendered as strings,
# have the largest mean length
text_col = max(df.columns, key=lambda c: df[c].astype(str).str.len().mean())
print("Detected free-text column:", text_col)


üìÅ Upload your CSV file


Saving split_part_1.csv to split_part_1 (2).csv
Columns: ['free_text', 'ASD_label']
Detected free-text column: free_text


In [None]:
# Accumulates one output string per row of df, in row order.
noisy_output = []
tqdm.pandas()

print("üöÄ Injecting Gemini-based noise in batches...")

# One Gemini request per slice of BATCH_SIZE consecutive rows.
batch_starts = range(0, len(df), BATCH_SIZE)
for start in tqdm(batch_starts):
    stop = start + BATCH_SIZE
    batch = df[text_col].astype(str).iloc[start:stop].tolist()
    noisy_batch = gemini_generate_noisy_safe_batch(batch)
    noisy_output += noisy_batch

# Store the results next to the source column as "<text_col>_noisy".
df[text_col + "_noisy"] = noisy_output


üöÄ Injecting Gemini-based noise in batches...


  0%|          | 0/38 [00:00<?, ?it/s]

In [None]:
print("Original vs. Noisy Text Examples:")
# Show at most five rows; a frame with fewer rows would otherwise raise IndexError.
for i in range(min(5, len(df))):
    original_text = df[text_col].iloc[i]
    noisy_text = df[text_col + "_noisy"].iloc[i]
    print(f"\n--- Example {i+1} ---")
    print(f"Original: {original_text}")
    print(f"Noisy:    {noisy_text}")


print("\nExplanation:\nThe `meaning_safe` function (defined earlier) is designed to ensure that the noise added doesn't accidentally flip the meaning of the sentence, particularly around negation words (like 'no', 'not', 'never'). If the AI's generated 'noisy' version of a sentence changes the presence or count of these negation words, the system falls back to using the `original` sentence instead of the 'noisy' one. This makes the noise very conservative.")

# TODO: report how often the original sentence was used as a fallback. This
# requires `gemini_generate_noisy_safe_batch` to also return a fallback count.

Original vs. Noisy Text Examples:

--- Example 1 ---
Original: The responses indicate that my daughter is engaging in a number of typical behaviors for her age, although there are some areas of concern. She consistently looks at me when I call her name and often makes eye contact, which suggests she is generally attentive and responsive. She usually points to express her desires and frequently shares her interests by pointing at things she finds intriguing. Play seems to be an important part of her interactions, as she pretends to care for dolls and engages in imaginative play, which is a positive sign of development. Additionally, she usually follows my gaze, indicating that she is aware of and interested in her surroundings. However, there are notable gaps in her emotional responses; she has never shown signs of wanting to comfort someone who is upset, and she has not yet spoken her first words. While she uses simple gestures, like waving goodbye, there is a concerning behavior where

In [None]:
# Write the frame to a CSV in the runtime's working directory, then hand the
# file to the browser for download.
output_file = "noisy_with_gemini.csv"
df.to_csv(path_or_buf=output_file, index=False)
files.download(output_file)
print("‚úÖ Done! Noisy file is ready:", output_file)


<IPython.core.display.Javascript object>

<IPython.core.display.Javascript object>

‚úÖ Done! Noisy file is ready: noisy_with_gemini.csv


In [None]:
# ============================================
# 1. Install & imports
# ============================================
!pip install -q google-generativeai pandas tqdm

import pandas as pd
import google.generativeai as genai
from google.colab import files
from tqdm.auto import tqdm

# ============================================
# 2. Configure Gemini
# ============================================
import os

# Read the key from the environment (GEMINI_API_KEY, then GOOGLE_API_KEY) so the
# secret is never saved inside the notebook. Falls back to the previous
# empty-string placeholder when neither variable is set.
API_KEY = os.environ.get("GEMINI_API_KEY") or os.environ.get("GOOGLE_API_KEY", "")
genai.configure(api_key=API_KEY)

MODEL_NAME = "gemini-2.5-flash"        # you can change to gemini-1.5-pro, etc.

# ============================================
# 3. Define noise-injection function
# ============================================
def inject_noise_with_gemini(text: str) -> str:
    """
    Send one text to Gemini and return its surface-noised version.

    Non-string inputs are converted with ``str()`` first. The prompt allows
    typos, repeated letters, character swaps, spacing/punctuation slips and
    filler tokens, and forbids paraphrasing or any change to negation or
    content. The reply text is returned with surrounding whitespace stripped.
    """
    source_text = text if isinstance(text, str) else str(text)

    prompt = f"""
You are a noise-injection system for caregiver free-text responses.

Add ONLY surface-level noise to the following text, while keeping the original meaning
as close as possible. Do NOT summarize, rewrite, or change the core meaning.

Allowed types of noise:
- typos and minor misspellings
- repeated letters (e.g., "cooontact")
- small character swaps
- missing or extra spaces
- minor punctuation errors
- occasional filler tokens like "uh", "erm", "hmm"

Forbidden:
- changing "no" to "now" or "not" and vice versa
- adding or removing negation (no / not / never / cannot / doesn't / can't)
- changing autism-related behaviors or the presence/absence of symptoms
- paraphrasing or rewording the sentence
- adding new information or removing existing information

Input text:
{source_text}

Return ONLY the noisy version of the text, nothing else.
"""

    # A fresh model handle is created for every call, one request per text.
    reply = genai.GenerativeModel(MODEL_NAME).generate_content(prompt)
    return reply.text.strip()

# ============================================
# 4. Upload CSV
# ============================================
print("üìÅ Please upload your CSV file (must contain a 'free_text' column):")
uploaded = files.upload()

if not uploaded:
    raise ValueError("No file uploaded.")

input_filename = list(uploaded.keys())[0]
print(f"‚úÖ Uploaded: {input_filename}")

df = pd.read_csv(input_filename)
print("Columns found:", list(df.columns))

# Check that free_text column exists
if "free_text" not in df.columns:
    raise ValueError("Column 'free_text' not found. Please rename your text column to 'free_text' or change the code.")

# ============================================
# 5. Apply noise to free_text column
# ============================================
# One Gemini request per row; progress_apply shows a progress bar.
tqdm.pandas(desc="Injecting noise")
df["free_text_noisy"] = df["free_text"].astype(str).progress_apply(inject_noise_with_gemini)

# ============================================
# 6. Save & download noisy CSV
# ============================================
# Strip only a trailing ".csv" (any letter case). A plain str.replace would
# leave names such as "DATA.CSV" unchanged, so the output would overwrite the
# uploaded file, and would also rewrite ".csv" occurring mid-name.
if input_filename.lower().endswith(".csv"):
    output_stem = input_filename[:-len(".csv")]
else:
    output_stem = input_filename
output_filename = f"{output_stem}_gemini_noisy.csv"
df.to_csv(output_filename, index=False)
print(f"üíæ Saved noisy file as: {output_filename}")

files.download(output_filename)


üìÅ Please upload your CSV file (must contain a 'free_text' column):


KeyboardInterrupt: 

In [None]:
# ============================================
# 1. Install & imports
# ============================================
!pip install -q google-generativeai pandas tqdm

import pandas as pd
import google.generativeai as genai
from google.colab import files
from tqdm.auto import tqdm

# ============================================
# 2. Configure Gemini
# ============================================
import os

# API_KEY must be defined in this cell: without it the configure call below only
# works when a previously executed cell left the name in the kernel, and raises
# NameError on a fresh kernel. The key is read from the environment
# (GEMINI_API_KEY, then GOOGLE_API_KEY) so it is never saved in the notebook.
API_KEY = os.environ.get("GEMINI_API_KEY") or os.environ.get("GOOGLE_API_KEY", "")
genai.configure(api_key=API_KEY)

MODEL_NAME = "gemini-2.5-flash"        # you can change to gemini-1.5-pro, etc.

# ============================================
# 3. Define noise-injection function
# ============================================
def inject_noise_with_gemini(text: str) -> str:
    """
    Return a Gemini-generated copy of ``text`` carrying surface-level noise only.

    The input is coerced to ``str`` when it is not one already. The request
    lists the permitted noise (typos, repeated letters, swaps, spacing and
    punctuation errors, filler tokens) and the forbidden edits (negation
    changes, paraphrasing, added or removed information). The model's reply is
    returned after stripping leading and trailing whitespace.
    """
    if isinstance(text, str):
        raw_text = text
    else:
        raw_text = str(text)

    prompt = f"""
You are a noise-injection system for caregiver free-text responses.

Add ONLY surface-level noise to the following text, while keeping the original meaning
as close as possible. Do NOT summarize, rewrite, or change the core meaning.

Allowed types of noise:
- typos and minor misspellings
- repeated letters (e.g., "cooontact")
- small character swaps
- missing or extra spaces
- minor punctuation errors
- occasional filler tokens like "uh", "erm", "hmm"

Forbidden:
- changing "no" to "now" or "not" and vice versa
- adding or removing negation (no / not / never / cannot / doesn't / can't)
- changing autism-related behaviors or the presence/absence of symptoms
- paraphrasing or rewording the sentence
- adding new information or removing existing information

Input text:
{raw_text}

Return ONLY the noisy version of the text, nothing else.
"""

    gemini = genai.GenerativeModel(MODEL_NAME)
    return gemini.generate_content(prompt).text.strip()

# ============================================
# 4. Upload CSV
# ============================================
print("üìÅ Please upload your CSV file (must contain a 'free_text' column):")
uploaded = files.upload()

if not uploaded:
    raise ValueError("No file uploaded.")

input_filename = list(uploaded.keys())[0]
print(f"‚úÖ Uploaded: {input_filename}")

df = pd.read_csv(input_filename)
print("Columns found:", list(df.columns))

# Check that free_text column exists
if "free_text" not in df.columns:
    raise ValueError("Column 'free_text' not found. Please rename your text column to 'free_text' or change the code.")

# ============================================
# 5. Apply noise to free_text column
# ============================================
# One Gemini request per row; progress_apply shows a progress bar.
tqdm.pandas(desc="Injecting noise")
df["free_text_noisy"] = df["free_text"].astype(str).progress_apply(inject_noise_with_gemini)

# ============================================
# 6. Save & download noisy CSV
# ============================================
# Only a trailing ".csv" (case-insensitive) is removed before the suffix is
# appended. With str.replace, a name without a lowercase ".csv" stayed unchanged
# and the output overwrote the uploaded file.
if input_filename.lower().endswith(".csv"):
    output_stem = input_filename[:-len(".csv")]
else:
    output_stem = input_filename
output_filename = f"{output_stem}_gemini_noisy.csv"
df.to_csv(output_filename, index=False)
print(f"üíæ Saved noisy file as: {output_filename}")

files.download(output_filename)


üìÅ Please upload your CSV file (must contain a 'free_text' column):


KeyboardInterrupt: 

In [None]:
import os
import json
import pandas as pd
from datetime import datetime
from dotenv import load_dotenv
from openai import OpenAI

# Load environment variables (a local .env file, if present, is merged into os.environ)
load_dotenv()

# Initialize OpenAI client with the key loaded above. The key used to be a
# hardcoded empty string, which made the load_dotenv() call pointless. The ""
# default keeps construction from failing when the variable is unset.
client = OpenAI(api_key=os.getenv("OPENAI_API_KEY", ""))

# Mapping dictionaries based on feature_enginnering.md
# (numeric code in the dataset -> wording used in prompts)
SEX_MAP = {0: "female", 1: "male"}
CHEST_PAIN_MAP = {0: "typical angina", 1: "atypical angina", 2: "non-anginal pain", 3: "asymptomatic"}
FASTING_BLOOD_SUGAR_MAP = {0: "normal", 1: "elevated"}
RESTING_ECG_MAP = {0: "normal ECG", 1: "ST-T wave abnormality", 2: "left ventricular hypertrophy"}
EXERCISE_ANGINA_MAP = {0: "No", 1: "Yes"}
ST_SLOPE_MAP = {0: "upsloping", 1: "flat", 2: "downsloping"}
TARGET_MAP = {0: "No heart Disease", 1: "Heart Disease"}

# List of all fields for validation (keys expected in the reverse-mapped JSON)
FIELDS = ["age", "sex", "chest_pain_type", "resting_bp", "cholesterol",
          "fasting_blood_sugar", "resting_ecg", "max_heart_rate",
          "exercise_angina", "oldpeak", "ST_slope"]


# =============================================================================
# STEP 2.1: Validation & Correction - Check for missing/contradictions
# =============================================================================
def step2_1_validate_and_correct(free_text, original_row):
    """
    STEP 2.1: Ask the LLM to check whether the free-text description is missing
    clinical information or contradicts the original data, and to correct it
    while keeping the language natural.

    Uses OpenAI.

    Args:
        free_text: Generated description to validate.
        original_row: Mapping-like row holding the original values under the
            dataset's column names ('age', 'sex', 'chest pain type', ...).

    Returns:
        The JSON object parsed from the model reply (expected keys:
        "has_issues", "issues_found", "corrected_text"). When the request
        fails, or the reply is not a JSON object, a no-issue result carrying
        the unchanged ``free_text`` is returned instead.
    """
    # Result used whenever the model reply cannot be used.
    fallback = {"has_issues": False, "issues_found": [], "corrected_text": free_text}

    # Prepare original values with mappings (unknown codes are shown as-is)
    sex = SEX_MAP.get(int(original_row['sex']), str(original_row['sex']))
    chest_pain = CHEST_PAIN_MAP.get(int(original_row['chest pain type']), str(original_row['chest pain type']))
    fbs = FASTING_BLOOD_SUGAR_MAP.get(int(original_row['fasting blood sugar']), str(original_row['fasting blood sugar']))
    ecg = RESTING_ECG_MAP.get(int(original_row['resting ecg']), str(original_row['resting ecg']))
    exercise_angina = EXERCISE_ANGINA_MAP.get(int(original_row['exercise angina']), str(original_row['exercise angina']))
    st_slope = ST_SLOPE_MAP.get(int(original_row['ST slope']), str(original_row['ST slope']))

    validation_prompt = f"""
ORIGINAL PATIENT DATA (Ground Truth):
- Age: {original_row['age']} years
- Sex: {sex}
- Chest Pain Type: {chest_pain}
- Resting Blood Pressure: {original_row['resting bp s']} mm Hg
- Serum Cholesterol: {original_row['cholesterol']} mg/dL
- Fasting Blood Sugar: {fbs}
- Resting ECG: {ecg}
- Maximum Heart Rate Achieved: {original_row['max heart rate']} bpm
- Exercise-Induced Angina: {exercise_angina}
- ST Depression (Oldpeak): {original_row['oldpeak']}
- ST Slope at Peak Exercise: {st_slope}

GENERATED FREE-TEXT DESCRIPTION:
{free_text}
"""

    try:
        response = client.chat.completions.create(
            model="gpt-4o",
            messages=[
                {
                    "role": "system",
                    "content": """You are a medical data validator. Your task is to:

1. COMPARE the generated free-text description with the original patient data
2. CHECK for:
   - Missing clinical information (any value not mentioned in the text)
   - Contradictions (values that don't match the original data)
   - Incorrect medical terminology or values
3. If there are issues, CORRECT the text to accurately reflect ALL original data
4. Keep the corrected text natural, flowing, and medically appropriate

Output format (JSON):
{
    "has_issues": true/false,
    "issues_found": ["list of specific issues found"],
    "corrected_text": "the corrected free-text (or original if no issues)"
}

Return ONLY valid JSON, nothing else."""
                },
                {
                    "role": "user",
                    "content": validation_prompt
                }
            ],
            max_tokens=700,
            temperature=0
        )

        # message.content may be None; treat it as an empty (unparseable) reply.
        result = (response.choices[0].message.content or "").strip()
        # Clean up potential markdown formatting (```json ... ``` fences)
        if result.startswith("```"):
            result = result.split("```")[1]
            if result.startswith("json"):
                result = result[4:]
        result = result.strip()

        parsed = json.loads(result)
    except json.JSONDecodeError as e:
        print(f"  [Step 2.1] JSON parse error: {e}")
        return fallback
    except Exception as e:
        # Best effort: one failed request must not abort the whole run.
        print(f"  [Step 2.1] OpenAI API error: {e}")
        return fallback

    # Callers read the result with .get(); valid JSON that is not an object
    # (a list, a bare string, ...) would crash them.
    if not isinstance(parsed, dict):
        print(f"  [Step 2.1] Unexpected JSON type: {type(parsed).__name__}")
        return fallback

    return parsed


# =============================================================================
# STEP 2.2: Reverse-Mapping Verification
# =============================================================================
def step2_2_reverse_map(corrected_text):
    """
    STEP 2.2: Given ONLY the corrected free-text description, ask the LLM to
    re-generate the original clinical values. This tests whether the text
    accurately represents the data.

    Uses OpenAI.

    Args:
        corrected_text: Free-text description produced by Step 2.1.

    Returns:
        The JSON object (dict) parsed from the model reply, or None when the
        request fails or the reply is not a JSON object.
    """
    try:
        response = client.chat.completions.create(
            model="gpt-4o",
            messages=[
                {
                    "role": "system",
                    "content": """You are a medical data extractor. Given ONLY a clinical free-text description, extract the patient's clinical values.

Use these mappings to convert text descriptions to numeric codes:
- sex: female=0, male=1
- chest pain type: typical angina=0, atypical angina=1, non-anginal pain=2, asymptomatic=3
- fasting blood sugar: normal/‚â§120=0, elevated/>120=1
- resting ecg: normal ECG/normal=0, ST-T wave abnormality=1, left ventricular hypertrophy=2
- exercise angina: No=0, Yes=1
- ST slope: upsloping=0, flat=1, downsloping=2

Extract and return ONLY these values in JSON format:
{
    "age": <number>,
    "sex": <0 or 1>,
    "chest_pain_type": <0-3>,
    "resting_bp": <number>,
    "cholesterol": <number>,
    "fasting_blood_sugar": <0 or 1>,
    "resting_ecg": <0-2>,
    "max_heart_rate": <number>,
    "exercise_angina": <0 or 1>,
    "oldpeak": <number>,
    "ST_slope": <0-2>
}

Return ONLY valid JSON, nothing else."""
                },
                {
                    "role": "user",
                    "content": f"Extract clinical values from this medical report:\n\n{corrected_text}"
                }
            ],
            max_tokens=300,
            temperature=0
        )

        # message.content may be None; treat it as an empty (unparseable) reply.
        result = (response.choices[0].message.content or "").strip()
        # Clean up potential markdown formatting (```json ... ``` fences)
        if result.startswith("```"):
            result = result.split("```")[1]
            if result.startswith("json"):
                result = result[4:]
        result = result.strip()

        parsed = json.loads(result)
    except json.JSONDecodeError as e:
        print(f"  [Step 2.2] JSON parse error: {e}")
        return None
    except Exception as e:
        # Best effort: one failed request must not abort the whole run.
        print(f"  [Step 2.2] OpenAI API error: {e}")
        return None

    # Only a JSON object can be compared field by field; anything else is
    # reported as a failed extraction.
    if not isinstance(parsed, dict):
        print(f"  [Step 2.2] Unexpected JSON type: {type(parsed).__name__}")
        return None

    return parsed

def compare_values(original_row, regenerated_values):
    """
    Compare original values with regenerated values and calculate a consistency score.

    Args:
        original_row: Mapping-like row with the original values under the
            dataset's column names ('age', 'sex', 'chest pain type', ...).
        regenerated_values: Dict of values extracted from the free text, keyed
            by the names in ``FIELDS``; None (or any non-dict) means the
            extraction failed.

    Returns:
        Dict with "consistency_score" (percentage of matching fields, rounded
        to 2 decimals), "field_matches" (per-field original / regenerated /
        match), "total_matches" and "total_fields".
    """
    # A failed or malformed extraction scores zero instead of crashing on .get().
    if not isinstance(regenerated_values, dict):
        return {"consistency_score": 0, "field_matches": {}, "total_matches": 0, "total_fields": len(FIELDS)}

    field_matches = {}

    # Map original row keys to our field names. Continuous measurements are
    # stored as float, categorical codes as int; the type selects the
    # comparison rule below.
    original_mapping = {
        "age": float(original_row['age']),
        "sex": int(original_row['sex']),
        "chest_pain_type": int(original_row['chest pain type']),
        "resting_bp": float(original_row['resting bp s']),
        "cholesterol": float(original_row['cholesterol']),
        "fasting_blood_sugar": int(original_row['fasting blood sugar']),
        "resting_ecg": int(original_row['resting ecg']),
        "max_heart_rate": float(original_row['max heart rate']),
        "exercise_angina": int(original_row['exercise angina']),
        "oldpeak": float(original_row['oldpeak']),
        "ST_slope": int(original_row['ST slope'])
    }

    # NOTE(review): the same 0.5 tolerance is applied to every float field,
    # including oldpeak, whose values may differ by less than that - confirm.
    float_tolerance = 0.5

    total_matches = 0
    for field in FIELDS:
        original_val = original_mapping.get(field)
        regen_val = regenerated_values.get(field)
        # LLM output is untrusted: "male", "n/a", lists, ... become None here
        # and count as a mismatch rather than raising.
        regen_number = _to_number(regen_val)

        if original_val is None or regen_number is None:
            match = False
        elif isinstance(original_val, float):
            match = bool(abs(original_val - regen_number) < float_tolerance)
        else:
            # Categorical codes must agree exactly (1.9 is not code 1).
            match = bool(regen_number == original_val)

        field_matches[field] = {
            "original": original_val,
            "regenerated": regen_val,
            "match": match
        }

        if match:
            total_matches += 1

    consistency_score = (total_matches / len(FIELDS)) * 100

    return {
        "consistency_score": round(consistency_score, 2),
        "field_matches": field_matches,
        "total_matches": total_matches,
        "total_fields": len(FIELDS)
    }


def _to_number(value):
    """Return ``value`` as a float, or None when it cannot be read as a number."""
    if value is None:
        return None
    try:
        return float(value)
    except (TypeError, ValueError):
        return None


# =============================================================================
# MAIN VALIDATION PIPELINE (PHASE 2)
# =============================================================================
def validate_dataset(input_file, original_data_file, output_prefix="output"):
    """
    Validate the generated Free-Text from Phase 1.

    This is Phase 2 of the pipeline:
    - Step 2.1: Check for missing content/contradictions in Free-Text
    - Step 2.2: Reverse-mapping verification

    Rows are paired by position. When the two files differ in length, only the
    rows present in both are validated.

    Args:
        input_file: CSV from Phase 1 (heart_disease_freetext_step1_generated.csv)
                    Contains: Feature, Free_Text, Target
        original_data_file: Original dataset (Heart_Disease.csv) for validation comparison
        output_prefix: Prefix for output files

    Outputs:
    1. {output_prefix}_final.csv - Final dataset with validated free-text
    2. {output_prefix}_validation_report.csv - Full audit trail
    3. {output_prefix}_validation_summary.txt - Statistics summary

    Returns:
        Tuple ``(final_df, report_df)`` with the frames written to outputs 1 and 2.
    """
    def pct(count):
        """Share of processed rows as a percentage; 0.0 when nothing was processed."""
        return (count / total_rows) * 100 if total_rows else 0.0

    # Read the Phase 1 output
    print(f"\n{'='*70}")
    print(f"FREE-TEXT VALIDATION PIPELINE (PHASE 2)")
    print(f"Using OpenAI: gpt-4o")
    print(f"{'='*70}")

    print(f"\nReading Phase 1 output from: {input_file}")
    phase1_df = pd.read_csv(input_file)

    print(f"Reading original data from: {original_data_file}")
    original_df = pd.read_csv(original_data_file)

    print(f"Phase 1 dataset shape: {phase1_df.shape}")
    print(f"Original dataset shape: {original_df.shape}")

    total_rows = len(phase1_df)

    if len(phase1_df) != len(original_df):
        print("WARNING: Row count mismatch between Phase 1 output and original data!")
        # Indexing past the shorter file would raise IndexError mid-run, so
        # restrict the run to the rows both files have.
        total_rows = min(len(phase1_df), len(original_df))
        print(f"         Only the first {total_rows} rows present in both files will be validated.")

    # =========================================================================
    # PHASE 2: Validation & Verification for Free-Text ONLY
    # =========================================================================
    print(f"\n{'='*70}")
    print("PHASE 2: FREE-TEXT VALIDATION & VERIFICATION")
    print("  - Step 2.1: Check for missing content/contradictions")
    print("  - Step 2.2: Reverse-mapping verification")
    print(f"{'='*70}")

    results = []
    for idx in range(total_rows):
        row = original_df.iloc[idx]
        feature_text = phase1_df.iloc[idx]['Feature']
        free_text = phase1_df.iloc[idx]['Free_Text']
        target = phase1_df.iloc[idx]['Target']

        print(f"\n[Row {idx + 1}/{total_rows}] Validating Free-Text...")

        # Step 2.1: Validate and correct Free-Text
        print(f"  Step 2.1: Checking for missing content/contradictions...")
        validation_result = step2_1_validate_and_correct(free_text, row)
        # A missing, null or empty correction keeps the Phase 1 text.
        corrected_free_text = validation_result.get("corrected_text") or free_text
        has_issues = validation_result.get("has_issues", False)

        # The model may return a bare string, null or non-string items; the
        # report joins these with "; ", so normalise to a list of strings.
        issues_found = validation_result.get("issues_found") or []
        if not isinstance(issues_found, (list, tuple)):
            issues_found = [issues_found]
        issues_found = [str(issue) for issue in issues_found]

        if has_issues:
            print(f"    ‚Üí Issues found: {len(issues_found)}")
        else:
            print(f"    ‚Üí No issues found")

        # Step 2.2: Reverse-map and compare
        print(f"  Step 2.2: Reverse-mapping for verification...")
        regenerated_values = step2_2_reverse_map(corrected_free_text)
        comparison = compare_values(row, regenerated_values)

        print(f"  ‚úì Consistency Score: {comparison['consistency_score']}% ({comparison['total_matches']}/{comparison['total_fields']} fields)")

        results.append({
            "row_index": idx,
            "feature_text": feature_text,
            "original_free_text": free_text,
            "has_issues": has_issues,
            "issues_found": issues_found,
            "corrected_free_text": corrected_free_text,
            "regenerated_values": regenerated_values,
            "comparison": comparison,
            "target": target
        })

    print(f"\n{'='*70}")
    print(f"‚úì PHASE 2 COMPLETE: Validated {total_rows} rows")
    print(f"{'='*70}")

    # ==========================================================================
    # Generate Output 1: final.csv (Feature + Validated Free_Text + Target + Score)
    # ==========================================================================
    final_data = []
    for r in results:
        final_data.append({
            "Feature": r["feature_text"],
            "Free_Text": r["corrected_free_text"],
            "Target": r["target"],
            "consistency_score": r["comparison"]["consistency_score"]
        })

    final_df = pd.DataFrame(final_data)
    final_file = f"{output_prefix}_final.csv"
    final_df.to_csv(final_file, index=False)
    print(f"\n‚úì Final dataset saved: {final_file}")

    # ==========================================================================
    # Generate Output 2: validation_report.csv
    # ==========================================================================
    report_data = []
    for r in results:
        field_details = []
        if r["comparison"]["field_matches"]:
            for field, data in r["comparison"]["field_matches"].items():
                field_details.append(f"{field}: orig={data['original']}, regen={data['regenerated']}, match={data['match']}")

        report_data.append({
            "row_index": r["row_index"],
            "feature_text": r["feature_text"],
            "original_free_text": r["original_free_text"],
            "had_issues": r["has_issues"],
            "issues_found": "; ".join(r["issues_found"]) if r["issues_found"] else "None",
            "corrected_free_text": r["corrected_free_text"],
            "regenerated_values": json.dumps(r["regenerated_values"]) if r["regenerated_values"] else "Failed",
            "consistency_score": r["comparison"]["consistency_score"],
            "matches": f"{r['comparison']['total_matches']}/{r['comparison']['total_fields']}",
            "field_details": " | ".join(field_details),
            "target": r["target"]
        })

    report_df = pd.DataFrame(report_data)
    report_file = f"{output_prefix}_validation_report.csv"
    report_df.to_csv(report_file, index=False)
    print(f"‚úì Validation report saved: {report_file}")

    # ==========================================================================
    # Generate Output 3: validation_summary.txt
    # ==========================================================================
    scores = [r["comparison"]["consistency_score"] for r in results]
    avg_score = sum(scores) / len(scores) if scores else 0
    perfect_matches = sum(1 for s in scores if s == 100)
    high_consistency = sum(1 for s in scores if s >= 90)
    low_consistency = sum(1 for s in scores if s < 90)
    issues_count = sum(1 for r in results if r["has_issues"])

    # Calculate per-field accuracy (count matches first, then convert to percent)
    field_accuracy = {field: 0 for field in FIELDS}
    for r in results:
        if r["comparison"]["field_matches"]:
            for field, data in r["comparison"]["field_matches"].items():
                if data["match"]:
                    field_accuracy[field] += 1

    for field in field_accuracy:
        field_accuracy[field] = round(pct(field_accuracy[field]), 2)

    summary_content = f"""
================================================================================
FREE-TEXT VALIDATION SUMMARY REPORT
Generated: {datetime.now().strftime('%Y-%m-%d %H:%M:%S')}
================================================================================

INPUT FILES:
- Phase 1 Output: {input_file}
- Original Data:  {original_data_file}

TOTAL ROWS PROCESSED: {total_rows}

LLM USED: OpenAI (gpt-4o)

================================================================================
PHASE 2: FREE-TEXT VALIDATION RESULTS
================================================================================

STEP 2.1 - Missing Content/Contradiction Check:
-----------------------------------------------
Rows with issues found:    {issues_count} ({pct(issues_count):.1f}%)
Rows without issues:       {total_rows - issues_count} ({pct(total_rows - issues_count):.1f}%)

STEP 2.2 - Reverse-Mapping Consistency:
-----------------------------------------------
Average Consistency Score: {avg_score:.2f}%
Perfect Matches (100%):    {perfect_matches} rows ({pct(perfect_matches):.1f}%)
High Consistency (>=90%):  {high_consistency} rows ({pct(high_consistency):.1f}%)
Low Consistency (<90%):    {low_consistency} rows ({pct(low_consistency):.1f}%)

--------------------------------------------------------------------------------
PER-FIELD ACCURACY (Reverse-Mapping Verification)
--------------------------------------------------------------------------------
"""

    # One 20-character bar per field, 5 percentage points per character.
    for field, accuracy in sorted(field_accuracy.items(), key=lambda x: x[1], reverse=True):
        bar_filled = int(accuracy / 5)
        bar_empty = 20 - bar_filled
        bar = "=" * bar_filled + "-" * bar_empty
        summary_content += f"{field:20s}: [{bar}] {accuracy:6.2f}%\n"

    summary_content += f"""
--------------------------------------------------------------------------------
CONSISTENCY SCORE DISTRIBUTION
--------------------------------------------------------------------------------
100%:     {"#" * min(perfect_matches, 50)} ({perfect_matches})
90-99%:   {"#" * min(high_consistency - perfect_matches, 50)} ({high_consistency - perfect_matches})
<90%:     {"#" * min(low_consistency, 50)} ({low_consistency})

--------------------------------------------------------------------------------
INTERPRETATION
--------------------------------------------------------------------------------
"""

    if avg_score >= 95:
        summary_content += "EXCELLENT: The free-text generation is highly reliable.\n"
        summary_content += "   The generated narratives accurately reflect the original clinical data.\n"
    elif avg_score >= 90:
        summary_content += "GOOD: The free-text generation is reliable.\n"
        summary_content += "   Minor discrepancies exist but overall quality is acceptable.\n"
    elif avg_score >= 80:
        summary_content += "FAIR: The free-text generation needs improvement.\n"
        summary_content += "   Review low-scoring rows and consider prompt refinement.\n"
    else:
        summary_content += "POOR: The free-text generation requires significant improvement.\n"
        summary_content += "   Many rows have inconsistencies. Review and refine the approach.\n"

    summary_content += f"""
================================================================================
OUTPUT FILES GENERATED
================================================================================
1. {output_prefix}_final.csv              - Final dataset with validated free-text
2. {output_prefix}_validation_report.csv  - Full audit trail for each row
3. {output_prefix}_validation_summary.txt - This summary report

================================================================================
"""

    summary_file = f"{output_prefix}_validation_summary.txt"
    with open(summary_file, 'w', encoding='utf-8') as f:
        f.write(summary_content)
    print(f"‚úì Validation summary saved: {summary_file}")

    # Print summary to console
    print(f"\n{'='*70}")
    print("VALIDATION COMPLETE - SUMMARY")
    print(f"{'='*70}")
    print(f"Total rows processed: {total_rows}")
    print(f"Rows with issues in Step 2.1: {issues_count} ({pct(issues_count):.1f}%)")
    print(f"Average consistency score: {avg_score:.2f}%")
    print(f"Perfect matches (100%): {perfect_matches} ({pct(perfect_matches):.1f}%)")
    print(f"High consistency (>=90%): {high_consistency} ({pct(high_consistency):.1f}%)")

    return final_df, report_df


def main():
    """Run the Phase 2 validation pipeline on the default heart-disease files."""
    # Arguments, in order: Phase 1 output (from transform_dataset_freetext.py),
    # the original dataset, and the prefix shared by all output files.
    _final_df, _report_df = validate_dataset(
        "heart_disease_freetext_step1_generated.csv",
        "Heart_Disease.csv",
        "heart_disease_freetext",
    )

    print("\n‚úÖ Validation complete!")

# Entry point: start the validation run when this code is executed directly
# (module name is "__main__"), not when it is imported.
if __name__ == "__main__":
    main()

[1;30;43mStreaming output truncated to the last 5000 lines.[0m
  ‚úì Consistency Score: 100.0% (11/11 fields)

[Row 361/1190] Validating Free-Text...
  Step 2.1: Checking for missing content/contradictions...
    ‚Üí No issues found
  Step 2.2: Reverse-mapping for verification...
  ‚úì Consistency Score: 100.0% (11/11 fields)

[Row 362/1190] Validating Free-Text...
  Step 2.1: Checking for missing content/contradictions...
    ‚Üí No issues found
  Step 2.2: Reverse-mapping for verification...
  ‚úì Consistency Score: 100.0% (11/11 fields)

[Row 363/1190] Validating Free-Text...
  Step 2.1: Checking for missing content/contradictions...
    ‚Üí Issues found: 2
  Step 2.2: Reverse-mapping for verification...
  ‚úì Consistency Score: 90.91% (10/11 fields)

[Row 364/1190] Validating Free-Text...
  Step 2.1: Checking for missing content/contradictions...
    ‚Üí Issues found: 1
  Step 2.2: Reverse-mapping for verification...
  ‚úì Consistency Score: 100.0% (11/11 fields)

[Row 365/1190] 

In [None]:
from google.colab import files

# File name follows the "<prefix>_validation_report.csv" pattern.
output_prefix = "heart_disease_freetext"
report_file = output_prefix + "_validation_report.csv"

# Hand the report to the browser, then confirm on the console.
files.download(report_file)
print("‚úÖ Downloaded: " + report_file)

<IPython.core.display.Javascript object>

<IPython.core.display.Javascript object>

‚úÖ Downloaded: heart_disease_freetext_validation_report.csv


In [None]:
# Upload the 'Heart_Disease.csv' file
from google.colab import files

print("Please upload the 'Heart_Disease.csv' file:")
uploaded_original_data = files.upload()

# The upload result is checked for the exact expected file name; a wrong or
# missing file is only reported, not raised.
if 'Heart_Disease.csv' in uploaded_original_data:
    print("File 'Heart_Disease.csv' uploaded successfully.")
else:
    print("Error: 'Heart_Disease.csv' not uploaded. Please try again.")

Please upload the 'Heart_Disease.csv' file:


Saving Heart_Disease.csv to Heart_Disease.csv
File 'Heart_Disease.csv' uploaded successfully.


In [None]:
# Upload the 'heart_disease_freetext_step1_generated.csv' file
from google.colab import files

print("Please upload the 'heart_disease_freetext_step1_generated.csv' file:")
uploaded_phase1 = files.upload()

# The upload result is checked for the exact expected file name; a wrong or
# missing file is only reported, not raised.
if 'heart_disease_freetext_step1_generated.csv' in uploaded_phase1:
    print("File 'heart_disease_freetext_step1_generated.csv' uploaded successfully.")
else:
    print("Error: 'heart_disease_freetext_step1_generated.csv' not uploaded. Please try again.")

Please upload the 'heart_disease_freetext_step1_generated.csv' file:


Saving heart_disease_freetext_step1_generated.csv to heart_disease_freetext_step1_generated.csv
File 'heart_disease_freetext_step1_generated.csv' uploaded successfully.
