In [3]:
import os
import pandas as pd
from dotenv import load_dotenv
from openai import OpenAI
from google.colab import files
import io

# Load environment variables
load_dotenv()

# Initialize OpenAI client


# Mapping dictionaries based on feature_enginnering.md
SEX_MAP = {0: "female", 1: "male"}
CHEST_PAIN_MAP = {0: "typical angina", 1: "atypical angina", 2: "non-anginal pain", 3: "asymptomatic"}
FASTING_BLOOD_SUGAR_MAP = {0: "normal", 1: "elevated"}
RESTING_ECG_MAP = {0: "normal ECG", 1: "ST-T wave abnormality", 2: "left ventricular hypertrophy"}
EXERCISE_ANGINA_MAP = {0: "No", 1: "Yes"}
ST_SLOPE_MAP = {0: "upsloping", 1: "flat", 2: "downsloping"}
TARGET_MAP = {0: "No heart Disease", 1: "Heart Disease"}


def transform_row_to_feature_text(row):
    """
    Transform a single row of data into structured feature text (fallback/manual).
    """
    feature_parts = [
        f"age-{int(row['age'])}",
        f"sex-{SEX_MAP.get(int(row['sex']), row['sex'])}",
        f"chest pain type-{CHEST_PAIN_MAP.get(int(row['chest pain type']), row['chest pain type'])}",
        f"resting bp-{int(row['resting bp s'])}mm Hg",
        f"cholesterol-{int(row['cholesterol'])} mg/dL",
        f"fasting blood sugar-{FASTING_BLOOD_SUGAR_MAP.get(int(row['fasting blood sugar']), row['fasting blood sugar'])}",
        f"resting ecg-{RESTING_ECG_MAP.get(int(row['resting ecg']), row['resting ecg'])}",
        f"Maximum Heart Rate-{int(row['max heart rate'])}",
        f"Exercise Angina-{EXERCISE_ANGINA_MAP.get(int(row['exercise angina']), row['exercise angina'])}",
        f"oldpeak-{row['oldpeak']}",
        f"ST slope-{ST_SLOPE_MAP.get(int(row['ST slope']), row['ST slope'])}"
    ]
    return ", ".join(feature_parts)


def transform_target(target_value):
    """Transform target value to human-readable text."""
    return TARGET_MAP.get(int(target_value), str(target_value))


# =============================================================================
# STEP 1A: Generate Feature (Structured Clinical Data) using LLM
# =============================================================================
def step1_generate_feature_text(row_data):
    """
    STEP 1A: Use GPT-4o to transform raw row data into structured feature text.
    """
    raw_data = f"""
    age: {row_data['age']}
    sex: {row_data['sex']}
    chest pain type: {row_data['chest pain type']}
    resting bp s: {row_data['resting bp s']}
    cholesterol: {row_data['cholesterol']}
    fasting blood sugar: {row_data['fasting blood sugar']}
    resting ecg: {row_data['resting ecg']}
    max heart rate: {row_data['max heart rate']}
    exercise angina: {row_data['exercise angina']}
    oldpeak: {row_data['oldpeak']}
    ST slope: {row_data['ST slope']}
    """

    try:
        response = client.chat.completions.create(
            model="gpt-4o",
            messages=[
                {
                    "role": "system",
                    "content": """You are a medical data transformer. Convert the given patient data into a descriptive text format.\n\nUse these mappings:\n- sex: 0=female, 1=male\n- chest pain type: 0=typical angina, 1=atypical angina, 2=non-anginal pain, 3=asymptomatic\n- fasting blood sugar: 0=normal, 1=elevated\n- resting ecg: 0=normal ECG, 1=ST-T wave abnormality, 2=left ventricular hypertrophy\n- exercise angina: 0=No, 1=Yes\n- ST slope: 0=upsloping, 1=flat, 2=downsloping\n\nOutput format (single line, comma-separated):\nage-[value], sex-[mapped], chest pain type-[mapped], resting bp-[value]mm Hg, cholesterol-[value] mg/dL, fasting blood sugar-[mapped], resting ecg-[mapped], Maximum Heart Rate-[value], Exercise Angina-[mapped], oldpeak-[value], ST slope-[mapped]\n\nReturn ONLY the formatted text, nothing else."""
                },
                {
                    "role": "user",
                    "content": raw_data
                }
            ],
            max_tokens=300,
            temperature=0
        )
        return response.choices[0].message.content.strip()
    except Exception as e:
        print(f"  [Step 1A] OpenAI API error: {e}")
        return transform_row_to_feature_text(row_data)


# =============================================================================
# STEP 1B: Generate Free-Text (Medical Report Style) using LLM
# =============================================================================
def step1_generate_free_text(row_data):
    """
    STEP 1B: Use GPT-4o to generate a natural language medical report style
    free-text description from raw patient data.
    """
    # Prepare mapped values for the prompt
    sex = SEX_MAP.get(int(row_data['sex']), str(row_data['sex']))
    chest_pain = CHEST_PAIN_MAP.get(int(row_data['chest pain type']), str(row_data['chest pain type']))
    fbs = FASTING_BLOOD_SUGAR_MAP.get(int(row_data['fasting blood sugar']), str(row_data['fasting blood sugar']))
    ecg = RESTING_ECG_MAP.get(int(row_data['resting ecg']), str(row_data['resting ecg']))
    exercise_angina = EXERCISE_ANGINA_MAP.get(int(row_data['exercise angina']), str(row_data['exercise angina']))
    st_slope = ST_SLOPE_MAP.get(int(row_data['ST slope']), str(row_data['ST slope']))

    patient_data = f"""
Patient Clinical Data:
- Age: {row_data['age']} years
- Sex: {sex}
- Chest Pain Type: {chest_pain}
- Resting Blood Pressure: {row_data['resting bp s']} mm Hg
- Serum Cholesterol: {row_data['cholesterol']} mg/dL
- Fasting Blood Sugar: {fbs} (\u2264120 mg/dL is normal)
- Resting ECG: {ecg}
- Maximum Heart Rate Achieved: {row_data['max heart rate']} bpm
- Exercise-Induced Angina: {exercise_angina}
- ST Depression (Oldpeak): {row_data['oldpeak']}
- ST Slope at Peak Exercise: {st_slope}
"""

    try:
        response = client.chat.completions.create(
            model="gpt-4o",
            messages=[
                {
                    "role": "system",
                    "content": """You are a medical professional writing a clinical report. Generate a natural, flowing free-text medical report based on the patient's clinical data.\n\nGuidelines:\n1. Write in MEDICAL REPORT style (third person, professional tone)\n2. Include ALL clinical values naturally in the narrative\n3. Use proper medical terminology\n4. Make it read like a real clinical EHR note\n5. Include observations about risk factors where appropriate\n6. Keep it to 1-2 paragraphs (150-250 words)\n\nExample style:\n\"The patient is a 54-year-old male who presented for cardiac evaluation. Clinical examination revealed a resting blood pressure of 140 mm Hg and serum cholesterol level of 239 mg/dL, both of which are elevated and represent cardiovascular risk factors. The patient reported experiencing atypical angina-type chest pain. Fasting blood sugar was within normal limits (\u2264120 mg/dL). Resting electrocardiogram showed normal results. During stress testing, the patient achieved a maximum heart rate of 160 bpm and did not experience exercise-induced angina. ST segment analysis revealed an oldpeak depression of 1.2 with a flat slope pattern at peak exercise, which may warrant further evaluation for potential ischemic changes.\"\n\nReturn ONLY the free-text narrative, nothing else."""
                },
                {
                    "role": "user",
                    "content": patient_data
                }
            ],
            max_tokens=500,
            temperature=0.3
        )
        return response.choices[0].message.content.strip()
    except Exception as e:
        print(f"  [Step 1B] OpenAI API error: {e}")
        return f"Medical report generation failed for this patient. Raw data: Age {row_data['age']}, Sex {sex}."


# =============================================================================
# MAIN PIPELINE - PHASE 1 ONLY (Text Generation)
# =============================================================================
def transform_dataset(input_df, output_prefix="output"):
    """
    Transform the entire dataset - PHASE 1 ONLY.

    Generates:
    - Structured Feature text (LLM generated - GPT-4o)
    - Free-text narrative (LLM generated - GPT-4o)

    Output:
    - {output_prefix}_step1_generated.csv - Feature + Free_Text + Target

    NOTE: Run validation.py separately for Phase 2 (validation & verification)
    """
    # Use the provided DataFrame instead of reading from file
    print(f"\n{'='*70}")
    print(f"HEART DISEASE DATA TRANSFORMATION PIPELINE")
    print(f"PHASE 1: SYNTHETIC TEXT GENERATION")
    print(f"{'='*70}")
    print(f"\nUsing provided DataFrame for processing...")
    df = input_df

    print(f"Original dataset shape: {df.shape}")
    print(f"Columns: {list(df.columns)}")

    total_rows = len(df)

    # =========================================================================
    # PHASE 1: Generate Feature (structured) + Free-Text for ALL rows
    # =========================================================================
    print(f"\n{'='*70}")
    print("GENERATING SYNTHETIC TEXT:")
    print("  - Feature: Structured clinical data (GPT-4o)")
    print("  - Free_Text: Medical report narrative (GPT-4o)")
    print(f"{'='*70}")

    generated_data = []
    for idx, row in df.iterrows():
        print(f"\n[Row {idx + 1}/{total_rows}] Generating texts...")

        # Generate structured Feature using LLM
        print(f"  \u2192 Generating structured Feature (LLM)...")
        feature_text = step1_generate_feature_text(row)

        # Generate Free-Text narrative (LLM)
        print(f"  \u2192 Generating Free-Text narrative (LLM)...")
        free_text = step1_generate_free_text(row)

        # Get target
        target = transform_target(row['target'])

        generated_data.append({
            "Feature": feature_text,
            "Free_Text": free_text,
            "Target": target
        })

        print(f"  \u2713 Done")

    print(f"\n{'='*70}")
    print(f"\u2713 PHASE 1 COMPLETE: Generated texts for {total_rows} rows")
    print(f"{'='*70}")

    # =========================================================================
    # Save output
    # =========================================================================
    output_df = pd.DataFrame(generated_data)
    output_file = f"{output_prefix}_step1_generated.csv"
    output_df.to_csv(output_file, index=False)

    print(f"\n\u2713 Output saved: {output_file}")
    print(f"\n{'='*70}")
    print("PHASE 1 SUMMARY")
    print(f"{'='*70}")
    print(f"Total rows processed: {total_rows}")
    print(f"Feature (structured): Generated for all {total_rows} rows (GPT-4o)")
    print(f"Free_Text (narrative): Generated for all {total_rows} rows (GPT-4o)")
    print(f"\nOutput file: {output_file}")
    print(f"\n{'='*70}")
    print("NEXT STEP:")
    print("  Run validation.py to perform Phase 2 (validation & verification)")
    print("  Command: python validation.py")
    print(f"{'='*70}")

    return output_df


def main():
    # --- File Upload ---
    print("Please upload your 'Heart_Disease.csv' file:")
    uploaded_file = files.upload()

    if uploaded_file:
        file_name = next(iter(uploaded_file))
        print(f"Uploaded file: {file_name}")
        df_uploaded = pd.read_csv(io.BytesIO(uploaded_file[file_name]))
        print("First 5 rows of the uploaded CSV:")
        display(df_uploaded.head())

        output_prefix = "heart_disease_freetext"
        # Run Phase 1 (text generation only) with the uploaded DataFrame
        output_df = transform_dataset(df_uploaded, output_prefix)

        print("\n\u2714\ufe0f Phase 1 complete! Run validation.py for Phase 2.")
    else:
        print("No file uploaded. Please upload 'Heart_Disease.csv' to proceed.")


if __name__ == "__main__":
    main()

Please upload your 'Heart_Disease.csv' file:


Saving Heart_Disease.csv to Heart_Disease.csv
Uploaded file: Heart_Disease.csv
First 5 rows of the uploaded CSV:


Unnamed: 0,age,sex,chest pain type,resting bp s,cholesterol,fasting blood sugar,resting ecg,max heart rate,exercise angina,oldpeak,ST slope,target
0,40,1,2,140,289,0,0,172,0,0.0,1,0
1,49,0,3,160,180,0,0,156,0,1.0,2,1
2,37,1,2,130,283,0,1,98,0,0.0,1,0
3,48,0,4,138,214,0,0,108,1,1.5,2,1
4,54,1,3,150,195,0,0,122,0,0.0,1,0


[1;30;43mStreaming output truncated to the last 5000 lines.[0m
  → Generating Free-Text narrative (LLM)...
  ✓ Done

[Row 196/1190] Generating texts...
  → Generating structured Feature (LLM)...
  → Generating Free-Text narrative (LLM)...
  ✓ Done

[Row 197/1190] Generating texts...
  → Generating structured Feature (LLM)...
  → Generating Free-Text narrative (LLM)...
  ✓ Done

[Row 198/1190] Generating texts...
  → Generating structured Feature (LLM)...
  → Generating Free-Text narrative (LLM)...
  ✓ Done

[Row 199/1190] Generating texts...
  → Generating structured Feature (LLM)...
  → Generating Free-Text narrative (LLM)...
  ✓ Done

[Row 200/1190] Generating texts...
  → Generating structured Feature (LLM)...
  → Generating Free-Text narrative (LLM)...
  ✓ Done

[Row 201/1190] Generating texts...
  → Generating structured Feature (LLM)...
  → Generating Free-Text narrative (LLM)...
  ✓ Done

[Row 202/1190] Generating texts...
  → Generating structured Feature (LLM)...
  → Generat

In [4]:
import pandas as pd

output_file_path = "heart_disease_freetext_step1_generated.csv"

try:
    output_df = pd.read_csv(output_file_path)
    print(f"Content of '{output_file_path}' (first 5 rows):")
    display(output_df.head())
except FileNotFoundError:
    print(f"Error: The file '{output_file_path}' was not found. Please ensure the generation process completed successfully.")
except Exception as e:
    print(f"An error occurred while reading the output file: {e}")

Content of 'heart_disease_freetext_step1_generated.csv' (first 5 rows):


Unnamed: 0,Feature,Free_Text,Target
0,"age-40.0, sex-male, chest pain type-non-angina...",The patient is a 40-year-old male who presente...,No heart Disease
1,"age-49.0, sex-female, chest pain type-asymptom...",The patient is a 49-year-old female who presen...,Heart Disease
2,"age-37.0, sex-male, chest pain type-non-angina...",The patient is a 37-year-old male who presente...,No heart Disease
3,"age-48.0, sex-female, chest pain type-asymptom...",The patient is a 48-year-old female who presen...,Heart Disease
4,"age-54.0, sex-male, chest pain type-asymptomat...",The patient is a 54-year-old male who presente...,No heart Disease


In [5]:
display(output_df.head(20))

Unnamed: 0,Feature,Free_Text,Target
0,"age-40.0, sex-male, chest pain type-non-angina...",The patient is a 40-year-old male who presente...,No heart Disease
1,"age-49.0, sex-female, chest pain type-asymptom...",The patient is a 49-year-old female who presen...,Heart Disease
2,"age-37.0, sex-male, chest pain type-non-angina...",The patient is a 37-year-old male who presente...,No heart Disease
3,"age-48.0, sex-female, chest pain type-asymptom...",The patient is a 48-year-old female who presen...,Heart Disease
4,"age-54.0, sex-male, chest pain type-asymptomat...",The patient is a 54-year-old male who presente...,No heart Disease
5,"age-39.0, sex-male, chest pain type-asymptomat...",The patient is a 39-year-old male who presente...,No heart Disease
6,"age-45.0, sex-female, chest pain type-non-angi...",The patient is a 45-year-old female who presen...,No heart Disease
7,"age-54.0, sex-male, chest pain type-non-angina...",The patient is a 54-year-old male who presente...,No heart Disease
8,"age-37.0, sex-male, chest pain type-asymptomat...",The patient is a 37-year-old male who presente...,Heart Disease
9,"age-48.0, sex-female, chest pain type-non-angi...",The patient is a 48-year-old female who presen...,No heart Disease


In [6]:
from google.colab import files

output_file_path = "heart_disease_freetext_step1_generated.csv"
files.download(output_file_path)

<IPython.core.display.Javascript object>

<IPython.core.display.Javascript object>