In [1]:
"""
NOTEBOOK 01: SYNTHETIC CLINICAL NOTES GENERATOR
Generate realistic clinical notes for 10 patients
"""

# Download required libraries
!pip install faker

# Imports and setup
import json
import random
import pandas as pd
from datetime import datetime, timedelta
from faker import Faker
import os

# Mount Drive
from google.colab import drive
drive.mount('/content/drive')

# Paths
PROJECT_ROOT = "/content/drive/MyDrive/Colab_Notebooks/LLMs/clinical_notes_qa_project"
OUTPUT_DIR = f"{PROJECT_ROOT}/01_data_generation/outputs"
RAW_NOTES_DIR = f"{OUTPUT_DIR}/raw_clinical_notes"
os.makedirs(RAW_NOTES_DIR, exist_ok=True)

# Initialize
fake = Faker()
Faker.seed(42)
random.seed(42)

print("‚úÖ Setup complete")
print(f"üìÅ Output: {OUTPUT_DIR}")

Collecting faker
  Downloading faker-40.1.2-py3-none-any.whl.metadata (16 kB)
Downloading faker-40.1.2-py3-none-any.whl (2.0 MB)
[2K   [90m‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ[0m [32m2.0/2.0 MB[0m [31m27.1 MB/s[0m eta [36m0:00:00[0m
[?25hInstalling collected packages: faker
Successfully installed faker-40.1.2
Mounted at /content/drive
‚úÖ Setup complete
üìÅ Output: /content/drive/MyDrive/Colab_Notebooks/LLMs/clinical_notes_qa_project/01_data_generation/outputs


In [2]:
# Medical conditions with realistic clinical parameters
CONDITIONS = {
    "Type 2 Diabetes": {
        "meds": ["Metformin 500mg", "Glipizide 5mg", "Insulin Glargine"],
        "labs": {"HbA1c": (7.0, 9.5), "Fasting Glucose": (140, 200)},
        "symptoms": ["increased thirst", "frequent urination", "fatigue"]
    },
    "Hypertension": {
        "meds": ["Lisinopril 10mg", "Amlodipine 5mg", "Hydrochlorothiazide 25mg"],
        "labs": {"Systolic BP": (140, 160), "Diastolic BP": (90, 100)},
        "symptoms": ["headaches", "dizziness", "chest discomfort"]
    },
    "Asthma": {
        "meds": ["Albuterol Inhaler", "Fluticasone 250mcg", "Montelukast 10mg"],
        "labs": {"Peak Flow": (250, 380)},
        "symptoms": ["wheezing", "shortness of breath", "coughing"]
    },
    "Chronic Kidney Disease": {
        "meds": ["Epoetin Alfa", "Sodium Bicarbonate", "Phosphate Binder"],
        "labs": {"Creatinine": (2.5, 4.0), "eGFR": (30, 59)},
        "symptoms": ["fatigue", "ankle swelling", "decreased urine output"]
    },
    "Hyperlipidemia": {
        "meds": ["Atorvastatin 20mg", "Rosuvastatin 10mg"],
        "labs": {"LDL": (160, 220), "Triglycerides": (200, 400)},
        "symptoms": ["none (asymptomatic)"]
    }
}

print(f"‚úÖ Loaded {len(CONDITIONS)} condition templates")

‚úÖ Loaded 5 condition templates


In [3]:
# Generate 10 patients with demographics and conditions
patients = []
for i in range(1, 11):
    patient_id = f"patient_{i:03d}"
    num_conditions = random.randint(1, 3)
    conditions = random.sample(list(CONDITIONS.keys()), num_conditions)
    num_visits = random.randint(3, 5)

    # Visit dates over past 2 years
    base_date = datetime.now() - timedelta(days=730)
    visit_dates = sorted([
        (base_date + timedelta(days=random.randint(0, 730))).strftime("%Y-%m-%d")
        for _ in range(num_visits)
    ])

    patients.append({
        "patient_id": patient_id,
        "name": fake.name(),
        "age": random.randint(35, 75),
        "gender": random.choice(["Male", "Female"]),
        "conditions": conditions,
        "visit_dates": visit_dates
    })

print(f"‚úÖ Generated {len(patients)} patients")
pd.DataFrame(patients)[["patient_id", "age", "gender", "conditions"]].head()

‚úÖ Generated 10 patients


Unnamed: 0,patient_id,age,gender,conditions
0,patient_001,69,Male,"[Type 2 Diabetes, Hyperlipidemia, Asthma]"
1,patient_002,73,Male,"[Chronic Kidney Disease, Type 2 Diabetes, Hype..."
2,patient_003,62,Female,"[Hypertension, Chronic Kidney Disease, Type 2 ..."
3,patient_004,57,Female,"[Hypertension, Hyperlipidemia]"
4,patient_005,70,Female,"[Asthma, Type 2 Diabetes, Hyperlipidemia]"


In [4]:
def generate_note(patient, visit_date, visit_num):
    """Generate a realistic clinical note"""
    condition = random.choice(patient['conditions'])
    cond_data = CONDITIONS[condition]

    # Generate lab results (70% abnormal since patients have chronic conditions)
    labs = {test: round(random.uniform(*ranges), 1)
            for test, ranges in cond_data['labs'].items()}

    # Select meds and symptoms
    meds = random.sample(cond_data['meds'], min(random.randint(1, 3), len(cond_data['meds'])))
    symptoms = random.sample(cond_data['symptoms'], min(random.randint(0, 2), len(cond_data['symptoms'])))

    # Build note
    note = f"""CLINICAL NOTE
{'='*70}
Patient: {patient['name']} (ID: {patient['patient_id']})
Date: {visit_date} | Age: {patient['age']} | Gender: {patient['gender']}
Visit Type: {"Follow-up" if visit_num > 1 else "Initial Consultation"}

CHIEF COMPLAINT:
Follow-up for {condition} management.

HISTORY:
Patient presents for routine follow-up. """

    if symptoms:
        note += f"Reports: {', '.join(symptoms)}. "
    else:
        note += "Denies new symptoms. "

    note += f"Managing {', '.join(patient['conditions'])}.\n\nCURRENT MEDICATIONS:\n"
    for med in meds:
        note += f"  ‚Ä¢ {med}\n"

    note += f"\nVITAL SIGNS:\n  BP: {random.randint(110,145)}/{random.randint(70,95)} mmHg"
    note += f" | HR: {random.randint(65,90)} bpm | Temp: {round(random.uniform(36.5,37.2),1)}¬∞C\n"

    note += "\nLAB RESULTS:\n"
    for test, value in labs.items():
        note += f"  ‚Ä¢ {test}: {value} [ABNORMAL]\n"

    any_abnormal = len(labs) > 0
    note += f"\nASSESSMENT:\n{condition} - "
    note += "Suboptimal control, adjusting medications.\n" if any_abnormal else "Well-controlled.\n"

    note += "\nPLAN:\n"
    note += "  1. Continue medications with adjustments\n"
    note += "  2. Repeat labs in 3 months\n"
    note += "  3. Lifestyle modifications counseling\n"
    note += f"  4. Follow-up in {'1 month' if any_abnormal else '3 months'}\n"
    note += f"\nProvider: Dr. {fake.last_name()}, MD\n{'='*70}"

    return note

# Test
print(generate_note(patients[0], patients[0]['visit_dates'][0], 1)[:500] + "...")

CLINICAL NOTE
Patient: Allison Hill (ID: patient_001)
Date: 2024-05-01 | Age: 69 | Gender: Male
Visit Type: Initial Consultation

CHIEF COMPLAINT:
Follow-up for Asthma management.

HISTORY:
Patient presents for routine follow-up. Reports: coughing. Managing Type 2 Diabetes, Hyperlipidemia, Asthma.

CURRENT MEDICATIONS:
  ‚Ä¢ Albuterol Inhaler
  ‚Ä¢ Fluticasone 250mcg

VITAL SIGNS:
  BP: 139/74 mmHg | HR: 73 bpm | Temp: 36.6¬∞C

LA...


In [5]:
# Generate and save all clinical notes
all_notes = {}
total_notes = 0

for patient in patients:
    patient_notes = []
    patient_dir = f"{RAW_NOTES_DIR}/{patient['patient_id']}"
    os.makedirs(patient_dir, exist_ok=True)

    for visit_num, visit_date in enumerate(patient['visit_dates'], 1):
        note = generate_note(patient, visit_date, visit_num)

        # Save to file
        filename = f"{patient_dir}/visit_{visit_num}_{visit_date}.txt"
        with open(filename, 'w', encoding='utf-8') as f:
            f.write(note)

        patient_notes.append({
            "visit_number": visit_num,
            "visit_date": visit_date,
            "note_content": note
        })
        total_notes += 1

    all_notes[patient['patient_id']] = patient_notes

print(f"‚úÖ Generated and saved {total_notes} clinical notes")
print(f"üìÇ Location: {RAW_NOTES_DIR}")

‚úÖ Generated and saved 36 clinical notes
üìÇ Location: /content/drive/MyDrive/Colab_Notebooks/LLMs/clinical_notes_qa_project/01_data_generation/outputs/raw_clinical_notes


In [6]:
# Save patient metadata
metadata = {
    "generation_info": {
        "version": "v1.0",
        "generated_at": datetime.now().strftime("%Y-%m-%d %H:%M:%S"),
        "num_patients": len(patients),
        "total_notes": total_notes
    },
    "patients": patients
}

with open(f"{OUTPUT_DIR}/patient_metadata.json", 'w') as f:
    json.dump(metadata, f, indent=2)

# Create summary
summary_df = pd.DataFrame([{
    "Patient ID": p['patient_id'],
    "Age": p['age'],
    "Gender": p['gender'],
    "Conditions": len(p['conditions']),
    "Visits": len(p['visit_dates']),
    "Primary Condition": p['conditions'][0]
} for p in patients])

summary_df.to_csv(f"{OUTPUT_DIR}/generation_summary.csv", index=False)

print("‚úÖ Saved metadata and summary")
print(f"\nüìä Dataset Statistics:")
print(f"  ‚Ä¢ Total Patients: {len(patients)}")
print(f"  ‚Ä¢ Total Notes: {total_notes}")
print(f"  ‚Ä¢ Avg Notes/Patient: {total_notes/len(patients):.1f}")
print(f"\n{summary_df}")

‚úÖ Saved metadata and summary

üìä Dataset Statistics:
  ‚Ä¢ Total Patients: 10
  ‚Ä¢ Total Notes: 36
  ‚Ä¢ Avg Notes/Patient: 3.6

    Patient ID  Age  Gender  Conditions  Visits       Primary Condition
0  patient_001   69    Male           3       4         Type 2 Diabetes
1  patient_002   73    Male           3       3  Chronic Kidney Disease
2  patient_003   62  Female           3       4            Hypertension
3  patient_004   57  Female           2       4            Hypertension
4  patient_005   70  Female           3       4                  Asthma
5  patient_006   49  Female           3       3          Hyperlipidemia
6  patient_007   75  Female           1       3            Hypertension
7  patient_008   39    Male           1       4                  Asthma
8  patient_009   49  Female           3       4            Hypertension
9  patient_010   39    Male           1       3            Hypertension


In [8]:
# Display one complete sample note
sample_file = f"{RAW_NOTES_DIR}/patient_001/visit_1_{patients[0]['visit_dates'][0]}.txt"
with open(sample_file, 'r') as f:
    sample = f.read()

print("="*80)
print("SAMPLE CLINICAL NOTE")
print("="*80)
print(sample)

SAMPLE CLINICAL NOTE
CLINICAL NOTE
Patient: Allison Hill (ID: patient_001)
Date: 2024-05-01 | Age: 69 | Gender: Male
Visit Type: Initial Consultation

CHIEF COMPLAINT:
Follow-up for Asthma management.

HISTORY:
Patient presents for routine follow-up. Reports: shortness of breath, coughing. Managing Type 2 Diabetes, Hyperlipidemia, Asthma.

CURRENT MEDICATIONS:
  ‚Ä¢ Montelukast 10mg
  ‚Ä¢ Fluticasone 250mcg

VITAL SIGNS:
  BP: 124/74 mmHg | HR: 81 bpm | Temp: 36.8¬∞C

LAB RESULTS:
  ‚Ä¢ Peak Flow: 323.0 [ABNORMAL]

ASSESSMENT:
Asthma - Suboptimal control, adjusting medications.

PLAN:
  1. Continue medications with adjustments
  2. Repeat labs in 3 months
  3. Lifestyle modifications counseling
  4. Follow-up in 1 month

Provider: Dr. Blair, MD


In [9]:
print("\n" + "="*80)
print("‚úÖ NOTEBOOK 01 COMPLETE")
print("="*80)
print(f"\nüìÇ Outputs saved to: {OUTPUT_DIR}")


‚úÖ NOTEBOOK 01 COMPLETE

üìÇ Outputs saved to: /content/drive/MyDrive/Colab_Notebooks/LLMs/clinical_notes_qa_project/01_data_generation/outputs
