In [0]:
%pip install faker

In [0]:
import pandas as pd
from faker import Faker
import random
from datetime import datetime, timedelta

# Shared Faker instance used by create_demo_data() to fabricate identifiers,
# names and timestamps. No seed is set here, so values differ between runs.
fake = Faker()

# --- Custom Medical Data Generators ---
# The demo needs semi-realistic unstructured text, so reports are assembled
# from the small template pools below to get some variability.

# Exam descriptors: anatomical region and imaging modality.
body_parts = [
    "Chest",
    "Head",
    "C-Spine",
    "Abdomen/Pelvis",
    "Right Knee",
]
modalities = [
    "CT",
    "MRI",
    "X-Ray",
]

# 1. Critical findings -- the "needle in the haystack" that ai_extract
#    is expected to surface.
critical_findings = [
    "acute pulmonary embolism visible in the right lower lobe",
    "large right-sided pneumothorax with mediastinal shift",
    "acute subdural hematoma with 4mm midline shift",
    "free air under the diaphragm suggestive of perforation",
    "displaced fracture of the C2 vertebra (Hangman's fracture)",
]

# 2. Routine / normal findings -- the background "noise".
routine_findings = [
    "lungs are clear. No pleural effusion or pneumothorax.",
    "cardiac silhouette is within normal limits.",
    "no acute intracranial hemorrhage or mass effect.",
    "degenerative changes noted in the lumbar spine.",
    "unremarkable study. No acute abnormality.",
]

# 3. Clinical history lines that supply urgency context for ai_classify.
clinical_history = [
    "Patient presenting with acute shortness of breath.",
    "Trauma activation, status post fall from ladder.",
    "Severe 10/10 thunderclap headache.",
    "Chronic lower back pain for evaluation.",
    "Follow-up on pulmonary nodule.",
]

def generate_radiology_report():
    """Generate one synthetic free-text radiology report.

    The report is critical or routine with equal probability. The finding,
    clinical history, modality and body part are each drawn independently
    with ``random.choice`` from the module-level template lists.

    Returns:
        A ``(report_text, is_critical)`` tuple. ``report_text`` holds the
        ``EXAM``, ``HISTORY``, ``FINDINGS`` and ``IMPRESSION`` sections
        separated by newlines; ``is_critical`` is ``True`` when the finding
        was taken from ``critical_findings``.
    """
    is_critical = random.choice([True, False])

    # Pick a finding based on critical status. The draw order matches the
    # original sequence so seeded runs select the same templates.
    finding = random.choice(critical_findings if is_critical else routine_findings)
    history = random.choice(clinical_history)
    modality = random.choice(modalities)
    body_part = random.choice(body_parts)

    # The critical templates carry no terminal period while the routine ones
    # do. Without normalising, the following sentence runs on ("... lower
    # lobe Bones appear intact.") and punctuation alone gives away the label.
    finding_sentence = finding.rstrip()
    if finding_sentence and finding_sentence[-1] not in ".!?":
        finding_sentence += "."

    # NOTE(review): exam and finding are sampled independently, so clinically
    # implausible pairings (e.g. an "X-Ray Right Knee" exam reporting a
    # pulmonary embolism) can occur, and "Bones appear intact." is appended
    # even after the fracture finding. Confirm this is acceptable for the demo.
    report_text = "\n".join([
        f"EXAM: {modality} {body_part}",
        f"HISTORY: {history}",
        "FINDINGS: The study was performed without contrast. "
        f"{finding_sentence} Bones appear intact. Soft tissues unremarkable.",
        f"IMPRESSION: {finding_sentence} Correlation with clinical symptoms suggested.",
    ])

    return report_text, is_critical

# --- Build the Dataset ---

def create_demo_data(num_rows=50):
    """Build a pandas DataFrame of synthetic radiology report records.

    Args:
        num_rows: Number of records to generate. Zero (or a negative value)
            yields an empty frame that still has every expected column.

    Returns:
        A ``pd.DataFrame`` with one row per report and the columns
        ``report_id``, ``patient_id``, ``patient_name``, ``ingestion_date``,
        ``referring_physician``, ``raw_text`` and ``ground_truth_critical``.
    """
    # Pinning the column list keeps the schema stable when no rows are
    # generated; otherwise the frame would have no columns and downstream
    # column selection would raise KeyError.
    columns = [
        "report_id",
        "patient_id",
        "patient_name",
        "ingestion_date",
        "referring_physician",
        "raw_text",
        "ground_truth_critical",
    ]

    def build_row():
        """Return one record: fake demographics plus a generated report."""
        report_text, is_critical = generate_radiology_report()
        return {
            "report_id": fake.uuid4(),
            "patient_id": fake.random_int(min=10000, max=99999),
            "patient_name": f"{fake.first_name()} {fake.last_name()}",
            "ingestion_date": fake.date_time_between(start_date='-1d', end_date='now'),
            "referring_physician": f"Dr. {fake.last_name()}",
            # The raw text column is what we will target with AI Functions
            "raw_text": report_text,
            # Ground truth for validation (optional)
            "ground_truth_critical": is_critical,
        }

    rows = [build_row() for _ in range(num_rows)]
    return pd.DataFrame(rows, columns=columns)

# Generate the data
pdf = create_demo_data(50)

# Display the first few rows to verify unstructured text
print(pdf[['report_id', 'raw_text']].head(3))

# --- Convert to Spark for Databricks ---
# These statements run as-is and need a `spark` session object, which this
# file does not define (presumably supplied by the Databricks notebook
# runtime -- confirm before running elsewhere).
target_table = "users.guy_livni.incoming_radiology_reports"

spark.sql("use catalog users")
# IF EXISTS keeps the first run (or a run after manual cleanup) from failing
# when the table has not been created yet.
spark.sql(f"DROP TABLE IF EXISTS {target_table}")
spark_df = spark.createDataFrame(pdf)
spark_df.write.mode("overwrite").saveAsTable(target_table)
