In [19]:
# 1. Import libraries
import pandas as pd
import random
from faker import Faker
from datetime import datetime, timedelta
import os

# 2. Seed the stdlib RNG, the Faker class, and this Faker instance with one shared value
SEED = 42
fake = Faker()
random.seed(SEED)
Faker.seed(SEED)
fake.seed_instance(SEED)

# 3. Configuration: dataset size and the category lists that the
#    record generator indexes into by position
num_records = 100
diagnoses = [
    'Hypertension',
    'Diabetes',
    'Asthma',
    'Healthy',
    'Obesity',
    'Heart Disease',
]
genders = ['Male', 'Female']
smoking_statuses = ['Smoker', 'Non-smoker']
activity_levels = ['Sedentary', 'Moderate', 'Active']
cholesterol_levels = ['Low', 'Normal', 'High']
adherence_status = ['Adherent', 'Non-adherent']

# 4. Sequential patient IDs, one per record, starting at 10001
first_patient_id = 10001
patient_ids = [first_patient_id + offset for offset in range(num_records)]

# 5. Generate health records (repeatable)
def _build_record(i):
    """Return the record dict for zero-based row index ``i``.

    Every field except ``name`` is a fixed function of ``i`` (modular
    arithmetic or cycling through one of the configuration lists).
    ``name`` is drawn from the module-level ``fake`` generator, so the
    names depend on the order in which this function is called.
    """
    weight_kg = 55 + (i % 40)
    height_m = 1.55 + (i % 10) * 0.01
    first_visit = datetime(2023, 1, 1)

    return {
        'patient_id': patient_ids[i],
        'name': fake.unique.name(),
        'age': 20 + (i % 60),  # cycles through 20..79
        'gender': genders[i % len(genders)],
        'diagnosis': diagnoses[i % len(diagnoses)],
        # "systolic/diastolic" as a single string
        'blood_pressure': f"{100 + (i % 40)}/{60 + (i % 25)}",
        'bmi': round(weight_kg / (height_m ** 2), 1),
        'smoking_status': smoking_statuses[i % len(smoking_statuses)],
        'physical_activity': activity_levels[i % len(activity_levels)],
        'cholesterol_level': cholesterol_levels[i % len(cholesterol_levels)],
        'medication_adherence': adherence_status[i % len(adherence_status)],
        'hospital_stay_days': i % 15,
        # every 7th record (starting with the first) is flagged as readmitted
        'readmitted': 'Yes' if i % 7 == 0 else 'No',
        # visits are spaced five days apart
        'visit_date': (first_visit + timedelta(days=i*5)).date()
    }


# Build the records in index order so the generated names stay repeatable
records = [_build_record(i) for i in range(num_records)]




In [None]:
# Using ETL to extract, transform, and load health records data
# 6. Build the DataFrame from the generated records, drop exact duplicate
#    rows, and convert visit_date to a pandas datetime column
df = (
    pd.DataFrame(records)
    .drop_duplicates()
    .assign(visit_date=lambda frame: pd.to_datetime(frame['visit_date']))
)

# 7. Write the raw CSV; the path is relative, one directory above the
#    current working directory, and the folder is created if missing
output_dir = '../data/raw'
output_file = os.path.join(output_dir, 'health_records.csv')
os.makedirs(output_dir, exist_ok=True)
df.to_csv(output_file, index=False)


# 8. Display the DataFrame as the cell output
df

Unnamed: 0,patient_id,name,age,gender,diagnosis,blood_pressure,bmi,smoking_status,physical_activity,cholesterol_level,medication_adherence,hospital_stay_days,readmitted,visit_date
0,10001,Allison Hill,20,Male,Hypertension,100/60,22.9,Smoker,Sedentary,Low,Adherent,0,Yes,2023-01-01
1,10002,Noah Rhodes,21,Female,Diabetes,101/61,23.0,Non-smoker,Moderate,Normal,Non-adherent,1,No,2023-01-06
2,10003,Angie Henderson,22,Male,Asthma,102/62,23.1,Smoker,Active,High,Adherent,2,No,2023-01-11
3,10004,Daniel Wagner,23,Female,Healthy,103/63,23.2,Non-smoker,Sedentary,Low,Non-adherent,3,No,2023-01-16
4,10005,Cristian Santos,24,Male,Obesity,104/64,23.3,Smoker,Moderate,Normal,Adherent,4,No,2023-01-21
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
95,10096,Anna Henderson,55,Female,Heart Disease,115/80,27.3,Non-smoker,Active,High,Non-adherent,5,No,2024-04-20
96,10097,Aaron Wise,56,Male,Hypertension,116/81,27.4,Smoker,Sedentary,Low,Adherent,6,No,2024-04-25
97,10098,Deborah Figueroa,57,Female,Diabetes,117/82,27.4,Non-smoker,Moderate,Normal,Non-adherent,7,No,2024-04-30
98,10099,Jessica Smith,58,Male,Asthma,118/83,27.5,Smoker,Active,High,Adherent,8,Yes,2024-05-05


In [25]:
# 9. Add enriched fields (computed one row at a time with DataFrame.apply)
def enrich_row(row):
    """Derive the age group, risk score and chronic-condition flag for one record.

    Parameters
    ----------
    row : mapping-like (e.g. one DataFrame row)
        Must provide the keys 'age', 'bmi', 'smoking_status',
        'cholesterol_level' and 'diagnosis'.

    Returns
    -------
    pd.Series
        Three values indexed by 'age_group', 'risk_score', 'is_chronic'.
    """
    needed = ('age', 'bmi', 'smoking_status', 'cholesterol_level', 'diagnosis')
    # Read every input up front, in this order
    years, body_mass, smoking, cholesterol, diagnosis = (row[key] for key in needed)

    # Age group: below 18 -> Child, below 60 -> Adult, anything else -> Elderly
    age_group = 'Child' if years < 18 else ('Adult' if years < 60 else 'Elderly')

    # Risk score: 2 points each for smoking and high cholesterol,
    # plus 2 points for BMI >= 30 or 1 point for BMI >= 25
    smoking_points = 2 if smoking == 'Smoker' else 0
    cholesterol_points = 2 if cholesterol == 'High' else 0
    if body_mass >= 30:
        bmi_points = 2
    elif body_mass >= 25:
        bmi_points = 1
    else:
        bmi_points = 0
    risk_score = smoking_points + cholesterol_points + bmi_points

    # Chronic condition flag: 1 for the listed diagnoses, 0 otherwise
    chronic_diagnoses = ('Hypertension', 'Diabetes', 'Heart Disease', 'Asthma')
    is_chronic = int(diagnosis in chronic_diagnoses)

    labels = ['age_group', 'risk_score', 'is_chronic']
    return pd.Series([age_group, risk_score, is_chronic], index=labels)

# Run enrich_row on every record and attach the three derived columns
enriched_columns = ['age_group', 'risk_score', 'is_chronic']
df[enriched_columns] = df.apply(enrich_row, axis=1)

# Write the enriched frame to the transformed-data folder,
# creating the folder first if it does not exist
transformed_dir = '../data/transformed'
transformed_file = os.path.join(transformed_dir, 'cleaned_health_records.csv')
os.makedirs(transformed_dir, exist_ok=True)
df.to_csv(transformed_file, index=False)


# 10. Preview the first rows
df.head()


Unnamed: 0,patient_id,name,age,gender,diagnosis,blood_pressure,bmi,smoking_status,physical_activity,cholesterol_level,medication_adherence,hospital_stay_days,readmitted,visit_date,age_group,risk_score,is_chronic
0,10001,Allison Hill,20,Male,Hypertension,100/60,22.9,Smoker,Sedentary,Low,Adherent,0,Yes,2023-01-01,Adult,2,1
1,10002,Noah Rhodes,21,Female,Diabetes,101/61,23.0,Non-smoker,Moderate,Normal,Non-adherent,1,No,2023-01-06,Adult,0,1
2,10003,Angie Henderson,22,Male,Asthma,102/62,23.1,Smoker,Active,High,Adherent,2,No,2023-01-11,Adult,4,1
3,10004,Daniel Wagner,23,Female,Healthy,103/63,23.2,Non-smoker,Sedentary,Low,Non-adherent,3,No,2023-01-16,Adult,0,0
4,10005,Cristian Santos,24,Male,Obesity,104/64,23.3,Smoker,Moderate,Normal,Adherent,4,No,2023-01-21,Adult,2,0


In [22]:
# 11. Standardize formats: drop exact duplicate rows and make sure
#     visit_date is a datetime column (no outlier handling is done here)
df = df.drop_duplicates().assign(
    visit_date=lambda frame: pd.to_datetime(frame['visit_date'])
)
