In [1]:
# CELL 1: Imports & paths
import pandas as pd
import numpy as np
from pathlib import Path

# Project layout: this assumes the kernel's working directory is notebooks/,
# so its parent is the project root.
ROOT = Path.cwd().parent            # notebooks/ -> project root
RAW_DIR = ROOT / "raw"
OUT_DIR = ROOT / "processed"
OUT_DIR.mkdir(parents=True, exist_ok=True)

# input file
BP_FILE = RAW_DIR / "BloodPressureData.csv"   # keep in raw/

# Explicit raise rather than `assert`: assert statements are stripped when
# Python runs with -O, which would let a missing input slip through to read_csv.
if not BP_FILE.exists():
    raise FileNotFoundError(f"Missing file: {BP_FILE}")


In [2]:
# CELL 2: Read the raw blood pressure CSV into a DataFrame and preview it
bp_raw = pd.read_csv(filepath_or_buffer=BP_FILE)
bp_raw.head(n=5)


Unnamed: 0,Person ID,Gender,Age,Occupation,Sleep Duration,Quality of Sleep,Physical Activity Level,Stress Level,BMI Category,Blood Pressure,Heart Rate,Daily Steps,Sleep Disorder
0,1,Male,27,Software Engineer,6.1,6,42,6,Overweight,126/83,77,4200,
1,2,Male,28,Doctor,6.2,6,60,8,Normal,125/80,75,10000,
2,3,Male,28,Doctor,6.2,6,60,8,Normal,125/80,75,10000,
3,4,Male,28,Sales Representative,5.9,4,30,8,Obese,140/90,85,3000,Sleep Apnea
4,5,Male,28,Sales Representative,5.9,4,30,8,Obese,140/90,85,3000,Sleep Apnea


In [3]:
# CELL 3: Helper to split 'Blood Pressure' like "120/80" -> systolic, diastolic
def split_bp(s):
    """Split a blood-pressure reading such as "120/80" into two floats.

    Parameters
    ----------
    s : object
        Raw cell value. Only strings containing "/" are parsed.

    Returns
    -------
    tuple of (float, float)
        The numbers before and after the first "/". ``(nan, nan)`` is
        returned when ``s`` is missing, is not a string, contains no "/",
        or when either part cannot be converted by ``float()``.
    """
    missing = (np.nan, np.nan)
    # Anything that is not a string (NaN/None, bare numbers, containers)
    # cannot carry the "a/b" format, so it is treated as missing.
    if not isinstance(s, str) or "/" not in s:
        return missing
    first, second = s.split("/", 1)
    try:
        return float(first), float(second)
    except ValueError:
        # Non-numeric text; any other exception type is left to propagate.
        return missing

# Plausibility bounds per raw column, as (low, high) pairs.
# Rows failing a screen are dropped; nothing is imputed.
RANGES = dict([
    ("Age", (10, 100)),
    ("Sleep Duration", (0, 24)),
    ("Quality of Sleep", (0, 10)),
    ("Physical Activity Level", (0, 200)),  # minutes or score; generous
    ("Stress Level", (0, 10)),
    ("Heart Rate", (30, 220)),
    ("Daily Steps", (0, 200_000)),
])
# Blood pressure is intentionally absent here: it is validated after the split.


In [4]:
# CELL 4: Clean & normalize
# Work on a copy so bp_raw stays untouched and this cell can be re-run safely
bp = bp_raw.copy()

# Normalize column names (strip spaces)
bp.columns = [c.strip() for c in bp.columns]

# Fail fast, with one consistent error, for every column indexed directly below
for required_col in ("Person ID", "Blood Pressure"):
    if required_col not in bp.columns:
        raise ValueError(f"Column '{required_col}' not found")

# Person ID -> person_id (string to align with other domains later)
bp["person_id"] = bp["Person ID"].astype(str)

# Split Blood Pressure into two float columns.
# The (first, second) tuples are collected into a single frame instead of
# constructing one pd.Series per row inside .apply.
bp_parts = pd.DataFrame(
    [split_bp(v) for v in bp["Blood Pressure"]],
    index=bp.index,
    columns=["bp_systolic", "bp_diastolic"],
    dtype="float64",
)
bp["bp_systolic"] = bp_parts["bp_systolic"]
bp["bp_diastolic"] = bp_parts["bp_diastolic"]

# Coerce numerics (unparseable values become NaN)
num_cols = ["Age","Sleep Duration","Quality of Sleep","Physical Activity Level",
            "Stress Level","Heart Rate","Daily Steps","bp_systolic","bp_diastolic"]
for c in num_cols:
    if c in bp.columns:
        bp[c] = pd.to_numeric(bp[c], errors="coerce")

# Drop exact duplicates
bp = bp.drop_duplicates().reset_index(drop=True)

# Range-based row filtering (drop out-of-range only for columns present)
mask_ok = pd.Series(True, index=bp.index)
for c, (lo, hi) in RANGES.items():
    if c in bp.columns:
        mask_ok &= bp[c].between(lo, hi) | bp[c].isna()  # keep NaN (we won't impute here)

# BP plausibility (very loose medical windows); both columns always exist here
mask_ok &= (bp["bp_systolic"].between(70, 250) | bp["bp_systolic"].isna())
mask_ok &= (bp["bp_diastolic"].between(40, 150) | bp["bp_diastolic"].isna())
# require systolic >= diastolic when both present
both = bp["bp_systolic"].notna() & bp["bp_diastolic"].notna()
mask_ok &= (~both) | (bp["bp_systolic"] >= bp["bp_diastolic"])

bp_clean = bp.loc[mask_ok].copy()

# Count from the raw frame so that rows removed as exact duplicates are
# included in the QC figures, not only rows removed by the range screens.
rows_before = len(bp_raw)
rows_after = len(bp_clean)
rows_dropped = rows_before - rows_after

# Standardize categoricals to lower_snake for downstream ML
def norm_cat(s):
    """Return ``s`` as a lower_snake label, or ``np.nan`` when it is missing.

    The value is converted with ``str()``, stripped of surrounding whitespace,
    lowercased, and every space is replaced by an underscore.
    """
    if not pd.isna(s):
        label = str(s).strip().lower()
        return label.replace(" ", "_")
    return np.nan

# Apply norm_cat to each categorical column that is actually present
for cat_col in ("Gender", "Occupation", "BMI Category", "Sleep Disorder"):
    if cat_col not in bp_clean.columns:
        continue
    bp_clean[cat_col] = bp_clean[cat_col].map(norm_cat)

# Raw column name -> domain-prefixed name
rename_map = {
    "Gender":                  "demo_gender",
    "Age":                     "demo_age",
    "Occupation":              "demo_occupation",
    "Sleep Duration":          "SLEEP_hours",
    "Quality of Sleep":        "SLEEP_quality",
    "Physical Activity Level": "ACTIVITY_level",
    "Stress Level":            "STRESS_level",
    "BMI Category":            "demo_bmi_category",
    "Heart Rate":              "HR_bpm",
    "Daily Steps":             "ACTIVITY_steps",
    "Sleep Disorder":          "SLEEP_disorder",
}
bp_clean = bp_clean.rename(columns=rename_map)

# Final column order; names that are not in the frame are skipped
keep_cols = [
    "person_id",
    "demo_gender", "demo_age", "demo_occupation", "demo_bmi_category",
    "SLEEP_hours", "SLEEP_quality", "SLEEP_disorder",
    "ACTIVITY_level", "ACTIVITY_steps",
    "STRESS_level",
    "HR_bpm",
    "bp_systolic", "bp_diastolic",
]
available_cols = [col for col in keep_cols if col in bp_clean.columns]
bp_clean = bp_clean.loc[:, available_cols]

bp_clean.head()


Unnamed: 0,person_id,demo_gender,demo_age,demo_occupation,demo_bmi_category,SLEEP_hours,SLEEP_quality,SLEEP_disorder,ACTIVITY_level,ACTIVITY_steps,STRESS_level,HR_bpm,bp_systolic,bp_diastolic
0,1,male,27,software_engineer,overweight,6.1,6,,42,4200,6,77,126.0,83.0
1,2,male,28,doctor,normal,6.2,6,,60,10000,8,75,125.0,80.0
2,3,male,28,doctor,normal,6.2,6,,60,10000,8,75,125.0,80.0
3,4,male,28,sales_representative,obese,5.9,4,sleep_apnea,30,3000,8,85,140.0,90.0
4,5,male,28,sales_representative,obese,5.9,4,sleep_apnea,30,3000,8,85,140.0,90.0


In [5]:
# CELL 5: QC summary and save
# Row counts come from the cleaning cell; the NaN and duplicate-id checks are
# computed on the final frame.
qc = {
    "rows_before": rows_before,
    "rows_after": rows_after,
    "rows_dropped": rows_dropped,
    "na_counts": bp_clean.isna().sum().to_dict(),
    "duplicates_in_person_id": int(bp_clean["person_id"].duplicated().sum())
}

bp_out = OUT_DIR / "bloodpressure_clean.csv"
qc_out = OUT_DIR / "bloodpressure_qc.json"

bp_clean.to_csv(bp_out, index=False)
# dtype="object" lets the Series hold the nested na_counts dict next to the ints
pd.Series(qc, dtype="object").to_json(qc_out, indent=2)

# Status markers restored from mojibake (UTF-8 emoji that had been decoded as cp1252)
print(f"✅ Saved: {bp_out}")
print(f"🧾 QC:    {qc_out}")
qc


✅ Saved: d:\SOMNiA\AI\processed\bloodpressure_clean.csv
🧾 QC:    d:\SOMNiA\AI\processed\bloodpressure_qc.json


{'rows_before': 374,
 'rows_after': 374,
 'rows_dropped': 0,
 'na_counts': {'person_id': 0,
  'demo_gender': 0,
  'demo_age': 0,
  'demo_occupation': 0,
  'demo_bmi_category': 0,
  'SLEEP_hours': 0,
  'SLEEP_quality': 0,
  'SLEEP_disorder': 219,
  'ACTIVITY_level': 0,
  'ACTIVITY_steps': 0,
  'STRESS_level': 0,
  'HR_bpm': 0,
  'bp_systolic': 0,
  'bp_diastolic': 0},
 'duplicates_in_person_id': 0}