In [2]:
# CELL 1: Imports & paths
import pandas as pd
import numpy as np
from pathlib import Path

# Directory layout is derived from the parent of the current working directory.
ROOT = Path.cwd().parent
RAW_DIR = ROOT / "raw"
OUT_DIR = ROOT / "processed"
OUT_DIR.mkdir(parents=True, exist_ok=True)  # safe to re-run: an existing folder is fine

# The raw export may be named with the letter "o" or the digit "0"; the first
# spelling that exists on disk wins.
candidates = [RAW_DIR / name for name in ("spo2Data.csv", "sp02Data.csv")]
SPO2_FILE = None
for candidate in candidates:
    if candidate.exists():
        SPO2_FILE = candidate
        break
assert SPO2_FILE is not None, f"Missing file: {candidates[0]} (or {candidates[1]})"
SPO2_FILE


WindowsPath('d:/SOMNiA/AI/raw/sp02Data.csv')

In [3]:
# CELL 2: Load SpO2 raw
# Read the CSV located in CELL 1 into a DataFrame. Later cells work on a copy,
# so s2_raw remains the untouched reference for the raw data.
s2_raw = pd.read_csv(SPO2_FILE)
# Preview the first rows to confirm the file parsed as expected.
s2_raw.head()


Unnamed: 0,Heart_Rate,SpO2,Stress_Level,Sleep_Hours,Steps,Anomaly
0,82,99,5,7.150235,14479,0
1,72,98,10,5.290899,14058,1
2,84,97,2,5.790723,6622,0
3,97,95,7,4.573354,9052,0
4,71,98,2,6.301869,2378,0


In [4]:
# CELL 3: Cleaning rules (drop invalids, no imputation)
# Columns the raw export must provide; stop immediately if any of them is absent.
expected = ["Heart_Rate", "SpO2", "Stress_Level", "Sleep_Hours", "Steps", "Anomaly"]
missing = [col for col in expected if col not in s2_raw.columns]
if len(missing) > 0:
    raise ValueError(f"Missing columns: {missing}")

# Accepted (low, high) bounds per column, applied as a hard filter in CELL 4.
RANGES = dict(
    Heart_Rate=(30, 220),
    SpO2=(50, 100),          # typical clinical range 70–100; be generous
    Stress_Level=(0, 10),
    Sleep_Hours=(0, 24),
    Steps=(0, 200_000),
    Anomaly=(0, 1),          # 0/1 only
)


In [5]:
# CELL 4: Coerce types and filter by ranges
# Start from a copy of the raw frame with every expected column coerced to
# numeric; anything that cannot be parsed becomes NaN.
s2 = s2_raw.assign(
    **{col: pd.to_numeric(s2_raw[col], errors="coerce") for col in expected}
)

# One boolean column per rule. NaN never satisfies `between`, so rows with
# unparseable values are dropped rather than imputed.
within_bounds = pd.DataFrame(
    {col: s2[col].between(low, high) for col, (low, high) in RANGES.items()}
)

# A row survives only if every range check passes and Anomaly ∈ {0, 1}
# (the range check alone would also admit fractional values such as 0.5).
mask_ok = within_bounds.all(axis=1) & s2["Anomaly"].isin([0, 1])

s2_clean = s2[mask_ok].copy()

# Row accounting for the QC report.
rows_before = len(s2)
rows_after = len(s2_clean)
rows_dropped = rows_before - rows_after

# Sequential record id as the first column (no person_id exists here; do NOT invent).
# NOTE(review): ids are numbered after filtering, so they shift if the rules in
# RANGES change; confirm nothing downstream relies on them across runs.
s2_clean.insert(0, "record_id", np.arange(rows_after) + 1)

# Switch to the domain-friendly column names used by the processed outputs.
domain_names = {
    "Heart_Rate": "HR_bpm",
    "SpO2": "SPO2_pct",
    "Stress_Level": "STRESS_level",
    "Sleep_Hours": "SLEEP_hours",
    "Steps": "ACTIVITY_steps",
    "Anomaly": "SPO2_anomaly",
}
s2_clean = s2_clean.rename(columns=domain_names)

s2_clean.head()


Unnamed: 0,record_id,HR_bpm,SPO2_pct,STRESS_level,SLEEP_hours,ACTIVITY_steps,SPO2_anomaly
0,1,82,99,5,7.150235,14479,0
1,2,72,98,10,5.290899,14058,1
2,3,84,97,2,5.790723,6622,0
3,4,97,95,7,4.573354,9052,0
4,5,71,98,2,6.301869,2378,0


In [6]:
# CELL 5: QC summary and save
# Collect the numbers needed to audit the cleaning step: row accounting,
# remaining NaN counts, observed min/max per measurement column, and the
# distribution of the anomaly flag.
qc = {
    "rows_before": rows_before,
    "rows_after": rows_after,
    "rows_dropped": rows_dropped,
    "na_counts": s2_clean.isna().sum().to_dict(),
    "value_ranges": {k: (float(s2_clean[k].min()), float(s2_clean[k].max())) for k in ["HR_bpm","SPO2_pct","STRESS_level","SLEEP_hours","ACTIVITY_steps"]},
    "anomaly_counts": s2_clean["SPO2_anomaly"].value_counts(dropna=False).to_dict()
}

s2_out = OUT_DIR / "spo2_clean.csv"
qc_out = OUT_DIR / "spo2_qc.json"

# index=False: the DataFrame index is not written; record_id is the identifier column.
s2_clean.to_csv(s2_out, index=False)
# The QC dict is wrapped in an object-dtype Series so pandas' JSON writer
# serialises the nested structure to the file.
pd.Series(qc, dtype="object").to_json(qc_out, indent=2)

# Status messages: the emoji were mis-encoded (UTF-8 bytes decoded as cp1252)
# and printed as garbage; restored to the intended characters.
print(f"✅ Saved: {s2_out}")
print(f"🧾 QC:    {qc_out}")
qc


✅ Saved: d:\SOMNiA\AI\processed\spo2_clean.csv
🧾 QC:    d:\SOMNiA\AI\processed\spo2_qc.json


{'rows_before': 1000,
 'rows_after': 969,
 'rows_dropped': 31,
 'na_counts': {'record_id': 0,
  'HR_bpm': 0,
  'SPO2_pct': 0,
  'STRESS_level': 0,
  'SLEEP_hours': 0,
  'ACTIVITY_steps': 0,
  'SPO2_anomaly': 0},
 'value_ranges': {'HR_bpm': (34.0, 132.0),
  'SPO2_pct': (91.0, 100.0),
  'STRESS_level': (0.0, 10.0),
  'SLEEP_hours': (1.5128664201333557, 11.092920448109329),
  'ACTIVITY_steps': (1001.0, 14991.0)},
 'anomaly_counts': {0: 777, 1: 192}}