
# Health Screening: BP & CVD — Cleaning, Validation, and EDA

**Objective:** Clean a real health screening dataset and explore how **weight**, **age**, and **BMI** relate to **blood pressure** and **cardiovascular disease (CardioDisease)**.

**How to use:**  
1. Put your raw CSV beside this notebook and name it **`Health_Screening_Data.csv`** (or update `RAW_PATH` below).  
2. Run cells top-to-bottom.


In [None]:

# --- Setup
import sys, os
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
from scipy import stats
from pathlib import Path

# Report interpreter and library versions so a run can be reproduced later
for label, version in (
    ('Python:', sys.version),
    ('pandas:', pd.__version__),
    ('numpy:', np.__version__),
    ('matplotlib:', plt.matplotlib.__version__),
):
    print(label, version)

# Input and output locations used by the cells below
RAW_PATH = 'Health_Screening_Data.csv'   # change if your raw file has a different name
CLEAN_PATH = 'data/processed/Health_Screening_Final.csv'

# Never truncate the column list when a DataFrame is displayed
pd.options.display.max_columns = None


## 1) Load data

In [None]:

# Resolve the raw CSV and stop early with an actionable message if it is absent
raw_file = Path(RAW_PATH)
raw_found = raw_file.exists()
if not raw_found:
    missing_msg = f"Could not find '{RAW_PATH}'. Place your raw CSV next to this notebook or update RAW_PATH."
    raise FileNotFoundError(missing_msg)

# Read the untouched data and show its size plus the first rows
df_raw = pd.read_csv(raw_file)
print('Raw shape:', df_raw.shape)
display(df_raw.head())


## 2) Basic standardization

In [None]:

# Work on a copy so df_raw stays untouched for later comparison
df = df_raw.copy()


def _strip_if_text(value):
    """Strip surrounding whitespace from strings; return anything else unchanged."""
    return value.strip() if isinstance(value, str) else value


# Trim whitespace from string columns.
# Only real strings are touched: casting the whole column with astype(str)
# would turn missing values into the literal text 'nan'.
for c in df.select_dtypes(include='object').columns:
    df[c] = df[c].map(_strip_if_text)

# Coerce common numeric columns if present (unparseable entries become NaN)
for col in ['SystolicBP','DiastolicBP','Weight_kg','Height_cm','BMI','Age','AgeDays']:
    if col in df.columns:
        df[col] = pd.to_numeric(df[col], errors='coerce')

# Harmonize CardioDisease to Yes/No if present
if 'CardioDisease' in df.columns:
    # Keys are lower-case; lookups are case-insensitive. The '1.0'/'0.0' keys
    # cover 0/1 flags that pandas stored as floats (e.g. because the column
    # also contains missing values).
    mapping = {
        '1': 'Yes', '0': 'No',
        '1.0': 'Yes', '0.0': 'No',
        'yes': 'Yes', 'no': 'No',
        'y': 'Yes', 'n': 'No',
        'true': 'Yes', 'false': 'No',
    }

    def _harmonize_cardio(value):
        """Map one raw CardioDisease value to 'Yes'/'No'.

        Missing values stay missing; unrecognised values are returned as
        their stripped text so they remain visible for manual review.
        """
        if pd.isna(value):
            return value
        text = str(value).strip()
        return mapping.get(text.lower(), text)

    df['CardioDisease'] = df['CardioDisease'].map(_harmonize_cardio)

print('Standardized shape:', df.shape)
display(df.head())


## 3) Convert AgeDays → Age (years) + AgeGroup

In [None]:

days_present = 'AgeDays' in df.columns

# Derive Age in years from AgeDays only when no Age column exists yet
if days_present and 'Age' not in df.columns:
    df['Age'] = df['AgeDays'].div(365.25).round(1)

age_present = 'Age' in df.columns
if age_present:
    # Left-closed bins: [0,20), [20,40), [40,60), [60,200)
    age_bins = [0,20,40,60,200]
    age_labels = ['<20','20-40','40-60','60+']
    df['AgeGroup'] = pd.cut(df['Age'], bins=age_bins, labels=age_labels, right=False)

# Drop AgeDays if present
if days_present:
    del df['AgeDays']

preview = df[['Age','AgeGroup']] if age_present else df
display(preview.head())


## 4) Validate BP ranges & remove impossibles

In [None]:

before = len(df)

# (column, lowest accepted, highest accepted); both bounds are inclusive.
# Rows where the reading is missing are kept.
bp_limits = (
    ('SystolicBP', 70, 250),
    ('DiastolicBP', 40, 150),
)
for bp_col, low, high in bp_limits:
    if bp_col not in df.columns:
        continue
    readings = df[bp_col]
    keep = readings.isna() | readings.between(low, high)
    df = df[keep]

after = len(df)
removed = before - after
print(f'Removed rows due to BP range checks: {removed}')
print('Current shape:', df.shape)


## 5) Did cleaning bias results? (quick checks)

In [None]:

def describe_numeric(series):
    """Return count, mean, std, min, median (p50) and max of a column.

    Values are coerced with ``pd.to_numeric(errors='coerce')``, so entries
    that cannot be parsed count as missing. ``None`` yields an empty dict;
    when no numeric value is present every statistic is NaN and the count
    is 0.
    """
    if series is None:
        return {}

    values = pd.to_numeric(series, errors='coerce')
    present = values.notna()
    has_data = bool(present.any())

    result = {'count': int(present.sum())}
    reducers = (
        ('mean', values.mean),
        ('std', values.std),
        ('min', values.min),
        ('p50', values.median),
        ('max', values.max),
    )
    for key, reducer in reducers:
        result[key] = float(reducer()) if has_data else float('nan')
    return result

# Compare each numeric column before (df_raw) and after (df) cleaning.
# NOTE: df is a filtered copy of df_raw, so the two samples overlap rather
# than being independent; read the Welch t-test p-value as a rough indicator
# of a shift in the mean, not as a rigorous test.
summary = {}
for c in ['SystolicBP','DiastolicBP','Weight_kg','BMI','Age']:
    # Only columns that exist on both sides can be compared
    if c not in df_raw.columns or c not in df.columns:
        continue

    b = pd.to_numeric(df_raw[c], errors='coerce')
    a = pd.to_numeric(df[c], errors='coerce')
    entry = {'before': describe_numeric(b), 'after': describe_numeric(a), 'p_value_mean_change': float('nan')}

    b2, a2 = b.dropna(), a.dropna()
    if len(b2) > 2 and len(a2) > 2:
        try:
            # Missing values were dropped above, so no nan_policy is needed
            _, p = stats.ttest_ind(a2, b2, equal_var=False)
        except Exception as exc:
            # Best effort: keep the NaN p-value, but report the failure
            # instead of hiding it.
            print(f"Warning: t-test failed for '{c}': {exc}")
        else:
            entry['p_value_mean_change'] = float(p)
    summary[c] = entry

summary


## 6) Visuals

In [None]:

# Create the output folder for figures (no error if it already exists)
FIG_DIR = Path('reports', 'figures')
FIG_DIR.mkdir(exist_ok=True, parents=True)
print('Saving figures to', FIG_DIR.resolve())


In [None]:

# 6a) Weight vs Systolic BP scatter
if not {'Weight_kg','SystolicBP'}.issubset(df.columns):
    print('Skipped scatter: required columns not found.')
else:
    fig, ax = plt.subplots()
    if 'CardioDisease' not in df.columns:
        # No disease flag available: draw a single point cloud
        ax.scatter(df['Weight_kg'], df['SystolicBP'], s=8, alpha=0.4)
    else:
        # One series per CardioDisease value so the legend separates them
        for status, rows in df.groupby('CardioDisease'):
            ax.scatter(rows['Weight_kg'], rows['SystolicBP'], s=8, alpha=0.35, label=str(status))
        ax.legend(title='CardioDisease')
    ax.set_xlabel('Weight (kg)')
    ax.set_ylabel('Systolic BP (mmHg)')
    ax.set_title('Weight vs Systolic BP')
    fig.savefig(FIG_DIR / 'scatter_weight_bp.png', dpi=160, bbox_inches='tight')
    plt.show()


In [None]:

# 6b) AgeGroup vs Avg BP lines
if 'AgeGroup' in df.columns and (('SystolicBP' in df.columns) or ('DiastolicBP' in df.columns)):
    ag = df.copy()
    order = ['<20','20-40','40-60','60+']

    # Keep only the age groups that actually occur, in their natural order
    present_groups = set(ag['AgeGroup'].dropna().unique())
    ag['AgeGroup'] = pd.Categorical(ag['AgeGroup'], categories=[o for o in order if o in present_groups], ordered=True)

    # The guard above accepts either BP column, so average only the ones
    # that exist; selecting both unconditionally raises KeyError when one
    # is missing.
    bp_cols = [col for col in ('SystolicBP', 'DiastolicBP') if col in ag.columns]
    # The categories were restricted to occurring groups above, so
    # observed=True yields the same groups while being explicit.
    grp = ag.groupby('AgeGroup', observed=True)[bp_cols].mean(numeric_only=True)

    plt.figure()
    if 'SystolicBP' in grp:
        plt.plot(grp.index.astype(str), grp['SystolicBP'], marker='o', label='Avg SystolicBP')
    if 'DiastolicBP' in grp:
        plt.plot(grp.index.astype(str), grp['DiastolicBP'], marker='o', label='Avg DiastolicBP')
    # Dashed reference line at 130 mmHg with its caption just above it
    plt.axhline(130, linestyle='--', linewidth=1.5)
    plt.text(0.05, 130+1, 'Hypertension threshold (130)', fontsize=9)
    plt.xlabel('Age Group')
    plt.ylabel('Blood Pressure (mmHg)')
    plt.title('Blood Pressure by Age Group')
    plt.legend()
    plt.savefig(FIG_DIR / 'line_age_bp.png', dpi=160, bbox_inches='tight')
    plt.show()
else:
    print('Skipped line: AgeGroup or BP columns missing.')


In [None]:

# 6c) BMI Category vs CardioDisease (share of each outcome within a category)
if {'BMICategory','CardioDisease'}.issubset(df.columns):
    # Count rows per (BMICategory, CardioDisease) pair and normalise each row
    # to sum to 1. crosstab counts rows directly, so the result does not
    # depend on an unrelated column such as SystolicBP being present or
    # non-missing.
    pct = pd.crosstab(df['BMICategory'], df['CardioDisease'], normalize='index').sort_index()
    if not pct.empty:
        categories = pct.index.astype(str)
        plt.figure()
        # Running top of the stack for each BMI category
        bottom = np.zeros(len(pct))
        for col in pct.columns:
            vals = pct[col].values
            plt.bar(categories, vals, bottom=bottom, label=str(col))
            bottom = bottom + vals
        plt.ylabel('Share within BMI Category')
        plt.xlabel('BMI Category')
        plt.title('CardioDisease by BMI Category (% share)')
        plt.legend()
        plt.savefig(FIG_DIR / 'bar_bmi_cvd.png', dpi=160, bbox_inches='tight')
        plt.show()
    else:
        print('Skipped stacked bar: no data after pivot.')
else:
    print('Skipped stacked bar: BMICategory or CardioDisease missing.')


## 7) Save cleaned dataset

In [None]:

# Write the cleaned table to CLEAN_PATH, creating parent folders as needed
out = Path(CLEAN_PATH)
out_dir = out.parent
out_dir.mkdir(exist_ok=True, parents=True)
df.to_csv(out, index=False)
print('Saved cleaned dataset to:', out.resolve())



## 8) Notes
- BP validity ranges used (inclusive): **Systolic 70–250**, **Diastolic 40–150** mmHg; rows with a missing BP reading are kept.  
- Cleaning targets **impossible values** only; use the before/after summary in Section 5 to confirm that the distributions and relationships remain consistent post-cleaning.
- Figures saved to `reports/figures/`.
