In [8]:
import pandas as pd
import numpy as np

# ----------------------------
# 1. Load Data
# ----------------------------
df = pd.read_csv("../data/raw/heart_disease.csv")

# ----------------------------
# 2. Select Relevant Columns 
# ----------------------------
# Columns kept for the cleaned dataset; 'TenYearCHD' is the label whose
# class distribution is printed at the end of this cell.
relevant_cols = [
    'male', 'age', 'currentSmoker', 'cigsPerDay', 
    'BPMeds', 'prevalentStroke', 'prevalentHyp', 'diabetes', 
    'totChol', 'sysBP', 'diaBP', 'BMI', 'heartRate', 'glucose', 'TenYearCHD'
]
# Take an explicit copy: the cleaning steps below assign into columns of
# `df`, and an independent frame keeps those writes from being treated as
# chained assignment on a slice of the raw data (SettingWithCopyWarning).
# A missing column still raises KeyError here, exactly as before.
df = df[relevant_cols].copy()

# ----------------------------
# 3. Fill Missing Values
# ----------------------------
# All numeric columns (name kept as in the original cell for later reuse).
num_cols = df.select_dtypes(include=np.number).columns.tolist()

# Categorical
cat_cols = ['BPMeds']

# Indicator-style columns: numeric columns holding at most two distinct
# non-null values (0/1 flags, including the label if it is binary).
binary_cols = [col for col in num_cols if df[col].nunique(dropna=True) <= 2]

# Columns imputed with the mode: declared categoricals plus detected flags
# (order-preserving, de-duplicated). Everything else numeric is continuous.
mode_cols = list(dict.fromkeys(cat_cols + binary_cols))
continuous_cols = [col for col in num_cols if col not in mode_cols]

# Numerical: median imputation for continuous measurements only. Previously
# every numeric column (flags and 'BPMeds' included) was median-filled first,
# which left the categorical pass below with nothing to do.
for col in continuous_cols:
    df[col] = df[col].fillna(df[col].median())

# Categorical / binary: most frequent value, so a flag can never be imputed
# with a non-category value such as 0.5.
# NOTE(review): this also fills gaps in the label column if it is detected as
# binary, as the original median pass did; consider dropping such rows instead.
for col in mode_cols:
    modes = df[col].mode()
    # An all-NaN column has no mode; leave it untouched so the validation
    # step below reports it instead of failing here with an opaque error.
    if not modes.empty:
        df[col] = df[col].fillna(modes.iloc[0])

# ----------------------------
# 4. Outlier Capping (1st/99th percentile)
# ----------------------------
# Cap only continuous columns. Clipping a 0/1 flag at its percentiles is
# destructive: when fewer than 1% of rows are positive, the 99th percentile
# is 0 and every positive case would be overwritten with 0.
for col in continuous_cols:
    lower, upper = df[col].quantile([0.01, 0.99])
    df[col] = df[col].clip(lower=lower, upper=upper)

# ----------------------------
# 5. Ensure No Missing Values
# ----------------------------
# Raise explicitly instead of using `assert`, which is skipped under
# `python -O`; also report which columns are still incomplete.
na_counts = df.isna().sum()
na_counts = na_counts[na_counts > 0]
if not na_counts.empty:
    raise ValueError(
        f"Data still contains missing values! Columns: {na_counts.to_dict()}"
    )

# ----------------------------
# 6. Save Clean Dataset
# ----------------------------
# Write the cleaned frame to disk; index=False leaves the row index out of
# the CSV.
clean_csv_path = "../data/processed/heart_disease_clean_v2.csv"
df.to_csv(clean_csv_path, index=False)
print("Enhanced dataset saved as '../data/processed/heart_disease_clean_v2.csv'")

# ----------------------------
# 7. Optional: Class Distribution
# ----------------------------
# Relative frequency of each 'TenYearCHD' value (normalize=True gives
# proportions rather than counts).
target_share = df['TenYearCHD'].value_counts(normalize=True)
print(target_share)

# Final expression of the cell: the first ten rows of the cleaned frame.
df.head(10)


Enhanced dataset saved as '../data/processed/heart_disease_clean_v2.csv'
TenYearCHD
0    0.848113
1    0.151887
Name: proportion, dtype: float64


Unnamed: 0,male,age,currentSmoker,cigsPerDay,BPMeds,prevalentStroke,prevalentHyp,diabetes,totChol,sysBP,diaBP,BMI,heartRate,glucose,TenYearCHD
0,1,39,0,0.0,0.0,0,0,0,195.0,106.0,70.0,26.97,80.0,77.0,0
1,0,46,0,0.0,0.0,0,0,0,250.0,121.0,81.0,28.73,95.0,76.0,0
2,1,48,1,20.0,0.0,0,0,0,245.0,127.5,80.0,25.34,75.0,70.0,0
3,0,61,1,30.0,0.0,0,1,0,225.0,150.0,95.0,28.58,65.0,103.0,1
4,0,46,1,23.0,0.0,0,0,0,285.0,130.0,84.0,23.1,85.0,85.0,0
5,0,43,0,0.0,0.0,0,1,0,228.0,180.0,110.0,30.3,77.0,99.0,0
6,0,63,0,0.0,0.0,0,0,0,205.0,138.0,71.0,33.11,60.0,85.0,1
7,0,45,1,20.0,0.0,0,0,0,313.0,100.0,71.0,21.68,79.0,78.0,0
8,1,52,0,0.0,0.0,0,1,0,260.0,141.5,89.0,26.36,76.0,79.0,0
9,1,43,1,30.0,0.0,0,1,0,225.0,162.0,107.0,23.61,93.0,88.0,0
