In [17]:
import pandas as pd
import numpy as np

# ----------------------------
# 1. Load Data
# ----------------------------
df = pd.read_csv("../data/raw/heart_disease.csv")

# ----------------------------
# 2. Select Relevant Columns
# ----------------------------
# Columns carried through the rest of the script. The last entry,
# TenYearCHD, is the column whose class proportions are printed at the end.
relevant_cols = [
    'male',
    'age',
    'education',
    'currentSmoker',
    'cigsPerDay',
    'BPMeds',
    'prevalentStroke',
    'prevalentHyp',
    'diabetes',
    'totChol',
    'sysBP',
    'diaBP',
    'BMI',
    'heartRate',
    'glucose',
    'TenYearCHD',
]
# Keep only those columns, in the order listed; a KeyError is raised if any
# of them is absent from the CSV.
df = df.loc[:, relevant_cols]

# ----------------------------
# 3. Fill Missing Values
# ----------------------------
# Categorical codes: impute with the most frequent value.
# These columns are numeric in dtype, so they must be handled before -- and
# excluded from -- the median pass below. Otherwise the median pass fills
# them first and the mode fill never has anything left to do.
cat_cols = ['education', 'BPMeds']
for col in cat_cols:
    modes = df[col].mode()
    # mode() is empty when the whole column is NaN; leave such a column
    # untouched so the missing-value check later in the script reports it
    # instead of crashing here with an opaque lookup error.
    if not modes.empty:
        df[col] = df[col].fillna(modes.iloc[0])

# Numerical: impute the remaining numeric columns with the column median.
# num_cols deliberately keeps every numeric column (including cat_cols)
# because the outlier-capping step further down iterates over it.
num_cols = df.select_dtypes(include=np.number).columns.tolist()
for col in num_cols:
    if col in cat_cols:
        continue
    df[col] = df[col].fillna(df[col].median())

# ----------------------------
# 4. Outlier Capping (1st/99th percentile)
# ----------------------------
# Cap only columns that behave like continuous measurements.
# Percentile clipping is meaningless for category codes and destructive for
# 0/1 indicator columns: when fewer than 1% of rows are 1, the 99th
# percentile is 0 and clipping silently erases every positive.
capped_cols = [
    name for name in num_cols
    if name not in cat_cols and df[name].nunique() > 2
]
for col in capped_cols:
    # quantile() with a list returns both bounds in one pass over the column.
    lower, upper = df[col].quantile([0.01, 0.99])
    df[col] = df[col].clip(lower=lower, upper=upper)

# ----------------------------
# 5. Feature Engineering
# ----------------------------
# Pulse pressure: sysBP minus diaBP, row by row.
df['pulse_pressure'] = df['sysBP'].sub(df['diaBP'])

# Smoking-exposure proxy: cigarettes per day multiplied by the number of
# years past age 18 (floored at 0 for ages below 18).
# NOTE(review): as computed this is in cigarette-years; conventional
# pack-years would also divide by 20 cigarettes per pack -- confirm the
# intended units before relying on the column name.
years_past_18 = df['age'].sub(18).clip(lower=0)
df['smoking_pack_years'] = df['cigsPerDay'].mul(years_past_18)

# ----------------------------
# 6. Ensure No Missing Values
# ----------------------------
# Use an explicit raise rather than assert: assert statements are stripped
# when Python runs with -O, which would let NaNs reach the saved file.
# The message names the offending columns so the gap is easy to trace.
missing_counts = df.isna().sum()
if missing_counts.sum() != 0:
    raise ValueError(
        "Data still contains missing values! Columns with NaNs: "
        f"{missing_counts[missing_counts > 0].to_dict()}"
    )

# ----------------------------
# 7. Save Enhanced Dataset
# ----------------------------
# Write the processed frame without the index column, then echo the path.
output_csv = "../data/processed/heart_disease_clean_v2.csv"
df.to_csv(output_csv, index=False)
print(f"Enhanced dataset saved as '{output_csv}'")

# ----------------------------
# 8. Optional: Class Distribution
# ----------------------------
# Fraction of rows in each TenYearCHD class (normalize=True yields
# proportions rather than raw counts).
chd_share = df['TenYearCHD'].value_counts(normalize=True)
print("CHD incidence:")
print(chd_share)

# Trailing expression: when run as a notebook cell this displays the first
# 10 rows; as a plain script it has no visible effect.
df.head(10)


Enhanced dataset saved as '../data/processed/heart_disease_clean_v2.csv'
CHD incidence:
TenYearCHD
0    0.848113
1    0.151887
Name: proportion, dtype: float64


AttributeError: module 'pandas' has no attribute 'head'