 ## Phase 1 — Data Preprocessing

In [1]:
# Importing libraries
import os

import joblib
import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
import seaborn as sns
from sklearn.compose import ColumnTransformer
from sklearn.impute import SimpleImputer
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import OneHotEncoder, StandardScaler

In [7]:
# Load the raw heart-disease dataset; the path is relative to the notebook's directory.
raw_csv_path = "../data/heart-disease-dataset.csv"
df = pd.read_csv(raw_csv_path)

# Preview the first rows as the cell output.
df.head()

Unnamed: 0,age,sex,chest_pain_type,resting_bp_s,cholesterol,fasting_blood_sugar,resting_ecg,max_heart_rate,exercise_angina,oldpeak,st_slope,target
0,40.0,1.0,2.0,140.0,289.0,0.0,0.0,172.0,0.0,0.0,1.0,0.0
1,49.0,0.0,3.0,160.0,180.0,0.0,0.0,156.0,0.0,1.0,2.0,1.0
2,37.0,1.0,2.0,130.0,283.0,0.0,1.0,98.0,0.0,0.0,1.0,0.0
3,48.0,0.0,4.0,138.0,214.0,0.0,0.0,108.0,1.0,1.5,2.0,1.0
4,54.0,1.0,3.0,150.0,195.0,0.0,0.0,122.0,0.0,0.0,1.0,0.0


In [8]:
# Report the dataset dimensions as (rows, columns), then preview the first rows.
n_rows, n_columns = df.shape
print("Shape:", (n_rows, n_columns))
df.head()

Shape: (1190, 12)


Unnamed: 0,age,sex,chest_pain_type,resting_bp_s,cholesterol,fasting_blood_sugar,resting_ecg,max_heart_rate,exercise_angina,oldpeak,st_slope,target
0,40.0,1.0,2.0,140.0,289.0,0.0,0.0,172.0,0.0,0.0,1.0,0.0
1,49.0,0.0,3.0,160.0,180.0,0.0,0.0,156.0,0.0,1.0,2.0,1.0
2,37.0,1.0,2.0,130.0,283.0,0.0,1.0,98.0,0.0,0.0,1.0,0.0
3,48.0,0.0,4.0,138.0,214.0,0.0,0.0,108.0,1.0,1.5,2.0,1.0
4,54.0,1.0,3.0,150.0,195.0,0.0,0.0,122.0,0.0,0.0,1.0,0.0


In [9]:
# Count missing (NaN) entries per column; all zeros means no imputation is actually needed.
df.isna().sum()

age                    0
sex                    0
chest_pain_type        0
resting_bp_s           0
cholesterol            0
fasting_blood_sugar    0
resting_ecg            0
max_heart_rate         0
exercise_angina        0
oldpeak                0
st_slope               0
target                 0
dtype: int64

In [10]:
# Split the feature columns by dtype. The label column "target" is numeric,
# so it is filtered out of the numeric feature list explicitly.
# NOTE(review): integer-coded categoricals (e.g. sex, chest_pain_type, st_slope)
# are stored as numbers, so they end up in num_cols — confirm this is intended.
numeric_frame = df.select_dtypes(include=["number"])
num_cols = [name for name in numeric_frame.columns if name != "target"]

categorical_frame = df.select_dtypes(include=["object", "category"])
cat_cols = list(categorical_frame.columns)

print("Numeric columns:", num_cols)
print("Categorical columns:", cat_cols)

Numeric columns: ['age', 'sex', 'chest_pain_type', 'resting_bp_s', 'cholesterol', 'fasting_blood_sugar', 'resting_ecg', 'max_heart_rate', 'exercise_angina', 'oldpeak', 'st_slope']
Categorical columns: []


In [11]:
# Per-dtype preprocessing:
#   numeric     -> median imputation, then standardisation
#   categorical -> most-frequent imputation, then one-hot encoding
#                  (categories unseen at fit time are ignored at transform time)
numeric_transformer = Pipeline(
    steps=[
        ("imputer", SimpleImputer(strategy="median")),
        ("scaler", StandardScaler()),
    ]
)

categorical_transformer = Pipeline(
    steps=[
        ("imputer", SimpleImputer(strategy="most_frequent")),
        ("encoder", OneHotEncoder(handle_unknown="ignore")),
    ]
)

# Assemble the ColumnTransformer. The "cat" branch is registered only when
# the dataset actually contains categorical columns.
column_steps = [("num", numeric_transformer, num_cols)]
if cat_cols:
    column_steps.append(("cat", categorical_transformer, cat_cols))

preprocessor = ColumnTransformer(transformers=column_steps)


In [12]:
#  Apply preprocessing pipeline to dataset
X = df.drop(columns=["target"])
y = df["target"]

X_processed = preprocessor.fit_transform(X)

# Get feature names after transformation
try:
    ohe_features = preprocessor.named_transformers_["cat"]["encoder"].get_feature_names_out(cat_cols)
except Exception:
    ohe_features = np.array([])

all_features = np.concatenate([num_cols, ohe_features]) if len(ohe_features)>0 else np.array(num_cols)

# Convert to DataFrame
if hasattr(X_processed, "toarray"):
    X_arr = X_processed.toarray()
else:
    X_arr = X_processed

df_cleaned = pd.DataFrame(X_arr, columns=all_features)
df_cleaned["target"] = y.values

print("Cleaned dataset shape:", df_cleaned.shape)
df_cleaned.head()


Cleaned dataset shape: (1190, 12)


Unnamed: 0,age,sex,chest_pain_type,resting_bp_s,cholesterol,fasting_blood_sugar,resting_ecg,max_heart_rate,exercise_angina,oldpeak,st_slope,target
0,-1.466728,0.555995,-1.318351,0.427328,0.775674,-0.520929,-0.802672,1.265039,-0.795219,-0.849792,-1.023217,0.0
1,-0.5046,-1.798576,-0.248932,1.516587,-0.299512,-0.520929,-0.802672,0.637758,-0.795219,0.071119,0.615583,1.0
2,-1.787437,0.555995,-1.318351,-0.117301,0.716489,-0.520929,0.346762,-1.636136,-0.795219,-0.849792,-1.023217,0.0
3,-0.611503,-1.798576,0.820487,0.318402,0.035867,-0.520929,-0.802672,-1.244085,1.257515,0.531575,0.615583,1.0
4,0.029915,0.555995,-0.248932,0.971958,-0.15155,-0.520929,-0.802672,-0.695214,-0.795219,-0.849792,-1.023217,0.0


In [18]:
# Make sure the output directories exist before anything is written to them.
# `os` and `joblib` are imported in the notebook's top import cell so that all
# dependencies are declared in one place.
for output_dir in ("../data", "../models"):
    os.makedirs(output_dir, exist_ok=True)

In [19]:
# Persist the preprocessed dataset (without the index column) and the fitted
# preprocessing pipeline so later stages can reuse them.
cleaned_csv_path = "../data/heart_disease_cleaned.csv"
pipeline_path = "../models/preprocessor_pipeline.pkl"

df_cleaned.to_csv(cleaned_csv_path, index=False)
joblib.dump(preprocessor, pipeline_path)

['../models/preprocessor_pipeline.pkl']