In [57]:
import pandas as pd

from sklearn.model_selection import train_test_split 

In [58]:
df = pd.read_csv('../data/heart.csv')
df.head()

Unnamed: 0,age,sex,cp,trestbps,chol,fbs,restecg,thalach,exang,oldpeak,slope,ca,thal,target
0,63.0,1.0,1.0,145.0,233.0,1.0,2.0,150.0,0.0,2.3,3.0,0.0,6.0,0
1,67.0,1.0,4.0,160.0,286.0,0.0,2.0,108.0,1.0,1.5,2.0,3.0,3.0,2
2,67.0,1.0,4.0,120.0,229.0,0.0,2.0,129.0,1.0,2.6,2.0,2.0,7.0,1
3,37.0,1.0,3.0,130.0,250.0,0.0,0.0,187.0,0.0,3.5,3.0,0.0,3.0,0
4,41.0,0.0,2.0,130.0,204.0,0.0,2.0,172.0,0.0,1.4,1.0,0.0,3.0,0


In [59]:
# binary encode all levels of disease
df['target'] = (df['target'] > 0).astype(int)


### handle missing values

In [60]:
df.isnull().sum()


age         0
sex         0
cp          0
trestbps    0
chol        0
fbs         0
restecg     0
thalach     0
exang       0
oldpeak     0
slope       0
ca          4
thal        2
target      0
dtype: int64

In [61]:
# Fill missing numerical values in 'ca' with the median
df['ca'] = df['ca'].fillna(df['ca'].median())

# Fill missing categorical values in 'thal' with the mode
df['thal']  = df['thal'].fillna(df['thal'].mode()[0])


#### Feature Scaling (Standardization)

In [86]:
from sklearn.preprocessing import StandardScaler

# Select numeric features to scale
num_cols = ['age', 'trestbps', 'chol', 'thalach', 'oldpeak']

# Fit and transform the selected columns
scaler = StandardScaler()
df[num_cols] = scaler.fit_transform(df[num_cols])


In [63]:
df[num_cols].describe()

Unnamed: 0,age,trestbps,chol,thalach,oldpeak
count,303.0,303.0,303.0,303.0,303.0
mean,-1.465641e-18,4.426236e-16,2.345026e-16,-1.172513e-16,2.3450260000000003e-17
std,1.001654,1.001654,1.001654,1.001654,1.001654
min,-2.819115,-2.145037,-2.334877,-3.442067,-0.8968617
25%,-0.7135564,-0.6652997,-0.690503,-0.7053073,-0.8968617
50%,0.1729945,-0.0961698,-0.1101357,0.1485618,-0.2067053
75%,0.7270888,0.4729601,0.5476139,0.7178079,0.4834512
max,2.500191,3.887739,6.138485,2.294182,4.451851


In [64]:
# # we have to one hot encode types/ levels of chest pain
# # df['cp'].unique() --> [1,2,3,4]

# df['cp_1'] = (df['cp'] == 1).astype(int)
# df['cp_2'] = (df['cp'] == 2).astype(int)
# df['cp_3'] = (df['cp'] == 3).astype(int)
# df['cp_4'] = (df['cp'] == 4).astype(int)

# df.drop('cp', axis=1, inplace=True) # drop original after


In [65]:
# #df.thal.unique()

# df['thal_3'] = (df['thal'] == 3).astype(int)
# df['thal_6'] = (df['thal'] == 6).astype(int)
# df['thal_7'] = (df['thal'] == 7).astype(int)

# df.drop('thal', axis = 1, inplace = True)


In [66]:
# #df.slope.unique()

# df['slope_1'] = (df['slope'] == 1).astype(int)
# df['slope_2'] = (df['slope'] == 2).astype(int)
# df['slope_3'] = (df['slope'] == 3).astype(int)

# df.drop('slope', axis = 1, inplace = True)

In [67]:


# #df.restecg.unique()

# df['restecg_0'] = (df['restecg'] == 0).astype(int)
# df['restecg_1'] = (df['restecg'] == 1).astype(int)
# df['restecg_2'] = (df['restecg'] == 2).astype(int)

# df.drop('restecg', axis = 1, inplace = True)

In [68]:
df

Unnamed: 0,age,sex,cp,trestbps,chol,fbs,restecg,thalach,exang,oldpeak,slope,ca,thal,target
0,0.948726,1.0,1.0,0.757525,-0.264900,1.0,2.0,0.017197,0.0,1.087338,3.0,0.0,6.0,0
1,1.392002,1.0,4.0,1.611220,0.760415,0.0,2.0,-1.821905,1.0,0.397182,2.0,3.0,3.0,1
2,1.392002,1.0,4.0,-0.665300,-0.342283,0.0,2.0,-0.902354,1.0,1.346147,2.0,2.0,7.0,1
3,-1.932564,1.0,3.0,-0.096170,0.063974,0.0,0.0,1.637359,0.0,2.122573,3.0,0.0,3.0,0
4,-1.489288,0.0,2.0,-0.096170,-0.825922,0.0,2.0,0.980537,0.0,0.310912,1.0,0.0,3.0,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
298,-1.046013,1.0,1.0,-1.234430,0.334813,0.0,0.0,-0.770990,0.0,0.138373,2.0,0.0,7.0,1
299,1.502821,1.0,4.0,0.700612,-1.038723,1.0,0.0,-0.376896,0.0,2.036303,2.0,2.0,7.0,1
300,0.283813,1.0,4.0,-0.096170,-2.238149,0.0,0.0,-1.515388,1.0,0.138373,2.0,1.0,7.0,1
301,0.283813,0.0,2.0,-0.096170,-0.206864,0.0,2.0,1.068113,0.0,-0.896862,2.0,1.0,3.0,1


## Manual One‑Hot Encoding: Caveats & Best Practices

1. **Dummy‑Variable Trap (Multicollinearity)**
   - Creating _k_ dummies for _k_ categories (e.g., `cp_1`, `cp_2`, `cp_3`, `cp_4`) makes them sum to 1 for every row.
   - **Problem:** Linear models (e.g., Logistic Regression) can’t handle perfectly collinear inputs.
   - **Fix:** Drop one dummy per feature:
     ```python
     df.drop('cp_1', axis=1, inplace=True)
     ```
   
2. **Maintainability & Brevity**
   - Hand‑coding each dummy is verbose and error‑prone.
   - **Recommended:** Use pandas built‑in with `drop_first=True`:
     ```python
     df = pd.get_dummies(
       df,
       columns=['cp', 'restecg', 'slope', 'thal'],
       drop_first=True
     )
     ```

3. **Model Compatibility**
   - **Tree‑based models** (Random Forest, XGBoost) aren’t sensitive to multicollinearity.
   - **Linear models** require dropping one dummy to avoid singular matrices.

---  
*Use this pattern to keep your pipeline clean, concise, and compatible with any estimator.*  


In [69]:
df = pd.get_dummies(df,
                    columns=['cp','restecg','slope','thal'],
                    drop_first=True)


In [73]:
df

Unnamed: 0,age,sex,trestbps,chol,fbs,thalach,exang,oldpeak,ca,target,cp_2.0,cp_3.0,cp_4.0,restecg_1.0,restecg_2.0,slope_2.0,slope_3.0,thal_6.0,thal_7.0
0,0.948726,1.0,0.757525,-0.264900,1.0,0.017197,0.0,1.087338,0.0,0,False,False,False,False,True,False,True,True,False
1,1.392002,1.0,1.611220,0.760415,0.0,-1.821905,1.0,0.397182,3.0,1,False,False,True,False,True,True,False,False,False
2,1.392002,1.0,-0.665300,-0.342283,0.0,-0.902354,1.0,1.346147,2.0,1,False,False,True,False,True,True,False,False,True
3,-1.932564,1.0,-0.096170,0.063974,0.0,1.637359,0.0,2.122573,0.0,0,False,True,False,False,False,False,True,False,False
4,-1.489288,0.0,-0.096170,-0.825922,0.0,0.980537,0.0,0.310912,0.0,0,True,False,False,False,True,False,False,False,False
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
298,-1.046013,1.0,-1.234430,0.334813,0.0,-0.770990,0.0,0.138373,0.0,1,False,False,False,False,False,True,False,False,True
299,1.502821,1.0,0.700612,-1.038723,1.0,-0.376896,0.0,2.036303,2.0,1,False,False,True,False,False,True,False,False,True
300,0.283813,1.0,-0.096170,-2.238149,0.0,-1.515388,1.0,0.138373,1.0,1,False,False,True,False,False,True,False,False,True
301,0.283813,0.0,-0.096170,-0.206864,0.0,1.068113,0.0,-0.896862,1.0,1,True,False,False,False,True,True,False,False,False


In [74]:
from sklearn.model_selection import train_test_split

# Separate features & target
X = df.drop('target', axis=1)
y = df['target']

# Split 80% train / 20% test, stratify to keep class balance
X_train, X_test, y_train, y_test = train_test_split(
    X, y,
    test_size=0.2,
    random_state=42,
    stratify=y
)

# Quick sanity check
print("Train shape:", X_train.shape, y_train.shape)
print("Test shape: ", X_test.shape,  y_test.shape)


Train shape: (242, 18) (242,)
Test shape:  (61, 18) (61,)


In [87]:

import joblib
# Save preprocessing & split
joblib.dump((X_train, X_test, y_train, y_test), "../models/split_data.pkl")
joblib.dump(scaler, "../models/scaler.pkl")


['../models/scaler.pkl']