# DATA CLEANING & PREPROCESSING

In [2]:
import pickle
from pathlib import Path

import numpy as np
import pandas as pd

from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler

Load Dataset

In [3]:
# Read the raw semicolon-delimited CSV into the working frame and preview it.
# NOTE(review): this is an absolute, machine-specific path — the notebook only
# runs where this exact drive layout exists.
raw_csv_path = "D:\\SEM - 6\\MLDL\\ML\\Cardio-ML-Project\\data\\cardio_train.csv"

df = pd.read_csv(raw_csv_path, sep=";")
initial_shape = df.shape
print("Initial Shape:", initial_shape)
df.head()

Initial Shape: (70000, 13)


Unnamed: 0,id,age,gender,height,weight,ap_hi,ap_lo,cholesterol,gluc,smoke,alco,active,cardio
0,0,18393,2,168,62.0,110,80,1,1,0,0,1,0
1,1,20228,1,156,85.0,140,90,3,1,0,0,1,1
2,2,18857,1,165,64.0,130,70,3,1,0,0,0,1
3,3,17623,2,169,82.0,150,100,1,1,0,0,1,1
4,4,17474,1,156,56.0,100,60,1,1,0,0,0,0


Convert Age from Days to Years

In [4]:
# Divisor used to convert an age expressed in days into years
# (365.25 averages in one leap day every four years).
DAYS_PER_YEAR = 365.25

# Replace the day-based `age` column with whole completed years in `age_years`.
# astype(int) truncates the fractional part, so 50.9 years becomes 50.
# The guard makes the cell safe to re-run: once `age` has been consumed there
# is nothing left to convert, and the original unguarded drop raised KeyError.
if 'age' in df.columns:
    df = (
        df
        .assign(age_years=(df['age'] / DAYS_PER_YEAR).astype(int))
        .drop(columns=['age'])  # returns a new frame instead of mutating in place
    )

In [5]:
# Show the first five rows of the working frame.
df.head(5)

Unnamed: 0,id,gender,height,weight,ap_hi,ap_lo,cholesterol,gluc,smoke,alco,active,cardio,age_years
0,0,2,168,62.0,110,80,1,1,0,0,1,0,50
1,1,1,156,85.0,140,90,3,1,0,0,1,1,55
2,2,1,165,64.0,130,70,3,1,0,0,0,1,51
3,3,2,169,82.0,150,100,1,1,0,0,1,1,48
4,4,1,156,56.0,100,60,1,1,0,0,0,0,47


Handle Invalid Blood Pressure Values

In [6]:
# Report the frame's shape before any rows are filtered out
# (the emitted label previously misspelled "before").
print("Shape before removing outliers:", df.shape)

Shape befor removing outliers: (70000, 13)


In [7]:
# Blood-pressure sanity filter. A row is kept only when all three hold:
#   * ap_hi lies in the closed interval [50, 250]
#   * ap_lo lies in the closed interval [30, 150]
#   * ap_hi is strictly greater than ap_lo
ap_hi_in_range = df['ap_hi'].between(50, 250)
ap_lo_in_range = df['ap_lo'].between(30, 150)
hi_above_lo = df['ap_hi'] > df['ap_lo']

df = df[ap_hi_in_range & ap_lo_in_range & hi_above_lo]

In [8]:
# Report the frame's shape after the blood-pressure filtering step.
shape_after_bp = df.shape
print("Shape after removing outliers:", shape_after_bp)

Shape after removing outliers: (68673, 13)


Handle Height and Weight Outliers

In [9]:
# Keep rows whose height lies in [120, 220] and whose weight lies in [30, 200]
# (both bounds inclusive).
height_ok = df['height'].ge(120) & df['height'].le(220)
weight_ok = df['weight'].ge(30) & df['weight'].le(200)

df = df.loc[height_ok & weight_ok]

In [10]:
# Report the frame's shape after the height/weight filtering step.
shape_after_hw = df.shape
print("Shape after removing outliers:", shape_after_hw)

Shape after removing outliers: (68617, 13)


# Feature Engineering : BMI Calculation

In [11]:
# Derive BMI as weight divided by the square of (height / 100); the division
# by 100 implies height is stored in centimetres and converted to metres.
#
# `assign` returns a new frame, so the column is not written into a frame
# produced by boolean filtering. Direct item assignment on such a frame emits
# SettingWithCopyWarning on pandas versions without copy-on-write.
height_m = df['height'] / 100
df = df.assign(BMI=df['weight'] / height_m ** 2)

In [12]:
# Show the first five rows of the working frame.
df.head(5)

Unnamed: 0,id,gender,height,weight,ap_hi,ap_lo,cholesterol,gluc,smoke,alco,active,cardio,age_years,BMI
0,0,2,168,62.0,110,80,1,1,0,0,1,0,50,21.96712
1,1,1,156,85.0,140,90,3,1,0,0,1,1,55,34.927679
2,2,1,165,64.0,130,70,3,1,0,0,0,1,51,23.507805
3,3,2,169,82.0,150,100,1,1,0,0,1,1,48,28.710479
4,4,1,156,56.0,100,60,1,1,0,0,0,0,47,23.011177


Remove unrealistic BMI values

In [13]:
# Keep only rows whose BMI lies in the closed interval [10, 60].
bmi_in_range = df['BMI'].between(10, 60)
df = df[bmi_in_range]

In [14]:
# Report the frame's shape after the BMI filtering step.
shape_after_bmi = df.shape
print("Shape after removing outliers:", shape_after_bmi)

Shape after removing outliers: (68594, 14)


# Encode Categorical Variables

Gender Encoding

In [15]:
# Recode gender from the raw codes {1, 2} to {0, 1}.
# NOTE(review): which sex each raw code denotes is not shown in this notebook —
# confirm against the dataset's data dictionary.
gender_codes = {1: 0, 2: 1}

# Series.map silently turns any value absent from the mapping into NaN, and
# re-running this cell on already-recoded data would turn 0 -> NaN and 1 -> 0.
# Fail loudly instead of corrupting the column. Missing values are left alone,
# exactly as map itself would leave them.
unexpected = set(df['gender'].dropna().unique()) - set(gender_codes)
if unexpected:
    raise ValueError(
        f"Unexpected gender codes {sorted(unexpected)}; expected raw codes "
        f"{sorted(gender_codes)} (has this cell already been run?)"
    )

# assign builds a new frame rather than writing into a filtered slice, which
# avoids SettingWithCopyWarning on pandas versions without copy-on-write.
df = df.assign(gender=df['gender'].map(gender_codes))
df.head()

Unnamed: 0,id,gender,height,weight,ap_hi,ap_lo,cholesterol,gluc,smoke,alco,active,cardio,age_years,BMI
0,0,1,168,62.0,110,80,1,1,0,0,1,0,50,21.96712
1,1,0,156,85.0,140,90,3,1,0,0,1,1,55,34.927679
2,2,0,165,64.0,130,70,3,1,0,0,0,1,51,23.507805
3,3,1,169,82.0,150,100,1,1,0,0,1,1,48,28.710479
4,4,0,156,56.0,100,60,1,1,0,0,0,0,47,23.011177


Cholesterol & Glucose Encoding (Ordinal)

In [16]:
# Shift the ordinal codes of cholesterol and gluc down by one so they start at
# zero; subtracting a constant keeps their relative order intact.
ordinal_cols = ['cholesterol', 'gluc']

# NOTE(review): assumes the raw levels are exactly 1, 2, 3 (the values visible
# in this notebook's outputs) — confirm against the data dictionary.
raw_levels = [1, 2, 3]

# Re-running the original cell silently shifted the codes a second time
# (0 -> -1, ...). Refuse to proceed unless every non-missing value is still a
# raw level, so an accidental second run is caught immediately.
is_raw_level = df[ordinal_cols].isin(raw_levels) | df[ordinal_cols].isna()
if not is_raw_level.all().all():
    raise ValueError(
        f"Columns {ordinal_cols} contain values outside {raw_levels}; "
        "has this cell already been run?"
    )

# assign builds a new frame rather than writing into a filtered slice, which
# avoids SettingWithCopyWarning on pandas versions without copy-on-write.
df = df.assign(**{col: df[col] - 1 for col in ordinal_cols})
df.head()

Unnamed: 0,id,gender,height,weight,ap_hi,ap_lo,cholesterol,gluc,smoke,alco,active,cardio,age_years,BMI
0,0,1,168,62.0,110,80,0,0,0,0,1,0,50,21.96712
1,1,0,156,85.0,140,90,2,0,0,0,1,1,55,34.927679
2,2,0,165,64.0,130,70,2,0,0,0,0,1,51,23.507805
3,3,1,169,82.0,150,100,0,0,0,0,1,1,48,28.710479
4,4,0,156,56.0,100,60,0,0,0,0,0,0,47,23.011177


Binary Lifestyle Features Validation

In [17]:
binary_cols = ['smoke', 'alco', 'active']

# This section is a validation step, so actually verify that every flag is
# 0 or 1 before casting. A bare astype(int) accepts any integer-like value
# (e.g. 2 or -1) without complaint. Missing values are reported here too;
# astype(int) would have raised on them anyway.
non_binary = ~df[binary_cols].isin([0, 1])
if non_binary.any().any():
    offending = non_binary.any()
    raise ValueError(
        f"Non-binary values found in columns: {offending[offending].index.tolist()}"
    )

# astype with a per-column mapping returns a new frame, so nothing is written
# into a filtered slice (avoids SettingWithCopyWarning on pandas versions
# without copy-on-write).
df = df.astype({col: int for col in binary_cols})
print("Data types after conversion:", df.dtypes[binary_cols])
df[binary_cols]

Data types after conversion: smoke     int64
alco      int64
active    int64
dtype: object


Unnamed: 0,smoke,alco,active
0,0,0,1
1,0,0,1
2,0,0,0
3,0,0,1
4,0,0,0
...,...,...,...
69995,1,0,1
69996,0,0,1
69997,0,1,0
69998,0,0,0


Drop the `id` Column

In [18]:
# Remove the `id` column from the working frame, then display the result.
df = df.drop(labels='id', axis='columns')
df

Unnamed: 0,gender,height,weight,ap_hi,ap_lo,cholesterol,gluc,smoke,alco,active,cardio,age_years,BMI
0,1,168,62.0,110,80,0,0,0,0,1,0,50,21.967120
1,0,156,85.0,140,90,2,0,0,0,1,1,55,34.927679
2,0,165,64.0,130,70,2,0,0,0,0,1,51,23.507805
3,1,169,82.0,150,100,0,0,0,0,1,1,48,28.710479
4,0,156,56.0,100,60,0,0,0,0,0,0,47,23.011177
...,...,...,...,...,...,...,...,...,...,...,...,...,...
69995,1,168,76.0,120,80,0,0,1,0,1,0,52,26.927438
69996,0,158,126.0,140,90,1,1,0,0,1,1,61,50.472681
69997,1,183,105.0,180,90,2,0,0,1,0,1,52,31.353579
69998,0,163,72.0,135,80,0,1,0,0,0,1,61,27.099251


In [19]:
# Remove fully identical rows (the first occurrence is kept) and report the
# frame's shape before and after.
# NOTE(review): by this point `id` has been dropped and age has been truncated
# to whole years, so distinct records that merely share the same feature values
# are treated as duplicates — confirm this is intended.
shape_before = df.shape
print("Before dropping duplicates:", shape_before)

df = df.drop_duplicates(keep='first')

shape_after = df.shape
print("After dropping duplicates:", shape_after)

Before dropping duplicates: (68594, 13)
After dropping duplicates: (65408, 13)


Separate Features and Target

In [21]:
# Split the frame into the target column (`cardio`) and everything else.
target_col = 'cardio'

y = df[target_col]
X = df.drop(columns=[target_col])

Train–Test Split (Stratified)

In [22]:
# Hold out 20% of the rows for testing. The split is stratified on the target
# and uses a fixed seed so it is repeatable.
split_options = dict(test_size=0.2, random_state=42, stratify=y)

X_train, X_test, y_train, y_test = train_test_split(X, y, **split_options)

Feature Scaling

In [23]:
# Standardise the continuous columns. The scaler is fitted on the training
# split only and then applied unchanged to the test split, so no test-set
# statistics leak into the transform.
# (pickle, Path and StandardScaler come from the notebook's top import cell.)
num_cols = [
    'age_years', 'height', 'weight',
    'ap_hi', 'ap_lo', 'BMI'
]

scaler = StandardScaler()

# Work on explicit copies: the split outputs are derived from X, and writing
# columns into them directly can emit SettingWithCopyWarning on some
# pandas / scikit-learn version combinations.
X_train = X_train.copy()
X_test = X_test.copy()

X_train[num_cols] = scaler.fit_transform(X_train[num_cols])
X_test[num_cols] = scaler.transform(X_test[num_cols])

# Persist the fitted scaler so the identical transform can be reused later.
# NOTE(review): absolute, machine-specific path.
scaler_path = Path("D:\\SEM - 6\\MLDL\\ML\\Cardio-ML-Project\\model\\scaler.pkl")
# Create the target directory if it is missing; open(..., "wb") does not.
scaler_path.parent.mkdir(parents=True, exist_ok=True)
with scaler_path.open("wb") as f:
    pickle.dump(scaler, f)

Final Shape Verification

In [24]:
# Print the shape of each feature split, then preview the training features.
for label, frame in (("Train Shape:", X_train), ("Test Shape:", X_test)):
    print(label, frame.shape)

X_train.head()

Train Shape: (52326, 12)
Test Shape: (13082, 12)


Unnamed: 0,gender,height,weight,ap_hi,ap_lo,cholesterol,gluc,smoke,alco,active,age_years,BMI
36276,0,-0.799438,-0.169225,-0.119302,0.376095,0,2,0,0,1,-0.119974,0.239831
6601,0,0.073824,-0.030408,-1.591372,-0.14509,0,0,0,0,1,0.320633,-0.075318
16717,0,0.44808,-0.516268,-0.413716,-0.14509,0,0,0,0,1,-0.119974,-0.728573
689,0,-0.549934,1.427173,0.175112,-0.14509,0,0,0,0,0,-1.441797,1.808872
24499,0,0.323328,0.941313,1.647183,-0.14509,2,0,0,0,1,0.90811,0.754529


Save Preprocessed Data

In [25]:
# Write the four splits to CSV, omitting the index column from each file.
# NOTE(review): absolute, machine-specific output paths.
outputs = (
    (X_train, "D:\\SEM - 6\\MLDL\\ML\\Cardio-ML-Project\\data\\X_train_final.csv"),
    (X_test, "D:\\SEM - 6\\MLDL\\ML\\Cardio-ML-Project\\data\\X_test_final.csv"),
    (y_train, "D:\\SEM - 6\\MLDL\\ML\\Cardio-ML-Project\\data\\y_train_final.csv"),
    (y_test, "D:\\SEM - 6\\MLDL\\ML\\Cardio-ML-Project\\data\\y_test_final.csv"),
)

for split, destination in outputs:
    split.to_csv(destination, index=False)