In [1]:
from sklearn.compose import ColumnTransformer
from sklearn.pipeline import Pipeline
from sklearn.ensemble import RandomForestClassifier
from sklearn.model_selection import GridSearchCV, train_test_split
from sklearn.preprocessing import StandardScaler, OneHotEncoder
from sklearn.metrics import classification_report, accuracy_score
import pandas as pd
import numpy as np
import pickle

# Feature engineering functions
def bmi_category(bmi):
    if bmi < 18.5:
        return 'Underweight'
    elif bmi < 25:
        return 'Normal'
    elif bmi < 30:
        return 'Overweight'
    else:
        return 'Obese'

def bp_category(row):
    sys = row['ap_hi']
    dia = row['ap_lo']
    if sys < 120 and dia < 80:
        return 'Normal'
    elif 120 <= sys < 130 and dia < 80:
        return 'Elevated'
    elif (130 <= sys < 140) or (80 <= dia < 90):
        return 'Hypertension Stage 1'
    elif (140 <= sys) or (90 <= dia):
        return 'Hypertension Stage 2'
    else:
        return 'Unknown'

# Load data
df = pd.read_csv('./datasets/Cardiovascular Disease Prediction.csv', sep=';')

# Feature engineering
df['age'] = (df['age'] / 365).astype(int)
df['height'] = df['height'] / 100
df = df[df['ap_lo'] < df['ap_hi']]
df = df[(df['ap_hi'] > 50) & (df['ap_hi'] < 300)]
df = df[(df['ap_lo'] > 30) & (df['ap_lo'] < 200)]
df['BMI'] = df['weight'] / (df['height'] ** 2)
df = df[(df['BMI'] > 10) & (df['BMI'] < 50)]
df['BMI'] = df['BMI'].apply(bmi_category)
df['BP'] = df.apply(bp_category, axis=1)

# Feature categorization
target = 'cardio'
id_col = 'id'
features = df.drop(columns=[target, id_col])
cat_cols = [col for col in features.columns if df[col].nunique() <= 5 or col in ['BMI', 'BP']]
num_cols = [col for col in features.columns if col not in cat_cols]

X = df.drop(columns=[target, id_col])
y = df[target]


In [2]:
X.head()

Unnamed: 0,age,gender,height,weight,ap_hi,ap_lo,cholesterol,gluc,smoke,alco,active,BMI,BP
0,50,2,1.68,62.0,110,80,1,1,0,0,1,Normal,Hypertension Stage 1
1,55,1,1.56,85.0,140,90,3,1,0,0,1,Obese,Hypertension Stage 2
2,51,1,1.65,64.0,130,70,3,1,0,0,0,Normal,Hypertension Stage 1
3,48,2,1.69,82.0,150,100,1,1,0,0,1,Overweight,Hypertension Stage 2
4,47,1,1.56,56.0,100,60,1,1,0,0,0,Normal,Normal


In [19]:
X.describe()

Unnamed: 0,age,gender,height,weight,ap_hi,ap_lo,cholesterol,gluc,smoke,alco,active
count,68432.0,68432.0,68432.0,68432.0,68432.0,68432.0,68432.0,68432.0,68432.0,68432.0,68432.0
mean,52.827303,1.349018,1.644468,73.963291,126.644786,81.290902,1.364084,1.224982,0.088073,0.053352,0.803381
std,6.769503,0.476663,0.078558,13.93799,16.668724,9.442179,0.678444,0.570755,0.283403,0.224737,0.397444
min,29.0,1.0,1.2,28.0,60.0,40.0,1.0,1.0,0.0,0.0,0.0
25%,48.0,1.0,1.59,65.0,120.0,80.0,1.0,1.0,0.0,0.0,1.0
50%,53.0,1.0,1.65,72.0,120.0,80.0,1.0,1.0,0.0,0.0,1.0
75%,58.0,2.0,1.7,82.0,140.0,90.0,1.0,1.0,0.0,0.0,1.0
max,64.0,2.0,2.5,180.0,240.0,182.0,3.0,3.0,1.0,1.0,1.0


In [3]:
preprocessor = ColumnTransformer([
    ("num", StandardScaler(), num_cols),
    ("cat", OneHotEncoder(drop='first'), cat_cols)
])

pipe = Pipeline([
    ("preprocess", preprocessor),
    ("clf", RandomForestClassifier(random_state=42))
])


In [4]:
param_grid = {
    'clf__n_estimators': [100, 200],
    'clf__max_depth': [None, 10],
    'clf__min_samples_split': [2, 5],
    'clf__min_samples_leaf': [1, 2],
    'clf__max_features': ['sqrt', 'log2'],
    'clf__class_weight': [None, 'balanced']
}

X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

grid = GridSearchCV(pipe, param_grid, cv=5, scoring='accuracy', n_jobs=-1)
grid.fit(X_train, y_train)


In [5]:
print("Best Parameters:\n")
for k, v in grid.best_params_.items():
    print(f"{k}: {v}")

print("\nClassification Report:\n")
y_pred = grid.predict(X_test)
print(classification_report(y_test, y_pred))

print("\nAccuracy Score:", accuracy_score(y_test, y_pred))


Best Parameters:

clf__class_weight: balanced
clf__max_depth: 10
clf__max_features: sqrt
clf__min_samples_leaf: 1
clf__min_samples_split: 2
clf__n_estimators: 100

Classification Report:

              precision    recall  f1-score   support

           0       0.71      0.80      0.75      6901
           1       0.77      0.67      0.72      6786

    accuracy                           0.74     13687
   macro avg       0.74      0.74      0.74     13687
weighted avg       0.74      0.74      0.74     13687


Accuracy Score: 0.7376342514795061


In [10]:
with open("./best_random_forest_pipeline.pkl", "wb") as f:
    pickle.dump(grid.best_estimator_, f)
print("Model saved as 'best_random_forest_pipeline.pkl'")

Model saved as 'best_random_forest_pipeline.pkl'


In [4]:
with open("./best_random_forest_pipeline.pkl", "rb") as f:
    model = pickle.load(f)

In [20]:
import pandas as pd

sample_data = {
    "age": 30,
    "height": 175,
    "weight": 70,
    "gender": 0,
    "ap_hi": 120,
    "ap_lo": 80,
    "cholesterol": 1,
    "gluc": 1,
    "smoke": 0,
    "alco": 0,
    "active": 1,
    "BMI": "Underweight",
    "BP": "Normal"
}

df = pd.DataFrame([sample_data])


In [21]:
# You can do transform separately if you want to isolate the problem
model.predict(df)  # Or model.predict_proba(df)


ValueError: Found unknown categories [np.int64(0)] in column 0 during transform

In [18]:
model.predict_proba(df)

array([[0.72403341, 0.27596659]])

In [18]:
X.info

<bound method DataFrame.info of        age  gender  height  weight  ap_hi  ap_lo  cholesterol  gluc  smoke  \
0       50       2    1.68    62.0    110     80            1     1      0   
1       55       1    1.56    85.0    140     90            3     1      0   
2       51       1    1.65    64.0    130     70            3     1      0   
3       48       2    1.69    82.0    150    100            1     1      0   
4       47       1    1.56    56.0    100     60            1     1      0   
...    ...     ...     ...     ...    ...    ...          ...   ...    ...   
69994   57       1    1.65    80.0    150     80            1     1      0   
69995   52       2    1.68    76.0    120     80            1     1      1   
69997   52       2    1.83   105.0    180     90            3     1      0   
69998   61       1    1.63    72.0    135     80            1     2      0   
69999   56       1    1.70    72.0    120     80            2     1      0   

       alco  active         BMI