In [14]:
import pandas as pd
import seaborn as sn
import numpy as np
import seaborn as sn
from sklearn.preprocessing import StandardScaler
from sklearn.model_selection import train_test_split
from sklearn.tree import DecisionTreeClassifier
from sklearn.neighbors import KNeighborsClassifier
from sklearn.linear_model import LogisticRegression
from sklearn.ensemble import RandomForestClassifier
from sklearn.svm import SVC
from sklearn.model_selection import RandomizedSearchCV, GridSearchCV
from sklearn.metrics import accuracy_score, f1_score, recall_score, roc_auc_score

In [30]:
df = pd.read_csv('cleaned_data.csv')
df.head()

Unnamed: 0,sex,chest pain type,fasting blood sugar,resting ecg,exercise angina,oldpeak,ST slope,target,cholesterol_bp,age_max_hr
0,1,2,0,0,0,0.0,1,0,2.049645,6880
1,0,3,0,0,0,1.0,2,1,1.118012,7644
2,1,2,0,1,0,0.0,1,0,2.160305,3626
3,0,4,0,0,1,1.5,2,1,1.539568,5184
4,1,3,0,0,0,0.0,1,0,1.291391,6588


In [16]:
x = df.drop('target', axis=1)
y = df['target']

In [17]:
x

Unnamed: 0,sex,chest pain type,fasting blood sugar,resting ecg,exercise angina,oldpeak,ST slope,cholesterol_bp,age_max_hr
0,1,2,0,0,0,0.0,1,2.049645,6880
1,0,3,0,0,0,1.0,2,1.118012,7644
2,1,2,0,1,0,0.0,1,2.160305,3626
3,0,4,0,0,1,1.5,2,1.539568,5184
4,1,3,0,0,0,0.0,1,1.291391,6588
...,...,...,...,...,...,...,...,...,...
1185,1,1,0,0,0,1.2,2,2.378378,5940
1186,1,4,1,0,0,3.4,2,1.331034,9588
1187,1,4,0,0,1,1.2,2,1.000000,6555
1188,0,2,0,2,0,0.0,2,1.801527,9918


In [18]:
x_train, x_test, y_train, y_test = train_test_split(x, y, test_size=0.20, random_state=42)

In [19]:
scl = StandardScaler()
x_train = scl.fit_transform(x_train)
x_test = scl.transform(x_test)

In [20]:
models = {
    'svc' : SVC(),
    'LogisticRegression': LogisticRegression(),
    'KNN' : KNeighborsClassifier(),
    'DT' : DecisionTreeClassifier(),
    'Random Forest' : RandomForestClassifier()
}

In [21]:
def validation(true, predicted):
    acc = accuracy_score(true, predicted)
    f1 = f1_score(true, predicted, average='weighted')
    recall = recall_score(true, predicted)
    roc = roc_auc_score(true, predicted)
    return acc, f1, recall, roc

In [22]:
for i in range(len(list(models))):
    model = list(models.values())[i]
    model.fit(x_train, y_train)
    
    y_tr_p = model.predict(x_train)
    y_te_p = model.predict(x_test)
    
    model_tr_acc, model_tr_f1, model_tr_re, model_tr_roc = validation(y_train, y_tr_p)
    model_te_acc, model_te_f1, model_te_re, model_te_roc = validation(y_test, y_te_p)
    
    print('Model selection for "Training Dataset for ->', list(models.keys())[i])
    print('Accuracy : {:.4f}'.format(model_tr_acc))
    print('F1 Score: {:.4f}'.format(model_tr_f1))
    print('Recall Score: {:.4f}'.format(model_tr_re))
    print('ROC AUC Score : {:.4f}'.format(model_tr_roc))
    
    print('-------------------------------------------------------------')
    
    print('Model selection for "Test Dataset for ->', list(models.keys())[i])
    print('Accuracy : {:.4f}'.format(model_te_acc))
    print('F1 Score: {:.4f}'.format(model_te_f1))
    print('Recall Score: {:.4f}'.format(model_te_re))
    print('ROC AUC Score : {:.4f}'.format(model_te_roc))
    
    print('-------------------------------------------------------------')
    

Model selection for "Training Dataset for -> svc
Accuracy : 0.8782
F1 Score: 0.8780
Recall Score: 0.9016
ROC AUC Score : 0.8770
-------------------------------------------------------------
Model selection for "Test Dataset for -> svc
Accuracy : 0.8782
F1 Score: 0.8775
Recall Score: 0.9237
ROC AUC Score : 0.8730
-------------------------------------------------------------
Model selection for "Training Dataset for -> LogisticRegression
Accuracy : 0.8382
F1 Score: 0.8382
Recall Score: 0.8494
ROC AUC Score : 0.8377
-------------------------------------------------------------
Model selection for "Test Dataset for -> LogisticRegression
Accuracy : 0.8613
F1 Score: 0.8613
Recall Score: 0.8779
ROC AUC Score : 0.8595
-------------------------------------------------------------
Model selection for "Training Dataset for -> KNN
Accuracy : 0.8834
F1 Score: 0.8834
Recall Score: 0.8835
ROC AUC Score : 0.8834
-------------------------------------------------------------
Model selection for "Test Da

In [23]:
rf_param_grid= {
    'max_depth': [5, 8, 15, None, 10],
    'max_features': [5, 7, 'auto', 8],  
    'min_samples_split': [2, 8, 12, 15, 20],  
    'min_samples_leaf' :  [1, 2, 4, 6, 8, 10],
    'n_estimators': [200,300, 500, 1000],
    'bootstrap': [True, False]
}


In [24]:
rand_cv_model = [('Random Forest', RandomForestClassifier(), rf_param_grid)]

In [25]:
model_param = {}
for name, model, param in rand_cv_model:
    grid = GridSearchCV(estimator=model, param_grid=rf_param_grid, cv=5, scoring='accuracy', verbose=2, n_jobs=-2)
    grid.fit(x_train, y_train)
    model_param[name] = grid.best_params_
    
for i in model_param:
    print(f'-----------------Best Parameter for {i}--------------------')
    print(model_param[i])
    

Fitting 5 folds for each of 4800 candidates, totalling 24000 fits


6000 fits failed out of a total of 24000.
The score on these train-test partitions for these parameters will be set to nan.
If these failures are not expected, you can try to debug them by setting error_score='raise'.

Below are more details about the failures:
--------------------------------------------------------------------------------
2746 fits failed with the following error:
Traceback (most recent call last):
  File "C:\Users\Hareesh\AppData\Roaming\Python\Python311\site-packages\sklearn\model_selection\_validation.py", line 866, in _fit_and_score
    estimator.fit(X_train, y_train, **fit_params)
  File "C:\Users\Hareesh\AppData\Roaming\Python\Python311\site-packages\sklearn\base.py", line 1382, in wrapper
    estimator._validate_params()
  File "C:\Users\Hareesh\AppData\Roaming\Python\Python311\site-packages\sklearn\base.py", line 436, in _validate_params
    validate_parameter_constraints(
  File "C:\Users\Hareesh\AppData\Roaming\Python\Python311\site-packages\sklearn\utils\_

-----------------Best Parameter for Random Forest--------------------
{'bootstrap': True, 'max_depth': 15, 'max_features': 5, 'min_samples_leaf': 1, 'min_samples_split': 2, 'n_estimators': 300}


In [27]:
model_1 = {
    'Random forest' :RandomForestClassifier(n_estimators=300, min_samples_split=2, max_features=5, max_depth=15, bootstrap=True, min_samples_leaf=1)
}

for i in range(len(list(model_1))):
    model = list(model_1.values())[i]
    model.fit(x_train, y_train)
    
    y_tr_p = model.predict(x_train)
    y_te_p = model.predict(x_test)
    
    model_tr_acc, model_tr_f1, model_tr_re, model_tr_roc = validation(y_train, y_tr_p)
    model_te_acc, model_te_f1, model_te_re, model_te_roc = validation(y_test, y_te_p)
    
    print('Model selection for "Training Dataset for ->', list(model_1.keys())[i])
    print('Accuracy : {:.4f}'.format(model_tr_acc))
    print('F1 Score: {:.4f}'.format(model_tr_f1))
    print('Recall Score: {:.4f}'.format(model_tr_re))
    print('ROC AUC Score : {:.4f}'.format(model_tr_roc))
    
    print('-------------------------------------------------------------')
    
    print('Model selection for "Test Dataset for ->', list(model_1.keys())[i])
    print('Accuracy : {:.4f}'.format(model_te_acc))
    print('F1 Score: {:.4f}'.format(model_te_f1))
    print('Recall Score: {:.4f}'.format(model_te_re))
    print('ROC AUC Score : {:.4f}'.format(model_te_roc))
    
    print('-------------------------------------------------------------')
    

Model selection for "Training Dataset for -> Random forest
Accuracy : 1.0000
F1 Score: 1.0000
Recall Score: 1.0000
ROC AUC Score : 1.0000
-------------------------------------------------------------
Model selection for "Test Dataset for -> Random forest
Accuracy : 0.9286
F1 Score: 0.9281
Recall Score: 0.9771
ROC AUC Score : 0.9231
-------------------------------------------------------------


In [28]:
custom_input = {
    'sex': 0,  # 1 for male, 0 for female
    'chest pain type': 3,  # Chest pain type (1-4)
    'fasting blood sugar': 0.3,  # '0' if <= 120 mg/dl, '1' if > 120 mg/dl
    'resting ecg': 0,  # Resting ECG results (0-2)
    'exercise angina': 1,  # 0 for no, 1 for yes
    'oldpeak': 0,  # ST depression induced by exercise
    'ST slope': 0,  # Slope of the peak exercise ST segment (0-2)
    'cholesterol_bp': 1,  # Cholesterol/blood pressure ratio
    'age_max_hr': 5000  # Age * max heart rate
}


inp_df = pd.DataFrame([custom_input])

inp_scl = scl.transform(inp_df)

prediction = model.predict(inp_scl)
prediction_prob = model.predict_proba(inp_scl)

print('Custom Inpute')
print('Prediction (0 = No Heart Disease, 1 = Heart Disease):', int(prediction[0]))
print('Prediction Probabilities ([No Heart Disease, Heart Disease]):', prediction_prob[0])

Custom Inpute
Prediction (0 = No Heart Disease, 1 = Heart Disease): 0
Prediction Probabilities ([No Heart Disease, Heart Disease]): [0.62 0.38]


In [29]:
import joblib
joblib.dump(model, 'rf_model.pkl')
joblib.dump(scl, 'rf_scl.pkl')

['rf_scl.pkl']