# Heart Disease Prediction

In [95]:
import pandas as pd
import numpy as np

In [96]:
df = pd.read_csv("heart.csv")
df.head()

Unnamed: 0,Age,Sex,ChestPainType,RestingBP,Cholesterol,FastingBS,RestingECG,MaxHR,ExerciseAngina,Oldpeak,ST_Slope,HeartDisease
0,40,M,ATA,140,289,0,Normal,172,N,0.0,Up,0
1,49,F,NAP,160,180,0,Normal,156,N,1.0,Flat,1
2,37,M,ATA,130,283,0,ST,98,N,0.0,Up,0
3,48,F,ASY,138,214,0,Normal,108,Y,1.5,Flat,1
4,54,M,NAP,150,195,0,Normal,122,N,0.0,Up,0


In [None]:
import warnings
warnings.filterwarnings("ignore")

In [97]:
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 918 entries, 0 to 917
Data columns (total 12 columns):
 #   Column          Non-Null Count  Dtype  
---  ------          --------------  -----  
 0   Age             918 non-null    int64  
 1   Sex             918 non-null    object 
 2   ChestPainType   918 non-null    object 
 3   RestingBP       918 non-null    int64  
 4   Cholesterol     918 non-null    int64  
 5   FastingBS       918 non-null    int64  
 6   RestingECG      918 non-null    object 
 7   MaxHR           918 non-null    int64  
 8   ExerciseAngina  918 non-null    object 
 9   Oldpeak         918 non-null    float64
 10  ST_Slope        918 non-null    object 
 11  HeartDisease    918 non-null    int64  
dtypes: float64(1), int64(6), object(5)
memory usage: 86.2+ KB


In [98]:
df.isna().sum()

Age               0
Sex               0
ChestPainType     0
RestingBP         0
Cholesterol       0
FastingBS         0
RestingECG        0
MaxHR             0
ExerciseAngina    0
Oldpeak           0
ST_Slope          0
HeartDisease      0
dtype: int64

In [99]:
from sklearn.preprocessing import LabelEncoder
le = LabelEncoder()

In [100]:
np.unique(df["RestingECG"])

array(['LVH', 'Normal', 'ST'], dtype=object)

In [101]:
np.unique(df["ExerciseAngina"])

array(['N', 'Y'], dtype=object)

In [102]:
np.unique(df["ST_Slope"])

array(['Down', 'Flat', 'Up'], dtype=object)

In [103]:
df["Sex"] = le.fit_transform(df["Sex"])
df["ExerciseAngina"] = le.fit_transform(df["ExerciseAngina"])
df.head()

Unnamed: 0,Age,Sex,ChestPainType,RestingBP,Cholesterol,FastingBS,RestingECG,MaxHR,ExerciseAngina,Oldpeak,ST_Slope,HeartDisease
0,40,1,ATA,140,289,0,Normal,172,0,0.0,Up,0
1,49,0,NAP,160,180,0,Normal,156,0,1.0,Flat,1
2,37,1,ATA,130,283,0,ST,98,0,0.0,Up,0
3,48,0,ASY,138,214,0,Normal,108,1,1.5,Flat,1
4,54,1,NAP,150,195,0,Normal,122,0,0.0,Up,0


In [104]:
np.unique(df["ExerciseAngina"])

array([0, 1])

In [105]:
df = pd.get_dummies(df, drop_first=True, dtype = int)
df.head()

Unnamed: 0,Age,Sex,RestingBP,Cholesterol,FastingBS,MaxHR,ExerciseAngina,Oldpeak,HeartDisease,ChestPainType_ATA,ChestPainType_NAP,ChestPainType_TA,RestingECG_Normal,RestingECG_ST,ST_Slope_Flat,ST_Slope_Up
0,40,1,140,289,0,172,0,0.0,0,1,0,0,1,0,0,1
1,49,0,160,180,0,156,0,1.0,1,0,1,0,1,0,1,0
2,37,1,130,283,0,98,0,0.0,0,1,0,0,0,1,0,1
3,48,0,138,214,0,108,1,1.5,1,0,0,0,1,0,1,0
4,54,1,150,195,0,122,0,0.0,0,0,1,0,1,0,0,1


In [106]:
X = df.drop(["HeartDisease"], axis = 1)
y = df["HeartDisease"]

In [107]:
X.head()

Unnamed: 0,Age,Sex,RestingBP,Cholesterol,FastingBS,MaxHR,ExerciseAngina,Oldpeak,ChestPainType_ATA,ChestPainType_NAP,ChestPainType_TA,RestingECG_Normal,RestingECG_ST,ST_Slope_Flat,ST_Slope_Up
0,40,1,140,289,0,172,0,0.0,1,0,0,1,0,0,1
1,49,0,160,180,0,156,0,1.0,0,1,0,1,0,1,0
2,37,1,130,283,0,98,0,0.0,1,0,0,0,1,0,1
3,48,0,138,214,0,108,1,1.5,0,0,0,1,0,1,0
4,54,1,150,195,0,122,0,0.0,0,1,0,1,0,0,1


In [108]:
X.shape

(918, 15)

In [109]:
y[:5]

0    0
1    1
2    0
3    1
4    0
Name: HeartDisease, dtype: int64

In [110]:
from sklearn.preprocessing import StandardScaler
scaler = StandardScaler()

X_scaled = scaler.fit_transform(X)
X_scaled[:5]

array([[-1.4331398 ,  0.51595242,  0.41090889,  0.82507026, -0.55134134,
         1.38292822, -0.8235563 , -0.83243239,  2.07517671, -0.53283777,
        -0.22967867,  0.81427482, -0.49044933, -1.00218103,  1.15067399],
       [-0.47848359, -1.93816322,  1.49175234, -0.17196105, -0.55134134,
         0.75415714, -0.8235563 ,  0.10566353, -0.48188667,  1.87674385,
        -0.22967867,  0.81427482, -0.49044933,  0.99782372, -0.86905588],
       [-1.75135854,  0.51595242, -0.12951283,  0.7701878 , -0.55134134,
        -1.52513802, -0.8235563 , -0.83243239,  2.07517671, -0.53283777,
        -0.22967867, -1.22808661,  2.03894663, -1.00218103,  1.15067399],
       [-0.5845565 , -1.93816322,  0.30282455,  0.13903954, -0.55134134,
        -1.13215609,  1.21424608,  0.57471149, -0.48188667, -0.53283777,
        -0.22967867,  0.81427482, -0.49044933,  0.99782372, -0.86905588],
       [ 0.05188098,  0.51595242,  0.95133062, -0.0347549 , -0.55134134,
        -0.5819814 , -0.8235563 , -0.83243239, 

In [111]:
from sklearn.model_selection import train_test_split
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size = 0.2, random_state = 42)

In [112]:
len(X_train)

734

In [113]:
len(X_test)

184

# Model Selection

In [115]:
from sklearn.svm import SVC
from sklearn.linear_model import LogisticRegression
from sklearn.ensemble import RandomForestClassifier
from sklearn.naive_bayes import GaussianNB
from sklearn.naive_bayes import MultinomialNB
from sklearn.tree import DecisionTreeClassifier

In [116]:
model_params = {
    "logistic_regression" : {
        "model" : LogisticRegression(),
        "params" : {
            "C" : [0.1, 1, 10]
        }
    },
    "svc" : {
        "model" : SVC(gamma = "auto"),
        "params" : {
            "C" : [0.1, 1, 10],
            "kernel" : ["linear", "rbf"]
        }
    },
    "random_forest" : {
        "model" : RandomForestClassifier(),
        "params" : {
            "n_estimators" : [10,50,100]
        }
    },
    "decision_tree" : {
        "model" : DecisionTreeClassifier(),
        "params" : {
            "max_depth" : [None, 5, 10]
        }
    },
    "multinomialNB" : {
        "model" : MultinomialNB(),
        "params" : {
            "alpha" : [0.01, 0.1, 1, 10]
        }
    },
    "gaussianNB" : {
        "model" : GaussianNB(),
        "params" : {
            "var_smoothing" : [1e-8, 1e-9, 1e-10]
        }
    }
}

In [117]:
from sklearn.model_selection import GridSearchCV
from sklearn.model_selection import cross_val_score

In [118]:
scores = []

for model, model_param in model_params.items():
    clf = GridSearchCV(model_param["model"], model_param["params"], cv = 5)
    clf.fit(X_train, y_train)
    scores.append({
        "model_name" : model,
        "best_parameter" : clf.best_params_,
        "best_score" : clf.best_score_
    })
scores

STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
  n_iter_i = _check_optimize_result(
STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
  n_iter_i = _check_optimize_result(
STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver opt

ValueError: 
All the 20 fits failed.
It is very likely that your model is misconfigured.
You can try to debug the error by setting error_score='raise'.

Below are more details about the failures:
--------------------------------------------------------------------------------
20 fits failed with the following error:
Traceback (most recent call last):
  File "C:\Users\manas\anaconda3\Lib\site-packages\sklearn\model_selection\_validation.py", line 888, in _fit_and_score
    estimator.fit(X_train, y_train, **fit_params)
  File "C:\Users\manas\anaconda3\Lib\site-packages\sklearn\base.py", line 1473, in wrapper
    return fit_method(estimator, *args, **kwargs)
           ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
  File "C:\Users\manas\anaconda3\Lib\site-packages\sklearn\naive_bayes.py", line 759, in fit
    self._count(X, Y)
  File "C:\Users\manas\anaconda3\Lib\site-packages\sklearn\naive_bayes.py", line 881, in _count
    check_non_negative(X, "MultinomialNB (input X)")
  File "C:\Users\manas\anaconda3\Lib\site-packages\sklearn\utils\validation.py", line 1689, in check_non_negative
    raise ValueError("Negative values in data passed to %s" % whom)
ValueError: Negative values in data passed to MultinomialNB (input X)
