# Heart disease prediction 

In [1]:
import pandas as pd
import numpy as np
from sklearn.ensemble import RandomForestClassifier

# load dataset

In [2]:
data=pd.read_csv("heart.csv")

# check if there is any missing values

In [3]:
data.head()

Unnamed: 0,age,sex,cp,trestbps,chol,fbs,restecg,thalach,exang,oldpeak,slope,ca,thal,target
0,63,1,3,145,233,1,0,150,0,2.3,0,0,1,1
1,37,1,2,130,250,0,1,187,0,3.5,0,0,2,1
2,41,0,1,130,204,0,0,172,0,1.4,2,0,2,1
3,56,1,1,120,236,0,1,178,0,0.8,2,0,2,1
4,57,0,0,120,354,0,1,163,1,0.6,2,0,2,1


In [4]:
data.isnull().sum()

age         0
sex         0
cp          0
trestbps    0
chol        0
fbs         0
restecg     0
thalach     0
exang       0
oldpeak     0
slope       0
ca          0
thal        0
target      0
dtype: int64

In [5]:
#get target data
y=data['target']

#load X variable into a Pandas DataFrame with columns
X=data.drop(['target'],axis=1)

In [6]:
print(f'X:{X.shape}')

X:(303, 13)


# divide the data into train and test set

In [7]:
from sklearn.model_selection import train_test_split
X_train,X_test,y_train,y_test=train_test_split(X,y,test_size=0.20,random_state=None)

In [8]:
print(f'X_train:{X_train.shape}')
print(f'X_test:{X_test.shape}')
print(f'y_train:{y_train.shape}')
print(f'y_test:{y_test.shape}')

X_train:(242, 13)
X_test:(61, 13)
y_train:(242,)
y_test:(61,)


# Build Random Forest Model with hyperparameters

In [9]:
# number of trees in random forest
n_estimators=[int(x) for x in np.linspace(start=10,stop=80,num=10)]
# number of features to consider at every split
max_features=['auto','sqrt']
#maximum number of levels in tree
max_depth=[2,4,6,8,10,12]
#minimum number of samples required to split a node
min_samples_split=[2,5]
#minimum samples required at each leaf node
min_samples_leaf=[1,2]
#method of selecting samples for training each tree
boostrap=[True,False]

In [10]:
#create a param grid
param_grid={'n_estimators':n_estimators,
            'max_features':max_features,
            'max_depth':max_depth,
             'min_samples_split':min_samples_split,
             'min_samples_leaf':min_samples_leaf,
             'bootstrap':boostrap}
print(param_grid)

{'n_estimators': [10, 17, 25, 33, 41, 48, 56, 64, 72, 80], 'max_features': ['auto', 'sqrt'], 'max_depth': [2, 4, 6, 8, 10, 12], 'min_samples_split': [2, 5], 'min_samples_leaf': [1, 2], 'bootstrap': [True, False]}


In [11]:
rf=RandomForestClassifier()

# grid search CV

In [12]:
from sklearn.model_selection import GridSearchCV
rf_Grid=GridSearchCV(estimator=rf,param_grid=param_grid,cv=3,verbose=2,n_jobs=4)

In [13]:
rf_Grid.fit(X_train,y_train)
import warnings
warnings.filterwarnings('ignore')

Fitting 3 folds for each of 960 candidates, totalling 2880 fits


  warn(


In [14]:
rf_Grid.best_params_

{'bootstrap': True,
 'max_depth': 10,
 'max_features': 'auto',
 'min_samples_leaf': 2,
 'min_samples_split': 2,
 'n_estimators': 41}

# check accuracy

In [15]:
print(f'Train Accuracy - : {rf_Grid.score(X_train,y_train):.3f}')
print(f'Test Accuracy - : {rf_Grid.score(X_test,y_test):.3f}')

Train Accuracy - : 0.975
Test Accuracy - : 0.820


# randomised search cv

In [16]:
from sklearn.model_selection import RandomizedSearchCV
rf_RandomGrid=RandomizedSearchCV(estimator=rf,param_distributions=param_grid,cv=3,verbose=2,n_jobs=4)

In [17]:
rf_RandomGrid.fit(X_train,y_train)

Fitting 3 folds for each of 10 candidates, totalling 30 fits


In [18]:
rf_RandomGrid.best_params_

{'n_estimators': 25,
 'min_samples_split': 2,
 'min_samples_leaf': 1,
 'max_features': 'sqrt',
 'max_depth': 6,
 'bootstrap': True}

In [19]:
print(f'Train Accuracy - : {rf_RandomGrid.score(X_train,y_train):.3f}')
print(f'Test Accuracy - : {rf_RandomGrid.score(X_test,y_test):.3f}')

Train Accuracy - : 0.979
Test Accuracy - : 0.836


In [20]:
temp=pd.DataFrame(rf_Grid.predict_proba(X_test).tolist(),columns=rf_Grid.classes_)

In [21]:
temp['max_prob']=temp.max(axis=1)

In [22]:
temp

Unnamed: 0,0,1,max_prob
0,0.109079,0.890921,0.890921
1,0.113821,0.886179,0.886179
2,0.249729,0.750271,0.750271
3,0.301394,0.698606,0.698606
4,0.487398,0.512602,0.512602
...,...,...,...
56,0.770325,0.229675,0.770325
57,0.775407,0.224593,0.775407
58,0.000000,1.000000,1.000000
59,0.843496,0.156504,0.843496


In [23]:
idx=temp[temp.max_prob>=0.90].index

In [24]:
import pickle

In [25]:
filename='heart_disease.sav'
pickle.dump(rf_Grid,open(filename,'wb'))

In [26]:
loaded_model=pickle.load(open('heart_disease.sav','rb'))