<a href="https://colab.research.google.com/github/Mann-tech13/Heart-diseases-prediction/blob/master/heart_diseases_pred.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [None]:
!pip install -q kaggle

In [None]:
from google.colab import files

# Prompt for the Kaggle API token file (kaggle.json).
# files.upload() returns a dict of {filename: raw file bytes}; keep it in a
# variable instead of leaving it as the cell's last expression, otherwise the
# token contents are echoed into the saved notebook output.
uploaded = files.upload()

Saving kaggle.json to kaggle.json


{'kaggle.json': b'{"username":"mann1234","key":"<REDACTED>"}'}

In [None]:
!mkdir -p ~/ .kaggle
!cp -r kaggle.json ~/ .kaggle/
!chmod 600 ~/ .kaggle/kaggle.json

In [None]:
!mv .kaggle /root/
! kaggle datasets download -d fedesoriano/heart-failure-prediction

Downloading heart-failure-prediction.zip to /content
  0% 0.00/8.56k [00:00<?, ?B/s]
100% 8.56k/8.56k [00:00<00:00, 17.8MB/s]


In [None]:
!unzip heart-failure-prediction.zip

Archive:  heart-failure-prediction.zip
  inflating: heart.csv               


In [None]:
import pandas as pd
from sklearn.preprocessing import LabelEncoder, StandardScaler
from sklearn.model_selection import train_test_split, GridSearchCV
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import confusion_matrix,accuracy_score

In [None]:
# Load the extracted dataset into a DataFrame.
df = pd.read_csv(filepath_or_buffer='heart.csv')

In [None]:
# Preview the first six rows; head() returns the first n rows (5 when n is omitted).
df.head(6)

Unnamed: 0,Age,Sex,ChestPainType,RestingBP,Cholesterol,FastingBS,RestingECG,MaxHR,ExerciseAngina,Oldpeak,ST_Slope,HeartDisease
0,40,M,ATA,140,289,0,Normal,172,N,0.0,Up,0
1,49,F,NAP,160,180,0,Normal,156,N,1.0,Flat,1
2,37,M,ATA,130,283,0,ST,98,N,0.0,Up,0
3,48,F,ASY,138,214,0,Normal,108,Y,1.5,Flat,1
4,54,M,NAP,150,195,0,Normal,122,N,0.0,Up,0
5,39,M,NAP,120,339,0,Normal,170,N,0.0,Up,0


In [None]:
# Print a summary of the DataFrame: index range, column names,
# non-null counts, dtypes and memory usage.
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 918 entries, 0 to 917
Data columns (total 12 columns):
 #   Column          Non-Null Count  Dtype  
---  ------          --------------  -----  
 0   Age             918 non-null    int64  
 1   Sex             918 non-null    object 
 2   ChestPainType   918 non-null    object 
 3   RestingBP       918 non-null    int64  
 4   Cholesterol     918 non-null    int64  
 5   FastingBS       918 non-null    int64  
 6   RestingECG      918 non-null    object 
 7   MaxHR           918 non-null    int64  
 8   ExerciseAngina  918 non-null    object 
 9   Oldpeak         918 non-null    float64
 10  ST_Slope        918 non-null    object 
 11  HeartDisease    918 non-null    int64  
dtypes: float64(1), int64(6), object(5)
memory usage: 86.2+ KB


In [None]:
# One-hot encode the multi-category columns: each listed column is replaced by
# one indicator column per category (e.g. ST_Slope -> ST_Slope_Down/Flat/Up).
# get_dummies removes the source columns and the result is assigned back to
# `df`, so only columns that are still present are encoded; this keeps the cell
# safe to run more than once instead of failing on the missing columns.
categorical_columns = ['RestingECG', 'ChestPainType', 'ST_Slope']
columns_to_encode = [col for col in categorical_columns if col in df.columns]
if columns_to_encode:
    df = pd.get_dummies(df, columns=columns_to_encode)
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 918 entries, 0 to 917
Data columns (total 19 columns):
 #   Column             Non-Null Count  Dtype  
---  ------             --------------  -----  
 0   Age                918 non-null    int64  
 1   Sex                918 non-null    object 
 2   RestingBP          918 non-null    int64  
 3   Cholesterol        918 non-null    int64  
 4   FastingBS          918 non-null    int64  
 5   MaxHR              918 non-null    int64  
 6   ExerciseAngina     918 non-null    object 
 7   Oldpeak            918 non-null    float64
 8   HeartDisease       918 non-null    int64  
 9   RestingECG_LVH     918 non-null    uint8  
 10  RestingECG_Normal  918 non-null    uint8  
 11  RestingECG_ST      918 non-null    uint8  
 12  ChestPainType_ASY  918 non-null    uint8  
 13  ChestPainType_ATA  918 non-null    uint8  
 14  ChestPainType_NAP  918 non-null    uint8  
 15  ChestPainType_TA   918 non-null    uint8  
 16  ST_Slope_Down      918 non

In [None]:
# Label-encode the two-valued text columns as integers.
# Example: Sex 'M' -> 1, 'F' -> 0.
binary_columns = ('Sex', 'ExerciseAngina')
for column in binary_columns:
    encoder = LabelEncoder()  # fresh encoder per column
    df[column] = encoder.fit_transform(df[column])

In [None]:
# Preview the first rows to check the encoded columns.
df.head()

Unnamed: 0,Age,Sex,RestingBP,Cholesterol,FastingBS,MaxHR,ExerciseAngina,Oldpeak,HeartDisease,RestingECG_LVH,RestingECG_Normal,RestingECG_ST,ChestPainType_ASY,ChestPainType_ATA,ChestPainType_NAP,ChestPainType_TA,ST_Slope_Down,ST_Slope_Flat,ST_Slope_Up
0,40,1,140,289,0,172,0,0.0,0,0,1,0,0,1,0,0,0,0,1
1,49,0,160,180,0,156,0,1.0,1,0,1,0,0,0,1,0,0,1,0
2,37,1,130,283,0,98,0,0.0,0,0,0,1,0,1,0,0,0,0,1
3,48,0,138,214,0,108,1,1.5,1,0,1,0,1,0,0,0,0,1,0
4,54,1,150,195,0,122,0,0.0,0,0,1,0,0,0,1,0,0,0,1


In [None]:
# Separate the target (Y) from the feature matrix (X).
target_column = 'HeartDisease'
Y = df[target_column]
X = df.drop(columns=[target_column])

In [None]:
# Hold out 20% of the rows for testing; stratify on the target so both splits
# keep the same class balance, and fix the seed so the split is repeatable.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    Y,
    test_size=0.2,
    random_state=48,
    stratify=Y,
)

In [None]:
# Display the training features (the target column is not part of X).
X_train

Unnamed: 0,Age,Sex,RestingBP,Cholesterol,FastingBS,MaxHR,ExerciseAngina,Oldpeak,RestingECG_LVH,RestingECG_Normal,RestingECG_ST,ChestPainType_ASY,ChestPainType_ATA,ChestPainType_NAP,ChestPainType_TA,ST_Slope_Down,ST_Slope_Flat,ST_Slope_Up
216,39,1,110,280,0,150,0,0.0,0,1,0,1,0,0,0,0,1,0
647,37,0,120,215,0,170,0,0.0,0,1,0,0,0,1,0,0,0,1
812,54,0,110,214,0,158,0,1.6,0,1,0,0,0,1,0,0,1,0
523,59,1,124,160,0,117,1,1.0,0,1,0,1,0,0,0,0,1,0
745,63,0,108,269,0,169,1,1.8,0,1,0,1,0,0,0,0,1,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
280,60,1,120,246,0,135,0,0.0,1,0,0,0,0,1,0,0,0,1
847,45,1,115,260,0,185,0,0.0,1,0,0,1,0,0,0,0,0,1
19,36,1,120,267,0,160,0,3.0,0,1,0,0,1,0,0,0,1,0
574,69,1,142,210,1,112,1,1.5,0,0,1,1,0,0,0,0,1,0


In [None]:
# Display the training labels; they share their row index with X_train.
y_train

216    1
647    0
812    0
523    1
745    1
      ..
280    0
847    0
19     1
574    1
468    1
Name: HeartDisease, Length: 734, dtype: int64

In [None]:
# Display the held-out test features.
X_test

Unnamed: 0,Age,Sex,RestingBP,Cholesterol,FastingBS,MaxHR,ExerciseAngina,Oldpeak,RestingECG_LVH,RestingECG_Normal,RestingECG_ST,ChestPainType_ASY,ChestPainType_ATA,ChestPainType_NAP,ChestPainType_TA,ST_Slope_Down,ST_Slope_Flat,ST_Slope_Up
71,44,1,130,215,0,135,0,0.0,0,1,0,0,1,0,0,0,0,1
629,57,0,128,303,0,159,0,0.0,1,0,0,1,0,0,0,0,0,1
818,51,1,140,299,0,173,1,1.6,0,1,0,1,0,0,0,0,0,1
527,61,1,139,283,0,135,0,0.3,0,1,0,0,1,0,0,0,0,1
103,40,1,120,466,1,152,1,1.0,0,1,0,1,0,0,0,0,1,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
101,51,1,130,179,0,100,0,0.0,0,1,0,1,0,0,0,0,0,1
415,66,0,155,0,1,90,0,0.0,0,1,0,1,0,0,0,0,1,0
744,60,1,117,230,1,160,1,1.4,0,1,0,1,0,0,0,0,0,1
203,31,0,100,219,0,150,0,0.0,0,0,1,0,1,0,0,0,0,1


In [None]:
# Display the held-out test labels; they share their row index with X_test.
y_test

71     0
629    0
818    1
527    0
103    1
      ..
101    0
415    1
744    1
203    0
368    1
Name: HeartDisease, Length: 184, dtype: int64

In [None]:
# Tune a random forest with an exhaustive grid search over forest size and
# tree depth, scored by 10-fold cross-validation on the training split.
# random_state is fixed (same seed as the train/test split) so the selected
# hyper-parameters and the metrics below are repeatable between runs.
rf = RandomForestClassifier(random_state=48)
param_grid = {
                 'n_estimators': [10, 50, 100, 500, 1000],
                 'max_depth': [2, 5, 7, 9, 10]
             }
# 25 parameter combinations x 10 folds = 250 fits; n_jobs=-1 spreads them over
# all available cores without changing the results.
grid_rf = GridSearchCV(rf, param_grid, cv=10, n_jobs=-1)
grid_rf.fit(X_train, y_train)

GridSearchCV(cv=10, error_score=nan,
             estimator=RandomForestClassifier(bootstrap=True, ccp_alpha=0.0,
                                              class_weight=None,
                                              criterion='gini', max_depth=None,
                                              max_features='auto',
                                              max_leaf_nodes=None,
                                              max_samples=None,
                                              min_impurity_decrease=0.0,
                                              min_impurity_split=None,
                                              min_samples_leaf=1,
                                              min_samples_split=2,
                                              min_weight_fraction_leaf=0.0,
                                              n_estimators=100, n_jobs=None,
                                              oob_score=False,
                                              rand

In [None]:
# Show the estimator selected by the cross-validated grid search.
grid_rf.best_estimator_

RandomForestClassifier(bootstrap=True, ccp_alpha=0.0, class_weight=None,
                       criterion='gini', max_depth=9, max_features='auto',
                       max_leaf_nodes=None, max_samples=None,
                       min_impurity_decrease=0.0, min_impurity_split=None,
                       min_samples_leaf=1, min_samples_split=2,
                       min_weight_fraction_leaf=0.0, n_estimators=100,
                       n_jobs=None, oob_score=False, random_state=None,
                       verbose=0, warm_start=False)

In [None]:
# Predict the held-out labels with the model chosen by the grid search
# (GridSearchCV.predict delegates to this refitted best estimator).
best_model = grid_rf.best_estimator_
y_pred = best_model.predict(X_test)

In [None]:
# Confusion matrix on the test split: rows are true classes, columns are predictions.
confusion_matrix(y_true=y_test, y_pred=y_pred)

array([[70, 12],
       [ 8, 94]])

In [None]:
# Fraction of test rows whose predicted label matches the true label.
accuracy_score(y_true=y_test, y_pred=y_pred)

0.8913043478260869