# Heart Disease Prediction using Random Forest

In [1]:
#installing Modules
# %pip install pandas numpy matplotlib scikit-learn

#importing Modules
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import accuracy_score, confusion_matrix, classification_report
import pickle

### 1. Data preparation and Data exploration

In [2]:
# Read the heart-disease records from the CSV next to this notebook,
# then preview the first five rows.
df = pd.read_csv(filepath_or_buffer='dataset.csv')
df.head(n=5)

Unnamed: 0,age,sex,cp,trestbps,chol,fbs,restecg,thalach,exang,oldpeak,slope,ca,thal,target
0,70,1,4,130,322,0,2,109,0,2.4,2,3,3,2
1,67,0,3,115,564,0,2,160,0,1.6,2,0,7,1
2,57,1,2,124,261,0,0,141,0,0.3,1,0,7,2
3,64,1,4,128,263,0,0,105,1,0.2,2,1,7,1
4,74,0,2,120,269,0,2,121,1,0.2,1,1,3,1


In [3]:
# (number of rows, number of columns) of the loaded frame
df.shape

(1389, 14)

In [4]:
# Per-column dtype and non-null count, plus the frame's memory usage
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 1389 entries, 0 to 1388
Data columns (total 14 columns):
 #   Column    Non-Null Count  Dtype  
---  ------    --------------  -----  
 0   age       1389 non-null   int64  
 1   sex       1389 non-null   int64  
 2   cp        1389 non-null   int64  
 3   trestbps  1389 non-null   int64  
 4   chol      1389 non-null   int64  
 5   fbs       1389 non-null   int64  
 6   restecg   1389 non-null   int64  
 7   thalach   1389 non-null   int64  
 8   exang     1389 non-null   int64  
 9   oldpeak   1389 non-null   float64
 10  slope     1389 non-null   int64  
 11  ca        1389 non-null   int64  
 12  thal      1389 non-null   int64  
 13  target    1389 non-null   int64  
dtypes: float64(1), int64(13)
memory usage: 152.0 KB


In [5]:
# Summary statistics (count, mean, std, min, quartiles, max) for every column
df.describe()

Unnamed: 0,age,sex,cp,trestbps,chol,fbs,restecg,thalach,exang,oldpeak,slope,ca,thal,target
count,1389.0,1389.0,1389.0,1389.0,1389.0,1389.0,1389.0,1389.0,1389.0,1389.0,1389.0,1389.0,1389.0,1389.0
mean,54.299496,0.686825,1.410367,131.37509,246.794096,0.150468,0.62563,150.00504,0.323254,1.040965,1.438445,0.704824,2.770338,1.517639
std,9.064189,0.463952,1.335637,17.448722,51.995061,0.357658,0.673354,22.878909,0.467887,1.150091,0.622574,1.001072,1.388959,0.499869
min,29.0,0.0,0.0,94.0,126.0,0.0,0.0,71.0,0.0,0.0,0.0,0.0,0.0,1.0
25%,47.0,0.0,0.0,120.0,211.0,0.0,0.0,134.0,0.0,0.0,1.0,0.0,2.0,1.0
50%,55.0,1.0,1.0,130.0,240.0,0.0,1.0,153.0,0.0,0.8,1.0,0.0,3.0,2.0
75%,61.0,1.0,2.0,140.0,275.0,0.0,1.0,168.0,1.0,1.6,2.0,1.0,3.0,2.0
max,77.0,1.0,4.0,200.0,564.0,1.0,2.0,202.0,1.0,6.2,3.0,4.0,7.0,2.0


In [6]:
# Count the missing entries in each column
df.isna().sum()

age         0
sex         0
cp          0
trestbps    0
chol        0
fbs         0
restecg     0
thalach     0
exang       0
oldpeak     0
slope       0
ca          0
thal        0
target      0
dtype: int64

### 2. Modelling

In [7]:
# Separate the predictor columns from the label column.
# `drop` returns a new frame, so `df` itself is left untouched.
features = df.drop(columns=['target'])
target = df['target']

In [8]:
# Hold out 20% of the rows for testing; the fixed seed keeps the split reproducible
X_train, X_test, y_train, y_test = train_test_split(
    features,
    target,
    test_size=0.2,
    random_state=0,
)

### Train and Evaluate model

In [9]:
# Train and evaluate model
def fit_eval_model(model, train_features, y_train, test_features, y_test):
    """
    Train a classifier and evaluate it on held-out data.

    Args:
      model: classifier exposing ``fit`` and ``predict``
      train_features: training data features
      y_train: training data labels
      test_features: test data features
      y_test: test data labels
    Return:
      results(dictionary): evaluation of the test-set predictions, with keys
        'classification_report' and 'confusion_matrix'
      model: the classifier object that was passed in, after ``fit`` has
        been called on it
    """
    # Train the model
    model.fit(train_features, y_train)

    # Only test-set predictions are needed for the metrics below
    test_predicted = model.predict(test_features)

    # Classification report and confusion matrix
    results = {
        'classification_report': classification_report(y_test, test_predicted),
        'confusion_matrix': confusion_matrix(y_test, test_predicted),
    }

    return results, model

### Random Forest

In [17]:
rf = RandomForestClassifier(random_state=1)

# fit_eval_model returns a (metrics dict, model) pair; `result` keeps the whole pair
result = fit_eval_model(rf, X_train, y_train, X_test, y_test)

# Print every metric under its own heading
metrics = result[0]
for name in metrics:
    print(name, ':\n', metrics[name])

classification_report :
               precision    recall  f1-score   support

           1       0.93      0.95      0.94       133
           2       0.95      0.93      0.94       145

    accuracy                           0.94       278
   macro avg       0.94      0.94      0.94       278
weighted avg       0.94      0.94      0.94       278

confusion_matrix :
 [[126   7]
 [ 10 135]]


### Save Model

In [18]:
# Serialize the forest to disk so it can be loaded again without retraining
with open('model.pkl', mode='wb') as model_file:
    pickle.dump(rf, model_file)

### Predicting

In [19]:
# Wrap the raw readings in a DataFrame that reuses the training column names:
# the forest was fitted on a DataFrame, and scikit-learn (1.0+) warns when
# predict() is then given unnamed lists/arrays.
sample_patients = pd.DataFrame(
    [
        [57, 1, 4, 140, 192, 0, 0, 148, 0, 0.4, 2, 0, 6],
        [48, 1, 4, 130, 256, 1, 2, 150, 1, 0, 1, 2, 7],
    ],
    columns=features.columns,
)

# Predict one patient at a time so each prediction prints on its own line
for position in range(len(sample_patients)):
    print(rf.predict(sample_patients.iloc[[position]]))

[1]
[2]


