<a href="https://colab.research.google.com/github/LucasMelvin15/Machine-Learning-Projects/blob/main/03_Heart_Disease_Classifier(sklearn).ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

# Random Forest Classifier for Heart Disease

The heart disease dataset provides a comprehensive look into cardiovascular health by analyzing various key attributes, including age, sex, chest pain type (cp), resting blood pressure (trestbps), cholesterol levels (chol), fasting blood sugar (fbs), resting electrocardiographic results (restecg), maximum heart rate achieved (thalach), exercise-induced angina (exang), ST depression induced by exercise relative to rest (oldpeak), the slope of the peak exercise ST segment (slope), number of major vessels (ca), and thalassemia (thal).

In [1]:
# imports
import pandas as pd
import numpy as np
import sklearn

## 1. Get data ready

In [2]:
from google.colab import files
uploaded = files.upload()


Saving heart-disease.csv to heart-disease.csv


In [3]:
uploaded.keys()

dict_keys(['heart-disease.csv'])

In [4]:
heart_disease = pd.read_csv('heart-disease.csv')
heart_disease.head()

Unnamed: 0,age,sex,cp,trestbps,chol,fbs,restecg,thalach,exang,oldpeak,slope,ca,thal,target
0,63,1,3,145,233,1,0,150,0,2.3,0,0,1,1
1,37,1,2,130,250,0,1,187,0,3.5,0,0,2,1
2,41,0,1,130,204,0,0,172,0,1.4,2,0,2,1
3,56,1,1,120,236,0,1,178,0,0.8,2,0,2,1
4,57,0,0,120,354,0,1,163,1,0.6,2,0,2,1


In [5]:
# Create X(feature columns)
X = heart_disease.drop("target", axis=1)
X.head()

Unnamed: 0,age,sex,cp,trestbps,chol,fbs,restecg,thalach,exang,oldpeak,slope,ca,thal
0,63,1,3,145,233,1,0,150,0,2.3,0,0,1
1,37,1,2,130,250,0,1,187,0,3.5,0,0,2
2,41,0,1,130,204,0,0,172,0,1.4,2,0,2
3,56,1,1,120,236,0,1,178,0,0.8,2,0,2
4,57,0,0,120,354,0,1,163,1,0.6,2,0,2


In [8]:
# create the target column
y = heart_disease['target']
y.head(), y.value_counts()

(0    1
 1    1
 2    1
 3    1
 4    1
 Name: target, dtype: int64,
 target
 1    165
 0    138
 Name: count, dtype: int64)

## 2. Choose a model and hyperparameters

In [9]:
from sklearn.ensemble import RandomForestClassifier
#instanciate the class
clf = RandomForestClassifier()
clf.get_params()

{'bootstrap': True,
 'ccp_alpha': 0.0,
 'class_weight': None,
 'criterion': 'gini',
 'max_depth': None,
 'max_features': 'sqrt',
 'max_leaf_nodes': None,
 'max_samples': None,
 'min_impurity_decrease': 0.0,
 'min_samples_leaf': 1,
 'min_samples_split': 2,
 'min_weight_fraction_leaf': 0.0,
 'n_estimators': 100,
 'n_jobs': None,
 'oob_score': False,
 'random_state': None,
 'verbose': 0,
 'warm_start': False}

## 3. Fit the model to the training data

In [10]:
# Split the data into training and test sets
from sklearn.model_selection import train_test_split
X_train, X_test, y_train, y_test = train_test_split(X,
                                                    y,
                                                    test_size=0.20) # 20% test data

X_train.shape, X_test.shape, y_train.shape, y_test.shape

((242, 13), (61, 13), (242,), (61,))

In [11]:
# Fit the model to the training data
clf.fit(X_train, y_train)

In [13]:
X_test[:10]

Unnamed: 0,age,sex,cp,trestbps,chol,fbs,restecg,thalach,exang,oldpeak,slope,ca,thal
183,58,1,2,112,230,0,0,165,0,2.5,1,1,3
172,58,1,1,120,284,0,0,160,0,1.8,1,0,2
65,35,0,0,138,183,0,1,182,0,1.4,2,0,2
193,60,1,0,145,282,0,0,142,1,2.8,1,2,3
101,59,1,3,178,270,0,0,145,0,4.2,0,0,3
220,63,0,0,150,407,0,0,154,0,4.0,1,3,3
86,68,1,2,118,277,0,1,151,0,1.0,2,1,3
182,61,0,0,130,330,0,0,169,0,0.0,2,0,2
126,47,1,0,112,204,0,1,143,0,0.1,2,0,2
253,67,1,0,100,299,0,0,125,1,0.9,1,2,2


In [14]:
# Make a prediction
y_preds = clf.predict(X_test) # it should predict the target
y_preds

array([0, 1, 1, 0, 0, 0, 0, 1, 1, 0, 1, 0, 1, 1, 0, 0, 1, 0, 0, 1, 1, 1,
       1, 0, 1, 1, 0, 1, 1, 1, 1, 0, 1, 1, 1, 0, 0, 1, 1, 1, 1, 0, 1, 1,
       1, 0, 1, 1, 0, 0, 0, 1, 1, 0, 1, 0, 1, 0, 0, 0, 1])

## 4. Evaluate the model

In [15]:
# Evaluate the model on training data
clf.score(X_train, y_train) # it shows how well it has learned on the training data

1.0

In [16]:
# Evaluate the model on testing data
clf.score(X_test, y_test)

0.819672131147541

In [17]:
from sklearn.metrics import classification_report, confusion_matrix, accuracy_score

# create a classification report
print(classification_report(y_test, y_preds))

              precision    recall  f1-score   support

           0       0.88      0.74      0.81        31
           1       0.77      0.90      0.83        30

    accuracy                           0.82        61
   macro avg       0.83      0.82      0.82        61
weighted avg       0.83      0.82      0.82        61

