In [None]:
# Machine Learning Analysis | Random Forest

In [2]:
# import dependencies
import pandas as pd
import numpy as np
from sklearn.ensemble import RandomForestClassifier
from sklearn.preprocessing import StandardScaler
from sklearn.model_selection import train_test_split
from sklearn.metrics import confusion_matrix, accuracy_score, classification_report

In [3]:
# Load the cardio dataset and discard the leftover "Unnamed: 0" column
# (presumably an index written out by an earlier to_csv -- confirm).
df = pd.read_csv("cardio_df_ultimate.csv").drop(columns=["Unnamed: 0"])
df.head(3)

Unnamed: 0,age_years,gender,height,weight,systolic_bp,diastolic_bp,cholesterol,gluc,smoke,alco,active,cardio,BMI
0,50,2,168,62.0,110,80,1,1,0,0,1,0,21.97
1,55,1,156,85.0,140,90,3,1,0,0,1,1,34.93
2,51,1,165,64.0,130,70,3,1,0,0,0,1,23.51


In [4]:
# Feature matrix: every column except the "cardio" target
X = df.drop(columns=["cardio"])
X.head(3)

Unnamed: 0,age_years,gender,height,weight,systolic_bp,diastolic_bp,cholesterol,gluc,smoke,alco,active,BMI
0,50,2,168,62.0,110,80,1,1,0,0,1,21.97
1,55,1,156,85.0,140,90,3,1,0,0,1,34.93
2,51,1,165,64.0,130,70,3,1,0,0,0,23.51


In [6]:
# Define target: the "cardio" column as a flat NumPy array.
# Series.ravel() is deprecated (pandas >= 2.2); to_numpy() yields the same
# 1-D ndarray without the deprecation warning.
y = df["cardio"].to_numpy()
print(y)

[0 1 1 ... 1 1 0]


In [7]:
# Partition features/target into train and test subsets.
# test_size is left at scikit-learn's default; the fixed seed makes the
# split repeatable across runs.
X_train, X_test, y_train, y_test = train_test_split(X, y, random_state=13)


In [8]:
# Standardize the features. The scaler is fitted on the training split only,
# so no statistics from the test split leak into preprocessing.
scaler = StandardScaler()
X_scaler = scaler.fit(X_train)

# Apply the training-split scaling to both splits
X_train_scaled, X_test_scaled = (
    X_scaler.transform(split) for split in (X_train, X_test)
)

In [9]:
# Instantiate the (not yet fitted) random forest: 128 trees, seeded so the
# trained model is repeatable.
random_forest = RandomForestClassifier(
    n_estimators=128,
    random_state=13,
)

In [10]:
# Train the forest on the scaled training features.
# fit() returns the estimator itself, so the name keeps pointing at it.
random_forest = random_forest.fit(X=X_train_scaled, y=y_train)

In [12]:
# Predict the class of every row in the scaled test features
predictions = random_forest.predict(X_test_scaled)
predictions

array([0, 0, 0, ..., 0, 1, 0])

In [15]:
# Evaluate the model with a confusion matrix

# Class encoding: 0 = CVD not present, 1 = CVD present.
# The label order is pinned explicitly so the rows/columns always line up
# with the hard-coded names below, and the matrix stays 2x2 even if one
# class happens to be absent from y_test or predictions.
cm = confusion_matrix(y_test, predictions, labels=[0, 1])

# Rows are the true classes, columns are the predicted classes
cm_df = pd.DataFrame(
    cm,
    index=["No CVD (0)", "Has CVD (1)"],
    columns=["Predicted No CVD (0)", "Predicted CVD (1)"],
)
cm_df

Unnamed: 0,Predicted No CVD (0),Predicted CVD (1)
No CVD (0),5708,2369
Has CVD (1),2412,5589


In [16]:
# Confusion Matrix Information

# Total number of evaluated samples, read from the matrix itself rather than
# from retyped cell counts, so it stays correct if the data, the split or the
# model changes.
n_samples = cm.sum()
print(n_samples)

# Values from the recorded run (n = 16078); right column is the share of n:
# TN = 5708 | .355
# FP = 2369 | .147
# FN = 2412 | .150
# TP = 5589 | .348

# Accuracy Score  -> (TN + TP) / n  = .703
# Precision Score -> TP / (FP + TP) = .702

16078


In [17]:
# Overall accuracy: fraction of test rows whose predicted class matches y_test
acc_score = accuracy_score(y_test, predictions)
print(f"Accuracy Score :  {acc_score}")

Accuracy Score :  0.7026371439233735


In [21]:
# Results summary: confusion matrix, accuracy, and per-class report

# Heading spelling corrected ("Martix" -> "Matrix"); it is printed verbatim.
print("Random Forest Confusion Matrix")
# display() is supplied by IPython in a notebook session; it renders the
# DataFrame as a rich table instead of plain text.
display(cm_df)

print(f"Accuracy Score :  {acc_score}")

# Precision / recall / f1 / support for each class on the test split
print("Classification Report")
print(classification_report(y_test, predictions))

Random Forest Confusion Martix


Unnamed: 0,Predicted No CVD (0),Predicted CVD (1)
No CVD (0),5708,2369
Has CVD (1),2412,5589


Accuracy Score :  0.7026371439233735
Classification Report
              precision    recall  f1-score   support

           0       0.70      0.71      0.70      8077
           1       0.70      0.70      0.70      8001

    accuracy                           0.70     16078
   macro avg       0.70      0.70      0.70     16078
weighted avg       0.70      0.70      0.70     16078



In [22]:
# Calculating Feature Importance
# Read the importance scores stored on the fitted forest. The array has one
# entry per feature; a later cell pairs the entries with X.columns, so they
# are presumably in X's column order -- confirm.

importances = random_forest.feature_importances_
importances

array([0.16268   , 0.0208438 , 0.13495387, 0.1409506 , 0.17080753,
       0.0827572 , 0.04046914, 0.01941616, 0.01015463, 0.00814345,
       0.01643137, 0.19239225])

In [24]:
print("Random Forest Model Ranked Feature Importances")
# Pair each importance score with its column name, then order the pairs from
# most to least important (ties fall back to comparing the column names).
ranked_importances = list(zip(random_forest.feature_importances_, X.columns))
ranked_importances.sort(reverse=True)
ranked_importances


Random Forest Model Ranked Feature Importances


[(0.19239224699839041, 'BMI'),
 (0.17080753220086137, 'systolic_bp'),
 (0.1626799968569762, 'age_years'),
 (0.14095060026048528, 'weight'),
 (0.1349538744288333, 'height'),
 (0.08275720171795962, 'diastolic_bp'),
 (0.040469142323415905, 'cholesterol'),
 (0.020843797410909783, 'gender'),
 (0.01941615963812854, 'gluc'),
 (0.016431365021008643, 'active'),
 (0.010154629134363402, 'smoke'),
 (0.008143454008667535, 'alco')]