This program uses the scikit-learn library to classify a person as either having cardiovascular disease or not.


In [None]:
# Imports needed

import numpy as np
import pandas as pd
import seaborn as sns
import matplotlib.pyplot as plt 
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler
from sklearn.metrics import confusion_matrix
from sklearn.ensemble import RandomForestClassifier

In [None]:
# Load the cardiovascular dataset - The dataset can be found under README
#
# In Google Colab this opens a file picker so cardio.csv can be uploaded into
# the session. Outside Colab the google.colab package is not available, so the
# upload step is skipped and cardio.csv is expected in the working directory.

try:
    from google.colab import files
except ImportError:
    # Not running in Colab: nothing to upload. Keep `uploaded` defined (empty)
    # so any later reference to it still works.
    uploaded = {}
else:
    uploaded = files.upload()

In [None]:
# Store the data into a variable called dataset
#
# sep=None asks pandas to detect the delimiter itself (this needs the Python
# parsing engine), so the file is split into proper columns whether it is
# comma- or semicolon-delimited instead of collapsing into a single column.

dataset = pd.read_csv('cardio.csv', sep=None, engine='python')

# Prints the first 7 rows of data to verify that it is working
# It should display a table with one column per attribute; if you still see
# semicolons inside the cells, refer to README under Note for the solution

dataset.head(7)

In [None]:
# Get the shape of the data as a (rows, columns) tuple
# It should return (70000, 13); if it displays (70000, 1) the file was read
# as a single column - refer to the above CODEBLOCK for the fix.

dataset.shape

In [None]:
# Sanity check in case the dataset is ever updated:
# flag every null / missing cell, then ask whether at least one cell is flagged
# Expected result: False (as of 7/14/2020)

dataset.isna().to_numpy().any()


In [None]:
# Displays some basic summary statistics for each column using the pandas library

dataset.describe()

In [None]:
# Returns the count of people with cardiovascular disease (1) and without (0),
# i.e. how many rows carry each value of the 'cardio' column
dataset['cardio'].value_counts()

In [None]:
# Derive a 'years' column from 'age' (presumably recorded in days, hence the
# division by 365), rounded to the nearest whole year

age_in_years = (dataset['age'] / 365).round(0)

# Store it as an integer column rather than a float one

dataset['years'] = pd.to_numeric(age_in_years, downcast='integer')

# Count the patients per age, split by 'cardio' value, and draw the bars with
# a colorblind palette and a dark outline
bar_outline = sns.color_palette('dark', n_colors=1)
sns.countplot(
    data=dataset,
    x='years',
    hue='cardio',
    palette='colorblind',
    edgecolor=bar_outline,
)

In [None]:
# Prepare data for machine learning by removing, one after the other:
#   - the 'years' column, as it's redundant (it was derived from 'age')
#   - the 'id' column, as it's pointless as a feature
for unwanted_column in ('years', 'id'):
    dataset = dataset.drop(columns=unwanted_column)

In [None]:
# Returns the pairwise correlation between the columns of the dataset

dataset.corr()

In [None]:
# Draws the column-to-column correlations as an annotated heatmap
# (each cell is labelled with its value formatted as a whole percentage)

correlations = dataset.corr()

plt.figure(figsize=(7, 7))
sns.heatmap(correlations, annot=True, fmt='.0%')

In [33]:
# Splits the data into feature data and target data
# Target: Is the output of the data, in this case whether someone suffers from heart disease
# Features: Is the input data, in this case the attributes collected from the patients
#
# The target is selected by its column name instead of by position, so a
# changed column order (e.g. an extra column appended after 'cardio') cannot
# silently turn a different column into the target.

# Takes in all the attributes except for the cardio column
x = dataset.drop(columns='cardio').values
# Takes the cardio column
y = dataset['cardio'].values

In [34]:
# Holds back 25% of the rows for testing and trains on the remaining 75%

# The held-back rows let us check later how the model performs on data it has
# not been fitted on; the fixed random_state keeps the split reproducible
split_options = dict(test_size=0.25, random_state=1)
x_train, x_test, y_train, y_test = train_test_split(x, y, **split_options)

In [35]:
# Feature Scaling
# Puts the features on a comparable scale so that one attribute doesn't have
# too much power over the rest.

sc = StandardScaler()

# The scaler is fitted on the training split only; the same fitted scaler is
# then applied to both the training and the testing split
sc.fit(x_train)
x_train = sc.transform(x_train)
x_test = sc.transform(x_test)

In [None]:
# Builds and trains a random forest classifier on the training split
# Reasons given for choosing it: - Scalable
#                                - Resists overfitting

forest_settings = {
    'n_estimators': 10,       # number of trees in the forest
    'criterion': 'entropy',   # split quality measure
    'random_state': 1,        # fixed seed for reproducible results
}
forest = RandomForestClassifier(**forest_settings)
forest.fit(x_train, y_train)

In [None]:
# Score the model on the same data it was trained on (training accuracy)
# Reported result of the original run: 97.9% accurate

# `model` is an alias for the fitted forest and is reused for the test-set evaluation
model = forest
model.score(x_train, y_train)

In [None]:
# Evaluate the model on the held-back testing dataset via a confusion matrix
predictions = model.predict(x_test)
cm = confusion_matrix(y_test, predictions)

# Rows of the matrix are the actual classes, columns the predicted classes
tn = cm[0, 0]  # True Negative
tp = cm[1, 1]  # True Positive
fn = cm[1, 0]  # False Negative
fp = cm[0, 1]  # False Positive

# Print the confusion matrix
print(cm)

# Accuracy = correctly classified samples / all samples
# Reported result of the original run on test data: 70.2% accurate
correct = tp + tn
total = tp + tn + fn + fp
print('Model Test Accuracy = {}'.format(correct / total))