# Cardiovascular Disease (CVD) PREDICTOR 


## Data processing and analysis

In [None]:
import pandas as pd 
import matplotlib.pyplot
import seaborn as sns
import numpy as np
from sklearn import preprocessing
from sklearn.model_selection import train_test_split
import warnings
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import ConfusionMatrixDisplay
from sklearn.model_selection import KFold, cross_val_score
import pickle

# Silence every warning for the rest of the notebook.
# NOTE(review): this also hides library warnings that can point at real problems
# (e.g. deprecations or data-shape conversions) — consider narrowing the filter.
warnings.filterwarnings('ignore')

# Load the heart-disease dataset from a CSV file next to the notebook.
df = pd.read_csv("./heart_data.csv")
# Bare last expression: renders the DataFrame as the cell output.
df

In [None]:
# Print a summary of the DataFrame: column names, non-null counts and dtypes.
df.info()


# Features
The features selected to be looked at are: Age, Sex, Max Heart Rate and Exercise Angina (pain)

#  Age and Heart Disease


In [None]:


# Two side-by-side histograms of Age (1-year bins, KDE overlay):
# left panel is the whole dataset, right panel is split by the HeartDisease label.
fig, ax = matplotlib.pyplot.subplots(nrows=1, ncols=2, figsize=(15, 5))
age_hist_options = dict(data=df, x="Age", binwidth=1, kde=True)
sns.histplot(ax=ax[0], **age_hist_options)
sns.histplot(ax=ax[1], hue="HeartDisease", **age_hist_options)



#  Sex and Heart Disease

In [None]:


# Bar chart of record counts per Sex value, split by the HeartDisease label.
fig, ax = matplotlib.pyplot.subplots(nrows=1, ncols=1, figsize=(7, 5))
sns.countplot(ax=ax, data=df, hue="HeartDisease", x="Sex")

# MaxHR and Heart Disease

In [None]:
# Histogram of MaxHR (bins of width 5, KDE overlay), split by the HeartDisease label.
fig, ax = matplotlib.pyplot.subplots(nrows=1, ncols=1, figsize=(7, 5))
sns.histplot(
    data=df,
    x='MaxHR',
    hue='HeartDisease',
    binwidth=5,
    kde=True,
    ax=ax,
)

# Exercise Angina (pain) and Heart Disease

In [None]:
# Bar chart of record counts per ExerciseAngina value, split by the HeartDisease label.
fig, ax = matplotlib.pyplot.subplots(nrows=1, ncols=1, figsize=(7, 5))
sns.countplot(ax=ax, data=df, hue='HeartDisease', x='ExerciseAngina')

# Model Development

In [None]:
FEATURE_COLUMNS = ['Age', 'MaxHR', 'Sex', 'ExerciseAngina']
NUMERIC_COLUMNS = ['Age', 'MaxHR']
TARGET_COLUMN = 'HeartDisease'


def encode_binary_features(features):
    """Return a copy of ``features`` with the two categorical columns as 0/1 integers.

    Sex:            'M' -> 1, any other value -> 0
    ExerciseAngina: 'Y' -> 1, any other value -> 0
    """
    encoded = features.copy()
    # astype(int) converts the boolean mask directly, avoiding the deprecated
    # silent downcasting that .replace({True: 1, False: 0}) relies on.
    encoded['Sex'] = (encoded['Sex'] == 'M').astype(int)
    encoded['ExerciseAngina'] = (encoded['ExerciseAngina'] == 'Y').astype(int)
    return encoded


# train_test_split returns the pieces in (train, test) order. Unpacking them the
# other way round trains on the small 33.3% slice and evaluates on the 66.7% slice.
train, test = train_test_split(df, test_size=0.333, random_state=41)

# Full dataset (used later for cross-validation).
X = encode_binary_features(df[FEATURE_COLUMNS])
y = df[[TARGET_COLUMN]].copy()

X_train = encode_binary_features(train[FEATURE_COLUMNS])
y_train = train[[TARGET_COLUMN]].copy()

X_test = encode_binary_features(test[FEATURE_COLUMNS])
y_test = test[[TARGET_COLUMN]].copy()

# Processing Data
# The full-dataset frame gets its own throwaway scaler so that `min_max_scaler`
# (the object exported for the app) is only ever fit on the training split.
X[NUMERIC_COLUMNS] = preprocessing.MinMaxScaler().fit_transform(X[NUMERIC_COLUMNS])

min_max_scaler = preprocessing.MinMaxScaler()
X_train[NUMERIC_COLUMNS] = min_max_scaler.fit_transform(X_train[NUMERIC_COLUMNS])
# Just transform, don't fit: the test split must be scaled with the training statistics.
X_test[NUMERIC_COLUMNS] = min_max_scaler.transform(X_test[NUMERIC_COLUMNS])


# Model training

lr_classifier = LogisticRegression(solver='liblinear', random_state=41)
# The target frames stay single-column DataFrames for the cells below; flatten
# here because scikit-learn expects a 1-D target array.
lr_classifier.fit(X_train, y_train.to_numpy().ravel())

# Mean accuracy on the held-out test split (displayed as the cell output).
lr_classifier.score(X_test, y_test)




The model achieved an accuracy score of 77% on the test split.

# Analysis

# Confusion Matrix Display
1 here represents heart disease, and 0 represents no heart disease

In [None]:
# Predict on the held-out test split only. Predicting on the full frame `X` would mix
# in rows the model was trained on, and those predictions were never used anyway.
predictions = lr_classifier.predict(X_test)

# Plot the confusion matrix from the predictions computed above
# (same matrix as from_estimator(lr_classifier, X_test, y_test)).
ConfusionMatrixDisplay.from_predictions(y_test, predictions)



# K-Fold Cross-Validation

In [None]:
# Shuffled 5-fold splitter; the fixed seed makes the folds (and the score) reproducible.
k_folds = KFold(n_splits=5, shuffle=True, random_state=41)
# The number of folds determines the test/train split for each iteration.
# With 5 folds there are 5 mutually exclusive *test* folds; each iteration trains on
# the other 4 folds, so the training sets overlap.
# That's a 1 to 4 (or .20 to .80) testing/training split for each of the 5 iterations.

# Pass the splitter explicitly: without cv=k_folds, cross_val_score falls back to its
# default unshuffled 5-fold scheme and the KFold object above is ignored.
# The target is flattened because scikit-learn expects a 1-D array.
scores = cross_val_score(lr_classifier, X, y.to_numpy().ravel(), cv=k_folds)
# This shows the average score. Print 'scores' to see an array of individual iteration scores.
print("Average Score: ", scores.mean())

# Exporting the Model for Streamlit

In [None]:

# Serialize the trained classifier and the fitted scaler for the Streamlit app.
# The `with` blocks guarantee each file is flushed and closed, even if pickling fails.
with open("./production.sav", 'wb') as model_file:
    pickle.dump(lr_classifier, model_file)

with open("./min_max_scaler.sav", 'wb') as scaler_file:
    pickle.dump(min_max_scaler, scaler_file)