# 1. Main packages import

In [2]:
import pandas as pd
import numpy as np

import matplotlib.pyplot as plt
import seaborn as sns

import sklearn
import joblib

from sklearn import datasets
from sklearn.model_selection import train_test_split

# 2. Data preparation

In [8]:
#Importing dataset
heart_disease = pd.read_csv("heart_disease_health_indicators_BRFSS2015.csv")

#Dropping variables that have not been selected 

heart_disease = heart_disease.drop(['Fruits','Veggies','HvyAlcoholConsump','AnyHealthcare','NoDocbcCost',] , axis = 1)

#Split in features and labels ; X = independant variables ;  y = dependant variable
X = heart_disease[['HighBP','HighChol','CholCheck','BMI','Smoker','Stroke','Diabetes','PhysActivity','GenHlth','MentHlth','PhysHlth','DiffWalk','Sex','Age','Education','Income']]
y = heart_disease["HeartDiseaseorAttack"]
print(X.shape)
print(y.shape)

(253680, 16)
(253680,)


In [9]:
#Defining the proportion of data that will be used to verify the model = 30% of the data.
X_train, X_test, y_train, y_test = train_test_split(X,y,test_size = 0.3)
print(X_train.shape)
print(X_test.shape)
print(y_train.shape)
print(y_test.shape)

(177576, 16)
(76104, 16)
(177576,)
(76104,)


In [10]:
heart_disease.head(10)

Unnamed: 0,HeartDiseaseorAttack,HighBP,HighChol,CholCheck,BMI,Smoker,Stroke,Diabetes,PhysActivity,GenHlth,MentHlth,PhysHlth,DiffWalk,Sex,Age,Education,Income
0,0.0,1.0,1.0,1.0,40.0,1.0,0.0,0.0,0.0,5.0,18.0,15.0,1.0,0.0,9.0,4.0,3.0
1,0.0,0.0,0.0,0.0,25.0,1.0,0.0,0.0,1.0,3.0,0.0,0.0,0.0,0.0,7.0,6.0,1.0
2,0.0,1.0,1.0,1.0,28.0,0.0,0.0,0.0,0.0,5.0,30.0,30.0,1.0,0.0,9.0,4.0,8.0
3,0.0,1.0,0.0,1.0,27.0,0.0,0.0,0.0,1.0,2.0,0.0,0.0,0.0,0.0,11.0,3.0,6.0
4,0.0,1.0,1.0,1.0,24.0,0.0,0.0,0.0,1.0,2.0,3.0,0.0,0.0,0.0,11.0,5.0,4.0
5,0.0,1.0,1.0,1.0,25.0,1.0,0.0,0.0,1.0,2.0,0.0,2.0,0.0,1.0,10.0,6.0,8.0
6,0.0,1.0,0.0,1.0,30.0,1.0,0.0,0.0,0.0,3.0,0.0,14.0,0.0,0.0,9.0,6.0,7.0
7,0.0,1.0,1.0,1.0,25.0,1.0,0.0,0.0,1.0,3.0,0.0,0.0,1.0,0.0,11.0,4.0,4.0
8,1.0,1.0,1.0,1.0,30.0,1.0,0.0,2.0,0.0,5.0,30.0,30.0,1.0,0.0,9.0,5.0,1.0
9,0.0,0.0,0.0,1.0,24.0,0.0,0.0,0.0,0.0,2.0,0.0,0.0,0.0,1.0,8.0,4.0,3.0


# 3. Classification Model Creation

In [11]:
from sklearn import neighbors, metrics

In [12]:
#Defining the k value and selecting what weight will be given to close data points
knn = neighbors.KNeighborsClassifier(n_neighbors = 45, weights = 'uniform')

In [13]:
#Training the model
knn.fit(X_train, y_train)

In [14]:
#Creating predictions 
prediction = knn.predict(X_test)

#Finding out the model accuracy
accuracy = metrics.accuracy_score(y_test, prediction)

print(f"The accuracy of this classification model is {accuracy:.2%}")

The accuracy of this classification model is 90.50%


In [15]:
#Manually verifying the model
print("actual value :", y[15])
print("predicted value:", knn.predict(X)[15])

actual value : 0.0
predicted value: 0.0


In [16]:
#Using a dummy variable to be able to check any value
a = 1230
print("actual value:", y[a])
print("predicted value:", knn.predict(X)[a])

actual value: 0.0
predicted value: 0.0


## 3.1 Testing a higher k value

In [17]:
#Defining a higher k value
knn_2 = neighbors.KNeighborsClassifier(n_neighbors = 190, weights = 'uniform')

In [18]:
#Training the model
knn_2.fit(X_train, y_train)

In [19]:
#Creating predictions 
prediction_2 = knn_2.predict(X_test)

#Finding out the model accuracy
accuracy_2 = metrics.accuracy_score(y_test, prediction_2)

print(f"The accuracy of this classification model is {accuracy_2:.2%}")

The accuracy of this classification model is 90.49%


Taking a higher number of neighbors data points into consideration decreases the accuracy. Although the results are pretty close.

## 3.2 Giving more weight to closer data points

### 3.2.1 More weight to closer data points with the inital k value

In [20]:
knn_3 = neighbors.KNeighborsClassifier(n_neighbors = 45, weights = 'distance')

In [21]:
#Training the model
knn_3.fit(X_train, y_train)

In [22]:
#Creating predictions 
prediction_3 = knn_3.predict(X_test)

#Finding out the model accuracy
accuracy_3 = metrics.accuracy_score(y_test, prediction_3)

In [23]:
print(f"The accuracy of this classification model is {accuracy_3:.2%}")

The accuracy of this classification model is 90.17%


### 3.2.2 More wight to closer data points while increasing the k value

In [24]:
knn_4 = neighbors.KNeighborsClassifier(n_neighbors = 190, weights = 'distance')

In [25]:
#Training the model
knn_4.fit(X_train, y_train)

In [26]:
#Creating predictions 
prediction_4 = knn_4.predict(X_test)

#Finding out the model accuracy
accuracy_4 = metrics.accuracy_score(y_test, prediction_4)

print(f"The accuracy of this classification model is {accuracy_4:.2%}")

The accuracy of this classification model is 90.16%


## Classification model conclusions

Interestingly, increasing the k value or giving more weight to closer data points does not improve the model's accuracy.
After testing a few cases with various k value and trying to change the weight, we can see that:
- Models with the highest accuracy are using a "uniform" weight (giving the same importance) to all data points.
- Using a higher number of data points to consider (increasing the k value) does not imporve the accuracy either.

# 4. SVM Model Creation

When we have many features, Support Vector Machine models are usually more effective. Although the previous model yielded good results. I retry with an SVM model

In [27]:
from sklearn import svm

In [28]:
from sklearn.metrics import accuracy_score

In [29]:
model = svm.SVC()
model.fit(X_train,y_train)

In [30]:
predictions_svm = model.predict(X_test)
acc = accuracy_score(y_test, predictions_svm)

In [31]:
print(f"The accuracy of the model is {acc:.2%}")

The accuracy of the model is 90.49%
