In [1]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt

In [2]:
# Load the cleaned cardiovascular-disease survey data from the working directory.
data = pd.read_csv('CVD_cleaned.csv')

# Column names in the order they appear in the CSV (before any reordering).
cols = list(data.columns)

# Moving the dependent feature (Heart_Disease) to the last column, so the
# predictors come first and the target can be addressed as column -1.
ordered_columns = [
    'General_Health', 'Checkup', 'Exercise', 'Skin_Cancer', 'Other_Cancer',
    'Depression', 'Diabetes', 'Arthritis', 'Sex', 'Age_Category',
    'Height_(cm)', 'Weight_(kg)', 'BMI', 'Smoking_History',
    'Alcohol_Consumption', 'Fruit_Consumption',
    'Green_Vegetables_Consumption', 'FriedPotato_Consumption',
    'Heart_Disease',
]

# .copy() makes the reordered frame an independent object. The encoding step
# later assigns into its columns one by one; without the copy those writes go
# to the result of a list-selection, which pandas may track as derived from
# the frame read from disk.
data = data[ordered_columns].copy()

# Only the last expression of a cell is rendered, so this is the single preview.
data.head()

Unnamed: 0,General_Health,Checkup,Exercise,Skin_Cancer,Other_Cancer,Depression,Diabetes,Arthritis,Sex,Age_Category,Height_(cm),Weight_(kg),BMI,Smoking_History,Alcohol_Consumption,Fruit_Consumption,Green_Vegetables_Consumption,FriedPotato_Consumption,Heart_Disease
0,Poor,Within the past 2 years,No,No,No,No,No,Yes,Female,70-74,150.0,32.66,14.54,Yes,0.0,30.0,16.0,12.0,No
1,Very Good,Within the past year,No,No,No,No,Yes,No,Female,70-74,165.0,77.11,28.29,No,0.0,30.0,0.0,4.0,Yes
2,Very Good,Within the past year,Yes,No,No,No,Yes,No,Female,60-64,163.0,88.45,33.47,No,4.0,12.0,3.0,16.0,No
3,Poor,Within the past year,Yes,No,No,No,Yes,No,Male,75-79,180.0,93.44,28.73,No,0.0,30.0,30.0,8.0,Yes
4,Good,Within the past year,No,No,No,No,No,No,Male,80+,191.0,88.45,24.37,Yes,0.0,8.0,4.0,0.0,No


### Ordinal Encoding 

In [3]:
from sklearn.preprocessing import OrdinalEncoder

# Every categorical column is replaced, in place, by its position in an
# explicitly ordered category list (first entry -> 0, second -> 1, ...).
# Each encoder is kept in its own variable after fitting.

# Sex: 'Male' -> 0, 'Female' -> 1
enc = OrdinalEncoder(categories=[['Male', 'Female']])
data['Sex'] = enc.fit_transform(data[['Sex']])

# Checkup: from 'Never' (0) up to the most recent checkup (4)
Checkup = [
    'Never',
    '5 or more years ago',
    'Within the past 5 years',
    'Within the past 2 years',
    'Within the past year',
]
enc4 = OrdinalEncoder(categories=[Checkup])
data['Checkup'] = enc4.fit_transform(data[['Checkup']])

# General health: from 'Poor' (0) up to 'Excellent' (4)
General_Health = ['Poor', 'Fair', 'Good', 'Very Good', 'Excellent']
enc1 = OrdinalEncoder(categories=[General_Health])
data['General_Health'] = enc1.fit_transform(data[['General_Health']])

# Age group: deliberately listed oldest first, so '80+' -> 0 and '18-24' -> 12.
# This mirrors the other scales in this cell, where the "poor" end is 0.
age = [
    '80+', '75-79', '70-74', '65-69', '60-64', '55-59', '50-54',
    '45-49', '40-44', '35-39', '30-34', '25-29', '18-24',
]
enc2 = OrdinalEncoder(categories=[age])
data['Age_Category'] = enc2.fit_transform(data[['Age_Category']])

# Diabetes: four answer categories, encoded in the order listed here
Diabetes = [
    'Yes',
    'No, pre-diabetes or borderline diabetes',
    'Yes, but female told only during pregnancy',
    'No',
]
enc5 = OrdinalEncoder(categories=[Diabetes])
data['Diabetes'] = enc5.fit_transform(data[['Diabetes']])

# All plain yes/no columns share one encoder: 'Yes' -> 0, 'No' -> 1.
# Heart_Disease is the dependent variable (the prediction target), so
# 0 means the patient has heart disease and 1 means the patient does not.
# The encoder is re-fitted per column; after the loop it holds the fit for
# the last column in the tuple.
yans = ['Yes', 'No']
enc3 = OrdinalEncoder(categories=[yans])
for yes_no_column in (
    'Exercise',
    'Heart_Disease',
    'Skin_Cancer',
    'Other_Cancer',
    'Depression',
    'Arthritis',
    'Smoking_History',
):
    data[yes_no_column] = enc3.fit_transform(data[[yes_no_column]])

### Splitting features and target

In [4]:
# Predictors are the first 18 columns; the target is the last column.
# Both are pulled out of the frame as NumPy arrays.
feature_frame = data.iloc[:, :18]
target_series = data.iloc[:, -1]
X = feature_frame.values
y = target_series.values

In [5]:
from sklearn.model_selection import train_test_split

# Hold out 30% of the rows as the test set; random_state pins the shuffle.
split_arrays = train_test_split(X, y, test_size=0.3, random_state=0)
X_train, X_test, y_train, y_test = split_arrays

### Feature Scaling

In [6]:
from sklearn.preprocessing import StandardScaler

# Standardise the features. The scaler is fitted on the training rows only
# and the same fitted transformation is then applied to both partitions.
sc_X = StandardScaler()
sc_X.fit(X_train)
X_train, X_test = sc_X.transform(X_train), sc_X.transform(X_test)

### Training and Performance metrics

In [86]:
from sklearn.ensemble import RandomForestClassifier

# Random forest with entropy as the split criterion and a fixed seed.
rf_settings = {'criterion': 'entropy', 'random_state': 0}
classifier = RandomForestClassifier(**rf_settings)
classifier.fit(X=X_train, y=y_train)

In [87]:
from sklearn.metrics import confusion_matrix

# Predict the held-out test rows and tabulate predictions against the truth.
# Rows are the true class and columns the predicted class, in sorted label
# order. Heart_Disease was encoded 'Yes' -> 0, 'No' -> 1, so the first
# row/column is "has heart disease" and the second is "does not".
y_pred = classifier.predict(X_test)
cm = confusion_matrix(y_true=y_test, y_pred=y_pred)
print(cm)

[[  349  7127]
 [  425 84756]]


In [88]:
from sklearn.model_selection import cross_val_score

# 10-fold cross-validation on the training set: cv=10 splits the training
# rows into 10 folds and returns one score per fold. No `scoring` argument is
# given, so the estimator's own score method is used (accuracy here).
# NOTE(review): the confusion matrix printed for this model has far more rows
# in class 1 than in class 0, so accuracy is dominated by the majority class;
# consider reporting recall/F1 as well.
accuracies = cross_val_score(classifier, X_train, y_train, cv=10)

# One item per line: the per-fold scores, their mean, their standard deviation.
print(accuracies, accuracies.mean(), accuracies.std(), sep='\n')

[0.91840888 0.91836263 0.91854764 0.91831637 0.91891767 0.9179926
 0.91785384 0.9183126  0.91849762 0.9183126 ]
0.9183522438612796
0.0002764989575490215


In [7]:
from xgboost import XGBClassifier

# Gradient-boosted trees with the library's default hyper-parameters.
classifier = XGBClassifier()
classifier.fit(X=X_train, y=y_train)

In [8]:
from sklearn.metrics import confusion_matrix

# Predict the held-out test rows and tabulate predictions against the truth.
# Rows are the true class and columns the predicted class, in sorted label
# order. Heart_Disease was encoded 'Yes' -> 0, 'No' -> 1, so the first
# row/column is "has heart disease" and the second is "does not".
y_pred = classifier.predict(X_test)
cm = confusion_matrix(y_true=y_test, y_pred=y_pred)
print(cm)

[[  402  7074]
 [  419 84762]]


In [9]:
from sklearn.model_selection import cross_val_score

# 10-fold cross-validation on the training set: cv=10 splits the training
# rows into 10 folds and returns one score per fold. No `scoring` argument is
# given, so the estimator's own score method is used (accuracy here).
# NOTE(review): the confusion matrix printed for this model has far more rows
# in class 1 than in class 0, so accuracy is dominated by the majority class;
# consider reporting recall/F1 as well.
accuracies = cross_val_score(classifier, X_train, y_train, cv=10)

# One item per line: the per-fold scores, their mean, their standard deviation.
print(accuracies, accuracies.mean(), accuracies.std(), sep='\n')

[0.91887142 0.91817761 0.91965772 0.91882516 0.91840888 0.91961147
 0.91850139 0.91798881 0.91891392 0.91905269]
0.9188009063617464
0.0005257634972669653


In [10]:
from sklearn.linear_model import LogisticRegression

# Logistic regression with default settings apart from the fixed seed.
logreg_settings = {'random_state': 0}
classifier = LogisticRegression(**logreg_settings)
classifier.fit(X=X_train, y=y_train)

In [11]:
from sklearn.metrics import confusion_matrix

# Predict the held-out test rows and tabulate predictions against the truth.
# Rows are the true class and columns the predicted class, in sorted label
# order. Heart_Disease was encoded 'Yes' -> 0, 'No' -> 1, so the first
# row/column is "has heart disease" and the second is "does not".
y_pred = classifier.predict(X_test)
cm = confusion_matrix(y_true=y_test, y_pred=y_pred)
print(cm)

[[  474  7002]
 [  446 84735]]


In [12]:
from sklearn.model_selection import cross_val_score

# 10-fold cross-validation on the training set: cv=10 splits the training
# rows into 10 folds and returns one score per fold. No `scoring` argument is
# given, so the estimator's own score method is used (accuracy here).
# NOTE(review): the confusion matrix printed for this model has far more rows
# in class 1 than in class 0, so accuracy is dominated by the majority class;
# consider reporting recall/F1 as well.
accuracies = cross_val_score(classifier, X_train, y_train, cv=10)

# One item per line: the per-fold scores, their mean, their standard deviation.
print(accuracies, accuracies.mean(), accuracies.std(), sep='\n')

[0.9193802  0.91877891 0.92002775 0.91914894 0.91910268 0.9192877
 0.91827012 0.91905269 0.91923771 0.91993154]
0.9192218234132026
0.0004830035510337777
