In [1]:
%matplotlib inline
import pandas as pd
import sklearn
import numpy as np
from sklearn import linear_model



In [2]:
#reading Framingham dataset
f = pd.read_csv('framingham.csv')
#removing NA values
framingham = pd.DataFrame.dropna(f)
#printing to 5 obs of the Dataset
framingham.head()

Unnamed: 0,male,age,education,currentSmoker,cigsPerDay,BPMeds,prevalentStroke,prevalentHyp,diabetes,totChol,sysBP,diaBP,BMI,heartRate,glucose,TenYearCHD
0,1,39,4.0,0,0.0,0.0,0,0,0,195.0,106.0,70.0,26.97,80.0,77.0,0
1,0,46,2.0,0,0.0,0.0,0,0,0,250.0,121.0,81.0,28.73,95.0,76.0,0
2,1,48,1.0,1,20.0,0.0,0,0,0,245.0,127.5,80.0,25.34,75.0,70.0,0
3,0,61,3.0,1,30.0,0.0,0,1,0,225.0,150.0,95.0,28.58,65.0,103.0,1
4,0,46,3.0,1,23.0,0.0,0,0,0,285.0,130.0,84.0,23.1,85.0,85.0,0


In [3]:
#printing shape of dataset
print(framingham.shape)

(3658, 16)


In [4]:
#checking dependent variable for unique values 
print(framingham['TenYearCHD'].unique())
#0 means no Cardic Heart diseases and 1 means CHD will occur after 10 years

[0 1]


In [5]:
framingham.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 3658 entries, 0 to 4239
Data columns (total 16 columns):
male               3658 non-null int64
age                3658 non-null int64
education          3658 non-null float64
currentSmoker      3658 non-null int64
cigsPerDay         3658 non-null float64
BPMeds             3658 non-null float64
prevalentStroke    3658 non-null int64
prevalentHyp       3658 non-null int64
diabetes           3658 non-null int64
totChol            3658 non-null float64
sysBP              3658 non-null float64
diaBP              3658 non-null float64
BMI                3658 non-null float64
heartRate          3658 non-null float64
glucose            3658 non-null float64
TenYearCHD         3658 non-null int64
dtypes: float64(9), int64(7)
memory usage: 485.8 KB


In [6]:
framingham.corr(method='kendall')

Unnamed: 0,male,age,education,currentSmoker,cigsPerDay,BPMeds,prevalentStroke,prevalentHyp,diabetes,totChol,sysBP,diaBP,BMI,heartRate,glucose,TenYearCHD
male,1.0,-0.021224,-0.00454,0.206114,0.265207,-0.052124,-0.002312,0.001424,0.013819,-0.051401,-0.013553,0.050094,0.109485,-0.09237,-0.007167,0.091688
age,-0.021224,1.0,-0.140827,-0.17312,-0.15816,0.107841,0.042549,0.253487,0.090174,0.201879,0.268251,0.146161,0.100266,-0.003723,0.077605,0.191679
education,-0.00454,-0.140827,1.0,0.029396,0.020866,-0.013228,-0.026864,-0.079058,-0.041378,-0.01621,-0.100432,-0.051699,-0.109307,-0.038282,-0.017248,-0.06925
currentSmoker,0.206114,-0.17312,0.029396,1.0,0.846276,-0.051923,-0.03815,-0.108078,-0.041849,-0.042078,-0.110639,-0.101115,-0.135773,0.049487,-0.063807,0.019165
cigsPerDay,0.265207,-0.15816,0.020866,0.846276,1.0,-0.047113,-0.036634,-0.08762,-0.036644,-0.03127,-0.085596,-0.069228,-0.097957,0.050749,-0.0644,0.035587
BPMeds,-0.052124,0.107841,-0.013228,-0.051923,-0.047113,1.0,0.113125,0.26291,0.049066,0.072937,0.175091,0.151129,0.074218,-0.003341,0.009796,0.089152
prevalentStroke,-0.002312,0.042549,-0.026864,-0.03815,-0.036634,0.113125,1.0,0.066057,0.009625,0.01376,0.050986,0.046494,0.015117,-0.010413,0.008322,0.048366
prevalentHyp,0.001424,0.253487,-0.079058,-0.108078,-0.08762,0.26291,0.066057,1.0,0.080556,0.135906,0.573022,0.508336,0.234508,0.112523,0.072322,0.181387
diabetes,0.013819,0.090174,-0.041378,-0.041849,-0.036644,0.049066,0.009625,0.080556,1.0,0.029033,0.072512,0.038429,0.058516,0.051235,0.184088,0.093431
totChol,-0.051401,0.201879,-0.01621,-0.042078,-0.03127,0.072937,0.01376,0.135906,0.029033,1.0,0.159313,0.132646,0.101079,0.064741,0.022752,0.070664


In [7]:
#Spliting data sets iun test and training sets
from sklearn.model_selection import train_test_split
from sklearn import metrics #for accuracy calculation

framingham_data_headers = ["male", "age", "education", "currentSmoker", "cigsPerDay", "BPMeds", "prevalentStroke", "prevalentHyp", "diabetes", "totChol", "sysBP","diaBP","BMI","heartRate","glucose","TenYearCHD"]
 
train_x, test_x, train_y, test_y = train_test_split(framingham[framingham_data_headers[:-1]],framingham[framingham_data_headers[-1]], train_size=0.7)



In [9]:
#making Neural Network Models
from sklearn.neural_network import MLPClassifier
mlp = MLPClassifier()
mlp.fit(train_x,train_y)

print ("NN Train Accuracy :: ",metrics.accuracy_score(train_y, mlp.predict(train_x)))
print ("NN Test Accuracy :: ", metrics.accuracy_score(test_y, mlp.predict(test_x)))


NN Train Accuracy ::  0.844140625
NN Test Accuracy ::  0.8642987249544627


In [8]:
#making logistic model 

# Train logistic regression
logr = linear_model.LogisticRegression()
logr.fit(train_x, train_y)

print ("Logistic Regression Train Accuracy :: ",metrics.accuracy_score(train_y, logr.predict(train_x)))
print ("Logistic regression Test Accuracy :: ", metrics.accuracy_score(test_y, logr.predict(test_x)))


Logistic Regression Train Accuracy ::  0.847265625
Logistic regression Test Accuracy ::  0.8688524590163934


In [11]:
#making a decision tree

from sklearn.tree import DecisionTreeClassifier as DTC
dtc = DTC()
dtc.fit(train_x,train_y)

print ("decision tree Train Accuracy :: ",metrics.accuracy_score(train_y, dtc.predict(train_x)))
print ("decision tree Test Accuracy :: ", metrics.accuracy_score(test_y, dtc.predict(test_x)))


decision tree Train Accuracy ::  1.0
decision tree Test Accuracy ::  0.7668488160291439
