In [1]:
import numpy as np
import pandas as pd
import matplotlib as plt

from sklearn.model_selection import train_test_split
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import accuracy_score

In [4]:
heart_data = pd.read_csv("Heart_Disease_Prediction.csv")
heart_data.head()

Unnamed: 0,Age,Sex,Chest pain type,BP,Cholesterol,FBS over 120,EKG results,Max HR,Exercise angina,ST depression,Slope of ST,Number of vessels fluro,Thallium,Heart Disease
0,70,1,4,130,322,0,2,109,0,2.4,2,3,3,Presence
1,67,0,3,115,564,0,2,160,0,1.6,2,0,7,Absence
2,57,1,2,124,261,0,0,141,0,0.3,1,0,7,Presence
3,64,1,4,128,263,0,0,105,1,0.2,2,1,7,Absence
4,74,0,2,120,269,0,2,121,1,0.2,1,1,3,Absence


In [5]:
heart_data.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 270 entries, 0 to 269
Data columns (total 14 columns):
Age                        270 non-null int64
Sex                        270 non-null int64
Chest pain type            270 non-null int64
BP                         270 non-null int64
Cholesterol                270 non-null int64
FBS over 120               270 non-null int64
EKG results                270 non-null int64
Max HR                     270 non-null int64
Exercise angina            270 non-null int64
ST depression              270 non-null float64
Slope of ST                270 non-null int64
Number of vessels fluro    270 non-null int64
Thallium                   270 non-null int64
Heart Disease              270 non-null object
dtypes: float64(1), int64(12), object(1)
memory usage: 29.6+ KB


In [6]:
heart_data.isnull().sum()

Age                        0
Sex                        0
Chest pain type            0
BP                         0
Cholesterol                0
FBS over 120               0
EKG results                0
Max HR                     0
Exercise angina            0
ST depression              0
Slope of ST                0
Number of vessels fluro    0
Thallium                   0
Heart Disease              0
dtype: int64

In [7]:
heart_data.describe()

Unnamed: 0,Age,Sex,Chest pain type,BP,Cholesterol,FBS over 120,EKG results,Max HR,Exercise angina,ST depression,Slope of ST,Number of vessels fluro,Thallium
count,270.0,270.0,270.0,270.0,270.0,270.0,270.0,270.0,270.0,270.0,270.0,270.0,270.0
mean,54.433333,0.677778,3.174074,131.344444,249.659259,0.148148,1.022222,149.677778,0.32963,1.05,1.585185,0.67037,4.696296
std,9.109067,0.468195,0.95009,17.861608,51.686237,0.355906,0.997891,23.165717,0.470952,1.14521,0.61439,0.943896,1.940659
min,29.0,0.0,1.0,94.0,126.0,0.0,0.0,71.0,0.0,0.0,1.0,0.0,3.0
25%,48.0,0.0,3.0,120.0,213.0,0.0,0.0,133.0,0.0,0.0,1.0,0.0,3.0
50%,55.0,1.0,3.0,130.0,245.0,0.0,2.0,153.5,0.0,0.8,2.0,0.0,3.0
75%,61.0,1.0,4.0,140.0,280.0,0.0,2.0,166.0,1.0,1.6,2.0,1.0,7.0
max,77.0,1.0,4.0,200.0,564.0,1.0,2.0,202.0,1.0,6.2,3.0,3.0,7.0


In [8]:
heart_data['Heart Disease'].value_counts()

Absence     150
Presence    120
Name: Heart Disease, dtype: int64

Absence --> healthy heart
Presence --> diseased heart  

In [9]:
x = heart_data.drop(columns='Heart Disease',axis=1)
y = heart_data['Heart Disease']

In [10]:
print(x)

     Age  Sex  Chest pain type   BP  Cholesterol  FBS over 120  EKG results  \
0     70    1                4  130          322             0            2   
1     67    0                3  115          564             0            2   
2     57    1                2  124          261             0            0   
3     64    1                4  128          263             0            0   
4     74    0                2  120          269             0            2   
5     65    1                4  120          177             0            0   
6     56    1                3  130          256             1            2   
7     59    1                4  110          239             0            2   
8     60    1                4  140          293             0            2   
9     63    0                4  150          407             0            2   
10    59    1                4  135          234             0            0   
11    53    1                4  142          226    

In [11]:
print(y)

0      Presence
1       Absence
2      Presence
3       Absence
4       Absence
5       Absence
6      Presence
7      Presence
8      Presence
9      Presence
10      Absence
11      Absence
12      Absence
13     Presence
14      Absence
15      Absence
16     Presence
17     Presence
18      Absence
19      Absence
20     Presence
21      Absence
22      Absence
23      Absence
24      Absence
25      Absence
26      Absence
27      Absence
28     Presence
29      Absence
         ...   
240    Presence
241     Absence
242     Absence
243    Presence
244     Absence
245    Presence
246    Presence
247     Absence
248    Presence
249    Presence
250    Presence
251     Absence
252    Presence
253     Absence
254     Absence
255     Absence
256     Absence
257    Presence
258    Presence
259     Absence
260     Absence
261    Presence
262    Presence
263     Absence
264    Presence
265     Absence
266     Absence
267     Absence
268     Absence
269    Presence
Name: Heart Disease, Len

In [12]:
x_train,x_test,y_train,y_test = train_test_split(x,y,test_size=0.2,stratify=y,random_state=2)

In [13]:
print(x.shape,x_train.shape,x_test.shape)

(270, 13) (216, 13) (54, 13)


In [14]:
model = LogisticRegression()

In [15]:
model.fit(x_train,y_train)

LogisticRegression(C=1.0, class_weight=None, dual=False, fit_intercept=True,
          intercept_scaling=1, max_iter=100, multi_class='ovr', n_jobs=1,
          penalty='l2', random_state=None, solver='liblinear', tol=0.0001,
          verbose=0, warm_start=False)

In [16]:
x_train_prediction = model.predict(x_train)
training_data_accuracy = accuracy_score(x_train_prediction,y_train)

In [17]:
print(('Accuracy on Training data: ',training_data_accuracy))

('Accuracy on Training data: ', 0.8796296296296297)


In [18]:
x_train_prediction = model.predict(x_test)
test_data_accuracy = accuracy_score(x_train_prediction,y_test)

In [19]:
print(('Accuracy on Test data: ',test_data_accuracy))

('Accuracy on Test data: ', 0.8148148148148148)


Building Predictive System

In [29]:
input_data = (63,0,4,150,407,0,2,154,0,4,2,3,7)

#change input data to numpy array
input_data_as_numpy_array = np.asarray(input_data)

#reshape the numpy array as we are predicting for only on instance
input_data_reshaped = input_data_as_numpy_array.reshape(1,-1)

prediction = model.predict(input_data_reshaped)
print(prediction)

if(prediction[0] == 'Presence'):
    print("The person has a heart disease")
else:
    print("The person's heart is healthy")

['Presence']
The person has a heart disease
