Importing the Dependencies

In [1]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import accuracy_score
import copy
import math

%matplotlib inline

Data Collection and Processing

In [2]:
#loading the csv data to a Pandas DataFrame
heart_data = pd.read_csv('/Users/manra/Downloads/heart.csv')

In [5]:
#print first 5 rows of the dataset
heart_data.head()

Unnamed: 0,age,sex,cp,trestbps,chol,fbs,restecg,thalach,exang,oldpeak,slope,ca,thal,target
0,52,1,0,125,212,0,1,168,0,1.0,2,2,3,0
1,53,1,0,140,203,1,0,155,1,3.1,0,0,3,0
2,70,1,0,145,174,0,1,125,1,2.6,0,0,3,0
3,61,1,0,148,203,0,1,161,0,0.0,2,1,3,0
4,62,0,0,138,294,1,1,106,0,1.9,1,3,2,0


In [4]:
#print last 5 rows of the dataset
heart_data.tail()

Unnamed: 0,age,sex,cp,trestbps,chol,fbs,restecg,thalach,exang,oldpeak,slope,ca,thal,target
1020,59,1,1,140,221,0,1,164,1,0.0,2,0,2,1
1021,60,1,0,125,258,0,0,141,1,2.8,1,1,3,0
1022,47,1,0,110,275,0,0,118,1,1.0,1,1,2,0
1023,50,0,0,110,254,0,0,159,0,0.0,2,0,2,1
1024,54,1,0,120,188,0,1,113,0,1.4,1,1,3,0


In [5]:
#number of rows and columns in the dataset
heart_data.shape

(1025, 14)

In [8]:
#getting some info about the data
heart_data.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 1025 entries, 0 to 1024
Data columns (total 14 columns):
 #   Column    Non-Null Count  Dtype  
---  ------    --------------  -----  
 0   age       1025 non-null   int64  
 1   sex       1025 non-null   int64  
 2   cp        1025 non-null   int64  
 3   trestbps  1025 non-null   int64  
 4   chol      1025 non-null   int64  
 5   fbs       1025 non-null   int64  
 6   restecg   1025 non-null   int64  
 7   thalach   1025 non-null   int64  
 8   exang     1025 non-null   int64  
 9   oldpeak   1025 non-null   float64
 10  slope     1025 non-null   int64  
 11  ca        1025 non-null   int64  
 12  thal      1025 non-null   int64  
 13  target    1025 non-null   int64  
dtypes: float64(1), int64(13)
memory usage: 112.2 KB


In [6]:
#Checking for missing values
heart_data.isnull().sum()

age         0
sex         0
cp          0
trestbps    0
chol        0
fbs         0
restecg     0
thalach     0
exang       0
oldpeak     0
slope       0
ca          0
thal        0
target      0
dtype: int64

In [7]:
#statistical measures about the data
heart_data.describe()

Unnamed: 0,age,sex,cp,trestbps,chol,fbs,restecg,thalach,exang,oldpeak,slope,ca,thal,target
count,1025.0,1025.0,1025.0,1025.0,1025.0,1025.0,1025.0,1025.0,1025.0,1025.0,1025.0,1025.0,1025.0,1025.0
mean,54.434146,0.69561,0.942439,131.611707,246.0,0.149268,0.529756,149.114146,0.336585,1.071512,1.385366,0.754146,2.323902,0.513171
std,9.07229,0.460373,1.029641,17.516718,51.59251,0.356527,0.527878,23.005724,0.472772,1.175053,0.617755,1.030798,0.62066,0.50007
min,29.0,0.0,0.0,94.0,126.0,0.0,0.0,71.0,0.0,0.0,0.0,0.0,0.0,0.0
25%,48.0,0.0,0.0,120.0,211.0,0.0,0.0,132.0,0.0,0.0,1.0,0.0,2.0,0.0
50%,56.0,1.0,1.0,130.0,240.0,0.0,1.0,152.0,0.0,0.8,1.0,0.0,2.0,1.0
75%,61.0,1.0,2.0,140.0,275.0,0.0,1.0,166.0,1.0,1.8,2.0,1.0,3.0,1.0
max,77.0,1.0,3.0,200.0,564.0,1.0,2.0,202.0,1.0,6.2,2.0,4.0,3.0,1.0


In [8]:
#checking the distribution of Target variable
heart_data['target'].value_counts()

1    526
0    499
Name: target, dtype: int64

1 ---> Defective Heart
0 ---> Healthy Heart

Spliting the features and targets

In [3]:
X = heart_data.drop(columns='target', axis=1)
Y = heart_data['target']

In [10]:
print(X)

      age  sex  cp  trestbps  chol  fbs  restecg  thalach  exang  oldpeak  \
0      52    1   0       125   212    0        1      168      0      1.0   
1      53    1   0       140   203    1        0      155      1      3.1   
2      70    1   0       145   174    0        1      125      1      2.6   
3      61    1   0       148   203    0        1      161      0      0.0   
4      62    0   0       138   294    1        1      106      0      1.9   
...   ...  ...  ..       ...   ...  ...      ...      ...    ...      ...   
1020   59    1   1       140   221    0        1      164      1      0.0   
1021   60    1   0       125   258    0        0      141      1      2.8   
1022   47    1   0       110   275    0        0      118      1      1.0   
1023   50    0   0       110   254    0        0      159      0      0.0   
1024   54    1   0       120   188    0        1      113      0      1.4   

      slope  ca  thal  
0         2   2     3  
1         0   0     3  
2  

In [11]:
print(Y)

0       0
1       0
2       0
3       0
4       0
       ..
1020    1
1021    0
1022    0
1023    1
1024    0
Name: target, Length: 1025, dtype: int64


 Spliting the Data into Training data & Test Data

In [4]:
X_train, X_test, Y_train, Y_test = train_test_split(X, Y, test_size=0.2, stratify=Y, random_state=2)

In [13]:
print(X.shape, X_train.shape, X_test.shape)

(1025, 13) (820, 13) (205, 13)


converting training features and targets into numpy array

In [5]:
X_train = X_train.to_numpy()
Y_train = Y_train.to_numpy()

converting testing features and targets into numpy array

In [1]:
X_test = X_test.to_numpy()
Y_test = Y_test.to_numpy()

NameError: name 'X_test' is not defined

Print first five data values of X_train and Y_train

In [11]:
print(X_train[:5])

[[ 52.    1.    0.  128.  204.    1.    1.  156.    1.    1.    1.    0.
    0. ]
 [ 64.    1.    2.  125.  309.    0.    1.  131.    1.    1.8   1.    0.
    3. ]
 [ 51.    0.    2.  140.  308.    0.    0.  142.    0.    1.5   2.    1.
    2. ]
 [ 52.    1.    3.  118.  186.    0.    0.  190.    0.    0.    1.    0.
    1. ]
 [ 40.    1.    0.  110.  167.    0.    0.  114.    1.    2.    1.    0.
    3. ]]


In [12]:
print(Y_train[:5])

[0 0 1 1 0]


In [7]:
X_features = ['age', 'sex', 'chest pain type', 'resting blood pressure', 'serum cholestrol', 'fasting blood sugar', 'resting electrocardiographic', 'max heart rate', 'agina','oldpeak','ST segment','major vessels','thal']

Model Training

Logistic Regression

Feature Scaling by z_score normalisation

In [15]:
def zscore_normalize_features(X):
    """
    computes  X, zcore normalized by column
    
    Args:
      X (ndarray (m,n))     : input data, m examples, n features
      
    Returns:
      X_norm (ndarray (m,n)): input normalized by column
      mu (ndarray (n,))     : mean of each feature
      sigma (ndarray (n,))  : standard deviation of each feature
    """
    # find the mean of each column/feature
    mu     = np.mean(X, axis=0)                 # mu will have shape (n,)
    # find the standard deviation of each column/feature
    sigma  = np.std(X, axis=0)                  # sigma will have shape (n,)
    # element-wise, subtract mu for that column from each example, divide by std for that column
    X_norm = (X - mu) / sigma      

    return X_norm

In [16]:
X_train =  zscore_normalize_features(X_train)
X_test =  zscore_normalize_features(X_test)

Sigmoid function

g(x)=(1)/(1+e^(-z))
z= wx + b

In [17]:
def sigmoid(z):
    """
    Compute the sigmoid of z

    Args:
        z (ndarray): A scalar, numpy array of any size.

    Returns:
        g (ndarray): sigmoid(z), with the same shape as z
         
    """
    g = 1/(1+ np.exp(-z))
    
    return g

Cost Function

𝑙𝑜𝑠𝑠(𝑓𝐰,𝑏(𝐱(𝑖)),𝑦(𝑖))=−𝑦(𝑖)log(𝑓𝐰,𝑏(𝐱(𝑖)))−(1−𝑦(𝑖))log(1−𝑓𝐰,𝑏(𝐱(𝑖)))

In [18]:
def compute_cost(X, y, w, b, *argv):

    m, n = X.shape
    total_cost = 0
    
    for i in range(m):
        z_wb = 0
        for j in range(n):
            z_wb += (w[j]*X[i][j])
        z_wb += b
        
        f_wb = sigmoid(z_wb)
        
        loss = (-y[i]*math.log(f_wb))-((1-y[i])*math.log(1-f_wb))
        
        total_cost += loss
        
    total_cost = total_cost/m

    return total_cost

Gradient Function

∂𝐽(𝐰,𝑏)/∂𝑤𝑗 = (1/𝑚)∑𝑖=0 to 𝑚−1(𝑓𝐰,𝑏(𝐱(𝑖))−𝑦(𝑖))𝑥(𝑖)𝑗
∂𝐽(𝐰,𝑏)/∂𝑏 = (1/𝑚)∑𝑖=0 to 𝑚−1(𝑓𝐰,𝑏(𝐱(𝑖))−𝑦(𝑖))

In [19]:
def compute_gradient(X, y, w, b, *argv): 
    m, n = X.shape
    dj_dw = np.zeros(w.shape)
    dj_db = 0.
    for i in range(m):
        z_wb = 0
        for j in range(n): 
            z_wb += (w[j]*X[i][j])
        z_wb += b
        f_wb = sigmoid(z_wb)
        
        dj_db_i = f_wb - y[i]
        dj_db += dj_db_i
        
        for j in range(n):
            dj_dw[j] += (f_wb - y[i])*X[i][j]
            
    dj_dw = dj_dw/m
    dj_db = dj_db/m
    
    return dj_db, dj_dw

Gradient Descent

repeat until convergence:{
            𝑤𝑗=𝑤𝑗−𝛼∂𝐽(𝐰,𝑏)/∂𝑤𝑗
            𝑏=𝑏−𝛼∂𝐽(𝐰,𝑏)/∂𝑏              for j := 0..n-1(1)
        }

In [20]:
def gradient_descent(X, y, w_in, b_in, cost_function, gradient_function, alpha, num_iters, lambda_): 
    # number of training examples
    m = len(X)
    
    # An array to store cost J and w's at each iteration primarily for graphing later
    J_history = []
    w_history = []
    
    for i in range(num_iters):

        # Calculate the gradient and update the parameters
        dj_db, dj_dw = gradient_function(X, y, w_in, b_in, lambda_)   

        # Update Parameters using w, b, alpha and gradient
        w_in = w_in - alpha * dj_dw               
        b_in = b_in - alpha * dj_db              
       
        # Save cost J at each iteration
        if i<100000:      # prevent resource exhaustion 
            cost =  cost_function(X, y, w_in, b_in, lambda_)
            J_history.append(cost)

        # Print cost every at intervals 10 times or as many iterations if < 10
        if i% math.ceil(num_iters/10) == 0 or i == (num_iters-1):
            w_history.append(w_in)
            print(f"Iteration {i:4}: Cost {float(J_history[-1]):8.2f}   ")
        
    return w_in, b_in, J_history, w_history #return w and J,w history for graphing

Predict Function

In [21]:
def predict(X, w, b): 
    # number of training examples
    m, n = X.shape   
    p = np.zeros(m)
   
    ### START CODE HERE ### 
    # Loop over each example
    for i in range(m):   
        z_wb = 0
        # Loop over each feature
        for j in range(n): 
            # Add the corresponding term to z_wb
#             print(z_wb)
#             print(i, j)
#             print(w[j])
#             print(X[i][j])
            z_wb += (w[j]*X[i][j])
        
        # Add bias term 
        z_wb += b
        
        # Calculate the prediction for this example
        f_wb = sigmoid(z_wb)

        # Apply the threshold
        if(f_wb >= 0.5):
            p[i] = 1
        else:
            p[i] = 0
    return p

Training the model , calculating the values of w and b in f(w,b) = wx + b

In [23]:
np.random.seed(1)
m, n = X_train.shape
initial_w = np.zeros(n)
initial_b = 0

# Some gradient descent settings
iterations = 10000
alpha = 0.001

w,b, J_history,_ = gradient_descent(X_train ,Y_train, initial_w, initial_b, 
                   compute_cost, compute_gradient, alpha, iterations, 0)


Iteration    0: Cost     0.69   
Iteration 1000: Cost     0.50   
Iteration 2000: Cost     0.44   
Iteration 3000: Cost     0.40   
Iteration 4000: Cost     0.39   
Iteration 5000: Cost     0.38   
Iteration 6000: Cost     0.37   
Iteration 7000: Cost     0.36   
Iteration 8000: Cost     0.36   
Iteration 9000: Cost     0.36   
Iteration 9999: Cost     0.35   


In [25]:
print(f"b,w found by gradient descent: {b:0.2f},{w} ")

b,w found by gradient descent: -0.02,[-0.17416954 -0.49539023  0.58834699 -0.1534396  -0.12101612  0.02395421
  0.13230742  0.44269496 -0.50757117 -0.4632981   0.28352167 -0.55558603
 -0.49053903] 


Building predictive system

In [27]:
input_data = (50,0,1,120,244,0,1,162,0,1.1,2,0,2)

#change the input data to a numpy array
input_data_as_numpy_array = np.asarray(input_data)

#reshape the numpy array as we are predicting for only one instance
input_data_reshaped = input_data_as_numpy_array.reshape(1, -1)

prediction = predict(input_data_reshaped, w, b)
print(prediction)

if(prediction[0] == 0):
    print('The person does not have a heart disease')
else:
    print('The person have a heart disease')

[1.]
The person have a heart disease


Applying model on training data

In [30]:
p_train = predict(X_train, w, b)
print('Train Accuracy: %f'%(np.mean(p_train == Y_train) * 100))

Train Accuracy: 86.341463


Converting test data to numpy arrays

In [70]:
X_test = X_test.to_numpy()
Y_test = Y_test.to_numpy()

In [71]:
print(X_test)

[[ 0.1419897   0.6967835  -0.91133374 ... -0.61475342  0.23662426
  -1.97701199]
 [ 0.79123628 -1.43516602 -0.91133374 ... -0.61475342 -0.73353521
  -0.46474536]
 [-1.48112675  0.6967835  -0.91133374 ...  0.98049279 -0.73353521
   1.04752128]
 ...
 [-1.48112675 -1.43516602  0.08240784 ...  0.98049279 -0.73353521
  -0.46474536]
 [ 1.11585957  0.6967835  -0.91133374 ...  0.98049279 -0.73353521
   1.04752128]
 [-0.29084135  0.6967835  -0.91133374 ... -0.61475342 -0.73353521
  -3.48927862]]


Predicting answers for test data

In [74]:
m2,_ = X_test.shape
# print(m2, _)
p_test = predict(X_test, w, b)
for i in range(m2):
    print(f"prediction: {p_test[i]}, target value: {Y_test[i]}")

prediction: 0.0, target value: 0
prediction: 1.0, target value: 1
prediction: 1.0, target value: 0
prediction: 0.0, target value: 0
prediction: 0.0, target value: 1
prediction: 1.0, target value: 0
prediction: 1.0, target value: 1
prediction: 1.0, target value: 1
prediction: 0.0, target value: 0
prediction: 0.0, target value: 0
prediction: 1.0, target value: 1
prediction: 1.0, target value: 0
prediction: 1.0, target value: 1
prediction: 1.0, target value: 1
prediction: 1.0, target value: 0
prediction: 1.0, target value: 0
prediction: 1.0, target value: 1
prediction: 1.0, target value: 1
prediction: 1.0, target value: 1
prediction: 0.0, target value: 1
prediction: 1.0, target value: 0
prediction: 1.0, target value: 0
prediction: 0.0, target value: 0
prediction: 1.0, target value: 1
prediction: 1.0, target value: 1
prediction: 0.0, target value: 0
prediction: 0.0, target value: 0
prediction: 0.0, target value: 1
prediction: 1.0, target value: 1
prediction: 1.0, target value: 0
prediction

Calculating accuracy for test data

In [31]:
p_test = predict(X_test, w, b)
print('Test Accuracy: %f'%(np.mean(p_test == Y_test) * 100))

Test Accuracy: 80.000000


Model training Using inbuilt function of Scikit-learn

In [33]:
model = LogisticRegression()

In [34]:
#training the LogisticRegression model with Training data
model.fit(X_train, Y_train)

LogisticRegression()

Model Evaluation

Accuracy Score

In [36]:
#accuracy on training data
X_train_prediction = model.predict(X_train)
training_data_accuracy = accuracy_score(X_train_prediction, Y_train)

In [37]:
print('Accuracy score on Training data : ', training_data_accuracy)

Accuracy score on Training data :  0.8585365853658536


In [38]:
#accuracy on test data
X_test_prediction = model.predict(X_test)
test_data_accuracy = accuracy_score(X_test_prediction, Y_test)

In [39]:
print('Accuracy score on Test data : ', test_data_accuracy)

Accuracy score on Test data :  0.8097560975609757


Building a Predictive System

In [32]:
input_data = (51,1,0,140,298,0,1,122,1,4.2,1,3,3)

#change the input data to a numpy array
input_data_as_numpy_array = np.asarray(input_data)

#reshape the numpy array as we are predicting for only one instance
input_data_reshaped = input_data_as_numpy_array.reshape(1, -1)

prediction = model.predict(input_data_reshaped)
print(prediction)

if(prediction[0] == 0):
    print('The person does not have a heart disease')
else:
    print('The person have a heart disease')

[0]
The person does not have a heart disease
