# import all Dependencies

In [54]:
import numpy as np 
import pandas as pd 

from sklearn.model_selection import train_test_split
from sklearn.linear_model import LogisticRegression
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import accuracy_score

# Data Collection

In [3]:
heart_data = pd.read_csv("heart_statlog_cleveland_hungary_final.csv")

In [4]:
heart_data.head(5)

Unnamed: 0,age,sex,chest pain type,resting bp s,cholesterol,fasting blood sugar,resting ecg,max heart rate,exercise angina,oldpeak,ST slope,target
0,40,1,2,140,289,0,0,172,0,0.0,1,0
1,49,0,3,160,180,0,0,156,0,1.0,2,1
2,37,1,2,130,283,0,1,98,0,0.0,1,0
3,48,0,4,138,214,0,0,108,1,1.5,2,1
4,54,1,3,150,195,0,0,122,0,0.0,1,0


In [6]:
heart_data.shape

(1190, 12)

In [7]:
heart_data.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 1190 entries, 0 to 1189
Data columns (total 12 columns):
 #   Column               Non-Null Count  Dtype  
---  ------               --------------  -----  
 0   age                  1190 non-null   int64  
 1   sex                  1190 non-null   int64  
 2   chest pain type      1190 non-null   int64  
 3   resting bp s         1190 non-null   int64  
 4   cholesterol          1190 non-null   int64  
 5   fasting blood sugar  1190 non-null   int64  
 6   resting ecg          1190 non-null   int64  
 7   max heart rate       1190 non-null   int64  
 8   exercise angina      1190 non-null   int64  
 9   oldpeak              1190 non-null   float64
 10  ST slope             1190 non-null   int64  
 11  target               1190 non-null   int64  
dtypes: float64(1), int64(11)
memory usage: 111.7 KB


In [9]:
heart_data.isnull().sum()

age                    0
sex                    0
chest pain type        0
resting bp s           0
cholesterol            0
fasting blood sugar    0
resting ecg            0
max heart rate         0
exercise angina        0
oldpeak                0
ST slope               0
target                 0
dtype: int64

In [10]:
heart_data = heart_data.dropna()

In [11]:
# statistical measure about data 
heart_data.describe()

Unnamed: 0,age,sex,chest pain type,resting bp s,cholesterol,fasting blood sugar,resting ecg,max heart rate,exercise angina,oldpeak,ST slope,target
count,1190.0,1190.0,1190.0,1190.0,1190.0,1190.0,1190.0,1190.0,1190.0,1190.0,1190.0,1190.0
mean,53.720168,0.763866,3.232773,132.153782,210.363866,0.213445,0.698319,139.732773,0.387395,0.922773,1.62437,0.528571
std,9.358203,0.424884,0.93548,18.368823,101.420489,0.409912,0.870359,25.517636,0.48736,1.086337,0.610459,0.499393
min,28.0,0.0,1.0,0.0,0.0,0.0,0.0,60.0,0.0,-2.6,0.0,0.0
25%,47.0,1.0,3.0,120.0,188.0,0.0,0.0,121.0,0.0,0.0,1.0,0.0
50%,54.0,1.0,4.0,130.0,229.0,0.0,0.0,140.5,0.0,0.6,2.0,1.0
75%,60.0,1.0,4.0,140.0,269.75,0.0,2.0,160.0,1.0,1.6,2.0,1.0
max,77.0,1.0,4.0,200.0,603.0,1.0,2.0,202.0,1.0,6.2,3.0,1.0


In [12]:
heart_data["target"].value_counts()

target
1    629
0    561
Name: count, dtype: int64

In [13]:
'''
1 -> Defective Heart
0 -> Healty Heart 
'''

'\n1 -> Defective Heart\n0 -> Healty Heart \n'

# Splitting categorical And Numerical

In [16]:
x = heart_data.drop(columns="target", axis=1)
y = heart_data["target"]

# Spiliting the data into Train & Test

In [18]:
x_train, x_test, y_train, y_test = train_test_split(x, y, test_size=0.2, random_state=42)

In [19]:
print(x.shape, x_train.shape, x_test.shape)

(1190, 11) (952, 11) (238, 11)


# Model Training 

## Logistic Regression

In [20]:
# Training the LR model 
lr_model = LogisticRegression()

In [21]:
lr_model.fit(x_train, y_train)

STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
  n_iter_i = _check_optimize_result(


# Model Evaluation

In [24]:
# aaccuracy_score of training data 
train_data_pred = lr_model.predict(x_train)
# accuracy score 
train_accuracy_score = accuracy_score(train_data_pred, y_train)
# display value 
print("Accuracy score of Training Data : ", train_accuracy_score)

Accuracy score of Training Data :  0.8098739495798319


In [27]:
# aaccuracy_score of testing data 
test_data_pred = lr_model.predict(x_test)
# accuracy score 
test_accuracy_score = accuracy_score(test_data_pred, y_test)
# display value 
print("Accuracy score of Testing Data : ", test_accuracy_score)

Accuracy score of Testing Data :  0.8319327731092437


# Build Predictive System 

In [35]:
import warnings
warnings.filterwarnings("ignore")

In [62]:
input_data = ([49, 0, 3, 160, 180, 0, 0, 156, 0, 1.0, 2])

# change input data into numpy array 
input_data_as_numpy_array = np.asarray(input_data)

# reshape the numpy array , as we are prrediction for only on instance 
input_data_reshaped = input_data_as_numpy_array.reshape(1, -1)

prediction = lr_model.predict(input_data_reshaped)
print(prediction)

if (prediction[0] == 0):
    print("The Person does not have Heart Disease")
else:
    print("The Person Has Heart Disease")

[0]
The Person does not have Heart Disease


In [63]:
print("Model Coefficients:", lr_model.coef_)


Model Coefficients: [[ 4.42629141e-04  9.97391250e-01  5.94019156e-01 -6.36784795e-03
  -2.66556583e-03  8.31813003e-01  7.79397687e-02 -2.37143087e-02
   1.06003777e+00  4.42606107e-01  9.40686016e-01]]


In [45]:
from sklearn.metrics import confusion_matrix
print(confusion_matrix(y_test, lr_model.predict(x_test)))


[[ 86  21]
 [ 19 112]]


# Random Forest Model 

In [59]:
rfr_model = RandomForestClassifier(n_estimators=100, random_state=42)

In [60]:
rfr_model.fit(x_train, y_train)

In [61]:
# aaccuracy_score of testing data 
test_data_pred = rfr_model.predict(x_test)
# accuracy score 
test_accuracy_score = accuracy_score(test_data_pred, y_test)
# display value 
print("Accuracy score of Testing Data : ", test_accuracy_score)

Accuracy score of Testing Data :  0.9453781512605042


In [65]:
input_data = ([49, 0, 3, 160, 180, 0, 0, 156, 0, 1.0, 2])

# change input data into numpy array 
input_data_as_numpy_array = np.asarray(input_data)

# reshape the numpy array , as we are prrediction for only on instance 
input_data_reshaped = input_data_as_numpy_array.reshape(1, -1)

prediction = rfr_model.predict(input_data_reshaped)
print(prediction)

if (prediction[0] == 0):
    print("The Person does not have Heart Disease")
else:
    print("The Person Has Heart Disease")

[1]
The Person Has Heart Disease


# Save the Model 

In [69]:
import joblib
# name 
filename ="Lr Heart Disease Prediction.pkl"
joblib.dump(lr_model, filename)

['Lr Heart Disease Prediction.pkl']

In [70]:
import joblib
# name 
filename =" RFClassifier Heart Disease Prediction.pkl"
joblib.dump(lr_model, filename)

[' RFClassifier Heart Disease Prediction.pkl']