We have a data which classified if patients have heart disease or not according to features in it. We will try to use this data to create a model which tries predict if a patient has this disease or not. We will use Classification algorithms.


*   Logistic Regression
*   Support Vector Machine
* K-Nearest Neighbor
* Decision Trees
* Random Forest
* CatBoost


In [5]:
pip install numpy pandas matplotlib seaborn scikit-learn


Note: you may need to restart the kernel to use updated packages.


In [6]:
# import warnings
# warnings.filterwarnings('ignore')
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LogisticRegression
from sklearn import svm
from sklearn.preprocessing import LabelEncoder

#For Model Evaluation
from sklearn.metrics import precision_recall_fscore_support as score
from sklearn.metrics import confusion_matrix, accuracy_score, roc_auc_score,f1_score,classification_report,recall_score
from sklearn.preprocessing import label_binarize

### **Read Data**

---



In [7]:
# from google.colab import drive
# drive.mount('/content/drive')

In [8]:
df = pd.read_csv('./heart.csv')

In [9]:
df.head()

Unnamed: 0,Age,Sex,ChestPainType,RestingBP,Cholesterol,FastingBS,RestingECG,MaxHR,ExerciseAngina,Oldpeak,ST_Slope,HeartDisease
0,40,M,ATA,140,289,0,Normal,172,N,0.0,Up,0
1,49,F,NAP,160,180,0,Normal,156,N,1.0,Flat,1
2,37,M,ATA,130,283,0,ST,98,N,0.0,Up,0
3,48,F,ASY,138,214,0,Normal,108,Y,1.5,Flat,1
4,54,M,NAP,150,195,0,Normal,122,N,0.0,Up,0


In [10]:
df.shape

(918, 12)

In [11]:
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 918 entries, 0 to 917
Data columns (total 12 columns):
 #   Column          Non-Null Count  Dtype  
---  ------          --------------  -----  
 0   Age             918 non-null    int64  
 1   Sex             918 non-null    object 
 2   ChestPainType   918 non-null    object 
 3   RestingBP       918 non-null    int64  
 4   Cholesterol     918 non-null    int64  
 5   FastingBS       918 non-null    int64  
 6   RestingECG      918 non-null    object 
 7   MaxHR           918 non-null    int64  
 8   ExerciseAngina  918 non-null    object 
 9   Oldpeak         918 non-null    float64
 10  ST_Slope        918 non-null    object 
 11  HeartDisease    918 non-null    int64  
dtypes: float64(1), int64(6), object(5)
memory usage: 86.2+ KB


Data contains;

* Age - age in years
* Sex - (1 = male; 0 = female)
* ChestPainType - chest pain type
* RestingBP - resting blood pressure (in mm Hg on admission to the hospital)
* Cholesterol - serum cholestoral in mg/dl
* FastingBS  - (fasting blood sugar > 120 mg/dl) (1 = true; 0 = false)
* RestingECG  - resting electrocardiographic results
* MaxHR - maximum heart rate achieved
* ExerciseAngina - exercise induced angina (1 = yes; 0 = no)
* Oldpeak - ST depression induced by exercise relative to rest
* ST_Slope - the slope of the peak exercise ST segment
* HeartDisease - have disease or not (1=yes, 0=no)

In [12]:
df.columns

Index(['Age', 'Sex', 'ChestPainType', 'RestingBP', 'Cholesterol', 'FastingBS',
       'RestingECG', 'MaxHR', 'ExerciseAngina', 'Oldpeak', 'ST_Slope',
       'HeartDisease'],
      dtype='object')

In [13]:
en = LabelEncoder()

In [14]:
# df.drop(columns=['ChestPainType','RestingECG','ST_Slope'])
cols = ['Sex','ChestPainType','RestingECG','ExerciseAngina','ST_Slope']
for col in cols:
    df[col] = en.fit_transform(df[col])

In [15]:
df.describe()

Unnamed: 0,Age,Sex,ChestPainType,RestingBP,Cholesterol,FastingBS,RestingECG,MaxHR,ExerciseAngina,Oldpeak,ST_Slope,HeartDisease
count,918.0,918.0,918.0,918.0,918.0,918.0,918.0,918.0,918.0,918.0,918.0,918.0
mean,53.510893,0.78976,0.781046,132.396514,198.799564,0.233115,0.989107,136.809368,0.404139,0.887364,1.361656,0.553377
std,9.432617,0.407701,0.956519,18.514154,109.384145,0.423046,0.631671,25.460334,0.490992,1.06657,0.607056,0.497414
min,28.0,0.0,0.0,0.0,0.0,0.0,0.0,60.0,0.0,-2.6,0.0,0.0
25%,47.0,1.0,0.0,120.0,173.25,0.0,1.0,120.0,0.0,0.0,1.0,0.0
50%,54.0,1.0,0.0,130.0,223.0,0.0,1.0,138.0,0.0,0.6,1.0,1.0
75%,60.0,1.0,2.0,140.0,267.0,0.0,1.0,156.0,1.0,1.5,2.0,1.0
max,77.0,1.0,3.0,200.0,603.0,1.0,2.0,202.0,1.0,6.2,2.0,1.0


In [16]:
df.groupby('HeartDisease').mean()

Unnamed: 0_level_0,Age,Sex,ChestPainType,RestingBP,Cholesterol,FastingBS,RestingECG,MaxHR,ExerciseAngina,Oldpeak,ST_Slope
HeartDisease,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1
0,50.55122,0.65122,1.192683,130.180488,227.121951,0.107317,0.94878,148.15122,0.134146,0.408049,1.739024
1,55.899606,0.901575,0.448819,134.185039,175.940945,0.334646,1.021654,127.655512,0.622047,1.274213,1.057087


In [17]:
df.head()

Unnamed: 0,Age,Sex,ChestPainType,RestingBP,Cholesterol,FastingBS,RestingECG,MaxHR,ExerciseAngina,Oldpeak,ST_Slope,HeartDisease
0,40,1,1,140,289,0,1,172,0,0.0,2,0
1,49,0,2,160,180,0,1,156,0,1.0,1,1
2,37,1,1,130,283,0,2,98,0,0.0,2,0
3,48,0,0,138,214,0,1,108,1,1.5,1,1
4,54,1,2,150,195,0,1,122,0,0.0,2,0


## **Data Preprocessing**

In [18]:
# RestingBP
# RestingBP represents the blood pressure of the patient. It is impossible to have values equal to 0; that's why I'll remove the outlier with value 0.
df = df.drop(df[(df['RestingBP'] == 0)].index)
# df.head()
# Dealing with the outliers (RestingBP)
q1 = df['RestingBP'].quantile(0.25)
q3 = df['RestingBP'].quantile(0.75)
iqr = q3-q1
Lower_tail = q1 - 1.5 * iqr
Upper_tail = q3 + 1.5 * iqr

u = df[(df['RestingBP'] >= Upper_tail) | (df['RestingBP'] <= Lower_tail)]  # | means "or"
u = pd.DataFrame(u)
print('Outliers on RestingBP:')
u.value_counts(u['HeartDisease'])


Outliers on RestingBP:


HeartDisease
1    28
0    13
dtype: int64

In [19]:
med = np.median(df['RestingBP'])
for i in df['RestingBP']:
    if i > Upper_tail or i < Lower_tail:
            df['RestingBP'] = df['RestingBP'].replace(i, med)

In [20]:
# Dealing with outliers (Cholesterol)
q1 = df['Cholesterol'].quantile(0.25)
q3 = df['Cholesterol'].quantile(0.75)
iqr = q3-q1
Lower_tail = q1 - 1.5 * iqr
Upper_tail = q3 + 1.5 * iqr

u = df[(df['Cholesterol'] >= Upper_tail) | (df['Cholesterol'] <= Lower_tail)]  # | means "or"
u = pd.DataFrame(u)
print('Outliers on Cholesterol:')
u.value_counts(df['HeartDisease'])

Outliers on Cholesterol:


HeartDisease
1    158
0     25
dtype: int64

In [21]:
# Median imputation (Cholesterol) just on upper tail
med = np.median(df['Cholesterol'])
for i in df['Cholesterol']:
    if i > Upper_tail:
            df['Cholesterol'] = df['Cholesterol'].replace(i, med)

In [22]:
# Dealing with outliers (Oldpeak)
q1 = df['Oldpeak'].quantile(0.25)
q3 = df['Oldpeak'].quantile(0.75)
iqr = q3-q1
Lower_tail = q1 - 1.5 * iqr
Upper_tail = q3 + 1.5 * iqr

u = df[(df['Oldpeak'] >= Upper_tail) | (df['Oldpeak'] <= Lower_tail)]  # | means "or"
u = pd.DataFrame(u)
u.value_counts(df['HeartDisease'])

HeartDisease
1    15
0     1
dtype: int64

In [23]:
x = df.drop('HeartDisease', axis = 1)
y = df['HeartDisease']

In [24]:
print(x)


     Age  Sex  ChestPainType  RestingBP  Cholesterol  FastingBS  RestingECG  \
0     40    1              1        140          289          0           1   
1     49    0              2        160          180          0           1   
2     37    1              1        130          283          0           2   
3     48    0              0        138          214          0           1   
4     54    1              2        150          195          0           1   
..   ...  ...            ...        ...          ...        ...         ...   
913   45    1              3        110          264          0           1   
914   68    1              0        144          193          1           1   
915   57    1              0        130          131          0           1   
916   57    0              1        130          236          0           0   
917   38    1              2        138          175          0           1   

     MaxHR  ExerciseAngina  Oldpeak  ST_Slope  
0  

In [25]:
print(y)

0      0
1      1
2      0
3      1
4      0
      ..
913    1
914    1
915    1
916    1
917    0
Name: HeartDisease, Length: 917, dtype: int64


In [26]:
x_train, x_test, y_train, y_test = train_test_split(x,y,test_size = 0.2, stratify= y, random_state = 2)

In [27]:
print(x_train.shape)
print(x_test.shape)

(733, 11)
(184, 11)


## **Building Logistic Regression Model**

---



In [28]:
# model = LogisticRegression(random_state = 0, max_iter = 1000).fit(x_train, y_train)
# y_pred = model.predict(x_test)
# x_train_pred = model.predict(x_train)
# training_data_accuracy = accuracy_score(x_train_pred, y_train)
# x_test_pred = model.predict(x_test)
# testing_data_accuracy = accuracy_score(x_test_pred, y_test)

In [29]:
# print("Accuracy of training data: ",training_data_accuracy * 100)
# print("Accuracy of testing data: ",testing_data_accuracy * 100)

## **Building SVM Model**

---



In [30]:
# classifier = svm.SVC(kernel= 'linear')
# classifier.fit(x_train, y_train)
# x_train_pred2 = classifier.predict(x_train)
# training_data_accuracy = accuracy_score(x_train_pred2, y_train)
# x_test_pred2 = classifier.predict(x_test)
# testing_data_accuracy = accuracy_score(x_test_pred2, y_test)

In [31]:
# print("Accuracy of training data: ",training_data_accuracy * 100)
# print("Accuracy of testing data: ",testing_data_accuracy * 100)

## **k-Nearest Neighbour**

In [32]:
# from sklearn.neighbors import KNeighborsClassifier

# knn = KNeighborsClassifier(n_neighbors=4)
# knn = knn.fit(x_train, y_train)
# y_pred = knn.predict(x_test)
# x_train_pred3 = knn.predict(x_train)
# training_data_accuracy = accuracy_score(x_train_pred3, y_train)
# x_test_pred3 = knn.predict(x_test)
# testing_data_accuracy = accuracy_score(x_test_pred3, y_test)

In [33]:
# print("Accuracy of training data: ",training_data_accuracy * 100)
# print("Accuracy of testing data: ",testing_data_accuracy * 100)

### **Decision Tree**

In [34]:
# from sklearn.tree import DecisionTreeClassifier

# dt = DecisionTreeClassifier(random_state=42)
# dt = dt.fit(x_train, y_train)
# y_pred = dt.predict(x_test)

# x_train_pred4 = dt.predict(x_train)
# training_data_accuracy = accuracy_score(x_train_pred4, y_train)
# x_test_pred4 = dt.predict(x_test)
# testing_data_accuracy = accuracy_score(x_test_pred4, y_test)

In [35]:
# # print("Accuracy of training data: ",training_data_accuracy * 100)
# print("Accuracy of testing data: ",testing_data_accuracy * 100)

## **Random Forest**

In [36]:
from sklearn.ensemble import RandomForestClassifier

# First model
RF = RandomForestClassifier(random_state=42, n_estimators=100)
                            
RF = RF.fit(x_train, y_train)
y_pred = RF.predict(x_test)

x_train_pred5 = RF.predict(x_train)
training_data_accuracy = accuracy_score(x_train_pred5, y_train)
x_test_pred5 = RF.predict(x_test)
testing_data_accuracy = accuracy_score(x_test_pred5, y_test)

In [37]:
# print("Accuracy of training data: ",training_data_accuracy * 100)
print("Accuracy of testing data: ",testing_data_accuracy * 100)

Accuracy of testing data:  90.21739130434783


## **CatBoost**

In [38]:
# !pip install catboost

In [39]:
# from catboost import CatBoostClassifier

# cat = CatBoostClassifier(iterations=80,
#                            learning_rate=1,
#                            depth=2)

# # Fit model

# cat.fit(x_train, y_train)
# # Get predicted classes
# preds_class = cat.predict(x_test)
# x_train_pred = cat.predict(x_train)
# training_data_accuracy = accuracy_score(x_train_pred, y_train)
# x_test_pred = cat.predict(x_test)
# testing_data_accuracy = accuracy_score(x_test_pred, y_test)

# # print(model.get_best_iteration())
# # print("Accuracy of training data: ",training_data_accuracy * 100)
# print("Accuracy of testing data: ",testing_data_accuracy * 100)

In [40]:
# models_accuracy_scores = {}
# models_accuracy_scores["Logistic Regression"] = [f1_score(y_test,model.predict(x_test),average="weighted"),
#                                                  model.score(x_test,y_test),
#                                                  recall_score(y_test,model.predict(x_test), average = 'binary')]
# models_accuracy_scores["SVM"] = [f1_score(y_test,classifier.predict(x_test),average="weighted"),
#                                                  classifier.score(x_test,y_test),
#                                                  recall_score(y_test,classifier.predict(x_test), average = 'binary')]
# models_accuracy_scores["K-Nearest Neighbors"] = [f1_score(y_test,knn.predict(x_test),average="weighted"),
#                                                  knn.score(x_test,y_test),
#                                                  recall_score(y_test,knn.predict(x_test), average = 'binary')]
# models_accuracy_scores["Random Forest"] = [f1_score(y_test,RF.predict(x_test),average="weighted"),
#                                                  RF.score(x_test,y_test),
#                                                  recall_score(y_test,RF.predict(x_test), average = 'binary')]
# models_accuracy_scores["Decision Tree"] = [f1_score(y_test,dt.predict(x_test),average="weighted"),
#                                                  dt.score(x_test,y_test),
#                                                  recall_score(y_test,dt.predict(x_test), average = 'binary')]
# models_accuracy_scores["CatBoost"] = [f1_score(y_test,cat.predict(x_test),average="weighted"),
#                                                  cat.score(x_test,y_test),
#                                                  recall_score(y_test,cat.predict(x_test), average = 'binary')]

In [41]:
# names = ["F1-Score","Accuracy","Recall"]
# df_scores = pd.DataFrame(models_accuracy_scores.values(),columns = names,index=models_accuracy_scores.keys())
    
# for names in df_scores.columns:
#     fig = plt.figure(figsize = (10, 5))
#     ax = sns.barplot(y=df_scores[names],x=df_scores.index)
#     ax.set_xticklabels(ax.get_xticklabels(), rotation=40, ha="right") 
#     plt.xlabel("Methods")
#     plt.ylabel(names)
#     plt.title("Best Perfomed Method")
#     plt.show()

In [42]:
# df_scores

## Review Model

In [43]:
def prediction(a,g,cpt,restingbp,c,f,restingECG,m,ea,op,st):
    # a = int(a)
    a = ((a-0.08)/(82-0.08))
    if cpt == "ATA":
        cpt = 1
    if cpt == "NAP":
        cpt = 2
    if cpt == "ASY":
        cpt = 3
    if cpt == "TA":
        cpt = 4

    if g == "male":
        g = 1
    elif g == "female":
         g = 0
    else:
         g = 2

    if st == "up":
        st = 0
    else:
        st = 1


    pred = RF.predict([[a,g,cpt,restingbp,c,f,restingECG,m,ea,op,st]])
    return pred[0]


#INPUT - FROM - USER
a = int(input("Enter your age : "))

g = input("Enter your gender : ")
g = g.lower()


cpt = input("Do you have cpt ? ATA, NAP, ASY, TA")
cpt = cpt.upper()

restingbp = int(input("Do you have any restingbp ? 0 or 1 : "))
# restingbp = ht.lower()

c = int(input("High Cholestrol: 1 or 0 "))
# m = m.lower()

f = int(input("High fastingBS: 1 or 0 "))

restingECG = int(input("High restingECG: 1 or 0 "))
# r = r.lower()
m = int(input("max hr: 1 or 0 "))

ea = int(input("Exercise angina: 1 or 0 "))

op = int(input("old peak: 1 or 0 "))
# gl = gl.lower()

st = input("ST_slope : up or down")
st = st.lower()

#call prediction function
op = prediction(a,g,cpt,restingbp,c,f,restingECG,m,ea,op,st)
if op == 1:
    print("Person has chances of having heart disease")
else:
    print("Patient has no risk of heart disease")


Enter your age : 19
Enter your gender : 1
Do you have cpt ? ATA, NAP, ASY, TA1
Do you have any restingbp ? 0 or 1 : 1
High Cholestrol: 1 or 0 1
High fastingBS: 1 or 0 1
High restingECG: 1 or 0 1
max hr: 1 or 0 1
Exercise angina: 1 or 0 1
old peak: 1 or 0 1
ST_slope : up or down1
Person has chances of having heart disease




In [48]:
pip install pickle

Note: you may need to restart the kernel to use updated packages.


ERROR: Could not find a version that satisfies the requirement pickle (from versions: none)
ERROR: No matching distribution found for pickle


In [50]:
import pickle
pickle.dump(RF,open('model.pkl','wb'))