We have a dataset that records whether patients have heart disease, together with a set of clinical features for each patient. We will use this data to build a model that tries to predict whether a patient has the disease. We will use the following classification algorithms:


*   Logistic Regression
*   Support Vector Machine
* K-Nearest Neighbor
* Decision Trees
* Random Forest
* CatBoost


In [None]:
pip install numpy pandas


In [1]:
# import warnings
# warnings.filterwarnings('ignore')
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LogisticRegression
from sklearn import svm
from sklearn.preprocessing import LabelEncoder

#For Model Evaluation
from sklearn.metrics import precision_recall_fscore_support as score
from sklearn.metrics import confusion_matrix, accuracy_score, roc_auc_score,f1_score,classification_report,recall_score
from sklearn.preprocessing import label_binarize

ModuleNotFoundError: No module named 'numpy'

### **Read Data**

---



In [None]:
# Mount Google Drive into the Colab runtime at /content/drive so files stored
# on the Drive can be read by path.
from google.colab import drive
drive.mount('/content/drive')

In [None]:
# Load the heart-disease CSV from the mounted Drive into a DataFrame.
df = pd.read_csv('/content/drive/MyDrive/heart.csv')

In [None]:
# Preview the first rows of the raw data.
df.head()

In [None]:
# (number of rows, number of columns) of the dataset.
df.shape

In [None]:
# Print a per-column summary (dtype and non-null count).
df.info()

The data contains:

* Age - age in years
* Sex - (1 = male; 0 = female)
* ChestPainType - chest pain type
* RestingBP - resting blood pressure (in mm Hg on admission to the hospital)
* Cholesterol - serum cholesterol in mg/dl
* FastingBS  - (fasting blood sugar > 120 mg/dl) (1 = true; 0 = false)
* RestingECG  - resting electrocardiographic results
* MaxHR - maximum heart rate achieved
* ExerciseAngina - exercise induced angina (1 = yes; 0 = no)
* Oldpeak - ST depression induced by exercise relative to rest
* ST_Slope - the slope of the peak exercise ST segment
* HeartDisease - have disease or not (1=yes, 0=no)

In [None]:
# List the column names.
df.columns

In [None]:
# Encoder used to turn the categorical columns into integer codes.
en = LabelEncoder()

In [None]:
# Label-encode the categorical columns in place.
# `en` is re-fitted for every column, so after the loop it only remembers the
# classes of the last column. Each column's classes are therefore recorded in
# `label_classes`: integer code i stands for label_classes[col][i].
cols = ['Sex','ChestPainType','RestingECG','ExerciseAngina','ST_Slope']
label_classes = {}
for col in cols:
    df[col] = en.fit_transform(df[col])
    label_classes[col] = list(en.classes_)

In [None]:
# Summary statistics of the numeric columns.
df.describe()

In [None]:
# Mean of every column within each HeartDisease class.
df.groupby('HeartDisease').mean()

In [None]:
# Preview the data again after label encoding.
df.head()

## **Data Preprocessing**

In [None]:
# RestingBP
# RestingBP is the patient's resting blood pressure. A reading of 0 is not
# possible, so rows with RestingBP == 0 are dropped.
df = df.drop(df[df['RestingBP'] == 0].index)

# Flag the remaining outliers (RestingBP) with the 1.5 * IQR rule.
q1 = df['RestingBP'].quantile(0.25)
q3 = df['RestingBP'].quantile(0.75)
iqr = q3 - q1
Lower_tail = q1 - 1.5 * iqr
Upper_tail = q3 + 1.5 * iqr

# Rows lying on or beyond either fence; boolean indexing already yields a
# DataFrame, so no extra pd.DataFrame() wrap is needed.
u = df[(df['RestingBP'] >= Upper_tail) | (df['RestingBP'] <= Lower_tail)]
print('Outliers on RestingBP:')
# Count the flagged rows per HeartDisease class. Counting on the column itself
# avoids handing a Series to DataFrame.value_counts, whose `subset` argument
# expects column labels.
u['HeartDisease'].value_counts()


In [None]:
# Replace RestingBP values strictly outside the IQR fences with the column
# median. A single vectorised pass replaces the former per-row loop, which
# rescanned the whole column with Series.replace() for every outlier it met.
med = np.median(df['RestingBP'])
outside_fences = (df['RestingBP'] > Upper_tail) | (df['RestingBP'] < Lower_tail)
df['RestingBP'] = df['RestingBP'].mask(outside_fences, med)

In [None]:
# Dealing with outliers (Cholesterol): flag them with the 1.5 * IQR rule.
q1 = df['Cholesterol'].quantile(0.25)
q3 = df['Cholesterol'].quantile(0.75)
iqr = q3 - q1
Lower_tail = q1 - 1.5 * iqr
Upper_tail = q3 + 1.5 * iqr

# Rows lying on or beyond either fence.
u = df[(df['Cholesterol'] >= Upper_tail) | (df['Cholesterol'] <= Lower_tail)]
print('Outliers on Cholesterol:')
# Count the flagged rows per HeartDisease class, using the column of `u`
# itself rather than a Series taken from the full frame.
u['HeartDisease'].value_counts()

In [None]:
# Median imputation (Cholesterol) just on the upper tail: values below the
# lower fence are left untouched. Done in one vectorised pass instead of a
# whole-column Series.replace() per outlier.
med = np.median(df['Cholesterol'])
above_upper_fence = df['Cholesterol'] > Upper_tail
df['Cholesterol'] = df['Cholesterol'].mask(above_upper_fence, med)

In [None]:
# Dealing with outliers (Oldpeak): flag them with the 1.5 * IQR rule.
# The outliers are only counted here; this cell does not modify df.
q1 = df['Oldpeak'].quantile(0.25)
q3 = df['Oldpeak'].quantile(0.75)
iqr = q3 - q1
Lower_tail = q1 - 1.5 * iqr
Upper_tail = q3 + 1.5 * iqr

# Rows lying on or beyond either fence.
u = df[(df['Oldpeak'] >= Upper_tail) | (df['Oldpeak'] <= Lower_tail)]
# Count the flagged rows per HeartDisease class, using the column of `u`
# itself rather than a Series taken from the full frame.
u['HeartDisease'].value_counts()

In [None]:
# Separate the target column from the predictors.
y = df['HeartDisease']
x = df.drop(columns=['HeartDisease'])

In [None]:
# Show the feature matrix.
print(x)


In [None]:
# Show the target vector.
print(y)

In [None]:
# Hold out 20% of the rows for testing. The split is stratified on the target
# and uses a fixed seed so it is reproducible.
x_train, x_test, y_train, y_test = train_test_split(
    x,
    y,
    test_size=0.2,
    random_state=2,
    stratify=y,
)

In [None]:
# Sanity-check the sizes of the training and test feature matrices.
print(x_train.shape)
print(x_test.shape)

## **Building Logistic Regression Model**

---



In [None]:
# model = LogisticRegression(random_state = 0, max_iter = 1000).fit(x_train, y_train)
# y_pred = model.predict(x_test)
# x_train_pred = model.predict(x_train)
# training_data_accuracy = accuracy_score(x_train_pred, y_train)
# x_test_pred = model.predict(x_test)
# testing_data_accuracy = accuracy_score(x_test_pred, y_test)

In [None]:
# print("Accuracy of training data: ",training_data_accuracy * 100)
# print("Accuracy of testing data: ",testing_data_accuracy * 100)

## **Building SVM Model**

---



In [None]:
# classifier = svm.SVC(kernel= 'linear')
# classifier.fit(x_train, y_train)
# x_train_pred2 = classifier.predict(x_train)
# training_data_accuracy = accuracy_score(x_train_pred2, y_train)
# x_test_pred2 = classifier.predict(x_test)
# testing_data_accuracy = accuracy_score(x_test_pred2, y_test)

In [None]:
# print("Accuracy of training data: ",training_data_accuracy * 100)
# print("Accuracy of testing data: ",testing_data_accuracy * 100)

## **k-Nearest Neighbour**

In [None]:
# from sklearn.neighbors import KNeighborsClassifier

# knn = KNeighborsClassifier(n_neighbors=4)
# knn = knn.fit(x_train, y_train)
# y_pred = knn.predict(x_test)
# x_train_pred3 = knn.predict(x_train)
# training_data_accuracy = accuracy_score(x_train_pred3, y_train)
# x_test_pred3 = knn.predict(x_test)
# testing_data_accuracy = accuracy_score(x_test_pred3, y_test)

In [None]:
# print("Accuracy of training data: ",training_data_accuracy * 100)
# print("Accuracy of testing data: ",testing_data_accuracy * 100)

### **Decision Tree**

In [None]:
# from sklearn.tree import DecisionTreeClassifier

# dt = DecisionTreeClassifier(random_state=42)
# dt = dt.fit(x_train, y_train)
# y_pred = dt.predict(x_test)

# x_train_pred4 = dt.predict(x_train)
# training_data_accuracy = accuracy_score(x_train_pred4, y_train)
# x_test_pred4 = dt.predict(x_test)
# testing_data_accuracy = accuracy_score(x_test_pred4, y_test)

In [None]:
# # print("Accuracy of training data: ",training_data_accuracy * 100)
# print("Accuracy of testing data: ",testing_data_accuracy * 100)

## **Random Forest**

In [None]:
from sklearn.ensemble import RandomForestClassifier

# First model: a random forest of 100 trees with a fixed seed.
RF = RandomForestClassifier(random_state=42, n_estimators=100)
RF = RF.fit(x_train, y_train)

# Predict each split once and reuse the result.
x_train_pred5 = RF.predict(x_train)
x_test_pred5 = RF.predict(x_test)
y_pred = x_test_pred5  # same test-set predictions, kept under the name this cell has always defined

# accuracy_score takes the true labels first, then the predictions.
training_data_accuracy = accuracy_score(y_train, x_train_pred5)
testing_data_accuracy = accuracy_score(y_test, x_test_pred5)

In [None]:
# print("Accuracy of training data: ",training_data_accuracy * 100)
# Report the test-set accuracy as a percentage.
print("Accuracy of testing data: ",testing_data_accuracy * 100)

## **CatBoost**

In [None]:
# !pip install catboost

In [None]:
# from catboost import CatBoostClassifier

# cat = CatBoostClassifier(iterations=80,
#                            learning_rate=1,
#                            depth=2)

# # Fit model

# cat.fit(x_train, y_train)
# # Get predicted classes
# preds_class = cat.predict(x_test)
# x_train_pred = cat.predict(x_train)
# training_data_accuracy = accuracy_score(x_train_pred, y_train)
# x_test_pred = cat.predict(x_test)
# testing_data_accuracy = accuracy_score(x_test_pred, y_test)

# # print(model.get_best_iteration())
# # print("Accuracy of training data: ",training_data_accuracy * 100)
# print("Accuracy of testing data: ",testing_data_accuracy * 100)

In [None]:
# models_accuracy_scores = {}
# models_accuracy_scores["Logistic Regression"] = [f1_score(y_test,model.predict(x_test),average="weighted"),
#                                                  model.score(x_test,y_test),
#                                                  recall_score(y_test,model.predict(x_test), average = 'binary')]
# models_accuracy_scores["SVM"] = [f1_score(y_test,classifier.predict(x_test),average="weighted"),
#                                                  classifier.score(x_test,y_test),
#                                                  recall_score(y_test,classifier.predict(x_test), average = 'binary')]
# models_accuracy_scores["K-Nearest Neighbors"] = [f1_score(y_test,knn.predict(x_test),average="weighted"),
#                                                  knn.score(x_test,y_test),
#                                                  recall_score(y_test,knn.predict(x_test), average = 'binary')]
# models_accuracy_scores["Random Forest"] = [f1_score(y_test,RF.predict(x_test),average="weighted"),
#                                                  RF.score(x_test,y_test),
#                                                  recall_score(y_test,RF.predict(x_test), average = 'binary')]
# models_accuracy_scores["Decision Tree"] = [f1_score(y_test,dt.predict(x_test),average="weighted"),
#                                                  dt.score(x_test,y_test),
#                                                  recall_score(y_test,dt.predict(x_test), average = 'binary')]
# models_accuracy_scores["CatBoost"] = [f1_score(y_test,cat.predict(x_test),average="weighted"),
#                                                  cat.score(x_test,y_test),
#                                                  recall_score(y_test,cat.predict(x_test), average = 'binary')]

In [None]:
# names = ["F1-Score","Accuracy","Recall"]
# df_scores = pd.DataFrame(models_accuracy_scores.values(),columns = names,index=models_accuracy_scores.keys())
    
# for names in df_scores.columns:
#     fig = plt.figure(figsize = (10, 5))
#     ax = sns.barplot(y=df_scores[names],x=df_scores.index)
#     ax.set_xticklabels(ax.get_xticklabels(), rotation=40, ha="right") 
#     plt.xlabel("Methods")
#     plt.ylabel(names)
#     plt.title("Best Perfomed Method")
#     plt.show()

In [None]:
# df_scores

## Review Model

In [None]:
def prediction(a,g,cpt,restingbp,c,f,restingECG,m,ea,op,st):
    """Predict heart disease for one patient with the fitted ``RF`` model.

    The string inputs are converted to the integer codes that the training
    columns received from ``LabelEncoder``, which numbers the categories of a
    column in sorted order. Every other value is handed to the model
    unchanged, in the same column order the model was fitted on.

    Parameters
    ----------
    a : age in years.
    g : ``"male"`` or ``"female"`` (lower case).
    cpt : chest pain type, one of ``"ASY"``, ``"ATA"``, ``"NAP"``, ``"TA"``
        (upper case). A non-string value is treated as an already-encoded
        code and passed through.
    restingbp : resting blood pressure in mm Hg.
    c : serum cholesterol in mg/dl.
    f : fasting blood sugar flag (1 if > 120 mg/dl, else 0).
    restingECG : resting ECG result, already encoded as an integer code.
    m : maximum heart rate achieved.
    ea : exercise-induced angina flag (1 = yes, 0 = no).
    op : Oldpeak, the ST depression induced by exercise relative to rest.
    st : ST slope, one of ``"up"``, ``"flat"``, ``"down"`` (lower case).

    Returns
    -------
    The class predicted by ``RF`` for this patient (1 = heart disease,
    0 = no heart disease).

    Raises
    ------
    ValueError
        If ``cpt`` is an unknown string or ``st`` is not a known slope.
    """
    # NOTE(review): these tables assume the raw CSV holds exactly the
    # categories ASY/ATA/NAP/TA and Down/Flat/Up, encoded in sorted order by
    # LabelEncoder -- confirm against the raw column values.
    chest_pain_codes = {"ASY": 0, "ATA": 1, "NAP": 2, "TA": 3}
    st_slope_codes = {"down": 0, "flat": 1, "up": 2}

    # Age is passed through as entered: the model was fitted on the unscaled
    # Age column, so rescaling it here would feed the model values far
    # outside anything it was trained on.

    if isinstance(cpt, str):
        if cpt not in chest_pain_codes:
            raise ValueError(f"Unknown chest pain type: {cpt!r}")
        cpt = chest_pain_codes[cpt]

    if g == "male":
        g = 1
    elif g == "female":
        g = 0
    else:
        # NOTE(review): 2 is kept for backward compatibility, but the data
        # description only lists codes 0 and 1 for Sex, so the model has
        # presumably never seen this value -- confirm the intended handling.
        g = 2

    if st not in st_slope_codes:
        raise ValueError(f"Unknown ST slope: {st!r}")
    st = st_slope_codes[st]

    pred = RF.predict([[a,g,cpt,restingbp,c,f,restingECG,m,ea,op,st]])
    return pred[0]


#INPUT - FROM - USER
# Every answer is collected in the unit of the matching training column; the
# continuous measurements are real readings, not 0/1 flags.
a = int(input("Enter your age : "))

g = input("Enter your gender (male or female) : ")
g = g.strip().lower()

cpt = input("Chest pain type (ATA, NAP, ASY or TA) : ")
cpt = cpt.strip().upper()

restingbp = int(input("Resting blood pressure (mm Hg) : "))

c = int(input("Serum cholesterol (mg/dl) : "))

f = int(input("Fasting blood sugar > 120 mg/dl ? 1 or 0 : "))

# NOTE(review): the codes in this prompt assume the raw RestingECG column
# holds LVH/Normal/ST, label-encoded in sorted order -- confirm against the
# raw column values.
restingECG = int(input("Resting ECG code (0 = LVH, 1 = Normal, 2 = ST) : "))

m = int(input("Maximum heart rate achieved : "))

ea = int(input("Exercise induced angina ? 1 or 0 : "))

# float() also accepts fractional readings, which int() rejected.
op = float(input("Oldpeak (ST depression) : "))

st = input("ST slope (up, flat or down) : ")
st = st.strip().lower()

#call prediction function
# The result gets its own name so the Oldpeak answer in `op` is not overwritten.
outcome = prediction(a,g,cpt,restingbp,c,f,restingECG,m,ea,op,st)
if outcome == 1:
    print("Person has chances of having heart disease")
else:
    print("Patient has no risk of heart disease")


In [None]:
import pickle

# Persist the fitted random forest, the classifier that prediction() uses.
# `model` is only assigned inside the commented-out logistic-regression cell,
# so dumping it would raise NameError. The `with` block also guarantees the
# file is flushed and closed.
with open('model.pkl', 'wb') as model_file:
    pickle.dump(RF, model_file)