### Heart Failure analysis and detection using Machine Learning techniques

In [1]:
# import the libraries
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.model_selection import train_test_split
from sklearn.metrics import classification_report
from sklearn.preprocessing import LabelEncoder
from sklearn.linear_model import LogisticRegression
from sklearn.preprocessing import StandardScaler
from sklearn.metrics import plot_confusion_matrix, accuracy_score, classification_report
from sklearn import metrics
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import RandomForestClassifier

In [2]:
## Read the data

In [3]:
# Load the records from "heart.csv" in the current working directory.
# pd.read_csv raises FileNotFoundError when the file is not there.
data = pd.read_csv("heart.csv")
# Preview the first rows of the loaded frame.
data.head()

FileNotFoundError: [Errno 2] No such file or directory: 'heart.csv'

In [None]:
# Basic information about the dataset

In [None]:
# Print the column names, dtypes and non-null counts of the frame.
data.info()

In [None]:
# (number of rows, number of columns)
data.shape

#### There are 918 rows and 12 columns

In [None]:
# Total number of cells in the frame (rows * columns).
data.size

#### The size of the dataset is 11016 (918 rows × 12 columns).
- 12 columns
- 6 integer columns
- 1 float column
- 5 object columns (categorical data)


#### Check null values

In [None]:
# Number of missing values in each column.
data.isna().sum()

In [None]:
# Heatmap of the missing-value mask: one cell per entry of the frame.
missing_mask = data.isna()
plt.figure(figsize=(16, 8))
sns.heatmap(missing_mask, cmap='BrBG')

In [None]:
# Show the first rows again before the per-column analysis.
data.head()

#### Data analysis

In [None]:
# analyze sex

In [None]:
# Number of records for each distinct value of the Sex column.
data['Sex'].value_counts()

In [None]:
# Bar chart of how many records fall under each Sex value.
plt.figure(figsize=(16, 8))
# Pass the column as ``x`` explicitly: seaborn 0.11 warned about positional
# use, and seaborn >= 0.12 takes the first positional argument as ``data``
# rather than ``x``.
sns.countplot(x=data['Sex'])

#### According to this dataset, which gender suffered more from heart disease?

In [None]:
# Group the HeartDisease column by Sex once and derive all three tallies.
heart_by_sex = data['HeartDisease'].groupby(data['Sex'])
tot = heart_by_sex.count()    # records per sex
suff = heart_by_sex.sum()     # sum of HeartDisease per sex
not_suf = tot - suff          # remainder per sex
not_suf

In [None]:
# ``not_suf`` already holds one aggregated count per sex, so draw those
# counts as bar heights. countplot would instead tally how often each count
# value occurs, giving one bar of height 1 per distinct count.
sns.barplot(x=not_suf.index, y=not_suf.values)

In [None]:
# ``suff`` already holds one aggregated count per sex, so draw those counts
# as bar heights. countplot would instead tally how often each count value
# occurs, giving one bar of height 1 per distinct count.
sns.barplot(x=suff.index, y=suff.values)

In [None]:
# ``suff`` is indexed by the Sex labels, not by integers. Plain ``suff[0]``
# relies on pandas' deprecated integer-position fallback, so select by
# position explicitly. groupby sorts its keys, so position 0 is the
# alphabetically first label (presumably the female label -- verify
# against the data).
female_suffered = suff.iloc[0]
male_suffered = suff.iloc[1]


In [None]:
# Female heart-disease cases as a share of all non-null Sex records.
total_gend = data['Sex'].count()
female_share = female_suffered / total_gend
print("Female suffered percent ", female_share * 100)

In [None]:
# Male heart-disease cases as a share of all non-null Sex records.
total_gend = data['Sex'].count()
male_share = male_suffered / total_gend
print("Male suffered percent ", male_share * 100)

#### Analyze the chest pain type

In [None]:
# Number of records for each distinct ChestPainType value.
data['ChestPainType'].value_counts()

In [None]:
# Bar chart of how many records fall under each ChestPainType value.
plt.figure(figsize=(16, 8))
# Pass the column as ``x`` explicitly: seaborn 0.11 warned about positional
# use, and seaborn >= 0.12 takes the first positional argument as ``data``
# rather than ``x``.
sns.countplot(x=data['ChestPainType'])

In [None]:
# Records per ChestPainType (non-null HeartDisease entries only).
count_me = data.groupby('ChestPainType')['HeartDisease'].count()

In [None]:
# Sum of the HeartDisease column within each ChestPainType group.
sum_me = data.groupby('ChestPainType')['HeartDisease'].sum()

In [None]:
# Display the per-ChestPainType sums of HeartDisease.
sum_me

In [None]:
# Per-ChestPainType remainder: group size minus the HeartDisease sum.
no_risk = count_me.sub(sum_me)

In [None]:
# ``sum_me`` and ``no_risk`` are indexed by ChestPainType labels. Plain
# ``[0]`` relies on pandas' deprecated integer-position fallback, so select
# by position explicitly. groupby sorts its keys, so position 0 is the
# alphabetically first label (the message assumes that is ASY -- verify
# against the data).
print("{} of people who have ASY got heart disease and {} of people who have ASY doesn't get heart disease.".format(sum_me.iloc[0], no_risk.iloc[0]))

In [None]:
# Show the first rows again before moving on to the next column.
data.head()

##### Relationship between RestingECG and HeartDisease

In [None]:
# Number of records for each distinct RestingECG value.
data["RestingECG"].value_counts()

In [None]:
# Histogram of the RestingECG column on a wide canvas.
ecg_values = data["RestingECG"]
plt.figure(figsize=(16, 8))
plt.hist(ecg_values, color='green')

In [None]:
# Show the first rows before the categorical columns are label-encoded.
data.head()

### Label Encoding

In [None]:
# Replace the Sex labels with integer codes; the encoder instance is reused
# for the remaining categorical columns.
labelencoder = LabelEncoder()
sex_codes = labelencoder.fit_transform(data["Sex"])
data["Sex"] = sex_codes

In [None]:
# Label-encode the remaining categorical columns in place, refitting the
# shared encoder on each column in turn (ST_Slope is fitted last).
for column in ("ChestPainType", "RestingECG", "ExerciseAngina", "ST_Slope"):
    data[column] = labelencoder.fit_transform(data[column])

In [None]:

#label = data["HeartDisease"].copy()


In [None]:
# Box plot of RestingBP to eyeball outliers. The column is passed as ``x``
# explicitly because seaborn >= 0.12 takes the first positional argument as
# ``data`` rather than ``x``.
sns.boxplot(x=data["RestingBP"])

In [None]:
# Box plot of Cholesterol to eyeball outliers. The column is passed as ``x``
# explicitly because seaborn >= 0.12 takes the first positional argument as
# ``data`` rather than ``x``.
sns.boxplot(x=data["Cholesterol"])

In [None]:
# Box plot of MaxHR to eyeball outliers. The column is passed as ``x``
# explicitly because seaborn >= 0.12 takes the first positional argument as
# ``data`` rather than ``x``.
sns.boxplot(x=data["MaxHR"])

In [None]:
#### Outlier removal, step 1: per-column quartiles and interquartile range
Q1, Q3 = data.quantile(0.25), data.quantile(0.75)
IQR = Q3 - Q1
IQR

In [None]:
# Drop every row with at least one value outside the 1.5 * IQR fences.
# NOTE(review): the fences are applied to every column of ``data``, which at
# this point still includes the label-encoded categoricals and the
# HeartDisease target -- confirm that is intended.
lower_fence = Q1 - 1.5 * IQR
upper_fence = Q3 + 1.5 * IQR
outlier_mask = (data < lower_fence) | (data > upper_fence)
data = data[~outlier_mask.any(axis=1)]
data.shape

In [None]:
# Box plot of RestingBP after the outlier filter. The column is passed as
# ``x`` explicitly because seaborn >= 0.12 takes the first positional
# argument as ``data`` rather than ``x``.
sns.boxplot(x=data["RestingBP"])

In [None]:
# Separate the target from the features: keep an independent copy of
# HeartDisease as the label and remove that column from the feature frame.
label = data["HeartDisease"].copy()
data = data.drop(columns=["HeartDisease"])

#### Train Test Split

In [None]:
# Hold out 20% of the rows for testing; the fixed random_state makes the
# split reproducible between runs.
X_train, X_test, y_train, y_test = train_test_split(data, label, test_size = 0.2, random_state = 42)


In [None]:
# Standardise the features. The scaler learns its mean and variance from the
# training set only; the test set is transformed with those same statistics.
# Refitting on the test set (fit_transform) would scale the two sets
# differently and leak test-set statistics into the evaluation.
scale = StandardScaler()
X_train = scale.fit_transform(X_train)
X_test = scale.transform(X_test)

In [None]:
# Fit a logistic-regression classifier and predict the held-out rows.
lor = LogisticRegression()
lor.fit(X_train, y_train)
y_pred = lor.predict(X_test)
# classification_report returns a multi-line string; print it so the table
# is rendered with real line breaks, as is done for the decision tree.
print(classification_report(y_test, y_pred))

In [None]:
# Confusion matrix of the logistic-regression model on the test set.
# plot_confusion_matrix was deprecated in scikit-learn 1.0 and removed in
# 1.2; ConfusionMatrixDisplay.from_estimator (scikit-learn >= 1.0) is its
# replacement.
metrics.ConfusionMatrixDisplay.from_estimator(lor, X_test, y_test)
plt.show()

In [None]:
# Test-set accuracy of the logistic-regression model, as a percentage.
lor_accuracy_pct = accuracy_score(y_test, y_pred) * 100
print('Accuracy of Logistic Regression model is {}'.format(lor_accuracy_pct))

In [None]:
# ROC curve of the logistic-regression model on the test set.
# metrics.plot_roc_curve was deprecated in scikit-learn 1.0 and removed in
# 1.2; RocCurveDisplay.from_estimator (scikit-learn >= 1.0) is its
# replacement.
metrics.RocCurveDisplay.from_estimator(lor, X_test, y_test)

#### Decision Tree

In [None]:
# Fit an entropy-based decision tree of depth <= 6 and predict the test
# rows. random_state is fixed (same seed as the train/test split) so that
# tie-breaking between equally good splits, and therefore the reported
# metrics, are reproducible between runs.
mdl = DecisionTreeClassifier(criterion="entropy", max_depth=6, random_state=42)
mdl.fit(X_train, y_train)
y_p = mdl.predict(X_test)


In [None]:
# Print the classification report for the decision-tree predictions.
print(classification_report(y_test, y_p))

In [None]:
# Confusion matrix of the decision-tree model on the test set.
# plot_confusion_matrix was deprecated in scikit-learn 1.0 and removed in
# 1.2; ConfusionMatrixDisplay.from_estimator (scikit-learn >= 1.0) is its
# replacement.
metrics.ConfusionMatrixDisplay.from_estimator(mdl, X_test, y_test)
plt.show()

In [None]:
# Test-set accuracy of the decision-tree model, as a percentage.
tree_accuracy_pct = accuracy_score(y_test, y_p) * 100
print('Accuracy of Decision Tree model is {}'.format(tree_accuracy_pct))

In [None]:
# ROC curve of the decision-tree model on the test set.
# metrics.plot_roc_curve was deprecated in scikit-learn 1.0 and removed in
# 1.2; RocCurveDisplay.from_estimator (scikit-learn >= 1.0) is its
# replacement.
metrics.RocCurveDisplay.from_estimator(mdl, X_test, y_test)

#### Random Forest

In [None]:
# Fit a 100-tree random forest and predict the test rows. The forest draws
# bootstrap samples and feature subsets at random, so random_state is fixed
# (same seed as the train/test split) to make the reported metrics
# reproducible between runs.
clf = RandomForestClassifier(n_estimators=100, random_state=42)
clf.fit(X_train, y_train)
pp = clf.predict(X_test)

In [None]:
# Confusion matrix of the random-forest model on the test set.
# plot_confusion_matrix was deprecated in scikit-learn 1.0 and removed in
# 1.2; ConfusionMatrixDisplay.from_estimator (scikit-learn >= 1.0) is its
# replacement.
metrics.ConfusionMatrixDisplay.from_estimator(clf, X_test, y_test)
plt.show()

In [None]:
# Test-set accuracy of the random-forest model, as a percentage.
forest_accuracy_pct = accuracy_score(y_test, pp) * 100
print('Accuracy of Random forest classifier model is {}'.format(forest_accuracy_pct))

In [None]:
# ROC curve of the random-forest model on the test set.
# metrics.plot_roc_curve was deprecated in scikit-learn 1.0 and removed in
# 1.2; RocCurveDisplay.from_estimator (scikit-learn >= 1.0) is its
# replacement.
metrics.RocCurveDisplay.from_estimator(clf, X_test, y_test)