# **Exploratory Data Analysis**:

Exploring the data before any data science or machine learning process is a crucial and mandatory step: it is how we understand the data and prepare for the steps that follow...

In this notebook we explore the Heart dataset by analysing the attributes/features that help in classifying whether a given person has heart disease or not.


# Importing all the Necessary libraries and loading the Dataset...


In [None]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns


# Location of the heart-disease CSV inside the Kaggle input directory.
path = "/kaggle/input/human-heart-disease-dataset/heart.csv"

df = pd.read_csv(path)

# Quick orientation: dimensions first, then the column labels, each followed by a blank line.
for overview in (df.shape, df.columns):
    print(overview, '\n')

# First six rows as a plain-text preview.
print(df.head(6))

# Renaming Columns..

In [None]:
# Replace the dataset's terse column codes with more descriptive labels in one pass.
# None of the new labels collides with an old one, so a single mapping is
# equivalent to renaming the columns one at a time.
readable_names = {
    'cp': 'chest_pain(4types)',
    'trestbps': 'restbp',
    'thalach': 'max_heart_rate',
    'exang': 'ex_agnia',
    'ca': 'major_vessels',
    'thal': 'disease_curable',
}
df.rename(columns=readable_names, inplace=True)

In [None]:
# Show the current column labels, followed by a blank line.
print(df.columns, "\n")


# Descriptive Statistical analysis...

In [None]:
# info() prints its per-column overview directly to the cell output.
df.info()

# Keep the summary-statistics table as the cell's final expression so the
# notebook renders it.
summary_statistics = df.describe()
summary_statistics


# Checking Whether Any Faulty Values Exist



In [None]:
# (heading, column) pairs whose distinct values are listed so that unexpected
# codes stand out.
inspected_columns = [
    ("different set of age groups\n", 'age'),
    ("types of genders\n", 'sex'),
    ("chest pain types\n", 'chest_pain(4types)'),
    ("range of resting blood pressure\n", 'restbp'),
    ("number of vessels\n", 'major_vessels'),
]
for heading, column in inspected_columns:
    print(heading, df[column].unique(), '\n')

# The last listing is printed without the trailing blank-line separator.
print("is the disease curable\n", df['disease_curable'].unique())


# `major_vessels` and `disease_curable` contain the values 4 and 3 respectively, which are not defined


In [None]:
# For each column, print how many rows carry the code flagged as undefined.
# .loc makes the label-based lookup into the value_counts() result explicit.
for column, undefined_code in (('major_vessels', 4), ('disease_curable', 3)):
    print(df[column].value_counts().loc[undefined_code])

# Removing 2 Columns Since they are not Well Defined

In [None]:
# Remove the two columns that contain undefined codes.
# errors='ignore' keeps this cell re-runnable: without it, executing the cell a
# second time raises KeyError because the columns are already gone.
df.drop(columns=['major_vessels', 'disease_curable'], inplace=True, errors='ignore')

In [None]:
# List the columns that remain in the frame.
remaining_columns = df.columns
print(remaining_columns)

# Exploring Data through Visualization...
**Univariate Analysis**

In [None]:
# Age distribution as a 20-bin histogram without a KDE overlay.
# (Per the original author, the bin count was picked using the Rice criterion.)
age_hist_style = dict(bins=20, kde=False, color='red', edgecolor='black')
sns.histplot(df['age'], **age_hist_style)
plt.show()

In [None]:
# Maximum-heart-rate distribution as a 20-bin histogram with a KDE curve on top.
# (Per the original author, the bin count was picked using the Rice criterion.)
heart_rate_hist_style = dict(bins=20, kde=True, color='red', edgecolor='black')
sns.histplot(df['max_heart_rate'], **heart_rate_hist_style)
plt.show()

In [None]:
# Pair plot restricted to two columns: the target and the maximum heart rate.
pair_columns = ["target", "max_heart_rate"]
sns.pairplot(data=df, vars=pair_columns)
plt.show()

In [None]:
# Bar chart of how many rows fall under each `sex` code.
sns.countplot(data=df, x='sex', color="blue")
plt.show()

# Print the same tallies as numbers: code 0 first, then code 1.
sex_counts = df['sex'].value_counts()
for sex_code in (0, 1):
    print(sex_counts.loc[sex_code])

In [None]:
# Contingency table: rows are `target` values, columns are `sex` codes.
target_by_sex = pd.crosstab(df['target'], df['sex'])

# One bar group per target value, one bar per sex code.
target_by_sex.plot.bar()
plt.show()

In [None]:
# Bar chart of how many rows fall under each chest-pain type.
sns.countplot(x="chest_pain(4types)", data=df, color="green")
plt.show()

# value_counts() orders its result by descending frequency, NOT by chest-pain
# code. The wedge labels are therefore taken from the index so each wedge is
# named after the type it actually represents (hard-coded 'Type 0..3' labels
# mislabel wedges whenever the frequency order differs from 0, 1, 2, 3).
count = df["chest_pain(4types)"].value_counts()
wedge_labels = [f'Type {cp_type}' for cp_type in count.index]

# Pull out only the first (most frequent) wedge. Sized from the data so the
# call does not fail if the column holds more or fewer than four distinct values.
explode = [0.05 if position == 0 else 0 for position in range(len(count))]

plt.pie(count, shadow=True, explode=explode, startangle=90)
plt.title("Pie Chart for chest pain types")
plt.legend(labels=wedge_labels, loc='upper right')
plt.show()

In [None]:
# Contingency table: rows are `target` values, columns are chest-pain types.
target_by_chest_pain = pd.crosstab(df['target'], df["chest_pain(4types)"])

target_by_chest_pain.plot.bar()
# Keep the x tick labels (almost) horizontal.
plt.xticks(rotation=1)
plt.show()


In [None]:
# Bar chart of how many rows fall under each `ex_agnia` code.
sns.countplot(x='ex_agnia', data=df, color="blue")
plt.show()

# Print the tallies for the column that was just plotted. The previous version
# printed the `sex` counts here, a copy-paste leftover from the `sex` cell.
# .get(..., 0) reports 0 instead of raising if a code is absent.
ex_agnia_counts = df['ex_agnia'].value_counts()
for ex_agnia_code in (0, 1):
    print(ex_agnia_counts.get(ex_agnia_code, 0))

# Relate the same feature to the target. The previous version cross-tabulated
# `sex` again, which the `sex` cell already covers.
pd.crosstab(df['ex_agnia'], df['target']).plot(kind='bar')
# Keep the x tick labels (almost) horizontal.
plt.xticks(rotation=1)
plt.show()

# Generating a Heatmap to Find the Correlation Between Different Attributes...

In [None]:
# Pairwise correlation between every pair of columns
# (0 = no relation, +1 = positive relation, -1 = negative relation).
correlation_matrix = df.corr()

# Draw on an explicitly created 12x8 axes, with the coefficient written in each cell.
fig, ax = plt.subplots(figsize=(12, 8))
sns.heatmap(correlation_matrix, annot=True, cmap="coolwarm", ax=ax)
ax.set_title("Correlation Matrix")
plt.show()


# Inference: chest_pain, max_heart_rate, slope, ex_agnia and oldpeak have the stronger positive and negative correlations with target



# Building Classification Model using scikit-learn...

In [None]:
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.ensemble import RandomForestClassifier
from sklearn.linear_model import LogisticRegression
from sklearn.svm import SVC
from sklearn.tree import DecisionTreeClassifier
from sklearn.neighbors import KNeighborsClassifier
from sklearn.metrics import accuracy_score, classification_report

# Fixed seed so the train/test split and the randomised estimators produce the
# same numbers on every Restart & Run All. Without it the reported accuracies
# (and which model comes out on top) change from run to run.
RANDOM_STATE = 42

# Features are every remaining column except the label.
X = df.drop('target', axis=1)
y = df['target']

# Hold out 20% of the rows for evaluation. stratify=y keeps the proportion of
# each target class the same in the training and test parts.
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.2, stratify=y, random_state=RANDOM_STATE
)

# Candidate models, all with default hyper-parameters apart from the seed on the
# randomised ones and a higher iteration cap for logistic regression.
# NOTE(review): SVM, KNN and logistic regression are fitted on unscaled
# features here; consider wrapping them in a StandardScaler pipeline before
# comparing them with the tree-based models.
classifiers = [
    ('Random Forest', RandomForestClassifier(random_state=RANDOM_STATE)),
    ('Logistic Regression', LogisticRegression(max_iter=1000)),
    ('SVM', SVC()),
    ('Decision Tree', DecisionTreeClassifier(random_state=RANDOM_STATE)),
    ('KNN', KNeighborsClassifier()),
]

# Fit each classifier on the training part and report its test-set metrics.
for name, model in classifiers:
    print(f"Training and evaluating {name}...")
    # Train the model
    model.fit(X_train, y_train)
    # Evaluate the model on the held-out rows
    y_pred = model.predict(X_test)
    accuracy = accuracy_score(y_test, y_pred)
    print("Accuracy:", accuracy)
    print("Classification Report:")
    print(classification_report(y_test, y_pred))


# Random Forest gives the highest accuracy...
# 