# **1. Data Collection**

In [None]:
# Importing essential libraries
import numpy as np
import pandas as pd

In [None]:
# Load the dataset from 'Heart_Data.csv' (path is relative to the working directory).
# Later cells expect columns such as 'target', 'age', 'chol' and 'thal'.
data = pd.read_csv('Heart_Data.csv')

In [None]:
# Preview the first rows to confirm the file was parsed as expected
data.head()

# **2. Exploring the dataset**

In [None]:
# Dataset dimensions as a (rows, columns) tuple
data.shape

In [None]:
# Index object holding every column label
data.columns

In [None]:
# Data type of each column (float, int, object, bool, etc.)
data.dtypes

In [None]:
# First n rows via head(n); with no argument, the first 5 rows
data.head()

In [None]:
# Last n rows via tail(n); with no argument, the last 5 rows
data.tail()

In [None]:
# Per-column flag: True if the column contains at least one null value, else False
data.isnull().any()

In [None]:
# Prints a concise summary of the frame: column names, non-null counts,
# dtypes and memory usage
data.info()

In [None]:
# Summary statistics (count, mean, std, min, quartiles, max) for the numeric
# columns, transposed so that each row describes one column
data.describe().T

# **3. Data Visualization**

In [None]:
# Importing essential libraries
import matplotlib.pyplot as plt
%matplotlib inline
import seaborn as sns

In [None]:
# Draw a histogram for each column of the dataset on one 15x15-inch figure
figure_size = (15, 15)
data.hist(figsize=figure_size)
plt.tight_layout()  # tidy the spacing between neighbouring subplots
plt.show()

In [None]:
# Bar chart of the row count per 'target' class, used to judge class balance.
# countplot returns the Axes it drew on, so the labels are set on it directly.
g = sns.countplot(data=data, x='target')
g.set_xlabel('Target')
g.set_ylabel('Count')

# **4. Data Preprocessing**

In [None]:
# Count the missing values per column once, then list only the affected
# columns, most-affected first (an empty result means nothing is missing).
null_counts = data.isnull().sum()
null_counts[null_counts > 0].sort_values(ascending=False)

In [None]:
# Split the column names by dtype: object-typed columns count as categorical,
# every other dtype as numerical.
# NOTE(review): the features one-hot encoded later ('sex', 'cp', 'thal', ...)
# will be reported as numerical here if the CSV stores them as numbers — confirm.
object_frame = data.select_dtypes(include='object')
non_object_frame = data.select_dtypes(exclude='object')

cat_cols = object_frame.columns.tolist()
Num_cols = non_object_frame.columns.tolist()

print(f'categorical Columns: {cat_cols}')
print(f'numerical Columns: {Num_cols}')

In [None]:
# One-hot encode the listed categorical features; every other column is
# carried over unchanged into heart_data.
categorical_features = ['sex', 'cp', 'fbs', 'restecg', 'exang', 'slope', 'ca', 'thal']
heart_data = pd.get_dummies(data, columns=categorical_features)

# **5. Feature Engineering**

## 5.1 Feature Selection

In [None]:
# Correlation heatmap for inspecting how strongly the columns (including
# 'target') are related to one another before choosing model inputs

# Pairwise correlation between all columns; computed once and reused below
corr_matrix = data.corr()
top_corr_features = corr_matrix.index

# Plot the stored matrix directly: re-deriving it from data[top_corr_features]
# would repeat the same computation and produce the same values
plt.figure(figsize=(20,20))
sns.heatmap(data=corr_matrix, annot=True, cmap='RdYlGn')

## 5.2 Feature Scaling

In [None]:
# Column labels after one-hot encoding, to look up the generated dummy-column names
heart_data.columns

In [None]:
from sklearn.preprocessing import StandardScaler

# Standardize the continuous features in place (zero mean, unit variance);
# the one-hot columns and 'target' are left untouched.
# NOTE(review): the scaler is fitted on every row before cross-validation, so
# each held-out fold has already influenced the scaling statistics. Fitting it
# inside a per-fold pipeline would avoid that leak.
columns_to_scale = ['age', 'trestbps', 'chol', 'thalach', 'oldpeak']
standScaler = StandardScaler()
scaled_values = standScaler.fit_transform(heart_data[columns_to_scale])
heart_data[columns_to_scale] = scaled_values

In [None]:
# Preview the encoded and scaled frame
heart_data.head()

In [None]:
# Separate the label from the predictors: y is the 'target' column,
# X is every remaining column of heart_data.
y = heart_data['target']
X = heart_data.drop(columns='target')

# **6. Model Building**

## 6.1 KNeighbors Classifier Model

In [None]:
# Importing essential libraries
from sklearn.neighbors import KNeighborsClassifier
from sklearn.model_selection import cross_val_score

In [None]:
# Mean 10-fold cross-validated score for every neighbour count from 1 to 20,
# rounded to three decimals and collected in order of k.
neighbor_counts = range(1, 21)
knn_scores = []
for i in neighbor_counts:
    knn_classifier = KNeighborsClassifier(n_neighbors=i)
    cvs_scores = cross_val_score(knn_classifier, X, y, cv=10)
    mean_score = cvs_scores.mean()
    knn_scores.append(round(mean_score, 3))

In [None]:
# Line chart of the cross-validated score against K, with every point
# annotated by its (K, score) pair
k_values = list(range(1, 21))

plt.figure(figsize=(20,15))
plt.plot(k_values, knn_scores, color='red')
for k, score in zip(k_values, knn_scores):
    plt.text(k, score, (k, score))
plt.xticks(k_values)
plt.xlabel('Number of Neighbors (K)')
plt.ylabel('Scores')
plt.title('K Neighbors Classifier scores for different K values')

In [None]:
# Evaluating the knn classifier with k=12 via 10-fold cross-validation
# (this reports a score only; no fitted model is kept).
knn_classifier = KNeighborsClassifier(n_neighbors=12)
cvs_scores = cross_val_score(knn_classifier, X, y, cv=10)
# Scale to a percentage before rounding: round(x, 4)*100 can print binary
# floating-point artefacts such as 84.15000000000001
print("KNeighbours Classifier Accuracy with K=12 is: {}%".format(round(cvs_scores.mean() * 100, 2)))

## 6.2 Decision Tree Classifier

In [None]:
# Importing essential libraries
from sklearn.tree import DecisionTreeClassifier

In [None]:
# Mean 10-fold cross-validated score for tree depths 1 through 10.
# A fixed random_state keeps the sweep repeatable: the tree builder breaks
# ties between equally good splits at random, so unseeded scores can shift
# from one execution to the next.
decision_scores = []
for i in range(1, 11):
    decision_classifier = DecisionTreeClassifier(max_depth=i, random_state=42)
    cvs_scores = cross_val_score(decision_classifier, X, y, cv=10)
    decision_scores.append(round(cvs_scores.mean(), 3))

In [None]:
# Line chart of the cross-validated score against tree depth, with every
# point annotated by its (depth, score) pair
depth_values = list(range(1, 11))

plt.figure(figsize=(20,15))
plt.plot(depth_values, decision_scores, color='red')
for depth, score in zip(depth_values, decision_scores):
    plt.text(depth, score, (depth, score))
plt.xticks(depth_values)
plt.xlabel('Depth of Decision Tree (N)')
plt.ylabel('Scores')
plt.title('Decision Tree Classifier scores for different depth values')

In [None]:
# Evaluating the decision tree classifier with max_depth=3 via 10-fold
# cross-validation (this reports a score only; no fitted model is kept).
# random_state is fixed so the reported figure is repeatable across runs.
decision_classifier = DecisionTreeClassifier(max_depth=3, random_state=42)
cvs_scores = cross_val_score(decision_classifier, X, y, cv=10)
# Scale to a percentage before rounding: round(x, 4)*100 can print binary
# floating-point artefacts such as 84.15000000000001
print("Decision Tree Classifier Accuracy with max_depth=3 is: {}%".format(round(cvs_scores.mean() * 100, 2)))

## 6.3 Random Forest Classifier

In [None]:
# Importing essential libraries
from sklearn.ensemble import RandomForestClassifier

In [None]:
# Mean 5-fold cross-validated score for forests of 10, 20, ..., 100 trees.
# A fixed random_state keeps the sweep repeatable: each forest draws random
# bootstrap samples and feature subsets, so unseeded runs give different
# scores and can point to a different "best" n_estimators every time.
forest_scores = []
for i in range(10, 101, 10):
    forest_classifier = RandomForestClassifier(n_estimators=i, random_state=42)
    cvs_scores = cross_val_score(forest_classifier, X, y, cv=5)
    forest_scores.append(round(cvs_scores.mean(), 3))

In [None]:
# Line chart of the cross-validated score against the number of trees, with
# every point annotated by its (n_estimators, score) pair
estimator_counts = list(range(10, 101, 10))

plt.figure(figsize=(20,15))
plt.plot(estimator_counts, forest_scores, color='red')
for n_trees, score in zip(estimator_counts, forest_scores):
    plt.text(n_trees, score, (n_trees, score))
plt.xticks(estimator_counts)
plt.xlabel('Number of Estimators (N)')
plt.ylabel('Scores')
plt.title('Random Forest Classifier scores for different N values')

In [None]:
# Evaluating the random forest classifier with n_estimators=90 via 5-fold
# cross-validation (this reports a score only; no fitted model is kept).
# random_state is fixed so the reported figure is repeatable across runs.
forest_classifier = RandomForestClassifier(n_estimators=90, random_state=42)
cvs_scores = cross_val_score(forest_classifier, X, y, cv=5)
# Scale to a percentage before rounding: round(x, 4)*100 can print binary
# floating-point artefacts such as 84.15000000000001
print("Random Forest Classifier Accuracy with n_estimators=90 is: {}%".format(round(cvs_scores.mean() * 100, 2)))