<a href="https://colab.research.google.com/github/Neethu0207/Projects/blob/main/heart_prediction.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>


# HEART DISEASE PREDICTION


# **Importing libraries and loading dataset**

In [None]:
import numpy as np
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import accuracy_score
from sklearn.ensemble import RandomForestClassifier, GradientBoostingClassifier
from sklearn.naive_bayes import GaussianNB
from sklearn.neighbors import KNeighborsClassifier
from sklearn.svm import SVC
from sklearn.preprocessing import StandardScaler
from sklearn.tree import DecisionTreeClassifier
from sklearn.metrics import classification_report, recall_score, roc_curve, roc_auc_score
from sklearn.metrics import confusion_matrix, precision_score, f1_score
import matplotlib.pyplot as plt

# Read and process the dataset

In [None]:
# Load the heart-disease CSV into a DataFrame.
# NOTE(review): the path is hard-coded to a Colab session upload
# ('/content/heart (1).csv'); it has to be re-uploaded under that exact name
# for a fresh runtime to get past this cell.
heart_data=pd.read_csv('/content/heart (1).csv')

In [None]:
# Preview the first rows to sanity-check the columns that were loaded.
heart_data.head()

In [None]:
# (rows, columns) of the loaded frame.
heart_data.shape

In [None]:
# Column dtypes and non-null counts.
heart_data.info()

# Checking for null values in the dataset


In [None]:
# Number of missing values per column.
heart_data.isnull().sum()

# Total missing percent of data

In [None]:
# Share of all cells in the frame that are missing, as a percentage.
total_cells = np.prod(heart_data.shape)   # rows * columns
missing_data = heart_data.isna().sum()    # per-column missing counts
total_missing = missing_data.sum()        # missing cells across the whole frame
missing_percent = (total_missing / total_cells) * 100
print(missing_percent)

# Checking for duplicates

In [None]:
# Count rows that exactly repeat an earlier row (the first occurrence is not counted).
duplicate = heart_data.duplicated(keep='first').sum()
print(duplicate)


In [None]:
# Drop duplicate rows.
# drop_duplicates() returns a NEW frame; without rebinding the name the
# duplicates stay in `heart_data` and leak into every model trained below.
heart_data = heart_data.drop_duplicates()

# Show the size after de-duplication so the effect is visible in the output.
heart_data.shape


# Describing dataset

In [None]:
# Summary statistics for every numeric column.
heart_data.describe()

In [None]:
# Class balance: how many rows fall under each value of the target column.
heart_data.target.value_counts()

# Separating features and target

In [None]:
# Label vector: the 'target' column.
Y = heart_data['target']
# Feature matrix: every remaining column.
X = heart_data.drop(columns='target')


In [None]:
# Plain-text dump of the feature matrix (pandas truncates long frames).
print(X)

In [None]:
# Plain-text dump of the label vector.
print(Y)

# Splitting the dataset into train and test data

In [None]:
# Hold out 20% of the rows for testing; the fixed seed keeps the split reproducible.
X_train, X_test, Y_train, Y_test = train_test_split(
    X,
    Y,
    random_state=2,
    test_size=0.2,
)

In [None]:
# Shapes of the full, training and test feature matrices.
print(X.shape, X_train.shape, X_test.shape)

# **Implementing Logistic Regression**

In [None]:
# Logistic regression with scikit-learn's default hyperparameters.
# NOTE(review): the features are fed in unscaled (StandardScaler is imported
# but never used); if fitting emits a ConvergenceWarning, scaling the inputs
# or raising max_iter is the usual remedy -- confirm on a real run.
model = LogisticRegression()

In [None]:
# Fit the logistic-regression model on the training split.
model.fit(X=X_train, y=Y_train)

In [None]:
# Accuracy on the training split.
# scikit-learn metrics take (y_true, y_pred). Accuracy happens to be symmetric,
# but keeping the documented order avoids silently wrong numbers if this call
# is ever swapped for precision/recall.
X_train_prediction = model.predict(X_train)
training_data_accuracy = accuracy_score(Y_train, X_train_prediction)

In [None]:
# Report the logistic-regression training accuracy computed in the previous cell.
print('Accuracy on Training data : ', training_data_accuracy)

In [None]:
# Accuracy on the held-out test split.
# Arguments are passed in scikit-learn's documented (y_true, y_pred) order.
X_test_prediction = model.predict(X_test)
test_data_accuracy = accuracy_score(Y_test, X_test_prediction)

In [None]:
# Report the logistic-regression test accuracy computed in the previous cell.
print('Accuracy on Test data : ', test_data_accuracy)

In [None]:
# One hand-written sample, given in the same column order as X.
# NOTE(review): some entries were padded in only to reach the expected 13
# features, so they may not be plausible for their columns -- confirm the
# values before reading anything into the prediction.
input_data = (62,1,0,140,268,0,3.6,0,2,2,3.6,0,2.3)

input_data_as_numpy_array = np.asarray(input_data)

# A single row with as many columns as there are values.
input_data_reshaped = input_data_as_numpy_array.reshape(1, -1)

# The model was fitted on a DataFrame, so predict on a DataFrame with the same
# column names. A bare ndarray makes scikit-learn warn about missing feature
# names, and this constructor also fails loudly if the value count is wrong.
input_frame = pd.DataFrame(input_data_reshaped, columns=X.columns)

prediction = model.predict(input_frame)
print(prediction)

if prediction[0] == 0:
  print('The Person does not have a Heart Disease')
else:
  print('The Person has Heart Disease')

In [None]:
# Visualize the confusion matrix
from sklearn.metrics import confusion_matrix
import seaborn as sns
import matplotlib.pyplot as plt


y_pred = model.predict(X_test)
conf_matrix = confusion_matrix(Y_test, y_pred)

# Draw on an explicit Axes rather than the implicit pyplot state.
fig, ax = plt.subplots()
sns.heatmap(conf_matrix, annot=True, fmt='d', cmap='Blues', ax=ax)
ax.set_xlabel('Predicted Label')
ax.set_ylabel('True Label')
ax.set_title('Confusion Matrix for Logistic Regression')
plt.show()

# **Support Vector Machine**

In [None]:
from sklearn.svm import SVC

In [None]:
# Linear-kernel support vector classifier.
# Note: this rebinds `model`, replacing the logistic-regression estimator.
model = SVC(kernel='linear')
model.fit(X=X_train, y=Y_train)

In [None]:

# SVM accuracy on the training split, with arguments in scikit-learn's
# documented (y_true, y_pred) order.
X_train_prediction = model.predict(X_train)
training_data_accuracy = accuracy_score(Y_train, X_train_prediction)

In [None]:
# Report the SVM training accuracy computed in the previous cell.
print('Accuracy on Training data : ', training_data_accuracy)

In [None]:

# SVM accuracy on the held-out test split, with arguments in scikit-learn's
# documented (y_true, y_pred) order.
X_test_prediction = model.predict(X_test)
test_data_accuracy = accuracy_score(Y_test, X_test_prediction)

In [None]:
# Report the SVM test accuracy computed in the previous cell.
print('Accuracy on Test data : ', test_data_accuracy)

In [None]:
# One hand-written sample, given in the same column order as X.
# NOTE(review): the values look like placeholders rather than a real patient
# record -- confirm them before interpreting the prediction.
input_data = (62,1,0,140,268,3.6,0,160,0,3.6,0,12,89)
input_data_as_numpy_array = np.asarray(input_data)
input_data_reshaped = input_data_as_numpy_array.reshape(1, -1)

# Predict on a DataFrame carrying the training column names; a bare ndarray
# makes scikit-learn warn about missing feature names.
input_frame = pd.DataFrame(input_data_reshaped, columns=X.columns)

prediction = model.predict(input_frame)
print(prediction)

if prediction[0] == 0:
  print('The Person does not have a Heart Disease')
else:
  print('The Person has Heart Disease')

In [None]:
# Visualize the confusion matrix
y_pred = model.predict(X_test)
conf_matrix = confusion_matrix(Y_test, y_pred)

# Draw on an explicit Axes rather than the implicit pyplot state.
fig, ax = plt.subplots()
sns.heatmap(conf_matrix, annot=True, fmt='d', cmap='Blues', ax=ax)
ax.set_xlabel('Predicted Label')
ax.set_ylabel('True Label')
ax.set_title('Confusion Matrix for Svm')
plt.show()

# Random Forest


In [None]:
import numpy as np
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score
from sklearn.ensemble import RandomForestClassifier

# Rebuild the label vector and feature matrix from the current frame.
Y = heart_data['target']
X = heart_data.drop(columns='target')

In [None]:
# Same 80/20 split and seed as the other sections, so models are comparable.
X_train, X_test, Y_train, Y_test = train_test_split(
    X,
    Y,
    random_state=2,
    test_size=0.2,
)
# Random forest of 100 trees with a fixed seed for reproducible results.
rf_model = RandomForestClassifier(random_state=42, n_estimators=100)
rf_model.fit(X=X_train, y=Y_train)

In [None]:
# Random-forest accuracy on both splits. Arguments follow scikit-learn's
# documented (y_true, y_pred) order; the value is the same because accuracy
# is symmetric, but other metrics are not.
X_train_prediction = rf_model.predict(X_train)
training_data_accuracy = accuracy_score(Y_train, X_train_prediction)
print('Accuracy on Training data : ', training_data_accuracy)

X_test_prediction = rf_model.predict(X_test)
test_data_accuracy = accuracy_score(Y_test, X_test_prediction)
print('Accuracy on Test data : ', test_data_accuracy)

In [None]:
# One hand-written sample, given in the same column order as X.
# NOTE(review): the values look like placeholders rather than a real patient
# record -- confirm them before interpreting the prediction.
input_data = (62,1,0,140,268,3.6,0,160,0,3.6,0,12,13)
input_data_as_numpy_array = np.asarray(input_data)
input_data_reshaped = input_data_as_numpy_array.reshape(1, -1)

# Predict on a DataFrame carrying the training column names; a bare ndarray
# makes scikit-learn warn about missing feature names.
input_frame = pd.DataFrame(input_data_reshaped, columns=X.columns)

prediction = rf_model.predict(input_frame)
print(prediction)

if prediction[0] == 0:
  print('The Person does not have a Heart Disease')
else:
  print('The Person has Heart Disease')

In [None]:
# Visualize the confusion matrix
y_pred = rf_model.predict(X_test)
conf_matrix = confusion_matrix(Y_test, y_pred)

# Draw on an explicit Axes rather than the implicit pyplot state.
fig, ax = plt.subplots()
sns.heatmap(conf_matrix, annot=True, fmt='d', cmap='Blues', ax=ax)
ax.set_xlabel('Predicted Label')
ax.set_ylabel('True Label')
ax.set_title('Confusion Matrix for Randomforest')
plt.show()

# **GradientBoosting Classifier**

In [None]:
import numpy as np
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score
from sklearn.ensemble import GradientBoostingClassifier

# Rebuild the label vector and feature matrix from the current frame.
Y = heart_data['target']
X = heart_data.drop(columns='target')

In [None]:
# Same 80/20 split and seed as the other sections, so models are comparable.
X_train, X_test, Y_train, Y_test = train_test_split(
    X,
    Y,
    random_state=2,
    test_size=0.2,
)
# Gradient boosting with 100 stages and a fixed seed for reproducible results.
gb_model = GradientBoostingClassifier(random_state=42, n_estimators=100)
gb_model.fit(X=X_train, y=Y_train)

In [None]:
# Gradient-boosting accuracy on both splits. Arguments follow scikit-learn's
# documented (y_true, y_pred) order; the value is the same because accuracy
# is symmetric, but other metrics are not.
X_train_prediction = gb_model.predict(X_train)
training_data_accuracy = accuracy_score(Y_train, X_train_prediction)
print('Accuracy on Training data : ', training_data_accuracy)

X_test_prediction = gb_model.predict(X_test)
test_data_accuracy = accuracy_score(Y_test, X_test_prediction)
print('Accuracy on Test data : ', test_data_accuracy)

In [None]:
# One hand-written sample, given in the same column order as X.
# NOTE(review): the values look like placeholders rather than a real patient
# record -- confirm them before interpreting the prediction.
input_data = (62,1,0,140,268,3.6,0,160,0,3.6,0,13,14)
input_data_as_numpy_array = np.asarray(input_data)
input_data_reshaped = input_data_as_numpy_array.reshape(1, -1)

# Predict on a DataFrame carrying the training column names; a bare ndarray
# makes scikit-learn warn about missing feature names.
input_frame = pd.DataFrame(input_data_reshaped, columns=X.columns)

prediction = gb_model.predict(input_frame)
print(prediction)

if prediction[0] == 0:
  print('The Person does not have a Heart Disease')
else:
  print('The Person has Heart Disease')

In [None]:
# Visualize the confusion matrix
y_pred = gb_model.predict(X_test)
conf_matrix = confusion_matrix(Y_test, y_pred)

# Draw on an explicit Axes rather than the implicit pyplot state.
fig, ax = plt.subplots()
sns.heatmap(conf_matrix, annot=True, fmt='d', cmap='Blues', ax=ax)
ax.set_xlabel('Predicted Label')
ax.set_ylabel('True Label')
ax.set_title('Confusion Matrix for Gradient boosting')
plt.show()

# **Gaussian Naive Bayes**

In [None]:
import numpy as np
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score
from sklearn.naive_bayes import GaussianNB

# Rebuild the label vector and feature matrix from the current frame.
Y = heart_data['target']
X = heart_data.drop(columns='target')

In [None]:
# Re-create the 80/20 split here, as every other model section does, instead
# of silently reusing whatever split the previous section left in the kernel.
# Same seed and inputs, so the rows are the same as for the other models.
X_train, X_test, Y_train, Y_test = train_test_split(X, Y, test_size=0.2, random_state=2)

# Gaussian naive Bayes with default settings.
gnb_model = GaussianNB()
gnb_model.fit(X_train, Y_train)


In [None]:

# Naive-Bayes accuracy on both splits. Arguments follow scikit-learn's
# documented (y_true, y_pred) order; the value is the same because accuracy
# is symmetric, but other metrics are not.
X_train_prediction = gnb_model.predict(X_train)
training_data_accuracy = accuracy_score(Y_train, X_train_prediction)
print('Accuracy on Training data : ', training_data_accuracy)

X_test_prediction = gnb_model.predict(X_test)
test_data_accuracy = accuracy_score(Y_test, X_test_prediction)
print('Accuracy on Test data : ', test_data_accuracy)

In [None]:
# One hand-written sample, given in the same column order as X.
# NOTE(review): the values look like placeholders rather than a real patient
# record -- confirm them before interpreting the prediction.
input_data = (62,1,0,140,268,3.6,0,160,0,3.6,0,11,15)
input_data_as_numpy_array = np.asarray(input_data)
input_data_reshaped = input_data_as_numpy_array.reshape(1, -1)

# Predict on a DataFrame carrying the training column names; a bare ndarray
# makes scikit-learn warn about missing feature names.
input_frame = pd.DataFrame(input_data_reshaped, columns=X.columns)

prediction = gnb_model.predict(input_frame)
print(prediction)

if prediction[0] == 0:
  print('The Person does not have a Heart Disease')
else:
  print('The Person has Heart Disease')

In [None]:
# Visualize the confusion matrix
y_pred = gnb_model.predict(X_test)
conf_matrix = confusion_matrix(Y_test, y_pred)

# Draw on an explicit Axes rather than the implicit pyplot state.
fig, ax = plt.subplots()
sns.heatmap(conf_matrix, annot=True, fmt='d', cmap='Blues', ax=ax)
ax.set_xlabel('Predicted Label')
ax.set_ylabel('True Label')
ax.set_title('Confusion Matrix for Gaussian Naive Bayes')
plt.show()

# **KNeighbors Classifier**

In [None]:
import numpy as np
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score
from sklearn.neighbors import KNeighborsClassifier

# Rebuild the label vector and feature matrix from the current frame.
Y = heart_data['target']
X = heart_data.drop(columns='target')

In [None]:
# Same 80/20 split and seed as the other sections, so models are comparable.
X_train, X_test, Y_train, Y_test = train_test_split(X, Y, test_size=0.2, random_state=2)

# k-nearest-neighbours classifier voting over the 5 closest training rows.
# (The unused `knn_scores` list that used to be created here was dead code.)
knn_model = KNeighborsClassifier(n_neighbors=5)
knn_model.fit(X_train, Y_train)

In [None]:
# KNN accuracy on both splits. Arguments follow scikit-learn's documented
# (y_true, y_pred) order; the value is the same because accuracy is symmetric,
# but other metrics are not.
X_train_prediction = knn_model.predict(X_train)
training_data_accuracy = accuracy_score(Y_train, X_train_prediction)
print('Accuracy on Training data : ', training_data_accuracy)

X_test_prediction = knn_model.predict(X_test)
test_data_accuracy = accuracy_score(Y_test, X_test_prediction)
print('Accuracy on Test data : ', test_data_accuracy)

In [None]:
# Per-class precision / recall / F1 for the KNN test predictions made in the
# previous cell.
from sklearn.metrics import classification_report
print(classification_report(y_true=Y_test, y_pred=X_test_prediction))


In [None]:
# One hand-written sample, given in the same column order as X.
# NOTE(review): the values look like placeholders rather than a real patient
# record -- confirm them before interpreting the prediction.
input_data = (62,1,0,140,268,3.6,0,160,0,3.6,0,1,0)
input_data_as_numpy_array = np.asarray(input_data)
input_data_reshaped = input_data_as_numpy_array.reshape(1, -1)

# Predict on a DataFrame carrying the training column names; a bare ndarray
# makes scikit-learn warn about missing feature names.
input_frame = pd.DataFrame(input_data_reshaped, columns=X.columns)

prediction = knn_model.predict(input_frame)
print(prediction)

if prediction[0] == 0:
  print('The Person does not have a Heart Disease')
else:
  print('The Person has Heart Disease')

In [None]:
# Visualize the confusion matrix
y_pred = knn_model.predict(X_test)
conf_matrix = confusion_matrix(Y_test, y_pred)

# Draw on an explicit Axes rather than the implicit pyplot state.
fig, ax = plt.subplots()
sns.heatmap(conf_matrix, annot=True, fmt='d', cmap='Blues', ax=ax)
ax.set_xlabel('Predicted Label')
ax.set_ylabel('True Label')
ax.set_title('Confusion Matrix for knn')
plt.show()

# **DecisionTree Classifier**

In [None]:
import numpy as np
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score
from sklearn.tree import DecisionTreeClassifier

# Rebuild the label vector and feature matrix from the current frame.
Y = heart_data['target']
X = heart_data.drop(columns='target')

In [None]:
# Same 80/20 split and seed as the other sections, so models are comparable.
X_train, X_test, Y_train, Y_test = train_test_split(
    X,
    Y,
    random_state=2,
    test_size=0.2,
)
# Single decision tree; the seed fixes tie-breaking between equally good splits.
dt_model = DecisionTreeClassifier(random_state=42)
dt_model.fit(X=X_train, y=Y_train)

In [None]:
# Decision-tree accuracy on both splits. Arguments follow scikit-learn's
# documented (y_true, y_pred) order; the value is the same because accuracy
# is symmetric, but other metrics are not.
X_train_prediction = dt_model.predict(X_train)
training_data_accuracy = accuracy_score(Y_train, X_train_prediction)
print('Accuracy on Training data : ', training_data_accuracy)

X_test_prediction = dt_model.predict(X_test)
test_data_accuracy = accuracy_score(Y_test, X_test_prediction)
print('Accuracy on Test data : ', test_data_accuracy)

In [None]:
# One hand-written sample, given in the same column order as X.
# NOTE(review): the values look like placeholders rather than a real patient
# record -- confirm them before interpreting the prediction.
input_data = (62,1,0,140,268,3.6,0,160,0,3.6,0,1,1)
input_data_as_numpy_array = np.asarray(input_data)
input_data_reshaped = input_data_as_numpy_array.reshape(1, -1)

# Predict on a DataFrame carrying the training column names; a bare ndarray
# makes scikit-learn warn about missing feature names.
input_frame = pd.DataFrame(input_data_reshaped, columns=X.columns)

prediction = dt_model.predict(input_frame)
print(prediction)

if prediction[0] == 0:
  print('The Person does not have a Heart Disease')
else:
  print('The Person has Heart Disease')

In [None]:
# Visualize the confusion matrix
y_pred = dt_model.predict(X_test)
conf_matrix = confusion_matrix(Y_test, y_pred)

# Draw on an explicit Axes rather than the implicit pyplot state.
fig, ax = plt.subplots()
sns.heatmap(conf_matrix, annot=True, fmt='d', cmap='Blues', ax=ax)
ax.set_xlabel('Predicted Label')
ax.set_ylabel('True Label')
ax.set_title('Confusion Matrix for DecisionTreeClassifier')
plt.show()

# Heat map and other visualizations

In [None]:

import seaborn as sns
import matplotlib.pyplot as plt



In [None]:
# Pairwise correlation between all columns, annotated with the coefficients.
sns.heatmap(data=heart_data.corr(), annot=True)



In [None]:
# Pie chart: sex breakdown among rows with target == 1.
#
# value_counts() sorts by frequency, so pairing its output positionally with a
# fixed ['Female', 'Male'] list mislabels the wedges whenever code 1 is the
# more frequent one. Reindex by the sex code so counts and labels line up.
# NOTE(review): assumes the usual encoding for this dataset (0 = female,
# 1 = male) -- confirm against the data dictionary.
sex_code_to_label = {0: 'Female', 1: 'Male'}

heart_disease_by_gender = (
    heart_data.loc[heart_data['target'] == 1, 'sex']
    .value_counts()
    .reindex(list(sex_code_to_label), fill_value=0)  # fixed order: code 0, then 1
)
labels = [sex_code_to_label[code] for code in heart_disease_by_gender.index]
colors = ['#ff9999', '#66b3ff']
explode = (0.1, 0)  # pull the first wedge out slightly

plt.pie(heart_disease_by_gender, labels=labels, colors=colors, explode=explode, autopct='%1.1f%%', shadow=True, startangle=140)
plt.title('Heart Disease Cases by Sex')
plt.show()

In [None]:
# Histogram of patient ages (20 bins).
fig, ax = plt.subplots(figsize=(10, 6))
ax.hist(heart_data['age'], bins=20, color='skyblue', edgecolor='black')
ax.set_xlabel('Age')
ax.set_ylabel('Count')
ax.set_title('Age Distribution of Patients')

In [None]:

# Bar chart comparing the held-out accuracy of every model trained above.
#
# The per-model names this cell used to read (test_data_accuracy_lr, ..._dt)
# were never assigned: each section overwrote the single `test_data_accuracy`
# variable, so the cell raised NameError. The scores are now computed here
# from the fitted estimators against the current test split.

# `model` was rebound from logistic regression to the linear SVC in the SVM
# section, so by this point it must be the SVC...
assert isinstance(model, SVC), "run the SVM section before this cell: `model` should be the fitted SVC"
# ...and the logistic-regression estimator no longer exists; refit it with the
# same default settings on the same training split.
lr_model = LogisticRegression().fit(X_train, Y_train)

fitted_models = {
    'Logistic Regression': lr_model,
    'SVM': model,
    'Random Forest': rf_model,
    'Gradient Boosting': gb_model,
    'Gaussian Naive Bayes': gnb_model,
    'KNeighbors Classifier': knn_model,
    'Decision Tree Classifier': dt_model,
}

algorithms = list(fitted_models)
# Every section splits with test_size=0.2 / random_state=2, so the X_test /
# Y_test currently in scope are the rows each model was evaluated on.
scores = [
    accuracy_score(Y_test, estimator.predict(X_test))
    for estimator in fitted_models.values()
]

sns.set(rc={'figure.figsize':(15,8)})

ax = sns.barplot(x=algorithms, y=scores)
ax.set_xlabel("Algorithms")
ax.set_ylabel("Accuracy score")
plt.xticks(rotation=45, ha='right')
plt.title('Accuracy Comparison of Machine Learning Models')
plt.tight_layout()
plt.show()