<a href="https://colab.research.google.com/github/Madhu2s6361/Cardiovascular-Disease-Prediction/blob/main/Project_1_Cardiovascular_Disease_Prediction.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

# Project 1: Cardiovascular Disease Prediction


**Cardiovascular Disease Prediction**

Definition: Cardiovascular disease prediction is the application of machine learning (ML) algorithms to analyze patient health data (such as age, cholesterol, blood pressure, glucose levels, lifestyle habits, etc.) to predict the risk or presence of cardiovascular diseases (CVDs) such as heart attack, stroke, or heart failure.

**Why it's important**

Cardiovascular diseases are the leading cause of death worldwide.

Early detection can save lives by enabling timely intervention.

Traditional methods rely on manual interpretation (ECG, blood tests, clinical judgment), while ML can process large, complex datasets and find hidden patterns that humans may miss.

Cardiovascular Diseases: https://bigapollospectra.com/news/cardiovascular-disease-types-causes-and-symptoms

**Concept**

Input (Features): age, gender, cholesterol, blood pressure, smoking, BMI, glucose, etc.

Target (Label): cardio (1 = has cardiovascular disease; 0 = no disease).

**Approach:**

Perform EDA → clean + explore dataset.

Train/Test split.

Scale features.

Apply ML model (Logistic Regression, RandomForest, etc.).

Evaluate (accuracy, confusion matrix, ROC, precision, F1 score, classification report, etc.).


• Perform data pre-processing operations.
• As part of data analysis and visualization, draw all relevant plots to provide essential information and to derive meaningful insights.

• Show your correlation matrix of features according to the datasets.

• Find out the accuracy levels of various machine learning techniques such as
Support Vector Machines (SVM), K-Nearest Neighbors (KNN),
Decision Trees (DT),
Logistic Regression (LR), and
Random Forest (RF).

• Build your machine learning model for heart disease detection according to the results.

In [None]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
import warnings
warnings.filterwarnings('ignore')

In [None]:
# First load attempt with pandas' default separator. The file is re-read with
# sep=";" in a later cell because its columns are semicolon-delimited.
df= pd.read_csv("cardio.csv")

In [None]:
# Display the DataFrame as loaded with the default separator.
df

In [None]:
df= pd.read_csv("cardio.csv",sep=";") # Columns in this CSV file are separated by semicolons (;), not commas (,); this replaces the earlier `df`.

In [None]:
df.head() # View the first 5 rows of the DataFrame.

In [None]:
df.tail() # View the last 5 rows of the DataFrame.

In [None]:
df.sample(5) # View 5 random rows of the DataFrame (sample() alone returns a single row).

In [None]:
df.shape # Get the dimensions of the DataFrame as (rows, columns).

In [None]:
df.dtypes # Check the data types of the columns.

In [None]:
df.index # Display the index range.

In [None]:
df.info() # Get a concise summary of the DataFrame.

In [None]:
df.describe() # Summary statistics for numerical columns.

In [None]:
df.columns # List the column names.

In [None]:
# Cleaning Data

# Count the null values in each column. A raw boolean frame is truncated on
# display and cannot show whether any nulls exist; the per-column sum can.
df.isnull().sum()

In [None]:
df.notnull() # Check for non-null values (element-wise boolean mask).

In [None]:
# Replace `age` by its value divided by 365, truncated to an integer
# (presumably days -> whole years; confirm against the dataset description).
age_scaled = df["age"] / 365
df["age"] = age_scaled.astype(int)
df["age"]

In [None]:
# Body-mass index: weight divided by the square of height in metres.
# Assumes `height` is in centimetres and `weight` in kilograms — TODO confirm.
# The height is converted to metres first and only then squared: `**` binds
# tighter than `/`, so `df["height"]/100 ** 2` would divide by 10000 instead.
height_m = df["height"] / 100
df["bmi"] = df["weight"] / height_m ** 2
df["bmi"]

In [None]:
# Remove abnormal blood-pressure readings (data cleaning)
# ap_hi: systolic_bp
# ap_lo: diastolic_bp
systolic = df["ap_hi"]
diastolic = df["ap_lo"]
# Keep rows where systolic is not below diastolic, systolic is at most 250
# and diastolic is at least 40.
plausible_bp = (systolic >= diastolic) & (systolic <= 250) & (diastolic >= 40)
df = df[plausible_bp]

In [None]:
# Display the DataFrame after the blood-pressure filter.
df

In [None]:
# Step 1: Drop the rows whose target label is missing
has_label = df['cardio'].notna()
df = df[has_label]

In [None]:
# Step 2: Keep only valid entries (0 or 1)
# .copy() makes `df` an independent frame, so the column assignments made in
# later cells write to it directly instead of to a slice of the previous frame.
df = df[df['cardio'].isin([0, 1, 0.0, 1.0])].copy()

In [None]:
# Step 3: Convert the target column to integer type
# (rows with missing or invalid labels were removed in the two steps above).
df['cardio'] = df['cardio'].astype(int)

In [None]:
# Inspect the cleaned target column.
df['cardio']

In [None]:
# Class balance of the target variable
target_ax = sns.countplot(data=df, x="cardio", palette="coolwarm")
target_ax.set_title("heart disease distribution (0=NO, 1=YES)")
plt.show()

In [None]:
# Mean of `cardio` per gender value, returned as a two-column frame.
cardio_by_gender = df.groupby("gender")["cardio"]
gender_cardio = cardio_by_gender.mean().reset_index()
gender_cardio

In [None]:
# Show only the per-gender mean of `cardio`.
display(gender_cardio['cardio'])

In [None]:
# Bar chart of the per-gender mean of `cardio`
gender_ax = sns.barplot(data=gender_cardio, x="gender", y="cardio", palette="viridis")
gender_ax.set_title("cardiovascular disease % by gender")
plt.show()

In [None]:
# Mean cholesterol value per age (recomputed in a later cell right before it is plotted).
age_chol= df.groupby("age")["cholesterol"].mean().reset_index()

In [None]:
# Histogram (40 bins, with KDE curve) of systolic blood pressure [ap_hi: systolic_bp]
systolic_ax = sns.histplot(df['ap_hi'], bins=40, kde=True)
systolic_ax.set_title('Systolic Blood Pressure Distribution')
plt.show()

In [None]:
# Histogram (40 bins, with KDE curve) of diastolic blood pressure [ap_lo: diastolic_bp]
diastolic_ax = sns.histplot(df['ap_lo'], bins=40, kde=True)
diastolic_ax.set_title('Diastolic Blood Pressure Distribution')
plt.show()

In [None]:
# Demonstrates the 0.5 decision threshold on a hard-coded probability.
prob = 0.496727  # example probability
below_threshold = prob < 0.5
if below_threshold:
    print(f"No significant risk. ({prob*100:.2f}% chance of heart disease)")
else:
    print(f"Risk Detected! ({prob*100:.2f}% chance of heart disease)")

In [None]:
# Average cholesterol by gender
# (the average by age is computed in the next cell)
df.groupby('gender')['cholesterol'].mean()

In [None]:
# Average cholesterol by age
df.groupby('age')['cholesterol'].mean()

In [None]:
# Number of rows per cardio status (0 = No, 1 = Yes)
df.groupby('cardio').size()

In [None]:
# Mean systolic (ap_hi) and diastolic (ap_lo) pressure per cardio status
df.groupby('cardio')[['ap_hi', 'ap_lo']].mean()


In [None]:
# Mean cholesterol for each (gender, cardio) combination
df.groupby(['gender', 'cardio'])['cholesterol'].mean()


In [None]:
# Mean, minimum and maximum age per cardio status
df.groupby('cardio')['age'].agg(['mean', 'min', 'max'])


In [None]:
# Mean cholesterol per age, drawn as a line chart
chol_by_age = df.groupby('age')['cholesterol']
age_chol = chol_by_age.mean().reset_index()

plt.figure(figsize=(10,6))
chol_ax = sns.lineplot(x='age', y='cholesterol', data=age_chol, color='red', marker='o')
chol_ax.set_title('Average Cholesterol Level by Age')
plt.show()


In [None]:
# Pairwise correlation of all columns, shown as an annotated heatmap
corr_matrix = df.corr()
plt.figure(figsize=(10,5))
corr_ax = sns.heatmap(corr_matrix, cmap="Blues", annot=True, annot_kws={"size": 8})
corr_ax.set_title("correlation heatmap")
plt.show()

# Train Models and Compare Accuracy

Train and evaluate the following five ML algorithms:


1. SVM
2. KNN
3. Decision Tree
4. Logistic Regression
5. Random Forest



# Train-Test Split and StandardScaler

In [None]:
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler

In [None]:
# Feature selection: every column except the target and the row identifier.
# errors='ignore' lets the drop succeed when an identifier column is absent.
# NOTE(review): "id" is assumed to be the identifier column name in this
# dataset — confirm against df.columns. An identifier carries no clinical
# signal and must not be fed to the models as a feature.
x = df.drop(columns=["cardio", "patientid", "id"], errors='ignore')
y = df['cardio']

In [None]:
# Hold out 20% of the rows for testing; random_state=42 fixes the shuffle so the split is reproducible.
x_train, x_test, y_train, y_test = train_test_split(x, y, test_size=0.2, random_state=42)

In [None]:
# Feature scaling: the scaler's statistics are learned from the training split
# only and then applied unchanged to both splits.
scaler = StandardScaler().fit(x_train)
x_train = scaler.transform(x_train)
x_test = scaler.transform(x_test)

The dataset is divided into input features (X) and output labels (y).

Split data → 80% training, 20% testing.

StandardScaler is used to standardize the feature values to zero mean and unit variance (important for distance- and margin-based models such as SVM and KNN).

In [None]:
# Correlation matrix of all columns, annotated with two decimals
feature_corr = df.corr()
plt.figure(figsize=(12,8))
feature_corr_ax = sns.heatmap(feature_corr, cmap='coolwarm', annot=True, fmt=".2f")
feature_corr_ax.set_title('Feature Correlation Matrix')
plt.show()

In [None]:
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler
from sklearn.metrics import accuracy_score, confusion_matrix, classification_report
from sklearn.linear_model import LogisticRegression
from sklearn.neighbors import KNeighborsClassifier
from sklearn.svm import SVC
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import RandomForestClassifier

# Logistic Regression (LR)

Concept:
A statistical model that predicts binary outcomes (like Disease / No Disease) by finding a linear relationship between features and the log-odds of the target.

In [None]:
# Logistic Regression with default hyper-parameters, fitted on the scaled training split
lr = LogisticRegression().fit(x_train, y_train)
y_pred_lr = lr.predict(x_test)

In [None]:
# Test-set accuracy followed by the per-class classification report
acc_lr = accuracy_score(y_test, y_pred_lr)
lr_report = classification_report(y_test, y_pred_lr)
print(f"🔹 Logistic Regression Accuracy: {acc_lr:.4f}")
print(lr_report)

In [None]:
# Confusion matrix of the Logistic Regression test predictions
lr_cm = confusion_matrix(y_test, y_pred_lr)
plt.figure(figsize=(4,3))
lr_cm_ax = sns.heatmap(lr_cm, fmt='d', annot=True, cmap="Blues")
lr_cm_ax.set_title("Logistic Regression Confusion Matrix")
plt.show()

# K-Nearest Neighbors (KNN)

Concept:
KNN predicts a sample’s class by looking at the majority class among its k nearest neighbors in the feature space.

In [None]:
# K-Nearest Neighbors with k = 5, fitted on the scaled training split
knn = KNeighborsClassifier(n_neighbors=5).fit(x_train, y_train)
y_pred_knn = knn.predict(x_test)

In [None]:
# Test-set accuracy followed by the per-class classification report
acc_knn = accuracy_score(y_test, y_pred_knn)
knn_report = classification_report(y_test, y_pred_knn)
print(f"🔹 KNN Accuracy: {acc_knn:.4f}")
print(knn_report)

In [None]:
# Confusion matrix of the KNN test predictions
knn_cm = confusion_matrix(y_test, y_pred_knn)
plt.figure(figsize=(4,3))
knn_cm_ax = sns.heatmap(knn_cm, fmt='d', annot=True, cmap="Greens")
knn_cm_ax.set_title("KNN Confusion Matrix")
plt.show()

# Support Vector Machine (SVM)

Concept:
SVM tries to find the optimal hyperplane that separates the classes (Disease / No Disease) with maximum margin.

In [None]:
# RBF-kernel SVM; probability=True enables predict_proba, which later cells call
svm = SVC(probability=True, kernel="rbf").fit(x_train, y_train)
y_pred_svm = svm.predict(x_test)

In [None]:
# Test-set accuracy followed by the per-class classification report
acc_svm = accuracy_score(y_test, y_pred_svm)
svm_report = classification_report(y_test, y_pred_svm)
print(f"🔹 SVM Accuracy: {acc_svm:.4f}")
print(svm_report)

In [None]:
# Confusion matrix of the SVM test predictions
svm_cm = confusion_matrix(y_test, y_pred_svm)
plt.figure(figsize=(4,3))
svm_cm_ax = sns.heatmap(svm_cm, fmt='d', annot=True, cmap="Oranges")
svm_cm_ax.set_title("SVM Confusion Matrix")
plt.show()

# Decision Tree (DT)

Concept:
Decision Tree splits the data based on feature thresholds to create a tree of decisions. Easy to interpret but may overfit.

In [None]:
# Decision Tree with a fixed random_state so the fit is reproducible
dt = DecisionTreeClassifier(random_state=42).fit(x_train, y_train)
y_pred_dt = dt.predict(x_test)

In [None]:
# Test-set accuracy followed by the per-class classification report
acc_dt = accuracy_score(y_test, y_pred_dt)
dt_report = classification_report(y_test, y_pred_dt)
print(f"🔹 Decision Tree Accuracy: {acc_dt:.4f}")
print(dt_report)

In [None]:
# Confusion matrix of the Decision Tree test predictions
dt_cm = confusion_matrix(y_test, y_pred_dt)
plt.figure(figsize=(4,3))
dt_cm_ax = sns.heatmap(dt_cm, fmt='d', annot=True, cmap="Purples")
dt_cm_ax.set_title("Decision Tree Confusion Matrix")
plt.show()

# Random Forest (RF)

Concept:
Random Forest combines multiple decision trees to improve accuracy and reduce overfitting.

In [None]:
# Random Forest of 100 trees with a fixed random_state for reproducibility
rf = RandomForestClassifier(random_state=42, n_estimators=100).fit(x_train, y_train)
y_pred_rf = rf.predict(x_test)

In [None]:
# Test-set accuracy followed by the per-class classification report
acc_rf = accuracy_score(y_test, y_pred_rf)
rf_report = classification_report(y_test, y_pred_rf)
print(f"🔹 Random Forest Accuracy: {acc_rf:.4f}")
print(rf_report)

In [None]:
# Confusion matrix of the Random Forest test predictions
rf_cm = confusion_matrix(y_test, y_pred_rf)
plt.figure(figsize=(4,3))
rf_cm_ax = sns.heatmap(rf_cm, fmt='d', annot=True, cmap="coolwarm")
rf_cm_ax.set_title("Random Forest Confusion Matrix")
plt.show()

# Compare All Models

In [None]:
# Collect the five test accuracies into one table, best model first
model_names = ["Logistic Regression","KNN","SVM","Decision Tree","Random Forest"]
model_scores = [acc_lr, acc_knn, acc_svm, acc_dt, acc_rf]
unsorted_results = pd.DataFrame({"Model": model_names, "Accuracy": model_scores})
results = unsorted_results.sort_values(by="Accuracy", ascending=False)

print("\n Model Performance Comparison:")
display(results)

plt.figure(figsize=(8,4))
accuracy_ax = sns.barplot(data=results, x="Model", y="Accuracy", palette="mako")
accuracy_ax.set_title("Model Accuracy Comparison")
plt.ylim(0,1)
plt.show()

# The table is sorted in descending order, so its first row names the winner.
best_model_name = results.iloc[0,0]
print(f"Best performing model: {best_model_name}")

In [None]:
# Confusion Matrix for Best Model
# Map every display name used in the comparison table to its fitted estimator.
models = {
    "Logistic Regression": lr,
    "KNN": knn,
    "SVM": svm,
    "Decision Tree": dt,
    "Random Forest": rf,
}
# `results` is sorted by accuracy in descending order, so row 0 is the best model.
best_model_name = results.iloc[0,0]
best_model = models[best_model_name]

y_pred_best = best_model.predict(x_test)
cm = confusion_matrix(y_test, y_pred_best)

plt.figure(figsize=(5,4))
sns.heatmap(cm, annot=True, fmt='d', cmap="Blues")
plt.title(f"Confusion Matrix - {best_model_name}")
plt.xlabel("Predicted")
plt.ylabel("Actual")
plt.show()

print(f"\n Best Performing Model: {best_model_name}")

# Patient-Level Prediction (Interactive)

In [None]:
print("\n Enter patient values manually or modify the example below:")

# Example patient data (modify values).
# Every key must match a training feature column: a feature that is missing
# here is filled with 0 when the record is aligned with the model's columns.
patient_data = {
    "age": 52,            # the training column is `age` (after the /365 conversion)
    "gender": 1,          # encoded as 1 / 2 — NOTE(review): confirm which code is which in this dataset
    "height": 168,
    "weight": 70,
    "ap_hi": 120,         # systolic BP
    "ap_lo": 80,          # diastolic BP
    "cholesterol": 1,     # 1 = normal, 2 = above normal, 3 = well above
    "gluc": 1,            # glucose level
    "smoke": 0,
    "alco": 0,
    "active": 1
}
# Derived feature: weight divided by the square of height in metres.
# NOTE(review): must use the same formula as the `bmi` column of the training frame.
patient_data["bmi"] = patient_data["weight"] / (patient_data["height"] / 100) ** 2

In [None]:
# Convert the single patient record to a one-row DataFrame
patient_df = pd.DataFrame([patient_data])

In [None]:
# Align the patient record with the training feature columns (same names,
# same order). Features absent from the record are filled with 0; they are
# reported because a silently zero-filled feature (for example a misspelled
# key) distorts the prediction.
missing_cols = [col for col in x.columns if col not in patient_df.columns]
if missing_cols:
    print(f"Warning: features missing from patient data, filled with 0: {missing_cols}")
patient_df = patient_df.reindex(columns=x.columns, fill_value=0)

# Scale same as training
patient_scaled = scaler.transform(patient_df)

In [None]:

# Predicted class for the patient, plus the positive-class probability when
# the estimator exposes predict_proba.
disease_pred = best_model.predict(patient_scaled)[0]
if hasattr(best_model, 'predict_proba'):
    prob = best_model.predict_proba(patient_scaled)[0][1]
else:
    prob = None

print("\n Patient Information:")
display(patient_df)

print("\nPrediction Result:")
verdict = ("High Risk of Cardiovascular Disease" if disease_pred == 1
           else "No Significant Risk Detected")
print(verdict)

if prob is not None:
    print(f"Probability of Disease: {prob*100:.2f}%")

In [None]:
# Optional: two-bar chart of the patient's class probabilities
if prob is not None:
    class_labels = ["No Disease", "Disease"]
    class_probs = [1-prob, prob]
    plt.figure(figsize=(5,3))
    plt.bar(class_labels, class_probs, color=["green","red"])
    plt.title("Patient Risk Probability")
    plt.ylabel("Probability")
    plt.show()

In [None]:
# Predict on the test set with the selected best model.
# (No variable named `model` exists in this notebook; `best_model` is the
# estimator chosen from the comparison table.)
y_pred = best_model.predict(x_test)

# Compare actual vs predicted
result = pd.DataFrame({
    "actual": y_test.values,
    "predicted": y_pred
  })
print(result.tail(10))  # tail(10) shows the last 10 rows; head(10) would show the first 10

In [None]:
# Predict probabilities with the selected best model (`model` is not defined
# anywhere in this notebook). Column 1 of predict_proba is taken as the
# probability of the positive class (cardio = 1).
y_prob = best_model.predict_proba(x_test)[:,1]
result["risk_probability"] = y_prob
print(result.head(20))

In [None]:
# Show actual vs predicted for first 20 test patients
first_20 = result.head(20)
print(first_20)

# Bar plot with class labels.
# dodge=False keeps every bar centred on its x position, so the text labels
# placed at x = i below line up with their bars.
plt.figure(figsize=(12,6))
ax = sns.barplot(x=first_20.index,
                 y=first_20["risk_probability"],
                 hue=first_20["predicted"],
                 palette={0:"green", 1:"red"},
                 dodge=False)

# Add labels (CVD/No CVD + probability).
# The loop variable is named `risk` so the notebook-level `prob` of the
# single-patient prediction is not overwritten.
for i, (risk, cls) in enumerate(zip(first_20["risk_probability"],
                                    first_20["predicted"])):
    label = "No CVD" if cls==0 else "CVD"
    plt.text(i, risk+0.02, f"{label}\n{risk:.2f}", ha="center")

plt.xticks(first_20.index, [f"Patient {i+1}" for i in first_20.index], rotation=45)
plt.ylabel("Predicted Probability of CVD")
plt.title("CVD Prediction for First 20 Test Patients (0=No CVD, 1=CVD)")
plt.ylim(0, 1.1)

# Rename the legend entries through their own handles. Passing only `labels`
# to plt.legend() pairs the names with the first artists on the axes, which
# need not be one bar of each class, and fails to match when only one class
# is present among the 20 rows.
class_names = {0: "No CVD", 1: "CVD"}
handles, hue_labels = ax.get_legend_handles_labels()
ax.legend(handles,
          [class_names[int(float(lbl))] for lbl in hue_labels],
          title="Predicted Class")
plt.tight_layout()
plt.show()

In [None]:
# Mean predicted probability for each actual class of the test set
mean_risk_by_actual = result.groupby('actual')['risk_probability'].mean().reset_index()

plt.figure(figsize=(8, 6))
sns.barplot(data=mean_risk_by_actual, x='actual', y='risk_probability', palette='viridis')
plt.title('Average Predicted CVD Probability by Actual Status')
plt.xlabel('Actual CVD Status (0: No CVD, 1: CVD)')
plt.ylabel('Average Predicted Probability of CVD')
plt.xticks([0, 1], ['No CVD', 'CVD'])
plt.ylim(0, 1) # Probabilities are between 0 and 1
plt.grid(axis='y')
plt.show()

# Alternatively: Predict for All Patients in the Dataset

In [None]:
# Score every row of the feature frame with the best model. When the
# estimator has no predict_proba, the predicted class is stored instead.
x_scaled = scaler.transform(x)
can_score = hasattr(best_model, 'predict_proba')
if not can_score:
    df["CVD_Probability"] = best_model.predict(x_scaled)
else:
    df["CVD_Probability"] = best_model.predict_proba(x_scaled)[:,1]

In [None]:
# Predicted class (0 = No Disease, 1 = Disease) for every row, using the same
# scaler that was fitted on the training split.
df["Predicted_Status"] = best_model.predict(scaler.transform(x))

In [None]:
# Display the first 10 rows: a few input columns next to the two prediction columns
print("Patient-level Disease Predictions:")
display(df[["age", "gender", "ap_hi", "ap_lo", "cholesterol",
            "gluc", "CVD_Probability", "Predicted_Status"]].head(10))

In [None]:
# Optionally, save the whole frame (inputs plus prediction columns) to CSV.
# NOTE(review): "/content" is presumably the Colab working directory — the
# path has to be changed when the notebook runs elsewhere.
output_path = "/content/Patient_Predictions.csv"
df.to_csv(output_path, index=False)
print(f"\n✅ All patient predictions saved to: {output_path}")

In [None]:
# Histogram (20 bins, with KDE curve) of the predicted probability over all rows
plt.figure(figsize=(6,4))
risk_ax = sns.histplot(df["CVD_Probability"], kde=True, bins=20, color="tomato")
risk_ax.set_title("Distribution of Predicted Disease Probability (All Patients)")
risk_ax.set_xlabel("Probability of Cardiovascular Disease")
risk_ax.set_ylabel("Number of Patients")
plt.show()

In [None]:
# Visual Analysis: High-Risk vs Low-Risk Patients
# Turn the predicted class into a readable label: 0 -> "Low Risk", 1 -> "High Risk"
df["Risk_Level"] = df["Predicted_Status"].map({0: "Low Risk", 1: "High Risk"})

In [None]:
# Countplot: number of rows in each predicted risk category

plt.figure(figsize=(6,4))
risk_count_ax = sns.countplot(x="Risk_Level", data=df, palette="coolwarm")
risk_count_ax.set_title("Count of High-Risk vs Low-Risk Patients")
risk_count_ax.set_xlabel("Risk Category")
risk_count_ax.set_ylabel("Number of Patients")
plt.show()

# Patient-Level Prediction (SVM)

In [None]:
# Score every row with the SVM: positive-class probability and predicted class.
# The features are scaled once and the scaled matrix is used for both calls.

svm_inputs = scaler.transform(x)
df["CVD_Probability_SVM"] = svm.predict_proba(svm_inputs)[:,1]
df["Predicted_Status_SVM"] = svm.predict(svm_inputs)

In [None]:
# Show a few input columns next to the SVM probability and class for the first 10 rows
print("First 10 SVM Predictions:")
display(df[["age", "cholesterol", "ap_hi", "ap_lo", "CVD_Probability_SVM", "Predicted_Status_SVM"]].head(10))

In [None]:
# Save predictions: the whole frame is written, i.e. every column currently in
# `df`, not only the SVM columns.
# NOTE(review): "/content" is presumably the Colab working directory.
output_svm = "/content/SVM_Patient_Predictions.csv"
df.to_csv(output_svm, index=False)
print(f"\n All SVM predictions saved to: {output_svm}")

# Predicted Disease Risk for First 50 Patients (Using SVM)

In [None]:
# Select the first 50 rows; .copy() keeps later column assignments off `df`
patients_50 = df.head(50).copy()

In [None]:
# Use exactly the feature columns (and order) of the training frame `x`
x_cols = x.columns  # features used in model training

In [None]:
# Scale the 50 rows with the scaler that was fitted on the training split
patients_scaled = scaler.transform(patients_50[x_cols])

In [None]:
# Predict with the fitted SVM (`svm`); substitute `best_model` to use the
# top-ranked model instead.
patients_50["Predicted_Class"] = svm.predict(patients_scaled)
patients_50["CVD_Probability"] = svm.predict_proba(patients_scaled)[:, 1]

In [None]:
# One bar per patient showing the SVM probability
plt.figure(figsize=(15,6))
sns.barplot(y=patients_50['CVD_Probability'], x=patients_50.index, palette="coolwarm")

# Write the predicted class above each bar, red for disease and green otherwise
for i, (prob, predicted_class) in enumerate(zip(patients_50['CVD_Probability'],
                                                patients_50['Predicted_Class'])):
    if predicted_class == 1:
        label = "Disease"
    else:
        label = "No Disease"
    text_color = 'red' if label=="Disease" else 'green'
    plt.text(i, prob + 0.02, label, ha='center', color=text_color, fontsize=9)

In [None]:
# Generate x-axis positions 0 .. len(patients_50) - 1, one per bar
x_positions = range(len(patients_50))

In [None]:
# One colour per patient: red when the predicted class is 1, green otherwise
colors = ['red' if risk == 1 else 'green' for risk in patients_50["Predicted_Class"]]

In [None]:
# Legend patches come from matplotlib.patches, which is not imported anywhere
# else in this notebook, so it is imported in this cell.
import matplotlib.patches as mpatches

# Plot
plt.figure(figsize=(15, 6))
plt.bar(x_positions, patients_50["CVD_Probability"], color=colors)
plt.xticks(x_positions, [f"Patient {i+1}" for i in x_positions], rotation=90)
plt.ylabel("Predicted CVD Probability")
plt.title("Predicted Cardiovascular Disease Risk for First 50 Patients (SVM Model)")
plt.ylim(0, 1.1)

# Add legend: the bars carry no labels of their own, so two proxy patches
# explain the colour coding.
red_patch = mpatches.Patch(color='red', label='High Risk')
green_patch = mpatches.Patch(color='green', label='Low Risk')
plt.legend(handles=[red_patch, green_patch])

plt.tight_layout()
plt.show()

In [None]:
# Save the test-set comparison frame (`result`) to Excel
output_file = "CVD_Predictions.xlsx"
result.to_excel(output_file, index=False)

print(f"Predictions saved to {output_file}")

In [None]:
# Save Predictions
# Save the predictions for the first 50 patients
patients_50.to_excel("CVD_First50_Predictions.xlsx", index=False)

# Save test set predictions (the same `result` frame that the previous cell
# wrote to CVD_Predictions.xlsx)
result.to_excel("CVD_TestSet_Predictions.xlsx", index=False)

In [None]:
# Save all patient predictions to an Excel file
df.to_excel("CVD_All_Patient_Predictions.xlsx", index=False)

# Save model performance comparison to an Excel file.
# The comparison table is named `results`; no `results_df` exists in this notebook.
results.to_excel("CVD_Model_Performance.xlsx", index=False)

# Save feature importance to an Excel file
# Check if 'feat_importance' DataFrame exists before saving
if 'feat_importance' in locals() and isinstance(feat_importance, pd.DataFrame):
    feat_importance.to_excel("CVD_Feature_Importance.xlsx", index=False)
else:
    print("Feature importance DataFrame not found. Skipping save.")

print("All specified DataFrames saved to separate Excel files.")

In [None]:
# Print the current working directory, where files saved with relative names end up
import os
print(os.getcwd())   # shows working folder

In [None]:
# List the entries of the current working directory
os.listdir()

In [None]:
from google.colab import files
# Announce the download before starting it, so the message matches what happens.
print("Attempting to download CVD_Model_Performance.xlsx...")
files.download("CVD_Model_Performance.xlsx")

In [None]:
import os
from google.colab import files

# Workbooks to download. "CVD_Model_Results.xlsx" is not written by any cell
# of this notebook, so files that do not exist are skipped instead of letting
# one missing file abort the remaining downloads.
files_to_download = [
    "CVD_Model_Performance.xlsx",
    "CVD_Predictions.xlsx",
    "CVD_Model_Results.xlsx"
]

for file in files_to_download:
    if not os.path.exists(file):
        print(f"Skipping {file}: file not found.")
        continue
    print(f"Downloading {file}...")
    files.download(file)

In [None]:
import pandas as pd
from google.colab import files
import os

# Combine all Excel files in the folder into one file with separate sheets
combined_name = "Combined_CVD_Data.xlsx"

# List the inputs before the writer is opened and leave out the output
# workbook itself: otherwise the half-written (or previously generated)
# combined file would be read back in as one of its own sources.
source_files = sorted(
    name for name in os.listdir()
    if name.endswith(".xlsx") and name != combined_name
)

if source_files:
    with pd.ExcelWriter(combined_name, engine="openpyxl") as writer:
        for f in source_files:
            try:
                # Excel limits sheet names to 31 characters.
                sheet_name = f.split(".")[0][:31]
                pd.read_excel(f, engine="openpyxl").to_excel(writer, sheet_name=sheet_name, index=False)
            except Exception as e:
                # Best effort: report the problem file and continue with the rest.
                print(f"Could not read or write file {f}: {e}")

    files.download(combined_name)
else:
    # A workbook without any sheet cannot be saved, so nothing is written.
    print("No .xlsx files found to combine.")