In [1]:
from google.colab import files

# This line is used to upload files from your system to the Colab environment
uploaded = files.upload()

# Display the list of uploaded files
for filename in uploaded.keys():
    print(f'Uploaded file: {filename}')


Saving Heart_Disease_Prediction .xlsx to Heart_Disease_Prediction .xlsx
Uploaded file: Heart_Disease_Prediction .xlsx


In [3]:
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler
from sklearn.feature_selection import SelectKBest, f_classif

# Load the data from an Excel file
file_path = 'Heart_Disease_Prediction .xlsx'
data = pd.read_excel(file_path)

# Set the target and features
X = data.drop('Heart Disease', axis=1)  # Features
y = data['Heart Disease']  # Target

# Split the data into training, validation, and test sets
X_train, X_temp, y_train, y_temp = train_test_split(X, y, test_size=0.3, random_state=42)
X_val, X_test, y_val, y_test = train_test_split(X_temp, y_temp, test_size=0.5, random_state=42)

# Normalize the data
scaler = StandardScaler()
X_train_scaled = scaler.fit_transform(X_train)
X_val_scaled = scaler.transform(X_val)
X_test_scaled = scaler.transform(X_test)

# Feature selection using SelectKBest
selector = SelectKBest(f_classif, k=min(8, X_train_scaled.shape[1]))  # Maximum of 8 features
X_train_scaled = selector.fit_transform(X_train_scaled, y_train)
X_val_scaled = selector.transform(X_val_scaled)
X_test_scaled = selector.transform(X_test_scaled)

# Now the data is ready for model training
print("Train data shape:", X_train_scaled.shape)
print("Validation data shape:", X_val_scaled.shape)
print("Test data shape:", X_test_scaled.shape)


Train data shape: (189, 8)
Validation data shape: (40, 8)
Test data shape: (41, 8)


In [6]:
# Import libraries
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split, GridSearchCV
from sklearn.preprocessing import StandardScaler, OneHotEncoder
from sklearn.compose import ColumnTransformer
from sklearn.pipeline import Pipeline
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score, classification_report
from xgboost import XGBClassifier
import warnings
warnings.filterwarnings('ignore')

# Upload the dataset in Colab (uncomment and run in Colab)
# from google.colab import files
# uploaded = files.upload()

# Load the dataset
# Replace 'Heart_Disease_Prediction.xlsx' with the exact file name if different
df = pd.read_excel('Heart_Disease_Prediction .xlsx')

# Step 1: Preprocessing
# Encode target variable
df['Heart Disease'] = df['Heart Disease'].map({'Presence': 1, 'Absence': 0})

# Define features and target
X = df.drop('Heart Disease', axis=1)
y = df['Heart Disease']

# Identify categorical and numerical columns
categorical_cols = ['Sex', 'Chest pain type', 'FBS over 120', 'EKG results',
                    'Exercise angina', 'Slope of ST', 'Number of vessels fluro', 'Thallium']
numerical_cols = ['Age', 'BP', 'Cholesterol', 'Max HR', 'ST depression']

# Create preprocessing pipeline
preprocessor = ColumnTransformer(
    transformers=[
        ('num', StandardScaler(), numerical_cols),
        ('cat', OneHotEncoder(drop='first', sparse_output=False), categorical_cols)
    ])

# Step 2: Split the data
# Train (70%), Validation (15%), Test (15%)
X_temp, X_test, y_temp, y_test = train_test_split(X, y, test_size=0.15, random_state=42, stratify=y)
X_train, X_val, y_train, y_val = train_test_split(X_temp, y_temp, test_size=0.1765, random_state=42, stratify=y_temp)
# Note: 0.1765 = 0.15/(1-0.15) to get 15% of total as validation

# Step 3: Create XGBoost model pipeline
xgb = XGBClassifier(random_state=42, eval_metric='logloss')
pipeline = Pipeline(steps=[
    ('preprocessor', preprocessor),
    ('classifier', xgb)
])

# Step 4: Hyperparameter tuning
param_grid = {
    'classifier__n_estimators': [100, 150, 200],
    'classifier__max_depth': [3, 5, 7],
    'classifier__learning_rate': [0.01, 0.1, 0.3],
    'classifier__subsample': [0.8, 1.0],
    'classifier__colsample_bytree': [0.8, 1.0]
}

grid_search = GridSearchCV(
    pipeline,
    param_grid,
    cv=5,
    scoring='f1',
    n_jobs=-1,
    verbose=1
)

# Fit the model
grid_search.fit(X_train, y_train)

# Best model
best_model = grid_search.best_estimator_
print("Best Parameters:", grid_search.best_params_)

# Step 5: Evaluate the model
def evaluate_model(model, X_set, y_set, set_name):
    y_pred = model.predict(X_set)
    accuracy = accuracy_score(y_set, y_pred)
    precision = precision_score(y_set, y_pred)
    recall = recall_score(y_set, y_pred)
    f1 = f1_score(y_set, y_pred)
    print(f"\n{set_name} Metrics:")
    print(f"Accuracy: {accuracy:.2f}")
    print(f"Precision: {precision:.2f}")
    print(f"Recall: {recall:.2f}")
    print(f"F1-Score: {f1:.2f}")
    print(f"Classification Report:\n{classification_report(y_set, y_pred)}")

# Evaluate on training, validation, and test sets
evaluate_model(best_model, X_train, y_train, "Training")
evaluate_model(best_model, X_val, y_val, "Validation")
evaluate_model(best_model, X_test, y_test, "Test")

# Step 6: Feature importance (optional)
# Extract feature names after one-hot encoding
cat_transformer = best_model.named_steps['preprocessor'].named_transformers_['cat']
cat_feature_names = cat_transformer.get_feature_names_out(categorical_cols)
feature_names = numerical_cols + list(cat_feature_names)

# Get feature importances
importances = best_model.named_steps['classifier'].feature_importances_
feature_importance = pd.DataFrame({'Feature': feature_names, 'Importance': importances})
print("\nFeature Importance:")
print(feature_importance.sort_values(by='Importance', ascending=False))

Fitting 5 folds for each of 108 candidates, totalling 540 fits
Best Parameters: {'classifier__colsample_bytree': 0.8, 'classifier__learning_rate': 0.01, 'classifier__max_depth': 3, 'classifier__n_estimators': 100, 'classifier__subsample': 0.8}

Training Metrics:
Accuracy: 0.87
Precision: 0.94
Recall: 0.76
F1-Score: 0.84
Classification Report:
              precision    recall  f1-score   support

           0       0.83      0.96      0.89       104
           1       0.94      0.76      0.84        84

    accuracy                           0.87       188
   macro avg       0.89      0.86      0.87       188
weighted avg       0.88      0.87      0.87       188


Validation Metrics:
Accuracy: 0.90
Precision: 0.89
Recall: 0.89
F1-Score: 0.89
Classification Report:
              precision    recall  f1-score   support

           0       0.91      0.91      0.91        23
           1       0.89      0.89      0.89        18

    accuracy                           0.90        41
   macr

In [None]:
# Import libraries
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split, GridSearchCV
from sklearn.preprocessing import StandardScaler, OneHotEncoder
from sklearn.compose import ColumnTransformer
from sklearn.pipeline import Pipeline
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score, classification_report
from xgboost import XGBClassifier
import warnings
warnings.filterwarnings('ignore')

# Load the dataset
# Replace 'Heart_Disease_Prediction.xlsx' with the exact file name if different
df = pd.read_excel('Heart_Disease_Prediction .xlsx')

# Step 1: Preprocessing
# Encode target variable
df['Heart Disease'] = df['Heart Disease'].map({'Presence': 1, 'Absence': 0})

# Define features and target
X = df.drop('Heart Disease', axis=1)
y = df['Heart Disease']

# Identify categorical and numerical columns
categorical_cols = ['Sex', 'Chest pain type', 'FBS over 120', 'EKG results',
                    'Exercise angina', 'Slope of ST', 'Number of vessels fluro', 'Thallium']
numerical_cols = ['Age', 'BP', 'Cholesterol', 'Max HR', 'ST depression']

# Create preprocessing pipeline
preprocessor = ColumnTransformer(
    transformers=[
        ('num', StandardScaler(), numerical_cols),
        ('cat', OneHotEncoder(drop='first', sparse_output=False), categorical_cols)
    ])

# Step 2: Split the data
# Train (70%), Validation (15%), Test (15%)
X_temp, X_test, y_temp, y_test = train_test_split(X, y, test_size=0.15, random_state=42, stratify=y)
X_train, X_val, y_train, y_val = train_test_split(X_temp, y_temp, test_size=0.1765, random_state=42, stratify=y_temp)
# Note: 0.1765 = 0.15/(1-0.15) to get 15% of total as validation

# Step 3: Create XGBoost model pipeline
xgb = XGBClassifier(random_state=42, eval_metric='logloss')
pipeline = Pipeline(steps=[
    ('preprocessor', preprocessor),
    ('classifier', xgb)
])

# Step 4: Hyperparameter tuning
param_grid = {
    'classifier__n_estimators': [100, 150, 200],
    'classifier__max_depth': [3, 5, 7],
    'classifier__learning_rate': [0.01, 0.1, 0.3],
    'classifier__subsample': [0.8, 1.0],
    'classifier__colsample_bytree': [0.8, 1.0]
}

grid_search = GridSearchCV(
    pipeline,
    param_grid,
    cv=5,
    scoring='f1',
    n_jobs=-1,
    verbose=1
)

# Fit the model
grid_search.fit(X_train, y_train)

# Best model
best_model = grid_search.best_estimator_
print("Best Parameters:", grid_search.best_params_)

# Step 5: Evaluate the model
def evaluate_model(model, X_set, y_set, set_name):
    y_pred = model.predict(X_set)
    accuracy = accuracy_score(y_set, y_pred)
    precision = precision_score(y_set, y_pred)
    recall = recall_score(y_set, y_pred)
    f1 = f1_score(y_set, y_pred)
    print(f"\n{set_name} Metrics:")
    print(f"Accuracy: {accuracy:.2f}")
    print(f"Precision: {precision:.2f}")
    print(f"Recall: {recall:.2f}")
    print(f"F1-Score: {f1:.2f}")
    print(f"Classification Report:\n{classification_report(y_set, y_pred)}")

# Evaluate on training, validation, and test sets
evaluate_model(best_model, X_train, y_train, "Training")
evaluate_model(best_model, X_val, y_val, "Validation")
evaluate_model(best_model, X_test, y_test, "Test")

# Step 6: Feature importance (optional)
# Extract feature names after one-hot encoding
cat_transformer = best_model.named_steps['preprocessor'].named_transformers_['cat']
cat_feature_names = cat_transformer.get_feature_names_out(categorical_cols)
feature_names = numerical_cols + list(cat_feature_names)

# Get feature importances
importances = best_model.named_steps['classifier'].feature_importances_
feature_importance = pd.DataFrame({'Feature': feature_names, 'Importance': importances})
print("\nFeature Importance:")
print(feature_importance.sort_values(by='Importance', ascending=False))

Fitting 5 folds for each of 108 candidates, totalling 540 fits
Best Parameters: {'classifier__colsample_bytree': 0.8, 'classifier__learning_rate': 0.01, 'classifier__max_depth': 3, 'classifier__n_estimators': 100, 'classifier__subsample': 0.8}

Training Metrics:
Accuracy: 0.87
Precision: 0.94
Recall: 0.76
F1-Score: 0.84
Classification Report:
              precision    recall  f1-score   support

           0       0.83      0.96      0.89       104
           1       0.94      0.76      0.84        84

    accuracy                           0.87       188
   macro avg       0.89      0.86      0.87       188
weighted avg       0.88      0.87      0.87       188


Validation Metrics:
Accuracy: 0.90
Precision: 0.89
Recall: 0.89
F1-Score: 0.89
Classification Report:
              precision    recall  f1-score   support

           0       0.91      0.91      0.91        23
           1       0.89      0.89      0.89        18

    accuracy                           0.90        41
   macr

In [None]:
# Import libraries
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split, GridSearchCV
from sklearn.preprocessing import StandardScaler, OneHotEncoder
from sklearn.compose import ColumnTransformer
from sklearn.pipeline import Pipeline
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score, classification_report
from imblearn.over_sampling import SMOTE
from imblearn.pipeline import Pipeline as ImbPipeline
from xgboost import XGBClassifier
from sklearn.metrics import precision_recall_curve
import warnings
warnings.filterwarnings('ignore')

# Load the dataset (already uploaded in Colab)
df = pd.read_excel('Heart_Disease_Prediction .xlsx')

# Step 1: Preprocessing
# Encode target variable
df['Heart Disease'] = df['Heart Disease'].map({'Presence': 1, 'Absence': 0})

# Define features and target
X = df.drop('Heart Disease', axis=1)
y = df['Heart Disease']

# Identify categorical and numerical columns
categorical_cols = ['Sex', 'Chest pain type', 'FBS over 120', 'EKG results',
                    'Exercise angina', 'Slope of ST', 'Number of vessels fluro', 'Thallium']
numerical_cols = ['Age', 'BP', 'Cholesterol', 'Max HR', 'ST depression']

# Create preprocessing pipeline
preprocessor = ColumnTransformer(
    transformers=[
        ('num', StandardScaler(), numerical_cols),
        ('cat', OneHotEncoder(drop='first', sparse_output=False), categorical_cols)
    ])

# Step 2: Split the data
# Train (70%), Validation (15%), Test (15%)
X_temp, X_test, y_temp, y_test = train_test_split(X, y, test_size=0.15, random_state=42, stratify=y)
X_train, X_val, y_train, y_val = train_test_split(X_temp, y_temp, test_size=0.1765, random_state=42, stratify=y_temp)

# Step 3: Create pipeline with SMOTE and XGBoost
xgb = XGBClassifier(random_state=42, eval_metric='logloss')
pipeline = ImbPipeline(steps=[
    ('preprocessor', preprocessor),
    ('smote', SMOTE(random_state=42)),
    ('classifier', xgb)
])

# Step 4: Expanded hyperparameter tuning
param_grid = {
    'classifier__n_estimators': [100, 150, 200, 300],
    'classifier__max_depth': [3, 4, 5],
    'classifier__learning_rate': [0.01, 0.05, 0.1],
    'classifier__subsample': [0.7, 0.8, 0.9],
    'classifier__colsample_bytree': [0.7, 0.8, 0.9],
    'classifier__scale_pos_weight': [1, 1.2, 1.5]  # Adjust for class imbalance
}

grid_search = GridSearchCV(
    pipeline,
    param_grid,
    cv=5,
    scoring='f1',
    n_jobs=-1,
    verbose=1
)

# Fit the model
grid_search.fit(X_train, y_train)

# Best model
best_model = grid_search.best_estimator_
print("Best Parameters:", grid_search.best_params_)

# Step 5: Threshold tuning on validation set
def evaluate_model_with_threshold(model, X_set, y_set, set_name, threshold=0.5):
    y_scores = model.predict_proba(X_set)[:, 1]
    y_pred = (y_scores >= threshold).astype(int)
    accuracy = accuracy_score(y_set, y_pred)
    precision = precision_score(y_set, y_pred)
    recall = recall_score(y_set, y_pred)
    f1 = f1_score(y_set, y_pred)
    print(f"\n{set_name} Metrics (Threshold={threshold:.2f}):")
    print(f"Accuracy: {accuracy:.2f}")
    print(f"Precision: {precision:.2f}")
    print(f"Recall: {recall:.2f}")
    print(f"F1-Score: {f1:.2f}")
    print(f"Classification Report:\n{classification_report(y_set, y_pred)}")
    return y_scores

# Evaluate on validation set and find optimal threshold
y_val_scores = evaluate_model_with_threshold(best_model, X_val, y_val, "Validation")
precisions, recalls, thresholds = precision_recall_curve(y_val, y_val_scores)
# Find threshold where precision and recall are both >0.90 (if possible)
valid_thresholds = thresholds[(precisions[:-1] > 0.90) & (recalls[:-1] > 0.90)]
optimal_threshold = valid_thresholds[0] if len(valid_thresholds) > 0 else 0.5
print(f"Optimal Threshold from Validation: {optimal_threshold:.2f}")

# Step 6: Evaluate all sets with optimal threshold
evaluate_model_with_threshold(best_model, X_train, y_train, "Training", optimal_threshold)
evaluate_model_with_threshold(best_model, X_val, y_val, "Validation", optimal_threshold)
evaluate_model_with_threshold(best_model, X_test, y_test, "Test", optimal_threshold)

# Step 7: Feature importance
cat_transformer = best_model.named_steps['preprocessor'].named_transformers_['cat']
cat_feature_names = cat_transformer.get_feature_names_out(categorical_cols)
feature_names = numerical_cols + list(cat_feature_names)
importances = best_model.named_steps['classifier'].feature_importances_
feature_importance = pd.DataFrame({'Feature': feature_names, 'Importance': importances})
print("\nFeature Importance:")
print(feature_importance.sort_values(by='Importance', ascending=False))

Fitting 5 folds for each of 972 candidates, totalling 4860 fits
Best Parameters: {'classifier__colsample_bytree': 0.7, 'classifier__learning_rate': 0.01, 'classifier__max_depth': 3, 'classifier__n_estimators': 100, 'classifier__scale_pos_weight': 1, 'classifier__subsample': 0.7}

Validation Metrics (Threshold=0.50):
Accuracy: 0.90
Precision: 0.89
Recall: 0.89
F1-Score: 0.89
Classification Report:
              precision    recall  f1-score   support

           0       0.91      0.91      0.91        23
           1       0.89      0.89      0.89        18

    accuracy                           0.90        41
   macro avg       0.90      0.90      0.90        41
weighted avg       0.90      0.90      0.90        41

Optimal Threshold from Validation: 0.50

Training Metrics (Threshold=0.50):
Accuracy: 0.87
Precision: 0.87
Recall: 0.85
F1-Score: 0.86
Classification Report:
              precision    recall  f1-score   support

           0       0.88      0.89      0.89       104
      

In [None]:
# Import necessary libraries
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split, GridSearchCV
from sklearn.preprocessing import StandardScaler, OneHotEncoder
from sklearn.compose import ColumnTransformer
from sklearn.pipeline import Pipeline
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score, classification_report
from imblearn.over_sampling import SMOTE
from imblearn.pipeline import Pipeline as ImbPipeline
from xgboost import XGBClassifier
from sklearn.metrics import precision_recall_curve
import warnings
warnings.filterwarnings('ignore')

# Load the dataset
df = pd.read_excel('Heart_Disease_Prediction .xlsx')

# Step 1: Preprocessing
# Encode target variable
df['Heart Disease'] = df['Heart Disease'].map({'Presence': 1, 'Absence': 0})

# Define features and target
X = df.drop('Heart Disease', axis=1)
y = df['Heart Disease']

# Identify categorical and numerical columns
categorical_cols = ['Sex', 'Chest pain type', 'FBS over 120', 'EKG results',
                    'Exercise angina', 'Slope of ST', 'Number of vessels fluro', 'Thallium']
numerical_cols = ['Age', 'BP', 'Cholesterol', 'Max HR', 'ST depression']

# Create preprocessing pipeline
preprocessor = ColumnTransformer(
    transformers=[
        ('num', StandardScaler(), numerical_cols),
        ('cat', OneHotEncoder(drop='first', sparse_output=False), categorical_cols)
    ])

# Step 2: Split the data
# Train (70%), Validation (15%), Test (15%)
X_temp, X_test, y_temp, y_test = train_test_split(X, y, test_size=0.15, random_state=42, stratify=y)
X_train, X_val, y_train, y_val = train_test_split(X_temp, y_temp, test_size=0.1765, random_state=42, stratify=y_temp)

# Step 3: Create pipeline with SMOTE and XGBoost
xgb = XGBClassifier(random_state=42, eval_metric='logloss', scale_pos_weight=1.5)  # Class weight adjusted
pipeline = ImbPipeline(steps=[
    ('preprocessor', preprocessor),
    ('smote', SMOTE(random_state=42)),
    ('classifier', xgb)
])

# Step 4: Expanded hyperparameter tuning with more options
param_grid = {
    'classifier__n_estimators': [100, 150, 200, 300, 500],
    'classifier__max_depth': [3, 4, 5, 6],
    'classifier__learning_rate': [0.01, 0.05, 0.1, 0.2],
    'classifier__subsample': [0.7, 0.8, 0.9, 1.0],
    'classifier__colsample_bytree': [0.7, 0.8, 0.9],
    'classifier__scale_pos_weight': [1.2, 1.5, 2]  # Adjust for class imbalance
}

grid_search = GridSearchCV(
    pipeline,
    param_grid,
    cv=5,
    scoring='f1',
    n_jobs=-1,
    verbose=1
)

# Fit the model
grid_search.fit(X_train, y_train)

# Best model
best_model = grid_search.best_estimator_
print("Best Parameters:", grid_search.best_params_)

# Step 5: Threshold tuning on validation set
def evaluate_model_with_threshold(model, X_set, y_set, set_name, threshold=0.5):
    y_scores = model.predict_proba(X_set)[:, 1]
    y_pred = (y_scores >= threshold).astype(int)
    accuracy = accuracy_score(y_set, y_pred)
    precision = precision_score(y_set, y_pred)
    recall = recall_score(y_set, y_pred)
    f1 = f1_score(y_set, y_pred)
    print(f"\n{set_name} Metrics (Threshold={threshold:.2f}):")
    print(f"Accuracy: {accuracy:.2f}")
    print(f"Precision: {precision:.2f}")
    print(f"Recall: {recall:.2f}")
    print(f"F1-Score: {f1:.2f}")
    print(f"Classification Report:\n{classification_report(y_set, y_pred)}")
    return y_scores

# Evaluate on validation set and find optimal threshold
y_val_scores = evaluate_model_with_threshold(best_model, X_val, y_val, "Validation")
precisions, recalls, thresholds = precision_recall_curve(y_val, y_val_scores)
# Find threshold where precision and recall are both >0.90 (if possible)
valid_thresholds = thresholds[(precisions[:-1] > 0.90) & (recalls[:-1] > 0.90)]
optimal_threshold = valid_thresholds[0] if len(valid_thresholds) > 0 else 0.5
print(f"Optimal Threshold from Validation: {optimal_threshold:.2f}")

# Step 6: Evaluate all sets with optimal threshold
evaluate_model_with_threshold(best_model, X_train, y_train, "Training", optimal_threshold)
evaluate_model_with_threshold(best_model, X_val, y_val, "Validation", optimal_threshold)
evaluate_model_with_threshold(best_model, X_test, y_test, "Test", optimal_threshold)

# Step 7: Feature importance
cat_transformer = best_model.named_steps['preprocessor'].named_transformers_['cat']
cat_feature_names = cat_transformer.get_feature_names_out(categorical_cols)
feature_names = numerical_cols + list(cat_feature_names)
importances = best_model.named_steps['classifier'].feature_importances_
feature_importance = pd.DataFrame({'Feature': feature_names, 'Importance': importances})
print("\nFeature Importance:")
print(feature_importance.sort_values(by='Importance', ascending=False))


Fitting 5 folds for each of 2880 candidates, totalling 14400 fits
Best Parameters: {'classifier__colsample_bytree': 0.7, 'classifier__learning_rate': 0.01, 'classifier__max_depth': 3, 'classifier__n_estimators': 100, 'classifier__scale_pos_weight': 1.2, 'classifier__subsample': 0.7}

Validation Metrics (Threshold=0.50):
Accuracy: 0.90
Precision: 0.89
Recall: 0.89
F1-Score: 0.89
Classification Report:
              precision    recall  f1-score   support

           0       0.91      0.91      0.91        23
           1       0.89      0.89      0.89        18

    accuracy                           0.90        41
   macro avg       0.90      0.90      0.90        41
weighted avg       0.90      0.90      0.90        41

Optimal Threshold from Validation: 0.50

Training Metrics (Threshold=0.50):
Accuracy: 0.87
Precision: 0.83
Recall: 0.90
F1-Score: 0.86
Classification Report:
              precision    recall  f1-score   support

           0       0.92      0.85      0.88       104
  

In [None]:
# Import libraries
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler, OneHotEncoder
from sklearn.compose import ColumnTransformer
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score, classification_report
import tensorflow as tf
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import Dense, Dropout
from tensorflow.keras.optimizers import Adam
from tensorflow.keras.callbacks import EarlyStopping
import warnings
warnings.filterwarnings('ignore')

# Print TensorFlow version and GPU availability
print(f"TensorFlow Version: {tf.__version__}")
print(f"GPU Available: {tf.test.is_gpu_available()}")

# Load the dataset (already uploaded in Colab)
df = pd.read_excel('Heart_Disease_Prediction .xlsx')

# Step 1: Preprocessing
# Encode target variable
df['Heart Disease'] = df['Heart Disease'].map({'Presence': 1, 'Absence': 0})

# Define features and target
X = df.drop('Heart Disease', axis=1)
y = df['Heart Disease']

# Identify categorical and numerical columns
categorical_cols = ['Sex', 'Chest pain type', 'FBS over 120', 'EKG results',
                    'Exercise angina', 'Slope of ST', 'Number of vessels fluro', 'Thallium']
numerical_cols = ['Age', 'BP', 'Cholesterol', 'Max HR', 'ST depression']

# Create preprocessing pipeline
preprocessor = ColumnTransformer(
    transformers=[
        ('num', StandardScaler(), numerical_cols),
        ('cat', OneHotEncoder(drop='first', sparse_output=False), categorical_cols)
    ])

# Step 2: Split the data
X_temp, X_test, y_temp, y_test = train_test_split(X, y, test_size=0.15, random_state=42, stratify=y)
X_train, X_val, y_train, y_val = train_test_split(X_temp, y_temp, test_size=0.1765, random_state=42, stratify=y_temp)

# Apply preprocessing
X_train = preprocessor.fit_transform(X_train)
X_val = preprocessor.transform(X_val)
X_test = preprocessor.transform(X_test)

# Step 3: Compute class weights for imbalance
class_counts = np.bincount(y_train)
class_weight = {0: 1.0, 1: class_counts[0] / class_counts[1]}  # ~1.24 for 104:84 split

# Step 4: Build deep learning model
model = Sequential([
    Dense(64, activation='relu', input_shape=(X_train.shape[1],), kernel_regularizer=tf.keras.regularizers.l2(0.01)),
    Dropout(0.3),
    Dense(32, activation='relu', kernel_regularizer=tf.keras.regularizers.l2(0.01)),
    Dropout(0.3),
    Dense(16, activation='relu', kernel_regularizer=tf.keras.regularizers.l2(0.01)),
    Dense(1, activation='sigmoid')
])

# Compile model
model.compile(optimizer=Adam(learning_rate=0.001), loss='binary_crossentropy', metrics=['accuracy'])

# Early stopping to prevent overfitting
early_stopping = EarlyStopping(monitor='val_loss', patience=10, restore_best_weights=True)

# Step 5: Train model with error handling
try:
    history = model.fit(
        X_train, y_train,
        validation_data=(X_val, y_val),
        epochs=100,
        batch_size=16,
        class_weight=class_weight,
        callbacks=[early_stopping],
        verbose=1
    )
except Exception as e:
    print(f"Training failed: {e}")
    print("Retrying with minimal configuration...")
    # Rebuild and compile model to reset state
    model = Sequential([
        Dense(64, activation='relu', input_shape=(X_train.shape[1],), kernel_regularizer=tf.keras.regularizers.l2(0.01)),
        Dropout(0.3),
        Dense(32, activation='relu', kernel_regularizer=tf.keras.regularizers.l2(0.01)),
        Dropout(0.3),
        Dense(16, activation='relu', kernel_regularizer=tf.keras.regularizers.l2(0.01)),
        Dense(1, activation='sigmoid')
    ])
    model.compile(optimizer=Adam(learning_rate=0.001), loss='binary_crossentropy', metrics=['accuracy'])
    history = model.fit(
        X_train, y_train,
        validation_data=(X_val, y_val),
        epochs=1000,
        batch_size=16,
        class_weight=class_weight,
        callbacks=[early_stopping],
        verbose=1
    )

# Step 6: Threshold tuning on validation set
def evaluate_model_with_threshold(model, X_set, y_set, set_name, threshold=0.5):
    y_scores = model.predict(X_set, verbose=0).flatten()
    y_pred = (y_scores >= threshold).astype(int)
    accuracy = accuracy_score(y_set, y_pred)
    precision = precision_score(y_set, y_pred)
    recall = recall_score(y_set, y_pred)
    f1 = f1_score(y_set, y_pred)
    print(f"\n{set_name} Metrics (Threshold={threshold:.2f}):")
    print(f"Accuracy: {accuracy:.2f}")
    print(f"Precision: {precision:.2f}")
    print(f"Recall: {recall:.2f}")
    print(f"F1-Score: {f1:.2f}")
    print(f"Classification Report:\n{classification_report(y_set, y_pred)}")
    return y_scores

# Evaluate on validation set and find optimal threshold
y_val_scores = evaluate_model_with_threshold(model, X_val, y_val, "Validation")
thresholds = np.arange(0.3, 0.71, 0.01)
best_threshold = 0.5
best_f1 = 0
for t in thresholds:
    y_pred = (y_val_scores >= t).astype(int)
    f1 = f1_score(y_val, y_pred)
    precision = precision_score(y_val, y_pred)
    recall = recall_score(y_val, y_pred)
    if precision >= 0.90 and recall >= 0.90 and f1 > best_f1:
        best_f1 = f1
        best_threshold = t

print(f"Optimal Threshold from Validation: {best_threshold:.2f}")

# Step 7: Evaluate all sets with optimal threshold
evaluate_model_with_threshold(model, X_train, y_train, "Training", best_threshold)
evaluate_model_with_threshold(model, X_val, y_val, "Validation", best_threshold)
evaluate_model_with_threshold(model, X_test, y_test, "Test", best_threshold)

TensorFlow Version: 2.18.0
GPU Available: True
Epoch 1/100
[1m12/12[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m5s[0m 193ms/step - accuracy: 0.4046 - loss: 1.7489 - val_accuracy: 0.5854 - val_loss: 1.5738
Epoch 2/100
[1m12/12[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 8ms/step - accuracy: 0.4306 - loss: 1.6402 - val_accuracy: 0.6829 - val_loss: 1.4917
Epoch 3/100
[1m12/12[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 8ms/step - accuracy: 0.4777 - loss: 1.5737 - val_accuracy: 0.7561 - val_loss: 1.4152
Epoch 4/100
[1m12/12[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 8ms/step - accuracy: 0.6371 - loss: 1.4847 - val_accuracy: 0.7805 - val_loss: 1.3401
Epoch 5/100
[1m12/12[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 8ms/step - accuracy: 0.6391 - loss: 1.4035 - val_accuracy: 0.8049 - val_loss: 1.2665
Epoch 6/100
[1m12/12[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 8ms/step - accuracy: 0.6188 - loss: 1.3696 - val_accuracy: 0.8049 - va

array([0.1446545 , 0.7849375 , 0.13904195, 0.67620116, 0.07238589,
       0.8652813 , 0.15106726, 0.69651127, 0.22773719, 0.9804182 ,
       0.99428   , 0.1473177 , 0.3044472 , 0.86931247, 0.9683708 ,
       0.8390678 , 0.88249767, 0.95584846, 0.955394  , 0.95700693,
       0.27728915, 0.94511974, 0.948141  , 0.9438954 , 0.01757617,
       0.02902043, 0.16862495, 0.92657614, 0.10458346, 0.975963  ,
       0.131153  , 0.16245401, 0.07308483, 0.9494881 , 0.99004   ,
       0.21059312, 0.06299305, 0.09785069, 0.09997994, 0.9874676 ,
       0.10108708], dtype=float32)

In [None]:
import pandas as pd
from sklearn.model_selection import train_test_split, GridSearchCV
from sklearn.preprocessing import StandardScaler
from sklearn.feature_selection import SelectKBest, f_classif
from sklearn.linear_model import LogisticRegression
from sklearn.ensemble import RandomForestClassifier, GradientBoostingClassifier
from sklearn.metrics import classification_report, accuracy_score, precision_score, recall_score, f1_score
from sklearn.pipeline import make_pipeline

# Load data from Excel file
file_path = 'Heart_Disease_Prediction .xlsx'
data = pd.read_excel(file_path)

# Map the target labels if needed
data['Heart Disease'] = data['Heart Disease'].replace({0: 'Absence', 1: 'Presence'})

# Set features and target variable
X = data.drop('Heart Disease', axis=1)  # Features
y = data['Heart Disease']  # Target variable

# Split the data into training, validation, and testing sets
X_train, X_temp, y_train, y_temp = train_test_split(X, y, test_size=0.3, random_state=42)
X_val, X_test, y_val, y_test = train_test_split(X_temp, y_temp, test_size=0.5, random_state=42)

# Normalize the data
scaler = StandardScaler()
X_train_scaled = scaler.fit_transform(X_train)
X_val_scaled = scaler.transform(X_val)
X_test_scaled = scaler.transform(X_test)

# Select top K best features using SelectKBest
selector = SelectKBest(f_classif, k=min(8, X_train_scaled.shape[1]))  # Max 8 features
X_train_scaled = selector.fit_transform(X_train_scaled, y_train)
X_val_scaled = selector.transform(X_val_scaled)
X_test_scaled = selector.transform(X_test_scaled)

# Now the data is ready for model training

# Define different models
models = {
    'Logistic Regression': LogisticRegression(),
    'Random Forest': RandomForestClassifier(random_state=42),
    'Gradient Boosting': GradientBoostingClassifier(random_state=42)
}

# Hyperparameter tuning for Random Forest and Gradient Boosting using GridSearchCV
param_grid_rf = {
    'n_estimators': [50, 100, 200],
    'max_depth': [10, 20, None],
    'min_samples_split': [2, 5, 10]
}

param_grid_gb = {
    'n_estimators': [50, 100, 200],
    'learning_rate': [0.01, 0.1, 0.2],
    'max_depth': [3, 5, 7]
}

# Train models and evaluate them
best_model = None
best_score = 0
best_model_name = ""

for model_name, model in models.items():
    print(f"\nTraining {model_name}...")

    if model_name == 'Random Forest':
        grid_search = GridSearchCV(model, param_grid_rf, cv=5, scoring='accuracy')
        grid_search.fit(X_train_scaled, y_train)
        model = grid_search.best_estimator_

    elif model_name == 'Gradient Boosting':
        grid_search = GridSearchCV(model, param_grid_gb, cv=5, scoring='accuracy')
        grid_search.fit(X_train_scaled, y_train)
        model = grid_search.best_estimator_

    # Train the final model
    model.fit(X_train_scaled, y_train)

    # Predict on validation data
    y_pred_val = model.predict(X_val_scaled)

    # Evaluate the model
    accuracy = accuracy_score(y_val, y_pred_val)
    precision = precision_score(y_val, y_pred_val, pos_label='Presence')  # Use 'Presence' as pos_label
    recall = recall_score(y_val, y_pred_val, pos_label='Presence')        # Use 'Presence' as pos_label
    f1 = f1_score(y_val, y_pred_val, pos_label='Presence')                # Use 'Presence' as pos_label

    print(f"\n{model_name} Metrics:")
    print(f"Accuracy: {accuracy:.4f}")
    print(f"Precision: {precision:.4f}")
    print(f"Recall: {recall:.4f}")
    print(f"F1 Score: {f1:.4f}")

    if accuracy > best_score:
        best_score = accuracy
        best_model = model
        best_model_name = model_name

# Predict on final test data using the best model
y_pred_test = best_model.predict(X_test_scaled)

# Evaluate the best model on test data
test_accuracy = accuracy_score(y_test, y_pred_test)
test_precision = precision_score(y_test, y_pred_test, pos_label='Presence')
test_recall = recall_score(y_test, y_pred_test, pos_label='Presence')
test_f1 = f1_score(y_test, y_pred_test, pos_label='Presence')

print(f"\nBest Model: {best_model_name} with Accuracy: {best_score:.4f}")
print("\nTest Metrics:")
print(f"Accuracy: {test_accuracy:.4f}")
print(f"Precision: {test_precision:.4f}")
print(f"Recall: {test_recall:.4f}")
print(f"F1 Score: {test_f1:.4f}")

# Classification report for detailed evaluation
print("\nClassification Report for Test Data:")
print(classification_report(y_test, y_pred_test))



Training Logistic Regression...

Logistic Regression Metrics:
Accuracy: 0.7750
Precision: 0.7000
Recall: 0.5385
F1 Score: 0.6087

Training Random Forest...

Random Forest Metrics:
Accuracy: 0.8000
Precision: 0.8571
Recall: 0.4615
F1 Score: 0.6000

Training Gradient Boosting...

Gradient Boosting Metrics:
Accuracy: 0.8250
Precision: 0.8000
Recall: 0.6154
F1 Score: 0.6957

Best Model: Gradient Boosting with Accuracy: 0.8250

Test Metrics:
Accuracy: 0.9024
Precision: 1.0000
Recall: 0.7895
F1 Score: 0.8824

Classification Report for Test Data:
              precision    recall  f1-score   support

     Absence       0.85      1.00      0.92        22
    Presence       1.00      0.79      0.88        19

    accuracy                           0.90        41
   macro avg       0.92      0.89      0.90        41
weighted avg       0.92      0.90      0.90        41



In [12]:
# Perform predictions after pipeline preprocessing (including scaling and one-hot encoding)
rf_pred_test = pipeline.set_params(classifier=rf_model).predict(X_test)
xgb_pred_test = pipeline.set_params(classifier=xgb_model).predict(X_test)

# Create confusion matrix for McNemar's test
cm = confusion_matrix(rf_pred_test, xgb_pred_test)

# Perform McNemar's test
from statsmodels.stats.contingency_tables import mcnemar
result = mcnemar(cm, exact=True)

# Print the results
print(f"\nMcNemar Test Statistic: {result.statistic}")
print(f"P-value: {result.pvalue}")

# Interpretation of McNemar's Test
if result.pvalue < 0.05:
    print("The difference between the two models is statistically significant.")
else:
    print("The difference between the two models is not statistically significant.")



McNemar Test Statistic: 0.0
P-value: 1.0
The difference between the two models is not statistically significant.
