In [None]:
import pandas as pd
import numpy as np
import seaborn as sns
import matplotlib.pyplot as plt
from sklearn.model_selection import train_test_split
from sklearn.metrics import classification_report, accuracy_score, confusion_matrix, roc_auc_score
from sklearn.preprocessing import StandardScaler,FunctionTransformer,OneHotEncoder # For encoding categorical variables it doesnt matter if they are ordinal or not
from sklearn.impute import SimpleImputer
from sklearn.pipeline import Pipeline
from sklearn.compose import ColumnTransformer
from sklearn.model_selection import RandomizedSearchCV, StratifiedKFold
from sklearn.feature_selection import mutual_info_classif
from sklearn.ensemble import GradientBoostingClassifier
import warnings
warnings.filterwarnings("ignore", message=".*glibc.*")

import time


In [None]:
# Load Data
data = pd.read_csv('heart.csv')

# info() prints its summary itself; head() only renders when it is the final
# expression of the cell, so it must come last (previously its result was
# computed and silently discarded).
data.info()
data.head()

In [None]:
# Distinguish numerical from categorical features, then make the train/test split.
# NOTE(review): any column of `data` that appears in neither list (other than the
# target) is not routed by the ColumnTransformer below and is therefore dropped
# before modelling -- confirm every intended predictor is listed here.

numerical_features = ['Age', 'RestingBP', 'Cholesterol', 'MaxHR', 'Oldpeak']
categorical_features = ['Sex', 'ChestPainType', 'RestingECG', 'ExerciseAngina', 'ST_Slope']

# Predictors are everything except the target column; the target is kept as a Series.
y = data['HeartDisease']
X = data.drop(columns='HeartDisease')

# 80/20 hold-out split, stratified on the target and seeded for repeatability.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.2,
    stratify=y,
    random_state=42,
)

# Preprocessing pipelines for both numerical and categorical data

# Numerical columns: fill missing values with the column median, then standardise.
numerical_steps = [
    ('imputer', SimpleImputer(strategy='median')),
    ('scaler', StandardScaler()),
]
numerical_pipeline = Pipeline(numerical_steps)

# Categorical columns: fill missing values with the most frequent level, then one-hot encode.
# handle_unknown='ignore' makes the encoder emit an all-zero row for a level it did
# not see during fit instead of raising. Inside cross-validation a rare level can be
# absent from a training fold yet present in the validation fold or the test split;
# with the default ('error') that candidate's fit would fail.
categorical_pipeline = Pipeline(steps=[
    ('imputer', SimpleImputer(strategy='most_frequent')),
    ('encoder', OneHotEncoder(handle_unknown='ignore'))
])

# Route each column group through its own pipeline. The output places the 'num'
# columns first and the one-hot 'cat' columns after them.
column_routes = [
    ('num', numerical_pipeline, numerical_features),
    ('cat', categorical_pipeline, categorical_features),
]
preprocessor = ColumnTransformer(transformers=column_routes)

# Baseline model: preprocessing followed by a seeded gradient-boosting classifier.
baseline_classifier = GradientBoostingClassifier(random_state=42)
model_pipeline = Pipeline([
    ('preprocessor', preprocessor),
    ('classifier', baseline_classifier),
])


In [None]:
# Fit the preprocessor on the training split only, so the encoder's learned
# categories (and hence its output column names) can be read back.
preprocessor.fit(X_train)

# Get feature names after one-hot encoding
fitted_encoder = preprocessor.named_transformers_['cat'].named_steps['encoder']
ohe_feature_names = fitted_encoder.get_feature_names_out(categorical_features)

# Same order as the ColumnTransformer output: numerical columns, then one-hot columns.
all_feature_names = np.concatenate((numerical_features, ohe_feature_names))

# Candidate values for RandomizedSearchCV. Each key uses the '<step>__<parameter>'
# form so the value is applied to the pipeline step named 'classifier'.
param_distributions = dict(
    classifier__n_estimators=[50, 100, 150, 200],
    classifier__learning_rate=[0.01, 0.05, 0.1, 0.2],
    classifier__max_depth=[3, 4, 5, 6],
    classifier__subsample=[0.6, 0.8, 1.0],
    classifier__min_samples_split=[2, 5, 10],
    classifier__min_samples_leaf=[1, 2, 4],
)

# Stratified K-Fold Cross Validation: five shuffled folds with a fixed seed.
skf = StratifiedKFold(shuffle=True, n_splits=5, random_state=42)

# Hyperparameter tuning with RandomizedSearchCV: 20 sampled candidates,
# evaluated on the folds above using all available cores.
baseline_search_options = dict(
    n_iter=20,
    cv=skf,
    verbose=0,
    n_jobs=-1,
    random_state=42,
)
random_search = RandomizedSearchCV(
    model_pipeline,
    param_distributions,
    **baseline_search_options,
)



In [None]:
# Run Hyperparameter Tuning

# time.perf_counter() is a monotonic, high-resolution clock intended for measuring
# durations. time.time() follows the wall clock, which can be adjusted while the
# search is running and would then report a wrong (even negative) elapsed time.
start_time = time.perf_counter()
random_search.fit(X_train, y_train)
end_time = time.perf_counter()
print(f"Hyperparameter tuning took {end_time - start_time:.2f} seconds")
print("Best Hyperparameters:", random_search.best_params_)

# Pipeline refitted on the whole training split with the best sampled parameters.
best_model = random_search.best_estimator_

# Evaluate the model on the held-out test split.
y_pred = best_model.predict(X_test)
# Column index 1 of predict_proba is used as the score for ROC AUC
# (presumably the positive HeartDisease class -- verify against best_model.classes_).
y_proba = best_model.predict_proba(X_test)[:, 1]

baseline_accuracy = accuracy_score(y_test, y_pred)
baseline_report = classification_report(y_test, y_pred)
baseline_confusion = confusion_matrix(y_test, y_pred)
baseline_roc_auc = roc_auc_score(y_test, y_proba)

print("Accuracy:", baseline_accuracy)
print("Classification Report:\n", baseline_report)
print("Confusion Matrix:\n", baseline_confusion)
print("ROC AUC Score:", baseline_roc_auc)

In [None]:
# Plot Feature importance
# Pair each preprocessed column name with the tuned classifier's importance score
# and rank them from most to least important.
tuned_classifier = best_model.named_steps['classifier']
feature_importances = tuned_classifier.feature_importances_
feature_names = all_feature_names
importance_df = (
    pd.DataFrame({'Feature': feature_names, 'Importance': feature_importances})
    .sort_values(by='Importance', ascending=False)
)

importance_ax = sns.barplot(data=importance_df, x='Importance', y='Feature')
importance_ax.set_title('Feature Importances')
plt.show()

In [None]:
# Feature selection based on feature importance
# (This part can be expanded based on specific insights from feature importance)

# Every preprocessed column whose importance is strictly below this cut-off is
# marked for removal.
threshold = 0.005
below_threshold = importance_df['Importance'] < threshold
to_drop = importance_df.loc[below_threshold, 'Feature'].tolist()
print("Features to drop based on importance threshold:", to_drop)

def drop_low_importance_features(X, feature_names=None, features_to_drop=None):
    """Label the preprocessed matrix and remove the low-importance columns.

    Parameters
    ----------
    X : array-like of shape (n_samples, n_features)
        Output of the preprocessing step. A sparse-style object exposing
        ``toarray()`` is densified first, because ``pd.DataFrame`` cannot be
        built directly from it with explicit column labels.
    feature_names : sequence of str, optional
        Column labels for ``X``, in order. Defaults to the notebook-level
        ``all_feature_names`` so the one-argument call made by
        ``FunctionTransformer`` keeps working unchanged.
    features_to_drop : sequence of str, optional
        Labels to remove. Defaults to the notebook-level ``to_drop``. Labels
        that are not present in ``feature_names`` are skipped silently.

    Returns
    -------
    pandas.DataFrame
        An independent copy of ``X`` with the selected columns removed.

    Raises
    ------
    ValueError
        Raised by pandas when the number of columns in ``X`` differs from
        ``len(feature_names)``.
    """
    # Fall back to the notebook globals only when the caller gave nothing;
    # an explicit empty list must be respected, hence the `is None` checks.
    if feature_names is None:
        feature_names = all_feature_names
    if features_to_drop is None:
        features_to_drop = to_drop

    # ColumnTransformer may return a sparse matrix when its output is sparse enough.
    if hasattr(X, "toarray"):
        X = X.toarray()

    # copy=True guarantees the result never aliases the caller's array.
    labelled = pd.DataFrame(X, columns=list(feature_names), copy=True)
    return labelled.drop(columns=list(features_to_drop), errors='ignore')

# Rebuild the model pipeline with a selection step between preprocessing and the
# classifier. This rebinds `model_pipeline`, so cells that run after this one
# see the three-step version.
selector_step = FunctionTransformer(drop_low_importance_features)
model_pipeline = Pipeline([
    ('preprocessor', preprocessor),
    ('feature_selector', selector_step),
    ('classifier', GradientBoostingClassifier(random_state=42)),
])


In [None]:
# Hyperparameter tuning with RandomizedSearchCV (after feature selection).
# Same folds, search space, candidate count and seed as the first search;
# `model_pipeline` is whatever pipeline is bound to that name when this cell runs.
selected_search_options = dict(
    n_iter=20,
    cv=skf,
    verbose=0,
    n_jobs=-1,
    random_state=42,
)
random_search = RandomizedSearchCV(
    model_pipeline,
    param_distributions,
    **selected_search_options,
)

# Fit the model
# time.perf_counter() is a monotonic clock meant for measuring durations; unlike
# time.time() it cannot be skewed by wall-clock adjustments during the search.
start_time = time.perf_counter()
random_search.fit(X_train, y_train)
end_time = time.perf_counter()
print(f"RandomizedSearchCV took {end_time - start_time:.2f} seconds to complete.")
print("Best Hyperparameters:", random_search.best_params_)

# Evaluate the model (after feature selection) on the held-out test split.
best_model = random_search.best_estimator_
y_pred = best_model.predict(X_test)
# Scores from column index 1 of predict_proba, used only for ROC AUC.
selected_scores = best_model.predict_proba(X_test)[:, 1]

accuracy = accuracy_score(y_test, y_pred)
conf_matrix = confusion_matrix(y_test, y_pred)
class_report = classification_report(y_test, y_pred)
roc_auc = roc_auc_score(y_test, selected_scores)

print(f"Accuracy: {accuracy:.4f}")
print("Confusion Matrix:", conf_matrix, sep="\n")
print("Classification Report:", class_report, sep="\n")
print(f"ROC AUC Score: {roc_auc:.4f}")



In [None]:
# Feature importance (after feature selection)

# The classifier only saw the columns that survived selection, so its importances
# line up with the original names minus the dropped ones, in the original order.
selected_classifier = best_model.named_steps['classifier']
feature_importances = selected_classifier.feature_importances_
dropped_names = set(to_drop)
feature_names = [name for name in all_feature_names if name not in dropped_names]

importance_df = (
    pd.DataFrame({'Feature': feature_names, 'Importance': feature_importances})
    .sort_values(by='Importance', ascending=False)
)

# Plot Feature importance
selected_ax = sns.barplot(data=importance_df, x='Importance', y='Feature')
selected_ax.set_title('Feature Importances After Feature Engineering')
plt.show()
 
