In [None]:
# Install required packages
%pip install numpy pandas matplotlib seaborn scikit-learn shap xgboost lightgbm


In [None]:
# Import required libraries
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.model_selection import train_test_split, cross_val_score, GridSearchCV
from sklearn.preprocessing import StandardScaler
from sklearn.ensemble import RandomForestClassifier, GradientBoostingClassifier
from sklearn.metrics import accuracy_score, classification_report, confusion_matrix, roc_curve, auc
from sklearn.pipeline import Pipeline
import xgboost as xgb
import lightgbm as lgb
import shap
import warnings
warnings.filterwarnings('ignore')

# Set visualization styles
plt.style.use('seaborn-v0_8-whitegrid')
sns.set_palette('viridis')
%matplotlib inline
plt.rcParams['figure.figsize'] = (12, 8)


In [None]:
## Data Loading

# For Google Colab: upload the datasets through the browser.
# The `google.colab` module only exists inside Colab, so guard the import and
# fall back to CSV files already present in the working directory elsewhere.
try:
    from google.colab import files
except ImportError:
    files = None

if files is not None:
    print("Please upload the heart disease datasets (heart.csv, heart_disease_uci.csv, heart_cleveland_upload.csv)")
    uploaded = files.upload()
else:
    # Keep `uploaded` defined in both environments so any cell that refers to it still runs.
    uploaded = {}
    print("google.colab is not available; expecting heart.csv, heart_disease_uci.csv and "
          "heart_cleveland_upload.csv in the current working directory.")

# Alternatively, you can download from GitHub if datasets are available there
# !wget https://raw.githubusercontent.com/your-username/heart-disease-project/main/heart.csv
# !wget https://raw.githubusercontent.com/your-username/heart-disease-project/main/heart_disease_uci.csv
# !wget https://raw.githubusercontent.com/your-username/heart-disease-project/main/heart_cleveland_upload.csv


In [None]:
# Load datasets
# The three CSV files are read by relative path, i.e. from the current working directory.
heart_df = pd.read_csv('heart.csv')
uci_df = pd.read_csv('heart_disease_uci.csv')
cleveland_df = pd.read_csv('heart_cleveland_upload.csv')

# Display basic information about datasets
print("Heart Dataset:")
print(f"Shape: {heart_df.shape}")
# Last expression of the cell: rendered as a table by the notebook.
heart_df.head()


In [None]:
# Shape and first rows of the UCI dataframe (the head is the cell's displayed output).
print("\nUCI Dataset:")
print(f"Shape: {uci_df.shape}")
uci_df.head()


In [None]:
# Shape and first rows of the Cleveland dataframe (the head is the cell's displayed output).
print("\nCleveland Dataset:")
print(f"Shape: {cleveland_df.shape}")
cleveland_df.head()


In [None]:
## Data Preprocessing and Exploration

# Check for missing values
# Each report is a per-column count of null entries in the raw (unprocessed) dataframe.
print("Missing values in Heart Dataset:")
print(heart_df.isnull().sum())

print("\nMissing values in UCI Dataset:")
print(uci_df.isnull().sum())

print("\nMissing values in Cleveland Dataset:")
print(cleveland_df.isnull().sum())


In [None]:
# Create a function for data preprocessing
def preprocess_data(df, dataset_name):
    """
    Preprocess a raw heart-disease dataframe and return a cleaned copy.

    Parameters
    ----------
    df : pandas.DataFrame
        Raw dataset. It is copied first and never modified.
    dataset_name : str
        Selects the dataset-specific handling:
        - 'uci': missing values in float64/int64 columns are filled with the
          column median.
        - 'cleveland': a binary 'target' column (0 = value was 0, 1 = anything
          else) is derived from 'condition' if present, otherwise from a
          'target' column whose maximum exceeds 1.
        Any other name returns an unchanged copy.

    Returns
    -------
    pandas.DataFrame
        The processed copy. For 'cleveland' the source 'condition' column is
        removed once 'target' has been derived from it.
    """
    # Make a copy to avoid modifying the original
    data = df.copy()

    # Handle missing values if any (based on the dataset)
    if dataset_name == 'uci':
        # Fill missing values with median for numerical columns
        for col in data.select_dtypes(include=['float64', 'int64']).columns:
            if data[col].isnull().sum() > 0:
                data[col] = data[col].fillna(data[col].median())

    # Convert target variable to binary (0 or 1) if needed
    if dataset_name == 'cleveland':
        if 'condition' in data.columns:
            data['target'] = data['condition'].ne(0).astype('int64')
            # Drop the source label: leaving it in the frame would turn it into
            # a feature that encodes the target exactly (label leakage).
            data = data.drop(columns='condition')
        elif 'target' in data.columns and data['target'].max() > 1:
            data['target'] = data['target'].ne(0).astype('int64')

    # Create common feature names for all datasets
    # This would depend on your specific datasets

    return data

# Preprocess all datasets
# The second argument selects the dataset-specific branch inside preprocess_data;
# 'heart' matches none of its branches, so that frame is only copied.
heart_processed = preprocess_data(heart_df, 'heart')
uci_processed = preprocess_data(uci_df, 'uci')
cleveland_processed = preprocess_data(cleveland_df, 'cleveland')

# Sanity check: row/column counts after preprocessing.
print("Processed datasets shapes:")
print(f"Heart: {heart_processed.shape}")
print(f"UCI: {uci_processed.shape}")
print(f"Cleveland: {cleveland_processed.shape}")


In [None]:
## Exploratory Data Analysis

# Create histograms for key numerical features in the Heart dataset
fig, axes = plt.subplots(2, 3, figsize=(18, 10))
axes = axes.flatten()

numerical_cols = ['age', 'trestbps', 'chol', 'thalach', 'oldpeak']
used_panels = set()
for i, col in enumerate(numerical_cols):
    if col in heart_processed.columns:
        sns.histplot(data=heart_processed, x=col, hue='target', bins=20, kde=True, ax=axes[i])
        axes[i].set_title(f'Distribution of {col} by Target')
        axes[i].set_ylabel('Count')
        used_panels.add(i)

# The grid has more panels than features (and absent columns are skipped),
# so hide every panel that received no plot instead of showing empty axes.
for i, ax in enumerate(axes):
    if i not in used_panels:
        ax.set_visible(False)

plt.tight_layout()
plt.suptitle('Numerical Features Distribution by Target - Heart Dataset', fontsize=16, y=1.02)
plt.show()


In [None]:
# Create boxplots for numerical features to detect outliers
# `numerical_cols` is the feature list defined in the histogram cell above.
fig, axes = plt.subplots(2, 3, figsize=(18, 10))
axes = axes.flatten()

used_panels = set()
for i, col in enumerate(numerical_cols):
    if col in heart_processed.columns:
        sns.boxplot(data=heart_processed, x='target', y=col, ax=axes[i])
        axes[i].set_title(f'Boxplot of {col} by Target')
        axes[i].set_ylabel(col)
        axes[i].set_xlabel('Heart Disease')
        used_panels.add(i)

# The grid has more panels than features (and absent columns are skipped),
# so hide every panel that received no plot instead of showing empty axes.
for i, ax in enumerate(axes):
    if i not in used_panels:
        ax.set_visible(False)

plt.tight_layout()
plt.suptitle('Boxplots of Numerical Features by Target - Heart Dataset', fontsize=16, y=1.02)
plt.show()


In [None]:
# Correlation analysis
plt.figure(figsize=(12, 10))
# Restrict to numeric/bool columns explicitly: since pandas 2.0 `DataFrame.corr()`
# no longer drops non-numeric columns silently and raises on them instead.
corr = heart_processed.select_dtypes(include=['number', 'bool']).corr()
# Mask the upper triangle (including the diagonal) so each pair is shown once.
mask = np.triu(np.ones_like(corr, dtype=bool))
sns.heatmap(corr, mask=mask, annot=True, fmt=".2f", cmap='coolwarm', square=True, linewidths=.5)
plt.title('Correlation Matrix - Heart Dataset', fontsize=16)
plt.show()


In [None]:
# Categorical features analysis
categorical_cols = ['sex', 'cp', 'fbs', 'restecg', 'exang', 'slope', 'ca', 'thal']

# Two 2x2 figures: the first four categorical columns, then the last four.
panel_groups = [
    (categorical_cols[:4], 'Categorical Features by Target - Heart Dataset (Part 1)'),
    (categorical_cols[4:], 'Categorical Features by Target - Heart Dataset (Part 2)'),
]

for group_cols, figure_title in panel_groups:
    fig, axes = plt.subplots(2, 2, figsize=(16, 12))
    axes = axes.flatten()

    for i, col in enumerate(group_cols):
        # Columns absent from the dataframe leave their panel empty.
        if col not in heart_processed.columns:
            continue
        panel = axes[i]
        sns.countplot(data=heart_processed, x=col, hue='target', ax=panel)
        panel.set_title(f'Count of {col} by Target')
        panel.set_ylabel('Count')
        # Rotate x labels if needed
        panel.tick_params(axis='x', rotation=45)

    plt.tight_layout()
    plt.suptitle(figure_title, fontsize=16, y=1.02)
    plt.show()


In [None]:
## Feature Engineering and Data Preparation

# Define features and target variables
def prepare_data_for_modeling(df):
    """
    Split a processed dataframe into a feature matrix and a target vector.

    Parameters
    ----------
    df : pandas.DataFrame
        Processed dataset containing a 'target' column, or a 'condition'
        column that is used as the target when 'target' is absent.

    Returns
    -------
    X : pandas.DataFrame
        Features with label columns and identifier columns ('id', 'dataset',
        'patient_id') removed, and object/category columns one-hot encoded
        (first level dropped).
    y : pandas.Series
        The target column.

    Raises
    ------
    KeyError
        If neither 'target' nor 'condition' is present.
    """
    # 'target' takes precedence; 'condition' is the fallback label name.
    label_candidates = ('target', 'condition')
    target_col = next((col for col in label_candidates if col in df.columns), None)
    if target_col is None:
        raise KeyError(f"No target column found; expected one of {list(label_candidates)}")

    y = df[target_col]

    # Remove every label-like column, not only the chosen one: a leftover
    # 'condition' next to 'target' would leak the label into the features.
    non_feature_cols = {'id', 'dataset', 'patient_id', *label_candidates}
    X = df.drop(columns=[col for col in df.columns if col in non_feature_cols])

    # One-hot encode categorical features
    cat_cols = X.select_dtypes(include=['object', 'category']).columns.tolist()
    if cat_cols:
        X = pd.get_dummies(X, columns=cat_cols, drop_first=True)

    return X, y

# Prepare data for each dataset
# Each dataset is encoded independently, so the resulting feature columns
# are not guaranteed to match across datasets.
X_heart, y_heart = prepare_data_for_modeling(heart_processed)
X_uci, y_uci = prepare_data_for_modeling(uci_processed)
X_cleveland, y_cleveland = prepare_data_for_modeling(cleveland_processed)

# Print shapes to verify
print("Heart dataset:")
print(f"X shape: {X_heart.shape}, y shape: {y_heart.shape}")
print("\nUCI dataset:")
print(f"X shape: {X_uci.shape}, y shape: {y_uci.shape}")
print("\nCleveland dataset:")
print(f"X shape: {X_cleveland.shape}, y shape: {y_cleveland.shape}")


In [None]:
# Split data into training and testing sets
def split_data(X, y, test_size=0.2, random_state=42):
    """
    Stratified train/test split of a feature matrix and its target.

    The split is stratified on ``y`` and seeded with ``random_state``.
    Returns the tuple ``(X_train, X_test, y_train, y_test)``.
    """
    parts = train_test_split(
        X,
        y,
        test_size=test_size,
        random_state=random_state,
        stratify=y,
    )
    return tuple(parts)

# Split each dataset
X_train_heart, X_test_heart, y_train_heart, y_test_heart = split_data(X_heart, y_heart)
X_train_uci, X_test_uci, y_train_uci, y_test_uci = split_data(X_uci, y_uci)
X_train_cleveland, X_test_cleveland, y_train_cleveland, y_test_cleveland = split_data(X_cleveland, y_cleveland)

# Normalize/standardize the data
# Each scaler is fitted on its own training split only; the matching test
# split is transformed with those training statistics.
# `scaler` (no suffix) is the Heart scaler; later cells pickle it and use it for new-patient predictions.
scaler = StandardScaler()
X_train_heart_scaled = scaler.fit_transform(X_train_heart)
X_test_heart_scaled = scaler.transform(X_test_heart)

scaler_uci = StandardScaler()
X_train_uci_scaled = scaler_uci.fit_transform(X_train_uci)
X_test_uci_scaled = scaler_uci.transform(X_test_uci)

scaler_cleveland = StandardScaler()
X_train_cleveland_scaled = scaler_cleveland.fit_transform(X_train_cleveland)
X_test_cleveland_scaled = scaler_cleveland.transform(X_test_cleveland)

print("Data split and scaled successfully!")


In [None]:
## Model Training and Evaluation

# Define a function to evaluate model performance
def evaluate_model(model, X_train, X_test, y_train, y_test, dataset_name):
    """
    Fit ``model`` on the training split and report its test performance.

    Prints the accuracy and classification report, shows a confusion-matrix
    heatmap and, when the model exposes ``predict_proba``, a ROC curve built
    from the second probability column.

    Returns
    -------
    tuple
        ``(model, accuracy, report, fpr, tpr, roc_auc)``; the last three are
        ``None`` when the model has no ``predict_proba``.
    """
    # Fit in place, then score the held-out split.
    model.fit(X_train, y_train)
    predicted_labels = model.predict(X_test)

    accuracy = accuracy_score(y_test, predicted_labels)
    report = classification_report(y_test, predicted_labels)

    # ROC inputs default to None and are filled in only for probabilistic models.
    fpr = tpr = roc_auc = None
    if hasattr(model, "predict_proba"):
        positive_scores = model.predict_proba(X_test)[:, 1]
        fpr, tpr, _ = roc_curve(y_test, positive_scores)
        roc_auc = auc(fpr, tpr)

    # Text summary
    print(f"Performance on {dataset_name} dataset:")
    print(f"Accuracy: {accuracy:.4f}")
    print("Classification Report:")
    print(report)

    # Confusion matrix heatmap
    plt.figure(figsize=(8, 6))
    confusion = confusion_matrix(y_test, predicted_labels)
    sns.heatmap(confusion, annot=True, fmt='d', cmap='Blues', cbar=False)
    plt.title(f'Confusion Matrix - {dataset_name}')
    plt.ylabel('True Label')
    plt.xlabel('Predicted Label')
    plt.show()

    # ROC curve, when one could be computed
    has_roc = fpr is not None and tpr is not None
    if has_roc:
        plt.figure(figsize=(8, 6))
        plt.plot(fpr, tpr, color='darkorange', lw=2, label=f'ROC curve (area = {roc_auc:.2f})')
        plt.plot([0, 1], [0, 1], color='navy', lw=2, linestyle='--')
        plt.xlim([0.0, 1.0])
        plt.ylim([0.0, 1.05])
        plt.xlabel('False Positive Rate')
        plt.ylabel('True Positive Rate')
        plt.title(f'Receiver Operating Characteristic - {dataset_name}')
        plt.legend(loc="lower right")
        plt.show()

    return model, accuracy, report, fpr, tpr, roc_auc


In [None]:
# Train and evaluate Random Forest on Heart dataset
# evaluate_model fits the estimator on the scaled Heart training split and
# returns it together with accuracy, report and ROC data for the test split.
print("Training Random Forest on Heart dataset...")
rf_model = RandomForestClassifier(n_estimators=100, random_state=42)
rf_model, rf_accuracy, rf_report, rf_fpr, rf_tpr, rf_auc = evaluate_model(
    rf_model, X_train_heart_scaled, X_test_heart_scaled, y_train_heart, y_test_heart, "Heart")


In [None]:
# Train and evaluate XGBoost on Heart dataset
# Same splits and evaluation routine as the Random Forest cell, so the metrics are comparable.
print("Training XGBoost on Heart dataset...")
xgb_model = xgb.XGBClassifier(n_estimators=100, learning_rate=0.1, random_state=42)
xgb_model, xgb_accuracy, xgb_report, xgb_fpr, xgb_tpr, xgb_auc = evaluate_model(
    xgb_model, X_train_heart_scaled, X_test_heart_scaled, y_train_heart, y_test_heart, "Heart")


In [None]:
# Train and evaluate LightGBM on Heart dataset
# Same splits and evaluation routine as the other two models, so the metrics are comparable.
print("Training LightGBM on Heart dataset...")
lgb_model = lgb.LGBMClassifier(n_estimators=100, learning_rate=0.1, random_state=42)
lgb_model, lgb_accuracy, lgb_report, lgb_fpr, lgb_tpr, lgb_auc = evaluate_model(
    lgb_model, X_train_heart_scaled, X_test_heart_scaled, y_train_heart, y_test_heart, "Heart")


In [None]:
# Compare models performance
# `models` and `accuracies` are parallel lists (same order).
models = ['Random Forest', 'XGBoost', 'LightGBM']
accuracies = [rf_accuracy, xgb_accuracy, lgb_accuracy]

plt.figure(figsize=(10, 6))
sns.barplot(x=models, y=accuracies)
plt.title('Model Comparison - Heart Dataset')
plt.ylabel('Accuracy')
# NOTE(review): the y-axis starts at 0.7, so a model scoring below 0.7 shows no visible bar.
plt.ylim(0.7, 1.0)
plt.grid(True, axis='y', linestyle='--', alpha=0.7)
plt.show()


In [None]:
# Plot ROC curves for all models in one plot
# The fpr/tpr/auc values come from the evaluate_model calls in the training cells above.
plt.figure(figsize=(10, 8))
plt.plot(rf_fpr, rf_tpr, label=f'Random Forest (AUC = {rf_auc:.3f})', color='blue')
plt.plot(xgb_fpr, xgb_tpr, label=f'XGBoost (AUC = {xgb_auc:.3f})', color='red')
plt.plot(lgb_fpr, lgb_tpr, label=f'LightGBM (AUC = {lgb_auc:.3f})', color='green')
# Diagonal reference line (TPR == FPR).
plt.plot([0, 1], [0, 1], 'k--')
plt.xlim([0.0, 1.0])
plt.ylim([0.0, 1.05])
plt.xlabel('False Positive Rate')
plt.ylabel('True Positive Rate')
plt.title('ROC Curves Comparison')
plt.legend(loc="lower right")
plt.grid(True, linestyle='--', alpha=0.3)
plt.show()


In [None]:
## Model Explanation with SHAP

# Get the best model (assuming XGBoost performed the best)
# NOTE(review): the choice is hard-coded and not derived from the measured accuracies.
best_model = xgb_model  # Replace with the best model based on results

# Calculate SHAP values
explainer = shap.TreeExplainer(best_model)
shap_values = explainer.shap_values(X_test_heart_scaled)

# Convert test data back to DataFrame with feature names for better visualization
# The scaled array has no column labels; X_heart.columns restores them
# (assumes the split and scaler kept X_heart's column order -- TODO confirm).
X_test_heart_df = pd.DataFrame(X_test_heart_scaled, columns=X_heart.columns)

# Summary plot
plt.figure(figsize=(12, 8))
shap.summary_plot(shap_values, X_test_heart_df, plot_type="bar", show=False)
plt.title("Feature Importance Using SHAP Values")
plt.tight_layout()
plt.show()

# Detailed SHAP summary plot
plt.figure(figsize=(12, 10))
shap.summary_plot(shap_values, X_test_heart_df, show=False)
plt.title("SHAP Summary Plot")
plt.tight_layout()
plt.show()


In [None]:
# SHAP Dependence Plots for top features
# Get top 3 features based on SHAP values (mean absolute SHAP value per feature)
feature_importance = pd.DataFrame(
    np.abs(shap_values).mean(0),
    index=X_heart.columns,
    columns=['importance']
).sort_values('importance', ascending=False)

top_features = feature_importance.head(3).index.tolist()

# Create dependence plots for top 3 features
for feature in top_features:
    # Pass an explicit axis: without `ax`, shap.dependence_plot opens its own
    # figure, which leaves a separately created figure empty in the output.
    fig, ax = plt.subplots(figsize=(10, 7))
    shap.dependence_plot(feature, shap_values, X_test_heart_df, ax=ax, show=False)
    ax.set_title(f"SHAP Dependence Plot for {feature}")
    fig.tight_layout()
    plt.show()


In [None]:
## Feature Importance from Random Forest

# Get feature importances from Random Forest model
# Pairs each feature name with the fitted forest's importance score, largest first.
rf_importances = pd.DataFrame({
    'Feature': X_heart.columns,
    'Importance': rf_model.feature_importances_
}).sort_values('Importance', ascending=False)

# Plot top 15 features
plt.figure(figsize=(12, 8))
sns.barplot(x='Importance', y='Feature', data=rf_importances.head(15))
plt.title('Feature Importances from Random Forest')
plt.tight_layout()
plt.show()


In [None]:
## Hyperparameter Tuning

# Hyperparameter tuning for XGBoost model
# NOTE: this full grid is defined for reference only; the search below uses param_grid_small.
param_grid = {
    'n_estimators': [50, 100, 200],
    'max_depth': [3, 4, 5, 6],
    'learning_rate': [0.01, 0.1, 0.2],
    'subsample': [0.8, 0.9, 1.0],
    'colsample_bytree': [0.8, 0.9, 1.0]
}

# Use a subset of hyperparameters to save computation time
param_grid_small = {
    'n_estimators': [50, 100],
    'max_depth': [3, 5],
    'learning_rate': [0.01, 0.1]
}

print("Performing hyperparameter tuning for XGBoost. This may take a while...")
# 5-fold cross-validated grid search scored on accuracy, using all available cores.
grid_search = GridSearchCV(
    estimator=xgb.XGBClassifier(random_state=42),
    param_grid=param_grid_small,
    cv=5,
    scoring='accuracy',
    n_jobs=-1,
    verbose=1
)

grid_search.fit(X_train_heart_scaled, y_train_heart)

# Print best parameters and score
print("Best parameters:", grid_search.best_params_)
print("Best cross-validation score: {:.4f}".format(grid_search.best_score_))

# Get the best model
best_xgb_model = grid_search.best_estimator_

# Evaluate the tuned model
# evaluate_model calls fit() again, so the best estimator is re-trained on the
# same training split before it is scored on the test split.
print("\nEvaluating the tuned XGBoost model...")
tuned_xgb_model, tuned_xgb_accuracy, tuned_xgb_report, tuned_xgb_fpr, tuned_xgb_tpr, tuned_xgb_auc = evaluate_model(
    best_xgb_model, X_train_heart_scaled, X_test_heart_scaled, y_train_heart, y_test_heart, "Heart (Tuned XGBoost)")


In [None]:
## Cross-Dataset Evaluation

def _evaluate_on_external_dataset(model, fitted_scaler, feature_columns, X_external, y_external, dataset_name):
    """
    Score a Heart-trained model on the test split of another dataset.

    The external features are first reduced and reordered to
    ``feature_columns`` (the columns the model was trained on) and then scaled
    with ``fitted_scaler`` (the scaler fitted on the Heart training split), so
    the model sees inputs in the same layout and scale as during training.

    Returns ``(predictions, accuracy, report)``. When the external dataset
    lacks any required column, a message is printed and
    ``(None, nan, None)`` is returned instead of scoring mismatched inputs.
    """
    missing = [col for col in feature_columns if col not in X_external.columns]
    if missing:
        print(f"Skipping {dataset_name} dataset: it lacks feature columns required by the "
              f"Heart-trained model: {missing}")
        return None, float('nan'), None

    # Drop extra columns and enforce the training column order.
    X_aligned = X_external[list(feature_columns)]
    X_scaled = fitted_scaler.transform(X_aligned)

    predictions = model.predict(X_scaled)
    accuracy = accuracy_score(y_external, predictions)
    report = classification_report(y_external, predictions)

    print(f"Accuracy on {dataset_name} dataset: {accuracy:.4f}")
    print("Classification Report:")
    print(report)

    # Confusion matrix for the external dataset
    plt.figure(figsize=(8, 6))
    cm = confusion_matrix(y_external, predictions)
    sns.heatmap(cm, annot=True, fmt='d', cmap='Blues', cbar=False)
    plt.title(f'Confusion Matrix - {dataset_name} Dataset (with Heart-trained model)')
    plt.ylabel('True Label')
    plt.xlabel('Predicted Label')
    plt.show()

    return predictions, accuracy, report


# Test the best model trained on heart dataset on the other datasets.
# The unscaled test splits are passed in on purpose: the per-dataset scalers
# were fitted on different data (and possibly different columns) than the model.
print("Testing the best model on Cleveland dataset...")
cleveland_pred, cleveland_accuracy, cleveland_report = _evaluate_on_external_dataset(
    best_xgb_model, scaler, X_train_heart.columns, X_test_cleveland, y_test_cleveland, "Cleveland")

# Test on UCI dataset
print("\nTesting the best model on UCI dataset...")
uci_pred, uci_accuracy, uci_report = _evaluate_on_external_dataset(
    best_xgb_model, scaler, X_train_heart.columns, X_test_uci, y_test_uci, "UCI")


In [None]:
## Save the Best Model for Future Use

# Save the best model using pickle
import pickle

# Save the best model
# `with` guarantees the file is flushed and closed even if pickling fails;
# a bare open() inside pickle.dump() leaves the handle open.
model_filename = 'best_heart_disease_model.pkl'
with open(model_filename, 'wb') as model_file:
    pickle.dump(best_xgb_model, model_file)

# Save the scaler (the one fitted on the Heart training split)
scaler_filename = 'heart_scaler.pkl'
with open(scaler_filename, 'wb') as scaler_file:
    pickle.dump(scaler, scaler_file)

print(f"Model saved to {model_filename}")
print(f"Scaler saved to {scaler_filename}")

# Example of how to load and use the saved model
# NOTE: unpickling executes code from the file; only load pickles you created or trust.
print("\nExample of loading and using the saved model:")
print("```python")
print("import pickle")
print("# Load the model and scaler")
print("loaded_model = pickle.load(open('best_heart_disease_model.pkl', 'rb'))")
print("loaded_scaler = pickle.load(open('heart_scaler.pkl', 'rb'))")
print("# Prepare new data and make prediction")
print("new_data = pd.DataFrame(...) # Your new data")
print("new_data_scaled = loaded_scaler.transform(new_data)")
print("prediction = loaded_model.predict(new_data_scaled)")
print("probability = loaded_model.predict_proba(new_data_scaled)")
print("```")


In [None]:
## Make Predictions on New Data

# Create a function to make predictions on new data
def predict_heart_disease(data, model, scaler):
    """
    Make heart disease predictions on new data

    Parameters:
    -----------
    data : pandas DataFrame
        New patient data with the same features as the training data.
        Column order does not matter when the scaler recorded its training
        column names; extra columns are ignored in that case.
    model : trained model
        The trained model to use for predictions
    scaler : fitted scaler
        The scaler used to standardize the training data

    Returns:
    --------
    predictions : numpy array
        Binary predictions (0: No disease, 1: Disease)
    probabilities : numpy array
        Probability of heart disease

    Raises:
    -------
    ValueError
        If the scaler recorded its training column names and `data` lacks
        one of them.
    """
    # Scaling and prediction are positional, so put the columns into the order
    # the scaler was fitted with whenever that order is known.
    expected_columns = getattr(scaler, "feature_names_in_", None)
    if expected_columns is not None and hasattr(data, "columns"):
        expected_columns = list(expected_columns)
        missing = [col for col in expected_columns if col not in data.columns]
        if missing:
            raise ValueError(f"Input data is missing required feature columns: {missing}")
        data = data[expected_columns]

    # Scale the data
    scaled_data = scaler.transform(data)

    # Make predictions
    predictions = model.predict(scaled_data)
    probabilities = model.predict_proba(scaled_data)[:, 1]

    return predictions, probabilities

# Example: Create a sample patient data
print("Example: Prediction for a new patient")
sample_patient = pd.DataFrame({
    'age': [65],
    'sex': [1],  # Male
    'cp': [2],   # Chest pain type
    'trestbps': [145],  # Resting blood pressure
    'chol': [240],  # Cholesterol
    'fbs': [1],  # Fasting blood sugar > 120 mg/dl
    'restecg': [0],  # Resting ECG
    'thalach': [150],  # Max heart rate achieved
    'exang': [0],  # Exercise induced angina
    'oldpeak': [2.3],  # ST depression
    'slope': [0],  # Slope of peak exercise ST segment
    'ca': [1],  # Number of major vessels colored by fluoroscopy
    'thal': [3]  # Thalassemia
})

# Make prediction
# Check if the sample has the same columns as the training data
# NOTE: this is a set comparison, so it verifies column names but not their order.
if set(sample_patient.columns) != set(X_heart.columns):
    print("Warning: Sample data columns don't match training data columns.")
    print(f"Sample columns: {sample_patient.columns}")
    print(f"Training columns: {X_heart.columns}")
    print("Adjust the sample data or model before making predictions.")
else:
    # `scaler` is the scaler fitted on the Heart training split.
    prediction, probability = predict_heart_disease(sample_patient, best_xgb_model, scaler)
    
    print(f"Prediction: {'Heart Disease' if prediction[0] == 1 else 'No Heart Disease'}")
    print(f"Probability of Heart Disease: {probability[0]:.4f}")
    
    # Create a visualization for the prediction probability
    # Bar colour switches at a probability of 0.5.
    plt.figure(figsize=(8, 2))
    plt.barh(['Heart Disease Risk'], [probability[0]*100], color='coral' if probability[0] > 0.5 else 'skyblue')
    plt.xlim(0, 100)
    plt.xlabel('Probability (%)')
    plt.title('Heart Disease Risk Assessment')
    # Write the percentage as text just to the right of the bar's end.
    for i, v in enumerate([probability[0]*100]):
        plt.text(v + 3, i, f"{v:.1f}%")
    plt.tight_layout()
    plt.show()


In [None]:
## Conclusion and Future Work

# Summarize findings
# Take the best baseline model from the measured accuracies instead of
# asserting a fixed winner; `models` and `accuracies` are the parallel lists
# built in the model-comparison cell (ties resolve to the first entry).
best_model_name, best_model_accuracy = max(zip(models, accuracies), key=lambda pair: pair[1])

print("# Summary of Heart Disease Analysis Project")
print("\n## Key Findings:")
print(f"1. Several machine learning models were evaluated, with {best_model_name} showing the best performance "
      f"(test accuracy {best_model_accuracy:.4f})")
print("2. The tuned XGBoost model achieved an accuracy of approximately {:.1f}% on the test set".format(tuned_xgb_accuracy*100))
print("3. Most important features for heart disease prediction include:")
# The index of `feature_importance` holds the feature names (mean |SHAP| ranking).
for feature_name, row in feature_importance.head(5).iterrows():
    print(f"   - {feature_name}: {row['importance']:.4f}")

print("\n## Model Performance Across Datasets:")
print(f"- Heart dataset: {tuned_xgb_accuracy:.4f}")
print(f"- Cleveland dataset: {cleveland_accuracy:.4f}")
print(f"- UCI dataset: {uci_accuracy:.4f}")

print("\n## Future Work:")
print("1. Collect more data to improve model generalization")
print("2. Explore additional feature engineering techniques")
print("3. Test more advanced models like neural networks")
print("4. Create a web application for heart disease risk assessment")
print("5. Incorporate additional medical parameters for improved accuracy")
