In [1]:
# COMPLETE HEART DISEASE PREDICTION SYSTEM
# Let's run everything in the correct order

import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.model_selection import train_test_split, cross_val_score
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import roc_auc_score, classification_report, confusion_matrix, roc_curve
from sklearn.preprocessing import LabelEncoder
import joblib
import warnings
warnings.filterwarnings('ignore')

# Load and prepare the data
print("Loading and preprocessing data...")
df = pd.read_csv('heart_disease_uci.csv')

# Data preprocessing function
def preprocess_heart_data(df):
    df_clean = df.copy()
    
    # Handle missing values
    num_cols = ['age', 'trestbps', 'chol', 'thalch', 'oldpeak']
    for col in num_cols:
        df_clean[col] = pd.to_numeric(df_clean[col], errors='coerce')
        df_clean[col].fillna(df_clean[col].median(), inplace=True)
    
    # Categorical columns
    cat_cols = ['cp', 'restecg', 'slope', 'ca', 'thal']
    for col in cat_cols:
        df_clean[col].fillna(df_clean[col].mode()[0] if not df_clean[col].mode().empty else 'unknown', inplace=True)
    
    # Binary columns
    df_clean['fbs'] = df_clean['fbs'].fillna(False)
    df_clean['exang'] = df_clean['exang'].fillna(False)
    df_clean['fbs'] = df_clean['fbs'].map({True: 1, False: 0, 'TRUE': 1, 'FALSE': 0})
    df_clean['exang'] = df_clean['exang'].map({True: 1, False: 0, 'TRUE': 1, 'FALSE': 0})
    df_clean['sex'] = df_clean['sex'].map({'Male': 1, 'Female': 0})
    
    # Encode categorical variables
    categorical_features = ['cp', 'restecg', 'slope', 'thal', 'dataset']
    label_encoders = {}
    
    for feature in categorical_features:
        if feature in df_clean.columns:
            le = LabelEncoder()
            df_clean[feature] = le.fit_transform(df_clean[feature].astype(str))
            label_encoders[feature] = le
    
    # Create binary target
    df_clean['target'] = (df_clean['num'] > 0).astype(int)
    
    # Drop unnecessary columns
    cols_to_drop = ['id', 'num']
    df_clean = df_clean.drop([col for col in cols_to_drop if col in df_clean.columns], axis=1)
    
    return df_clean, label_encoders

# Preprocess the data
df_processed, encoders = preprocess_heart_data(df)
print(f"Data preprocessing complete. Shape: {df_processed.shape}")

# Prepare features and target
X = df_processed.drop('target', axis=1)
y = df_processed['target']
feature_names = list(X.columns)

print(f"Features: {feature_names}")
print(f"Target distribution: {y.value_counts().to_dict()}")

# Split the data
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.2, random_state=42, stratify=y
)

print(f"Training set: {X_train.shape}, Test set: {X_test.shape}")

# Train the final model
print("\nTraining Random Forest model...")
final_model = RandomForestClassifier(
    n_estimators=200,
    max_depth=20,
    min_samples_split=2,
    min_samples_leaf=1,
    random_state=42
)
final_model.fit(X_train, y_train)

print("Model training completed!")

Loading and preprocessing data...
Data preprocessing complete. Shape: (920, 15)
Features: ['age', 'sex', 'dataset', 'cp', 'trestbps', 'chol', 'fbs', 'restecg', 'thalch', 'exang', 'oldpeak', 'slope', 'ca', 'thal']
Target distribution: {1: 509, 0: 411}
Training set: (736, 14), Test set: (184, 14)

Training Random Forest model...
Model training completed!


In [2]:
# NOW RUN THE PERFORMANCE MONITORING

def monitor_model_performance(model, X_test, y_test, threshold=0.8):
    """
    Monitor model performance and alert if degradation
    """
    y_pred_proba = model.predict_proba(X_test)[:, 1]
    current_auc = roc_auc_score(y_test, y_pred_proba)
    current_accuracy = model.score(X_test, y_test)
    
    print(f"\nüìä CURRENT MODEL PERFORMANCE:")
    print(f"   AUC: {current_auc:.4f}")
    print(f"   Accuracy: {current_accuracy:.4f}")
    
    if current_auc < threshold:
        print("üö® ALERT: Model performance below threshold! Consider retraining.")
    else:
        print("‚úÖ Model performance satisfactory.")
    
    return current_auc, current_accuracy

# Check current performance
current_auc, current_accuracy = monitor_model_performance(final_model, X_test, y_test)


üìä CURRENT MODEL PERFORMANCE:
   AUC: 0.9281
   Accuracy: 0.8424
‚úÖ Model performance satisfactory.


In [3]:
# COMPREHENSIVE PREDICTION SYSTEM

def save_prediction_system(model, feature_names, encoders, filename='heart_disease_predictor.pkl'):
    """
    Save all components needed for predictions
    """
    prediction_system = {
        'model': model,
        'feature_names': feature_names,
        'encoders': encoders
    }
    
    joblib.dump(prediction_system, filename)
    print(f"Prediction system saved as '{filename}'")
    return prediction_system

# Save the system
prediction_system = save_prediction_system(final_model, feature_names, encoders)

def load_prediction_system(filename='heart_disease_predictor.pkl'):
    """
    Load the prediction system
    """
    return joblib.load(filename)

def predict_heart_disease_comprehensive(patient_data, prediction_system):
    """
    Make comprehensive predictions with detailed explanations
    """
    model = prediction_system['model']
    feature_names = prediction_system['feature_names']
    
    # Prepare the input data
    processed_data = preprocess_patient_data(patient_data, feature_names)
    
    # Make prediction
    prediction = model.predict(processed_data)[0]
    probability = model.predict_proba(processed_data)[0, 1]
    
    # Get feature contributions
    feature_contributions = get_feature_contributions(model, processed_data, feature_names)
    
    # Generate comprehensive result
    result = {
        'prediction': int(prediction),
        'probability': float(probability),
        'risk_level': get_risk_level(probability),
        'confidence': get_confidence(probability),
        'feature_contributions': feature_contributions,
        'recommendations': generate_recommendations(prediction, probability, feature_contributions)
    }
    
    return result

def preprocess_patient_data(patient_data, feature_names):
    """
    Preprocess new patient data to match training format
    """
    processed_data = {}
    
    for feature in feature_names:
        if feature in patient_data:
            processed_data[feature] = patient_data[feature]
        else:
            processed_data[feature] = get_default_value(feature)
    
    # Convert to DataFrame with correct column order
    df = pd.DataFrame([processed_data], columns=feature_names)
    
    return df

def get_default_value(feature):
    """
    Get sensible default values for missing features
    """
    defaults = {
        'age': 50,
        'trestbps': 120,
        'chol': 200,
        'thalch': 150,
        'oldpeak': 0,
        'sex': 1,
        'fbs': 0,
        'exang': 0,
        'cp': 0,
        'restecg': 0,
        'slope': 1,
        'ca': 0,
        'thal': 1,
        'dataset': 0
    }
    return defaults.get(feature, 0)

def get_feature_contributions(model, processed_data, feature_names, top_n=5):
    """
    Get top contributing features for the prediction
    """
    if hasattr(model, 'feature_importances_'):
        importances = model.feature_importances_
        feature_imp = list(zip(feature_names, importances))
        feature_imp.sort(key=lambda x: x[1], reverse=True)
        return feature_imp[:top_n]
    return []

def get_risk_level(probability):
    if probability >= 0.7:
        return "High Risk"
    elif probability >= 0.4:
        return "Medium Risk"
    else:
        return "Low Risk"

def get_confidence(probability):
    confidence = 2 * abs(probability - 0.5)
    if confidence > 0.8:
        return "Very High"
    elif confidence > 0.6:
        return "High"
    elif confidence > 0.4:
        return "Medium"
    else:
        return "Low"

def generate_recommendations(prediction, probability, feature_contributions):
    recommendations = []
    
    if prediction == 1 or probability > 0.3:
        recommendations.append("Consult a cardiologist for further evaluation")
        recommendations.append("Consider lifestyle modifications (diet, exercise)")
        
        top_features = [feat[0] for feat in feature_contributions[:3]]
        
        if 'oldpeak' in top_features:
            recommendations.append("Monitor ST depression during exercise")
        if 'thal' in top_features:
            recommendations.append("Discuss thalassemia testing with your doctor")
        if 'ca' in top_features:
            recommendations.append("Consider coronary angiography evaluation")
        if 'cp' in top_features:
            recommendations.append("Report any chest pain symptoms promptly")
        if 'trestbps' in top_features:
            recommendations.append("Monitor blood pressure regularly")
    else:
        recommendations.append("Maintain regular health checkups")
        recommendations.append("Continue healthy lifestyle habits")
    
    return recommendations

Prediction system saved as 'heart_disease_predictor.pkl'


In [4]:
# TEST THE COMPLETE SYSTEM

print("="*60)
print("TESTING THE COMPLETE PREDICTION SYSTEM")
print("="*60)

# Test patients
test_patients = [
    {
        'name': "High Risk Patient",
        'data': {
            'age': 65, 'sex': 1, 'cp': 3, 'trestbps': 180, 'chol': 300,
            'fbs': 1, 'restecg': 1, 'thalch': 100, 'exang': 1, 
            'oldpeak': 4.2, 'slope': 1, 'ca': 3, 'thal': 2, 'dataset': 0
        }
    },
    {
        'name': "Low Risk Patient", 
        'data': {
            'age': 45, 'sex': 0, 'cp': 1, 'trestbps': 120, 'chol': 180,
            'fbs': 0, 'restecg': 0, 'thalch': 160, 'exang': 0,
            'oldpeak': 0.5, 'slope': 2, 'ca': 0, 'thal': 1, 'dataset': 0
        }
    }
]

# Make predictions
for patient in test_patients:
    print(f"\n{'='*50}")
    print(f"PREDICTION FOR: {patient['name']}")
    print('='*50)
    
    result = predict_heart_disease_comprehensive(patient['data'], prediction_system)
    
    print(f"üîç Prediction: {'‚ù§Ô∏è HEART DISEASE DETECTED' if result['prediction'] else '‚úÖ NO HEART DISEASE'}")
    print(f"üìä Probability: {result['probability']:.3f} ({result['probability']*100:.1f}%)")
    print(f"‚ö†Ô∏è  Risk Level: {result['risk_level']}")
    print(f"üéØ Confidence: {result['confidence']} Confidence")
    
    print(f"\nüìà Top Contributing Features:")
    for feature, importance in result['feature_contributions']:
        print(f"   - {feature}: {importance:.4f}")
    
    print(f"\nüí° Recommendations:")
    for i, recommendation in enumerate(result['recommendations'], 1):
        print(f"   {i}. {recommendation}")

TESTING THE COMPLETE PREDICTION SYSTEM

PREDICTION FOR: High Risk Patient
üîç Prediction: ‚ù§Ô∏è HEART DISEASE DETECTED
üìä Probability: 0.785 (78.5%)
‚ö†Ô∏è  Risk Level: High Risk
üéØ Confidence: Medium Confidence

üìà Top Contributing Features:
   - chol: 0.1267
   - cp: 0.1257
   - age: 0.1123
   - thalch: 0.1092
   - oldpeak: 0.1073

üí° Recommendations:
   1. Consult a cardiologist for further evaluation
   2. Consider lifestyle modifications (diet, exercise)
   3. Report any chest pain symptoms promptly

PREDICTION FOR: Low Risk Patient
üîç Prediction: ‚úÖ NO HEART DISEASE
üìä Probability: 0.040 (4.0%)
‚ö†Ô∏è  Risk Level: Low Risk
üéØ Confidence: Very High Confidence

üìà Top Contributing Features:
   - chol: 0.1267
   - cp: 0.1257
   - age: 0.1123
   - thalch: 0.1092
   - oldpeak: 0.1073

üí° Recommendations:
   1. Maintain regular health checkups
   2. Continue healthy lifestyle habits


In [5]:
# SIMPLIFIED PREDICTION FUNCTION

def quick_predict(age, sex, cp, trestbps, chol, thalch, oldpeak, 
                 fbs=0, restecg=0, exang=0, slope=1, ca=0, thal=1):
    """
    Quick prediction function with essential parameters
    """
    # Map string inputs to numeric values
    sex_map = {'male': 1, 'female': 0, 'm': 1, 'f': 0, '1': 1, '0': 0}
    cp_map = {
        'typical angina': 0, 'typical': 0,
        'atypical angina': 1, 'atypical': 1, 
        'non-anginal': 2, 'non anginal': 2,
        'asymptomatic': 3
    }
    slope_map = {
        'upsloping': 0, 'up': 0,
        'flat': 1,
        'downsloping': 2, 'down': 2
    }
    thal_map = {
        'normal': 1, 'fixed defect': 2, 'fixed': 2,
        'reversable defect': 3, 'reversable': 3, 'reversible': 3
    }
    
    patient_data = {
        'age': age,
        'sex': sex_map.get(str(sex).lower(), 1),
        'cp': cp_map.get(str(cp).lower(), 0),
        'trestbps': trestbps,
        'chol': chol,
        'fbs': 1 if fbs else 0,
        'restecg': restecg,
        'thalch': thalch,
        'exang': 1 if exang else 0,
        'oldpeak': oldpeak,
        'slope': slope_map.get(str(slope).lower(), 1),
        'ca': ca,
        'thal': thal_map.get(str(thal).lower(), 1),
        'dataset': 0
    }
    
    result = predict_heart_disease_comprehensive(patient_data, prediction_system)
    
    print(f"\nüéØ QUICK PREDICTION RESULT:")
    print(f"   Heart Disease: {'YES' if result['prediction'] else 'NO'}")
    print(f"   Probability: {result['probability']*100:.1f}%")
    print(f"   Risk Level: {result['risk_level']}")
    print(f"   Confidence: {result['confidence']}")
    
    return result

# Test quick predictions
print("\n" + "="*50)
print("QUICK PREDICTION EXAMPLES")
print("="*50)

print("\nExample 1 - High risk factors:")
result1 = quick_predict(
    age=65, sex='male', cp='asymptomatic', 
    trestbps=180, chol=300, thalch=100, oldpeak=4.2,
    exang=1, fbs=1
)

print("\nExample 2 - Healthy patient:")
result2 = quick_predict(
    age=45, sex='female', cp='typical angina',
    trestbps=110, chol=180, thalch=160, oldpeak=0.5
)


QUICK PREDICTION EXAMPLES

Example 1 - High risk factors:

üéØ QUICK PREDICTION RESULT:
   Heart Disease: YES
   Probability: 71.5%
   Risk Level: High Risk
   Confidence: Medium

Example 2 - Healthy patient:

üéØ QUICK PREDICTION RESULT:
   Heart Disease: NO
   Probability: 17.5%
   Risk Level: Low Risk
   Confidence: High


In [6]:
# FINAL PERFORMANCE SUMMARY

print("="*60)
print("FINAL PROJECT SUMMARY")
print("="*60)

# Final performance check
final_auc, final_accuracy = monitor_model_performance(final_model, X_test, y_test)

print(f"\nüìã PROJECT STATISTICS:")
print(f"   Total patients: {len(df_processed)}")
print(f"   With heart disease: {y.sum()} ({y.mean()*100:.1f}%)")
print(f"   Number of features: {len(feature_names)}")
print(f"   Model: Random Forest")
print(f"   Best AUC: {final_auc:.4f}")
print(f"   Best Accuracy: {final_accuracy:.4f}")

print(f"\nüéØ TOP 5 MOST IMPORTANT FEATURES:")
feature_importance = list(zip(feature_names, final_model.feature_importances_))
feature_importance.sort(key=lambda x: x[1], reverse=True)
for i, (feature, importance) in enumerate(feature_importance[:5], 1):
    print(f"   {i}. {feature}: {importance:.4f}")

print(f"\n‚úÖ SYSTEM READY FOR USE!")
print(f"   - Prediction system saved as 'heart_disease_predictor.pkl'")
print(f"   - Use quick_predict() for simple predictions")
print(f"   - Use predict_heart_disease_comprehensive() for detailed analysis")

FINAL PROJECT SUMMARY

üìä CURRENT MODEL PERFORMANCE:
   AUC: 0.9281
   Accuracy: 0.8424
‚úÖ Model performance satisfactory.

üìã PROJECT STATISTICS:
   Total patients: 920
   With heart disease: 509 (55.3%)
   Number of features: 14
   Model: Random Forest
   Best AUC: 0.9281
   Best Accuracy: 0.8424

üéØ TOP 5 MOST IMPORTANT FEATURES:
   1. chol: 0.1267
   2. cp: 0.1257
   3. age: 0.1123
   4. thalch: 0.1092
   5. oldpeak: 0.1073

‚úÖ SYSTEM READY FOR USE!
   - Prediction system saved as 'heart_disease_predictor.pkl'
   - Use quick_predict() for simple predictions
   - Use predict_heart_disease_comprehensive() for detailed analysis


In [7]:
# For new predictions (after loading the saved system):
prediction_system = load_prediction_system()

# Quick prediction
result = quick_predict(55, 'male', 'atypical angina', 140, 240, 130, 1.5)

# Detailed prediction
patient_data = {
    'age': 60, 'sex': 1, 'cp': 3, 'trestbps': 150, 
    'chol': 280, 'fbs': 0, 'restecg': 1, 'thalch': 120,
    'exang': 1, 'oldpeak': 2.1, 'slope': 1, 'ca': 2, 
    'thal': 3, 'dataset': 0
}
detailed_result = predict_heart_disease_comprehensive(patient_data, prediction_system)


üéØ QUICK PREDICTION RESULT:
   Heart Disease: NO
   Probability: 40.0%
   Risk Level: Medium Risk
   Confidence: Low
