In [None]:
# 🔽 Auto-download dataset from Kaggle via `opendatasets` (requires Kaggle API credentials).
# NOTE(review): the original note pointed at ~/.kaggle/kaggle.json; confirm where
# opendatasets actually looks for kaggle.json (or whether it prompts interactively).
import glob
import os
import shutil

# Canonical location every later cell reads the CSV from.
target_path = 'data/heart.csv'

if os.path.isfile(target_path):
    # Already downloaded (or copied in by hand): skip the download so that a
    # re-run does not attempt it again or ask for credentials.
    print('✅ heart.csv ready at data/heart.csv')
else:
    try:
        import opendatasets as od
        od.download('https://www.kaggle.com/datasets/johnsmith88/heart-disease-dataset', data_dir='data')
        # Find heart.csv wherever Kaggle placed it
        candidates = glob.glob('data/**/heart.csv', recursive=True)
        if candidates:
            # Compare absolute paths rather than raw strings: glob returns
            # OS-native separators, so 'data\\heart.csv' would never equal
            # 'data/heart.csv' and the file would be copied onto itself.
            if os.path.abspath(candidates[0]) != os.path.abspath(target_path):
                shutil.copyfile(candidates[0], target_path)
            print('✅ heart.csv ready at data/heart.csv')
        else:
            print('⚠️ heart.csv not found after download. You may need to accept terms on Kaggle.')
    except Exception as e:
        # Best-effort cell: report the problem and let the user supply the file.
        print('⚠️ Kaggle download failed:', e)
        print('Please place heart.csv into the data/ folder manually.')


# 🏥 AI Disease Diagnosis System - Heart Disease Prediction

**Author**: Imani Gad  
**Date**: 2024  
**Objective**: Build an explainable ML model to predict heart disease from patient clinical data

---

## Table of Contents
1. Setup & Imports
2. Auto-Download Dataset (Kaggle)
3. Data Loading
4. Exploratory Data Analysis (EDA)
5. Data Preprocessing
6. Model Training
7. Model Evaluation
8. SHAP Explainability
9. Testing on New Patient
10. Model Saving


## 1. Setup & Imports

In [None]:
import pandas as pd
import numpy as np
import warnings
warnings.filterwarnings('ignore')

# Visualization
import matplotlib.pyplot as plt
import seaborn as sns
%matplotlib inline

# Machine Learning
from sklearn.model_selection import train_test_split, cross_val_score
from sklearn.pipeline import make_pipeline
from sklearn.preprocessing import StandardScaler
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import accuracy_score, confusion_matrix, classification_report, roc_curve, roc_auc_score, precision_recall_fscore_support

# Explainable AI
import shap

# Persistence
import joblib, os

# One seed for the whole notebook: it seeds NumPy's global RNG here and is
# passed as random_state to train_test_split and LogisticRegression later on.
RANDOM_STATE = 42
np.random.seed(seed=RANDOM_STATE)

print('✅ Libraries ready')

## 3. Data Loading

In [None]:
# `os` and `pd` come from the Setup & Imports cell; they are not re-imported here.
# Location the download cell writes to, and where a manual copy must be placed.
DATA_PATH = 'data/heart.csv'

# The download cell only prints a warning when it fails, so this is the first
# point where a missing file can be reported with an actionable message.
if not os.path.isfile(DATA_PATH):
    raise FileNotFoundError(
        f"{DATA_PATH} not found. Run the download cell or place heart.csv into the data/ folder manually."
    )

df = pd.read_csv(DATA_PATH)

# Every later cell (EDA, preprocessing) relies on the label column being present.
assert 'target' in df.columns, "expected a 'target' column in heart.csv"

print("📊 Dataset loaded:", df.shape)
df.head()

## 4. Exploratory Data Analysis (EDA)

In [None]:
# Structural overview: info() prints its report directly, while the describe()
# table is shown because it is the last expression of the cell.
df.info()
df.describe()

In [None]:
# Class balance of the label column
fig_target, ax_target = plt.subplots()
sns.countplot(x='target', data=df, ax=ax_target)
ax_target.set_title('Target Distribution (0=no disease, 1=disease)')
plt.show()

# Pairwise correlations between all columns, drawn on a larger canvas
fig_corr, ax_corr = plt.subplots(figsize=(10, 8))
sns.heatmap(df.corr(), annot=False, cmap='coolwarm', ax=ax_corr)
ax_corr.set_title('Correlation Heatmap')
plt.show()

## 5. Data Preprocessing

In [None]:
# Separate the label from the feature columns.
y = df['target']
X = df.drop(columns='target')

# Stratified 80/20 hold-out split, reproducible through RANDOM_STATE.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.2,
    stratify=y,
    random_state=RANDOM_STATE,
)

# Fit the scaler on the training rows only, then apply it to both splits.
scaler = StandardScaler().fit(X_train)
X_train_scaled = scaler.transform(X_train)
X_test_scaled = scaler.transform(X_test)

print(X_train.shape, X_test.shape)

## 6. Model Training

In [None]:
# Final model: fitted on the full, already-scaled training split. This is the
# estimator used by the evaluation, SHAP and saving cells.
model = LogisticRegression(random_state=RANDOM_STATE, max_iter=1000, solver='lbfgs')
model.fit(X_train_scaled, y_train)

# Cross-validate a scaler+model pipeline on the *unscaled* training data so the
# scaler is re-fitted inside every fold. Scoring on X_train_scaled would let
# each validation fold influence the scaling statistics, because that scaler
# was fitted on all training rows.
cv_pipeline = make_pipeline(
    StandardScaler(),
    LogisticRegression(random_state=RANDOM_STATE, max_iter=1000, solver='lbfgs'),
)
cv_scores = cross_val_score(cv_pipeline, X_train, y_train, cv=5, scoring='accuracy')
print("CV mean accuracy:", cv_scores.mean())

## 7. Model Evaluation

In [None]:
# Hard labels plus the probability of the second class (column 1) on the test split.
y_pred = model.predict(X_test_scaled)
y_pred_proba = model.predict_proba(X_test_scaled)[:, 1]

# Display names shared by the text report and the confusion-matrix ticks.
class_names = ['No Disease', 'Heart Disease']

print("Accuracy:", accuracy_score(y_test, y_pred))
print(classification_report(y_test, y_pred, target_names=class_names))

# Confusion matrix as an annotated heatmap
cm = confusion_matrix(y_test, y_pred)
fig_cm, ax_cm = plt.subplots()
sns.heatmap(
    cm,
    annot=True,
    fmt='d',
    cmap='Blues',
    xticklabels=class_names,
    yticklabels=class_names,
    ax=ax_cm,
)
ax_cm.set_title('Confusion Matrix')
ax_cm.set_ylabel('Actual')
ax_cm.set_xlabel('Predicted')
plt.show()

# ROC curve with the chance diagonal for reference
fpr, tpr, _ = roc_curve(y_test, y_pred_proba)
auc = roc_auc_score(y_test, y_pred_proba)
fig_roc, ax_roc = plt.subplots()
ax_roc.plot(fpr, tpr, label=f'AUC={auc:.2f}')
ax_roc.plot([0, 1], [0, 1], '--')
ax_roc.set_xlabel('FPR')
ax_roc.set_ylabel('TPR')
ax_roc.set_title('ROC Curve')
ax_roc.legend()
plt.show()

## 8. SHAP Explainability

In [None]:
# Build a SHAP explainer for the fitted logistic-regression model. The scaled
# training matrix is passed as the second argument (presumably the background
# data — confirm against the shap version in use).
explainer = shap.LinearExplainer(model, X_train_scaled)
# SHAP values for the rows of the scaled test split.
shap_values = explainer.shap_values(X_test_scaled)

# Global importance (bar)
shap.summary_plot(shap_values, X_test_scaled, feature_names=X.columns, plot_type='bar')

In [None]:
# Detailed summary
# Same SHAP values and feature names as the bar plot, but without plot_type,
# so shap falls back to its default summary plot.
shap.summary_plot(shap_values, X_test_scaled, feature_names=X.columns)

## 9. Testing on New Patient

In [None]:
# Hand-written example record; the keys are the feature column names.
new_patient = pd.DataFrame({
    'age':[58],'sex':[1],'cp':[2],'trestbps':[150],'chol':[280],'fbs':[1],'restecg':[1],
    'thalach':[140],'exang':[1],'oldpeak':[2.5],'slope':[2],'ca':[2],'thal':[3]
})
# Select the columns in the exact order the scaler and model were fitted on.
# A missing feature raises KeyError here instead of failing later, and the
# positional pairing with X.columns in the SHAP table no longer depends on the
# order of the dict keys above.
new_patient = new_patient[list(X.columns)]

new_scaled = scaler.transform(new_patient)
pred = model.predict(new_scaled)[0]
proba = model.predict_proba(new_scaled)[0]
pred, proba

In [None]:
# SHAP explanation for new patient
new_shap = explainer.shap_values(new_scaled)
contrib = pd.DataFrame({'Feature': X.columns, 'Value': new_patient.iloc[0].values, 'SHAP Value': new_shap[0]}).sort_values('SHAP Value', key=abs, ascending=False)
contrib.head(10)

## 10. Model Saving

In [None]:
# Persist everything needed to score new data later: the fitted model, the
# fitted scaler, and the feature names in training order.
os.makedirs('models', exist_ok=True)

artifacts = {
    'models/logistic_model.pkl': model,
    'models/scaler.pkl': scaler,
    'models/feature_names.pkl': list(X.columns),
}
for artifact_path, artifact in artifacts.items():
    joblib.dump(artifact, artifact_path)

print('✅ Saved model, scaler, and feature names.')