# Explaining Breast Cancer Diagnosis Predictions with SHAP

This notebook uses **SHAP (SHapley Additive exPlanations)** to interpret the predictions of a machine learning
classification model for breast cancer diagnosis, trained on the **Wisconsin Breast Cancer** dataset. The goals are:

- Implement SHAP for a binary classification task.
- Analyze feature importance in a clinical context.
- Create interpretable visualizations for clinical insights.
- Compare SHAP values between benign and malignant predictions.



In [None]:
# Setup: imports and display settings
import numpy as np
import pandas as pd
from sklearn.datasets import load_breast_cancer
from sklearn.model_selection import train_test_split
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import classification_report, confusion_matrix, roc_auc_score
import matplotlib.pyplot as plt

# Notebook-wide configuration: show up to 50 DataFrame columns and seed
# NumPy's global random generator so reruns are repeatable.
pd.options.display.max_columns = 50
np.random.seed(42)

print('Ready.')


In [None]:
# Fetch the Wisconsin Breast Cancer data bundled with scikit-learn and wrap it
# in pandas objects: features as a labelled DataFrame, target as a Series.
data = load_breast_cancer()
X = pd.DataFrame(data=data.data, columns=data.feature_names)
y = pd.Series(data.target)  # 0 = malignant, 1 = benign in sklearn's dataset (the target names printed below confirm it)

# Quick sanity checks on the feature names and class balance
first_ten_features = X.columns[:10].tolist()
class_counts = y.value_counts()
print('Feature names (first 10):', first_ten_features)
print('Target names:', data.target_names)
print('Value counts (target):\n', class_counts)

# Per-feature descriptive statistics, one feature per row
feature_summary = X.describe().transpose()
display(feature_summary)


In [None]:
# Hold out 20% of the rows for testing, stratified on the target so both
# splits keep the same class proportions.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.2,
    stratify=y,
    random_state=42,
)

# Fit a 200-tree random forest on the training split (fit() returns the estimator itself)
clf = RandomForestClassifier(n_estimators=200, random_state=42, n_jobs=-1).fit(X_train, y_train)

# Score the held-out split
y_pred = clf.predict(X_test)
y_proba = clf.predict_proba(X_test)[:, 1]  # second column = probability of class 1

report = classification_report(y_test, y_pred, target_names=data.target_names)
auc = roc_auc_score(y_test, y_proba)
conf_mat = confusion_matrix(y_test, y_pred)
print(report)
print('ROC AUC:', auc)
print('Confusion matrix:\n', conf_mat)


## SHAP explanation



In [None]:
# Import shap; if the package cannot be imported, print an install hint and
# re-raise so the notebook stops here instead of failing later.
try:
    import shap
except ImportError:
    # Only import failures get the "not installed" hint. Any other error is
    # left to propagate untouched so it is not mislabelled as a missing package.
    print('shap is not installed in this environment. To install, run:')
    print('    pip install shap')
    raise  # bare raise preserves the original traceback
else:
    # Kept outside the try body so a failure here is never reported as "not installed".
    print('shap version:', shap.__version__)


In [None]:
# Create an explainer and compute SHAP values for the RandomForest model
explainer = shap.TreeExplainer(clf)
# Positional 0..n-1 index so row i here lines up with row i of the SHAP arrays
X_test_sample = X_test.copy().reset_index(drop=True)
shap_values = explainer.shap_values(X_test_sample)

# The return layout depends on the installed shap release: older versions give
# a list with one (n_samples, n_features) array per class, while newer ones
# (reportedly 0.45+ - confirm against the installed version) give a single
# (n_samples, n_features, n_classes) array. Normalise to the per-class list so
# that `shap_values[class_idx]` means "all rows for that class" in later cells.
if isinstance(shap_values, np.ndarray) and shap_values.ndim == 3:
    shap_values = [shap_values[:, :, k] for k in range(shap_values.shape[2])]

print('Type of shap_values:', type(shap_values))
print('Number of classes (shap_values length):', len(shap_values))


In [None]:
# Summary plot of per-feature SHAP values for one class.
# class_idx selects which class's contributions are shown; 0 is 'malignant'.
class_idx = 0
shap.summary_plot(
    shap_values[class_idx],
    X_test_sample,
    feature_names=X_test_sample.columns,
    show=True,
)


In [None]:
# Dependence plot for whichever feature has the largest mean absolute SHAP
# value (averaged over test rows) for the selected class.
mean_abs_shap = np.mean(np.abs(shap_values[class_idx]), axis=0)
ascending_order = np.argsort(mean_abs_shap)
top_idx = ascending_order[-1]  # last position of an ascending sort = largest value
top_feature = X_test_sample.columns[top_idx]
print('Top feature by mean(|SHAP|):', top_feature)
shap.dependence_plot(
    top_feature,
    shap_values[class_idx],
    X_test_sample,
    show=True,
)


In [None]:
# Explain a single test row with a force plot
i = 5
# .iloc is positional, so this is the i-th test row's label regardless of y_test's index
true_label = y_test.iloc[i]
print('True label:', true_label, '  (0=malignant,1=benign)')
# Double brackets keep the row as a one-row DataFrame, which predict_proba expects
benign_proba = clf.predict_proba(X_test_sample.iloc[[i]])[0, 1]
print('Model proba for benign (class 1):', benign_proba)

shap.initjs()
# Static matplotlib rendering of the force plot for row i and the selected class
shap.force_plot(
    explainer.expected_value[class_idx],
    shap_values[class_idx][i],
    X_test_sample.iloc[i],
    matplotlib=True,
)


## Compare SHAP values between benign and malignant predictions



In [None]:
# Compare mean |SHAP| per feature between rows the model predicts as
# malignant (label 0) and rows it predicts as benign (label 1).
pred_labels = clf.predict(X_test_sample)
df_shap = pd.DataFrame(np.abs(shap_values[class_idx]), columns=X_test_sample.columns)
df_shap['predicted_label'] = pred_labels
# After the transpose: one row per feature, one column per predicted label
grouped = df_shap.groupby('predicted_label').mean().T

# Positive diff = the feature carries more attribution on predicted-malignant rows
grouped['diff'] = grouped[0] - grouped[1]
display(grouped.sort_values('diff', ascending=False).head(10))

# Plot the ten features whose mean |SHAP| differs most between the two groups
top_feats = grouped['diff'].abs().sort_values(ascending=False).head(10).index.tolist()
fig, ax = plt.subplots(figsize=(8, 6))
bar_width = 0.4
# Named y_pos rather than y so the target Series `y` created when the dataset
# was loaded is not overwritten (which would break re-running earlier cells).
y_pos = np.arange(len(top_feats))
ax.barh(y_pos - bar_width / 2, grouped.loc[top_feats, 0], height=bar_width, label='Predicted Malignant (mean |SHAP|)')
ax.barh(y_pos + bar_width / 2, grouped.loc[top_feats, 1], height=bar_width, label='Predicted Benign (mean |SHAP|)')
ax.set_yticks(y_pos)
ax.set_yticklabels(top_feats)
ax.invert_yaxis()  # largest difference at the top
ax.set_xlabel('Mean |SHAP value|')
ax.legend()
ax.set_title('Top features: mean |SHAP| by predicted label')
fig.tight_layout()
plt.show()


## Clinical interpretation 


In [None]:
# Save the fitted model and a per-row summary of the test set (optional).
# The CSV holds the test features, true and predicted labels, and each row's
# mean absolute SHAP value for the selected class.
import json  # unused in this cell; kept in case later cells rely on it
from pathlib import Path

import joblib

# Create the target directory first so the writes below do not fail with
# FileNotFoundError on machines where it does not exist yet.
output_dir = Path('/mnt/data')
output_dir.mkdir(parents=True, exist_ok=True)

joblib.dump(clf, str(output_dir / 'rf_breast_cancer_model.joblib'))

output_df = X_test_sample.copy().reset_index(drop=True)
# Reset the index so labels align row-by-row with the re-indexed features
output_df['true_label'] = y_test.reset_index(drop=True)
output_df['pred_label'] = pred_labels
# Column name follows class_idx, so it stays correct if another class is selected
output_df[f'mean_abs_shap_class{class_idx}'] = np.abs(shap_values[class_idx]).mean(axis=1)
output_df.to_csv(output_dir / 'breast_cancer_test_sample_with_shap.csv', index=False)
print('Saved model and CSV to /mnt/data/')
