In [None]:
# Enable IPython's autoreload extension (mode 2) so edits to imported project
# modules are picked up live, without restarting the kernel.
%load_ext autoreload
%autoreload 2

In [None]:
import lightgbm as lgb
import numpy as np
import pandas as pd
import shap
import sklearn
from dmgpred.train import get_pipeline
from dmgpred.utils.loading import load_data
from sklearn.metrics import (
    ConfusionMatrixDisplay,
)
from sklearn.model_selection import train_test_split

# Load SHAP's JavaScript visualization code into the notebook; interactive
# plots such as the force plot further down rely on it.
shap.initjs()
# Make scikit-learn transformers return pandas objects, so the preprocessed
# features expose `.columns` for use as feature names.
sklearn.set_config(transform_output="pandas")
# Seed NumPy's legacy global RNG.
np.random.seed(0)

In [None]:
# Read the processed dataset and pull out the training features and labels.
data = load_data(data_dir="../data/", processed=True)
X, y = data["X_train"], data["y_train"]

In [None]:
# Quick look at the first rows of the feature table.
X.head(n=5)

In [None]:
# Hold out 20% of the rows for evaluation; fixed random_state keeps the split
# reproducible across runs.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.2,
    random_state=0,
)

In [None]:
# Build the project pipeline around a default LightGBM classifier and fit it
# on the training split. `fit` stays the last expression so the fitted
# pipeline is displayed as the cell output.
lgbm = lgb.LGBMClassifier()
pipe = get_pipeline(X_train, clf=lgbm)
pipe.fit(X_train, y_train)

In [None]:
# Pull the fitted preprocessing step and the fitted classifier out of the
# pipeline so they can be inspected separately.
preprocessor, clf = (
    pipe.named_steps[step_name] for step_name in ("preprocessor", "clf")
)

In [None]:
# Run both splits through the already-fitted preprocessor (transform only, no
# refit) and keep the resulting column labels as the model's feature names.
X_train_preprocessed, X_test_preprocessed = (
    preprocessor.transform(split) for split in (X_train, X_test)
)
feature_names = X_test_preprocessed.columns

## Feature Importances

In [None]:
# Horizontal bar chart of the classifier's feature importances, sorted
# ascending so the most important feature ends up at the top of the chart.
importances = pd.Series(clf.feature_importances_, index=feature_names)
importances.sort_values(ascending=True).plot(kind="barh")

## Confusion Matrices

In [None]:
# Confusion matrix of the full pipeline (preprocessing + classifier) on the
# held-out split.
ConfusionMatrixDisplay.from_estimator(
    estimator=pipe,
    X=X_test,
    y=y_test,
    cmap="Blues",
)

## SHAP Values

In [None]:
# Tree-specific SHAP explainer for the fitted classifier. It is fed the
# preprocessed features because `clf` is the pipeline's final step and never
# sees the raw columns.
explainer = shap.TreeExplainer(model=clf)
explanation = explainer(X_test_preprocessed)

In [None]:
# Raw SHAP values for the held-out split.
# NOTE(review): the plotting cells below index this as a 3-D array
# (`shap_values[:, :, k]`), presumably (sample, feature, class) — confirm
# against the installed shap version, since older releases return a list of
# per-class arrays instead.
shap_values = explainer.shap_values(X_test_preprocessed)

In [None]:
# Force plot for ONE sample and ONE class.
#
# The other plotting cells index `shap_values` as a 3-D array
# (`shap_values[:, :, k]`), so `shap_values[1, :]` is a 2-D (feature x class)
# slice. Pairing that slice with the scalar base value of a single class makes
# the plot treat features as samples and classes as features. Select the
# sample and the class explicitly, and use the same class index for the base
# value and for the SHAP vector.
sample_idx = 1  # row of the held-out split to explain
class_idx = 0  # class whose model output is explained
shap.plots.force(
    explainer.expected_value[class_idx],
    shap_values[sample_idx, :, class_idx],
    # Passing the feature row labels the plot with feature names and values.
    X_test_preprocessed.iloc[sample_idx, :],
)

In [None]:
# Summary plot of the SHAP values at index 1 of the last axis, drawn against
# the preprocessed feature values.
shap.summary_plot(shap_values[..., 1], features=X_test_preprocessed)

In [None]:
# Violin plot of the per-feature SHAP value distribution at index 1 of the
# last axis.
shap.plots.violin(
    shap_values[..., 1],
    feature_names=feature_names,
)

In [None]:
# Same violin plot as the previous cell, for index 2 of the last axis.
shap.plots.violin(
    shap_values[..., 2],
    feature_names=feature_names,
)

## Wrong Predictions

In [None]:
# Predict on the held-out split and keep the predicted labels that disagree
# with the ground truth.
# NOTE(review): the comparison assumes `y_test` is 1-D and in the same row
# order as `y_pred` — confirm what `load_data` returns for the labels.
y_pred = pipe.predict(X_test)
is_wrong = y_pred != y_test
wrong_preds = y_pred[is_wrong]