In [1]:
#############################################################################################################
##### Notebook Explainability
##### Baseado em:
##  Dataset: https://www.kaggle.com/datasets/fedesoriano/stroke-prediction-dataset
##
##############################################################################################################
## Objetivos:
##   Demonstrar os principais metodos de explainability

In [2]:
#!pip install xgboost
#!pip install dice-ml

In [None]:
import pandas as pd
import imblearn
from sklearn.model_selection import train_test_split
from imblearn.over_sampling import RandomOverSampler

from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import f1_score, accuracy_score
from interpret.blackbox import LimeTabular
from interpret import show

pd.set_option('display.max_columns', None)

import shap
# import dice_ml


A module that was compiled using NumPy 1.x cannot be run in
NumPy 2.2.6 as it may crash. To support both 1.x and 2.x
versions of NumPy, modules must be compiled with NumPy 2.0.
Some module may need to rebuild instead e.g. with 'pybind11>=2.12'.

If you are a user of the module, the easiest solution will be to
downgrade to 'numpy<2' or try to upgrade the affected module.
We expect that some modules will need time to support NumPy 2.

Traceback (most recent call last):  File "c:\Users\Masmok\miniconda3\lib\runpy.py", line 196, in _run_module_as_main
    return _run_code(code, main_globals, None,
  File "c:\Users\Masmok\miniconda3\lib\runpy.py", line 86, in _run_code
    exec(code, run_globals)
  File "C:\Users\Masmok\AppData\Roaming\Python\Python310\site-packages\ipykernel_launcher.py", line 17, in <module>
    app.launch_new_instance()
  File "C:\Users\Masmok\AppData\Roaming\Python\Python310\site-packages\traitlets\config\application.py", line 1043, in launch_instance
    app.start()
 

AttributeError: _ARRAY_API not found

ImportError: numpy.core.multiarray failed to import

In [None]:
class DataLoader():
    """Load, preprocess, split and rebalance the stroke-prediction dataset.

    Intended call order: ``load_dataset()`` -> ``preprocess_data()`` ->
    ``get_data_split()`` -> ``oversample()`` (on the training split only).
    """

    def __init__(self):
        # Working DataFrame; stays None until load_dataset() has been called.
        self.data = None

    def _require_data(self):
        """Return ``self.data``, failing with a clear message if nothing is loaded."""
        if self.data is None:
            raise ValueError("No dataset loaded: call load_dataset() first.")
        return self.data

    def load_dataset(self, path="C:/Users/dealbuqc/Desktop/ontomqol/Datasets/stroke/healthcare-dataset-stroke-data.csv"):
        """Read the CSV file at ``path`` into ``self.data``.

        NOTE(review): the default is an absolute path on one specific machine;
        pass ``path`` explicitly when running the notebook anywhere else.
        """
        self.data = pd.read_csv(path)

    def preprocess_data(self, key=0):
        """One-hot encode the categorical columns, fill missing BMI and drop ``id``.

        The result replaces ``self.data`` in a single assignment, so a failure
        (e.g. a missing column) leaves the previously loaded frame untouched.

        Args:
            key: Currently unused; kept so existing calls keep working.

        Raises:
            ValueError: If no dataset has been loaded yet.
            KeyError: If an expected column is missing from the dataset.
        """
        raw = self._require_data()

        # One-hot encode every categorical column
        categorical_cols = ["gender",
                            "ever_married",
                            "work_type",
                            "Residence_type",
                            "smoking_status"]
        encoded = pd.get_dummies(raw[categorical_cols],
                                 prefix=categorical_cols, dtype=float)

        # Indicator columns go first, followed by the remaining original columns;
        # the encoded source columns and the row identifier (not a predictive
        # feature) are then removed.
        prepared = pd.concat([encoded, raw], axis=1)
        prepared = prepared.drop(columns=categorical_cols + ["id"])

        # Missing BMI values are replaced by 0.
        # NOTE(review): 0 is not a plausible BMI, so these rows show up as
        # outliers downstream (the notebook later filters bmi < 10 for plots).
        prepared["bmi"] = prepared["bmi"].fillna(0)

        self.data = prepared

    def get_data_split(self):
        """Split ``self.data`` into train/test features and target (80/20).

        Every column except the last is treated as a feature and the last
        column as the target (assumes the outcome column, ``stroke``, is last
        after ``preprocess_data`` -- TODO confirm against the CSV layout).

        NOTE(review): the split is seeded but not stratified.

        Returns:
            The ``train_test_split`` result, unpacked by callers as
            ``X_train, X_test, y_train, y_test``.

        Raises:
            ValueError: If no dataset has been loaded yet.
        """
        frame = self._require_data()
        features = frame.iloc[:, :-1]
        target = frame.iloc[:, -1]
        return train_test_split(features, target, test_size=0.20, random_state=2021)

    def oversample(self, X_train, y_train, random_state=2021):
        """Randomly oversample the minority class of a training split.

        Args:
            X_train: Training features (DataFrame).
            y_train: Training target (Series).
            random_state: Seed for the sampler so the resampled set is
                reproducible; defaults to the same seed used by
                ``get_data_split``. Pass ``None`` for unseeded sampling.

        Returns:
            ``(x_over, y_over)``: a DataFrame with the columns of ``X_train``
            and a Series named like ``y_train``, both rebuilt from the
            resampled NumPy arrays (so they carry a fresh default index).
        """
        sampler = RandomOverSampler(sampling_strategy='minority',
                                    random_state=random_state)
        # Resample on plain NumPy arrays, then rebuild the pandas objects
        x_np, y_np = sampler.fit_resample(X_train.to_numpy(), y_train.to_numpy())
        x_over = pd.DataFrame(x_np, columns=X_train.columns)
        y_over = pd.Series(y_np, name=y_train.name)
        return x_over, y_over

In [None]:
# Build the loader, read the CSV and run the preprocessing pipeline
data_loader = DataLoader()
data_loader.load_dataset()
data_loader.preprocess_data()

# Hold out the evaluation split first, then oversample the training split only
X_train, X_test, y_train, y_test = data_loader.get_data_split()
X_train, y_train = data_loader.oversample(X_train, y_train)

# Report the (rows, columns) shape of the training and evaluation features
for split_features in (X_train, X_test):
    print(split_features.shape)

(7778, 21)
(1022, 21)


In [None]:
# %% Train the black-box model (any classifier could be used here)
# The forest is seeded (same seed as the train/test split) so that the scores
# printed below, and the explanations later computed from `rf`, are
# reproducible across kernel restarts.
rf = RandomForestClassifier(random_state=2021)
rf.fit(X_train, y_train)
y_pred = rf.predict(X_test)
print(f"F1 Score {f1_score(y_test, y_pred, average='macro')}")
print(f"Accuracy {accuracy_score(y_test, y_pred)}")

F1 Score 0.5293487041273149
Accuracy 0.9403131115459883


In [None]:
# Apply LIME

# Original note: for classification use predict_proba, for regression use predict.
# The training data is passed as well because LIME needs it to generate the
# perturbations.
lime = LimeTabular(rf, X_train)

# Build the local explanation for the last 20 evaluation rows and render it
lime_local = lime.explain_local(X_test.tail(20), y_test.tail(20), name='LIME')
show(lime_local)

In [None]:
# Apply SHAP

import xgboost

# Fit a 20-tree XGBoost classifier on the training split (fit returns the model)
model = xgboost.XGBClassifier(n_estimators=20).fit(X_train, y_train)

# Compute SHAP values for the training features and draw the beeswarm summary
explainer = shap.Explainer(model)
shap_values = explainer(X_train)
shap.plots.beeswarm(shap_values)

In [None]:
# Preview the first rows of the (oversampled) training features
X_train.head()

In [None]:
import seaborn as sns
# Plot bmi against age for the training features; the trailing semicolon
# suppresses the text repr of the returned plot object.
sns.lmplot(x="age", y="bmi", data=X_train);

In [None]:
# A BMI below 18 already indicates an underweight person; below 10 is
# impossible, so rows with bmi < 10 are dropped into a separate frame.
print(X_train.shape[0])
implausible_bmi = X_train['bmi'] < 10
df_test = X_train.drop(X_train[implausible_bmi].index)
print(df_test.shape[0])

In [None]:
# Same age-vs-bmi plot, now on the frame without the bmi < 10 rows
sns.lmplot(x="age", y="bmi", data=df_test);

In [None]:
# Summary statistics of the training features
X_train.describe()

In [None]:
import xgboost

# Re-fit the same 20-tree classifier on the full training split ...
model = xgboost.XGBClassifier(n_estimators=20).fit(X_train, y_train)

# ... but compute SHAP values only for the rows kept after the bmi < 10 filter
explainer = shap.Explainer(model)
shap_values = explainer(df_test)
shap.plots.beeswarm(shap_values)

In [None]:
# Apply DiCE (Diverse Counterfactual Explanations)

# dice_ml is imported here because the notebook's import cell only contains it
# as a commented-out line; without this import the calls below raise NameError.
import dice_ml

# Dataset wrapper: DiCE is told which features are continuous (so it knows how
# to perturb them) and which column holds the outcome.
data_dice = dice_ml.Data(dataframe=data_loader.data,
                         continuous_features=['age',
                                              'avg_glucose_level',
                                              'bmi'],
                         outcome_name='stroke')

In [None]:
# Model: wrap the trained random forest for DiCE (other backends: tf, torch, ...)
rf_dice = dice_ml.Model(model=rf, backend="sklearn")

# Counterfactual generator using random sampling. Other methods (genetic
# algorithm, kd-tree, ...) and the options for deep-learning models are listed
# at github.com/interpretml/DICE
explainer = dice_ml.Dice(data_dice, rf_dice, method="random")

In [None]:
# %% Generate explanations
# Explain the first evaluation row, kept as a one-row DataFrame
input_datapoint = X_test.iloc[:1]
cf = explainer.generate_counterfactuals(
    input_datapoint,
    total_CFs=3,
    desired_class="opposite",
)

# Display the counterfactuals, showing only the values that changed
cf.visualize_as_dataframe(show_only_changes=True)


In [None]:
# %% Generate conditional counterfactuals
# Only these features are allowed to change ...
features_to_vary = ['avg_glucose_level', 'bmi', 'smoking_status_smokes']
# ... and these two must stay inside the given ranges
permitted_range = {
    'avg_glucose_level': [80, 250],
    'bmi': [18, 35],
}

cf = explainer.generate_counterfactuals(
    input_datapoint,
    total_CFs=3,
    desired_class="opposite",
    permitted_range=permitted_range,
    features_to_vary=features_to_vary,
)
# Display
cf.visualize_as_dataframe(show_only_changes=True)

In [None]:
#######################################################
### Codigo abaixo nao vai ser exigido
#######################################################

In [None]:
import numpy as np
import matplotlib.pyplot as plt
from tensorflow.keras.preprocessing import image
from tensorflow.keras.applications.vgg16 import preprocess_input, decode_predictions
from tensorflow.keras.applications.vgg16 import VGG16
from innvestigate import create_analyzer

# Load the VGG16 model with its pre-trained ImageNet weights.
# NOTE: this rebinds `model`, which earlier cells used for the XGBoost classifier.
model = VGG16(weights='imagenet')

In [None]:
# Load a sample image, resized to 224x224
img_path = 'C:/Users/dealbuqc/Desktop/ontomqol/Datasets/brain-mri/Testing/glioma_tumor/image(1).jpg'
img = image.load_img(img_path, target_size=(224, 224))

# Image -> array, add a leading batch axis, then apply the VGG16 preprocessing
x = preprocess_input(np.expand_dims(image.img_to_array(img), axis=0))

In [None]:
# Display the loaded (resized) image
plt.imshow(img)

In [None]:
# Classify the image and print the three highest-ranked decoded predictions
preds = model.predict(x)
top_three = decode_predictions(preds, top=3)[0]
print('Predicted:', top_three)

In [None]:
## THIS CODE WILL RAISE AN ERROR. TRY TO UNDERSTAND WHY!

# Create the analyzer: the LRP variant ("lrp.z") is left commented out and the
# plain "gradient" analyzer is the one actually built.
#analyzer = create_analyzer("lrp.z", model)
analyzer = create_analyzer("gradient", model)

# Run the analyzer on the preprocessed image
analysis = analyzer.analyze(x)

# Plot the heatmap
plt.imshow(analysis.squeeze(), cmap='viridis')
plt.colorbar()
plt.show()

# For a correct implementation (in PyTorch), see the link below:
# https://www.kaggle.com/code/gustavkeppler/layer-wise-relevance-propagation-lrp-on-vgg16

In [None]:
# Ver um demo em: 
### https://lrpserver.hhi.fraunhofer.de/image-classification