[![Open In Colab](https://colab.research.google.com/assets/colab-badge.svg)](https://colab.research.google.com/github/H-IAAC/d3vis_ipynb/blob/main/examples/graphs.ipynb)

# Import Datasets

In [3]:
from sklearn.svm import OneClassSVM
import pandas as pd
from sklearn.ensemble import IsolationForest
import numpy as np
from xgboost import XGBClassifier, XGBRegressor
from sklearn.model_selection import train_test_split
import shap
import gdown
import os
from d3vis_ipynb import BarPlot, HistogramPlot, ScatterPlot, LinearPlot, RangeSlider, MatrixLayout, MapPlot, BeeswarmPlot

In [9]:
output_path = "data/dados_saeb.csv"

# Fetch the dataset from Google Drive only when there is no local copy yet,
# so re-running this cell does not download the file again.
if not os.path.isfile(output_path):
    # Derive the directory from output_path instead of repeating the literal,
    # and let exist_ok=True make creation a no-op when it already exists.
    os.makedirs(os.path.dirname(output_path), exist_ok=True)
    file_id = "1ClwCEiXc2bqM5bZ__u1Zxw0Q39yJDqjE"
    gdown.download(f"https://drive.google.com/uc?id={file_id}", output_path, quiet=False)

# The file is parsed as UTF-8 text with ';' as the field separator.
df = pd.read_csv(output_path, encoding='utf-8', sep=';')

In [11]:
# Remove the two literal prefixes from the column labels (regex=False means
# the patterns are matched as plain text), first the ALUNOS one and then the
# PROFESSOR one.
df.columns = (
    df.columns
    .str.replace("ALUNOS_PORCENTAGEM_", "", regex=False)
    .str.replace("PROFESSOR_PORCENTAGEM_", "", regex=False)
)

In [13]:
# Feature matrix: every column except the two score columns and the school id.
df_drop = df.drop(columns=['MEDIA_EM_MT', 'MEDIA_EM_LP', 'ALUNOS_ID_ESCOLA'])
X = df_drop
# Regression target.
y = df['MEDIA_EM_MT']

# 80/20 train/test split with a fixed seed so the partition is repeatable.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.2,
    random_state=42,
)

# Keyword arguments forwarded unchanged to the XGBRegressor constructor.
parameters = {
    'lambda': 0.007342571956423321,
    'alpha': 0.0049894947342883815,
    'colsample_bytree': 0.5,
    'subsample': 0.7,
    'learning_rate': 0.03263662172830031,
    'n_estimators': 801,
    'max_depth': 4,
    'min_child_weight': 9,
    'gamma': 0.0014832545774497129,
}

model = XGBRegressor(**parameters)
# Fit on the training split only; the test split is not used in this cell.
model.fit(X_train, y_train)

In [14]:
# Compute SHAP values.
# The explainer is built from the fitted model, with the full feature matrix X
# passed as its second argument (presumably the background/masker data —
# confirm against the shap documentation). Calling the explainer on X then
# yields the explanation object stored in shap_values, which later cells slice.
explainer = shap.Explainer(model, X)
shap_values = explainer(X)



In [15]:
# Global importance of each feature: the mean absolute SHAP value taken over
# axis 0 of shap_values.values, paired with the column names of X.
feature_importance = pd.DataFrame({
    'feature': X.columns,
    'importance': np.abs(shap_values.values).mean(axis=0)
})

# Sort by importance (descending) and keep the 20 most important features.
top_20_features = feature_importance.sort_values(by='importance', ascending=False).head(20)
important_features = top_20_features['feature'].tolist()

# Positions of the top features along the feature axis. feature_importance is
# built with the default positional index, so the index labels that survive the
# sort ARE the column positions. This avoids rebuilding list(X.columns) and
# doing a linear name search for every feature, and it stays correct even if
# two columns share a name.
important_indices = top_20_features.index.tolist()

# Restrict the explanation to the selected features (all rows kept).
filtered_shap_values = shap_values[:, important_indices]

.values =
array([[ 2.10681066,  1.78957442,  0.69119385, ...,  0.29689084,
         1.30396957,  0.98390038],
       [-1.25689289,  1.68717114,  0.93039545, ..., -0.45972767,
         0.44236923,  0.11420441],
       [ 1.61236479, -1.63662815, -3.04713614, ..., -0.95211403,
         1.09564758, -1.67356185],
       ...,
       [ 2.39736109, -1.1983179 , -0.72525895, ...,  0.61214018,
        -1.51259644,  1.31150372],
       [ 2.54077968, -0.96219824, -0.29222381, ...,  0.67398473,
         0.81968006,  1.15127666],
       [ 0.64733507, -1.08320105,  0.99317098, ...,  0.01226893,
        -0.74410098,  0.95507939]])

.base_values =
array([273.6047303, 273.6047303, 273.6047303, ..., 273.6047303,
       273.6047303, 273.6047303])

.data =
array([[0.03030303, 0.87878788, 0.21212121, ..., 0.15151515, 0.81818182,
        0.1969697 ],
       [0.07746479, 0.85211268, 0.21126761, ..., 0.18309859, 0.62676056,
        0.11971831],
       [0.03571429, 0.60714286, 0.60714286, ..., 0.21428571, 0.857

# BeeswarmPlot

Generates a beeswarm plot from data.

Parameters:
- **explanation**: a SHAP `Explanation` object

In [16]:
from d3vis_ipynb import BeeswarmPlot
# Build the widget from the first 500 entries of the filtered explanation;
# the bare name on the last line displays it as the cell output.
beeswarmPlot = BeeswarmPlot(explanation=filtered_shap_values[:500])
beeswarmPlot

BeeswarmPlot(baseValue=np.float64(273.6047302957461), dataRecords=[{'feature_names': 'Q017A_Lazer_Nao_Uso_Temp…

# DecisionPlot

Generates a decision plot from data.

Parameters:
- **explanation**: a SHAP `Explanation` object

In [18]:
from d3vis_ipynb import DecisionPlot
# Build the widget from the first 50 entries of the filtered explanation;
# the bare name on the last line displays it as the cell output.
decisionPlot = DecisionPlot(explanation=filtered_shap_values[:50])
decisionPlot

DecisionPlot(baseValue=np.float64(273.6047302957461), dataRecords=[{'feature_names': 'Q017A_Lazer_Nao_Uso_Temp…

# ForcePlot

Generates a force plot from data.

Parameters:
- **baseValue**: a float with the base value
- **explanation**: a SHAP `Explanation` object

Can be initialized two different ways: with a *shap._explanation.Explanation* object or with a *pandas.core.frame.DataFrame* object.

In [19]:
from d3vis_ipynb import ForcePlot
# Build the widget from a single entry (index 0) of the filtered explanation;
# the bare name on the last line displays it as the cell output.
forcePlot = ForcePlot(explanation=filtered_shap_values[0])
forcePlot

ForcePlot(baseValue=np.float64(273.6047302957461), dataRecords=[{'data': 0.0303030303030303, 'feature_names': …

Click on the polygons and run the cell below:

# WaterfallPlot

Generates a waterfall plot from data.

Parameters:
- **baseValue**: a float with the base value
- **explanation**: a SHAP `Explanation` object

Can be initialized two different ways: with a *shap._explanation.Explanation* object or with a *pandas.core.frame.DataFrame* object.

In [24]:
from d3vis_ipynb import WaterfallPlot
# Build the widget from a single entry (index 0) of the filtered explanation;
# the bare name on the last line displays it as the cell output.
waterfallPlot = WaterfallPlot(explanation=filtered_shap_values[0])
waterfallPlot

WaterfallPlot(baseValue=np.float64(273.6047302957461), dataRecords=[{'data': 0.0303030303030303, 'feature_name…

Click on the polygons and run the cell below:

In [None]:
# Left commented out in the notebook. Presumably, uncommenting the line below
# shows the values selected by clicking in the waterfall plot — confirm
# against the d3vis_ipynb documentation.
# waterfallPlot.selectedValues