In [1]:
import numpy as np
import pandas as pd
import shap
import os
from sklearn.model_selection import train_test_split
from sklearn.ensemble import RandomForestClassifier
from sklearn import metrics
from joblib import load

from bd_lc_mediterranean.utilities.confusion_matrix import compute_confusion_matrix

### Data preprocessing

In [2]:
# Load the post-processed dataset and keep only rows whose values are all finite.
data = pd.read_csv('dataset_postprocessed.csv')
# `DataFrame.dropna()` returns a new frame; the result has to be assigned back,
# otherwise the call has no effect.
data = data.dropna()
# Drop rows that contain +/-inf. `axis` is passed by keyword because the
# positional form `.any(1)` is no longer accepted by pandas >= 2.0.
data = data[~data.isin([np.inf, -np.inf]).any(axis=1)]

In [3]:
# Target vector: the `class` column of every sample.
y_train_data = data["class"]

# Predictors: everything except the target, the coordinates and the
# per-season product names.
x_train_data = data.drop(
    columns=[
        "class",
        "latitude",
        "longitude",
        "spring_product_name",
        "autumn_product_name",
        "summer_product_name",
    ],
)
pc_columns = x_train_data.columns

# The same predictor columns, selected from `data` by name.
reduced_x_train_data = data.loc[:, pc_columns]
reduced_x_train_data.columns

Index(['slope', 'aspect', 'dem', 'spring_cri1', 'spring_evi2', 'spring_mndwi',
       'spring_moisture', 'spring_ndre', 'spring_ndvi', 'spring_ndyi',
       'spring_osavi', 'spring_ri', 'spring_AOT', 'spring_B01', 'spring_B02',
       'spring_B03', 'spring_B04', 'spring_B05', 'spring_B06', 'spring_B07',
       'spring_B08', 'spring_B09', 'spring_B11', 'spring_B12', 'spring_B8A',
       'spring_WVP', 'autumn_cri1', 'autumn_evi2', 'autumn_mndwi',
       'autumn_moisture', 'autumn_ndre', 'autumn_ndvi', 'autumn_ndyi',
       'autumn_osavi', 'autumn_ri', 'autumn_AOT', 'autumn_B01', 'autumn_B02',
       'autumn_B03', 'autumn_B04', 'autumn_B05', 'autumn_B06', 'autumn_B07',
       'autumn_B08', 'autumn_B09', 'autumn_B11', 'autumn_B12', 'autumn_B8A',
       'autumn_WVP', 'summer_cri1', 'summer_evi2', 'summer_mndwi',
       'summer_moisture', 'summer_ndre', 'summer_ndvi', 'summer_ndyi',
       'summer_osavi', 'summer_ri', 'summer_AOT', 'summer_B01', 'summer_B02',
       'summer_B03', 'summer_B04

In [4]:
# Hold out 15% of the samples; the fixed seed keeps the split reproducible.
X_train, X_test, y_train, y_test = train_test_split(
    reduced_x_train_data,
    y_train_data,
    test_size=0.15,
    random_state=0,
)
# Distinct values of the target column.
labels = y_train_data.unique()

In [5]:
# Load the previously trained model from disk.
# NOTE(review): joblib.load unpickles arbitrary objects — only load files from a trusted source.
model = load('model.joblib')
# NOTE: despite its name, `y_true` holds the model's predictions for X_test;
# the ground-truth labels of the test split are in `y_test`.
y_true = model.predict(X_test)

In [6]:
# Give predictions, ground truth and both feature splits a fresh 0..n-1 index
# so they can be matched by position. `reset_index` returns a new object by
# default, so no `inplace` argument is needed.
pred = pd.DataFrame(y_true).reset_index(drop=True)
real = y_test.reset_index(drop=True)
test = pd.DataFrame(X_test).reset_index(drop=True)
train = pd.DataFrame(X_train).reset_index(drop=True)

# Explainability with SHAP

In [7]:
# Build a SHAP TreeExplainer around the loaded model; later cells reuse `explainer`.
explainer = shap.TreeExplainer(model)

In [8]:
X_test

Unnamed: 0,slope,aspect,dem,spring_cri1,spring_evi2,spring_mndwi,spring_moisture,spring_ndre,spring_ndvi,spring_ndyi,...,summer_B04,summer_B05,summer_B06,summer_B07,summer_B08,summer_B09,summer_B11,summer_B12,summer_B8A,summer_WVP
78881,-0.754471,0.982909,-0.701,1.605321,1.262920,-0.550805,-0.004616,0.342479,0.526382,0.232340,...,-0.584857,-0.476000,-0.393857,-0.337857,-0.306286,-0.192000,-0.061714,-0.337857,-0.265000,-0.522429
112941,-0.633810,-0.562908,0.004,1.582229,1.025973,-0.460340,-0.007273,0.230635,0.427601,0.225475,...,-0.439429,-0.254571,-0.164571,-0.106000,-0.122286,-0.036857,0.136286,-0.159429,-0.073714,-0.632000
23,-0.762884,-0.500000,-0.285,1.268103,0.712715,-0.296924,0.040058,0.232441,0.297050,0.118206,...,-0.616571,-0.586857,-0.367714,-0.250286,-0.236000,-0.245714,-0.358000,-0.523714,-0.245714,-0.626857
66908,-0.836641,-0.300716,-0.378,1.260347,0.150304,-0.067387,0.031566,0.112322,0.062641,0.115180,...,-0.485143,-0.095143,-0.177143,-0.131143,-0.449143,-0.065857,-0.053571,-0.256000,-0.031000,-0.467857
102798,-0.887708,-0.721453,-0.801,1.274534,0.530860,-0.224842,0.072165,0.209964,0.221224,0.120699,...,-0.238857,-0.128286,0.073857,0.118571,0.171714,-0.085429,0.037714,-0.054571,0.083571,-0.293714
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
105916,-0.965526,0.874117,-0.079,1.295508,0.181102,-0.260071,-0.090520,0.065421,0.075484,0.128733,...,-0.468714,-0.533571,-0.491571,-0.514286,-0.434000,-0.368714,-0.313429,-0.319429,-0.516429,-0.702714
65610,-1.000000,-1.000000,-0.590,1.119266,-0.536471,0.492901,-0.237624,-0.333333,-0.225296,0.056277,...,-0.883000,-0.880286,-0.885000,-0.882571,-0.897714,-0.891000,-0.880571,-0.882714,-0.883000,-0.597571
22136,-0.733042,0.589020,-0.625,2.158371,1.863126,-0.573832,0.215101,0.403298,0.776567,0.366762,...,-0.861143,-0.652714,-0.237000,-0.134857,-0.106857,-0.058429,-0.397143,-0.655000,-0.064714,-0.530429
34959,-0.880783,-0.500000,-0.847,1.492941,1.216518,-0.544039,-0.043245,0.193165,0.507057,0.197735,...,-0.632286,-0.406571,-0.264857,-0.187143,-0.221143,-0.060429,0.011714,-0.203143,-0.102571,-0.439571


In [None]:
# Compute SHAP values for every row of the test split.
choosen_instance = X_test
shap_values = explainer.shap_values(choosen_instance)
# Load the SHAP JavaScript used to render interactive plots in the notebook.
shap.initjs()

In [None]:
# The SHAP values were computed for X_test, so the summary plot must receive
# that same feature matrix. Passing X_train mismatches the number of rows and
# would pair each contribution with the feature values of an unrelated sample.
shap.summary_plot(shap_values, X_test)

In [None]:
labels

### Split by class

In [None]:
def data_train(data, feature_columns=None, test_size=0.50, estimator=None):
    """Split a subset of the dataset and predict its held-out part.

    The previous version selected a hard-coded, alphabetically sorted column
    list. That list names columns (e.g. ``spring_ndbg``) that do not appear
    among the dataset columns displayed earlier in the notebook and omits
    others (e.g. ``spring_cri1``), so it did not match the feature layout the
    model is evaluated with in the earlier cells. By default the features are
    now derived the same way as in the preprocessing cell: every column except
    the target, the coordinates and the product names, in their original order.

    Parameters
    ----------
    data : pandas.DataFrame
        Rows to split; must contain a ``class`` column.
    feature_columns : sequence of str, optional
        Explicit feature columns to use. When omitted, all columns except the
        known non-feature columns are used.
    test_size : float, optional
        Fraction of rows held out for prediction (default ``0.50``).
    estimator : object with ``predict``, optional
        Model used for prediction. Falls back to the notebook-level ``model``.

    Returns
    -------
    tuple of pandas.DataFrame
        ``(X_train, X_test, y_train, y_test, y_true)``, each re-indexed from 0.
        ``y_true`` holds the estimator's predictions for ``X_test`` (the name
        is kept for compatibility with existing cells).
    """
    non_feature_columns = [
        "class",
        "latitude",
        "longitude",
        "spring_product_name",
        "autumn_product_name",
        "summer_product_name",
    ]

    y_train_data = data["class"]
    if feature_columns is None:
        # errors="ignore" lets the function accept frames from which some of
        # the non-feature columns have already been removed.
        reduced_x_train_data = data.drop(columns=non_feature_columns, errors="ignore")
    else:
        reduced_x_train_data = data[list(feature_columns)]

    X_train, X_test, y_train, y_test = train_test_split(
        reduced_x_train_data, y_train_data, test_size=test_size, random_state=0
    )

    if estimator is None:
        # Fall back to the model loaded at notebook level.
        estimator = model
    y_true = estimator.predict(X_test)

    # Re-index everything from 0 so the pieces can be matched by position.
    X_train = pd.DataFrame(X_train).reset_index(drop=True)
    X_test = pd.DataFrame(X_test).reset_index(drop=True)
    y_test = pd.DataFrame(y_test).reset_index(drop=True)
    y_train = pd.DataFrame(y_train).reset_index(drop=True)
    y_true = pd.DataFrame(y_true).reset_index(drop=True)

    return X_train, X_test, y_train, y_test, y_true


In [None]:
# Restrict the per-class analysis below to a single class.
# NOTE(review): this rebinds `labels`, which an earlier cell set to every distinct
# value of the target column; later cells that read `labels` see only this value.
labels = ['bosque']

In [None]:
# The explainer and the JavaScript runtime do not depend on the class,
# so they are set up once instead of on every iteration.
explainer = shap.TreeExplainer(model)
shap.initjs()

for label in labels:
    # Keep only the samples whose ground-truth class is `label`.
    b_aux = data['class'] == label
    b = data[b_aux]
    X_train, X_test, y_train, y_test, y_true = data_train(b)

    # data_train re-indexes from 0, so the label slice 0:10 (inclusive)
    # selects the first 11 rows.
    choosen_instance = X_test.loc[0:10]
    shap_values = explainer.shap_values(choosen_instance)

    # Look up the position of this class in the model's outputs instead of
    # hard-coding index 1, which only matches one particular class.
    class_idx = list(model.classes_).index(label)

    # force_plot returns a visualisation object. Inside a loop it is not the
    # cell's last expression, so it has to be displayed explicitly
    # (`display` is provided by the IPython kernel).
    display(
        shap.force_plot(
            explainer.expected_value[class_idx],
            shap_values[class_idx],
            choosen_instance,
        )
    )


In [None]:
# Explain the rows of the current X_test whose index labels run from 0 to 3 (inclusive).
explainer = shap.TreeExplainer(model)
choosen_instance = X_test.loc[0:3]
shap_values = explainer.shap_values(choosen_instance)
shap.initjs()
# NOTE(review): index 1 selects the second model output — confirm it is the class of interest.
shap.force_plot(explainer.expected_value[1], shap_values[1], choosen_instance)

In [None]:
# NOTE(review): `choosen_instance` is assigned here but not used in this cell;
# the SHAP values below are computed for all of X_test.
choosen_instance = X_test.loc[0:3]
shap_values = explainer.shap_values(X_test)

In [None]:
expected_value = explainer.expected_value
# NOTE(review): the force_plot cells index `shap_values` per model output
# (`shap_values[1]`), so `shap_values[0:10]` may slice outputs rather than the
# first 10 samples — TODO confirm the intended selection.
shap.decision_plot(expected_value, shap_values[0:10],feature_names=list(X_test.columns))

In [None]:
# NOTE(review): `shap_values` is produced by `explainer.shap_values(...)` in the
# visible cells; confirm that `shap.plots.waterfall` accepts that object (it may
# require a `shap.Explanation`, e.g. from `explainer(X)`) with the installed shap version.
shap.plots.waterfall(shap_values[[0]])

In [None]:
# NOTE(review): `shap_values` is produced by `explainer.shap_values(...)` in the
# visible cells; confirm that `shap.plots.beeswarm` accepts that object (it may
# require a `shap.Explanation`) with the installed shap version.
shap.plots.beeswarm(shap_values)

In [None]:
for label in labels:
    # NOTE(review): the frame read here rebinds the notebook-level `data` but is
    # not used below; the split still runs on `reduced_x_train_data` / `y_train_data`.
    # The only visible writer of `{label}.csv` is a commented-out line in data_train.
    data = pd.read_csv(f'{label}.csv')
    X_train, X_test, y_train, y_test = train_test_split(reduced_x_train_data, y_train_data, test_size=0.15, random_state=0,)
    explainer = shap.TreeExplainer(model)
    # NOTE(review): X_test keeps the index produced by the split here (no reset_index),
    # so `.loc[0:10]` is a label slice, not "the first rows" — TODO confirm intent.
    choosen_instance = X_test.loc[0:10]
    shap_values = explainer.shap_values(choosen_instance)
    shap.initjs()
    # NOTE(review): the value returned by force_plot is discarded inside the loop.
    shap.force_plot(explainer.expected_value[1], shap_values[1], choosen_instance)

In [None]:
# NOTE: `y_true` holds the model's predictions for whatever `X_test` currently is,
# not ground-truth labels.
y_true = model.predict(X_test)

In [None]:
#shap.summary_plot(shap_values[1], train)

In [None]:
# NOTE(review): no visible cell computes `shap_values` from `train`; the feature
# matrix passed here should be the one the SHAP values were computed on — TODO confirm.
shap.summary_plot(shap_values, train)

In [None]:
# NOTE(review): `y_true` is assigned from `model.predict(...)` (predictions) and
# `y_test` is the ground truth — confirm the argument order expected by
# compute_confusion_matrix. `labels` may have been rebound to ['bosque'] by an
# earlier cell, and the output path is hard-coded.
compute_confusion_matrix(y_true, y_test, labels, "/tmp/matrix.png")

In [None]:
# Predict on `test`, the re-indexed copy of the first test split.
t = model.predict(test)

In [None]:
# NOTE(review): `tests` is only assigned in a later cell (`tests = real[0:10]`);
# on a fresh kernel run top to bottom this cell raises NameError.
tests

In [None]:
# NOTE(review): `classes` is not defined in any visible cell — this fails on a
# fresh kernel unless it is defined elsewhere.
# Positions of the non-zero entries of classes['bosque'], used to pick rows of `test`.
positions = np.flatnonzero(classes['bosque'])
filtered_df = test.iloc[positions]

In [None]:
filtered_df

In [None]:
type(test)

In [None]:
# Explain the first 10 rows (positional slice) of the current X_test and render
# the force plot with matplotlib instead of JavaScript.
choosen_instance = X_test[0:10]
shap_values = explainer.shap_values(choosen_instance)
shap.initjs()
# NOTE(review): index 1 selects the second model output — confirm it is the class of interest.
shap.force_plot(explainer.expected_value[1], shap_values[1], choosen_instance, matplotlib=True)

In [None]:
for label in labels:
    # NOTE(review): `classes` is not defined in any visible cell.
    choosen_instance = classes[label]
    shap_values = explainer.shap_values(choosen_instance)
    shap.initjs()
    # NOTE(review): the value returned by force_plot is discarded inside the loop,
    # and index 1 is used for every label.
    shap.force_plot(explainer.expected_value[1], shap_values[1], choosen_instance)

In [None]:
# NOTE(review): `tests` is assigned from `real[0:10]` in a later cell, i.e. it holds
# ground-truth labels rather than feature rows — TODO confirm this is what should be explained.
choosen_instance = tests
shap_values = explainer.shap_values(choosen_instance)
shap.initjs()
shap.force_plot(explainer.expected_value[1], shap_values[1], choosen_instance)

In [None]:
# First 10 entries of `real` (the re-indexed ground-truth labels).
# NOTE(review): earlier cells read `tests` before this assignment.
tests = real[0:10]

In [None]:
labels

In [None]:
# NOTE(review): no visible cell computes `shap_values` from `train`; the feature
# matrix passed here should be the one the SHAP values were computed on — TODO confirm.
shap.summary_plot(shap_values, train)

In [None]:
# NOTE(review): `shap_values` is produced by `explainer.shap_values(...)` in the
# visible cells; confirm that `shap.plots.waterfall` accepts that object (it may
# require a single-row `shap.Explanation`) with the installed shap version.
shap.plots.waterfall(shap_values)

In [None]:
# NOTE(review): `shap_values` is produced by `explainer.shap_values(...)` in the
# visible cells; confirm that `shap.plots.beeswarm` accepts that object (it may
# require a `shap.Explanation`) with the installed shap version.
shap.plots.beeswarm(shap_values)