In [1]:
import tensorflow as tf
from tensorflow import keras
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
tf.keras.utils.set_random_seed(42)
import altair as alt
import shap
# Used for displaying HTML
from IPython.display import display, HTML
import sys

# import utility functions for the World Bank and OECD
sys.path.append("../utility_functions")
from world_bank_oecd_utility_functions import (
    get_indicator_name_from_code,
    get_indicator_definition_or_additional_info_from_code,
)

2023-10-22 04:04:53.324423: I tensorflow/tsl/cuda/cudart_stub.cc:28] Could not find cuda drivers on your machine, GPU will not be used.
2023-10-22 04:04:53.364146: I tensorflow/tsl/cuda/cudart_stub.cc:28] Could not find cuda drivers on your machine, GPU will not be used.
2023-10-22 04:04:53.364717: I tensorflow/core/platform/cpu_feature_guard.cc:182] This TensorFlow binary is optimized to use available CPU instructions in performance-critical operations.
To enable the following instructions: AVX2 FMA, in other operations, rebuild TensorFlow with the appropriate compiler flags.
Using `tqdm.autonotebook.tqdm` in notebook mode. Use `tqdm.tqdm` instead to force console mode (e.g. in jupyter console)


In [2]:
# import mapping dataframes for mapping the indicator code to the names, definitions and/or additional info
mapping_oecd_to_names_and_additional_info_df = pd.read_csv(
    "../OECD/Cleaned/OECD_Indicator_Definition_Info.csv", delimiter=","
)
mapping_world_bank_to_names_and_definitions_df = pd.read_csv(
    "../WorldBankDatasets/Cleaned/World_Bank_Indicator_Definition_Info.csv",
    delimiter="\t",
)

In [3]:
# decide whether you want to use the OECD or World Bank and adjust the mapping_df as you like

# mapping dataframe for OECD
# mapping_df = mapping_oecd_to_names_and_additional_info_df

# mapping dataframe for World Bank
mapping_df = mapping_world_bank_to_names_and_definitions_df

In [4]:
# OECD_model = keras.models.load_model('/oecd/NN_OECD_Model.h5')
WB_model = keras.models.load_model('world_bank/NN_WorldBank_Model.h5')

2023-10-22 04:04:56.441420: I tensorflow/compiler/xla/stream_executor/cuda/cuda_gpu_executor.cc:995] successful NUMA node read from SysFS had negative value (-1), but there must be at least one NUMA node, so returning NUMA node zero. See more at https://github.com/torvalds/linux/blob/v6.0/Documentation/ABI/testing/sysfs-bus-pci#L344-L355
2023-10-22 04:04:56.441944: W tensorflow/core/common_runtime/gpu/gpu_device.cc:1960] Cannot dlopen some GPU libraries. Please make sure the missing libraries mentioned above are installed properly if you would like to use GPU. Follow the guide at https://www.tensorflow.org/install/gpu for how to download and setup the required libraries for your platform.
Skipping registering GPU devices...


In [5]:
X_train = pd.read_csv('world_bank/X_train.csv')
y_train= pd.read_csv('world_bank/y_train.csv')
X_test= pd.read_csv('world_bank/X_test.csv')
y_test= pd.read_csv('world_bank/y_test.csv')

In [6]:
# y_pred = best_model.predict(X_test)
# This takes the neural network model and passes in X_test and returns predicted values based on the model
y_pred = WB_model.predict(X_test)



# Starting Shapley Analysis

In [7]:
# Choose the Shapley model you want

# Using DeepExplainer for neural networks
# https://shap-lrjball.readthedocs.io/en/latest/generated/shap.DeepExplainer.html
deep_explainer = shap.DeepExplainer(WB_model, X_train.values)

# NOTE KernelExplainer could be VERY SLOW compaired with DeepExplainer
# https://shap-lrjball.readthedocs.io/en/latest/generated/shap.KernelExplainer.html
# kernel_explainer = shap.KernelExplainer(WB_model, X_train.values)

explainer = deep_explainer
# explainer = kernel_explainer

keras is no longer supported, please use tf.keras instead.
Your TensorFlow version is newer than 2.4.0 and so graph support has been removed in eager mode and some static graphs may not be supported. See PR #1483 for discussion.


In [8]:
# This function allows you to pass in testing data (assuming you already had a trained model and outputted y_pred

# ⚠️ note we don't need X_train or y_train since they were used to make the model in another file
def get_residuals_df (X_test = X_test, y_test = y_test, y_pred = y_pred):

    df_x_test_with_y_actual_y_pred = X_test.copy()
    df_x_test_with_y_actual_y_pred['y_test'] = y_test
    df_x_test_with_y_actual_y_pred['y_pred'] = y_pred

    # calculate the residuals (which are actual - predicted values)
    y_residuals = df_x_test_with_y_actual_y_pred['y_test'] - df_x_test_with_y_actual_y_pred['y_pred']
    df_x_test_with_y_actual_y_pred['y_residuals'] = y_residuals

    # sort the values by absolute values
    df_x_test_with_y_actual_y_pred['y_residuals_abs_val'] = y_residuals.abs()

    df_x_test_with_y_actual_y_pred_sorted_by_abs_residuals = df_x_test_with_y_actual_y_pred.sort_values(
        'y_residuals_abs_val', ascending = False)
    
    return df_x_test_with_y_actual_y_pred_sorted_by_abs_residuals

In [9]:
# residuals_df = get_residuals_df()

In [10]:
# this function returns the top n rows for a given dataframe
# This assumes you already have a sorted dataframe
def get_top_n_rows(df, top_n_rows = 10):
    df_b = df.copy()
    return df_b.iloc[0:top_n_rows]

In [11]:
# top_10_outliers_df = get_top_n_rows(residuals_df)

In [12]:
# top_10_outliers_df

In [13]:
# this assumes you are passing in a dataframe and it already has the residuals
def clean_df_for_shapley_explanation (df_to_clean):
    # make a copy of the dataframe
    df_to_clean_b = df_to_clean.copy()
    
    # remove residual columns
    columns_to_drop = ['y_test', 'y_pred', 'y_residuals', 'y_residuals_abs_val']
    for col in columns_to_drop:
        if col in df_to_clean_b.columns:
            df_to_clean_b = df_to_clean_b.drop(columns=[col])
    return df_to_clean_b

In [14]:
# top_10_outliers_df = clean_df_for_shapley_explanation(top_10_outliers_df)

In [15]:
# top_10_outliers_df

In [16]:
# this function takes in a dataframe assumed to have sorted outliers
# and will return the shapley values from them.
# This assumes you are passing in a dataframe with at least 2 rows.
# if you want to get Shapley values for just a row, use the function get_shapley_values_from_row_only
def get_shapley_values_from_whole_df (top_outliers_df):
    top_outliers_df_b = top_outliers_df.copy()

    top_outliers_df_b = clean_df_for_shapley_explanation(top_outliers_df_b)

    # get the shap values for the outlier dataframe (all rows that you passed in for top_outliers_df)
    shap_outlier_values = explainer.shap_values(top_outliers_df_b.values)

    return shap_outlier_values

In [17]:
# top_10_outliers_df_shapley_values = get_shapley_values_from_whole_df(top_10_outliers_df)

In [18]:
# top_10_outliers_df_shapley_values

In [19]:
# this takes in shapley values. And it will return a summary plot
def make_shapley_summary_plot(shapley_values):
    return shap.summary_plot(shapley_values)

In [20]:
# shapley_chart_1 = make_shapley_summary_plot(top_10_outliers_df_shapley_values)

In [21]:
# Pass in the dataframe row (which is a series by default), and pass in the Shapley explainer model if you like
def get_shapley_values_from_row_only(row, explainer=explainer):
    # Make sure the row is reshaped correctly. Without below, there might be an error
    data_to_explain = row.values.reshape(1, -1)
    
    # Calculate the SHAP values and then return those
    shap_values = explainer.shap_values(data_to_explain)
    
    return shap_values

In [22]:
# This is one of the most useful functions. You will pass in shapley values, and it will output the top variable names
# that explain the model's output
def get_top_shapley_feature_names_from_shapley_values(shap_values, top_n_feature_values = 10, X_test=X_test):
    # Get the relevant 1D array. Initially assuming 1 row. This is needed otherwise an error might occur.
    shap_values_1d = shap_values[0][0]

    # Convert the above 1D array to a dataframe.
    shap_df = pd.DataFrame([shap_values_1d], columns=X_test.columns)

    # Pull the top shapley features
    top_feature_values = shap_df.iloc[0, :].sort_values(ascending=False)
    top_feature_values = top_feature_values.index

    # Display the top n feature values
    # top_feature_values = top_feature_values[:top_n_feature_values]

    # mapping top feature names with code only to their actual feature names
    top_feature_names_with_full_name = [
        get_indicator_name_from_code(code, mapping_df) for code in top_feature_values
    ]

    # Display the top n features names. Not giving indicator code. Actually giving the full name
    return top_feature_names_with_full_name[0:top_n_feature_values]

In [23]:
# get_top_shapley_feature_names_from_shapley_values(shap_values_for_row)

In [24]:
# adjust top_10_outliers_df with a different dataframe if you like

# Set up the initial dataframe for getting the top Shapley results

In [25]:
residuals_df = get_residuals_df()
# top_n_outliers = 8
top_n_outliers = 10
top_outliers_df = get_top_n_rows(residuals_df, top_n_rows = top_n_outliers)
top_outliers_df = clean_df_for_shapley_explanation(top_outliers_df)

# Now you can see the important explanatory output

In [26]:
top_n_feature_values = 5

display(
    HTML(
        f"""<h2>Assuming you want to get the top {top_n_feature_values} variables explaining the machine learning output for the neural
        net, for each one of the top {top_n_outliers} outliers, they are:
        """
    )
)

cur_outlier_num = 0
for index, row in top_outliers_df.iterrows():
    temp_shapley_vals = get_shapley_values_from_row_only(row)
    top_n_feature_values = 5
    temp_feature_names = get_top_shapley_feature_names_from_shapley_values(temp_shapley_vals, top_n_feature_values)
    # print(f"For row with index {index}, the top {top_n_feature_values} variables explaining the model's output are:\n")
    # print(f"For row with index {index}, the top {top_n_feature_values} variables explaining the model's output are:\n")
    cur_outlier_num = cur_outlier_num + 1
    display(HTML(f"<strong>Top outlier {cur_outlier_num}, for row with index {index}, the top {top_n_feature_values} variables explaining the model's output are:</strong>\n"))
    # print(temp_feature_names)
    for feature_name in temp_feature_names:
        print(feature_name)
    
    print('\n')
    # print('----------------')
    print('----------------')
    print('\n')

`tf.keras.backend.set_learning_phase` is deprecated and will be removed after 2020-10-11. To update it, simply pass a True/False value to the `training` argument of the `__call__` method of your layer or model.


Percentage of enrolment in tertiary education in private institutions (%)
Population, male (% of total population)
indicator name missing
Population, female (% of total population)
The age at which men and women can retire with full pension benefits is the same (1=yes; 0=no)


----------------




Incidence of tuberculosis (per 100,000 people)
Employment in agriculture, female (% of female employment) (modeled ILO estimate)
indicator name missing
Mortality rate, under-5, male (per 1,000 live births)
Mortality rate, under-5 (per 1,000 live births)


----------------




Percentage of enrolment in tertiary education in private institutions (%)
Law mandates equal remuneration for females and males for work of equal value (1=yes; 0=no)
The law prohibits discrimination in employment based on gender (1=yes; 0=no)
Mortality rate, adult, female (per 1,000 female adults)
A woman can work in a job deemed dangerous in the same way as a man (1=yes; 0=no)


----------------




Percentage of enrolment in tertiary education in private institutions (%)
Law mandates equal remuneration for females and males for work of equal value (1=yes; 0=no)
The law prohibits discrimination in employment based on gender (1=yes; 0=no)
A woman can work in a job deemed dangerous in the same way as a man (1=yes; 0=no)
Population, male (% of total population)


----------------




Percentage of enrolment in tertiary education in private institutions (%)
The age at which men and women can retire with full pension benefits is the same (1=yes; 0=no)
Immunization, measles (% of children ages 12-23 months)
The law prohibits discrimination in employment based on gender (1=yes; 0=no)
Government expenditure on education, total (% of GDP)


----------------




There is legislation specifically addressing domestic violence (1=yes; 0=no)
Adolescent fertility rate (births per 1,000 women ages 15-19)
Law mandates equal remuneration for females and males for work of equal value (1=yes; 0=no)
Birth rate, crude (per 1,000 people)
Fertility rate, total (births per woman)


----------------




Incidence of tuberculosis (per 100,000 people)
Mortality rate, adult, female (per 1,000 female adults)
Law mandates equal remuneration for females and males for work of equal value (1=yes; 0=no)
A woman can work at night in the same way as a man (1=yes; 0=no)
Population, male (% of total population)


----------------




High-technology exports (% of manufactured exports)
Current education expenditure, primary (% of total expenditure in primary public institutions)
Repetition rate in Grade 6 of primary education, male (%)
The law prohibits discrimination in employment based on gender (1=yes; 0=no)
Incidence of HIV, ages 15-49, female (per 1,000 uninfected female population ages 15-49)


----------------




Fertility rate, total (births per woman)
Birth rate, crude (per 1,000 people)
Adolescent fertility rate (births per 1,000 women ages 15-19)
Law mandates equal remuneration for females and males for work of equal value (1=yes; 0=no)
A woman can work at night in the same way as a man (1=yes; 0=no)


----------------




High-technology exports (% of manufactured exports)
The law prohibits discrimination in employment based on gender (1=yes; 0=no)
Incidence of HIV, ages 15-49, female (per 1,000 uninfected female population ages 15-49)
Repetition rate in Grade 6 of primary education, male (%)
Incidence of HIV, ages 15-24, female (per 1,000 uninfected female population ages 15-24)


----------------




# EVENTUALLY MAYBE MAP BACK TO ORIGINAL VALUES AND GIVE A MORE DETAILED EXPLANATION EXPLAINING WHY AN OUTLIER AFFECTED THE MODEL SIGNIFICANTLY