# Dissertation Experiment 3h
# Generate SHAP XAI Output (Credit Card Fraud) 
# - Experiment January 16
Ciaran Finnegan January 2023

# Import Libraries + Custom Functions

## Import Libraries

In [None]:
import warnings
warnings.simplefilter(action='ignore', category=Warning)

In [None]:
# Import libs
import numpy as np
import pandas as pd

# Import SHAP libraries
import shap

# Import Display libraries
from IPython.display import display, HTML
import matplotlib.pyplot as plt
import seaborn as sns
from prettytable import PrettyTable
import raiutils
from raiutils.exceptions import UserConfigValidationException

# Import libraries to build ANN model
import tensorflow as tf
import keras_tuner as kt
from tensorflow import keras
from tensorflow.keras import layers
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import Dense, Dropout
from tensorflow.keras.optimizers import Adam


# Import ML Workflow Libraries
from sklearn.preprocessing import LabelEncoder
from sklearn.preprocessing import StandardScaler, OneHotEncoder
import sklearn.metrics
from sklearn.metrics import classification_report
from sklearn.metrics import confusion_matrix
from sklearn.metrics import accuracy_score
from sklearn.metrics import roc_auc_score
from sklearn.utils import resample


# Classifier training (not used for explainability)
from sklearn.model_selection import train_test_split
from sklearn.model_selection import GridSearchCV

#Import libraries for explainer metrics
from sklearn.cluster import KMeans
from scipy.spatial import distance


# Additional display libraries
import contextlib
import os
import sys
from contextlib import contextmanager


# Libraries used in Experiment Creation of XL Output Metrics
import os
import time
import random
import openpyxl

## Custom Functions

Dataset Visualisations

In [None]:
%run ./DS_Visualisation_Functions.ipynb

Metrics

In [None]:
%run ./XAI_Metrics_Functions.ipynb

Model Evaluation Functions

In [None]:
%run ./DS_Model_Build_Evaluation_Functions.ipynb

Track Experiment Result Functions

In [None]:
%run ./XAI_Experiment_Functions.ipynb

# Load Model

A Neural Network Model has been created in another Kubeflow Notebook and is being used in all the XAI experiments

In [None]:
loaded_model = keras.models.load_model('ccfraud_model')  # If saved as SavedModel

In [None]:
X_test_loaded, y_test_loaded, X_train_loaded, y_train_loaded, df_downsampled_loaded, dfCatCols = load_CC_train_test_data()

In [None]:
X_train_loaded.head(1)

In [None]:
y_train_loaded.head(2)

## Re-Display Model Performance

For illustration, the evaluation metrics of the NN model are repeated here.

### Tabular Data

In [None]:
# Fit the scaler on the training features only and apply that same transform to
# the test features, so test-set statistics never leak into the preprocessing.
# The raw arrays are used so the scaler output stays a NumPy array, as before.
# NOTE(review): assumes X_train_loaded is the unscaled training data and has the
# same column order as X_test_loaded — confirm against the model-build notebook.
scaler = StandardScaler()
scaler.fit(X_train_loaded.to_numpy())
X_test_loaded_scaled = scaler.transform(X_test_loaded.to_numpy())

In [None]:
y_pred_loaded = display_model_metrics_tabular(loaded_model, X_test_loaded_scaled, y_test_loaded)

### Confusion Matrix

In [None]:
generate_confusion_matrix(y_test_loaded, y_pred_loaded)

-------------------------

# Generate Shap Values

## SHAP Summary Plot

In [None]:
# Create a SHAP KernelExplainer around the loaded model's predict function,
# using 10 rows sampled from the training features as the background set.
# explainer = shap.KernelExplainer(model.predict, shap.sample(X_train_downsampled, 10)) #100

#Jan 6th - use new loaded model
# NOTE(review): the model metrics earlier in this notebook are computed on
# StandardScaler-scaled inputs, while the background rows here come straight
# from X_train_loaded — confirm whether X_train_loaded is already scaled.
explainer = shap.KernelExplainer(loaded_model.predict, shap.sample(X_train_loaded, 10)) #100

In [None]:
# Extract the feature names, excluding the target variable 'Fraud'
#column_names = df_downsampled.drop('default', axis=1).columns

#Jan 6th - use new loaded data
column_names = df_downsampled_loaded.drop('Fraud', axis=1).columns

In [None]:
with warnings.catch_warnings():
    warnings.simplefilter('ignore')
    # Jan 6th - use new loaded test data, which is already a dataframe
    #shap_values = explainer.shap_values(X_test_downsampled.iloc[:5,:], silent=True) #100
    shap_values = explainer.shap_values(X_test_loaded.iloc[:10,:], silent=True) #100

In [None]:
# Create a SHAP summary plot
#shap.summary_plot(shap_values, X_test_downsampled.iloc[:10,:], feature_names=X_train_downsampled.columns)

#Jan 6th - use new loaded data, which is already a dataframe
shap.summary_plot(shap_values, X_test_loaded.iloc[:10,:], feature_names=X_train_loaded.columns)

## Single Random Observation (for illustration)

In [None]:
# Select a random observation from the test dataset
#random_observation = X_test_downsampled.sample(1, random_state=42)

# Jan 6th - use loaded data
random_observation = X_test_loaded.sample(1, random_state=42)

In [None]:
# Generate SHAP values for the instances
with warnings.catch_warnings():
    warnings.simplefilter('ignore')
    # Your code that produces warnings goes here
    shap_values_random_observation = explainer.shap_values(random_observation)

In [None]:
# Take the first entry (index 0) of the SHAP values returned for this observation.
# NOTE(review): index 0 is the first model output, not "class 1"; presumably the
# network has a single output so this is the fraud score — confirm.
shap_values_observation_class1 = shap_values_random_observation[0]

In [None]:
# Convert SHAP values to a Series for easier manipulation
shap_values_series = pd.Series(shap_values_observation_class1[0], index=random_observation.columns)

In [None]:
# Sort the features based on absolute SHAP value
sorted_features = shap_values_series.abs().sort_values(ascending=False)

In [None]:
# Build the table of the top 20 features for the random observation.
# Features are ranked by absolute SHAP value (sorted_features), but the signed
# value is reported under 'SHAP Value' so the direction of each feature's
# contribution is not lost.
top_20_features_observation = shap_values_series.loc[sorted_features.head(20).index]
top_20_features_df_observation = pd.DataFrame({'Feature': top_20_features_observation.index,
                                               'SHAP Value': top_20_features_observation.values})

In [None]:
# Display the index (row number) of the selected observation
print(f"Selected Row Number from Test Data: {random_observation.index[0]}")

In [None]:
# Display the SHAP values for the top 20 features of the observation
print("\nTop 20 Features and Their SHAP Values:")
display(HTML(xai_styles + top_20_features_df_observation.to_html(index=False)))

## Prepare SHAP Values Data for Metric Calculations

Use a custom built decorator to track the time taken to generate the SHAP values

In [None]:
def _first_output_shap_matrix(shap_values_all):
    """Return the (instances x features) SHAP matrix for the first model output.

    ``KernelExplainer.shap_values`` may hand back either a list with one matrix
    per model output or a single array (2-D, or 3-D with a trailing outputs
    axis), depending on the SHAP release and the model's output shape. Both
    layouts are normalised here so the caller always gets a 2-D matrix.
    """
    if isinstance(shap_values_all, list):
        # List layout: one matrix per model output; keep the first output.
        return np.asarray(shap_values_all[0])

    shap_array = np.asarray(shap_values_all)
    if shap_array.ndim == 3:
        # Array layout with an outputs axis: (instances, features, outputs).
        return shap_array[:, :, 0]
    return shap_array


@timeit
def generate_shap_explanations(model, data, target_column='Fraud',
                               output_instance_file='shap_instances_input.csv',
                               output_shap_file='shap_value_results.csv',
                               num_instances=5,
                               background_size=100):
    """Generate KernelSHAP values for the leading rows of ``data`` and save them as CSV.

    Args:
        model: Trained model exposing ``predict`` (and optionally ``classes_``).
        data: DataFrame holding the feature columns plus ``target_column``.
        target_column: Name of the label column, dropped before explaining.
        output_instance_file: CSV path for the explained feature rows.
        output_shap_file: CSV path for the SHAP values of those rows.
        num_instances: Number of leading rows to explain. Defaults to 5, the
            value that used to be hard-coded; ``None`` explains every row.
        background_size: Number of rows sampled from the features as the
            KernelExplainer background set (defaults to the previous 100).

    Returns:
        ``(instances_to_explain, df_shap_values)``, both re-indexed from 0 so
        the rows line up. The function is wrapped by ``@timeit``; callers in
        this notebook unpack the decorated call as ``(results, exec_time)``.

    Raises:
        KeyError: If ``target_column`` is not a column of ``data``.
    """
    # Debug output of the labels; honours target_column rather than a fixed name.
    print(f'data[{target_column}] :')
    print(data[target_column])

    # Drop the target column so only model inputs remain.
    data_features = data.drop(columns=[target_column])

    # Keras models normally have no 'classes_'; report whichever case applies.
    if hasattr(model, 'classes_'):
        print("Classes recognized by the model:", model.classes_)
    else:
        print("The model does not have a 'classes_' attribute.")

    # Leading rows to explain (iloc[:None] selects every row).
    instances_to_explain = data_features.iloc[:num_instances]

    # Diagnostic: show the raw model output for the rows about to be explained
    # and the binary labels obtained with a 0.5 threshold.
    predicted_output = model.predict(instances_to_explain)
    print("Predicted Output (predicted_output):", predicted_output)
    y_pred = [1 if prob > 0.5 else 0 for prob in np.ravel(predicted_output)]
    print("Predicted Output (y_pred):", y_pred)

    # Create a SHAP explainer with a sampled background set.
    explainer = shap.KernelExplainer(model.predict,
                                     shap.sample(data_features, background_size))

    # Generate SHAP values for the instances, silencing library warnings.
    with warnings.catch_warnings():
        warnings.simplefilter('ignore')
        shap_values_all = explainer.shap_values(instances_to_explain)

    # Debug: print the structure of shap_values_all
    print(type(shap_values_all))
    if isinstance(shap_values_all, list):
        print("Length of list:", len(shap_values_all))
    else:
        print("Content:", shap_values_all)

    # Keep the SHAP values of the first model output as a 2-D matrix.
    shap_values_fraud = _first_output_shap_matrix(shap_values_all)

    # Convert the SHAP values to a DataFrame keyed by feature name.
    df_shap_values = pd.DataFrame(shap_values_fraud, columns=data_features.columns)

    # Jan 6th - align the index of the instances with the new SHAP value rows.
    instances_to_explain = instances_to_explain.reset_index(drop=True)
    df_shap_values = df_shap_values.reset_index(drop=True)

    # Output the SHAP values to a csv file
    df_shap_values.to_csv(output_shap_file, index=False)

    # Output the instances to a csv file
    instances_to_explain.to_csv(output_instance_file, index=False)

    return instances_to_explain, df_shap_values

# Generate XAI Metrics 

## Identity

In [None]:
#from scipy.spatial import distance
#SHAP_Identity_Metric = get_identity_metric(df_instances, df_shap_values, "SHAP")

In [None]:
#SHAP_Identity_Number = "{:.2f}%".format(SHAP_Identity_Metric)
#display_text("SHAP Identity Metric Score: " + SHAP_Identity_Number)

## Stability

In [None]:
# Jan 6th - use loaded data
#SHAP_Stability_Metric = get_stability_metric_y(df_shap_values, y_test_loaded, 'SHAP')

In [None]:
#SHAP_Stability_Number = "{:.2f}%".format(SHAP_Stability_Metric)
#display_text("SHAP Stability Metric Score: " + SHAP_Stability_Number)

## Separability

In [None]:
#SHAP_Seperability_Metric = get_seperability_metric(df_instances, df_shap_values, "SHAP")

In [None]:
#SHAP_Seperability_Number = "{:.2f}%".format(SHAP_Seperability_Metric)
#display_text("SHAP Seperability Metric Score: " + SHAP_Seperability_Number)

## Similarity

In [None]:
#SHAP_Similarity_Metric = get_similarity_metric(df_instances, df_shap_values, "SHAP", use_dbscan=False)

In [None]:
#SHAP_Similarity_Number = "{:6.2f}".format(SHAP_Similarity_Metric)
#display_text("SHAP Similarity Metric Value: " + SHAP_Similarity_Number)

# XAI Experiments - Metrics Capture

## Suppress Warnings to clean up output

In [None]:
import warnings
warnings.simplefilter(action='ignore', category=Warning)

## Break out Model Test Data into a list of dataframes

### Create Test Data for Experiment Input

Ensure 'X_test' and 'y_test' Are DataFrames with Proper Columns

In [None]:
# Assuming 'X_test' is a numpy array and you have a list of the original column names
original_feature_names = [col for col in df_downsampled_loaded.columns if col != 'Fraud']

# Ensure X_test_loaded has the correct column names (if necessary)
X_test_loaded.columns = original_feature_names

In [None]:
# Combine X_test_loaded and y_test into a single DataFrame
df_TestData = pd.concat([X_test_loaded, y_test_loaded], axis=1)

### Split the DataFrame into 20 consecutive smaller DataFrames

In [None]:
# Split the DataFrame into 20 consecutive smaller DataFrames
split_size, list_df = split_TestData_into_nn_Blocks(df_TestData, num_splits = 20)

In [None]:
# Count the occurrence of each unique value in the 'Fraud' column
fraud_counts = df_TestData['Fraud'].value_counts()

# Display the counts
print("Breakdown of 'Fraud' and non-Fraud label records in df_TestData:")
print(fraud_counts)

### Add a routine to check output values

In [None]:
# Display starting points in the first nn sub dataframes
startBlockDisplay(df_TestData, split_size, 1)

## Confirm Starting Point in External SHAP XAI XL File

For each dataframe in the list just created, the code below carries out the following actions:

Check whether an XAI results XL spreadsheet called 'SHAP_XAI_Metrics_Experiments.xlsx' exists.

If not, create an empty XL spreadsheet with the name 'SHAP_XAI_Metrics_Experiments.xlsx', then define a variable called 'Sample' with an integer value of 1 and print the value of 'Sample' to output.

If an XL spreadsheet called 'SHAP_XAI_Metrics_Experiments.xlsx' does exist, then read the entries in the first column of the spreadsheet, named 'Sample Number', create a variable in this Python program named 'Sample' that is one higher than the highest integer in the 'Sample Number' column, and print this value of 'Sample' to output.

In [None]:
# Key each DataFrame block by a sequential identifier ('df_1', 'df_2', ...).
# The isinstance guard makes the cell safe to re-run: once list_df has been
# converted to a dict it is left untouched instead of failing on list_df[0].
if not isinstance(list_df, dict):
    list_df = {f'df_{position}': block for position, block in enumerate(list_df, start=1)}

In [None]:
# File path for the SHAP XAI metrics results spreadsheet
SHAP_xai_file_path = 'SHAP_XAI_Metrics_Experiments.xlsx'  # Stored locally

In [None]:
# Call Function to update or create the spreadsheet and determine the 'Sample' number
# Process each dataframe in 'list_df'
sample = return_next_sample_number_to_process(list_df, SHAP_xai_file_path, "SHAP")

## Select Next Dataframe to Process

---------------------------------

	
Extend the Python code so that it reads in the dataframe from 'list_df' that corresponds to the integer value in the variable named 'Sample'.

Assign this dataframe the name 'df_Selected_from_List'.


----------------------------------

### Initialize Dataframe to Capture Re-start Point as None

In [None]:
# Initialize df_Selected_from_List as None
df_Selected_from_List = None

### Extract test data block to restart XAI metrics process

In [None]:
df_Selected_from_List, key = select_restart_testdata_block(df_Selected_from_List, 
                                                           list_df, 
                                                           SHAP_xai_file_path)

In [None]:
# If no DataFrame is selected (e.g., if 'Sample' exceeds the number of DataFrames in list_df)
# The name is always bound (it is initialised to None and then reassigned), so
# test the value itself rather than looking the name up in locals().
if df_Selected_from_List is None:
    print("No DataFrame selected. The 'Sample' number may exceed the number of DataFrames in list_df.")

---------------------------

## Generate XAI Metrics from Dataframe

### Generate the SHAP Values for the Test Data Block

In [None]:
#df_Selected_from_List.head(2)

In [None]:
#print(df_Selected_from_List['Fraud'])

#### Pre-Process Values for Data Block

In [None]:
# Scale the feature inputs so that they work with the SHAP generation processs
df_Selected_Scaled_Data_from_List = scale_feature_inputs(df_Selected_from_List, 
                                                         original_feature_names)

In [None]:
#df_Selected_from_List_nolabel = df_Selected_from_List[original_feature_names]

In [None]:
#df_Selected_from_List_wlabel = df_Selected_from_List['Fraud']

In [None]:
#df_Selected_from_List_wlabel

In [None]:
#print(df_Selected_from_List_nolabel)

In [None]:
#scaler = StandardScaler()   
#df_Selected_from_List_scaled = scaler.fit_transform(df_Selected_from_List_nolabel)

In [None]:
#print(df_Selected_from_List_scaled)

In [None]:
#print(df_Selected_from_List_scaled.index)

In [None]:
#df_Selected_from_List_wlabel

In [None]:
#df_Selected_from_List_wlabel = df_Selected_from_List_wlabel.reset_index(drop=True)

In [None]:
#df_Selected_from_List_wlabel

In [None]:
#print(df_Selected_from_List.index)

In [None]:
#print(df_Selected_from_List['Fraud'])

In [None]:
# Convert NumPy array to DataFrame
#df_Selected_from_List_scaled = pd.DataFrame(df_Selected_from_List_scaled, columns=original_feature_names)

In [None]:
# Combine scaled data chunk data with label
#df_Selected_Scaled_Data_from_List2 = pd.concat([df_Selected_from_List_scaled, df_Selected_from_List_wlabel], axis=1)

In [None]:
#print('df_Selected_Scaled_Data_from_List: ')

In [None]:
#print(df_Selected_Scaled_Data_from_List)

In [None]:
#print(df_Selected_from_List)

In [None]:
print('df_Selected_Scaled_Data_from_List - Fraud')
print(df_Selected_Scaled_Data_from_List['Fraud'])

In [None]:
#print('df_Selected_Scaled_Data_from_List - Fraud')
#print(df_Selected_Scaled_Data_from_List2['Fraud'])

In [None]:
print('df_Selected_from_List - Fraud')
print(df_Selected_from_List['Fraud'])

#### Get SHAP Values for Data Block

In [None]:
# Jan 6th - use loaded data
#results_SHAP, exec_time_SHAP = generate_shap_explanations(loaded_model, df_Selected_Scaled_Data_from_List)

In [None]:
# Jan 6th - use loaded data
#results_SHAP2, exec_time_SHAP2 = generate_shap_explanations(loaded_model, df_Selected_Scaled_Data_from_List2)

In [None]:
# Jan 6th - use loaded data
#results_SHAP3, exec_time_SHAP3 = generate_shap_explanations(loaded_model, df_Selected_from_List)

In [None]:
# Jan 6th - use loaded data
results_SHAP, exec_time_SHAP = generate_shap_explanations(loaded_model, df_Selected_Scaled_Data_from_List)

In [None]:
# Unpack the results to get df_instances_SHAP and df_shap_values
df_instances_SHAP, df_shap_values = results_SHAP

---------------------------

### Generate Identity Metric

#### Run a Basic Test First

In [None]:
# Select two random instances from the SHAP value dataframe
df_xai_numerical = df_shap_values

# np.random.choice draws index *labels*, so the rows are looked up by label (.loc)
random_indices = np.random.choice(df_xai_numerical.index, size=2, replace=False)
instance_1 = df_xai_numerical.loc[random_indices[0]]
instance_2 = df_xai_numerical.loc[random_indices[1]]

# Compute the Euclidean distance between the selected instances - uses custom project function.
# The result is kept under its own name so it does not shadow the
# scipy.spatial 'distance' module imported at the top of the notebook.
euclidean_distance = get_euclidean_distance(instance_1, instance_2)
print(f"Euclidean distance between instance {random_indices[0]} and instance {random_indices[1]}: {euclidean_distance:.4f}")

#### Retrieve Identity Score

In [None]:
df_instances_SHAP.head(2)

In [None]:
print(df_instances_SHAP.index)

In [None]:
df_shap_values.head(2)

In [None]:
print(df_shap_values.index)

In [None]:
from scipy.spatial import distance
SHAP_Identity_Metric = get_identity_metric(df_instances_SHAP, df_shap_values, "SHAP")

#### Display Identity Score Metric

In [None]:
SHAP_Identity_Number = "{:.2f}%".format(SHAP_Identity_Metric)
display_text("SHAP Identity Metric Score: " + SHAP_Identity_Number)

In [None]:
# Read in XAI Metric for Identity
XAI_Ident_Metric_1 = SHAP_Identity_Metric

---------------------------

### Generate Stability Metric

#### Retrieve Stability Score

In [None]:
# Jan 6th - use loaded data
# NOTE(review): df_shap_values only holds the rows explained from the selected
# test block, whereas y_test_loaded is passed here in full — confirm that
# get_stability_metric_y aligns the labels to those rows itself; otherwise the
# labels of the explained rows should be passed instead.
SHAP_Stability_Metric = get_stability_metric_y(df_shap_values, y_test_loaded, 'SHAP')

#### Display Stability Score Metric

In [None]:
SHAP_Stability_Number = "{:.2f}%".format(SHAP_Stability_Metric)
display_text("SHAP Stability Metric Score: " + SHAP_Stability_Number)

In [None]:
# Read in XAI Metric for Stability
XAI_Stability_Metric_2 = SHAP_Stability_Metric

----------------

### Generate Separability Metric

#### Retrieve Separability Score

In [None]:
SHAP_Seperability_Metric = get_seperability_metric(df_instances_SHAP, df_shap_values, "SHAP")

#### Display Separability Score Metric

In [None]:
SHAP_Seperability_Number = "{:.2f}%".format(SHAP_Seperability_Metric)
display_text("SHAP Seperability Metric Score: " + SHAP_Seperability_Number)

In [None]:
# Read in XAI Metric for Seperability
XAI_Seperability_Metric_3 = SHAP_Seperability_Metric

----------------------------------

### Generate Similarity Metric

#### Retrieve Similarity Score

In [None]:
SHAP_Similarity_Metric = get_similarity_metric(df_instances_SHAP, df_shap_values, "SHAP", use_dbscan=False)

#### Display Similarity Score Metric

In [None]:
SHAP_Similarity_Number = "{:6.2f}".format(SHAP_Similarity_Metric)
display_text("SHAP Similarity Metric Value: " + SHAP_Similarity_Number)

In [None]:
# Read in XAI Metric for Similarity
XAI_Similarity_Metric_4 = SHAP_Similarity_Metric

-------------------

### Display Final Set of Metrics (this run)

In [None]:
# Print the five metrics captured in this run; the number in each label
# matches the numeric suffix of the variable it reports.
print(f"XAI Ident Metric 1: {XAI_Ident_Metric_1}")
print(f"XAI Stability Metric 2: {XAI_Stability_Metric_2}")
print(f"XAI Seperability Metric 3: {XAI_Seperability_Metric_3}")
print(f"XAI Similarity Metric 4: {XAI_Similarity_Metric_4}")
print(f"XAI Time Metric 5: {exec_time_SHAP} seconds")

-------------------

## Write Out Metrics to XL

In [None]:
write_xai_Metrics_to_XL(SHAP_xai_file_path, 
                        sample, 
                        SHAP_Identity_Metric, 
                        SHAP_Stability_Metric, 
                        SHAP_Seperability_Metric, 
                        SHAP_Similarity_Metric, 
                        exec_time_SHAP, 
                        df_Selected_from_List,
                        "SHAP")