# Dissertation Experiment 5h
# Generate ANCHOR Output (Credit Default) January Eighth
Ciaran Finnegan January 2023

# Import Libraries + Custom Functions

## Import Libraries

In [1]:
# Import libs
import numpy as np
import pandas as pd

# Display libraries
from IPython.display import display, HTML
from prettytable import PrettyTable

# Import necessary libraries for ANN model building
import tensorflow as tf
from tensorflow.keras import layers, models
from tensorflow.keras.optimizers import Adam

# Import necessary library for ANCHOR explainer
import alibi
from alibi.explainers import AnchorTabular
import anchor
from anchor import anchor_tabular

# Libraries required for metrics calculations
from scipy.spatial import distance
from sklearn.cluster import KMeans
from sklearn.preprocessing import LabelEncoder
import warnings

# Compute additional evaluation metrics
import sklearn.metrics
from sklearn.metrics import classification_report
from sklearn.metrics import confusion_matrix
from sklearn.metrics import accuracy_score
from sklearn.metrics import roc_auc_score
from sklearn.utils import resample
from sklearn.metrics import precision_score, recall_score, f1_score

# Classifier training (not used for explainability)
from sklearn.model_selection import train_test_split
from sklearn.model_selection import GridSearchCV

# Additional display libraires
import contextlib
import sys
from contextlib import contextmanager

# Libraries used in Experiment Creation of XL Output Metrics
import os
import time
import random
import openpyxl

2024-01-08 19:18:54.622269: I tensorflow/core/util/port.cc:110] oneDNN custom operations are on. You may see slightly different numerical results due to floating-point round-off errors from different computation orders. To turn them off, set the environment variable `TF_ENABLE_ONEDNN_OPTS=0`.
2024-01-08 19:18:54.624831: I tensorflow/tsl/cuda/cudart_stub.cc:28] Could not find cuda drivers on your machine, GPU will not be used.
2024-01-08 19:18:54.673530: I tensorflow/tsl/cuda/cudart_stub.cc:28] Could not find cuda drivers on your machine, GPU will not be used.
2024-01-08 19:18:54.674922: I tensorflow/core/platform/cpu_feature_guard.cc:182] This TensorFlow binary is optimized to use available CPU instructions in performance-critical operations.
To enable the following instructions: AVX2 AVX512F AVX512_VNNI FMA, in other operations, rebuild TensorFlow with the appropriate compiler flags.


ImportError: cannot import name 'anchor_tabular' from 'anchor' (/opt/conda/lib/python3.8/site-packages/anchor/__init__.py)

## Custom Functions

Dataset Visualisations

In [None]:
%run ./DS_Visualisation_Functions.ipynb

Metrics

In [None]:
%run ./XAI_Metrics_Functions.ipynb

Model Evaluation Functions

In [None]:
%run ./DS_Model_Build_Evaluation_Functions.ipynb

Track Experiment Result Functions

In [None]:
%run ./XAI_Experiment_Functions.ipynb

------------------------------------

In [None]:
ds_file_to_load = 'credit_default_data.csv'
df = pd.read_csv(ds_file_to_load)

In [None]:
# Columns are dropped when more than 75% of their rows are null; express that
# cut-off as an absolute row count for the current frame.
threshold = len(df) * 0.75

# Count the nulls per column and keep the names of the columns above the cut-off
null_counts = df.isna().sum()
missing_columns = df.columns[null_counts > threshold]

# Report which columns are about to be removed
print("Columns with more than 75% missing values:", missing_columns)

# Rebind df to a frame that no longer contains the sparse columns
df = df.drop(columns=missing_columns)

# Save or continue processing with columns removed that had high volumes of missing data


## Categorical Data 

In [None]:
# List of categorical columns
cat_cols = ['SEX', 'EDUCATION', 'MARRIAGE', 'PAY_0', 'PAY_2', 'PAY_3', 'PAY_4', 'PAY_5', 'PAY_6']

In [None]:
# One-hot encode categorical variables
df_encoded = pd.get_dummies(df, columns=cat_cols)

In [None]:
# Check the distribution of the target variable
target_distribution = df_encoded['default'].value_counts()

target_distribution

In [None]:
# Separate the majority and minority classes
df_majority = df_encoded[df_encoded['default'] == 0]
df_minority = df_encoded[df_encoded['default'] == 1]

In [None]:
# Downsample the majority class
df_majority_downsampled = resample(df_majority, 
                                   replace=False, 
                                   n_samples=target_distribution[1], 
                                   random_state=42)

In [None]:
# Combine the downsampled majority class with the minority class
df_downsampled = pd.concat([df_majority_downsampled, df_minority])

In [None]:
# Shuffle the dataset to mix the data points
df_downsampled = df_downsampled.sample(frac=1, random_state=42).reset_index(drop=True)

In [None]:
# Display the distribution of the target variable in the downsampled dataset
df_downsampled['default'].value_counts()

In [None]:
# Splitting the features and target variable
X = df_downsampled.drop('default', axis=1)
y = df_downsampled['default']

In [None]:
# Splitting the data into training and testing sets
X_train_downsampled, X_test_downsampled, y_train_downsampled, y_test_downsampled = train_test_split(
    X, y, test_size=0.2, random_state=42)

In [None]:
# Give all four splits a fresh 0..n-1 index, discarding the row labels that
# train_test_split carried over from the shuffled frame.
X_train_downsampled, X_test_downsampled, y_train_downsampled, y_test_downsampled = (
    split.reset_index(drop=True)
    for split in (
        X_train_downsampled,
        X_test_downsampled,
        y_train_downsampled,
        y_test_downsampled,
    )
)

In [None]:
X_train_downsampled, X_test_downsampled = scale_the_features(X_train_downsampled, X_test_downsampled, df_downsampled)

# Build Model

## Set Up Hyperparameters

In [None]:
import tensorflow as tf
from tensorflow.keras import layers, models
from tensorflow.keras.optimizers import Adam

In [None]:
# Building the model
# Seed TensorFlow's global random generator before any layer is created so that
# weight initialisation and dropout masks start from the same state on every
# Restart & Run All. 42 matches the random_state used for resampling and
# splitting earlier in this notebook.
tf.random.set_seed(42)

# Feed-forward binary classifier: two 64-unit ReLU hidden layers, each followed
# by 50% dropout, and a single sigmoid output unit.
model = models.Sequential([
    # Input width is the number of feature columns in the training split
    layers.Dense(64, activation='relu', input_shape=(X_train_downsampled.shape[1],)),
    layers.Dropout(0.5),
    layers.Dense(64, activation='relu'),
    layers.Dropout(0.5),
    layers.Dense(1, activation='sigmoid')
])

In [None]:
# Compiling the model
model.compile(optimizer=Adam(learning_rate=0.001), loss='binary_crossentropy', metrics=['accuracy'])

## Build Neural Network (w/TensorFlow/Keras)

In [None]:
# Training the model
history = model.fit(X_train_downsampled, y_train_downsampled, epochs=15, batch_size=32, validation_split=0.2, verbose=1)
print("Model trained successfully!")

# Load Model

A Neural Network Model has been created in another Kubeflow Notebook and is being used in all the XAI experiments

In [None]:
# `keras` is never imported as a bare name in this notebook; go through the
# already-imported `tf` package instead so the cell runs on a fresh kernel.
loaded_model = tf.keras.models.load_model('ccfraud_model')  # If saved as SavedModel

In [None]:
X_test_loaded, y_test_loaded, X_train_loaded, y_train_loaded, df_downsampled_loaded, dfCatCols = load_CC_train_test_data()

In [None]:
X_train_loaded.head(1)

In [None]:
y_train_loaded.head(2)

## Re-Display Model Performance

For illustration, the evaluation metrics of the NN model will be repeated here.

### Tabular Data

In [None]:
# StandardScaler is not among this notebook's own imports, so bring it in here
# to keep the cell runnable on a fresh kernel.
from sklearn.preprocessing import StandardScaler

# Learn the scaling statistics from the training split only and apply them to
# the test split. Fitting on the test split itself would standardise it with
# statistics the model never saw during training.
# NOTE(review): assumes load_CC_train_test_data() returns unscaled splits and
# that the saved model was trained on train-fitted standardised features - confirm.
scaler = StandardScaler()
scaler.fit(X_train_loaded)
X_test_loaded_scaled = scaler.transform(X_test_loaded)

In [None]:
y_pred_loaded = display_model_metrics_tabular(loaded_model, X_test_loaded_scaled, y_test_loaded)

### Confusion Matrix

In [None]:
generate_confusion_matrix(y_test_loaded, y_pred_loaded)

## Assess and Display Model Performance

### Tabular Data

In [None]:
y_pred_funct = display_model_metrics_tabular(model, X_test_downsampled, y_test_downsampled)

### Confusion Matrix

In [None]:
generate_confusion_matrix(y_test_downsampled, y_pred_funct)

# Generate ANCHOR Values

#### Suppress Warnings to clean up output

In [None]:
import warnings
warnings.simplefilter(action='ignore', category=Warning)

Check layout of X_train_downsampled

In [None]:
X_train_downsampled

In [None]:
# Extract the feature names, excluding the target variable 'default'
column_names = df_downsampled.drop('default', axis=1).columns

In [None]:
# Convert NumPy array to DataFrame
X_train_downsampled = pd.DataFrame(X_train_downsampled, columns=column_names)

In [None]:
X_train_downsampled.head()

In [None]:
from anchor import anchor_tabular
import numpy as np

In [None]:
# Separate the features and the target variable
X = df_encoded.drop('default', axis=1)
y = df_encoded['default']

In [None]:
# Define the explainer
# Builds the tabular Anchor explainer that the explain_instance calls in the
# following cells use.
explainer = anchor_tabular.AnchorTabularExplainer(
    # Display names for the two target classes.
    # NOTE(review): presumably position 0 maps to default == 0 and position 1 to default == 1 - confirm.
    class_names=['Not Default', 'Default'],
    # Column names of X, i.e. the one-hot encoded frame without the 'default' column
    feature_names=X.columns.tolist(),
    # Training rows handed over as a plain array (already passed through scale_the_features)
    train_data=X_train_downsampled.values,
    # Empty mapping: no column is declared categorical to the explainer.
    # NOTE(review): the one-hot columns are therefore presumably treated as numeric features - confirm.
    categorical_names={}
)

In [None]:
import contextlib
import numpy as np
import os
import sys
from contextlib import contextmanager

In [None]:
@contextmanager
def suppress_stdout():
    """Silence everything written to stdout and stderr inside the ``with`` body.

    Despite the name, both streams are redirected to ``os.devnull``. The
    context yields ``None``; the streams are restored and the sink is closed
    when the block exits.
    """
    with contextlib.ExitStack() as stack:
        # Resources are released in reverse order: stderr, stdout, then the sink file
        sink = stack.enter_context(open(os.devnull, 'w'))
        stack.enter_context(contextlib.redirect_stdout(sink))
        stack.enter_context(contextlib.redirect_stderr(sink))
        yield None

In [None]:
def predict_fn(x):
    """Return hard 0/1 class labels from the notebook-level ``model`` for ``x``.

    ``x`` may be a single 1-D row or a batch; a 1-D input is promoted to a
    batch of one. The model's sigmoid output is thresholded at 0.5 and the
    labels are returned as a flat integer array.
    """
    # Promote a lone row to a one-row batch
    if x.ndim == 1:
        x = np.expand_dims(x, 0)

    # Keep any progress output from predict() out of the notebook
    with suppress_stdout():
        positive_probability = model.predict(x, verbose=0)

    # Strictly greater than 0.5 counts as the positive class
    return (positive_probability > 0.5).astype(int).flatten()

In [None]:
# Convert NumPy array to DataFrame
X_test_downsampled = pd.DataFrame(X_test_downsampled, columns=column_names)

In [None]:
# Ensure that the instance passed to explain_instance is in the correct shape
idx = 0
instance_to_explain = X_test_downsampled.iloc[idx].values.reshape(1, -1)

In [None]:
# Generate an explanation for the first instance in the test set
exp = explainer.explain_instance(instance_to_explain, predict_fn, threshold=0.95)

In [None]:
# Show the explanation
exp.show_in_notebook()

In [None]:
exp.show_in_notebook(show_table=True, show_all=False)

#### Pseudocode to Generate Initial ANCHOR Values

For the neural network model built above in Python, select a random sample
of 15 instances in the test data, 10 for Class '0' and 5 for 
Class '1', and generate ANCHOR values as explainers for these  
instances in the test dataset.

Present these ANCHOR values in an easily understood and pleasant 
on the eye tabular output format for the Python Kubeflow Notebook
in which I am writing my Python code. 

Create a second tabular format that shows an equally appealing
output in my Python Notebook that shows the ANCHOR values and the
feature details for each instance on a single row, across which I
can scroll.

Comment each line of Python code with as much detail as practical. 

Output the ANCHOR values to a CSV file. Output the feature details 
for each corresponding instance for which the ANCHOR Values were
created in a separate CSV file.

After the code generation provide as much narrative detail 
as possible.

##### Further pseudocode...

Use the AnchorTabular explainer from the alibi library. This explainer provides local explanations for classification models' predictions by identifying a minimal set of conditions (features) in the instance that ensure the model's decision remains unchanged (these conditions are called "anchors").

The steps:

Select a random sample of 15 instances from the test data, 10 from Class '0' and 5 from Class '1'.
Set up the AnchorTabular explainer and fit it to the training data.
Generate anchor explanations for the selected instances.
Present the anchor values in two tabular formats: a summary table and a detailed table.
Output the anchor values and feature details to CSV files.

In [None]:
# Loop through the first five instances in the test dataset
for idx in range(5):
    instance = X_test_downsampled.iloc[idx].values.reshape(1, -1)
    print(f"\nInstance {idx + 1}:")
    
    # Generate an explanation for the instance
    exp = explainer.explain_instance(instance, predict_fn, threshold=0.95)
    
    # Show the explanation in the notebook
    exp.show_in_notebook()

### Create an ANCHOR File Output

In [None]:
# Initialize a list to store the ANCHOR results
anchor_results = []

In [None]:
import re

# Anchor conditions arrive as text. The forms handled here are
#   "<lower> < feature <= <upper>"                      (two-sided range)
#   "feature <= value" / ">= value" / "> value" / "< value"
#   "feature = value"
# Two-character operators are listed before their one-character prefixes so that
# "<=" is matched as a whole instead of being read as "<" or "=".
_ANCHOR_NUMBER = r'[-+]?(?:\d+\.?\d*|\.\d+)(?:[eE][-+]?\d+)?'
_ANCHOR_RANGE_RE = re.compile(
    rf'^\s*({_ANCHOR_NUMBER})\s*(?:<=|<)\s*(.+?)\s*(?:<=|<)\s*({_ANCHOR_NUMBER})\s*$'
)
_ANCHOR_SIMPLE_RE = re.compile(r'^\s*(.+?)\s*(<=|>=|=|<|>)\s*(\S+)\s*$')


def _parse_anchor_condition(condition):
    """Split one anchor condition string into ``(feature, operator, value)``.

    One-sided conditions return the operator exactly as written and the bound
    as a float. Two-sided ranges return the operator ``'between'`` and a
    ``(lower, upper)`` tuple of floats; whether each bound is strict is not
    recorded.

    Raises:
        ValueError: if the text matches none of the known forms, or the bound
            of a one-sided condition is not numeric.
    """
    # The range form has to be tried first: the one-sided pattern would
    # otherwise swallow "<lower> < feature" as the feature name.
    range_match = _ANCHOR_RANGE_RE.match(condition)
    if range_match:
        lower, feature, upper = range_match.groups()
        return feature, 'between', (float(lower), float(upper))

    simple_match = _ANCHOR_SIMPLE_RE.match(condition)
    if simple_match:
        feature, operator, value = simple_match.groups()
        return feature, operator, float(value)

    raise ValueError(f"Unexpected format for ANCHOR explanation: {condition}")


anchor_results = []

# Loop through the first five instances in the test dataset
for idx in range(5):
    instance = X_test_downsampled.iloc[idx].values.reshape(1, -1)
    # Generate an explanation for the instance
    exp = explainer.explain_instance(instance, predict_fn, threshold=0.95)

    # Map each feature named in the anchor to its (operator, bound) pair
    feature_importance = {}
    for condition in exp.names():
        feature, operator, value = _parse_anchor_condition(condition)
        feature_importance[feature] = (operator, value)

    anchor_results.append(feature_importance)

In [None]:
# Create a DataFrame from the results
df_anchor_results = pd.DataFrame(anchor_results)

In [None]:
# Show the DataFrame
print(df_anchor_results)

In [None]:
# Write the DataFrame to a CSV file
df_anchor_results.to_csv('anchor_results_ANN.csv', index=False)

-----

# Prepare ANCHOR Values for Metrics

In [None]:
def _format_anchor_condition(condition):
    """Render the bound of a ``feature > value`` / ``feature < value`` condition to two decimals.

    Any condition that does not have that shape, or whose third
    space-separated token is not numeric, is returned unchanged.
    """
    if ' > ' in condition or ' < ' in condition:
        tokens = condition.split(' ')
        feature, relation, value = tokens[0], tokens[1], tokens[2]
        try:
            return f"{feature} {relation} {float(value):.2f}"
        except ValueError:
            # e.g. a two-sided range, where the third token is the feature name
            return condition
    return condition


@timeit
def generate_anchors_for_instances(df, num_instances=5, threshold=0.99):
    """Generate an anchor explanation for each of the first rows of ``df``.

    Uses the notebook-level ``explainer`` and ``predict_fn``.

    Args:
        df: DataFrame of feature rows to explain, taken in positional order.
        num_instances: How many leading rows to explain. Values larger than
            ``len(df)`` are capped at ``len(df)``; negative values explain nothing.
        threshold: Value forwarded as ``threshold`` to
            ``explainer.explain_instance``. Defaults to 0.99, the value this
            function previously hard-coded.

    Returns:
        ``(df_feature_instances, new_df_anchor_results)``: the explained rows,
        and a one-column frame ('Anchor Explanation') holding each anchor as
        the string form of a list of condition strings, parseable with
        ``ast.literal_eval``. The ``@timeit`` decorator is defined elsewhere;
        the call site in this notebook unpacks its result as
        ``(results, exec_time)``.
    """
    # Initialize lists to store the ANCHOR results and the rows they explain
    new_anchor_results = []
    feature_instances = []

    # Never index past the end of df when it has fewer rows than requested
    num_instances = max(0, min(num_instances, len(df)))

    # Loop through the requested leading instances
    for idx in range(num_instances):
        instance = df.iloc[idx]
        feature_instances.append(instance)

        # Generate an explanation for the instance at the requested threshold
        exp = explainer.explain_instance(instance.values.reshape(1, -1), predict_fn, threshold=threshold)

        # Check if an explanation was found
        if exp is not None:
            conditions = [_format_anchor_condition(condition) for condition in exp.names()]
        else:
            conditions = ['No explanation found']

        # str() of a list of strings gives a valid Python literal and picks a
        # safe quoting style even when a condition contains an apostrophe.
        new_anchor_results.append(str(conditions))

    # Create a DataFrame from the results
    new_df_anchor_results = pd.DataFrame(new_anchor_results, columns=['Anchor Explanation'])

    # Create a DataFrame from the feature instances
    df_feature_instances = pd.DataFrame(feature_instances)

    return df_feature_instances, new_df_anchor_results

In [None]:
#df_feature_instances, new_df_anchor_results = generate_anchors_for_instances(X_test_downsampled, 5)
results, exec_time = generate_anchors_for_instances(X_test_downsampled, 5)

In [None]:
df_feature_instances, new_df_anchor_results = results

## Determine Computational Efficiency Value

In [None]:
# Display the time taken to generate the ANCHOR explanations (exec_time comes from
# the generate_anchors_for_instances call above)
print(f"ANCHORS Execution Time: {exec_time} seconds")

In [None]:
# Show the DataFrames
print("Anchor Explanations:")
print(new_df_anchor_results)
#print("\nFeature Instances:")
#print(df_feature_instances)

# Write the DataFrames to CSV files
new_df_anchor_results.to_csv('new_anchor_results5.csv', index=False)
df_feature_instances.to_csv('feature_instances5.csv', index=False)

In [None]:
df_feature_instances.head()

In [None]:
new_df_anchor_results

## Parse the Anchor Explanations

In [None]:
import ast
import re

import pandas as pd

# A decimal literal such as 0.50, -1 or 2.5e3
_NUMBER = r'[-+]?(?:\d+\.?\d*|\.\d+)(?:[eE][-+]?\d+)?'
# Two-sided range: "<lower> < feature <= <upper>"
_RANGE_CONDITION = re.compile(rf'^\s*({_NUMBER})\s*<=?\s*(.+?)\s*<=?\s*({_NUMBER})\s*$')
# One-sided condition: "feature <op> <value>" with op one of <=, >=, =, <, >
_SIMPLE_CONDITION = re.compile(rf'^\s*(.+?)\s*(?:<=|>=|=|<|>)\s*({_NUMBER})\s*$')
# Placeholder written by generate_anchors_for_instances when no anchor exists
_NO_EXPLANATION = 'No explanation found'


def _condition_to_feature_value(condition):
    """Return ``(feature_name, bound)`` for one anchor condition string.

    For a two-sided range the upper bound is returned, so that
    ``lo < feature <= hi`` is encoded the same way as ``feature <= hi``.

    Raises:
        ValueError: if the condition matches neither supported form.
    """
    # Try the range form first; the one-sided pattern would otherwise take
    # "<lower> < feature" as the feature name.
    range_match = _RANGE_CONDITION.match(condition)
    if range_match:
        _lower, feature, upper = range_match.groups()
        return feature, float(upper)

    simple_match = _SIMPLE_CONDITION.match(condition)
    if simple_match:
        feature, value = simple_match.groups()
        return feature, float(value)

    raise ValueError(f"Cannot parse anchor condition: {condition!r}")


# Step 2: Convert the 'Anchor Explanation' column from a string representation of a list back to an actual list.
# Cells that are already lists are left alone so that re-running this cell does not fail.
new_df_anchor_results['Anchor Explanation'] = new_df_anchor_results['Anchor Explanation'].apply(
    lambda cell: ast.literal_eval(cell) if isinstance(cell, str) else cell
)

# Step 3: Determine the maximum number of conditions in the ANCHOR explanations across all instances
max_num_conditions = int(max(new_df_anchor_results['Anchor Explanation'].apply(len), default=0))

# Step 4: Initialize a list to store the numerical representations of the ANCHOR explanations
numerical_explanations = []

# Step 5: Loop through each ANCHOR explanation and convert it to a numerical representation.
# Each row has max_num_conditions slots per feature column; unused slots stay at -1.
num_features = len(df_feature_instances.columns)
for explanation in new_df_anchor_results['Anchor Explanation']:
    numerical_representation = [-1] * (num_features * max_num_conditions)
    for idx, condition in enumerate(explanation):
        # An instance without an anchor keeps an all -1 row
        if condition == _NO_EXPLANATION:
            continue

        # Parse the condition to extract the feature name and its bound
        feature, value = _condition_to_feature_value(condition)

        # Find the index of the feature in the feature dataframe
        feature_idx = df_feature_instances.columns.get_loc(feature)

        # Store the bound in the slot for this feature and condition position
        numerical_representation[feature_idx * max_num_conditions + idx] = value
    numerical_explanations.append(numerical_representation)

# Step 6: Create a dataframe from the numerical representations
df_anchors_numerical = pd.DataFrame(numerical_explanations)

# Display the resulting dataframe
print(df_anchors_numerical)


In [None]:
new_df_anchor_results

## Display ANCHORS

In [None]:
# Display the first few rows of each dataset to understand their structure
instance_features_head = df_feature_instances.head()
anchor_explanations_head = new_df_anchor_results.head()

# Keep every row here: this frame is exported to CSV and passed to the metric
# functions together with the untruncated df_feature_instances, so truncating
# it with .head() would misalign the two once more than five instances are
# explained.
anchor_explanations_numerical = df_anchors_numerical.copy()

In [None]:
instance_features_head

In [None]:
anchor_explanations_head

In [None]:
anchor_explanations_numerical

## Generate Outfile for review

In [None]:
anchors_num_explainers_filepath = "anchor_numerical_explainers.csv"

anchor_explanations_numerical.to_csv(anchors_num_explainers_filepath, index=False)

# Generate XAI Metrics 

## Identity Metric 

#### Run a Basic Test First

In [None]:
# Select two random instances from the ANCHOR dataframe
df_xai_numerical = anchor_explanations_numerical

# Sample row positions rather than index labels, because the rows are fetched with .iloc
random_indices = np.random.choice(len(df_xai_numerical), size=2, replace=False)
instance_1 = df_xai_numerical.iloc[random_indices[0]]
instance_2 = df_xai_numerical.iloc[random_indices[1]]

# Compute the Euclidean distance between the selected instances - uses custom project function.
# The result is kept under its own name so that it does not overwrite the
# `distance` module imported from scipy.spatial at the top of the notebook.
euclidean_distance = get_euclidean_distance(instance_1, instance_2)
print(f"Euclidean distance between instance {random_indices[0]} and instance {random_indices[1]}: {euclidean_distance:.4f}")

#### Retrieve Identity Score

In [None]:
from scipy.spatial import distance

In [None]:
ANCHOR_Identity_Metric = get_identity_metric(df_feature_instances, anchor_explanations_numerical, "ANCHOR")

#### Display Identity Score Metric

In [None]:
ANCHOR_Identity_Number = "{:.2f}%".format(ANCHOR_Identity_Metric)

In [None]:
display_text("ANCHOR Identity Metric Score: " + ANCHOR_Identity_Number)

## Stability Metric

### Invoke Stability Metric Function

In [None]:
ANCHOR_Stability_Metric = get_stability_metric_y(anchor_explanations_numerical, y_test_downsampled, 'ANCHOR')

#### Display Stability Score Metric

In [None]:
ANCHOR_Stability_Number = "{:.2f}%".format(ANCHOR_Stability_Metric)

In [None]:
display_text("ANCHOR Stability Metric Score: " + ANCHOR_Stability_Number)

## Separability

### Invoke Separability Metric Function

#### Retrieve Separability Score

In [None]:
ANCHOR_Seperability_Metric = get_seperability_metric(df_feature_instances, anchor_explanations_numerical, "ANCHOR")

#### Display Separability Score Metric

In [None]:
ANCHOR_Seperability_Number = "{:.2f}%".format(ANCHOR_Seperability_Metric)

In [None]:
display_text("ANCHOR Seperability Metric Score: " + ANCHOR_Seperability_Number)

## Similarity

### Invoke Similarity Metric Function

#### Retrieve Similarity Score

In [None]:
ANCHOR_Similarity_Metric = get_similarity_metric(df_feature_instances, anchor_explanations_numerical, "ANCHOR", 
                                                 use_dbscan=False)

#### Display Similarity Score Metric

In [None]:
ANCHOR_Similarity_Number = "{:6.2f}".format(ANCHOR_Similarity_Metric)

In [None]:
display_text("ANCHOR Similarity Metric Value: " + ANCHOR_Similarity_Number)