# Dissertation Experiment 5k
# Generate ANCHOR Output (Credit Default) February 21
Ciaran Finnegan February 2023

# Import Libraries + Custom Functions

## Import Libraries

In [None]:
# Import libs
import numpy as np
import pandas as pd

# Display libraries
from IPython.display import display, HTML
from prettytable import PrettyTable

# Import necessary libraries for ANN model building
import tensorflow as tf
from tensorflow import keras
from tensorflow.keras import layers, models
from tensorflow.keras.optimizers import Adam

# Import necessary library for ANCHOR explainer
import alibi
from alibi.explainers import AnchorTabular
#import anchor
from anchor import anchor_tabular
import re
import ast

# Libraries required for metrics calculations
from scipy.spatial import distance
from sklearn.cluster import KMeans
from sklearn.preprocessing import LabelEncoder
import warnings

# Compute additional evaluation metrics
import sklearn.metrics
from sklearn.metrics import classification_report
from sklearn.metrics import confusion_matrix
from sklearn.metrics import accuracy_score
from sklearn.metrics import roc_auc_score
from sklearn.utils import resample
from sklearn.metrics import precision_score, recall_score, f1_score

# Classifier training (not used for explainability)
from sklearn.model_selection import train_test_split
from sklearn.model_selection import GridSearchCV

# Additional display libraires
import contextlib
import sys
from contextlib import contextmanager

# Libraries used in Experiment Creation of XL Output Metrics
import os
import time
import random
import openpyxl

## Custom Functions

Dataset Visualisations

In [None]:
%run ./DS_Visualisation_Functions.ipynb

Metrics

In [None]:
%run ./XAI_Metrics_Functions.ipynb

Model Evaluation Functions

In [None]:
%run ./DS_Model_Build_Evaluation_Functions.ipynb

Track Experiment Result Functions

In [None]:
%run ./XAI_Experiment_Functions.ipynb

------------------------------------

# Load Model

A Neural Network Model has been created in another Kubeflow Notebook and is being used in all the XAI experiments

In [None]:
loaded_model = keras.models.load_model('ccfraud_model')  # If saved as SavedModel

In [None]:
X_test_loaded, y_test_loaded, X_train_loaded, y_train_loaded, df_downsampled_loaded, dfCatCols = load_CC_train_test_data()

In [None]:
X_train_loaded.head(1)

In [None]:
y_train_loaded.head(2)

## Re-Display Model Performance

For illustration, the evaluation metrics of the NN model are repeated here.

### Re-Scale Data

In [None]:
#scale_loaded = StandardScaler()   
#X_train_loaded_scaled = scale_loaded.fit_transform(X_train_loaded)
#X_test_loaded_scaled  = scale_loaded.transform(X_test_loaded)

In [None]:
X_train_loaded_scaled, X_test_loaded_scaled, scale_loaded = scale_the_features(X_train_loaded, 
                                                                                X_test_loaded, 
                                                                                df_downsampled_loaded, 
                                                                                'Fraud')

In [None]:
X_train_loaded_scaled

### Re-evaluate loaded model

In [None]:
y_pred_loaded = display_model_metrics_tabular(loaded_model, X_test_loaded_scaled, y_test_loaded)

### Confusion Matrix

In [None]:
generate_confusion_matrix(y_test_loaded, y_pred_loaded)

# Generate ANCHOR Values (Examples Instances)

#### Suppress Warnings to clean up output

In [None]:
import warnings
warnings.simplefilter(action='ignore', category=Warning)

#### Prepare Data Inputs to Anchor Explainer

Check layout of X_train_downsampled

In [None]:
# Extract the feature names, excluding the target variable 'Fraud'
# Jan 6th - use new model and data
column_names = df_downsampled_loaded.drop('Fraud', axis=1).columns

In [None]:
# Separate the features and the target variable
# Jan 6th - use new model and data
X = df_downsampled_loaded.drop('Fraud', axis=1)
y = df_downsampled_loaded['Fraud']

In [None]:
# Convert NumPy array to DataFrame
X_train_loaded_scaled = pd.DataFrame(X_train_loaded_scaled, columns=column_names)

#### Set Up Anchor Explainer function

In [None]:
#X_train_loaded_scaled

In [None]:
# Define the Anchor explainer used by every explain_instance call below.
# It is built from the SCALED training matrix, so the thresholds it reports
# are expressed in scaled units.
explainer = anchor_tabular.AnchorTabularExplainer(
    
    class_names=['Non Fraud', 'Fraud'],
    
    feature_names=X.columns.tolist(),
    
    # Jan 18th - use new loaded model and data - SCALED
    train_data=X_train_loaded_scaled.values,
    
    # No categorical features are declared.
    # NOTE(review): presumably every column is then treated as numeric - confirm
    categorical_names={}
)

In [None]:
@contextmanager
def suppress_stdout():
    """Silence stdout and stderr for the duration of the ``with`` block.

    Both streams are redirected to ``os.devnull`` and restored on exit.
    Yields ``None``.
    """
    with open(os.devnull, 'w') as sink, \
            contextlib.redirect_stdout(sink), \
            contextlib.redirect_stderr(sink):
        yield None

In [None]:
def predict_fn(x):
    """Return hard class labels (0 or 1) predicted by ``loaded_model`` for ``x``.

    A one-dimensional ``x`` is promoted to a single-row batch before
    prediction. The model output is thresholded at 0.5, cast to ``int`` and
    flattened to one dimension.
    """
    batch = np.expand_dims(x, axis=0) if len(x.shape) == 1 else x

    # Keep anything printed during prediction out of the notebook output.
    with suppress_stdout():
        scores = loaded_model.predict(batch, verbose=0)

    is_positive = scores > 0.5
    return is_positive.astype(int).flatten()

In [None]:
#X_test_loaded

In [None]:
#X_test_loaded_scaled

In [None]:
# Convert NumPy array to DataFrame
X_test_loaded_scaled = pd.DataFrame(X_test_loaded_scaled, columns=column_names)

In [None]:
# Ensure that the instance passed to explain_instance is in the correct shape
idx = 4

# Select an instance from the test data with which to generate an Anchor explanation
# instance_to_explain = X_test_loaded.iloc[idx].values.reshape(1, -1)
# Jan 18th - use new loaded model and data - SCALED
instance_to_explain = X_test_loaded_scaled.iloc[idx].values.reshape(1, -1)

In [None]:
# Generate an explanation for the test instance selected above (row `idx`)
exp = explainer.explain_instance(instance_to_explain, predict_fn, threshold=0.95)

#### Display Anchor Explainers (Single Instances)

In [None]:
# Show the explanation
exp.show_in_notebook()

In [None]:
#exp.show_in_notebook(show_table=True, show_all=False)

In [None]:
def get_unscaled_feature_values_orig(exp, scaler, feature_names):
    """Map the scaled thresholds of an Anchor explanation back to original units.

    Every condition string from ``exp.names()`` is parsed into
    ``(feature, operator, scaled_value)`` and the value is passed through
    ``scaler.inverse_transform``. Two condition shapes are understood:

    * ``feature <op> value`` with ``<op>`` one of ``<=``, ``>=``, ``<``, ``>``
    * ``low < feature <= high`` (a range), which yields two entries:
      ``"feature >"`` -> low and ``"feature <="`` -> high

    Conditions containing none of these operators are skipped.

    Args:
        exp: Object exposing ``names()`` that returns condition strings.
        scaler: Object exposing ``inverse_transform`` for arrays with one
            column per entry in ``feature_names``.
        feature_names: List (or pandas Index) of feature names, in the column
            order the scaler expects.

    Returns:
        dict mapping ``"<feature> <operator>"`` to the unscaled threshold.
        Values in [-0.01, 0.02] are snapped to 0.0 and values in [0.99, 1.01]
        to 1.0.

    Raises:
        ValueError: If a parsed feature is not in ``feature_names`` or a
            threshold is not numeric.
    """
    if isinstance(feature_names, pd.Index):
        feature_names = feature_names.tolist()

    range_pattern = re.compile(r'\s*(\S+)\s*(<=?)\s*(.+?)\s*(<=?)\s*(\S+)\s*')

    def split_condition(condition):
        """Return the (feature, operator, scaled_value) triples of one condition."""
        between = range_pattern.fullmatch(condition)
        if between:
            low_text, low_op, feature, high_op, high_text = between.groups()
            try:
                low, high = float(low_text), float(high_text)
            except ValueError:
                pass  # not a numeric range; fall through to the simple forms
            else:
                # "low < feature" is the same bound as "feature > low"
                flipped = '>' if low_op == '<' else '>='
                return [(feature, flipped, low), (feature, high_op, high)]
        # Two-character operators are tested first so '<=' is not read as '<'.
        for operator in ('<=', '>=', '<', '>'):
            if operator in condition:
                left, right = condition.split(operator, 1)
                return [(left.strip(), operator, float(right.strip()))]
        return []

    original_features_and_values = {}

    for condition in exp.names():
        for feature, operator, scaled_value in split_condition(condition):
            feature_index = feature_names.index(feature)

            # Zero-filled probe row: only the column of interest is read back.
            # NOTE(review): assumes the scaler inverts each column
            # independently of the others - confirm for the scaler in use.
            probe = np.zeros((1, len(feature_names)))
            probe[0, feature_index] = scaled_value
            original_value = scaler.inverse_transform(probe)[0, feature_index]

            # Snap values that are very close to 0 or 1
            if -0.01 <= original_value <= 0.02:
                original_value = 0.00
            elif 0.99 <= original_value <= 1.01:
                original_value = 1.00

            original_features_and_values[feature + ' ' + operator] = original_value

    return original_features_and_values

In [None]:
def get_unscaled_feature_values(exp, scaler, feature_names, scaled_feature_names):
    """Translate the thresholds of an Anchor explanation into original units.

    Every condition string from ``exp.names()`` is parsed into
    ``(feature, operator, scaled_value)``. Features listed in
    ``scaled_feature_names`` are passed through ``scaler.inverse_transform``;
    all other features keep the value found in the condition. Two condition
    shapes are understood:

    * ``feature <op> value`` with ``<op>`` one of ``<=``, ``>=``, ``<``, ``>``
    * ``low < feature <= high`` (a range), which yields two entries:
      ``"feature >"`` -> low and ``"feature <="`` -> high

    Conditions containing none of these operators are skipped.

    Args:
        exp: Object exposing ``names()`` that returns condition strings.
        scaler: Object exposing ``inverse_transform`` for arrays with one
            column per entry in ``scaled_feature_names``.
        feature_names: Accepted for interface compatibility; not used.
        scaled_feature_names: Names of the scaled features, in the column
            order the scaler expects. Any iterable of names is accepted.

    Returns:
        dict mapping ``"<feature> <operator>"`` to the threshold in original
        units. Values in [-0.01, 0.02] are snapped to 0.0 and values in
        [0.99, 1.01] to 1.0.

    Raises:
        ValueError: If a threshold in a recognised condition is not numeric.
    """
    # Accept a pandas Index or tuple as well as a list; .index() needs a list.
    scaled_feature_names = list(scaled_feature_names)

    range_pattern = re.compile(r'\s*(\S+)\s*(<=?)\s*(.+?)\s*(<=?)\s*(\S+)\s*')

    def split_condition(condition):
        """Return the (feature, operator, scaled_value) triples of one condition."""
        between = range_pattern.fullmatch(condition)
        if between:
            low_text, low_op, feature, high_op, high_text = between.groups()
            try:
                low, high = float(low_text), float(high_text)
            except ValueError:
                pass  # not a numeric range; fall through to the simple forms
            else:
                # "low < feature" is the same bound as "feature > low"
                flipped = '>' if low_op == '<' else '>='
                return [(feature, flipped, low), (feature, high_op, high)]
        # Two-character operators are tested first so '<=' is not read as '<'.
        for operator in ('<=', '>=', '<', '>'):
            if operator in condition:
                left, right = condition.split(operator, 1)
                return [(left.strip(), operator, float(right.strip()))]
        return []

    def to_original(feature, scaled_value):
        """Undo the scaling for one threshold and snap near-0/1 values."""
        if feature in scaled_feature_names:
            position = scaled_feature_names.index(feature)
            # Zero-filled probe row: only the column of interest is read back.
            # NOTE(review): assumes the scaler inverts each column
            # independently of the others - confirm for the scaler in use.
            probe = np.zeros((1, len(scaled_feature_names)))
            probe[0, position] = scaled_value
            value = scaler.inverse_transform(probe)[0, position]
        else:
            # Feature was not scaled, so the condition already holds its value
            value = scaled_value

        if -0.01 <= value <= 0.02:
            return 0.00
        if 0.99 <= value <= 1.01:
            return 1.00
        return value

    original_features_and_values = {}
    for condition in exp.names():
        for feature, operator, scaled_value in split_condition(condition):
            original_features_and_values[feature + ' ' + operator] = to_original(feature, scaled_value)

    return original_features_and_values

To make the ANCHOR explanations more meaningful, the scaling that was applied to the non-binary features during model/XAI building is inverted, so that anchor thresholds are reported in the original feature units.

In [None]:
# Identify the non-binary columns: any column of the scaled test set that
# holds a value other than 0/1 (NaNs ignored).
# NOTE(review): presumably these are exactly the columns that were scaled,
# in the order the scaler expects - confirm against scale_the_features
scaled_features = [
    col for col in X_test_loaded_scaled.columns
    if not X_test_loaded_scaled[col].dropna().isin([0, 1]).all()
]

# The label previously read "(NOT to be scaled)", which contradicted the list
# being printed.
print("\n\nAll CC Fraud Non-Binary features (scaled):", scaled_features)
print("\n\n") # Gap for formatting

In [None]:
# 'exp' is the Anchor explanation, scaler is the scaler used, 
# and feature_names is a list of feature names
original_features_and_values = get_unscaled_feature_values(exp, 
                                                           scale_loaded, 
                                                           column_names,
                                                           scaled_features)

In [None]:
original_features_and_values

In [None]:
def display_unscaled_feature_values(original_features_and_values):
    """Render unscaled anchor conditions as a styled HTML panel in the notebook.

    Args:
        original_features_and_values: Mapping of a condition label (for
            example ``"amount <="``) to a numeric value. Each value is
            formatted with two decimal places.

    Returns:
        None. The panel is shown via ``display(HTML(...))``.
    """
    # Local import keeps this cell self-contained.
    from html import escape

    fragments = [
        '<div style="font-family: Arial; padding: 10px; border: 1px solid #ddd; border-radius: 5px; background-color: #f9f9f9;">',
        '<h2 style="color: #4CAF50;">Anchor Explanation (Unscaled Feature Values)</h2>',
    ]

    # Condition labels contain '<' and '>' operators, so they are escaped
    # before being placed inside the markup.
    fragments.extend(
        f'<p><b>{escape(str(feature_condition))}</b>: {value:.2f}</p>'
        for feature_condition, value in original_features_and_values.items()
    )
    fragments.append('</div>')

    display(HTML(''.join(fragments)))

In [None]:
# Show the explanation
exp.show_in_notebook()
# Generate improved visuals for the Anchor output of the sample instance
display_unscaled_feature_values(original_features_and_values)

#### Pseudocode to Generate Initial ANCHOR Values

For the RF model built above in Python, select a random sample 
of 15 instances in the test data, 10 for Class '0' and 5 for 
Class '1', and generate ANCHOR values as explainers for these  
instances in the test dataset.

Present these ANCHOR values in an easily understood and pleasant 
on the eye tabular output format for the Python Kubeflow Notebook
in which I am writing my Python code. 

Create a second tabular format that shows an equally appealing 
output in my Python Notebook, presenting the ANCHOR values and the
feature details for each instance on a single row, across which I
can scroll.

Comment each line of Python code with as much detail as practical. 

Output the ANCHOR values to a CSV file. Output the feature details 
for each corresponding instance for which the ANCHOR values were
created in a separate CSV file.

After the code generation provide as much narrative detail 
as possible.

Further pseudocode...

Use the AnchorTabular explainer from the alibi library. This explainer provides local explanations for classification models' predictions by identifying a minimal set of conditions (features) in the instance that ensure the model's decision remains unchanged (these conditions are called "anchors").

The steps:

Select a random sample of 15 instances from the test data, 10 from Class '0' and 5 from Class '1'.
Set up the AnchorTabular explainer and fit it to the training data.
Generate anchor explanations for the selected instances.
Present the anchor values in two tabular formats: a summary table and a detailed table.
Output the anchor values and feature details to CSV files.

#### Display Anchor Explainers (Multiple Instances)

In [None]:
# Jan 18th - use new loaded model and data - SCALED
#instance_to_explain = X_test_loaded_scaled.iloc[idx].values.reshape(1, -1)

In [None]:
# Loop through the first seven instances in the test dataset.
# Only the instance header is printed; the explanation steps below are
# commented out for the experiment run.
for idx in range(7):
       
    # Jan 6th - use new model and data
    #instance = X_test_loaded.iloc[idx].values.reshape(1, -1)
    
    # Jan 18th - use new loaded model and data - SCALED
    instance = X_test_loaded_scaled.iloc[idx].values.reshape(1, -1)
    print(f"\nInstance {idx + 1}:")

    
    ########### - Do not run for experiment - ############
    # Generate an explanation for the instance
    #exp = explainer.explain_instance(instance, predict_fn, threshold=0.95)
    
    # Show the explanation in the notebook
    #exp.show_in_notebook()
    
    # Generate a rescaled output to explain Anchors in actual test set values 
    #iteration_features_and_values = get_unscaled_feature_values(exp, 
    #                                                       scale_loaded, 
    #                                                       column_names)
    #original_features_and_values
    #display_unscaled_feature_values(iteration_features_and_values)
    ########### - Do not run for experiment - ############

### Create an ANCHOR File Output

In [None]:
# Initialize a list to store the ANCHOR results
#anchor_results = []

In [None]:
# anchor_results = []

# Loop through the first five instances in the test dataset
#for idx in range(5):
    
    # instance = X_test_downsampled.iloc[idx].values.reshape(1, -1)
    
    # Jan 6th - use new model and data
#    instance = X_test_loaded.iloc[idx].values.reshape(1, -1)
    
    # Generate an explanation for the instance
#    exp = explainer.explain_instance(instance, predict_fn, threshold=0.95)
    
    # Extract feature importance from the explanation
#    feature_importance = {}
#    for condition in exp.names():
        # Handle conditions with '='
#        if '=' in condition:
#            feature, value = condition.split('=')
#            feature = feature.strip()
#            value = float(value.strip())
#            feature_importance[feature] = ('=', value)
        # Handle conditions with '>' or '<'
#        elif '>' in condition or '<' in condition:
#            parts = re.split('([><])', condition)
#            feature, operator, value = [part.strip() for part in parts if part.strip()]
#            value = float(value)
#            feature_importance[feature] = (operator, value)
#        else:
#            raise ValueError(f"Unexpected format for ANCHOR explanation: {condition}")

#    anchor_results.append(feature_importance)

In [None]:
# Create a DataFrame from the results
#df_anchor_results = pd.DataFrame(anchor_results)

In [None]:
# Show the DataFrame
#print(df_anchor_results)

In [None]:
# Write the DataFrame to a CSV file
#df_anchor_results.to_csv('anchor_results_ANN.csv', index=False)

-----

# Prepare ANCHOR Values for Metrics

## Generate Anchor Values

In [None]:
# Create a function to limit the number of Anchor values to assess to a certain threshold
#def limit_anchor_exps(explanation, precision_weight=0.5, coverage_weight=0.5):
def limit_anchor_exps(condition):
    """Score a single anchor condition by its length.

    In this notebook the function is used as a sort key over the condition
    strings returned by ``exp.names()``.

    :param condition: Any sized object; typically one condition string.
    :return: ``len(condition)``.
    """
    score = len(condition)
    return score

In [None]:
@timeit
def generate_anchors_for_instances(df, num_top_exps, num_instances=2):
    """Generate Anchor explanations for the leading rows of ``df``.

    For each of the first ``num_instances`` rows, the module-level
    ``explainer`` is asked for an anchor (``threshold=0.99``, ``beam_size=1``)
    using ``predict_fn``. The anchor's condition strings are sorted by string
    length, longest first, and the first ``num_top_exps`` are kept. Conditions
    containing ``' > '`` or ``' < '`` have their third space-separated token
    reformatted to two decimals when it is numeric; all others are kept
    verbatim.

    NOTE(review): the sort keeps the LONGEST condition strings - confirm that
    this is the intended selection.

    Args:
        df: DataFrame of instances, in the form the explainer was trained on.
        num_top_exps: Maximum number of conditions kept per instance.
        num_instances: Number of leading rows to explain. Values larger than
            ``len(df)`` are clamped to ``len(df)``.

    Returns:
        ``(df_feature_instances, new_df_anchor_results)``: the explained rows
        and a one-column DataFrame (``'Anchor Explanation'``) of list-literal
        strings. Both have a fresh 0..n-1 index. The ``timeit`` decorator is
        defined elsewhere; callers in this notebook unpack its result as
        ``(results, exec_time)``.
    """
    new_anchor_results = []
    feature_instances = []

    # Never index past the end of df
    num_instances = min(num_instances, len(df))

    for idx in range(num_instances):

        instance = df.iloc[idx]
        feature_instances.append(instance)

        print(f'Generate Anchor exp for idx: {idx}...')

        # The threshold and beam parameters are set to avoid generating an
        # Anchor set that is excessively computationally expensive
        exp = explainer.explain_instance(instance.values.reshape(1, -1),
                                         predict_fn,
                                         threshold=0.99,
                                         beam_size=1)

        # Guard BEFORE touching exp: exp.names() used to be called first,
        # which made the "no explanation" branch unreachable.
        if exp is None:
            new_anchor_results.append("['No explanation found']")
            continue

        print(f'Instance: {idx} - full exp is {exp.names()}')

        # Score and sort the conditions, then keep the top num_top_exps
        print(f'AAA: Number of Top Explantions {num_top_exps}')
        sorted_conditions = sorted(exp.names(), key=limit_anchor_exps, reverse=True)[:num_top_exps]
        print(f'Instance: {idx} - sorted exps are {sorted_conditions}')

        # Format every kept condition as a quoted string
        anchor_explanation = []
        for condition in sorted_conditions:
            if ' > ' in condition or ' < ' in condition:
                tokens = condition.split(' ')
                feature, relation, value = tokens[0], tokens[1], tokens[2]
                try:
                    anchor_explanation.append(f"'{feature} {relation} {float(value):.2f}'")
                except ValueError:
                    # Third token is not numeric (e.g. a range condition)
                    anchor_explanation.append(f"'{condition}'")
            else:
                anchor_explanation.append(f"'{condition}'")

        # Store the conditions as a single list-literal string
        new_anchor_results.append('[' + ', '.join(anchor_explanation) + ']')

    new_df_anchor_results = pd.DataFrame(new_anchor_results, columns=['Anchor Explanation'])
    df_feature_instances = pd.DataFrame(feature_instances)

    # Jan 6th - align index of instances to the newly created Anchor values
    df_feature_instances = df_feature_instances.reset_index(drop=True)
    new_df_anchor_results = new_df_anchor_results.reset_index(drop=True)

    return df_feature_instances, new_df_anchor_results

In [None]:
#df_feature_instances, new_df_anchor_results = generate_anchors_for_instances(X_test_downsampled, 5)
#results, exec_time = generate_anchors_for_instances(X_test_downsampled, 5)

# Jan 6th - use new model and data
#results, exec_time = generate_anchors_for_instances(X_test_loaded, 5)

# Jan 18th - use new loaded model and data - SCALED
results_testdata_check, exec_time = generate_anchors_for_instances(X_test_loaded_scaled, 2)

In [None]:
df_feature_instances, new_df_anchor_results = results_testdata_check

## Determine Computational Efficiency Value

In [None]:
# Display the time taken to generate the ANCHOR explanations
print(f"ANCHORS Execution Time: {exec_time} seconds")

In [None]:
# Show the DataFrames
#print("Anchor Explanations:")
#print(new_df_anchor_results)
#print("\nFeature Instances:")
#print(df_feature_instances)

# Write the DataFrames to CSV files
#new_df_anchor_results.to_csv('new_anchor_results5.csv', index=False)
#df_feature_instances.to_csv('feature_instances5.csv', index=False)

In [None]:
#df_feature_instances.head()

In [None]:
#new_df_anchor_results

## Parse the Anchor Explanations

In [None]:
def safe_literal_eval(s):
    """Parse ``s`` as a Python literal, returning ``s`` itself on failure.

    Only ``ValueError`` and ``SyntaxError`` from ``ast.literal_eval`` are
    treated as "not a literal"; any other exception propagates.
    """
    with contextlib.suppress(ValueError, SyntaxError):
        return ast.literal_eval(s)
    # Reached only when parsing failed: hand back the input unchanged
    return s

In [None]:
# Apply safe_literal_eval to the 'Anchor Explanation' column
#df_anchor_results_input['Anchor Explanation'] = df_anchor_results_input['Anchor Explanation'].apply(safe_literal_eval)

# Step 2: Convert the 'Anchor Explanation' column from a string representation of a list back to an actual list
# NOTE(review): this rewrites the column in place; re-running the cell once the
# column already holds lists looks like it will make ast.literal_eval raise - confirm
new_df_anchor_results['Anchor Explanation'] = new_df_anchor_results['Anchor Explanation'].apply(ast.literal_eval)

# Step 3: Determine the maximum number of conditions in the ANCHOR explanations across all instances
max_num_conditions = max(new_df_anchor_results['Anchor Explanation'].apply(len))

# Step 4: Initialize a list to store the numerical representations of the ANCHOR explanations
numerical_explanations = []

# Step 5: Loop through each ANCHOR explanation and convert it to a numerical representation.
# Each row has one slot per (feature column, condition position); unused slots stay at -1.
for explanation in new_df_anchor_results['Anchor Explanation']:
    numerical_representation = [-1] * len(df_feature_instances.columns) * max_num_conditions
    for idx, condition in enumerate(explanation):
        # Parse the condition as three space-separated tokens: feature, relation, value.
        # NOTE(review): a range-style condition ('low < feature <= high') would put the
        # lower bound into `feature` - confirm such conditions cannot reach this cell
        feature, relation, value = condition.split(' ')[0], condition.split(' ')[1], condition.split(' ')[2]
        
        # Find the index of the feature in the feature dataframe
        feature_idx = df_feature_instances.columns.get_loc(feature)
        
        # Store the condition's threshold value in the slot for this feature and condition position
        numerical_representation[feature_idx * max_num_conditions + idx] = float(value)
    numerical_explanations.append(numerical_representation)

# Step 6: Create a dataframe from the numerical representations
df_anchors_numerical = pd.DataFrame(numerical_explanations)

# Display the resulting dataframe
#print(df_anchors_numerical)

In [None]:
def parse_anchor_exps(df_anchor_results_input, df_feature_instances_input=None):
    """Encode Anchor explanations as fixed-width numeric rows.

    Each output row has ``len(columns) * max_num_conditions`` slots, all
    initialised to -1. The threshold of the condition at position ``idx`` for
    the feature at column position ``feature_idx`` is written to slot
    ``feature_idx * max_num_conditions + idx``.

    Recognised condition shapes:

    * ``feature <op> value`` (``<=``, ``>=``, ``<``, ``>``, ``=``)
    * ``low < feature <= high``; the lower bound is stored, matching the
      primary value used by ``parse_anchor_exps_two``

    Conditions in any other shape, or naming a feature that is not a column
    of the feature frame, are reported and leave their slot at -1.

    Args:
        df_anchor_results_input: DataFrame with an ``'Anchor Explanation'``
            column holding either list-literal strings or already-parsed
            lists. The column is replaced in place by the parsed lists.
        df_feature_instances_input: DataFrame whose columns define the
            feature order. Defaults to the notebook-level
            ``df_feature_instances`` for backward compatibility.

    Returns:
        DataFrame with one row per explanation (empty for empty input).

    Raises:
        ValueError, SyntaxError: If a string cell is not a valid Python literal.
    """
    if df_feature_instances_input is None:
        # Earlier versions always read the notebook-level frame
        df_feature_instances_input = df_feature_instances
    columns = df_feature_instances_input.columns

    number = r'(-?\d+(?:\.\d+)?(?:[eE][-+]?\d+)?)'
    range_pattern = re.compile(r'\s*' + number + r'\s*<=?\s*(.+?)\s*<=?\s*' + number + r'\s*')
    simple_pattern = re.compile(r'\s*(.+?)\s*(?:<=|>=|<|>|=)\s*' + number + r'\s*')

    def to_condition_list(cell):
        """Parse string cells; pass already-parsed cells through unchanged."""
        return ast.literal_eval(cell) if isinstance(cell, str) else cell

    def extract(condition):
        """Return (feature, value) for a condition, or (None, None)."""
        between = range_pattern.fullmatch(condition)
        if between:
            return between.group(2), float(between.group(1))
        simple = simple_pattern.fullmatch(condition)
        if simple:
            return simple.group(1), float(simple.group(2))
        return None, None

    # Step 1: make sure the column holds real lists (safe to call repeatedly)
    df_anchor_results_input['Anchor Explanation'] = df_anchor_results_input['Anchor Explanation'].apply(to_condition_list)

    # Step 2: widest explanation decides the slots per feature (0 when empty)
    max_num_conditions = int(max(df_anchor_results_input['Anchor Explanation'].apply(len), default=0))

    # Debug step
    print(f'The value for max_num_conditions is : {max_num_conditions}')

    # Step 3: encode every explanation
    numerical_explanations = []
    for explanation in df_anchor_results_input['Anchor Explanation']:

        numerical_representation = [-1] * len(columns) * max_num_conditions

        for idx, condition in enumerate(explanation):
            feature, value = extract(condition)
            if feature is None or feature not in columns:
                print(f'Skipping condition that cannot be encoded: {condition}')
                continue

            print(f'Explanation : {explanation} for Feature {feature}')
            feature_idx = columns.get_loc(feature)

            # Print debug
            print(f'Feature Index {feature_idx * max_num_conditions + idx}')
            print(f'The float(value) is : {float(value)}')

            # Store the threshold in the slot for this feature and position
            numerical_representation[feature_idx * max_num_conditions + idx] = value

        numerical_explanations.append(numerical_representation)

    # Step 4: Create a dataframe from the numerical representations
    return pd.DataFrame(numerical_explanations)

In [None]:
new_df_anchor_results

## Parse the Anchor Explanations - Two

In [None]:
def parse_anchor_exps_two(df_anchor_results_input, df_feature_instances_input):
    
    # Step 1: Convert the 'Anchor Explanation' column from a string representation of a list back to an actual list
    #df_anchor_results_input['Anchor Explanation'] = df_anchor_results_input['Anchor Explanation'].apply(ast.literal_eval)
    
    # Step 1: Convert the 'Anchor Explanation' column from a string representation of a list back to an actual list
    # Apply safe_literal_eval to the 'Anchor Explanation' column
    df_anchor_results_input['Anchor Explanation'] = df_anchor_results_input['Anchor Explanation'].apply(safe_literal_eval)

    # Step 2: Determine the maximum number of conditions in the ANCHOR explanations across all instances
    max_num_conditions = max(df_anchor_results_input['Anchor Explanation'].apply(len))

    # Step 3: Initialize a list to store the numerical representations of the ANCHOR explanations
    numerical_explanations = []
    # Add another list store for 2nd values
    numerical_explanations_2ndValue = []
    
    # Debug step
    print(f'A:The value for max_num_conditions is : {max_num_conditions}')
    
    # create df to capture second values
    df_secondvalues = pd.DataFrame()
    i2ndValCnt = 1
    
    # Step 4: Loop through each ANCHOR explanation and convert it to a numerical representation
    for explanation in df_anchor_results_input['Anchor Explanation']:
        
        print(f'B: explanation is {explanation}')
        
        # The anchor output dataframe needs to match the feature dataframe in length
        # numerical_representation = [-1] * len(df_feature_instances_input.columns) * max_num_conditions
        numerical_representation = [-1] * len(df_feature_instances_input.columns) 
        numerical_representation_2ndNum = [-1] * len(df_feature_instances_input.columns) 
        
        print(f'C:numerical_representation is : {numerical_representation}')
        print(f'D:numerical_representation lenght is : {len(numerical_representation)}')
        
        print(f'E:numerical_representation_2ndNum is : {numerical_representation_2ndNum}')
        print(f'F:numerical_representation_2ndNum lenght is : {len(numerical_representation_2ndNum)}')
        
        for idx, condition in enumerate(explanation):
            
            print(f"1:Condition to parse: {condition}")
            
            # Regular expression to capture feature name between two numeric values
            match_between = re.search(r'(-?\d+\.\d+|-?\d+)\s*<\s*([A-Za-z_.]+)\s*<=\s*(-?\d+\.\d+|-?\d+)', condition)

            # Regular expression to capture feature name followed by a numeric value
            #match_after = re.search(r'([A-Za-z_.]+)\s*[<>=]+\s*(-?\d+\.\d+|-?\d+)', condition)
            
            # Try to match the pattern: feature <= number
            #match_after = re.search(r'([A-Za-z_.]+)\s*<=?\s*(-?\d+\.\d+|-?\d+)', condition)
            match_after = re.search(r'([A-Za-z0-9_.]+)\s*([<>=]+)\s*(-?\d+.\d+|-?\d+)', condition)

            feature, value, value2 = None, None, None

            if match_between:
                print('1a:Match Between')
                value = float(match_between.group(1))
                feature = match_between.group(2)
                value2 = float(match_between.group(3))
            elif match_after:
                print('1b:Match After')
                feature = match_after.group(1)
                value = float(match_after.group(3))

            
            
            
            
            # Parse the condition to extract the feature name and value
            #feature_old, relation, value_old = condition.split(' ')[0], condition.split(' ')[1], condition.split(' ')[2]

            # Capture segments with numbers, possibly preceded by non-numeric characters
            #segments = re.findall(r'([A-Za-z_.]*-?\d+\.\d+|[A-Za-z_.]*-?\d+)', condition
            #segments = re.findall(r'([A-Za-z_.]*-?\d+\.\d+|[A-Za-z_.]*-?\d+)', condition)
            #segments = re.findall(r'(-?\d+\.\d+|-?\d+|[A-Za-z_.]+)', condition)
            #segments = re.findall(r'(-?\d+\.\d+|-?\d+|\D+)', condition)
            
                                  
            # Extract the feature name and the first numerical value from the segments
            #if segments:
            #    print('Segments found\n')
            #    feature, value, value2 = None, None, None
                #found_first_value = False                  
            #    for segment in segments:
                    # If a segment starts with non-numeric characters, it's likely the feature name
                    # Check if the segment is a numeric value
            #        if re.match(r'-?\d+\.\d+|-?\d+', segment):
            #            if value is None:
            #                value = float(segment)
            #        elif value2 is None:
            #            value2 = float(segment)
            #    else:
                    # Remove any non-alphanumeric characters from feature name candidates
            #        possible_feature = re.sub(r'[^A-Za-z0-9_.]', '', segment)
            #        if possible_feature and possible_feature in df_feature_instances.columns:
            #            feature = possible_feature

            #values = re.findall(r'-?\d+\.\d+|-?\d+', condition)
            #values = re.findall(r'(?<=^|[\s<>=])-?\d+\.\d+|-?\d+', condition)
            
            # Check if there is more than one numerical value and display them
            if value2 != None:
                print(f"2:Multiple numerical values found in condition '{condition}':")
                print(f"2:Value2 is : {value2}")
                    
            # Chack feature extraction from Condition
            #feature3 = re.sub(r'[\d<>=. -]', '', condition).strip()
            print(f"3:Extracted feature value - feature -: {feature}")
            
            print(f'4:The extracted value is : {value}')
            
            # Correct for zero errors with value
            if not value:
                print('4a:Trying to correct - value..')
                value = 0.0

            
            # Handle unexpected formats or values
            #if not value or not feature or feature not in df_feature_instances_input.columns:
            if not feature or feature not in df_feature_instances_input.columns:    
                if not value:
                    print('NOT Value')
                if not feature:
                    print('NOT Feature')
                if feature not in df_feature_instances_input.columns:
                    print('Feature name not found in column list')
                print(f"4b:Unexpected format for condition: {condition}")
                continue

            # Assign the first numerical value to the feature
            #value = float(values[0])
            
            # Find the index of the feature in the feature dataframe
            print(f'5:Explanation : {explanation} for Feature {feature}')
            feature_idx = df_feature_instances_input.columns.get_loc(feature)

            # Print debug
            print(f'6:Feature Index - feature_idx * max_num_conditions + idx : {feature_idx * max_num_conditions + idx}')
            print(f'6:Feature Index - feature_idx + idx : {feature_idx + idx}')
            print(f'6:The float(value) is : {float(value)}')
            
            print(f'6b:[feature_idx] is : {feature_idx}')
            print(f'6b:[idx] is : {idx}')
            
            print(f'7a:[feature_idx * max_num_conditions + idx] is : {feature_idx * max_num_conditions + idx}')
            print(f'7b:[feature_idx + idx] is : {feature_idx + idx}')
            print(f'7c:[feature_idx] is : {feature_idx}')
            
            # Store the feature index in the numerical representation list
            #numerical_representation[feature_idx * max_num_conditions + idx] = float(value)
            #numerical_representation[feature_idx + idx] = float(value)
            numerical_representation[feature_idx] = float(value)
                                  
            # Additional processing can be done with value2 if needed
            if value2 != None:
                print('8: Second value stuff happening...1')
                # Add second value to secondary list
                numerical_representation_2ndNum[feature_idx] = float(value2)
                # Create a new column in which to eventually store the second value
                df_secondvalues[feature +'_Anchor' + str(i2ndValCnt)] = -1
                i2ndValCnt +=1
            else:
                # If no second value then pad out with a 'dummy'
                numerical_representation_2ndNum[feature_idx] = -1         
            
        numerical_explanations.append(numerical_representation)
        # A list generated for 2nd values in Anchor conditions
        numerical_explanations_2ndValue.append(numerical_representation_2ndNum)

    # Step 5: Create a dataframe from the numerical representations
    df_anchors_numerical = pd.DataFrame(numerical_explanations)
    # Account for second value in expression
    df_anchors_numerical_2ndValue = pd.DataFrame(numerical_explanations_2ndValue)
    

    # Display the resulting 2nd value dataframe
    print('\ndf_anchors_numerical_2ndValue...before removing...')
    print(df_anchors_numerical_2ndValue)
    
    # Remove 2nd value columns where all values are the same
    #columns_to_drop = [col for col in df_anchors_numerical_2ndValue.columns if df_anchors_numerical_2ndValue[col].nunique() == 1]
    #df_anchors_numerical_2ndValue.drop(columns=columns_to_drop, inplace=True)
    df_anchors_numerical_2ndValue = df_anchors_numerical_2ndValue.loc[:, df_anchors_numerical_2ndValue.nunique() != 1]
    
    # Display the resulting 2nd value dataframe
    print('\ndf_anchors_numerical_2ndValue...after removing...')
    print(df_anchors_numerical_2ndValue)    
    
    
    # Copy the results of the second values into the dataframe labelled with new columns
    # df_secondvalues.iloc[:,:] = df_anchors_numerical_2ndValue.values
    
    # Add code here to append the second values df to both the anchors and features dataframe outputs
    print(f'\nshape of df_secondvalues: {df_secondvalues.shape}')
    #df_secondvalues.shape
    # ....
    # ....
    
    
    # Display the dataframe with 2nd value columns
    print('\ndf_secondvalues...')
    print(df_secondvalues)
    
    return df_feature_instances_input, df_anchors_numerical

## Display ANCHORS

In [None]:
# Keep the first few rows of each dataset to understand their structure.
# NOTE: this cell only assigns the `.head()` previews to variables; nothing is
# rendered unless one of them is evaluated or printed afterwards.
instance_features_head = df_feature_instances.head()
anchor_explanations_head = new_df_anchor_results.head()
anchor_explanations_numerical = df_anchors_numerical.head()

## Generate Outfile for review

In [None]:
#anchors_num_explainers_filepath = "anchor_numerical_explainers.csv"
#anchor_explanations_numerical.to_csv(anchors_num_explainers_filepath, index=False)

----------------------------------------

# XAI Experiments - Metrics Capture

## Suppress Warnings to clean up output

In [None]:
import warnings
# Ignore every warning category (`Warning` is the base class of them all), so
# deprecation and runtime warnings are hidden in all cells executed after this one.
warnings.simplefilter(action='ignore', category=Warning)

## Break out Model Test Data into a list of dataframes

### Create Test Data for Experiment Input

In [None]:
original_feature_names = [col for col in df_downsampled_loaded.columns if col != 'Fraud']

# Ensure X_test_loaded has the correct column names (if necessary)
#X_test_loaded.columns = original_feature_names

In [None]:
# Jan 18th - use new loaded model and data - SCALED
# Convert NumPy array to DataFrame
X_test_loaded_scaled = pd.DataFrame(X_test_loaded_scaled, columns=original_feature_names)

In [None]:
# Combine X_test_loaded and y_test into a single DataFrame
#df_TestData = pd.concat([X_test_loaded, y_test_loaded], axis=1)

In [None]:
# Jan 18th - use new loaded model and data - SCALED
# Combine X_test_loaded and y_test into a single DataFrame
df_TestData = pd.concat([X_test_loaded_scaled, y_test_loaded], axis=1)

### Split the DataFrame into 20 consecutive smaller DataFrames

In [None]:
# Split the DataFrame into 20 consecutive smaller DataFrames
split_size, list_df = split_TestData_into_nn_Blocks(df_TestData, num_splits = 20)

### Check Label Count for Stability Metrics

In [None]:
# Count the occurrence of each unique value in the 'Fraud' column
fraud_counts = df_TestData['Fraud'].value_counts()

# Display the counts
print("Breakdown of 'Fraud' and non-Fraud label records in df_TestData:")
print(fraud_counts)

### Add a routine to check output values

In [None]:
# Display starting points in the first nn sub dataframes
startBlockDisplay(df_TestData, split_size, 1)

## Confirm Starting Point in External ANCHORS XAI XL File

The code below acts so that for each dataframe in the list just created the following actions are carried out;

Check if an XAI results XL spreadsheet called 'ANCHOR_XAI_Metrics_Experiments.xlsx' exists;

If not, create an empty XL spreadsheet with the name 'ANCHOR_XAI_Metrics_Experiments.xlsx', then define a variable called ‘Sample’ with an integer value of 1 and print the value of 'Sample' to output.

If an XL spreadsheet called 'ANCHOR_XAI_Metrics_Experiments.xlsx' does exist, then read the entries in the first column of the spreadsheet, named ‘Sample Number’, and create a variable in this Python program named ‘Sample’ that is one higher than the highest integer in the ‘Sample Number’ column, and print this value of 'Sample' to output.

In [None]:
# Create a sequential identifier ('df_1', 'df_2', ...) for each DataFrame block.
# The guard keeps the cell safe to re-run: once list_df has already been turned
# into a dict, indexing it by position again would raise a KeyError.
if not isinstance(list_df, dict):
    list_df = {f'df_{pos}': block for pos, block in enumerate(list_df, start=1)}

In [None]:
# File path for the ANCHOR XAI metrics results spreadsheet
ANCHOR_xai_file_path = 'ANCHOR_XAI_Metrics_Experiments.xlsx'  # Stored locally

In [None]:
# Call Function to update or create the spreadsheet and determine the 'Sample' number
# Process each dataframe in 'list_df'
sample = return_next_sample_number_to_process(list_df, ANCHOR_xai_file_path, "ANCHOR")

## Select Next Dataframe to Process

---------------------------------

	
The code below reads in the dataframe from `list_df` that corresponds to the integer value in the 
variable named ‘Sample’. 

Assign this dataframe the name 'df_Selected_from_List'.


----------------------------------

### Initialize Dataframe to Capture Re-start Point as None

In [None]:
# Initialize df_Selected_from_List as None
df_Selected_from_List = None

### Extract test data block to restart XAI metrics process

In [None]:
df_Selected_from_List, key = select_restart_testdata_block(df_Selected_from_List, 
                                                           list_df, 
                                                           ANCHOR_xai_file_path)

In [None]:
# If no DataFrame is selected (e.g., if 'Sample' exceeds the number of DataFrames in list_df).
# The name is always bound by the time this cell runs (it is initialised to None
# and then assigned from select_restart_testdata_block), so a membership test on
# locals() can never fire; check the value instead.
# NOTE(review): assumes select_restart_testdata_block leaves/returns None when
# nothing is selected - confirm against that helper.
if df_Selected_from_List is None:
    print("No DataFrame selected. The 'Sample' number may exceed the number of DataFrames in list_df.")

-----------------------------

## Generate XAI Metrics from Dataframe

### Generate the ANCHOR Values for the Test Data Block

#### Pre-Check Values for Data Block

In [None]:
df_Selected_from_List.head(2)

In [None]:
print(df_Selected_from_List.index)

In [None]:
#df_Selected_from_List.head(2)

In [None]:
print(df_Selected_from_List.index)

#### Get Label Values for Stability Metric

In [None]:
y_test_block_labels_df = df_Selected_from_List['Fraud']

In [None]:
y_test_block_labels_df.shape

In [None]:
y_test_block_labels_df

#### Pre-Process Values for Data Block

Remove the label column ('Fraud') from the data block so that only the feature columns remain

In [None]:
df_Selected_from_List = df_Selected_from_List.drop('Fraud', axis=1)

In [None]:
print(df_Selected_from_List.index)

Set limit value (for debugging)

In [None]:
# Set limit - '64' will process the entire data block
limit_data_block_rows = 64

In [None]:
## Set limit on number of Anchor explanations to process
exp_limit = 15

#### Get Anchor Values for Data Block

In [None]:
results_ANCHOR, exec_time_ANCHOR = generate_anchors_for_instances(df_Selected_from_List, 
                                                                  exp_limit, 
                                                                  limit_data_block_rows)

In [None]:
df_feature_anchor_instances, df_anchor_results_block = results_ANCHOR

### Parse the ANCHOR Values for the Test Data Block

In [None]:
df_feature_anchor_instances.head()

In [None]:
df_anchor_results_block.head()

In [None]:
df_feature_anchor_instances, df_anchors_numerical_parsed = parse_anchor_exps_two(df_anchor_results_block,
                                                    #df_feature_instances)
                                                    df_feature_anchor_instances)

In [None]:
df_anchors_numerical_parsed.head()

In [None]:
df_feature_anchor_instances.head()

In [None]:
# Output the feature file to csv
df_feature_anchor_instances.to_csv('df_feature_anchor_instances.csv', index=False)

In [None]:
# Output the xai generated files to csv
df_anchors_numerical_parsed.to_csv('df_anchors_numerical_parsed.csv', index=False)

-----------------------------

------------------------------

### Generate Identity Metric

#### Run a Basic Test First

In [None]:
# Select two random instances from the ANCHOR dataframe
df_xai_numerical = df_anchors_numerical_parsed

# Draw two distinct row *positions* (not index labels): the rows are fetched
# with .iloc below, which is positional, so this stays valid for any index.
random_indices = np.random.choice(len(df_xai_numerical), size=2, replace=False)
instance_1 = df_xai_numerical.iloc[random_indices[0]]
instance_2 = df_xai_numerical.iloc[random_indices[1]]

# Compute the Euclidean distance between the selected instances - uses custom project function.
# The result is kept under its own name so that it does not overwrite the
# `distance` module imported from scipy.spatial at the top of the notebook.
euclidean_dist = get_euclidean_distance(instance_1, instance_2)
print(f"Euclidean distance between instance {random_indices[0]} and instance {random_indices[1]}: {euclidean_dist:.4f}")

#### Retrieve Identity Score

In [None]:
df_anchors_numerical_parsed.shape

In [None]:
print(df_anchors_numerical_parsed.index)

In [None]:
df_feature_anchor_instances.shape

In [None]:
print(df_feature_anchor_instances.index)

In [None]:
from scipy.spatial import distance
ANCHOR_Identity_Metric = get_identity_metric(df_feature_anchor_instances, 
                                             df_anchors_numerical_parsed, 
                                             "ANCHOR")

#### Display Identity Score Metric

In [None]:
# Render the identity score as a percentage with two decimals, then show it
ANCHOR_Identity_Number = f"{ANCHOR_Identity_Metric:.2f}%"
display_text(f"ANCHOR Identity Metric Score: {ANCHOR_Identity_Number}")

In [None]:
# Read in XAI Metric for Identity
XAI_Ident_Metric_1 = ANCHOR_Identity_Metric

-------------------------

### Generate Stability Metric

#### Pre-Processing of Stability Input Data

Check Test Set Labels are correctly indexed for Stability Metrics

In [None]:
df_feature_anchor_instances.shape

In [None]:
df_anchors_numerical_parsed.index

In [None]:
y_test_loaded.shape

In [None]:
y_test_block_labels_df.shape

In [None]:
print('y_test_block_labels_df')
print(y_test_block_labels_df)

In [None]:
# Truncate the labels to the same row limit that was passed to
# generate_anchors_for_instances, so the labels line up with the ANCHOR output.
# NOTE(review): assumes that function keeps the first `limit_data_block_rows`
# rows of the block - confirm against its implementation.
if limit_data_block_rows > 0:
    y_test_block_labels_df = y_test_block_labels_df.iloc[:limit_data_block_rows]    

In [None]:
y_test_block_labels_df.shape

In [None]:
# Assigning the column name 'Fraud'
# The labels are taken from a single column and are therefore normally a Series.
# A Series has no `columns`; assigning to it only attaches a stray attribute and
# labels nothing, so a Series is labelled through its name instead. The
# DataFrame path is kept for the case where a one-column frame is supplied.
if isinstance(y_test_block_labels_df, pd.Series):
    y_test_block_labels_df = y_test_block_labels_df.rename('Fraud')
else:
    y_test_block_labels_df.columns = ['Fraud']

In [None]:
y_test_block_labels_df.index

In [None]:
y_test_block_labels_df = y_test_block_labels_df.reset_index(drop=True)

Determine which label is most common in current data block

In [None]:
df_LabelCount = pd.DataFrame(y_test_block_labels_df)

In [None]:
# Count the occurrence of each unique value in the 'Fraud' column of the
# current data block's labels (df_LabelCount), not of the full test set.
fraud_counts_label = df_LabelCount['Fraud'].value_counts()

# Display the counts
print("Breakdown of 'Fraud' and non-Fraud label records in the current data block:")
print(fraud_counts_label)

In [None]:
# Most frequent label in the block, and how many rows carry it
largest_label = fraud_counts_label.idxmax()
largest_label_count = fraud_counts_label.max()

print(f"Label with most entries: {largest_label}")
print(f"Count of this label: {largest_label_count}")

#### Retrieve Stability Score

In [None]:
ANCHOR_Stability_Metric = get_stability_metric_y(df_anchors_numerical_parsed, 
                                                 y_test_block_labels_df,
                                                 largest_label, 
                                                 'ANCHOR')

#### Display Stability Score Metric

In [None]:
# Render the stability score as a percentage with two decimals, then show it
ANCHOR_Stability_Number = f"{ANCHOR_Stability_Metric:.2f}%"
display_text(f"ANCHOR Stability Metric Score: {ANCHOR_Stability_Number}")

In [None]:
# Read in XAI Metric for Stability
XAI_Stability_Metric_2 = ANCHOR_Stability_Metric

----------------

### Generate Separability Metric

#### Retrieve Separability Score

In [None]:
print(df_feature_anchor_instances.index)

In [None]:
print(df_anchors_numerical_parsed.index)

In [None]:
#df_feature_anchor_instances

In [None]:
#df_anchors_numerical_parsed

In [None]:
ANCHOR_Seperability_Metric = get_seperability_metric(df_feature_anchor_instances, 
                                                     df_anchors_numerical_parsed, 
                                                     "ANCHOR",
                                                     0.9687, # threshold  #0.51  0.80  0.99
                                                     0.05) # tolerance) #0.35  0.01  0.01

#### Display Separability Score Metric

In [None]:
# Render the separability score as a percentage with two decimals, then show it
ANCHOR_Seperability_Number = f"{ANCHOR_Seperability_Metric:.2f}%"
display_text(f"ANCHOR Seperability Metric Score: {ANCHOR_Seperability_Number}")

In [None]:
# Read in XAI Metric for Seperability
XAI_Seperability_Metric_3 = ANCHOR_Seperability_Metric

------------------------

### Generate Similarity Metric

#### Retrieve Similarity Score

In [None]:
print(df_feature_anchor_instances.index)

In [None]:
print(df_anchors_numerical_parsed.index)

In [None]:
ANCHOR_Similarity_Metric = get_similarity_metric(df_feature_anchor_instances, 
                                                 df_anchors_numerical_parsed, 
                                                 "ANCHOR", 
                                                 use_dbscan=False)

#### Display Similarity Score Metric

In [None]:
# Render the similarity value right-aligned in 6 characters with two decimals, then show it
ANCHOR_Similarity_Number = f"{ANCHOR_Similarity_Metric:6.2f}"
display_text(f"ANCHOR Similarity Metric Value: {ANCHOR_Similarity_Number}")

In [None]:
# Read in XAI Metric for Similarity
XAI_Similarity_Metric_4 = ANCHOR_Similarity_Metric

------------------------

### Display Final Set of Metrics (this run)

In [None]:
# Print the results.
# Each label carries the same metric number as the variable it reports
# (Separability is metric 3 and Similarity is metric 4; both were printed as "1").
print(f"XAI Ident Metric 1: {XAI_Ident_Metric_1}")
print(f"XAI Stability Metric 2: {XAI_Stability_Metric_2}")
print(f"XAI Seperability Metric 3: {XAI_Seperability_Metric_3}")
print(f"XAI Similarity Metric 4: {XAI_Similarity_Metric_4}")
print(f"XAI Time Metric 5: {exec_time_ANCHOR} seconds")

------------------------

## Write Out Metrics to XL

In [None]:
write_xai_Metrics_to_XL(ANCHOR_xai_file_path, 
                        sample, 
                        ANCHOR_Identity_Metric, 
                        ANCHOR_Stability_Metric, 
                        ANCHOR_Seperability_Metric, 
                        ANCHOR_Similarity_Metric, 
                        exec_time_ANCHOR, 
                        df_Selected_from_List,
                        "ANCHOR")