# Dissertation Experiment 6k
# Generate DICE Output (Credit Card Fraud) - Experiment Jan 22
Ciaran Finnegan January 2024

# Import Libraries + Custom Functions

## Import Libraries

In [None]:
# Import libs
import numpy as np
import pandas as pd

# Display libraries
from IPython.display import display, HTML
from prettytable import PrettyTable
import raiutils
from raiutils.exceptions import UserConfigValidationException


# Import necessary libraries for DICE explainer
import dice_ml
from dice_ml.utils import helpers  # helper functions


# Import necessary libraries for NN Modelling
import tensorflow as tf
from tensorflow import keras
from tensorflow.keras import layers


# Libraries required for metrics calculations
from scipy.spatial import distance
from sklearn.cluster import KMeans


# Libraries for Supplementary Model Evaluation
import sklearn.metrics
from sklearn.metrics import classification_report
from sklearn.metrics import confusion_matrix
from sklearn.metrics import accuracy_score
from sklearn.metrics import roc_auc_score
from sklearn.utils import resample


# Classifier training (not used for explainability)
from sklearn.model_selection import train_test_split
from sklearn.model_selection import GridSearchCV

# Libraries used in Experiment Creation of XL Output Metrics
import os
import time
import random
import openpyxl

## Custom Functions

Dataset Visualisations

In [None]:
%run ./DS_Visualisation_Functions.ipynb

Metrics

In [None]:
%run ./XAI_Metrics_Functions.ipynb

Model Evaluation Functions

In [None]:
%run ./DS_Model_Build_Evaluation_Functions.ipynb

Track Experiment Result Functions

In [None]:
%run ./XAI_Experiment_Functions.ipynb

# Load Model

A Neural Network Model has been created in another Kubeflow Notebook and is being used in all the XAI experiments

In [None]:
loaded_model = keras.models.load_model('ccfraud_model')  # If saved as SavedModel

In [None]:
X_test_loaded, y_test_loaded, X_train_loaded, y_train_loaded, df_downsampled_loaded, dfCatCols = load_CC_train_test_data()

In [None]:
#X_train_loaded.head(1)

In [None]:
#y_train_loaded.head(2)

## Re-Display Model Performance

For illustration, the evaluation metrics of the NN model are repeated here.

### Tabular Data

In [None]:
# Initialize the StandardScaler
# scale_loaded    : fitted further down on the training features (X_train_loaded)
# scale_loaded_wf : fitted further down on the full downsampled frame (df_downsampled_loaded)
# NOTE(review): StandardScaler is not imported in this notebook's import cell;
# presumably it is brought into scope by one of the %run helper notebooks - confirm.
scale_loaded = StandardScaler()   
scale_loaded_wf = StandardScaler()   

In [None]:
# Fit and transform the training data
X_train_loaded_scaled = scale_loaded.fit_transform(X_train_loaded)

In [None]:
# Transform the test data
X_test_loaded_scaled  = scale_loaded.transform(X_test_loaded)

In [None]:
# Separate the 'Fraud' column and store it in a new dataframe
df_fraudlabel = df_downsampled_loaded[['Fraud']].copy()

In [None]:
df_fraudlabel

In [None]:
# Remove the 'Fraud' column from the original dataframe
#df_downsampled_loaded = df_downsampled_loaded.drop(columns=['Fraud'])

In [None]:
# Apply the transform() function on the remaining dataframe
#df_downsampled_loaded_scaled = scale_loaded.transform(df_downsampled_loaded)
# The frame passed here still contains the 'Fraud' column (the drop above is commented out),
# so scale_loaded_wf is fitted on, and standardises, the label together with the features.
# NOTE(review): this is a different fit from scale_loaded (fitted on X_train_loaded only) -
# confirm that the loaded model is meant to receive inputs on this scale.
df_downsampled_loaded_scaled = scale_loaded_wf.fit_transform(df_downsampled_loaded)

In [None]:
# Convert the scaled array back to a DataFrame
df_downsampled_loaded_scaled = pd.DataFrame(df_downsampled_loaded_scaled, 
                                            columns=df_downsampled_loaded.columns,
                                            index=df_downsampled_loaded.index)

In [None]:
# Concatenate the scaled dataframe with the 'Fraud' column
#df_downsampled_loaded_scaled = pd.concat([df_downsampled_loaded_scaled, df_fraudlabel], axis=1)

In [None]:
df_downsampled_loaded_scaled

In [None]:
df_downsampled_loaded_scaled['Fraud']

In [None]:
y_pred_loaded = display_model_metrics_tabular(loaded_model, X_test_loaded_scaled, y_test_loaded)

### Confusion Matrix

In [None]:
generate_confusion_matrix(y_test_loaded, y_pred_loaded)

# Generate DiCE Values (Examples Instances)

## Generate the Counterfactuals

### Use Tensor Flow - Prepare DiCE parameters - CC Dataset

#### Read External File Containing List of Continuous Features

An analysis, external to this Notebook, has taken place to identify the set of continuous features that will be used in this experiment to generate Counterfactual values.

In [None]:
# Read in file with list of continuous features for which to generate the DiCE Counterfactuals
def read_cc_features(file_path, extra_ranks=(39,), rank_min=41, rank_max=55):
    """Return the feature names selected by rank from a feature-ranking CSV.

    The CSV must provide a 'RANK' column and a 'FEATURE' column. A row is
    selected when its RANK equals one of ``extra_ranks`` or lies in the
    inclusive range ``rank_min``..``rank_max``. The defaults reproduce the
    selection used by this experiment: rank 39, or ranks 41 to 55 inclusive.

    Args:
        file_path: Path (or file-like object) accepted by ``pd.read_csv``.
        extra_ranks: Individual rank values to include in addition to the range.
        rank_min: Lower bound of the rank range (inclusive).
        rank_max: Upper bound of the rank range (inclusive).

    Returns:
        list: Values of the 'FEATURE' column for the selected rows, in file order.
    """
    # Read the CSV file
    data = pd.read_csv(file_path)

    # Select rows whose rank is explicitly listed or falls inside the inclusive range
    ranks = data['RANK']
    selected = ranks.isin(list(extra_ranks)) | ranks.between(rank_min, rank_max)

    # Extract the 'FEATURE' column values and return them as a list
    return data.loc[selected, 'FEATURE'].tolist()

In [None]:
# Assume the file is in the same directory as Notebook
file_path = 'Select_CC_Fraud_Features_v1_1.csv'
cc_continuous_features_list = read_cc_features(file_path)

In [None]:
# Display List of continous features loaded from external XL file.
#cc_continuous_features_list

#### Verify DiCE Counterfactual Data Inputs

In [None]:
# Get the list of columns from loaded dataframe
#original_cols_names = df_downsampled_loaded.columns.tolist()

In [None]:
# Convert NumPy array to DataFrame
#df_downsampled_loaded_scaled = pd.DataFrame(df_downsampled_loaded_scaled, columns=original_cols_names)

In [None]:
df_downsampled_loaded_scaled['Fraud']

#### Build DiCE Counterfactual Function

In [None]:
# Jan 18th - use new loaded model and data - SCALED
# Define the data for DiCE based on your DataFrame
d = dice_ml.Data(dataframe=df_downsampled_loaded_scaled, 
                 continuous_features=cc_continuous_features_list, 
                 outcome_name='Fraud')

In [None]:
# Use the backend as TensorFlow and link the model
m = dice_ml.Model(model=loaded_model, backend='TF2')

In [None]:
# Initialize DiCE
exp = dice_ml.Dice(d, m)

#### Example 1: Sample DiCE Counterfactual example (x2)

The code below provides examples of generated counterfactuals. In the first two examples, for each instance entry the code has generated five counterfactuals.

In [None]:
# Assuming 'X_test' is a numpy array and you have a list of the original column names
#feature_names = [col for col in df_downsampled_loaded.columns if col != 'Fraud']

In [None]:
# Convert NumPy array to DataFrame
#X_test_loaded_scaled = pd.DataFrame(X_test_loaded_scaled, columns=feature_names)

In [None]:
# Jan 18th - use new loaded model and data - SCALED
#query_instances = X_test_loaded_scaled.iloc[0:2]#.drop('Fraud', axis=1)  # Taking the first two instances

In [None]:
# Change query instances to match the instances you are interested in
# Jan 18th - use new loaded model and data - SCALED
query_instances = df_downsampled_loaded_scaled.iloc[0:2].drop('Fraud', axis=1)  # Taking the first two instances

In [None]:
# Generate counterfactual explanations
counterfactuals = exp.generate_counterfactuals(query_instances, total_CFs=5, desired_class="opposite")

In [None]:
# Visualize the counterfactual explanations
counterfactuals.visualize_as_dataframe(show_only_changes=True)

Reverse scale the output to make the visualisation more meaningful

In [None]:
# Step 1: Retrieve counterfactuals as a DataFrame
cf_df = counterfactuals.cf_examples_list[0].final_cfs_df

In [None]:
# Step 2: Inverse scale the counterfactuals
#cf_df_inverse_scaled = pd.DataFrame(scaler.inverse_transform(cf_df), columns=cf_df.columns)

In [None]:
# Step 2: Inverse scale the counterfactuals
cf_df_inverse_scaled = pd.DataFrame(scale_loaded_wf.inverse_transform(cf_df), columns=cf_df.columns)

In [None]:
# Step 3: Visualize the inverse scaled counterfactuals
# You can now use cf_df_inverse_scaled for a more interpretable visualization
# For example, you can print it or use any visualization library like matplotlib, seaborn, etc.
#print(cf_df_inverse_scaled)

------------------------

#### Example 2: Counterfactual - Highlighted Display

This example uses a display routine to improve the visual highlighting of the counterfactuals.

In [None]:
# Generate counterfactuals
# Jan 18th - use new loaded model and data - SCALED
query_instance = df_downsampled_loaded_scaled.iloc[0:1].drop('Fraud', axis=1)

In [None]:
# Generate counterfactuals
dice_exp = exp.generate_counterfactuals(query_instance, total_CFs=5, desired_class="opposite")

#### Visualize Counterfactuals (Single Set) - No Highlights

In [None]:
dice_exp.visualize_as_dataframe()

In [None]:
def generate_and_visualize_counterfactuals(query_instance, scaler, exp, total_CFs=5, desired_class="opposite"):
    """Generate counterfactuals for a query and return them inverse-scaled.

    Args:
        query_instance: Query row(s) passed straight to ``exp.generate_counterfactuals``.
        scaler: Object exposing ``inverse_transform``; it is applied to the whole
            counterfactual frame, so it must accept every column of that frame.
        exp: Explainer exposing ``generate_counterfactuals``.
        total_CFs: Number of counterfactuals requested per query row.
        desired_class: Target class forwarded to the explainer.

    Returns:
        pd.DataFrame: Inverse-scaled counterfactuals of the FIRST query row only,
        with the counterfactual column names and a fresh default index.
    """
    explanation = exp.generate_counterfactuals(
        query_instance,
        total_CFs=total_CFs,
        desired_class=desired_class,
    )

    # Only the counterfactual set belonging to the first query row is used
    scaled_cfs = explanation.cf_examples_list[0].final_cfs_df

    # inverse_transform returns a bare array, so the column labels are re-attached here
    unscaled_values = scaler.inverse_transform(scaled_cfs)
    return pd.DataFrame(unscaled_values, columns=scaled_cfs.columns)

In [None]:
# Usage
query_instance = df_downsampled_loaded_scaled.iloc[0:1].drop('Fraud', axis=1)
#inverse_scaled_cfs = generate_and_visualize_counterfactuals(query_instance, scale_loaded, exp)
#print(inverse_scaled_cfs)

#### Visualize Counterfactuals (Single Set) - With Highlights

##### Create Display Function

In [None]:
def highlight_differences(query_instance, counterfactuals_df):
    """
    Compares a query instance (as a Series) with counterfactual instances in a DataFrame.
    
    Args:
    - query_instance (pd.Series): The original data instance.
    - counterfactuals_df (pd.DataFrame): DataFrame containing counterfactual instances.
    
    Returns:
    - A styled DataFrame where:
        * The original instance is highlighted entirely.
        * Cells with differences in counterfactuals are highlighted.
    """
    # Convert query_instance to DataFrame and concatenate with counterfactuals_df
    combined_df = pd.concat([query_instance.to_frame().T, counterfactuals_df], axis=0).reset_index(drop=True)
    
    def highlight_cells(row):
        """Helper function to apply the styling."""
        if row.name == 0:  # If it's the original instance
            return ['background-color: lightblue' for _ in row.index]
        
        # For counterfactual rows
        colors = []
        for col in row.index:
            original_value = query_instance[col]
            cf_value = row[col]
            
            # Convert to the same data type if they are different
            if type(original_value) != type(cf_value):
                try:
                    original_value = type(cf_value)(original_value)
                except ValueError:
                    try:
                        cf_value = type(original_value)(cf_value)
                    except ValueError:
                        pass
            
            # Handle float comparisons with a small tolerance
            if isinstance(original_value, float) and isinstance(cf_value, float):
                if abs(original_value - cf_value) < 1e-9:
                    colors.append('')
                else:
                    colors.append('background-color: yellow')
            elif original_value != cf_value:
                colors.append('background-color: yellow')
            else:
                colors.append('')
        return colors
    
    styled_df = combined_df.style.apply(highlight_cells, axis=1)
    return styled_df

# This refined version of the function should handle potential data type mismatches better.

##### Display Differences - with Highlights

In [None]:
# 2. Extract counterfactuals to a DataFrame
your_actual_counterfactuals_df = dice_exp.cf_examples_list[0].final_cfs_df

In [None]:
# Inverse scale the counterfactuals
actual_counterfactuals_df_inverse_scaled = pd.DataFrame(scale_loaded_wf.inverse_transform(your_actual_counterfactuals_df), 
                                                        columns=your_actual_counterfactuals_df.columns)

In [None]:
#your_actual_counterfactuals_df.head()

In [None]:
query_instance_series = df_downsampled_loaded.iloc[0]

In [None]:
#styled_result = highlight_differences(query_instance_series, your_actual_counterfactuals_df)

In [None]:
styled_result = highlight_differences(query_instance_series, actual_counterfactuals_df_inverse_scaled)

In [None]:
# 3. Visualize differences
display(styled_result)

In [None]:
display(df_downsampled_loaded)

In [None]:
def highlight_differences_modified(query_instance, counterfactuals_df):
    # Adjust 'Fraud' value in counterfactuals_df
    fraud_value = 1 if query_instance['Fraud'] == 0 else 0
    counterfactuals_df['Fraud'] = fraud_value

    # Convert query_instance to DataFrame and concatenate with counterfactuals_df
    combined_df = pd.concat([query_instance.to_frame().T, counterfactuals_df], axis=0).reset_index(drop=True)

    def highlight_cells(row):
        
        """Helper function to apply the styling."""
        if row.name == 0:  # If it's the original instance
            return ['background-color: lightblue' for _ in row.index]
        
        # Styling function
        colors = []
        for col in row.index:
            original_value = query_instance[col]
            cf_value = row[col]

            # Convert to the same data type if they are different
            if type(original_value) != type(cf_value):
                try:
                    original_value = type(cf_value)(original_value)
                except ValueError:
                    try:
                        cf_value = type(original_value)(cf_value)
                    except ValueError:
                        pass

            # Handle float comparisons with increased tolerance
            if isinstance(original_value, float) and isinstance(cf_value, float):
                if abs(original_value - cf_value) <= 1.00:
                    colors.append('')
                else:
                    colors.append('background-color: yellow')
            elif original_value != cf_value:
                colors.append('background-color: yellow')
            else:
                colors.append('')
        return colors

    styled_df = combined_df.style.apply(highlight_cells, axis=1).format("{:.2f}", na_rep="-")
    return styled_df

In [None]:
styled_result = highlight_differences_modified(query_instance_series, actual_counterfactuals_df_inverse_scaled)

Display DiCE Counterfactuals for Instance

In [None]:
# 3. Visualize differences
display(styled_result)

# Prepare DiCE Input for Metric Calculations

In [None]:
@timeit
def generate_counterfactuals_for_instances(df, exp_block, num_instances=20, sLabel='Fraud'):
    """
    Generate one counterfactual explanation per instance of a dataframe.

    Args:
    - df (pd.DataFrame): The dataframe containing the original instances, including
      the label column named by 'sLabel'.
    - exp_block: Explainer exposing 'generate_counterfactuals' (here a dice_ml.Dice object).
    - num_instances (int): The number of leading rows for which to generate
      counterfactuals. Zero or a negative value processes the whole dataframe.
    - sLabel (str): Name of the label column. It is removed from each query
      instance and from each returned counterfactual.

    Returns:
    - original_instances_df (pd.DataFrame): The processed original instances
      (label column included), re-indexed from 0.
    - counterfactuals_df (pd.DataFrame): One counterfactual per original instance
      (label column removed), re-indexed from 0 so rows line up by position.

    NOTE(review): the function is wrapped by @timeit, which is defined outside this
    notebook; the caller unpacks the decorated call as (results, exec_time), so the
    decorator presumably adds the elapsed time to the return value - confirm.
    """
    # Column echoed for debugging; only printed when the dataframe actually has it,
    # so the function also works for datasets without this feature
    debug_column = 'OnlinePOSCount.cnt.day.present'

    # Select the first 'num_instances' rows, or every row when no positive limit is given
    original_instances_df = df.head(num_instances) if num_instances > 0 else df

    counterfactuals_list = []

    for index, instance in original_instances_df.iterrows():
        print(f"Processing row number: {index}")

        # The explainer expects a one-row DataFrame of features without the label
        instance_df = instance.drop(sLabel).to_frame().T

        if debug_column in instance_df.columns:
            print(f'instance_df[OnlinePOSCount.cnt.day.present]: {instance_df[debug_column]}')

        # Generate a single counterfactual of the opposite class for the instance
        dice_exp_block = exp_block.generate_counterfactuals(instance_df, total_CFs=1, desired_class="opposite")

        # Extract the counterfactual and drop the label so only features remain
        cf_df = dice_exp_block.cf_examples_list[0].final_cfs_df.drop(sLabel, axis=1)

        # Append the counterfactual to the list
        counterfactuals_list.append(cf_df.iloc[0])

    # Reset Indexes of Output for alignment into XAI Metrics functions
    original_instances_df = original_instances_df.reset_index(drop=True)
    counterfactuals_df = pd.DataFrame(counterfactuals_list).reset_index(drop=True)

    return original_instances_df, counterfactuals_df

# XAI Experiments - Metrics Capture

## Suppress Warnings to clean up output

In [None]:
import warnings
warnings.simplefilter(action='ignore', category=Warning)

## Break out Model Test Data into a list of dataframes

### Create Test Data for Experiment Input

Step 1: Ensure 'X_test' and 'y_test' Are DataFrames with Proper Columns

In [None]:
# Assuming 'X_test' is a numpy array and you have a list of the original column names
original_feature_names = [col for col in df_downsampled_loaded.columns if col != 'Fraud']

In [None]:
# Ensure X_test_loaded has the correct column names (if necessary)
#X_test_loaded.columns = original_feature_names

In [None]:
# Jan 18th - use new loaded model and data - SCALED
# Convert NumPy array to DataFrame
X_test_loaded_scaled = pd.DataFrame(X_test_loaded_scaled, columns=original_feature_names)

In [None]:
# Combine the scaled test features and the test labels into a single DataFrame.
# concat(axis=1) aligns rows on the index. X_test_loaded_scaled was rebuilt from a NumPy
# array and therefore carries a fresh 0..n-1 index, so the labels are re-indexed the same
# way to pair rows by position instead of by whatever index the labels were loaded with.
df_TestData = pd.concat([X_test_loaded_scaled, y_test_loaded.reset_index(drop=True)], axis=1)

In [None]:
df_TestData

### Split the DataFrame into 20 consecutive smaller DataFrames

In [None]:
# Split the DataFrame into 20 consecutive smaller DataFrames
split_size, list_df = split_TestData_into_nn_Blocks(df_TestData, num_splits = 20)

### Check Label Count for Stability Metrics

In [None]:
# Count the occurrence of each unique value in the 'Fraud' column
fraud_counts = df_TestData['Fraud'].value_counts()

# Display the counts
print("Breakdown of 'Fraud' and non-Fraud label records in df_TestData:")
print(fraud_counts)

### Add a routine to check output values

In [None]:
# Display starting points in the first nn sub dataframes
startBlockDisplay(df_TestData, split_size, 1)

## Confirm Starting Point in External DiCE XAI XL File

The code below acts so that for each dataframe in the list just created the following actions are carried out;

Check if an XAI results XL spreadsheet called 'DICE_XAI_Metrics_Experiments.xlsx' exists;

If not, create an empty XL spreadsheet with the name 'DICE_XAI_Metrics_Experiments.xlsx', then define a variable called 'Sample' with an integer value of 1 and print the value of 'Sample' to output.

If an XL spreadsheet called 'DICE_XAI_Metrics_Experiments.xlsx' does exist, then read the entries in the first column of the spreadsheet, named 'Sample Number', and create a variable in this Python program named 'Sample' that is one higher than the highest integer in the 'Sample Number' column, and print this value of 'Sample' to output.

In [None]:
# Create a sequential identifier for each DataFrame: 'df_1', 'df_2', ...
# The guard keeps the cell safe to re-run: once list_df has been converted to a dict
# it is left unchanged instead of failing on the positional lookup.
if not isinstance(list_df, dict):
    list_df = {f'df_{i}': block for i, block in enumerate(list_df, start=1)}

In [None]:
# File path for the XAI results spreadsheet
DiCE_xai_file_path = 'DICE_XAI_Metrics_Experiments.xlsx'  # Stored locally

In [None]:
# Call Function to update or create the spreadsheet and determine the 'Sample' number
# Process each dataframe in 'list_df'
sample = return_next_sample_number_to_process(list_df, DiCE_xai_file_path, "DiCE")

## Select Next Dataframe to Process

---------------------------------

	
Extend the Python code so that the code reads in the dataframe from 'list_df' that corresponds to the integer value in the 
variable named 'Sample'. 

Assign this dataframe the name 'df_Selected_from_List'.


----------------------------------

### Initialize Dataframe to Capture Re-start Point as None

In [None]:
# Initialize df_Selected_from_List as None
df_Selected_from_List = None

### Extract test data block to restart XAI metrics process

In [None]:
df_Selected_from_List, key = select_restart_testdata_block(df_Selected_from_List, 
                                                           list_df, 
                                                           DiCE_xai_file_path)

In [None]:
# If no DataFrame is selected (e.g., if 'Sample' exceeds the number of DataFrames in list_df)
# The variable is initialised to None before the selection step, so a name-existence check
# can never fire; test for a missing or None value instead.
if globals().get('df_Selected_from_List') is None:
    print("No DataFrame selected. The 'Sample' number may exceed the number of DataFrames in list_df.")

------------------------------------

## Generate XAI Metrics from Dataframe

In [None]:
#df_Selected_from_List

### Generate DiCE Counterfactuals for the Test Data Block

#### Pre-Process Values for Data Block

In [None]:
df_Selected_from_List.head(2)

In [None]:
print(df_Selected_from_List.index)

#### Scale the feature values

Call Scaling Function

In [None]:
# Assuming 'X_test' is a numpy array and you have a list of the original column names
original_feature_names = [col for col in df_downsampled_loaded.columns if col != 'Fraud']

In [None]:
# Scale the feature inputs so that they work with the SHAP generation processs
#df_Selected_Scaled_Data_from_List = scale_feature_inputs(df_Selected_from_List, 
#                                                         original_feature_names)

In [None]:
df_Selected_Scaled_Data_from_List = df_Selected_from_List

In [None]:
df_Selected_Scaled_Data_from_List

#### Review DiCE Data Block

In [None]:
df_Selected_Scaled_Data_from_List['OnlinePOSCount.cnt.day.present']

In [None]:
#def generate_counterfactuals_for_instances(df, num_instances=20, sLabel='Fraud'):

In [None]:
df_downsampled_loaded_scaled

In [None]:
df_Selected_Scaled_Data_from_List

In [None]:
def check_values_outside_range(df_downsampled_loaded_scaled, df_Selected_Scaled_Data_from_List,
                               column='OnlinePOSCount.cnt.day.present'):
    """Print every value of one column in a data block that falls outside a reference range.

    The reference range is the minimum and maximum of ``column`` in
    ``df_downsampled_loaded_scaled``. Both ranges are printed first, followed by one
    line for each value of ``column`` in ``df_Selected_Scaled_Data_from_List`` that is
    below the reference minimum or above the reference maximum.

    Args:
        df_downsampled_loaded_scaled (pd.DataFrame): Frame that defines the reference range.
        df_Selected_Scaled_Data_from_List (pd.DataFrame): Frame whose values are checked.
        column (str): Column to inspect in both frames. Defaults to the feature
            this notebook was written to debug.

    Returns:
        None. The results are only printed.
    """
    reference_values = df_downsampled_loaded_scaled[column]
    block_values = df_Selected_Scaled_Data_from_List[column]

    # Range of the reference data
    min_value = reference_values.min()
    max_value = reference_values.max()
    print(f"Range in df_downsampled_loaded_scaled: {min_value} to {max_value}")

    # Range of the data block being checked
    min_value2 = block_values.min()
    max_value2 = block_values.max()
    print(f"Range in df_Selected_Scaled_Data_from_List: {min_value2} to {max_value2}")

    # Select the out-of-range values with one boolean mask, then report them in row order
    outside = block_values[(block_values < min_value) | (block_values > max_value)]
    for value in outside:
        print(f"Value outside range: {value}")

In [None]:
check_values_outside_range(df_downsampled_loaded_scaled, df_Selected_Scaled_Data_from_List)

In [None]:
#d2 = dice_ml.Data(dataframe=df_downsampled_loaded_scaled, 
#                 continuous_features=cc_continuous_features_list, 
#                 outcome_name='Fraud')

In [None]:
d2 = dice_ml.Data(dataframe=df_TestData, 
                 continuous_features=cc_continuous_features_list, 
                 outcome_name='Fraud')

In [None]:
m2 = dice_ml.Model(model=loaded_model, backend='TF2')

In [None]:
exp_block = dice_ml.Dice(d2, m2)

#### DiCE Data Pre-Check

In [None]:
# Set option to display all columns (you can adjust the number as needed)
pd.set_option('display.max_columns', None)

In [None]:
# Jan 18th - use new loaded model and data - SCALED
query_instances_block = df_Selected_Scaled_Data_from_List.iloc[0:3].drop('Fraud', axis=1)  # Taking the first instance

In [None]:
counterfactuals_block = exp_block.generate_counterfactuals(query_instances_block, total_CFs=1, desired_class="opposite")

In [None]:
counterfactuals_block.visualize_as_dataframe(show_only_changes=True)

Extract the label values from the data block

In [None]:
y_test_block_labels_df = df_Selected_Scaled_Data_from_List['Fraud']

In [None]:
y_test_block_labels_df.shape

#### Get DiCE Values for Data Block

Set limit value (for debugging)

In [None]:
# A 'zero' limit value will process the entire data block
limit_data_block_rows = 0

In [None]:
results_DiCE, exec_time_Dice = generate_counterfactuals_for_instances(df_Selected_Scaled_Data_from_List,
                                                                      exp_block,
                                                                      limit_data_block_rows)

In [None]:
original_df_DiCE, cf_df_DiCE = results_DiCE

In [None]:
print(original_df_DiCE.index)

In [None]:
print(cf_df_DiCE.index)

In [None]:
original_df_DiCE

In [None]:
cf_df_DiCE

### Generate Identity Metric

#### Pre-Process Identity Inputs

In [None]:
#original_df_DiCE, cf_df_DiCE = scale_feature_xai_inputs(original_df_DiCE, 
#                                                        cf_df_DiCE, 
#                                                        df_downsampled_loaded)

In [None]:
#scaler = StandardScaler()

In [None]:
#original_df_DiCE = scaler.fit_transform(original_df_DiCE)

In [None]:
#cf_df_DiCE = scaler.fit_transform(cf_df_DiCE)

In [None]:
# Extract the feature names, including the target variable 'Fraud'
#column_names_wDefault = df_downsampled_loaded.columns

In [None]:
# Convert NumPy array to DataFrame
#original_df_DiCE = pd.DataFrame(original_df_DiCE, columns=column_names_wDefault)

In [None]:
#column_names = df_downsampled_loaded.drop('Fraud', axis=1).columns

In [None]:
# Convert NumPy array to DataFrame
#cf_df_DiCE = pd.DataFrame(cf_df_DiCE, columns=column_names)

In [None]:
# Convert all values to float for consistent data type
#original_df_DiCE = original_df_DiCE.astype(float)
#cf_df_DiCE = cf_df_DiCE.astype(float)

#### Run a Basic Test First

In [None]:
# Select two random instances from the DiCE dataframe
df_xai_numerical = cf_df_DiCE

# Draw two distinct row POSITIONS, because the rows are fetched with .iloc below
# (drawing index labels would only work while the index happens to be 0..n-1)
random_indices = np.random.choice(len(df_xai_numerical), size=2, replace=False)
instance_1 = df_xai_numerical.iloc[random_indices[0]]
instance_2 = df_xai_numerical.iloc[random_indices[1]]

# Compute the Euclidean distance between the selected instances - uses custom project function
# The result is kept in its own name so it does not shadow the imported scipy 'distance' module
euclidean_dist = get_euclidean_distance(instance_1, instance_2)
print(f"Euclidean distance between instance {random_indices[0]} and instance {random_indices[1]}: {euclidean_dist:.4f}")

#### Retrieve Identity Score

In [None]:
from scipy.spatial import distance

In [None]:
DiCE_Identity_Metric = get_identity_metric(original_df_DiCE, cf_df_DiCE, "DiCE")

#### Display Identity Score Metric

In [None]:
DiCE_Ident_Number = "{:.2f}%".format(DiCE_Identity_Metric)

In [None]:
display_text("DiCE Identity Metric Score: " + DiCE_Ident_Number)

In [None]:
# Read in XAI Metric for Identity
XAI_Ident_Metric_1 = DiCE_Identity_Metric

----------------------------------

### Generate Stability Metric

#### Pre-Processing of Stability Input Data

In [None]:
original_df_DiCE.shape

In [None]:
cf_df_DiCE.index

In [None]:
y_test_loaded.shape

In [None]:
y_test_block_labels_df.shape

In [None]:
print('y_test_block_labels_df')
print(y_test_block_labels_df)

In [None]:
# Adjust the label value input to match earlier adjustments in DiCE value creations
if limit_data_block_rows > 0:
    y_test_block_labels_df = y_test_block_labels_df.iloc[:limit_data_block_rows]    

In [None]:
y_test_block_labels_df.shape

In [None]:
# Assigning the name 'Fraud' to the label data
# The labels were extracted as a single column, i.e. a Series, and a Series has a name
# rather than columns; rename() sets that name so the DataFrame built from it later
# gets a 'Fraud' column.
y_test_block_labels_df = y_test_block_labels_df.rename('Fraud')

In [None]:
y_test_block_labels_df.index

In [None]:
y_test_block_labels_df = y_test_block_labels_df.reset_index(drop=True)

In [None]:
y_test_block_labels_df.index

In [None]:
df_LabelCount = pd.DataFrame(y_test_block_labels_df)

In [None]:
# Count the occurrence of each unique value in the 'Fraud' column of the selected block
fraud_counts_label = df_LabelCount['Fraud'].value_counts()

# Display the counts (these are the labels of the selected data block, not of df_TestData)
print("Breakdown of 'Fraud' and non-Fraud label records in the selected test data block:")
print(fraud_counts_label)

In [None]:
# Counting the occurrences of each label
#label_counts = df['Fraud'].value_counts()

# Finding the label with the most entries
#largest_label = label_counts.idxmax()
largest_label = fraud_counts_label.idxmax()

# Assigning it to largest_label_count
#largest_label_count = label_counts[largest_label]
largest_label_count = fraud_counts_label[largest_label]

print("Label with most entries:", largest_label)
print("Count of this label:", largest_label_count)

#### Retrieve Stability Score

In [None]:
#DiCE_Stability_Metric = get_stability_metric_y(cf_df_DiCE, 
#                                               y_test_loaded,
#                                               largest_label, 
#                                               'DiCE')

In [None]:
DiCE_Stability_Metric = get_stability_metric_y(cf_df_DiCE, 
                                               y_test_block_labels_df,
                                               largest_label, 
                                               'DiCE')

#### Display Stability Score Metric

In [None]:
DiCE_Stbly_Number = "{:.2f}%".format(DiCE_Stability_Metric)

In [None]:
display_text("DiCE Stability Metric Score: " + DiCE_Stbly_Number)

In [None]:
# Read in XAI Metric for Stability
XAI_Stability_Metric_2 = DiCE_Stability_Metric

-----------------------------

### Generate Separability Metric

#### Retrieve Separability Score

In [None]:
print(original_df_DiCE.index)

In [None]:
print(cf_df_DiCE.index)

In [None]:
original_df_DiCE

In [None]:
cf_df_DiCE

In [None]:
#DiCE_Seperability_Metric = get_seperability_metric(original_df_DiCE, cf_df_DiCE, "DiCE")

In [None]:
DiCE_Seperability_Metric = get_seperability_metric(original_df_DiCE, 
                                                   cf_df_DiCE, 
                                                   "DiCE",
                                                   0.80, # threshold  #0.51
                                                   0.35) # tolerance)

#### Display Separability Score Metric

In [None]:
DiCE_Seperability_Number = "{:.2f}%".format(DiCE_Seperability_Metric)

In [None]:
display_text("DiCE Seperability Metric Score: " + DiCE_Seperability_Number)

In [None]:
# Read in XAI Metric for Seperability
XAI_Seperability_Metric_3 = DiCE_Seperability_Metric

----------------------------

### Generate Similarity Metric

#### Retrieve Similarity Score

In [None]:
print(original_df_DiCE.index)

In [None]:
print(cf_df_DiCE.index)

In [None]:
DiCE_Similarity_Metric = get_similarity_metric(original_df_DiCE, 
                                               cf_df_DiCE, 
                                               "DiCE", 
                                               use_dbscan=False)

#### Display Similarity Score Metric

In [None]:
DiCE_Similarity_Number = "{:6.2f}".format(DiCE_Similarity_Metric)

In [None]:
display_text("DiCE Similarity Metric Value: " + DiCE_Similarity_Number)

In [None]:
# Read in XAI Metric for Similarity
XAI_Similarity_Metric_4 = DiCE_Similarity_Metric

-------------------------------

### Display Final Set of Metrics (this run)

In [None]:
# Print the results - each label carries the same number as the variable it reports
print(f"XAI Ident Metric 1: {XAI_Ident_Metric_1}")
print(f"XAI Stability Metric 2: {XAI_Stability_Metric_2}")
print(f"XAI Seperability Metric 3: {XAI_Seperability_Metric_3}")
print(f"XAI Similarity Metric 4: {XAI_Similarity_Metric_4}")
print(f"XAI Time Metric 5: {exec_time_Dice} seconds")

------------------------------------

## Write Out Metrics to XL

In [None]:
print(type(df_Selected_from_List))

In [None]:
write_xai_Metrics_to_XL(DiCE_xai_file_path, 
                        sample, 
                        DiCE_Identity_Metric, 
                        DiCE_Stability_Metric, 
                        DiCE_Seperability_Metric, 
                        DiCE_Similarity_Metric, 
                        exec_time_Dice, 
                        df_Selected_from_List,
                        "DiCE")