# Dissertation Experiment 3e
# Model Build and SHAP Metric x 2  (Credit Default) October Twenty Eight
Ciaran Finnegan October 2023

# Import Libraries + Custom Functions

## Import Libraries

In [None]:
# Import libs
import numpy as np
import pandas as pd
import xgboost as xgb
import lightgbm as lgb
from sklearn.ensemble import RandomForestRegressor,RandomForestClassifier
import shap
import random

from IPython.display import display, HTML
import matplotlib.pyplot as plt
import seaborn as sns

# Import libraries to build ANN model
import tensorflow as tf
from tensorflow import keras
from tensorflow.keras import layers


from sklearn.preprocessing import LabelEncoder
import warnings
import sklearn.metrics
from sklearn.metrics import classification_report
from sklearn.metrics import confusion_matrix
from sklearn.metrics import accuracy_score
from sklearn.metrics import roc_auc_score
from sklearn.utils import resample


# Classifier training (not used for explainability)
from sklearn.model_selection import train_test_split
from sklearn.model_selection import GridSearchCV

#Import libraries for explainer metrics
from sklearn.cluster import KMeans
from scipy.spatial import distance


# Additional display libraries
import contextlib
import os
import sys
from contextlib import contextmanager

## Custom Functions

Dataset Visualisations

In [None]:
%run ./DS_Visualisation_Functions.ipynb

Metrics

In [None]:
%run ./XAI_Metrics_Functions.ipynb

Model Evaluation Functions

In [None]:
%run ./DS_Model_Evaluation_Functions.ipynb

# Data Visualisation and Exploration

## Import Data

In [None]:
ds_file_to_load = 'credit_default_data.csv'
df = pd.read_csv(ds_file_to_load)

## Data Exploration

### Dataset Structure

In [None]:
# Display the first few rows of the dataset to understand its structure
styled_dataframe(df.head())

In [None]:
# Reset default Pandas display options
pd.reset_option('display.max_columns')
pd.reset_option('display.expand_frame_repr')
pd.reset_option('display.max_colwidth')
# Display the dataframe
display(df.head())

### Generate Visualizations

In [None]:
# Set up the target and features to be visualised

sTarget_feature = 'default'
sFeature_analysis_1 = 'LIMIT_BAL'
sFeature_analysis_2 = 'AGE'
sFeature_analysis_3 = 'SEX'
sFeature3_ticklabel1 = 'Male'
sFeature3_ticklabel2 = 'Female'

#### Generate Visualizations to better understand the data distribution and relationships between features.

#### Bar and Box Plot Visualisations

In [None]:
generate_box_plots(df, sTarget_feature, 
                       sFeature_analysis_1, 
                       sFeature_analysis_2, 
                       sFeature_analysis_3,
                       sFeature3_ticklabel1, 
                       sFeature3_ticklabel2)

#### Heatmap Visualisation

In [None]:
# Would need feature reduction to work effectively - or some other filtering

In [None]:
generate_heatmap(df, "Credit Default")

#### Distributions

In [None]:
generate_distributions(df, 
                       sFeature_analysis_1, 
                       sFeature_analysis_2, 
                       sFeature_analysis_3)

# Feature Engineering

## Check for Missing Data

In [None]:
# A column is discarded when more than 75% of its rows are null
threshold = 0.75 * len(df)

# Count the nulls per column once, then keep the names of the columns over the limit
null_counts = df.isnull().sum()
missing_columns = null_counts[null_counts > threshold].index

# Report which columns are about to be removed
print("Columns with more than 75% missing values:", missing_columns)

# Continue with a frame that no longer contains the sparsely populated columns
df = df.drop(columns=missing_columns)

# Save or continue processing with columns removed that had high volumes of missing data

In [None]:
# Display the first few rows of the dataset to re-check structure once any columns with 
# significant amounts of missing data have been removed
df.head()

## Categorical Data 

In [None]:
# List of categorical columns
cat_cols = ['SEX', 'EDUCATION', 'MARRIAGE', 'PAY_0', 'PAY_2', 'PAY_3', 'PAY_4', 'PAY_5', 'PAY_6']

In [None]:
# One-hot encode categorical variables (all levels are kept, i.e. no drop_first).
# dtype=int forces 0/1 integer indicator columns: pandas >= 2.0 emits booleans by default, and a
# frame mixing bool and numeric columns converts to an object array, which Keras typically
# cannot turn into a tensor when the frame is passed to fit()/predict().
df_encoded = pd.get_dummies(df, columns=cat_cols, dtype=int)

In [None]:
df.head()

In [None]:
# Display the first few rows of the dataset to understand its structure
df_encoded.head()

In [None]:
# Show every column of the encoded frame for this preview only: the display option is restored
# when the block exits, and only the first rows are rendered instead of dumping the whole frame
with pd.option_context('display.max_columns', None):
    display(df_encoded.head())

# Build Model

## Downsample Majority Class

In [None]:
# Check the distribution of the target variable
target_distribution = df_encoded['default'].value_counts()

target_distribution

In [None]:
# Separate the majority and minority classes
df_majority = df_encoded[df_encoded['default'] == 0]
df_minority = df_encoded[df_encoded['default'] == 1]

In [None]:
# Downsample the majority class
df_majority_downsampled = resample(df_majority, 
                                   replace=False, 
                                   n_samples=target_distribution[1], 
                                   random_state=42)

In [None]:
# Combine the downsampled majority class with the minority class
df_downsampled = pd.concat([df_majority_downsampled, df_minority])

In [None]:
# Shuffle the dataset to mix the data points
df_downsampled = df_downsampled.sample(frac=1, random_state=42).reset_index(drop=True)

In [None]:
# Display the distribution of the target variable in the downsampled dataset
df_downsampled['default'].value_counts()

## Split Features + Target

In [None]:
# Split data into features and target
#X = df_encoded.drop('default', axis=1)
#y = df_encoded['default']

In [None]:
# Splitting the features and target variable
X = df_downsampled.drop('default', axis=1)
y = df_downsampled['default']

## Split Data into Test/Training Datasets

In [None]:
# Split into inference and training splits
#X_train, X_inf, y_train, y_inf = train_test_split(X, y, test_size=0.30, random_state=42)

In [None]:
# Splitting the data into training and testing sets
X_train_downsampled, X_test_downsampled, y_train_downsampled, y_test_downsampled = train_test_split(
    X, y, test_size=0.2, random_state=42)

In [None]:
# Reset Indexes
X_train_downsampled = X_train_downsampled.reset_index(drop=True)
X_test_downsampled = X_test_downsampled.reset_index(drop=True)

y_train_downsampled = y_train_downsampled.reset_index(drop=True)
y_test_downsampled = y_test_downsampled.reset_index(drop=True)

In [None]:
# Split Train into train test
#X_train, X_test, y_train, y_test = train_test_split(X_train, y_train, test_size=0.30, random_state=42)

In [None]:
#X_train = X_train.reset_index(drop=True)
#X_test= X_test.reset_index(drop=True)
#X_inf = X_inf.reset_index(drop=True)

#y_train = y_train.reset_index(drop=True)
#y_test= y_test.reset_index(drop=True)
#y_inf = y_inf.reset_index(drop=True)

### Basic Additional Data Exploration (Training Data)

In [None]:
# Train model Stats
# After one-hot encoding each categorical column has been replaced by several indicator
# columns, so the continuous count cannot be obtained by subtracting len(cat_cols) from the
# encoded width. Columns that kept their original name are the untouched (non-categorical) ones.
continuous_cols = [col for col in X_train_downsampled.columns if col in df.columns]
n_encoded_categorical_cols = X_train_downsampled.shape[1] - len(continuous_cols)

print("Number of Features:", X_train_downsampled.shape[1])
print("Number Continuous Features:", len(continuous_cols))
print("Number Categorical Features:", len(cat_cols))
print("Number One-Hot Encoded Categorical Columns:", n_encoded_categorical_cols)
print("Number Train Examples:", X_train_downsampled.shape[0])
print("Number Positive Train Examples:", (y_train_downsampled == 1).sum())
print("Number Negative Train Examples:", (y_train_downsampled == 0).sum())

## Build Neural Network (w/TensorFlow/Keras)

In [None]:
import tensorflow as tf
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import Dense, Dropout
from tensorflow.keras.optimizers import Adam
import keras_tuner as kt

In [None]:
# Define the model with hyperparameter tuning
def build_model(hp):
    """
    Assemble and compile a binary-classification network from a set of hyperparameters.

    The input width is read from the notebook-level `X_train_downsampled` frame. The tunable
    values are requested from `hp` in a fixed order: input layer units and dropout, number of
    hidden layers, units and dropout for each hidden layer, and finally the learning rate.

    Args:
        hp: Hyperparameter container exposing Int / Float / Choice (the tuner passes one per
            trial; the best set is passed when the final model is rebuilt).

    Returns:
        A compiled Sequential model with a single sigmoid output unit.
    """
    n_input_features = X_train_downsampled.shape[1]
    network = Sequential()

    # Input block: tunable width followed by tunable dropout
    input_units = hp.Int('units_input', min_value=32, max_value=512, step=32)
    network.add(Dense(units=input_units, activation='relu', input_shape=(n_input_features,)))
    input_dropout = hp.Float('dropout_input', min_value=0.0, max_value=0.5, default=0.25, step=0.05)
    network.add(Dropout(rate=input_dropout))

    # Between one and three hidden blocks, each with its own width and dropout rate
    hidden_layer_count = hp.Int('n_layers', 1, 3)
    for layer_idx in range(hidden_layer_count):
        layer_units = hp.Int('units_' + str(layer_idx), min_value=32, max_value=512, step=32)
        network.add(Dense(units=layer_units, activation='relu'))
        layer_dropout = hp.Float('dropout_' + str(layer_idx), min_value=0.0, max_value=0.5, default=0.25, step=0.05)
        network.add(Dropout(rate=layer_dropout))

    # Single probability output for the binary target
    network.add(Dense(1, activation='sigmoid'))

    chosen_learning_rate = hp.Choice('learning_rate', values=[1e-2, 1e-3, 1e-4])
    network.compile(optimizer=Adam(learning_rate=chosen_learning_rate),
                    loss='binary_crossentropy', metrics=['accuracy'])

    return network

In [None]:
# Set up the tuner
# seed=42 fixes the random hyperparameter sampling so repeated runs explore the same trials,
# in line with the random_state=42 used for resampling and splitting elsewhere in this notebook.
# NOTE(review): trial results are stored under my_dir/helloworld; Keras Tuner presumably reuses
# them on a re-run unless overwrite=True is passed - confirm before changing the feature set.
tuner = kt.RandomSearch(
    build_model,
    objective='val_accuracy',
    max_trials=10,
    executions_per_trial=1,
    seed=42,
    directory='my_dir',
    project_name='helloworld'
)

In [None]:
# Perform hyperparameter tuning
tuner.search(X_train_downsampled, 
             y_train_downsampled, 
             epochs=50,#50 
             validation_split=0.2, 
             verbose=1)

In [None]:
# Get the best hyperparameters
best_hps = tuner.get_best_hyperparameters(num_trials=1)[0]

In [None]:
print(f"""
The hyperparameter search is complete. The optimal number of units in the first densely-connected
layer is {best_hps.get('units_input')} and the optimal learning rate for the optimizer
is {best_hps.get('learning_rate')}.
""")

In [None]:
# Build the model with the best hyperparameters
model = build_model(best_hps)

In [None]:
# Train the model
history = model.fit(
    X_train_downsampled, 
    y_train_downsampled, 
    epochs=50, #50
    batch_size=32, 
    validation_split=0.2, 
    verbose=1
)

# Evaluate Model

## Assess and Display Model Performance

### Tabular Data

In [None]:
y_pred_funct = display_model_metrics_tabular(model, X_test_downsampled, y_test_downsampled)

### Confusion Matrix

In [None]:
generate_confusion_matrix(y_test_downsampled, y_pred_funct)

# Generate Shap Values

In [None]:
# Create a SHAP explainer
explainer = shap.KernelExplainer(model.predict, shap.sample(X_train_downsampled, 100)) #100

In [None]:
import warnings

with warnings.catch_warnings():
    warnings.simplefilter('ignore')
    # Your code that produces warnings goes here
    shap_values = explainer.shap_values(X_test_downsampled.iloc[:100,:], silent=True) #100

In [None]:
# Create a SHAP summary plot
shap.summary_plot(shap_values, X_test_downsampled.iloc[:100,:], feature_names=X_train_downsampled.columns)

In [None]:
#plt.show()

# Prepare Data for Metric Calculations

In [None]:
import shap
import pandas as pd
import warnings

In [None]:
def generate_shap_explanations(model, data, target_column='default', 
                                output_instance_file='instances2.csv', 
                                output_shap_file='shap_values2.csv',
                                n_instances=25,
                                n_background=100):
    """
    Compute KernelExplainer SHAP values for the first rows of a dataset and save them to CSV.

    Args:
        model: Trained model exposing `predict`; it is called on feature rows by the explainer.
        data: DataFrame holding the feature columns plus `target_column`.
        target_column: Name of the label column, removed before explaining.
        output_instance_file: CSV path for the explained feature rows.
        output_shap_file: CSV path for the SHAP values of those rows.
        n_instances: Number of leading rows of `data` to explain (default 25).
        n_background: Number of rows sampled from `data` as the explainer background (default 100).

    Returns:
        Tuple `(instances_to_explain, df_shap_values)`: the explained feature rows and a
        DataFrame of SHAP values with the same columns, one row per explained instance
        (rows are in the same order; the SHAP frame uses a fresh default index).
    """
    # Drop the target column from the data
    data_features = data.drop(columns=[target_column])
    
    # Select the leading rows that will be explained
    instances_to_explain = data_features.iloc[:n_instances, :]
    
    # Create a SHAP explainer using a sample of the features as background data
    explainer = shap.KernelExplainer(model.predict, shap.sample(data_features, n_background))
    
    # Generate SHAP values for the instances, silencing the warnings raised along the way
    with warnings.catch_warnings():
        warnings.simplefilter('ignore')
        shap_values = explainer.shap_values(instances_to_explain)
    
    # Reduce the SHAP output to a 2-D (instances x features) array
    if isinstance(shap_values, list):
        # One array per model output: average the SHAP values over all outputs
        shap_values = np.mean(shap_values, axis=0)
    else:
        shap_values = np.asarray(shap_values)
        if shap_values.ndim == 3:
            # NOTE(review): newer SHAP releases appear to return a single array laid out as
            # (instances, features, outputs) instead of a list - averaged over outputs here
            # to mirror the list branch; confirm against the installed SHAP version.
            shap_values = shap_values.mean(axis=-1)
    df_shap_values = pd.DataFrame(shap_values, columns=data_features.columns)
    
    # Output the SHAP values to a csv file
    df_shap_values.to_csv(output_shap_file, index=False)
    
    # Output the instances to a csv file
    instances_to_explain.to_csv(output_instance_file, index=False)
    
    return instances_to_explain, df_shap_values

In [None]:
# Explain held-out test instances with the trained `model`.
# The stability calculations further down pair row i of the SHAP values with
# y_test_downsampled[i], so the explained rows must come from the index-reset test split rather
# than from the first rows of the full encoded dataset (whose labels are unrelated to y_test).
# Side effect: the explainer background is now sampled from the test features as well.
df_test_with_target = X_test_downsampled.assign(default=y_test_downsampled)
df_instances, df_shap_values = generate_shap_explanations(model, df_test_with_target)

# Generate Shap Values

## Single Random Observation

In [None]:
# Select a random observation from the test dataset
random_observation = X_test_downsampled.sample(1, random_state=42)

In [None]:
# Generate SHAP values for the instances
with warnings.catch_warnings():
    warnings.simplefilter('ignore')
    # Your code that produces warnings goes here
    shap_values_random_observation = explainer.shap_values(random_observation)

In [None]:
# Get the SHAP values for class 1 (default) for this observation
shap_values_observation_class1 = shap_values_random_observation[0]

In [None]:
# Convert SHAP values to a Series for easier manipulation
shap_values_series = pd.Series(shap_values_observation_class1[0], index=random_observation.columns)

In [None]:
# Order the features by absolute SHAP magnitude while keeping each value's sign, so the
# 'SHAP Value' column shown later still tells whether a feature pushed the prediction up or down
sorted_features = shap_values_series.sort_values(key=lambda values: values.abs(), ascending=False)

In [None]:
# Display the top 20 features for the random observation in an aesthetically pleasing tabular format
top_20_features_observation = sorted_features.head(20)
top_20_features_df_observation = pd.DataFrame({'Feature': top_20_features_observation.index, 
                                               'SHAP Value': top_20_features_observation.values})

In [None]:
# Display the index (row number) of the selected observation
print(f"Selected Row Number from Test Data: {random_observation.index[0]}")

In [None]:
# Display the SHAP values for the top 20 features of the observation
print("\nTop 20 Features and Their SHAP Values:")
display(HTML(xai_styles + top_20_features_df_observation.to_html(index=False)))

# Generate XAI Metrics 

## Identity

#### Run a Basic Test First

In [None]:
# Select two random instances from the SHAP value dataframe
df_xai_numerical = df_shap_values

# Seeded generator so the same pair of instances is drawn on every run
pair_rng = np.random.default_rng(42)
random_indices = pair_rng.choice(df_xai_numerical.index, size=2, replace=False)
# The sampled values are index labels, so they are looked up with .loc (not positional .iloc)
instance_1 = df_xai_numerical.loc[random_indices[0]]
instance_2 = df_xai_numerical.loc[random_indices[1]]

# Compute the Euclidean distance between the selected instances - uses custom project function.
# The result gets its own name: binding it to `distance` would overwrite the scipy.spatial
# `distance` module imported at the top of the notebook and break later distance.euclidean calls.
euclidean_dist = get_euclidean_distance(instance_1, instance_2)
print(f"Euclidean distance between instance {random_indices[0]} and instance {random_indices[1]}: {euclidean_dist:.4f}")

#### Retrieve Identity Score

In [None]:
from scipy.spatial import distance

In [None]:
SHAP_Identity_Metric = get_identity_metric(df_instances, df_shap_values, "SHAP")

In [None]:
def match_percentage_ident1(features_df, shap_values_df):
    """
    For each instance in the feature dataframe, this function identifies the closest other
    instance based on Euclidean distance. It then does the same for the corresponding SHAP
    values. The function checks if the closest instances for both features and SHAP values match.

    Args:
        features_df: DataFrame of numeric feature rows, one row per instance.
        shap_values_df: DataFrame of SHAP values; must share the index labels of `features_df`,
            since each SHAP row is looked up by the feature row's label.

    Returns:
        Percentage (0-100) of instances where the closest feature and SHAP value instances
        match. Returns 0.0 when fewer than two instances are supplied, because no nearest
        neighbour exists in that case.
    """
    # Imported locally under a private alias so the function keeps working even when the
    # notebook-level name `distance` has been rebound to something other than the scipy module.
    from scipy.spatial import distance as _sp_distance

    total_instances = len(features_df)
    if total_instances < 2:
        print("At least two instances are required to compute nearest neighbours; returning 0.0")
        return 0.0

    # Initialize match count to zero
    match_count = 0
    
    # Loop through each instance in the feature dataframe
    for idx, instance in features_df.iterrows():
        # Compute the Euclidean distance between the current instance and all other instances
        feature_distances = features_df.drop(index=idx).apply(
            lambda row: _sp_distance.euclidean(row, instance), axis=1)
        
        # Identify the index of the closest instance
        closest_feature_idx = feature_distances.idxmin()
        
        # Repeat the process for SHAP values
        shap_instance = shap_values_df.loc[idx]
        shap_distances = shap_values_df.drop(index=idx).apply(
            lambda row: _sp_distance.euclidean(row, shap_instance), axis=1)
        closest_shap_idx = shap_distances.idxmin()
        
        # Check if the closest instances for both features and SHAP values match
        if closest_feature_idx == closest_shap_idx:
            match_count += 1
        
        # Print the distances for debugging purposes
        print(f"Instance {idx}:   Current matches: {match_count}")
        print(f"\tClosest feature instance: {closest_feature_idx} (Distance: {feature_distances[closest_feature_idx]:.4f})")
        print(f"\tClosest SHAP instance: {closest_shap_idx} (Distance: {shap_distances[closest_shap_idx]:.4f})")

    # Compute the matching percentage
    percentage = (match_count / total_instances) * 100
    print(f"\nPercentage of matches: {percentage:.2f}%   {match_count} Matches of {total_instances} Entries")
    
    return percentage

In [None]:
# Test the function
match_percentage_ident1(df_instances, df_shap_values)

#### Display Identity Score Metric

In [None]:
SHAP_Identity_Metric

In [None]:
SHAP_Identity_Number = "{:.2f}%".format(SHAP_Identity_Metric)

In [None]:
display_text("SHAP Identity Metric Score: " + SHAP_Identity_Number)

## Stability

### Invoke Stability Metric Function

#### Retrieve Stability Score

In [None]:
SHAP_Stability_Metric = get_stability_metric_y(df_shap_values, y_test_downsampled, 'SHAP')

#### Display Stability Score Metric

In [None]:
SHAP_Stability_Metric

In [None]:
SHAP_Stability_Number = "{:.2f}%".format(SHAP_Stability_Metric)

In [None]:
display_text("SHAP Stability Metric Score: " + SHAP_Stability_Number)

In [None]:
def calc_stability_csv(shap_values_df, y_true=None, output_file='clustered_stability.csv'):
    """
    This function performs the following steps:
    1. Clusters the SHAP values into two clusters using the k-means algorithm.
    2. Assigns the actual target value to each instance in the SHAP values dataframe.
    3. Calculates the percentage of rows where the target class '0' matches the cluster value '0'.
    4. Outputs the final dataframe with cluster assignments and actual target values to a CSV file.

    Args:
        shap_values_df: DataFrame of SHAP values, one row per explained instance.
        y_true: Series of actual target values, looked up by the index labels of
            `shap_values_df`. Defaults to the notebook-level `y_test_downsampled`.
        output_file: Path of the CSV file that receives the clustered dataframe.

    Returns:
        Percentage of class '0' instances that fall in cluster '0', or 0.0 when the
        selected instances contain no class '0' rows.
    """
    if y_true is None:
        # Backward-compatible default: the labels of the notebook's test split
        y_true = y_test_downsampled
    
    # Cluster the SHAP values into two clusters
    kmeans = KMeans(n_clusters=2, random_state=42).fit(shap_values_df)
    
    # Get the cluster labels
    cluster_labels = kmeans.labels_
    
    # Rename clusters so that the largest cluster is always labeled '0'.
    # Both branches map to string labels: the comparisons below are made against '0' / '1',
    # so leaving the raw integer labels in place would make every comparison fail.
    if sum(cluster_labels) > len(cluster_labels) / 2:
        cluster_names = {0: '1', 1: '0'}
    else:
        cluster_names = {0: '0', 1: '1'}
    
    # Create a new dataframe with an additional column indicating the cluster assignment
    clustered_df = shap_values_df.copy()
    clustered_df['Cluster'] = [cluster_names[label] for label in cluster_labels]
    
    # Print the number of instances assigned to each cluster
    cluster_0_count = clustered_df[clustered_df['Cluster'] == '0'].shape[0]
    cluster_1_count = clustered_df[clustered_df['Cluster'] == '1'].shape[0]
    print(f"Number of Instances in Cluster '0': {cluster_0_count}")
    print(f"Number of Instances in Cluster '1': {cluster_1_count}")
    
    # Assign the appropriate subset of target values to the dataframe based on the selected indices
    clustered_df['Actual'] = y_true.loc[clustered_df.index].values
    
    # Count the rows where the target class '0' matches the cluster value '0'
    matches_0 = clustered_df[(clustered_df['Cluster'] == '0') & (clustered_df['Actual'] == 0)].shape[0]
    total_class_0 = clustered_df[clustered_df['Actual'] == 0].shape[0]
    
    # Count the rows where the target class '1' matches the cluster value '1'
    matches_1 = clustered_df[(clustered_df['Cluster'] == '1') & (clustered_df['Actual'] == 1)].shape[0]
    total_class_1 = clustered_df[clustered_df['Actual'] == 1].shape[0]
    
    # Print the results for class '0'
    print(f"\nFor Class '0':")
    print(f"Total Instances: {total_class_0}")
    print(f"Matching Cluster '0' Instances: {matches_0}")
    
    # Print the results for class '1'
    print(f"\nFor Class '1':")
    print(f"Total Instances: {total_class_1}")
    print(f"Matching Cluster '1' Instances: {matches_1}")
    
    # Output the final dataframe to a CSV file
    clustered_df.to_csv(output_file, index=True)
    print(f"\nOutput saved to '{output_file}'")
    
    # Guard against a selection that contains no class '0' rows at all
    if total_class_0 == 0:
        return 0.0
    return (matches_0 / total_class_0) * 100

In [None]:
# Test the function
calc_stability_csv(df_shap_values)