<a href="https://colab.research.google.com/github/MDankloff/ClusterCompas/blob/main/V3_COMPAS_Clustering_K_Means.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

# Libraries

In [1]:
import numpy as np
import matplotlib.pyplot as plt
import pandas as pd
import seaborn as sns
import random
import os
from sklearn.cluster import KMeans, DBSCAN
from sklearn.neighbors import NearestNeighbors
from sklearn.cluster import MeanShift, estimate_bandwidth
from sklearn.preprocessing import StandardScaler, MinMaxScaler
from sklearn.decomposition import PCA
from sklearn.manifold import TSNE
from scipy import stats
from scipy.stats import ttest_ind
from scipy.stats import f_oneway

import warnings
warnings.filterwarnings('ignore')

# Data preparation

In [2]:
data_shaperr = pd.read_csv('/content/Compas_w_error_shap.csv')
data_shaper = data_shaperr.drop(['sex', 'race', 'predicted_class', 'true_class'], axis=1)
#data_shaperr.head()
#data_shaperr.info()
data_shaper.iloc[3]

age_unscaled                                  31.0
decile_score_unscaled                          7.0
priors_count_unscaled                          0.0
sex_Female_unscaled                            0.0
sex_Male_unscaled                              1.0
race_African-American_unscaled                 1.0
race_Asian_unscaled                            0.0
race_Caucasian_unscaled                        0.0
race_Hispanic_unscaled                         0.0
race_Native American_unscaled                  0.0
race_Other_unscaled                            0.0
age_scaled                               -0.321161
decile_score_scaled                       0.871941
priors_count_scaled                       -0.71124
sex_Female_scaled                        -0.489624
sex_Male_scaled                           0.489624
race_African-American_scaled              0.975623
race_Asian_scaled                         -0.06675
race_Caucasian_scaled                    -0.718015
race_Hispanic_scaled           

In [3]:
#Master Dataset
META_COL = ['clusters', 'new_clusters']
ERROR_COL = ['errors', 'TP', 'TN', 'FN', 'FP']
BASIC_COL_unscaled = ['age_unscaled', 'decile_score_unscaled', 'priors_count_unscaled']
DUMMY_unscaled = ['sex_Female_unscaled', 'sex_Male_unscaled','race_African-American_unscaled', 'race_Asian_unscaled', 'race_Caucasian_unscaled', 'race_Hispanic_unscaled',
                      'race_Native American_unscaled', 'race_Other_unscaled']

BASIC_COL_scaled = ['age_scaled', 'decile_score_scaled', 'priors_count_scaled']
DUMMY_scaled = ['sex_Female_scaled', 'sex_Male_scaled', 'race_Native American_scaled','race_Other_scaled',
                'race_African-American_scaled', 'race_Asian_scaled', 'race_Caucasian_scaled', 'race_Hispanic_scaled']

SHAP_COL_Basic_scaled = ['Shap_age_basicscaled', 'Shap_decile_score_basicscaled', 'Shap_priors_count_basicscaled']
SHAP_COL_Dummy_scaled = ['Shap_sex_Female_basicscaled', 'Shap_sex_Male_basicscaled',
                         'Shap_race_African-American_basicscaled', 'Shap_race_Asian_basicscaled', 'Shap_race_Caucasian_basicscaled',
                         'Shap_race_Hispanic_basicscaled', 'Shap_race_Native American_basicscaled', 'Shap_race_Other_basicscaled']
#SHAP_META =['clusters', 'new_clusters', 'predicted_class', 'true_class', 'errors', 'TP', 'TN', 'FN', 'FP']

#SHAP_COL_Basic_unscaled = ['Shap_age_basicunscaled', 'Shap_decile_score_basicunscaled', 'Shap_priors_count_basicunscaled']
#SHAP_COL_Dummy_unscaled = ['Shap_sex_Female_basicunscaled', 'Shap_sex_Male_basicunscaled','Shap_race_African-American_basicunscaled',
#'Shap_race_Asian_basicunscaled', 'Shap_race_Caucasian_basicunscaled','Shap_race_Hispanic_basicunscaled', 'Shap_race_Native American_basicunscaled', 'Shap_race_Other_basicunscaled']

META_COL_VIZ = ['Error_Type']

# DATA PREP UTILS

In [21]:
#Seperate TPFN & TNFP dataset
'''Drop rows where both TP and FN are 0 '''
def drop_zero_TP_FN(data):
    return data.loc[(data['TP'] == 1) | (data['FN'] == 1)]

'''Drop rows where both TN and FP are 0'''
def drop_zero_TN_FP(data):
    return data.loc[(data['TN'] == 1) | (data['FP'] == 1)]

TPFN_all = drop_zero_TP_FN(data_shaper)
TNFP_all = drop_zero_TN_FP(data_shaper)

#scaled and unscaled version
TPFN_scaled = TPFN_all.drop(BASIC_COL_unscaled + DUMMY_unscaled, axis=1)
TPFN_unscaled = TPFN_all.drop(BASIC_COL_scaled + DUMMY_scaled, axis=1)
TNFP_scaled = TNFP_all.drop(BASIC_COL_unscaled + DUMMY_unscaled, axis=1)
TNFP_unscaled = TNFP_all.drop(BASIC_COL_scaled + DUMMY_scaled, axis=1)

#TNFP.head()
#TPFN_scaled.iloc[3]
TNFP_unscaled.iloc[3]

age_unscaled                                  37.0
decile_score_unscaled                          1.0
priors_count_unscaled                          0.0
sex_Female_unscaled                            1.0
sex_Male_unscaled                              0.0
race_African-American_unscaled                 1.0
race_Asian_unscaled                            0.0
race_Caucasian_unscaled                        0.0
race_Hispanic_unscaled                         0.0
race_Native American_unscaled                  0.0
race_Other_unscaled                            0.0
errors                                         0.0
TP                                             0.0
TN                                             1.0
FN                                             0.0
FP                                             0.0
Error_Type                                      TN
Shap_age_basicscaled                     -0.057228
Shap_decile_score_basicscaled            -0.176166
Shap_priors_count_basicscaled  

In [22]:
#drop Na's
TPFN_scaled = TPFN_scaled.dropna()
TPFN_unscaled = TPFN_unscaled.dropna()
TNFP_scaled = TNFP_scaled.dropna()
TNFP_unscaled = TNFP_unscaled.dropna()

In [23]:
'''New Initialization Function for Dataset. Returns a dataset with scaled features and new columns for clusters = 0 and new_clusters = -1 which is required for HBAC'''

def initialize_dataset(data, unscaled =True, meta_col =True, with_classes =True):

#1. Make deep copy
  new_data = data.copy(deep=True)

#2. Ignore cluster and new cluster for scaling and add them back in step 4
  if meta_col:
    new_data = new_data.drop(columns =META_COL + ['clusters', 'new_clusters'], errors = 'ignore')

#3. Scale 1) All features or 2) only SHAP_COL, and ERROR_COL
  if unscaled:
    features_to_scale = SHAP_COL_Dummy_scaled + SHAP_COL_Basic_scaled + ERROR_COL + BASIC_COL_unscaled + DUMMY_unscaled
  else:
    features_to_scale = SHAP_COL_Dummy_scaled + SHAP_COL_Basic_scaled + ERROR_COL

  new_data[features_to_scale] = StandardScaler().fit_transform(new_data[features_to_scale])

#4. add back META_COLUMN
  if with_classes: #making sure that the class columns are retained in new dataset
    for col in META_COL:
      if col in data.columns:
          new_data[col] = data[col]

  new_data['clusters'] = 0
  new_data['new_clusters'] = -1

#5. Return dataset
  return new_data

#example usage
TPFN_init = initialize_dataset(TPFN_unscaled, unscaled = True, meta_col = True, with_classes = True)
TNFP_init = initialize_dataset(TNFP_unscaled, unscaled = True, meta_col = True, with_classes = True)
#TPFN_scaled2 = initialize_dataset(TPFN_scaled, unscaled = False, meta_col = True, with_classes = True) #will give the same dataset
#TNFP_scaled2 = initialize_dataset(TNFP_scaled, unsclaed = False, meta_col = True, with_classes = True) #will give the same dataset


In [25]:
TPFN_init.iloc[3]
#TNFP_init.iloc[3]

age_unscaled                              3.291936
decile_score_unscaled                    -1.641946
priors_count_unscaled                    -0.514283
sex_Female_unscaled                      -0.409297
sex_Male_unscaled                         0.409297
race_African-American_unscaled           -1.177237
race_Asian_unscaled                      -0.056183
race_Caucasian_unscaled                   1.513227
race_Hispanic_unscaled                   -0.267351
race_Native American_unscaled            -0.050236
race_Other_unscaled                      -0.211441
errors                                   -0.856052
TP                                        0.856052
TN                                             0.0
FN                                       -0.856052
FP                                             0.0
Error_Type                                      TP
Shap_age_basicscaled                      0.337445
Shap_decile_score_basicscaled             0.561073
Shap_priors_count_basicscaled  

In [7]:
'''Initialize dataset to scale the features and errors which can be in/excluded for clustering.
Returns a scaled dataset with new columns "clusters" = 0 and "new_clusters" = -1, which is required for HBAC

def initialize_dataset(data, with_errors=True, just_features=True, scale_features=True, with_classes=True, with_Dummy= True):

    new_data = data.copy(deep=True).dropna() #remove rows with NaN values

    if just_features: #Check if the columns exist before dropping
      new_data = new_data.drop(columns=META_COL, errors = 'ignore')
      if 'clusters' in new_data.columns:
        new_data = new_data.drop('clusters', axis=1)
      if 'new_clusters' in new_data.columns:
        new_data = new_data.drop('new_clusters', axis=1)

    if with_Dummy:
      for col in DUMMY_RACE + DUMMY_GENDER:
        if col in new_data.columns:
          one_hot = pd.get_dummies(new_data[col], prefix=col)
          new_data = new_data.drop(col, axis=1) #drop original dummy columns
          new_data = pd.concat([new_data, one_hot], axis=1)

    #Seperate features before scaling - drop metacols from features for scaling
    features = new_data.drop(META_COL, errors='ignore') #? already happened

    if scale_features:
    #Ensure only numeric columns are scaled + try using minmax and standard scaler
      numeric_cols = features.select_dtypes(include=['number']).columns
      new_data[numeric_cols] = StandardScaler().fit_transform(features[numeric_cols])
      #new_data[numeric_cols] = MinMaxScaler().fit_transform(features[numeric_cols])

    if with_errors:
        if all(col in new_data.columns for col in ERROR_COL):
          new_data[ERROR_COL] *= 0.8 #scaling factor

    if with_classes: #making sure that the class columns are retained in new dataset
      for col in META_COL:
        if col in data.columns:
          new_data[col] = data[col]

    new_data['clusters'] = 0
    new_data['new_clusters'] = -1

    return new_data

TPFN = initialize_dataset(TPFN)
TPFN.head()
'''

'Initialize dataset to scale the features and errors which can be in/excluded for clustering.\nReturns a scaled dataset with new columns "clusters" = 0 and "new_clusters" = -1, which is required for HBAC \n\ndef initialize_dataset(data, with_errors=True, just_features=True, scale_features=True, with_classes=True, with_Dummy= True):\n\n    new_data = data.copy(deep=True).dropna() #remove rows with NaN values\n\n    if just_features: #Check if the columns exist before dropping\n      new_data = new_data.drop(columns=META_COL, errors = \'ignore\')\n      if \'clusters\' in new_data.columns:\n        new_data = new_data.drop(\'clusters\', axis=1)\n      if \'new_clusters\' in new_data.columns:\n        new_data = new_data.drop(\'new_clusters\', axis=1)\n\n    if with_Dummy:\n      for col in DUMMY_RACE + DUMMY_GENDER:\n        if col in new_data.columns:\n          one_hot = pd.get_dummies(new_data[col], prefix=col)\n          new_data = new_data.drop(col, axis=1) #drop original dummy colu

In [None]:
'''undo Dummy for DUMMY_RACE or DUMMY_GENDER'''
def undo_dummy(data, with_Dummy, col_label, numeric_values=True, short_label=None):
  data[col_label] = ''
  for i, c in enumerate(with_Dummy):
    values = np.sort(data[c].unique())
    if numeric_values:
      data.loc[data[c] == values[1], col_label] = i
    else:
      if short_label is None:
        raise ValueError("short label must be provided if numeric_values is False")
        data.loc[data[c] == values[1], col_label] = short_label[i]
    data = data.drop(c, axis=1)
  return(data)

#data = undo_dummy(data, DUMMY_RACE, col_label='race', numeric_values=False, short_label=SHORT_LABEL_RACE)
#data = undo_dummy(data, DUMMY_GENDER, col_label='gender', numeric_values=False, short_label=SHORT_LABEL_GENDER)

# UTILS for BIAS in ERROR DIFFERENCE

In [None]:
#Calculate average Error rate based on unscaled error rate by counting the amount of max values (1) and dividing them by the total nr of rows - replacing old accuracy_error()
def get_error_rate(data, column = 'errors'):
  if len(data) == 0:
    print ('calculating error rate on an empty cluster')
    return
  max_value = data[column].max()
  count_max_value = (data[column] == max_value).sum()
  average_error_rate = count_max_value / len(data)
  return average_error_rate

'''Calculate BIAS in terms of Error Difference
bias_type can be 'negative', 'positive' or 'absolute'
baseline can be 'all' which is the overall error rate, or 'other' or 'best' '''

def get_error_diff(data, cluster_id, cluster_col, bias_type = 'negative', baseline= 'all', #function= get_error_rate
                   ):
  cluster_x= data.loc[data[cluster_col] == cluster_id]
  remaining_clusters = data.loc[data[cluster_col] != cluster_id]

  if len(cluster_x) == 0:
    print ('calculating error difference on an empty cluster')
    return

  if baseline == 'all':
    error_diff = get_error_rate(cluster_x) - get_error_rate(data)

  elif baseline == 'other':
    if len(remaining_clusters) == 0:
      print ("This cluster is the entire dataset. Cluster:", cluster_id)
      return
    error_diff = get_error_rate(cluster_x) - get_error_rate(remaining_clusters)

  elif baseline == 'best':
    best_cluster = get_min_bias_cluster(data, cluster_col, bias_type, baseline)
    error_diff = get_error_rate(cluster_x) - best_cluster[1]

  else:
    print ('unknown baseline')
    return

  #if full_info:
    #return [error_diff, function(cluster_x), function(remaining_clusters)]

  if bias_type == 'negative':
    pass #no change needed
  elif bias_type == 'positive':
    error_diff = -error_diff
  elif bias_type == 'absolute':
    error_diff = np.absolute(error_diff)
  else:
    print("unknown bias type")
    return

  return error_diff

# UTILS for VISUALS

In [None]:
def pca_plot(data, title, alpha):
    # Extract features for PCA and drop Meta_colums
    pca_features = data.drop(META_COL + ERROR_COL, axis=1)
    other_features = data[META_COL + ERROR_COL]

    # Apply PCA with 2 components to scaled features and create a df for the resulting principal components
    pca = PCA(n_components=2)
    pca_result = pca.fit_transform(pca_features)
    pca_df = pd.DataFrame(pca_result, index=pca_features.index, columns=['PC1', 'PC2'])

    # Create temporary dataset that contains both principal components and other features
    temp_dataset = pca_df.join(other_features, how='left')

    # Create scatterplot using seaborn
    scatterplot = sns.scatterplot(data=temp_dataset, x='PC1', y='PC2', alpha=alpha, hue="clusters", palette='tab10', style='Error_Type')
    scatterplot.set_title(title)
    scatterplot.legend(loc='center left', bbox_to_anchor=(1.0, 0.5), ncol=1)

    explained_variance_ratio = pca.explained_variance_ratio_
    print(f"Explained Variance Ratio: PC1 = {explained_variance_ratio[0]:.2f}, PC2 = {explained_variance_ratio[1]:.2f}")

    plt.show()

def tsne_plot(data, title, perplexity, learning_rate, n_iter, alpha = 0.5):
    # Extract features for TSNE and drop Meta_colums
    tsne_features = data.drop(META_COL + ERROR_COL, axis=1)
    other_features = data[META_COL + ERROR_COL]

    tsne = TSNE(n_components=2, perplexity= 30, learning_rate= 200, n_iter= 1000)
    tsne_result = tsne.fit_transform(tsne_features)
    tsne_df = pd.DataFrame(tsne_result, index = tsne_features.index, columns=['t-SNE Component 1', 't-SNE Component 2'])

    temp_dataset = tsne_df.join(other_features, how='left')

    # Create scatterplot using seaborn
    scatterplot = sns.scatterplot(data=temp_dataset, x='t-SNE Component 1', y='t-SNE Component 2', alpha=alpha, hue="clusters", palette='tab10', style='Error_Type')
    scatterplot.set_title(title)
    scatterplot.legend(loc='center left', bbox_to_anchor=(1.0, 0.5), ncol=1)

    plt.show()


In [None]:
FSE_tpfn = initialize_dataset(TPFN_data)
#pca_plot(TP_FN, 'Compas', 0.6)
tsne_plot(FSE_tpfn, 'Compas', 30, 200, 1000)

KeyError: "['TP', 'TN', 'FN', 'FP'] not found in axis"

In [None]:
FSE_tnfp = initialize_dataset(TNFP_data)
#pca_plot(TN_FP, 'Compas', 0.6)
tsne_plot(FSE_tnfp, 'Compas', 30, 200, 1000)

# UTILS FOR CLUSTERING

In [None]:
#get cluster with max error difference
def get_max_bias_cluster(data, cluster_col= 'clusters', bias_type = 'negative', baseline = 'all', function = get_error_diff):
  max_bias = 0 #min possible bias
  max_bias_cluster = -2

  for cluster_id in data[cluster_col].unique():
    if cluster_id == -1: #outliers in dbscan
      continue

    current_bias = function(data, cluster_id, cluster_col, bias_type, baseline) #pos function to find highest bias

    #if isinstance(current_bias, list):
     # current_bias = current_bias[0]

    if current_bias > max_bias:
      max_bias = current_bias
      max_bias_cluster = cluster_id

    return(max_bias_cluster, max_bias)

#get cluster with min error difference
def get_min_bias_cluster(data, cluster_col= 'clusters', bias_type = 'negative', baseline = 'all'):
  min_bias = 1 #max possible bias and look for smt smaller
  min_bias_cluster = -2

  for cluster_id in data[cluster_col].unique():
    if cluster_id == -1: #outliers in dbscan
      continue
    current_bias = get_error_diff(data, cluster_id, cluster_col, bias_type, baseline)
    if current_bias < min_bias:
      min_bias = current_bias
      min_bias_cluster = cluster_id
  return(min_bias_cluster, min_bias)

#get size of the smallest cluster
def get_min_cluster_size(data, cluster_col = 'new_clusters'):
  min_cluster_size = len(data)
  for i in data['new_clusters'].unique():
    if i == -1: #exclude the -1 clusters as they may present outliers (in dbscan?)
      continue
      size = len(data.loc[data['new_clusters'] == i])
      if size < min_cluster_size: #update if new cluster size is smaller
        min_cluster_size = size
  return(min_cluster_size)

def get_random_cluster(data, cluster_col, min_splittable_cluster_size, previous_cluster, all_cluster_ids):
  for candidate_cluster_id in all_cluster_ids:
    if candidate_cluster_id == -1 or candidate_cluster_id == previous_cluster:
      continue
      print ('This is the random cluster we picked:', candidate_cluster_id)

      candidate_cluster = data.loc[data[cluster_col] == candidate_cluster_id]
      if len(candidate_cluster) >= min_splittable_cluster_size:
        print('it is too small:', len(candidate_cluster))
        continue
      else:
        return candidate_cluster_id

def select_new_cluster(data, cluster_col='clusters', error_column='errors', overall_error_rate=0.5, bias_type='negative', baseline='all'):
    smallest_diff = float('inf')
    selected_cluster = None

    for cluster_id in data[cluster_col].unique():
        if cluster_id == -1: #skip outlier
            continue

        error_diff = get_error_diff(data, cluster_id, cluster_col, bias_type, baseline) #calculate the error_diff for each cluster

        if error_diff is None:
            continue

        abs_diff = abs(overall_error_rate - (get_error_rate(data[data[cluster_col] == cluster_id]))) #get cluster with the smallest absolute difference with the overall error rate (0.5)

        if abs_diff < smallest_diff:
            smallest_diff = abs_diff
            selected_cluster = cluster_id
    return selected_cluster

def exit_clustering(data, msg='', bias_type='', iter=''):
  print('Iteration ', iter, ': ', msg)
  print('Overall error rate: ', get_error_rate(data))
  for c in np.sort(data['clusters'].unique()):
    print('Cluster: ', c, '\tSize: ', len(data.loc[data['clusters'] == c]), '\tError rate: ', get_error_rate(data.loc[data['clusters'] == c]))
  pca_plot(data,'HBAC-DBSCAN on COMPAS - ' + bias_type + ' bias', hue='clusters', s=15, alpha=0.8)
  return data


# K-MEANS CLUSTERING

TP FN DATA

In [None]:
#FSE_tpfn
def hbac_kmeans(data = FSE_tpfn, max_iter = 300, show_plot= True):
  clus_model_kwargs = { "n_clusters": 2, #split in two clusters
    "init": "k-means++", # method for initializing k-means++: first centroid is chosen randomly and subsequent centriods are selected based on max distance from the nearest centriod
    "n_init": 10, #K-means is sensitive to the initial placement of cluster centers - running it 10 times with different initial seeds
    "max_iter": max_iter,} #max mr of iterations for k-means in a single run. If convergence is not achieved within 300 the algorithm stops

  x = 0 #initial cluster nr
  initial_bias = 0

  error_list = []
  error_rate = get_error_rate(data, column = 'FN') #Calculating mean error rate on FN column

  min_splittable_cluster_size = round(0.05 * len(data))
  min_acceptable_cluster_size = round(0.03 * len(data))
  print("error rate:", error_rate)

  #Loop for clustering iterations
  for i in range(1, max_iter):
    if len(data['clusters'].unique()) != 1:
      error_list.append(get_error_rate(data)) #the error rate is calculated if the nr of unique clusters is not equal to 1
      data['new_clusters'] = -1
    candidate_cluster = data.loc[data['clusters'] == x]

    if len(candidate_cluster) < min_splittable_cluster_size:
      x = get_random_cluster(data, 'clusters', min_splittable_cluster_size, x, data['clusters'].unique())
      continue

    kmeans = KMeans(**clus_model_kwargs).fit(candidate_cluster.drop(META_COL,axis=1))

    candidate_cluster['new_clusters'] = pd.DataFrame(kmeans.predict(candidate_cluster.drop(META_COL, axis=1)), index=candidate_cluster.index)
    data['new_clusters'] = candidate_cluster['new_clusters'].combine_first(data['new_clusters'])

    discr_bias = get_error_diff(data, x, 'clusters', bias_type = 'negative', baseline= 'all')
    print('discriminative bias:', discr_bias)

    min_cluster_size = get_min_cluster_size(data)
    print('Smallest cluster size:', min_cluster_size)

    if (discr_bias >= initial_bias) & (min_cluster_size > min_acceptable_cluster_size):
      print ("adding a new cluster")
      n_cluster = max(data['clusters'])
      data['clusters'][data['new_clusters'] == 1] = n_cluster + 1

      if show_plot:
        #pca_plot(data, 'K-means for False Negatives', 0.6)
        tsne_plot(data, 'K-means for False Negatives', perplexity = 30, learning_rate = 200, n_iter = 1000, alpha = 0.5)
        plt.show()

      x = select_new_cluster(data, error_column = 'FN')
      initial_bias = discr_bias

    else:
      x = get_random_cluster(data,'clusters', min_splittable_cluster_size, x, data['clusters'].unique())

  print('MAX_ITER')
  print(error_list)
  return data


In [None]:
hbac_kmeans(data = FSE_tpfn, max_iter = 300, show_plot= False) #True)
plt.show()

In [None]:
c= get_max_bias_cluster(FSE_tpfn)
#highest_bias_cluster = TP_FN[TP_FN['clusters']==c]
#len(highest_bias_cluster)

Mean_error_rate_TPFN = get_error_rate(FSE_tpfn)

print ('Mean error rate of full TPFN data set:', Mean_error_rate_TPFN)

print (f" cluster {c} has the highest discrimination bias for TPFN data")


TN FP DATA

In [None]:
'''TN FP DATA'''
def hbac_kmeans(data = FSE_tnfp, max_iter = 300, show_plot= True):
  clus_model_kwargs = { "n_clusters": 2, "init": "k-means++", "n_init": 10, "max_iter": max_iter,}

  x = 0 #initial cluster nr
  initial_bias = 0

  error_list = []
  error_rate = get_error_rate(data, column = 'FP') #Calculating mean error rate on FP column

  min_splittable_cluster_size = round(0.05 * len(data))
  min_acceptable_cluster_size = round(0.03 * len(data))
  print("error rate:", error_rate)

  #Loop for clustering iterations
  for i in range(1, max_iter):
    if len(data['clusters'].unique()) != 1:
      error_list.append(get_error_rate(data)) #the error rate is calculated if the nr of unique clusters is not equal to 1
      data['new_clusters'] = -1
    candidate_cluster = data.loc[data['clusters'] == x]

    if len(candidate_cluster) < min_splittable_cluster_size:
      x = get_random_cluster(data, 'clusters', min_splittable_cluster_size, x, data['clusters'].unique())
      continue

    kmeans = KMeans(**clus_model_kwargs).fit(candidate_cluster.drop(META_COL,axis=1))

    candidate_cluster['new_clusters'] = pd.DataFrame(kmeans.predict(candidate_cluster.drop(META_COL, axis=1)), index=candidate_cluster.index)
    data['new_clusters'] = candidate_cluster['new_clusters'].combine_first(data['new_clusters'])

    discr_bias = get_error_diff(data, x, 'clusters', bias_type = 'negative', baseline= 'all')
    print('discriminative bias:', discr_bias)

    min_cluster_size = get_min_cluster_size(data)
    print('Smallest cluster size:', min_cluster_size)

    if (discr_bias >= initial_bias) & (min_cluster_size > min_acceptable_cluster_size):
      print ("adding a new cluster")
      n_cluster = max(data['clusters'])
      data['clusters'][data['new_clusters'] == 1] = n_cluster + 1

      if show_plot:
        #pca_plot(data, 'K-means for False Positives', 0.6)
        tsne_plot(data, 'K-means for False Negatives', perplexity = 30, learning_rate = 200, n_iter = 1000, alpha = 0.5)
        plt.show()

      x = select_new_cluster(data, error_column = 'FP')
      initial_bias = discr_bias

    else:
      x = get_random_cluster(data,'clusters', min_splittable_cluster_size, x, data['clusters'].unique())

  print('MAX_ITER')
  print(error_list)
  return data

In [None]:
hbac_kmeans(data = TN_FP, max_iter = 300, show_plot= False) #True)

In [None]:
c= get_max_bias_cluster(TN_FP)

Mean_error_rate_TNFP = get_error_rate(TN_FP)

print ('Mean error rate of full TNFP data set:', Mean_error_rate_TNFP)

print (f" cluster {c} has the highest discrimination bias for TNFP data")


# ANOVA SIGNIFICANCE TESTING

In [None]:
'''SHAP, ERROR & BASIC COLUMNS FOR TPFN ANOVA'''
groupstpfn = TP_FN.groupby('clusters')['FN'].apply(list)
anovatpfn = [np.array(groupstpfn) for groupstpfn in groupstpfn]

f_stat, p_val = f_oneway(*anovatpfn)

print('F-statistic:', f_stat)
print('p-value:', p_val)

alpha = 0.05
if p_val < alpha:
  print("there are statistically significant differences between the clusters.")
else:
  print("there are no statistically significant differences between the clusters")

In [None]:
'''SHAP, ERROR & BASIC COLUMNS FOR TNFP ANOVA'''
groupstnfp = TN_FP.groupby('clusters')['FP'].apply(list)
anovatnfp = [np.array(groupstnfp) for groupstnfp in groupstnfp]

f_stat, p_val = f_oneway(*anovatnfp)

print('F-statistic:', f_stat)
print('p-value:', p_val)

alpha = 0.05
if p_val < alpha:
  print("there are statistically significant differences between the clusters.")
else:
  print("there are no statistically significant differences between the clusters")

# SHAP AND ERROR NO FEATURES


**RQ1: is there a relationshop between shap & risk of error?**

TP_FN & TN_FP = *all columns* (above)

SE = shap and error columns

FE = all columns except shap (selma)


--------------------------------------------------
**RQ2: does shap help in finding disc clusters?**

FS = all columns except error

S = only SHAP_COL

F = only BASIC_COL


In [None]:
'''SE DF: Shap and Error '''
SE_tpfn_ = TPFN_data[META_COL + SHAP_COL]
SE_tnfp_ = TNFP_data[META_COL + SHAP_COL]
#SE_tnfp_.drop('Error_Type', axis = 1)

SE_tpfn = initialize_dataset(SE_tpfn_)
SE_tnfp = initialize_dataset(SE_tnfp_)

SE_tpfn.info()
SE_tnfp.info()

In [None]:
hbac_kmeans(data = SE_tnfp, max_iter = 300, show_plot= False)
hbac_kmeans(data = SE_tpfn, max_iter = 300, show_plot= False)

In [None]:
'''SHAP & ERROR COLUMNS FOR TPFN ANOVA'''
groupsSE_tpfn = SE_tpfn.groupby('clusters')['FN'].apply(list)
anovaSE_tpfn = [np.array(groupsSE_tpfn) for groupsSE_tpfn in groupsSE_tpfn]

f_stat, p_val = f_oneway(*anovaSE_tpfn)

print('F-statistic:', f_stat)
print('p-value:', p_val)

alpha = 0.05
if p_val < alpha:
  print("there are statistically significant differences between the clusters.")
else:
  print("there are no statistically significant differences between the clusters")

In [None]:
'''SHAP & ERROR COLUMNS FOR TNFP ANOVA'''
groupsSE_tnfp = SE_tnfp.groupby('clusters')['FP'].apply(list)
anovaSE_tnfp = [np.array(groupsSE_tnfp) for groupsSE_tnfp in groupsSE_tnfp]

f_stat, p_val = f_oneway(*anovaSE_tnfp)

print('F-statistic:', f_stat)
print('p-value:', p_val)

alpha = 0.05
if p_val < alpha:
  print("there are statistically significant differences between the clusters.")
else:
  print("there are no statistically significant differences between the clusters")

# ERROR AND FEATURES NO SHAP

In [None]:
'''FE_df: FEATURES AND ERROR NO SHAP (Selma) '''

FE_tpfn_ = TPFN_data.drop(SHAP_COL, axis = 1) #[BASIC_COL + META_COL + DUMMY_RACE + DUMMY_GENDER]
FE_tnfp_ = TNFP_data.drop(SHAP_COL, axis = 1) #[BASIC_COL + META_COL + DUMMY_RACE + DUMMY_GENDER]

FE_tpfn = initialize_dataset(FE_tpfn_)
FE_tnfp = initialize_dataset(FE_tnfp_)

#FE_tpfn_.info()
FE_tpfn.info()
#FE_tnfp.info()
#FE_tnfp_.info()