<a href="https://colab.research.google.com/github/MDankloff/ClusterCompas/blob/main/V2_COMPAS_Clustering_K_Means.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

# Libraries

In [20]:
import numpy as np
import matplotlib.pyplot as plt
import pandas as pd
import seaborn as sns
import random
import os
from sklearn.cluster import KMeans, DBSCAN
from sklearn.neighbors import NearestNeighbors
from sklearn.cluster import MeanShift, estimate_bandwidth
from sklearn.preprocessing import StandardScaler, MinMaxScaler
from sklearn.decomposition import PCA
from sklearn.manifold import TSNE
from scipy import stats
from scipy.stats import ttest_ind

import warnings
warnings.filterwarnings('ignore')

In [21]:
# Load the table that the cells below split into TPFN_data / TNFP_data.
# NOTE(review): '/content/' looks like a Colab runtime path - confirm the CSV
# has been uploaded there (or adjust the path) before running locally.
data_shaperr = pd.read_csv('/content/Shap_error_data.csv')

# Data preparation

In [22]:
# Bookkeeping columns (class labels, error flags, cluster ids).
# initialize_dataset drops these to obtain the feature columns.
META_COL = ['predicted_class', 'true_class', 'errors', 'TP', 'TN', 'FN', 'FP', 'Error_Type', 'clusters', 'new_clusters']
# NOTE(review): presumably the per-feature SHAP value columns of the CSV - confirm against the data.
SHAP_COL = ['Shap_age', 'Shap_priors_count' , 'Shap_sex_Female', 'Shap_sex_Male',
            'Shap_race_African-American', 'Shap_race_Asian', 'Shap_race_Caucasian', 'Shap_race_Hispanic',
            'Shap_race_Native American', 'Shap_race_Other']
# NOTE(review): presumably the model input features plus 'error_scaled' - confirm against the data.
BASIC_COL = ['age', 'priors_count', 'sex_Female', 'sex_Male', 'race_African-American', 'race_Asian', 'race_Caucasian',
             'race_Hispanic', 'race_Native American', 'race_Other', 'error_scaled' ]
# NOTE(review): looks like the meta columns intended for plotting - confirm with the visualisation cells.
META_COL_VIZ = ['predicted_class', 'true_class', 'TP', 'TN', 'FN', 'FP', 'error_scaled', 'Error_Type', 'new_clusters']
# Dummy (indicator) columns for race and, in the same order, their short display labels.
DUMMY_RACE = ['race_African-American', 'race_Asian', 'race_Caucasian',
             'race_Hispanic', 'race_Native American', 'race_Other']
SHORT_LABEL_RACE = ['Afr.Am.', 'Asian', 'Cauc.', 'Hisp.', 'Native', 'Other']
# Dummy (indicator) columns for gender and, in the same order, their short display labels.
DUMMY_GENDER = ['sex_Female', 'sex_Male']
SHORT_LABEL_GENDER = ['Female', 'Male']

In [23]:
def drop_zero_TP_FN(data):
    """Return the rows of `data` that are flagged as true positive or false negative.

    A row is kept when its 'TP' value or its 'FN' value equals 1; every other
    row is left out. The input frame itself is not modified.
    """
    keep = data['TP'].eq(1) | data['FN'].eq(1)
    return data.loc[keep]

def drop_zero_TN_FP(data):
    """Return the rows of `data` that are flagged as true negative or false positive.

    A row is kept when its 'TN' value or its 'FP' value equals 1; every other
    row is left out. The input frame itself is not modified.
    """
    keep = data['TN'].eq(1) | data['FP'].eq(1)
    return data.loc[keep]

# Split the loaded table into two row subsets:
# rows flagged TP or FN, and rows flagged TN or FP.
TPFN_data = drop_zero_TP_FN(data_shaperr)
TNFP_data = drop_zero_TN_FP(data_shaperr)

#TNFP_data.head()
#TPFN_data.info()

In [24]:
def initialize_dataset(data, with_errors=True, just_features=True, scale_features=True, with_classes=True, with_Dummy= True):
    """Build a processed copy of `data` for clustering (HBAC bookkeeping included).

    The steps below are applied, in this order, to a deep copy; the input
    frame is never modified.

    1. with_errors: multiply the 'TP', 'TN', 'FN', 'FP' and 'errors' columns by 0.8.
    2. just_features: drop the META_COL columns.
    3. scale_features: standardise every non-META_COL column with StandardScaler
       (fitted on the unmodified feature values).
    4. with_Dummy: replace each DUMMY_RACE / DUMMY_GENDER column that is still
       present by the `pd.get_dummies` expansion of its original, unscaled values
       (new columns are named '<column>_<value>').
    5. with_classes: copy every META_COL column that exists in `data` back into
       the result, using the original values.
    6. Always: set 'clusters' to 0 and 'new_clusters' to -1 (required for HBAC).

    Args:
        data: input DataFrame. It must contain the five error columns when
            `with_errors` is True; other META_COL columns may be absent.
        with_errors: apply the 0.8 scaling factor to the error columns.
        just_features: remove the META_COL columns before scaling.
        scale_features: standardise the feature columns.
        with_classes: restore the META_COL columns from `data`.
        with_Dummy: expand the dummy race/gender columns with `pd.get_dummies`.

    Returns:
        A new DataFrame with the processed columns plus 'clusters' and
        'new_clusters'.

    NOTE(review): with_classes restores the *original* META_COL values, so the
    0.8 factor applied by with_errors does not survive when with_classes is
    True - confirm whether that is intended.
    """
    new_data = data.copy(deep=True)
    # META_COL contains 'clusters'/'new_clusters', which this function creates
    # itself, so columns missing from the input are skipped rather than raising.
    features = new_data.drop(columns=META_COL, errors='ignore')

    if with_errors:
        error_columns = ['TP', 'TN', 'FN', 'FP', 'errors']
        new_data[error_columns] *= 0.8  # scaling factor

    if just_features:
        new_data = new_data.drop(columns=META_COL, errors='ignore')

    if scale_features:
        to_scale = features.columns
        new_data[to_scale] = StandardScaler().fit_transform(features[to_scale])
        # Alternative worth trying: MinMaxScaler instead of StandardScaler.
        #new_data[to_scale] = MinMaxScaler().fit_transform(features[to_scale])

    if with_Dummy:
        for col in DUMMY_RACE + DUMMY_GENDER:
            if col in new_data.columns:
                # Expand from the raw values so the new column names carry the
                # original category values, but splice the result into
                # `new_data` so the processing done above is kept.
                one_hot = pd.get_dummies(data[col], prefix=col)
                new_data = pd.concat([new_data.drop(col, axis=1), one_hot], axis=1)

    if with_classes:
        for col in META_COL:
            if col in data.columns:
                new_data[col] = data[col]

    new_data['clusters'] = 0
    new_data['new_clusters'] = -1

    return new_data

In [30]:
def undo_dummy(data, with_Dummy, col_label, numeric_values=True, short_label=None):
    """Collapse a group of dummy (indicator) columns into one label column.

    For each column in `with_Dummy`, the rows holding the larger of that
    column's distinct values are treated as belonging to the category, and the
    new `col_label` column receives either the column's position in
    `with_Dummy` (numeric_values=True) or `short_label[position]`
    (numeric_values=False). Rows matched by no dummy column keep ''.
    The dummy columns are removed from the result.

    Works on a copy: the caller's DataFrame is left untouched.

    Args:
        data: DataFrame containing every column listed in `with_Dummy`.
        with_Dummy: ordered list of dummy column names, e.g. DUMMY_RACE.
        col_label: name of the label column to create.
        numeric_values: use the integer position as label when True.
        short_label: labels aligned with `with_Dummy`; required when
            `numeric_values` is False.

    Returns:
        A new DataFrame with `col_label` added and the dummy columns dropped.

    Raises:
        ValueError: `numeric_values` is False and `short_label` is None.
        IndexError: a dummy column has fewer than two distinct values, so the
            "category present" value cannot be identified.
    """
    # Validate before doing any work so a bad call has no side effects.
    if not numeric_values and short_label is None:
        raise ValueError("short label must be provided if numeric_values is False")

    result = data.copy()
    # Object dtype so the column can hold '' alongside ints or strings.
    result[col_label] = pd.Series('', index=result.index, dtype=object)

    for i, c in enumerate(with_Dummy):
        # Sorting the distinct values lets this work on raw 0/1 flags as well
        # as on rescaled columns: index 1 is the larger ("present") value.
        values = np.sort(result[c].unique())
        label = i if numeric_values else short_label[i]
        result.loc[result[c] == values[1], col_label] = label
        result = result.drop(c, axis=1)

    return result

#data = undo_dummy(data, DUMMY_RACE, col_label='race', numeric_values=False, short_label=SHORT_LABEL_RACE)
#data = undo_dummy(data, DUMMY_GENDER, col_label='gender', numeric_values=False, short_label=SHORT_LABEL_GENDER)

In [33]:
#print(data_shaperr.shape)
#data_shaperr.head()
#data_shaperr.info()

# UTILS for BIAS in ERROR DIFFERENCE

In [34]:
# Mean-based error rate (replaces the old accuracy_error()).
def get_error_rate(data):
    """Return the mean of the 'errors' column of `data`.

    When `data` has no rows, a message is printed and None is returned
    instead of a number.
    """
    if len(data) > 0:
        return data['errors'].mean()

    print ('calculating error rate on an empty cluster')
    return None

def get_error_diff(data, cluster_id, cluster_col, bias_type = 'negative', baseline= 'all', full_info=False):
    """Calculate the bias of one cluster as a difference in error rate.

    The difference is (error rate of the cluster) - (error rate of the baseline).

    Args:
        data: DataFrame with an 'errors' column and the `cluster_col` column.
        cluster_id: value of `cluster_col` that identifies the cluster.
        cluster_col: name of the column holding the cluster assignment.
        bias_type: 'negative' returns the difference unchanged, 'positive'
            returns it negated, 'absolute' returns its absolute value.
        baseline: 'all' compares against the whole of `data`; 'other' compares
            against the rows outside the cluster. ('best' is not available:
            its implementation is commented out below.)
        full_info: when True, return
            [difference, cluster error rate, error rate of the other rows]
            instead; `bias_type` is not applied (or checked) in that case.

    Returns:
        The error difference (or the three-element list described above).
        None, after printing a message, when the cluster is empty, when
        baseline is 'other' and the cluster covers all rows, or when
        `baseline` / `bias_type` is not recognised.
    """
    members = data.loc[data[cluster_col] == cluster_id]
    others = data.loc[data[cluster_col] != cluster_id]

    if len(members) == 0:
        print ('calculating error difference on an empty cluster')
        return

    # Select the rows the cluster is compared against.
    if baseline == 'all':
        reference = data
    elif baseline == 'other':
        if len(others) == 0:
            print ("This cluster is the entire dataset. Cluster:", cluster_id)
            return
        reference = others
    #elif baseline == 'best':
        #best_cluster = get_cluster_w_min_bias(data, cluster_col, bias_type, baseline)
        #error_diff = get_error_rate(cluster_x) - best_cluster[1]
    else:
        print ('unknown baseline')
        return

    error_diff = get_error_rate(members) - get_error_rate(reference)

    if full_info:
        return [error_diff, get_error_rate(members), get_error_rate(others)]

    if bias_type == 'negative':
        return error_diff  # no change needed
    if bias_type == 'positive':
        return -error_diff
    if bias_type == 'absolute':
        return np.absolute(error_diff)

    print("unknown bias type")
    return