<a href="https://colab.research.google.com/github/MDankloff/ClusterCompas/blob/main/V2_COMPAS_Clustering_K_Means.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

# Libraries

In [2]:
import numpy as np
import matplotlib.pyplot as plt
import pandas as pd
import seaborn as sns
import random
import os
from sklearn.cluster import KMeans, DBSCAN
from sklearn.neighbors import NearestNeighbors
from sklearn.cluster import MeanShift, estimate_bandwidth
from sklearn.preprocessing import StandardScaler, MinMaxScaler
from sklearn.decomposition import PCA
from sklearn.manifold import TSNE
from scipy import stats
from scipy.stats import ttest_ind

import warnings
warnings.filterwarnings('ignore')

In [3]:
# Load the notebook's input table. The path is absolute under /content, so the
# CSV has to be present there before this cell runs (presumably uploaded to the
# Colab runtime -- TODO confirm).
data_shaperr = pd.read_csv('/content/Shap_error_data.csv')

# Data preparation

In [4]:
# One-hot indicator columns. Each SHORT_LABEL_* list holds one abbreviated
# label per indicator column, in the same order.
ONE_HOT_GENDER = ['sex_Female', 'sex_Male']
SHORT_LABEL_GENDER = ['Female', 'Male']
ONE_HOT_RACE = [
    'race_African-American',
    'race_Asian',
    'race_Caucasian',
    'race_Hispanic',
    'race_Native American',
    'race_Other',
]
SHORT_LABEL_RACE = ['Afr.Am.', 'Asian', 'Cauc.', 'Hisp.', 'Native', 'Other']

# Plain feature columns followed by the 'error_scaled' column.
BASIC_COL = ['age', 'priors_count', *ONE_HOT_GENDER, *ONE_HOT_RACE, 'error_scaled']

# 'Shap_'-prefixed counterpart of every BASIC_COL entry except 'error_scaled'.
SHAP_COL = ['Shap_' + name for name in BASIC_COL if name != 'error_scaled']

# Non-feature bookkeeping columns (predictions, error flags, cluster ids);
# initialize_dataset removes these to isolate the columns it scales.
META_COL = [
    'predicted_class', 'true_class',
    'errors',
    'TP', 'TN', 'FN', 'FP',
    'Error_Type',
    'clusters', 'new_clusters',
]

# Variant of META_COL that lists 'error_scaled' instead of 'errors' and omits
# 'clusters' (presumably the set used for visualisation -- TODO confirm).
META_COL_VIZ = [
    'predicted_class', 'true_class',
    'TP', 'TN', 'FN', 'FP',
    'error_scaled',
    'Error_Type',
    'new_clusters',
]

In [5]:
def drop_zero_TP_FN(data):
    """Keep only the rows flagged as a true positive or a false negative.

    A row is retained when its 'TP' value equals 1 or its 'FN' value equals 1;
    every other row is dropped. The input frame is not modified.
    """
    is_true_positive = data['TP'].eq(1)
    is_false_negative = data['FN'].eq(1)
    keep = is_true_positive | is_false_negative
    return data.loc[keep]

def drop_zero_TN_FP(data):
    """Keep only the rows flagged as a true negative or a false positive.

    A row is retained when its 'TN' value equals 1 or its 'FP' value equals 1;
    every other row is dropped. The input frame is not modified.
    """
    is_true_negative = data['TN'].eq(1)
    is_false_positive = data['FP'].eq(1)
    keep = is_true_negative | is_false_positive
    return data.loc[keep]

# Split the loaded table by outcome type: TPFN_data keeps the rows with
# TP == 1 or FN == 1, TNFP_data keeps the rows with TN == 1 or FP == 1.
TPFN_data = drop_zero_TP_FN(data_shaperr)
TNFP_data = drop_zero_TN_FP(data_shaperr)

# Uncomment for a quick look at the two subsets:
#TNFP_data.head()
#TPFN_data.info()

In [6]:
def initialize_dataset(data, with_errors=True, just_features=True, scale_features=True, with_classes=True):
    """Build a scaled copy of ``data`` whose features and errors can be in/excluded for clustering.

    Args:
        data: DataFrame holding the feature columns plus any subset of the
            META_COL bookkeeping columns. It is never modified.
        with_errors: If True, multiply the 'TP', 'TN', 'FN', 'FP' and 'errors'
            columns of the working copy by 0.8 (those five columns must exist).
        just_features: If True, remove the META_COL columns from the working
            copy.
        scale_features: If True, standardise every non-META_COL column with
            StandardScaler.
        with_classes: If True, copy every META_COL column that exists in
            ``data`` back into the result with its original values.

    Returns:
        A new DataFrame that always carries the columns 'clusters' = 0 and
        'new_clusters' = -1, which is required for HBAC.
    """
    new_data = data.copy(deep=True)

    # Only META_COL entries that are actually present can be dropped. In
    # particular 'clusters' and 'new_clusters' are created at the end of this
    # function, so they may be missing from the input; dropping the full list
    # unconditionally would raise a KeyError in that case.
    present_meta = [col for col in META_COL if col in new_data.columns]
    features = new_data.drop(columns=present_meta)

    if with_errors:
        error_columns = ['TP', 'TN', 'FN', 'FP', 'errors']
        new_data[error_columns] *= 0.8  # scaling factor
        # NOTE(review): when with_classes is True the loop further down writes
        # the unscaled values from `data` back over these columns, so this
        # scaling only survives with with_classes=False and just_features=False.
        # Behaviour kept as-is; confirm whether that is intended.

    if just_features:
        new_data = new_data.drop(columns=present_meta)

    if scale_features:
        to_scale = features.columns
        # Zero mean / unit variance per feature column. MinMaxScaler is the
        # alternative worth trying here.
        new_data[to_scale] = StandardScaler().fit_transform(features[to_scale])

    if with_classes:
        # Restore the bookkeeping columns from the untouched input frame.
        for col in present_meta:
            new_data[col] = data[col]

    # Reset the cluster bookkeeping regardless of what the input contained.
    new_data['clusters'] = 0
    new_data['new_clusters'] = -1

    return new_data