<a href="https://colab.research.google.com/github/MDankloff/ClusterCompas/blob/main/COMPAS_Clustering_K_Means_.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [2]:
import numpy as np
import matplotlib.pyplot as plt
import pandas as pd
import seaborn as sns
import random
import os
from sklearn.cluster import KMeans, DBSCAN
from sklearn.neighbors import NearestNeighbors
from sklearn.cluster import MeanShift, estimate_bandwidth
from sklearn.preprocessing import StandardScaler, MinMaxScaler
from sklearn.decomposition import PCA
from scipy import stats
from scipy.stats import ttest_ind

import warnings
warnings.filterwarnings('ignore')

# Load data

In [20]:
# Read the input table from the Colab working directory; later cells expect it
# to contain the TP/TN/FN/FP, 'errors', 'Error_Type' and class columns.
data_shaperr = pd.read_csv('/content/Shap_error_data.csv')

In [21]:
# Everything except the error/class label columns.
features = data_shaperr.drop(
    columns=['Error_Type', 'errors', 'predicted_class', 'true_class'],
)

# Split data into tp/fn and tn/fp

In [22]:
def drop_zero_TP_FN(data):
    """Keep only rows where TP or FN equals 1 (drops rows where neither does)."""
    is_tp = data['TP'].eq(1)
    is_fn = data['FN'].eq(1)
    return data.loc[is_tp | is_fn]

def drop_zero_TN_FP(data):
    """Keep only rows where TN or FP equals 1 (drops rows where neither does)."""
    is_tn = data['TN'].eq(1)
    is_fp = data['FP'].eq(1)
    return data.loc[is_tn | is_fp]

# Subset of rows with TP == 1 or FN == 1.
TPFN_data = drop_zero_TP_FN(data_shaperr)
# Subset of rows with TN == 1 or FP == 1.
TNFP_data = drop_zero_TN_FP(data_shaperr)

# Utils

In [18]:
def pca_plot(data, title, alpha):
    """Project the feature columns onto two principal components and plot them.

    All non-feature columns (class labels, error indicators, cluster
    assignments and SHAP values) are excluded from the PCA and joined back
    afterwards so they can drive the hue and marker style of the plot.

    Args:
        data: DataFrame holding the feature columns plus every column listed
            in ``non_feature_columns`` below; ``clusters`` and
            ``new_clusters`` must therefore already exist.
            NOTE(review): no scaling happens here - presumably the features
            are expected to be scaled already; confirm against callers.
        title: Title placed above the scatterplot.
        alpha: Marker opacity forwarded to seaborn.

    Returns:
        None. Prints the explained variance ratio of both components and
        displays the figure.
    """
    # Single list used for both the drop and the selection so the two cannot
    # drift apart. (A missing comma between 'Error_Type' and 'clusters' would
    # silently concatenate them into one non-existent label.)
    non_feature_columns = [
        'predicted_class', 'true_class', 'errors', 'TP', 'TN', 'FN', 'FP',
        'Error_Type', 'clusters', 'new_clusters',
        'Shap_age', 'Shap_priors_count', 'Shap_sex_Female', 'Shap_sex_Male',
        'Shap_race_African-American', 'Shap_race_Asian', 'Shap_race_Caucasian',
        'Shap_race_Hispanic', 'Shap_race_Native American', 'Shap_race_Other',
    ]
    pca_features = data.drop(non_feature_columns, axis=1)
    other_features = data[non_feature_columns]

    # Apply PCA with 2 components and keep the original index so the join
    # below lines rows up correctly.
    pca = PCA(n_components=2)
    pca_result = pca.fit_transform(pca_features)
    pca_df = pd.DataFrame(pca_result, index=pca_features.index, columns=['Principal Component 1', 'Principal Component 2'])

    # Temporary dataset that contains both principal components and the other columns
    temp_dataset = pca_df.join(other_features, how='left')

    # Create scatterplot using seaborn
    scatterplot = sns.scatterplot(data=temp_dataset, x='Principal Component 1', y='Principal Component 2', alpha=alpha, hue="clusters", palette='tab10', style='Error_Type')
    scatterplot.set_title(title)
    # Legend is placed outside the axes, to the right.
    scatterplot.legend(loc='center left', bbox_to_anchor=(1.0, 0.5), ncol=1)

    explained_variance_ratio = pca.explained_variance_ratio_
    print(f"Explained Variance Ratio: PC1 = {explained_variance_ratio[0]:.2f}, PC2 = {explained_variance_ratio[1]:.2f}")

    plt.show()

def initialize_dataset(data, with_errors=True, just_features=True, scale_features=True, with_classes=True):
    """Build a working copy of ``data`` prepared for the clustering loop.

    Steps, each controlled by its flag and applied in this order:

    * ``with_errors``: multiply the TP/TN/FN/FP columns by 0.8.
    * ``just_features``: drop the class, error, Error_Type and SHAP columns.
    * ``scale_features``: standardise every numeric column still present
      with ``StandardScaler``.
    * ``with_classes``: copy the class, error, Error_Type and SHAP columns
      from the original ``data`` back onto the result (unscaled).

    Finally the columns ``clusters`` (all 0) and ``new_clusters`` (all -1)
    are added. The input frame itself is not modified.

    NOTE(review): with the default flags the 0.8-scaled error columns are
    dropped and then restored from the raw input, so the scaling has no
    visible effect - confirm that this is intended.
    """
    meta_columns = [
        'predicted_class', 'true_class', 'errors', 'TP', 'TN', 'FN', 'FP', 'Error_Type',
        'Shap_age', 'Shap_priors_count', 'Shap_sex_Female', 'Shap_sex_Male', 'Shap_race_African-American',
        'Shap_race_Asian', 'Shap_race_Caucasian', 'Shap_race_Hispanic', 'Shap_race_Native American', 'Shap_race_Other',
    ]
    prepared = data.copy(deep=True)

    if with_errors:
        error_flags = ['TP', 'TN', 'FN', 'FP']
        prepared[error_flags] = prepared[error_flags].mul(0.8)

    if just_features:
        prepared = prepared.drop(columns=meta_columns)

    if scale_features:
        numeric = prepared.select_dtypes(include=['number']).columns
        prepared[numeric] = StandardScaler().fit_transform(prepared[numeric])

    if with_classes:
        # Column-by-column so re-added columns are appended in list order.
        for name in meta_columns:
            prepared[name] = data[name]

    prepared['clusters'] = 0
    prepared['new_clusters'] = -1
    return prepared

def accuracy_error(results, error=None):
    """Return the fraction of rows whose ``error`` column equals 0.

    Args:
        results: DataFrame with one row per sample.
        error: Name of the error indicator column (e.g. ``'FN'``, ``'FP'``).
            When omitted, the ``'errors'`` column is used so that callers
            that pass only the frame keep working.

    Returns:
        Accuracy in ``[0, 1]``, or ``nan`` when ``results`` has no rows
        (accuracy is undefined there; a warning is printed).
    """
    if error is None:
        error = 'errors'
    total = len(results)
    if total == 0:
        print("you are calculating the accuracy on an empty cluster")
        # nan instead of ZeroDivisionError: comparisons against nan are
        # False, so callers ranking clusters simply skip the empty one.
        return float('nan')
    correct = int((results[error] == 0).sum())
    return correct / total

def bias_w_error(data, cluster_id, cluster_col, error='errors'):
    """Return accuracy of one cluster minus accuracy of all other rows.

    Args:
        data: DataFrame containing ``cluster_col`` and the ``error`` column.
        cluster_id: Label of the cluster to evaluate.
        cluster_col: Name of the column holding the cluster assignments.
        error: Name of the error indicator column handed to
            ``accuracy_error``; defaults to ``'errors'``.

    Returns:
        The accuracy difference, or ``nan`` when either the cluster or the
        remainder is empty (the difference is undefined; a message is
        printed).
    """
    in_cluster = data[cluster_col] == cluster_id
    cluster_x = data.loc[in_cluster]
    remaining_clusters = data.loc[~in_cluster]

    if len(cluster_x) == 0:
        print("this is an empty cluster", cluster_id)
    if len(remaining_clusters) == 0:
        print("This cluster is the entire dataset. Cluster:", cluster_id)
    if len(cluster_x) == 0 or len(remaining_clusters) == 0:
        # Avoid dividing by zero inside the accuracy computation.
        return float('nan')

    # The error column is passed explicitly: accuracy_error cannot be relied
    # on to pick a column by itself.
    return accuracy_error(cluster_x, error=error) - accuracy_error(remaining_clusters, error=error)

def get_max_bias(data, bias_type = 'negative', function = bias_w_error):
    """Return the most extreme bias over the labels in ``new_clusters``.

    For ``bias_type == 'negative'`` this is the smallest value returned by
    ``function``; for ``'positive'`` it is the largest. Label -1 is skipped.
    The result is also printed.
    """
    want_negative = bias_type == 'negative'
    # Start from the opposite infinity so any real bias replaces it.
    extreme = float('inf') if want_negative else -float('inf')

    for label in data['new_clusters'].unique():
        if label == -1:  # outliers in dbscan
            continue
        candidate = function(data, label, 'new_clusters')
        if want_negative:
            if candidate < extreme:
                extreme = candidate
        elif bias_type == 'positive' and candidate > extreme:
            extreme = candidate

    print(f'Maximum {bias_type} bias is:', extreme)
    return extreme


def get_cluster_max_bias(data, function = bias_w_error):
    """Return the label in ``clusters`` with the lowest bias value.

    ``function`` is evaluated for every label except -1 and each result is
    printed. Only biases below 100 can be selected; if none qualifies the
    sentinel -2 is returned.
    """
    lowest_bias = 100  # max_abs bias selma code
    chosen_cluster = -2

    labels = [label for label in data['clusters'].unique() if label != -1]
    for label in labels:
        bias = function(data, label, 'clusters')
        print(f"{label} has bias {bias}")
        if bias < lowest_bias:
            lowest_bias, chosen_cluster = bias, label

    print ('cluster with the highest discriminating bias:', chosen_cluster)
    return chosen_cluster

def select_new_cluster(data, error_column=None, overall_error_rate=0.5):
    """Pick the cluster whose mean error is closest to ``overall_error_rate``.

    Args:
        data: DataFrame with a ``clusters`` column and the error column.
        error_column: Column whose per-cluster mean is compared; ``'FP'``
            when not given.
        overall_error_rate: Reference rate the cluster means are compared to.

    Returns:
        The label with the smallest absolute difference (first one wins on
        ties; label -1 is ignored), or ``None`` when no cluster has a
        difference below 1.
    """
    column = 'FP' if error_column is None else error_column

    best_label, best_gap = None, 1
    for label in data['clusters'].unique():
        if label == -1:
            continue
        cluster_rate = data.loc[data['clusters'] == label, column].mean()
        gap = abs(overall_error_rate - cluster_rate)
        if gap < best_gap:
            best_label, best_gap = label, gap

    return best_label

def calculate_variance(data):
    """Return the variance of the per-cluster bias values.

    ``bias_w_error`` is evaluated for every label found in the ``clusters``
    column and the (population) variance of those values is returned.

    Args:
        data: DataFrame with a ``clusters`` column plus whatever columns
            ``bias_w_error`` needs.

    Returns:
        ``numpy.var`` of the list of biases.
    """
    biases = [bias_w_error(data, label, 'clusters') for label in data['clusters'].unique()]
    # numpy has no ``variance`` function; ``np.var`` is the variance.
    return np.var(biases)

def calculate_bias_global_average(data, cluster_id, cluster_col, ave_acc, error='errors'):
    """Return the accuracy of one cluster minus a given reference accuracy.

    Args:
        data: DataFrame containing ``cluster_col`` and the ``error`` column.
        cluster_id: Label of the cluster to evaluate.
        cluster_col: Name of the column holding the cluster assignments.
        ave_acc: Reference accuracy that is subtracted.
        error: Name of the error indicator column handed to
            ``accuracy_error``; defaults to ``'errors'``.
    """
    cluster_x = data.loc[data[cluster_col] == cluster_id]
    # The error column is passed explicitly: accuracy_error cannot be relied
    # on to pick a column by itself.
    return accuracy_error(cluster_x, error=error) - ave_acc

def min_split_cluster_size(data):
    """Return the row count of the smallest cluster in ``new_clusters``.

    Label -1 is ignored. The result is never larger than ``len(data)``,
    which is also what is returned when there is no label other than -1.
    """
    assignments = data['new_clusters']
    sizes = [
        int((assignments == label).sum())
        for label in assignments.unique()
        if label != -1
    ]
    return min(sizes + [len(data)])


def get_random_cluster(clusters):
    """Return a randomly chosen cluster label that is not -1.

    The label is drawn from the values actually present in ``clusters``, so
    the result always refers to an existing cluster even when the labels are
    not the consecutive integers ``0..n-1``.

    Args:
        clusters: pandas Series of cluster assignments.

    Returns:
        One of the distinct labels in ``clusters`` other than -1.

    Raises:
        ValueError: if ``clusters`` contains no label other than -1.
    """
    candidates = [label for label in clusters.unique().tolist() if label != -1]
    if not candidates:
        raise ValueError("no cluster other than -1 to choose from")
    result = random.choice(candidates)
    print('This is the random cluster we picked:', result)
    return result

def plot_clusters(data):
    """Scatter the '1st' vs '2nd' columns, coloured by cluster and sized by 'errors'."""
    sns.scatterplot(
        x='1st',
        y='2nd',
        hue="clusters",
        size='errors',
        sizes=(100, 20),
        palette="tab10",
        data=data,
    )
    plt.show()

def tsne_plot(data, title, perplexity = 30, learning_rate = 200, n_iter = 1000, alpha = 0.5):
    """Embed the feature columns in two dimensions with t-SNE and plot them.

    All non-feature columns (class labels, error indicators, cluster
    assignments and SHAP values) are excluded from the embedding and joined
    back afterwards so they can drive the hue and marker style of the plot.

    Args:
        data: DataFrame holding the feature columns plus every column listed
            in ``non_feature_columns`` below.
        title: Title placed above the scatterplot.
        perplexity: Forwarded to ``TSNE``.
        learning_rate: Forwarded to ``TSNE``.
        n_iter: Forwarded to ``TSNE``.
        alpha: Marker opacity forwarded to seaborn.

    Returns:
        None. Displays the figure.
    """
    # Imported here because TSNE is not part of the notebook's import cell.
    from sklearn.manifold import TSNE

    non_feature_columns = [
        'predicted_class', 'true_class', 'errors', 'TP', 'TN', 'FN', 'FP',
        'Error_Type', 'clusters', 'new_clusters',
        'Shap_age', 'Shap_priors_count', 'Shap_sex_Female', 'Shap_sex_Male',
        'Shap_race_African-American', 'Shap_race_Asian', 'Shap_race_Caucasian',
        'Shap_race_Hispanic', 'Shap_race_Native American', 'Shap_race_Other',
    ]
    tsne_features = data.drop(non_feature_columns, axis=1)
    other_features = data[non_feature_columns]

    # NOTE(review): recent scikit-learn releases rename ``n_iter`` to
    # ``max_iter`` - confirm against the installed version.
    tsne = TSNE(n_components=2, perplexity=perplexity, learning_rate=learning_rate, n_iter=n_iter)
    tsne_result = tsne.fit_transform(tsne_features)
    # Index with the rows that were actually embedded (not a module-level
    # frame) so the join below aligns even when ``data`` is a subset.
    tsne_df = pd.DataFrame(tsne_result, index=tsne_features.index, columns=['t-SNE Component 1', 't-SNE Component 2'])

    temp_dataset = tsne_df.join(other_features, how='left')

    # Create scatterplot using seaborn
    scatterplot = sns.scatterplot(data=temp_dataset, x='t-SNE Component 1', y='t-SNE Component 2', alpha=alpha, hue="clusters", palette='tab10', style='Error_Type')
    scatterplot.set_title(title)
    # Legend is placed outside the axes, to the right.
    scatterplot.legend(loc='center left', bbox_to_anchor=(1.0, 0.5), ncol=1)

    plt.show()

# Clustering with K-means

In [None]:
# Keyword arguments for the K-means model.
clus_model_kwargs = dict(
    n_clusters=2,
    init="k-means++",  # method used to initialize the initial cluster centroids
    n_init=10,  # number of times K-means is run with different centroid seeds
    max_iter=300,
)

In [None]:
def bias_with_error_rate_kmeans(data = TPFN_data, plot_clusters = True):

    # Initialize the dataset
    data = initialize_dataset(data)
    variance_list = []
    acc = accuracy_error(data, error = 'FN') #Calculating accuracy based on FN error

    #Loop for clustering iterations
    for i in range(1, max_iter):
        print('This is the current cluster: ', x) #for each iteration the current cluster is printed
        eps = eps - 0.001 #the eps value is incrementedly decreased to identify dense areas
        if len(data['clusters'].unique()) != 1:
            variance_list.append(calculate_variance(data)) #variance calculation is performed if the nr of unique clusters is not equal to 1

        data['new_clusters'] = -2
        candidate_cluster = data.loc[data['clusters'] == x]

        if len(candidate_cluster) < min_splittable_cluster_size:
            x = get_random_cluster(data['clusters'])
            continue