<a href="https://colab.research.google.com/github/MDankloff/ClusterCompas/blob/main/COMPAS_Clustering.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [32]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.metrics import accuracy_score, confusion_matrix, roc_auc_score, roc_curve
from sklearn.decomposition import PCA
from sklearn.preprocessing import StandardScaler
from sklearn.model_selection import train_test_split
from sklearn.ensemble import RandomForestClassifier
from sklearn import preprocessing
#from sklearn import DBSCAN
import warnings
warnings.filterwarnings("ignore")
import os
import random

ImportError: cannot import name 'DBSCAN' from 'sklearn' (/usr/local/lib/python3.10/dist-packages/sklearn/__init__.py)

# Load SHAP_ERROR_DATA

In [33]:
Shap_error_data = pd.read_csv('/content/Shap_error_data.csv')
#Shap_error_data.info()
features = Shap_error_data.drop(['predicted_class', 'true_class', 'errors', 'TP', 'TN', 'FN', 'FP'], axis=1)

In [34]:
print(Shap_error_data.isna().sum())
len(Shap_error_data.index)

age                              0
priors_count                     0
sex_Female                       0
sex_Male                         0
race_African-American            0
race_Asian                       0
race_Caucasian                   0
race_Hispanic                    0
race_Native American             0
race_Other                       0
Shap_age                         0
Shap_priors_count                0
Shap_sex_Female                  0
Shap_sex_Male                    0
Shap_race_African-American       0
Shap_race_Asian                  0
Shap_race_Caucasian              0
Shap_race_Hispanic               0
Shap_race_Native American        0
Shap_race_Other                  0
predicted_class               1507
true_class                    1507
errors                        1507
TP                            1507
TN                            1507
FN                            1507
FP                            1507
Error_Type                    1507
dtype: int64


5050

In [36]:
print(Shap_error_data.iloc[3])

age                           1.557897
priors_count                  1.966547
sex_Female                   -0.491321
sex_Male                      0.491321
race_African-American         0.978838
race_Asian                    -0.06462
race_Caucasian               -0.711101
race_Hispanic                -0.314295
race_Native American         -0.054582
race_Other                   -0.244563
Shap_age                      1.950683
Shap_priors_count            -1.257692
Shap_sex_Female                0.14411
Shap_sex_Male                -0.151978
Shap_race_African-American    1.787196
Shap_race_Asian              -0.018007
Shap_race_Caucasian           0.707991
Shap_race_Hispanic            0.264025
Shap_race_Native American     0.046934
Shap_race_Other               0.250397
predicted_class                    NaN
true_class                         NaN
errors                             NaN
TP                                 NaN
TN                                 NaN
FN                       

# Preprocessing: Initialize / scaling dataset

In [37]:
def initialize_dataset (raw_data, with_errors = True, just_features = True, scale_features = True, with_classes = True):
  #input is the original dataset, whether errors are included, only features should be used, to scale the features, class labels are included
  #Initialisation of the dataset. Scales the features and errors, which can be included or exluded for clustering
  #it returns a scaled dataset with new columns "clusters" = 0 and "new_clusters" = -1, which is required for HBAC

  new_data = raw_data.copy(deep=True)

  if with_errors:
    scaling_factor = 0.8  # default scaling factor - there is a trade-off between scaling of weighing the errors to guide biased clusters while preventing too large and uninformative clusters
    new_data['scaled_TP'] = new_data['TP'] * scaling_factor
    new_data['scaled_TN'] = new_data['TN'] * scaling_factor
    new_data['scaled_FN'] = new_data['FN'] * scaling_factor
    new_data['scaled_FP'] = new_data['FP'] * scaling_factor

  if just_features:
    new_data = new_data.drop(['predicted_class', 'true_class', 'errors', 'TP', 'TN', 'FN', 'FP'], axis=1)

  if scale_features:
    to_scale = raw_data.drop(['predicted_class', 'true_class', 'errors', 'TP', 'TN', 'FN', 'FP'], axis=1).columns
    new_data[to_scale] = StandardScaler().fit_transform(features[to_scale])

  if with_classes:
    for col in ['predicted_class', 'true_class', 'errors', 'TP', 'TN', 'FN', 'FP']:
      new_data[col] = raw_data[col]


    new_data['clusters'] = 0
    new_data['new_clusters'] = -1
  return new_data

In [48]:
##Drop rows where both TP and FN are zero
def drop_zero_TP_FN(data):
  return data.loc[(data['TP'] == 1) | (data['FN'] == 1)]

TP_FN_data = drop_zero_TP_FN(Shap_error_data)

#Drop rows where both TN and FP are zero
def drop_zero_TN_FP(data):
  return data.loc[(data['TN']  == 1 ) & (data['FP'] == 1)]

TN_FP_data = drop_zero_TN_FP(Shap_error_data)


In [49]:
if (TP_FN_data['TP'] == 0 & (TP_FN_data['FN'] == 0) ).any():
    print("There's at least one occurrence of TP 0 and FN 0 in the dataset.")
else:
    print("There's no occurrence of 1 in the column.")

There's at least one occurrence of TP 0 and FN 0 in the dataset.


In [51]:
print(TP_FN_data[['TP', 'FN']])

       TP   FN
1     1.0  0.0
6     1.0  0.0
9     0.0  1.0
11    1.0  0.0
14    0.0  1.0
...   ...  ...
5039  0.0  1.0
5040  1.0  0.0
5041  0.0  1.0
5043  1.0  0.0
5048  1.0  0.0

[1584 rows x 2 columns]


In [52]:
print(TP_FN_data.iloc[3])

age                           -1.08491
priors_count                 -0.505199
sex_Female                     2.03533
sex_Male                      -2.03533
race_African-American         -1.02162
race_Asian                    -0.06462
race_Caucasian                 1.40627
race_Hispanic                -0.314295
race_Native American         -0.054582
race_Other                   -0.244563
Shap_age                     -0.256999
Shap_priors_count             1.045115
Shap_sex_Female               4.433496
Shap_sex_Male                 4.233697
Shap_race_African-American    0.804825
Shap_race_Asian              -0.052872
Shap_race_Caucasian           1.299981
Shap_race_Hispanic            0.284534
Shap_race_Native American     0.046934
Shap_race_Other               0.040714
predicted_class                    1.0
true_class                         1.0
errors                             0.0
TP                                 1.0
TN                                 0.0
FN                       

# PCA
needs to be separated still for TN/FP and TP/FN

In [None]:
def pca_plot(data, title, alpha):

    #extract features for PCA and drop the other columns in other_features df
    pca_features = data.drop(['predicted_class', 'true_class', 'errors', 'TP', 'TN', 'FN', 'FP', 'Error_Type'], axis=1)
    other_features = data[['predicted_class', 'true_class', 'errors', 'TP', 'TN', 'FN', 'FP', 'Error_Type']]

    # Scale the PCA features before using PCA
    scaler = StandardScaler()
    scaled_features = scaler.fit_transform(pca_features)

    # Apply PCA with 2 components to scaled features and create a df for the resulting principal components
    pca_result = PCA(n_components=2).fit_transform(scaled_features)
    pca_df = pd.DataFrame(pca_result, index=pca_features.index)

    #create temporary dataset that contains both principal components and other features
    temp_dataset = pca_df.join(other_features, how='left')
    temp_dataset.rename(columns={0: 'Principal Component 1', 1: 'Principal Component 2'}, inplace=True)

    scatterplot = sns.scatterplot(data=temp_dataset,x ='Principal Component 1', y='Principal Component 2', alpha=alpha, hue="Error_Type", palette="tab10", style="Error_Type")

    scatterplot.set_title(title)
    scatterplot.legend(loc='center left', bbox_to_anchor=(1.0, 0.5), ncol=1)

    explained_variance_ratio = PCA(n_components=2).fit(scaled_features).explained_variance_ratio_
    print(f"Explained Variance Ratio: PC1 = {explained_variance_ratio[0]:.2f}, PC2 = {explained_variance_ratio[1]:.2f}")

    plt.show()


## Clustering Functions

In [None]:
#Calculate error metric

#Calculate bias

#Calculate absolute difference

#Calculate variance

#Get min splittable cluster size

##########################################

#Select a new cluster to split on based on the smallest absolute difference from the overall error rate of 0.5
#Function requires a df.columns named 'clusters' and 'FP'

def select_new_cluster(data, error_column='FP', overall_error_rate=0.5):
  smallest_diff = 1
  cluster_number = 0

  for cluster_number in data['clusters'].unique():
    if cluster_number == -1:
      continue
    cluster_data = data[data['clusters'] == cluster_number]
    cluster_error_rate = cluster_data([error_column]).mean()
    abs_diff = abs(overall_error_rate - cluster_error_rate)

    if abs_diff < smallest_diff:
      smallest_diff = abs_diff
      selected_cluster = cluster_number

  return selected_cluster

#############################################

#Select a random cluster
#returns a random cluster from provided list of clusters that is not -1
def get_random_cluster(clusters):
  result = -1
  while (result == -1):
    result = random.randint(0, len(clusters.unique()))
  print('This is the random cluster we picked:', result)
  return result

#############################################

#Plot cluster
def plot_clusters(data):
  scatterplot = sns.scatterplot(data=data, x='1st', y='2nd', hue="clusters", size = 'errors', sizes=(100, 20), palette = "tab10")
  plt.show()


In [None]:
def bias_with_error_rate(full_data, input_columns, error_column, max_iter=10, min_splittable_cluster_size=10, dbscan_max_iter=100):
#Cluster the data based on the FP or FN for error column and selects the best cluster iteratively.
  for i in range(1, max_iter):
