<a href="https://colab.research.google.com/github/MDankloff/ClusterCompas/blob/main/COMPAS_data_cleaning.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [94]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.metrics import accuracy_score, confusion_matrix, roc_auc_score, roc_curve
from sklearn.model_selection import train_test_split
from sklearn.ensemble import RandomForestClassifier
import warnings
warnings.filterwarnings("ignore")
import os

# Folder whose contents are listed below.
# NOTE(review): this is an absolute path on a local machine, while the data is
# later read from '/content'. os.walk yields nothing when the directory does
# not exist, so this cell may silently print no output — confirm the intended
# location.
directory_path = '//Users/mirthedankloff/Documents/XAI_Bias_Error/COMPAS'

# Walk the folder recursively and print the full path of every file found.
for folder, _, files_in_folder in os.walk(directory_path):
    for file_name in files_in_folder:
        print(os.path.join(folder, file_name))


In [95]:
# Columns that are not carried into the cleaned dataset.
# NOTE(review): 'decile_score.1' is dropped here but 'priors_count.1' is kept,
# although it shows the same values as 'priors_count' in the rows displayed
# below — confirm whether it should be removed as well.
columns_to_remove = [
    # person-level fields and screening / juvenile counts
    'name', 'first', 'last', 'dob', 'compas_screening_date',
    'juv_fel_count', 'juv_misd_count', 'juv_other_count',
    'days_b_screening_arrest',
    # c_* columns
    'c_jail_in', 'c_jail_out', 'c_case_number', 'c_offense_date',
    'c_arrest_date', 'c_days_from_compas', 'c_charge_degree', 'c_charge_desc',
    # r_* columns
    'r_case_number', 'r_charge_degree', 'r_days_from_arrest',
    'r_offense_date', 'r_charge_desc', 'r_jail_in', 'r_jail_out',
    # violent_recid and vr_* columns
    'violent_recid', 'vr_case_number', 'vr_charge_degree', 'vr_offense_date',
    'vr_charge_desc',
    # remaining dates, custody window and assessment metadata
    'screening_date', 'v_screening_date', 'in_custody', 'out_custody',
    'start', 'end', 'decile_score.1', 'v_score_text', 'type_of_assessment',
    'v_type_of_assessment',
]

# Read the raw CSV and discard the unwanted columns in a single expression.
compas_2 = pd.read_csv('/content/compas-scores-two-years.csv').drop(columns=columns_to_remove)

# Preview the first rows of the reduced frame.
compas_2.head()

Unnamed: 0,id,sex,age,age_cat,race,decile_score,priors_count,is_recid,is_violent_recid,score_text,v_decile_score,priors_count.1,event,two_year_recid
0,1,Male,69,Greater than 45,Other,1,0,0,0,Low,1,0,0,0
1,3,Male,34,25 - 45,African-American,3,0,1,1,Low,1,0,1,1
2,4,Male,24,Less than 25,African-American,4,4,1,0,Low,3,4,0,1
3,5,Male,23,Less than 25,African-American,8,1,0,0,High,6,1,0,0
4,6,Male,43,25 - 45,Other,1,2,0,0,Low,1,2,0,0


One-hot encode the categorical object columns (sex, age_cat, race)

In [96]:
# One-hot encode the categorical columns and append the resulting indicator
# columns to compas_2. The original string columns are kept alongside them.
dummy_cols = ['sex', 'age_cat', 'race']

# dtype is pinned to uint8 so the indicators are 0/1 integers on every pandas
# version. pandas >= 2.0 defaults to bool, which would turn the indicators (and
# the values later written to CSV) into True/False.
compas_2_dummies = pd.get_dummies(
    compas_2[dummy_cols], columns=dummy_cols, prefix=dummy_cols, dtype='uint8'
)

# Remove indicator columns left behind by a previous run of this cell before
# concatenating, so that re-running the cell does not create duplicate columns.
compas_2 = pd.concat(
    [compas_2.drop(columns=compas_2_dummies.columns, errors='ignore'), compas_2_dummies],
    axis=1,
)

print(compas_2.columns)
compas_2.head()

Index(['id', 'sex', 'age', 'age_cat', 'race', 'decile_score', 'priors_count',
       'is_recid', 'is_violent_recid', 'score_text', 'v_decile_score',
       'priors_count.1', 'event', 'two_year_recid', 'sex_Female', 'sex_Male',
       'age_cat_25 - 45', 'age_cat_Greater than 45', 'age_cat_Less than 25',
       'race_African-American', 'race_Asian', 'race_Caucasian',
       'race_Hispanic', 'race_Native American', 'race_Other'],
      dtype='object')


Unnamed: 0,id,sex,age,age_cat,race,decile_score,priors_count,is_recid,is_violent_recid,score_text,...,sex_Male,age_cat_25 - 45,age_cat_Greater than 45,age_cat_Less than 25,race_African-American,race_Asian,race_Caucasian,race_Hispanic,race_Native American,race_Other
0,1,Male,69,Greater than 45,Other,1,0,0,0,Low,...,1,0,1,0,0,0,0,0,0,1
1,3,Male,34,25 - 45,African-American,3,0,1,1,Low,...,1,1,0,0,1,0,0,0,0,0
2,4,Male,24,Less than 25,African-American,4,4,1,0,Low,...,1,0,0,1,1,0,0,0,0,0
3,5,Male,23,Less than 25,African-American,8,1,0,0,High,...,1,0,0,1,1,0,0,0,0,0
4,6,Male,43,25 - 45,Other,1,2,0,0,Low,...,1,1,0,0,0,0,0,0,0,1


In [97]:
# Sanity checks: are the two recidivism flags identical column-for-column,
# and how many missing values does each column contain?

recid_flags_identical = compas_2['is_recid'].equals(compas_2['two_year_recid'])
print("same" if recid_flags_identical else "not same")

# Boolean mask of missing cells; summing it gives the per-column NaN count.
missing_values = compas_2.isna()
print(missing_values.sum())

not same
id                         0
sex                        0
age                        0
age_cat                    0
race                       0
decile_score               0
priors_count               0
is_recid                   0
is_violent_recid           0
score_text                 0
v_decile_score             0
priors_count.1             0
event                      0
two_year_recid             0
sex_Female                 0
sex_Male                   0
age_cat_25 - 45            0
age_cat_Greater than 45    0
age_cat_Less than 25       0
race_African-American      0
race_Asian                 0
race_Caucasian             0
race_Hispanic              0
race_Native American       0
race_Other                 0
dtype: int64


In [98]:
# Print a structural summary of the frame: index range, column names,
# non-null counts and dtypes.
compas_2.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 7214 entries, 0 to 7213
Data columns (total 25 columns):
 #   Column                   Non-Null Count  Dtype 
---  ------                   --------------  ----- 
 0   id                       7214 non-null   int64 
 1   sex                      7214 non-null   object
 2   age                      7214 non-null   int64 
 3   age_cat                  7214 non-null   object
 4   race                     7214 non-null   object
 5   decile_score             7214 non-null   int64 
 6   priors_count             7214 non-null   int64 
 7   is_recid                 7214 non-null   int64 
 8   is_violent_recid         7214 non-null   int64 
 9   score_text               7214 non-null   object
 10  v_decile_score           7214 non-null   int64 
 11  priors_count.1           7214 non-null   int64 
 12  event                    7214 non-null   int64 
 13  two_year_recid           7214 non-null   int64 
 14  sex_Female               7214 non-null  

In [99]:
# Write the frame (original columns plus the one-hot indicator columns) to CSV;
# index=False keeps the row index out of the file.
compas_2.to_csv('/content/cleaned_compas_data.csv', index=False)
