# Fair AI with COMPAS Dataset

In [1]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns

## Data Loading & Exploration

In [2]:
# compas scores raw
df_compas = pd.read_csv("../data/compas-scores-raw.csv")
print(f"Shape compas-scores-raw: {df_compas.shape}")
print(f"Columns: {df_compas.columns.tolist()}")
df_compas.head()

Shape compas-scores-raw: (60843, 28)
Columns: ['Person_ID', 'AssessmentID', 'Case_ID', 'Agency_Text', 'LastName', 'FirstName', 'MiddleName', 'Sex_Code_Text', 'Ethnic_Code_Text', 'DateOfBirth', 'ScaleSet_ID', 'ScaleSet', 'AssessmentReason', 'Language', 'LegalStatus', 'CustodyStatus', 'MaritalStatus', 'Screening_Date', 'RecSupervisionLevel', 'RecSupervisionLevelText', 'Scale_ID', 'DisplayText', 'RawScore', 'DecileScore', 'ScoreText', 'AssessmentType', 'IsCompleted', 'IsDeleted']


Unnamed: 0,Person_ID,AssessmentID,Case_ID,Agency_Text,LastName,FirstName,MiddleName,Sex_Code_Text,Ethnic_Code_Text,DateOfBirth,...,RecSupervisionLevel,RecSupervisionLevelText,Scale_ID,DisplayText,RawScore,DecileScore,ScoreText,AssessmentType,IsCompleted,IsDeleted
0,50844,57167,51950,PRETRIAL,Fisher,Kevin,,Male,Caucasian,12/05/92,...,1,Low,7,Risk of Violence,-2.08,4,Low,New,1,0
1,50844,57167,51950,PRETRIAL,Fisher,Kevin,,Male,Caucasian,12/05/92,...,1,Low,8,Risk of Recidivism,-1.06,2,Low,New,1,0
2,50844,57167,51950,PRETRIAL,Fisher,Kevin,,Male,Caucasian,12/05/92,...,1,Low,18,Risk of Failure to Appear,15.0,1,Low,New,1,0
3,50848,57174,51956,PRETRIAL,KENDALL,KEVIN,,Male,Caucasian,09/16/84,...,1,Low,7,Risk of Violence,-2.84,2,Low,New,1,0
4,50848,57174,51956,PRETRIAL,KENDALL,KEVIN,,Male,Caucasian,09/16/84,...,1,Low,8,Risk of Recidivism,-1.5,1,Low,New,1,0


In [3]:
# Cox Violent Parsed Dataset
df_cox = pd.read_csv("../data/cox-violent-parsed.csv")
print(f"Shape cox-violent-parsed: {df_cox.shape}")
print(f"Columns: {df_cox.columns.tolist()}")
df_cox.head()

Shape cox-violent-parsed: (18316, 52)
Columns: ['id', 'name', 'first', 'last', 'compas_screening_date', 'sex', 'dob', 'age', 'age_cat', 'race', 'juv_fel_count', 'decile_score', 'juv_misd_count', 'juv_other_count', 'priors_count', 'days_b_screening_arrest', 'c_jail_in', 'c_jail_out', 'c_case_number', 'c_offense_date', 'c_arrest_date', 'c_days_from_compas', 'c_charge_degree', 'c_charge_desc', 'is_recid', 'r_case_number', 'r_charge_degree', 'r_days_from_arrest', 'r_offense_date', 'r_charge_desc', 'r_jail_in', 'r_jail_out', 'violent_recid', 'is_violent_recid', 'vr_case_number', 'vr_charge_degree', 'vr_offense_date', 'vr_charge_desc', 'type_of_assessment', 'decile_score.1', 'score_text', 'screening_date', 'v_type_of_assessment', 'v_decile_score', 'v_score_text', 'v_screening_date', 'in_custody', 'out_custody', 'priors_count.1', 'start', 'end', 'event']


Unnamed: 0,id,name,first,last,compas_screening_date,sex,dob,age,age_cat,race,...,v_type_of_assessment,v_decile_score,v_score_text,v_screening_date,in_custody,out_custody,priors_count.1,start,end,event
0,1.0,miguel hernandez,miguel,hernandez,14/08/2013,Male,18/04/1947,69,Greater than 45,Other,...,Risk of Violence,1,Low,14/08/2013,07/07/2014,14/07/2014,0,0,327,0
1,2.0,miguel hernandez,miguel,hernandez,14/08/2013,Male,18/04/1947,69,Greater than 45,Other,...,Risk of Violence,1,Low,14/08/2013,07/07/2014,14/07/2014,0,334,961,0
2,3.0,michael ryan,michael,ryan,31/12/2014,Male,06/02/1985,31,25 - 45,Caucasian,...,Risk of Violence,2,Low,31/12/2014,30/12/2014,03/01/2015,0,3,457,0
3,4.0,kevon dixon,kevon,dixon,27/01/2013,Male,22/01/1982,34,25 - 45,African-American,...,Risk of Violence,1,Low,27/01/2013,26/01/2013,05/02/2013,0,9,159,1
4,5.0,ed philo,ed,philo,14/04/2013,Male,14/05/1991,24,Less than 25,African-American,...,Risk of Violence,3,Low,14/04/2013,16/06/2013,16/06/2013,4,0,63,0


In [4]:
# Cox Violent Parsed Filtered Dataset
df_cox_filt = pd.read_csv("../data/cox-violent-parsed_filt.csv")
print(f"Shape cox-violent-parsed_filt: {df_cox_filt.shape}")
print(f"Columns: {df_cox_filt.columns.tolist()}")
df_cox_filt.head()

Shape cox-violent-parsed_filt: (18316, 40)
Columns: ['id', 'name', 'first', 'last', 'sex', 'dob', 'age', 'age_cat', 'race', 'juv_fel_count', 'decile_score', 'juv_misd_count', 'juv_other_count', 'priors_count', 'days_b_screening_arrest', 'c_jail_in', 'c_jail_out', 'c_days_from_compas', 'c_charge_degree', 'c_charge_desc', 'is_recid', 'r_charge_degree', 'r_days_from_arrest', 'r_offense_date', 'r_charge_desc', 'r_jail_in', 'violent_recid', 'is_violent_recid', 'vr_charge_degree', 'vr_offense_date', 'vr_charge_desc', 'type_of_assessment', 'decile_score.1', 'score_text', 'screening_date', 'v_type_of_assessment', 'v_decile_score', 'v_score_text', 'priors_count.1', 'event']


Unnamed: 0,id,name,first,last,sex,dob,age,age_cat,race,juv_fel_count,...,vr_charge_desc,type_of_assessment,decile_score.1,score_text,screening_date,v_type_of_assessment,v_decile_score,v_score_text,priors_count.1,event
0,1.0,miguel hernandez,miguel,hernandez,Male,18/04/1947,69,Greater than 45,Other,0,...,,Risk of Recidivism,1,Low,14/08/2013,Risk of Violence,1,Low,0,0
1,2.0,miguel hernandez,miguel,hernandez,Male,18/04/1947,69,Greater than 45,Other,0,...,,Risk of Recidivism,1,Low,14/08/2013,Risk of Violence,1,Low,0,0
2,3.0,michael ryan,michael,ryan,Male,06/02/1985,31,25 - 45,Caucasian,0,...,,Risk of Recidivism,5,Medium,31/12/2014,Risk of Violence,2,Low,0,0
3,4.0,kevon dixon,kevon,dixon,Male,22/01/1982,34,25 - 45,African-American,0,...,Felony Battery (Dom Strang),Risk of Recidivism,3,Low,27/01/2013,Risk of Violence,1,Low,0,1
4,5.0,ed philo,ed,philo,Male,14/05/1991,24,Less than 25,African-American,0,...,,Risk of Recidivism,4,Low,14/04/2013,Risk of Violence,3,Low,4,0


In [5]:
# propublica data
df_propublica = pd.read_csv("../data/propublica_data_for_fairml.csv")
print(f"Shape propublica_data_for_fairml: {df_propublica.shape}")
print(f"Columns: {df_propublica.columns.tolist()}")
df_propublica.head()

Shape propublica_data_for_fairml: (6172, 12)
Columns: ['Two_yr_Recidivism', 'Number_of_Priors', 'score_factor', 'Age_Above_FourtyFive', 'Age_Below_TwentyFive', 'African_American', 'Asian', 'Hispanic', 'Native_American', 'Other', 'Female', 'Misdemeanor']


Unnamed: 0,Two_yr_Recidivism,Number_of_Priors,score_factor,Age_Above_FourtyFive,Age_Below_TwentyFive,African_American,Asian,Hispanic,Native_American,Other,Female,Misdemeanor
0,0,0,0,1,0,0,0,0,0,1,0,0
1,1,0,0,0,0,1,0,0,0,0,0,0
2,1,4,0,0,1,1,0,0,0,0,0,0
3,0,0,0,0,0,0,0,0,0,1,0,1
4,1,14,1,0,0,0,0,0,0,0,0,0


In [6]:
# Matched Columns between datasets
print("Matched Columns:")
compas_cols = set(df_compas.columns)
cox_cols = set(df_cox.columns)
cox_filt_cols = set(df_cox_filt.columns)
propublica_cols = set(df_propublica.columns)

print(f" - COMPAS y Cox: {compas_cols.intersection(cox_cols)}")
print(f" - Cox y Cox filtered: {cox_cols.intersection(cox_filt_cols)}")
print(f" - ProPublica y Cox: {propublica_cols.intersection(cox_cols)}")

Matched Columns:
 - COMPAS y Cox: set()
 - Cox y Cox filtered: {'race', 'r_charge_desc', 'first', 'c_charge_desc', 'type_of_assessment', 'vr_charge_degree', 'r_offense_date', 'vr_offense_date', 'juv_other_count', 'days_b_screening_arrest', 'decile_score.1', 'id', 'v_score_text', 'priors_count', 'r_days_from_arrest', 'priors_count.1', 'last', 'c_days_from_compas', 'dob', 'is_violent_recid', 'r_jail_in', 'v_decile_score', 'decile_score', 'name', 'vr_charge_desc', 'is_recid', 'juv_fel_count', 'age_cat', 'screening_date', 'sex', 'c_charge_degree', 'score_text', 'c_jail_in', 'event', 'violent_recid', 'juv_misd_count', 'v_type_of_assessment', 'age', 'r_charge_degree', 'c_jail_out'}
 - ProPublica y Cox: set()
