In [1]:
import pickle
from sklearn.model_selection import train_test_split
import pandas as pd
from scipy.stats import chi2_contingency

## Import data and model

In [7]:
raw_data = pd.read_csv("../data/data_clean.csv")
data = raw_data[raw_data['birth date'] < 2020].copy()

data['age'] = data['parole board interview date'] - data['birth date']
data['jail duration'] = data['parole board interview date'] - data['year of entry']

others_parole_type =  ['PIE', 'SP CONSDR', 'ECPDO', 'MEDICAL','RESCISSION', 'DEPORT']
data['parole board interview type'] = data['parole board interview type'].replace(others_parole_type, 'OTHERS').replace('SUPP MERIT', 'MERIT TIME').replace('PV REAPP', 'REAPPEAR')

data = data.dropna(axis=0, subset=['crime 1 - class', 'parole eligibility date'])

df_one_hot = pd.get_dummies(data, columns=[
    "sex", "race / ethnicity"], drop_first=True)

df_one_hot = pd.get_dummies(df_one_hot, columns=[
    "crime 1 - class", "crime 2 - class",
    "crime 3 - class", "crime 4 - class",
    "parole board interview type"])

df_one_hot.drop(columns=['release date','birth date', 'year of entry'],inplace=True)

X_train, X_test, y_train, y_test = train_test_split(df_one_hot.drop('y',axis=1), df_one_hot['y'], 
                                                    stratify=df_one_hot['y'], test_size=0.3, random_state=42)

In [4]:
# Load the model from the .pkl file
xgb_classifier = pickle.load(open("../model.pkl", "rb"))

In [8]:
data_train, _, _, _ = train_test_split(data.drop('y',axis=1), data['y'],
                                                    stratify=data['y'], test_size=0.3, random_state=42)

## FPDP

### Feature Sex

In [9]:
X_train.head()

Unnamed: 0,parole board interview date,parole eligibility date,min_sentence,max_sentence,count of crimes,age,jail duration,sex_MALE,race / ethnicity_ASIAN/PACIFIC,race / ethnicity_BLACK,...,crime 3 - class_E,crime 4 - class_A,crime 4 - class_B,crime 4 - class_C,crime 4 - class_D,crime 4 - class_E,parole board interview type_INITIAL,parole board interview type_MERIT TIME,parole board interview type_OTHERS,parole board interview type_REAPPEAR
22045,2015,2015.0,1.0,3.0,1,58.0,1,True,False,False,...,False,False,False,False,False,False,True,False,False,False
12484,2013,2014.0,4.0,4.0,1,37.0,0,True,False,False,...,False,False,False,False,False,False,False,True,False,False
20431,2014,2015.0,4.0,4.0,1,42.0,0,True,False,False,...,False,False,False,False,False,False,True,False,False,False
9179,2013,2011.0,25.0,50.0,4,51.0,24,True,False,False,...,True,False,True,False,False,False,False,False,False,True
6694,2013,2013.0,10.0,8.0,1,62.0,6,True,False,True,...,False,False,False,False,False,False,True,False,False,False


In [13]:
X_train['sex_MALE'].unique()

array([ True, False])

In [36]:
# Set 'True' for all instances
X_train_true = X_train.copy()
X_train_true['sex_MALE'] = True

# Create a contingency table
contingency_table = pd.crosstab(X_train_true['sex_MALE'], xgb_classifier.predict(X_train_true))
print(contingency_table)

# Perform the chi-square test
chi2, p, a, b = chi2_contingency(contingency_table)
print(f"Chi-square value: {chi2}")
print(f"P-value: {p}")

col_0         0     1
sex_MALE             
True      12579  8530
Chi-square value: 0.0
P-value: 1.0


In [26]:
# Set 'False' for all instances
X_train_false = X_train.copy()
X_train_false['sex_MALE'] = False

# Create a contingency table
contingency_table = pd.crosstab(X_train['sex_MALE'], xgb_classifier.predict(X_train_false))
print(contingency_table)

# Perform the chi-square test
chi2, p, a, b = chi2_contingency(contingency_table)
print(f"Chi-square value: {chi2}")
print(f"P-value: {p}")

col_0        0      1
sex_MALE             
False      199   1167
True      7029  12714
Chi-square value: 250.11268086767475
P-value: 2.4539982933323717e-56


#### Feature Age

In [28]:
X_train_age_1 = X_train.copy()
X_train_age_1['age'] = 20

contingency_table = pd.crosstab(X_train['age'], xgb_classifier.predict(X_train_age_1))

# Perform the chi-square test
chi2, p, a, b = chi2_contingency(contingency_table)
print(f"Chi-square value: {chi2}")
print(f"P-value: {p}")

Chi-square value: 629.662595144586
P-value: 1.6336405480366143e-89


In [29]:
X_train_age_2 = X_train.copy()
X_train_age_2['age'] = 30

contingency_table = pd.crosstab(X_train['age'], xgb_classifier.predict(X_train_age_2))

# Perform the chi-square test
chi2, p, a, b = chi2_contingency(contingency_table)
print(f"Chi-square value: {chi2}")
print(f"P-value: {p}")

Chi-square value: 523.8284017912417
P-value: 2.3379528220323925e-69


In [35]:
X_train_age_2 = X_train.copy()
X_train_age_2['age'] = 30

contingency_table = pd.crosstab(X_train['age'], xgb_classifier.predict(X_train_age_2))

# Perform the chi-square test
chi2, p, a, b = chi2_contingency(contingency_table)
print(f"Chi-square value: {chi2}")
print(f"P-value: {p}")

Chi-square value: 523.8284017912417
P-value: 2.3379528220323925e-69
