In [40]:
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split, cross_val_score
from sklearn.preprocessing import StandardScaler, OneHotEncoder
from sklearn.compose import ColumnTransformer
from sklearn.linear_model import LogisticRegression
from sklearn.ensemble import RandomForestClassifier, RandomForestRegressor
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score, classification_report
from collections import defaultdict
import urllib
import os,sys
from sklearn import feature_extraction
from sklearn import preprocessing

from collections import defaultdict

# find the csv file
csv_file = 'compas-scores-two-years.csv'
# read the file into a data frame
df = pd.read_csv(csv_file, engine='python', on_bad_lines='skip')
print(df.head())
with open(csv_file, "r") as f:
    first_line = f.readline().strip()
# view some of the data
parts = first_line.split(",")
for part in parts: 
    print(part)
df.info()
df.describe(include='all').transpose()

# make copy of data for later
df_copy = pd.read_csv(csv_file, engine='python', on_bad_lines='skip')

   id                name   first         last compas_screening_date   sex  \
0   1    miguel hernandez  miguel    hernandez            2013-08-14  Male   
1   3         kevon dixon   kevon        dixon            2013-01-27  Male   
2   4            ed philo      ed        philo            2013-04-14  Male   
3   5         marcu brown   marcu        brown            2013-01-13  Male   
4   6  bouthy pierrelouis  bouthy  pierrelouis            2013-03-26  Male   

          dob  age          age_cat              race  ...  v_decile_score  \
0  1947-04-18   69  Greater than 45             Other  ...               1   
1  1982-01-22   34          25 - 45  African-American  ...               1   
2  1991-05-14   24     Less than 25  African-American  ...               3   
3  1993-01-21   23     Less than 25  African-American  ...               6   
4  1973-01-22   43          25 - 45             Other  ...               1   

   v_score_text  v_screening_date  in_custody  out_custody  pr

In [41]:
FEATURES_CLASS = ["age_cat", "race", "sex", "priors_count", "c_charge_degree"]
CONTI_FEATURE = ["priors_count"]
CLASS_FEATURE = "two_year_recid"
SENSITIVE_FEATURE = "race"
data = df.to_dict('list')
for k in data.keys():
	data[k] = np.array(data[k])
y = data[CLASS_FEATURE]
y[y==0] = -1


print ("\nNumber of people recidivating within two years")
print (pd.Series(y).value_counts())
X = np.array([]).reshape(len(y), 0)
x_control = defaultdict(list)
feature_names = []

for attr in FEATURES_CLASS:
		vals = data[attr]
		if attr in CONTI_FEATURE:
			vals = [float(v) for v in vals]
			vals = preprocessing.scale(vals) # 0 mean and 1 variance  
			vals = np.reshape(vals, (len(y), -1)) # convert from 1-d arr to a 2-d arr with one col

		else: # for binary categorical variables, the label binarizer uses just one var instead of two
			lb = preprocessing.LabelBinarizer()
			lb.fit(vals)
			vals = lb.transform(vals)

		# add to sensitive features dict
		if attr in SENSITIVE_FEATURE:
			x_control[attr] = vals


		# add to learnable features
		X = np.hstack((X, vals))

		if attr in CONTI_FEATURE: # continuous feature, just append the name
			feature_names.append(attr)
		else: # categorical features
			if vals.shape[1] == 1: # binary features that passed through lib binarizer
				feature_names.append(attr)
			else:
				for k in lb.classes_: # non-binary categorical features, need to add the names for each cat
					feature_names.append(attr + "_" + str(k))


# convert the sensitive feature to 1-d array
x_control = dict(x_control)
for k in x_control.keys():
		x_control[k] = np.array(x_control[k]).flatten()
	
print(x_control)
print(X.shape)
print(y.shape)
print(feature_names)


Number of people recidivating within two years
-1    3963
 1    3251
Name: count, dtype: int64
{'race': array([0, 0, 0, ..., 1, 0, 0], shape=(43284,))}
(7214, 12)
(7214,)
['age_cat_25 - 45', 'age_cat_Greater than 45', 'age_cat_Less than 25', 'race_African-American', 'race_Asian', 'race_Caucasian', 'race_Hispanic', 'race_Native American', 'race_Other', 'sex', 'priors_count', 'c_charge_degree']


In [42]:
def my_split_into_train_test(x_all, y_all, x_control_all, train_size):

    split_point = int(round(float(x_all.shape[0]) * train_size))
    x_all_train = x_all[:split_point]
    x_all_test = x_all[split_point:]
    y_all_train = y_all[:split_point]
    y_all_test = y_all[split_point:]
    x_control_all_train = {}
    x_control_all_test = {}
    for k in x_control_all.keys():
        x_control_all_train[k] = x_control_all[k][:split_point]
        x_control_all_test[k] = x_control_all[k][split_point:]

    return x_all_train, y_all_train, x_control_all_train, x_all_test, y_all_test, x_control_all_test
def my_split (x_all,y_all,train_size):
  split_point = int(round(float(x_all.shape[0]) * train_size))
  x_all_train = x_all[:split_point]
  x_all_test = x_all[split_point:]
  y_all_train = y_all[:split_point]
  y_all_test = y_all[split_point:]

  return x_all_train, y_all_train, x_all_test, y_all_test
from sklearn.preprocessing import StandardScaler
from sklearn.metrics import accuracy_score
x_train, y_train, x_test, y_test = my_split(X,y,0.2)

In [43]:
# train model 1 on balanced weights
model1 = LogisticRegression(class_weight='balanced')
model1.fit(x_train, y_train)
y_pred = model1.predict(x_test)
print("Accuracy:", accuracy_score(y_test, y_pred))
model1accuracy = accuracy_score(y_test, y_pred)
print(classification_report(y_test, y_pred))
model1report = classification_report(y_test, y_pred)
scores1 = cross_val_score(model1, X, y, cv=5, scoring="accuracy")
print(scores1)

Accuracy: 0.6570784959279155
              precision    recall  f1-score   support

          -1       0.69      0.67      0.68      3151
           1       0.62      0.64      0.63      2620

    accuracy                           0.66      5771
   macro avg       0.65      0.66      0.65      5771
weighted avg       0.66      0.66      0.66      5771

[0.67428967 0.66389466 0.64449064 0.67567568 0.65950069]


In [44]:
# train model 2 on balanced weights
model2 = RandomForestClassifier(class_weight='balanced')
model2.fit(x_train, y_train)
y_pred = model2.predict(x_test)
print("Accuracy:", accuracy_score(y_test, y_pred))
model2accuracy = accuracy_score(y_test, y_pred)
print(classification_report(y_test, y_pred))
model2report = classification_report(y_test, y_pred)
scores2 = cross_val_score(model2, X, y, cv=5, scoring="accuracy")
print(scores2)

Accuracy: 0.628140703517588
              precision    recall  f1-score   support

          -1       0.65      0.69      0.67      3151
           1       0.60      0.56      0.58      2620

    accuracy                           0.63      5771
   macro avg       0.62      0.62      0.62      5771
weighted avg       0.63      0.63      0.63      5771

[0.66112266 0.65003465 0.63063063 0.66181566 0.64840499]


In [45]:
# re-train model 1 on normal weights
model3 = LogisticRegression()
model3.fit(x_train, y_train)
y_pred = model3.predict(x_test)
print("Accuracy:", accuracy_score(y_test, y_pred))
model3accuracy = accuracy_score(y_test, y_pred)
print(classification_report(y_test, y_pred))
model3report = classification_report(y_test, y_pred)
scores3 = cross_val_score(model3, X, y, cv=5, scoring="accuracy")
print(scores3)

Accuracy: 0.660370819615318
              precision    recall  f1-score   support

          -1       0.66      0.78      0.71      3151
           1       0.66      0.52      0.58      2620

    accuracy                           0.66      5771
   macro avg       0.66      0.65      0.65      5771
weighted avg       0.66      0.66      0.65      5771

[0.67706168 0.66597367 0.65003465 0.67636868 0.65603329]


In [46]:
# re-train model 2 on normal weights
model4 = RandomForestClassifier()
model4.fit(x_train, y_train)
y_pred = model4.predict(x_test)
print("Accuracy:", accuracy_score(y_test, y_pred))
model4accuracy = accuracy_score(y_test, y_pred)
print(classification_report(y_test, y_pred))
model4report = classification_report(y_test, y_pred)
scores4 = cross_val_score(model4, X, y, cv=5, scoring="accuracy")
print(scores4)

Accuracy: 0.6357650320568359
              precision    recall  f1-score   support

          -1       0.65      0.71      0.68      3151
           1       0.61      0.54      0.58      2620

    accuracy                           0.64      5771
   macro avg       0.63      0.63      0.63      5771
weighted avg       0.63      0.64      0.63      5771

[0.66042966 0.65349965 0.63201663 0.64795565 0.65464632]


In [47]:
# re-train both models after removing possible sensitive info for less bias
FEATURES_CLASS = ["priors_count", "c_charge_degree"]
CONTI_FEATURE = ["priors_count"]
CLASS_FEATURE = "two_year_recid"
SENSITIVE_FEATURE = "race"
data = df_copy.to_dict('list')
for k in data.keys():
	data[k] = np.array(data[k])
y = data[CLASS_FEATURE]
y[y==0] = -1


print ("\nNumber of people recidivating within two years")
print (pd.Series(y).value_counts())
X = np.array([]).reshape(len(y), 0)
x_control = defaultdict(list)
feature_names = []

for attr in FEATURES_CLASS:
	vals = data[attr]
	if attr in CONTI_FEATURE:
		vals = vals.astype(float)
		vals = preprocessing.scale(vals).reshape(len(y), -1)
		feature_names.append(attr)
	else:
		lb = preprocessing.LabelBinarizer()
		lb.fit(vals)
		vals = lb.transform(vals)
		if vals.shape[1] == 1:
			feature_names.append(attr)
		else:
			for k in lb.classes_:
				feature_names.append(f"{attr}_{k}")
	X = np.hstack((X, vals))
	
	if attr in CONTI_FEATURE: # continuous feature, just append the name
		feature_names.append(attr)
	else: # categorical features
		if vals.shape[1] == 1: # binary features that passed through lib binarizer
			feature_names.append(attr)
		else:
			for k in lb.classes_: # non-binary categorical features, need to add the names for each cat
				feature_names.append(attr + "_" + str(k))


# convert the sensitive feature to 1-d array
x_control = dict(x_control)
for k in x_control.keys():
		x_control[k] = np.array(x_control[k]).flatten()
	
print(x_control)
print(X.shape)
print(y.shape)
print(feature_names)
def my_split_into_train_test(x_all, y_all, x_control_all, train_size):

    split_point = int(round(float(x_all.shape[0]) * train_size))
    x_all_train = x_all[:split_point]
    x_all_test = x_all[split_point:]
    y_all_train = y_all[:split_point]
    y_all_test = y_all[split_point:]
    x_control_all_train = {}
    x_control_all_test = {}
    for k in x_control_all.keys():
        x_control_all_train[k] = x_control_all[k][:split_point]
        x_control_all_test[k] = x_control_all[k][split_point:]

    return x_all_train, y_all_train, x_control_all_train, x_all_test, y_all_test, x_control_all_test
def my_split (x_all,y_all,train_size):
  split_point = int(round(float(x_all.shape[0]) * train_size))
  x_all_train = x_all[:split_point]
  x_all_test = x_all[split_point:]
  y_all_train = y_all[:split_point]
  y_all_test = y_all[split_point:]

  return x_all_train, y_all_train, x_all_test, y_all_test
from sklearn.preprocessing import StandardScaler
from sklearn.metrics import accuracy_score
x_train, y_train, x_test, y_test = my_split(X,y,0.2)


Number of people recidivating within two years
-1    3963
 1    3251
Name: count, dtype: int64
{}
(7214, 2)
(7214,)
['priors_count', 'priors_count', 'c_charge_degree', 'c_charge_degree']


In [48]:
# re-train model 1 on balanced weights and "unbiased" data
model5 = LogisticRegression(class_weight='balanced')
model5.fit(x_train, y_train)
y_pred = model5.predict(x_test)
print("Accuracy:", accuracy_score(y_test, y_pred))
model5accuracy = accuracy_score(y_test, y_pred)
print(classification_report(y_test, y_pred))
model5report = classification_report(y_test, y_pred)
scores5 = cross_val_score(model5, X, y, cv=5, scoring="accuracy")
print(scores5)

Accuracy: 0.6394039161323861
              precision    recall  f1-score   support

          -1       0.64      0.77      0.70      3151
           1       0.63      0.49      0.55      2620

    accuracy                           0.64      5771
   macro avg       0.64      0.63      0.62      5771
weighted avg       0.64      0.64      0.63      5771

[0.65349965 0.64033264 0.60776161 0.66250866 0.64909847]


In [49]:
# re-train model 2 on balanced weights and "unbiased" data
model6 = RandomForestClassifier(class_weight='balanced')
model6.fit(x_train, y_train)
y_pred = model6.predict(x_test)
print("Accuracy:", accuracy_score(y_test, y_pred))
model6accuracy = accuracy_score(y_test, y_pred)
print(classification_report(y_test, y_pred))
model6report = classification_report(y_test, y_pred)
scores6 = cross_val_score(model6, X, y, cv=5, scoring="accuracy")
print(scores6)

Accuracy: 0.6340322301160978
              precision    recall  f1-score   support

          -1       0.64      0.76      0.69      3151
           1       0.62      0.49      0.55      2620

    accuracy                           0.63      5771
   macro avg       0.63      0.62      0.62      5771
weighted avg       0.63      0.63      0.63      5771

[0.63825364 0.62924463 0.60637561 0.64795565 0.6518724 ]


In [50]:
# re-train model 1 on normal weights and "unbiased" data
model7 = LogisticRegression()
model7.fit(x_train, y_train)
y_pred = model7.predict(x_test)
print("Accuracy:", accuracy_score(y_test, y_pred))
model7accuracy = accuracy_score(y_test, y_pred)
print(classification_report(y_test, y_pred))
model7report = classification_report(y_test, y_pred)
scores7 = cross_val_score(model7, X, y, cv=5, scoring="accuracy")
print(scores7)

Accuracy: 0.6314330272049905
              precision    recall  f1-score   support

          -1       0.62      0.86      0.72      3151
           1       0.68      0.36      0.47      2620

    accuracy                           0.63      5771
   macro avg       0.65      0.61      0.59      5771
weighted avg       0.64      0.63      0.60      5771

[0.63825364 0.63894664 0.60706861 0.64726265 0.63661581]


In [51]:
# re-train model 2 on normal weights and "unbiased" data
model8 = RandomForestClassifier()
model8.fit(x_train, y_train)
y_pred = model8.predict(x_test)
print("Accuracy:", accuracy_score(y_test, y_pred))
model8accuracy = accuracy_score(y_test, y_pred)
print(classification_report(y_test, y_pred))
model8report = classification_report(y_test, y_pred)
scores8 = cross_val_score(model8, X, y, cv=5, scoring="accuracy")
print(scores8)

Accuracy: 0.6236354184716687
              precision    recall  f1-score   support

          -1       0.62      0.81      0.70      3151
           1       0.63      0.40      0.49      2620

    accuracy                           0.62      5771
   macro avg       0.63      0.61      0.60      5771
weighted avg       0.63      0.62      0.61      5771

[0.65142065 0.63825364 0.60984061 0.65627166 0.64979196]


In [52]:
# output the results to output.txt
with open("output.txt", "w") as f:
    f.write(f"Model 1 (biased and balanced):\n")
    f.write(f"Accuracy: {model1accuracy}\n")
    f.write("Classification Report:\n")
    f.write(model1report)
    f.write("\n")
    f.write(f"Cross validation scores: {scores1}\n\n")
    f.write("Model 2 (biased and balanced):\n")
    f.write(f"Accuracy: {model2accuracy}\n")
    f.write("Classification Report:\n")
    f.write(model2report)
    f.write("\n")
    f.write(f"Cross validation scores: {scores2}\n\n")
    f.write(f"Model 3 (1 but biased and unbalanced):\n")
    f.write(f"Accuracy: {model3accuracy}\n")
    f.write("Classification Report:\n")
    f.write(model3report)
    f.write("\n")
    f.write(f"Cross validation scores: {scores3}\n\n")
    f.write("Model 4 (2 but biased and unbalanced):\n")
    f.write(f"Accuracy: {model4accuracy}\n")
    f.write("Classification Report:\n")
    f.write(model4report)
    f.write("\n")
    f.write(f"Cross validation scores: {scores4}\n\n")
    f.write(f"Model 5 (1 but unbiased and balanced):\n")
    f.write(f"Accuracy: {model5accuracy}\n")
    f.write("Classification Report:\n")
    f.write(model5report)
    f.write("\n")
    f.write(f"Cross validation scores: {scores5}\n\n")
    f.write("Model 6 (2 but unbiased and balanced):\n")
    f.write(f"Accuracy: {model6accuracy}\n")
    f.write("Classification Report:\n")
    f.write(model6report)
    f.write("\n")
    f.write(f"Cross validation scores: {scores6}\n\n")
    f.write(f"Model 7 (1 but unbiased and unbalanced):\n")
    f.write(f"Accuracy: {model7accuracy}\n")
    f.write("Classification Report:\n")
    f.write(model7report)
    f.write("\n")
    f.write(f"Cross validation scores: {scores7}\n\n")
    f.write("Model 8 (2 but unbiased and unbalanced):\n")
    f.write(f"Accuracy: {model8accuracy}\n")
    f.write("Classification Report:\n")
    f.write(model8report)
    f.write("\n")
    f.write(f"Cross validation scores: {scores8}\n\n")
    f.write(f"Average accuracy of models 1 and 2: {(model1accuracy + model2accuracy) / 2}\n")
    f.write(f"Average accuracy of models 3 and 4: {(model3accuracy + model4accuracy) / 2}\n")
    f.write(f"Average accuracy of models 5 and 6: {(model5accuracy + model6accuracy) / 2}\n")
    f.write(f"Average accuracy of models 7 and 8: {(model7accuracy + model8accuracy) / 2}\n")