In [1]:
import pandas as pd
from sklearn.preprocessing import OneHotEncoder, LabelBinarizer
import numpy as np

import warnings
warnings.filterwarnings('ignore')

data = pd.read_csv("compas-scores-two-years.csv")

keep_cols = ['age', 'age_cat', 'sex', 'race',  'priors_count', 'days_b_screening_arrest', 'c_jail_in', 'c_jail_out',
               'c_charge_degree', 'is_recid', 'is_violent_recid', 'two_year_recid', 'decile_score', 'score_text']

df = data[keep_cols]

target_col = 'class'

df['days_b_screening_arrest'] = np.abs(df['days_b_screening_arrest'])

df['c_jail_out'] = pd.to_datetime(df['c_jail_out'])
df['c_jail_in'] = pd.to_datetime(df['c_jail_in'])
df['length_of_stay'] = (df['c_jail_out'] - df['c_jail_in']).dt.days
df['length_of_stay'] = np.abs(df['length_of_stay'])

df['length_of_stay'].fillna(df['length_of_stay'].value_counts().index[0], inplace=True)
df['days_b_screening_arrest'].fillna(df['days_b_screening_arrest'].value_counts().index[0], inplace=True)

df['length_of_stay'] = df['length_of_stay'].astype(int)
df['days_b_screening_arrest'] = df['days_b_screening_arrest'].astype(int)

def get_class(x):
    if x < 7:
        return 'Medium-Low'
    else:
        return 'High'

df['class'] = df['decile_score'].apply(get_class)

del df['c_jail_in']
del df['c_jail_out']
del df['decile_score']
del df['score_text']

feature_names = [col for col in df.columns if col != target_col]

possible_outcomes = list(df[target_col].unique())
print("Possible outcomes:", possible_outcomes)

pro_data = df.dropna()

non_num_cols = [column for column in pro_data.drop([target_col], axis = 1).columns if pro_data[column].dtypes == "O"]
num_cols = [column for column in pro_data.drop([target_col], axis = 1).columns if column not in non_num_cols]

num_data = pro_data[num_cols]
cat_data = pro_data[non_num_cols]

other_cat_cols = ['is_recid', 'is_violent_recid', 'two_year_recid']

Possible outcomes: ['Medium-Low', 'High']


In [2]:
num_cols

['age',
 'priors_count',
 'days_b_screening_arrest',
 'is_recid',
 'is_violent_recid',
 'two_year_recid',
 'length_of_stay']

In [3]:
encoder = OneHotEncoder(sparse = True)
enc_data = encoder.fit_transform(cat_data)
enc_df = pd.DataFrame.sparse.from_spmatrix(enc_data, columns = encoder.get_feature_names(cat_data.columns))
enc_df

Unnamed: 0,age_cat_25 - 45,age_cat_Greater than 45,age_cat_Less than 25,sex_Female,sex_Male,race_African-American,race_Asian,race_Caucasian,race_Hispanic,race_Native American,race_Other,c_charge_degree_F,c_charge_degree_M
0,0.0,1.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,1.0,1.0,0.0
1,1.0,0.0,0.0,0.0,1.0,1.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0
2,0.0,0.0,1.0,0.0,1.0,1.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0
3,0.0,0.0,1.0,0.0,1.0,1.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0
4,1.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,1.0,1.0,0.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...
7209,0.0,0.0,1.0,0.0,1.0,1.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0
7210,0.0,0.0,1.0,0.0,1.0,1.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0
7211,0.0,1.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,1.0,1.0,0.0
7212,1.0,0.0,0.0,1.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0


In [4]:
full_enc_df = enc_df.copy()
for each in other_cat_cols:
    full_enc_df[each] = pro_data[each]
#full_enc_df.to_csv("cat_cols.csv", index = False)

In [5]:
binarizer = LabelBinarizer()
target_data = binarizer.fit_transform(pro_data[target_col])
target_data = target_data.reshape(1, -1)[0]
enc_df['high_risk'] = target_data

In [6]:
enc_df

Unnamed: 0,age_cat_25 - 45,age_cat_Greater than 45,age_cat_Less than 25,sex_Female,sex_Male,race_African-American,race_Asian,race_Caucasian,race_Hispanic,race_Native American,race_Other,c_charge_degree_F,c_charge_degree_M,high_risk
0,0.0,1.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,1.0,1.0,0.0,1
1,1.0,0.0,0.0,0.0,1.0,1.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,1
2,0.0,0.0,1.0,0.0,1.0,1.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,1
3,0.0,0.0,1.0,0.0,1.0,1.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0
4,1.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,1.0,1.0,0.0,1
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
7209,0.0,0.0,1.0,0.0,1.0,1.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0
7210,0.0,0.0,1.0,0.0,1.0,1.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,1
7211,0.0,1.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,1.0,1.0,0.0,1
7212,1.0,0.0,0.0,1.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,1


In [7]:
fin_data = pd.concat([num_data.reset_index(), enc_df.reset_index()], axis = 1, join = 'inner').drop(['index'], axis = 1)
fin_data.to_csv("compas.csv", index = False)

In [8]:
import json

num_cols = [col for col in num_cols if col not in other_cat_cols]
one_hot_cols = list(enc_df.columns.drop('high_risk'))
one_hot_cols.extend(other_cat_cols)

col_dict = {"continuous": num_cols, "discrete": one_hot_cols}

form = json.dumps(col_dict)
f = open("col_dict.json", "w")
f.write(form)
f.close()

In [9]:
with open('col_dict.json') as file:
    col_dict = json.load(file)
file.close()
col_dict

{'continuous': ['age',
  'priors_count',
  'days_b_screening_arrest',
  'length_of_stay'],
 'discrete': ['age_cat_25 - 45',
  'age_cat_Greater than 45',
  'age_cat_Less than 25',
  'sex_Female',
  'sex_Male',
  'race_African-American',
  'race_Asian',
  'race_Caucasian',
  'race_Hispanic',
  'race_Native American',
  'race_Other',
  'c_charge_degree_F',
  'c_charge_degree_M',
  'is_recid',
  'is_violent_recid',
  'two_year_recid']}