In [1]:
import os
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
# Activate seaborn's theme with its default settings (no arguments are passed).
sns.set_theme()

# Compare different datasets

- ProPublica obtained a dataset of pretrial defendants and probationers from Broward County, FL, who had been assessed with the COMPAS screening system between January 1, 2013, and December 31, 2014.
- COMPAS recidivism risk scores are based on a defendant’s answers to the COMPAS screening survey. The survey is completed by pre-trial services in cooperation with the defendant after his or her arrest. 
- The COMPAS survey, at least in the ProPublica data, is typically administered the same day or the day after a person is jailed.
- For the more than 11 thousand pretrial defendants in this dataset, ProPublica then collected data on future arrests through the end of March 2016, in order to study how the COMPAS score predicts recidivism for these defendants.
- ProPublica collected the data for its study and created a database. From that database, it constructed various sub-datasets that merged and calculated various important features, for example an indicator for a re-arrest for a new crime within two years of the original one, and the period of time between arrests. ProPublica then exported these sub-datasets into .csv files, which it named `compas-scores.csv` and `compas-scores-two-years.csv`. 
- The first file `compas-scores.csv` contains the full dataset of pretrial defendants that ProPublica obtained from the Broward County Sheriff’s Office. This file contains 11,757 people.
- The second file `compas-scores-two-years.csv` is a file that ProPublica created for the purpose of studying two-year general recidivism. The term general recidivism is used to distinguish it from the smaller subset of violent recidivism. General recidivism includes both violent and non-violent offenses. I focus on the two-year general recidivism dataset
- The two-year general recidivism file contains 7,214 people.<br>
<br>https://arxiv.org/pdf/1906.04711.pdf

## Load datasets

In [60]:
# Read the three ProPublica COMPAS exports into dataframes, in this order:
#   cs              <- compas-scores.csv
#   cs_twoyears     <- compas-scores-two-years.csv
#   cs_twoyears_vio <- compas-scores-two-years-violent.csv
#                      (described by the author as a "subset" of the two-years file)
cs, cs_twoyears, cs_twoyears_vio = (
    pd.read_csv(csv_path)
    for csv_path in (
        "../../data-ProPublica/compas-scores.csv",
        "../../data-ProPublica/compas-scores-two-years.csv",
        "../../data-ProPublica/compas-scores-two-years-violent.csv",
    )
)

In [61]:
# Show (rows, columns) for each dataframe, one tuple per line
print(cs.shape, cs_twoyears.shape, cs_twoyears_vio.shape, sep="\n")

(11757, 47)
(7214, 53)
(4743, 54)


## Check difference in columns

### Difference between `cs` & `cs_twoyears`

In [63]:
# Column names present in exactly one of the two frames (cs, cs_twoyears)
list(set(cs.columns) ^ set(cs_twoyears.columns))

['end',
 'start',
 'event',
 'in_custody',
 'priors_count.1',
 'violent_recid',
 'num_vr_cases',
 'out_custody',
 'two_year_recid',
 'num_r_cases']

Columns from "cs" df:
- `num_vr_cases`
- `num_r_cases`

Columns from "cs_twoyears" df:
- `end`
- `out_custody` 
- `start`
- `two_year_recid`
- `in_custody`
- `event`
- `violent_recid` 
- `priors_count.1`

In [71]:
# The two columns that exist only in cs hold nothing but NaN (see the non-null counts)
cs.loc[:, ["num_vr_cases", "num_r_cases"]].info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 11757 entries, 0 to 11756
Data columns (total 2 columns):
 #   Column        Non-Null Count  Dtype  
---  ------        --------------  -----  
 0   num_vr_cases  0 non-null      float64
 1   num_r_cases   0 non-null      float64
dtypes: float64(2)
memory usage: 183.8 KB


### Difference between `cs_twoyears` & `cs_twoyears_vio`

In [62]:
# Column names present in exactly one of the two frames (cs_twoyears, cs_twoyears_vio)
list(set(cs_twoyears.columns) ^ set(cs_twoyears_vio.columns))

['two_year_recid.1']

## Check for duplicates within the dataframes

### `cs`

In [74]:
# cs: columns whose values repeat another column.
# Transposing turns columns into rows, so drop_duplicates() removes the repeated
# ones; whatever names are missing from the surviving index were duplicates.
set(cs.columns) - set(cs.T.drop_duplicates().index)

{'decile_score.1', 'num_vr_cases', 'screening_date', 'v_screening_date'}

### `cs_twoyears`

In [73]:
# cs_twoyears: columns whose values repeat another column.
# Transposing turns columns into rows, so drop_duplicates() removes the repeated
# ones; whatever names are missing from the surviving index were duplicates.
set(cs_twoyears.columns) - set(cs_twoyears.T.drop_duplicates().index)

{'decile_score.1', 'priors_count.1', 'screening_date', 'v_screening_date'}

### `cs_twoyears_vio`

In [75]:
# cs_twoyears_vio: columns whose values repeat another column.
# Transposing turns columns into rows, so drop_duplicates() removes the repeated
# ones; whatever names are missing from the surviving index were duplicates.
set(cs_twoyears_vio.columns) - set(cs_twoyears_vio.T.drop_duplicates().index)

{'decile_score.1',
 'priors_count.1',
 'screening_date',
 'two_year_recid.1',
 'v_screening_date'}