In [1]:
from IPython.core.display import display, HTML
display(HTML("<style>.container { width:100% !important; }</style>"))
import pandas as pd
import sqlite3
import pickle
import os
import joblib

from sklearn.metrics import accuracy_score, precision_score, recall_score
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import RandomForestClassifier, AdaBoostClassifier, GradientBoostingClassifier
from sklearn.ensemble import BaggingClassifier
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LogisticRegression
from sklearn.model_selection import cross_val_score
from sklearn.preprocessing import OneHotEncoder, LabelEncoder
from sklearn.metrics import plot_confusion_matrix
from sklearn.metrics import f1_score, accuracy_score
from sklearn.pipeline import Pipeline, make_pipeline
from sklearn.compose import ColumnTransformer, make_column_transformer

import warnings
warnings.filterwarnings("ignore")

# Creating a pipeline for the COMPAS dataset

Please note that the results from this pipeline might differ from the standalone pickle version, because a different encoding method is used here.

In [2]:
# Load the COMPAS two-year scores CSV straight from the ProPublica repository.
url = 'https://raw.githubusercontent.com/propublica/compas-analysis/master/compas-scores-two-years.csv'
df = pd.read_csv(url)

# Keep a row only when all four conditions hold. The conditions are combined
# into ONE boolean mask built on the unfiltered frame; chaining
# df[m1][m2][m3][m4] instead would index an already-filtered frame with masks
# of the original length, forcing pandas to reindex each mask (and emit a
# "Boolean Series key will be reindexed" warning) on every step.
keep_rows = (
    (df["days_b_screening_arrest"] <= 30)
    & (df["days_b_screening_arrest"] >= -30)
    & (df["score_text"] != "N/A")
    & (df["is_recid"] != -1)
)
df = df[keep_rows]

## Split the dataset here

In [3]:
# Features: every column except the label.
X = df.drop(columns="two_year_recid")
# Target: the label column, kept as a single-column DataFrame (not a Series).
y = df.loc[:, ["two_year_recid"]]

In [4]:
# Fixed random_state so the split (and everything fitted on it) is reproducible.
X_train, X_test, y_train, y_test = train_test_split(X, y, random_state=0)

# Both numbers are labelled "X" in the message: the first is the number of
# training rows, the second the number of test rows.
print("X = {0}, X = {1}".format(X_train.shape[0], X_test.shape[0]))

X = 4629, X = 1543


## Create a pipeline with data transformation

In [8]:
class columnSelectTransformer():
    """Custom transformer that keeps only a fixed selection of columns.

    The class is duck-typed rather than derived from a scikit-learn base
    class, so it spells out the methods a pipeline step needs itself:
    ``fit``/``transform``/``fit_transform`` for use as a step, and
    ``get_params``/``set_params`` so the object can be cloned (cloning is
    what cross-validation and parameter-search utilities do before fitting).

    Parameters
    ----------
    columns : label or list of labels
        Column selection passed verbatim to ``X[columns]`` in ``transform``.
    """
    def __init__(self, columns):
        # Stored unmodified: cloning re-creates the object from get_params()
        # and expects the constructor to keep the very same object.
        self.columns = columns

    def fit(self, X, y=None):
        """Nothing is learned from the data; return ``self`` for chaining."""
        return self

    def transform(self, X, y=None):
        """Return ``X`` restricted to the configured columns."""
        return X[self.columns]

    def fit_transform(self, X, y=None):
        """Equivalent to ``fit(X, y).transform(X)``."""
        return self.fit(X, y).transform(X)

    def get_params(self, deep=True):
        """Return the constructor arguments as a dict.

        ``deep`` is accepted for API compatibility and has no effect because
        the only parameter is not itself an estimator.
        """
        return {"columns": self.columns}

    def set_params(self, **params):
        """Update constructor arguments in place and return ``self``.

        Raises
        ------
        ValueError
            If a name other than ``columns`` is supplied.
        """
        for name, value in params.items():
            if name != "columns":
                raise ValueError(
                    "Invalid parameter %r for columnSelectTransformer; "
                    "the only valid parameter is 'columns'." % (name,)
                )
            self.columns = value
        return self

In [9]:
# Three-stage pipeline:
#   1. "selector"      - pass through the four categorical columns, drop the rest
#   2. "columnEncoder" - one-hot encode those columns into a dense array
#   3. "predictor"     - logistic regression on the encoded features
pipeline = Pipeline(steps=[
    ("selector", ColumnTransformer(
        [("selector", "passthrough", ["race", "sex", "age_cat", "c_charge_degree"])],
        remainder="drop",
    )),
    # handle_unknown="ignore": a category that is absent from the data the
    # encoder was fitted on (possible for rare values after a random split or
    # inside a cross-validation fold) is encoded as all zeros at predict time
    # instead of raising an error.
    ("columnEncoder", OneHotEncoder(sparse=False, handle_unknown="ignore")),
    ("predictor", LogisticRegression())
])

# y_train is a single-column DataFrame; flatten it to the 1-D shape the
# classifier expects rather than relying on an implicit conversion.
model = pipeline.fit(X_train, y_train.values.ravel())
predictions = model.predict(X_test)

In [10]:
# 5-fold cross-validation of the pipeline on the training split only
# (cross_val_score is called without an explicit `scoring` argument).
training_score = cross_val_score(model, X_train, y_train, cv=5)
# F1 of the held-out test predictions made by the fitted pipeline.
score = f1_score(y_test, predictions)

# Report the mean of the fold scores with a band of two standard deviations.
print("Accuracy: %0.2f(+/- %0.2f)" % (training_score.mean(), 2 * training_score.std()))
print("F1 Score: %0.2f" % (score))

Accuracy: 0.60(+/- 0.03)
F1 Score: 0.49


### Saving the pipeline

In [11]:
# Print the scikit-learn version of the running kernel.
# NOTE(review): presumably recorded because the pipeline pickled in the next
# cells should be loaded with a matching scikit-learn version - confirm.
import sklearn
print(sklearn.__version__)

1.0.2


In [14]:
# Persist the fitted pipeline twice, once with pickle and once with joblib.
# Each file is opened in a `with` block so the handle is flushed and closed
# as soon as the dump finishes instead of being left open in the kernel.
with open("../model/pickle_scikit_lr_compas.pkl", "wb+") as pickle_file:
    pickle.dump(model, pickle_file)

with open("../model/joblib_scikit_lr_compas.pkl", "wb+") as joblib_file:
    joblib.dump(model, joblib_file)

In [15]:
# Persist the four train/test splits next to the model. The destination
# paths are spelled out in full so they stay searchable.
split_files = {
    "../data/pickle_scikit_lr_compas_xtrain_pkl.sav": X_train,
    "../data/pickle_scikit_lr_compas_ytrain_pkl.sav": y_train,
    "../data/pickle_scikit_lr_compas_xtest_pkl.sav": X_test,
    "../data/pickle_scikit_lr_compas_ytest_pkl.sav": y_test,
}
for split_path, split_data in split_files.items():
    # `with` closes every file handle deterministically; the bare
    # open(...) calls used before were never closed.
    with open(split_path, "wb+") as split_file:
        pickle.dump(split_data, split_file)

In [94]:
# Show the third row (position 2) of the test features as a Series.
X_test.iloc[2]

id                                                   6165
name                                      patrick burrows
first                                             patrick
last                                              burrows
compas_screening_date                          2013-04-20
sex                                                  Male
dob                                            1981-10-31
age                                                    34
age_cat                                           25 - 45
race                                            Caucasian
juv_fel_count                                           0
decile_score                                            2
juv_misd_count                                          0
juv_other_count                                         0
priors_count                                            5
days_b_screening_arrest                              -1.0
c_jail_in                             2013-04-19 03:20:12
c_jail_out    