# Data Preparation for the Models

In [84]:
import pandas as pd
import utils
from sklearn.ensemble import RandomForestClassifier
from sklearn.linear_model import LogisticRegression
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score, roc_auc_score
from sklearn.preprocessing import LabelEncoder
from catboost import CatBoostClassifier

## Data Filtering and Preprocessing

We aim at predicting "is_recid". Some features in the dataset must be droped in order to remove features that might be directly linked to the target value, such as "is_violent_recid"

In [77]:
df = utils.read_data_files()
df = df.drop(df[df.is_recid == -1].index)

In [78]:
FEATURES = ['sex', 'age', 'age_cat', 'race',
            'juv_fel_count', 'decile_score', 'juv_misd_count', 'juv_other_count',
            'priors_count', 'days_b_screening_arrest', 'c_jail_in', 'c_jail_out',
            'c_days_from_compas', 'c_charge_degree', 'c_charge_desc', 
            'score_text', 'screening_date',
            'v_decile_score', 'v_score_text',
            'event']

TARGET = ['is_recid']

In [79]:
# Label encode categorical columns
df_encoded = df[FEATURES + TARGET].copy()
label_encoders = {}

for column in df_encoded.select_dtypes(include=['object']).columns:
    le = LabelEncoder()
    df_encoded[column] = le.fit_transform(df_encoded[column].astype(str))
    label_encoders[column] = le  # Store the label encoder for each column

# Now all columns should be numeric
X = df_encoded.drop('is_recid', axis=1)
y = df_encoded['is_recid']

In [80]:
# Handle missing values if any
X.fillna(X.mean(), inplace=True)

# Split the data into training and test sets
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.20, random_state=42)

## Models

### Random Forest SK Learn

In [81]:
# Create and fit the RandomForestClassifier
rf_model = RandomForestClassifier(random_state=42)
rf_model.fit(X_train, y_train)

# Predict on the test set
rf_predictions = rf_model.predict(X_test)

# Evaluate the model's performance
rf_accuracy = accuracy_score(y_test, rf_predictions)
rf_roc_auc = roc_auc_score(y_test, rf_model.predict_proba(X_test)[:, 1])

### CatBosst Classifier

In [88]:
# CatBoost Classifier
cat_model = CatBoostClassifier(iterations=200, learning_rate=0.1, depth=8, loss_function='Logloss')
cat_model.fit(X_train, y_train, verbose=False)
cat_predictions = cat_model.predict(X_test)
cat_accuracy = accuracy_score(y_test, cat_predictions)
cat_roc_auc = roc_auc_score(y_test, cat_model.predict_proba(X_test)[:, 1])

# Logistic Regression

In [89]:
# Logistic Regression
log_model = LogisticRegression(max_iter=10_000)
log_model.fit(X_train, y_train)
log_predictions = log_model.predict(X_test)
log_accuracy = accuracy_score(y_test, log_predictions)
log_roc_auc = roc_auc_score(y_test, log_model.predict_proba(X_test)[:, 1])

## Models Performace

In [90]:
# Print the performance indicators for each model
print(f"OLS Accuracy: {rf_accuracy}, ROC AUC: {rf_roc_auc}")
print(f"CatBoost Accuracy: {cat_accuracy}, ROC AUC: {cat_roc_auc}")
print(f"Logistic Regression Accuracy: {log_accuracy}, ROC AUC: {log_roc_auc}")

OLS Accuracy: 0.9191428571428572, ROC AUC: 0.9689803367556039
CatBoost Accuracy: 0.8174285714285714, ROC AUC: 0.8913686119939779
Logistic Regression Accuracy: 0.6908571428571428, ROC AUC: 0.7556532154357645
