In [1]:
import pandas as pd
import xgboost as xgb
from sklearn.model_selection import train_test_split
from sklearn.dummy import DummyClassifier
from sklearn.metrics import accuracy_score, balanced_accuracy_score, roc_auc_score, f1_score
from sklearn.model_selection import GridSearchCV
import seaborn as sns
import numpy as np
import matplotlib.pyplot as plt

### Data Preprocessing

In [2]:
raw_data = pd.read_csv("../data/data_clean.csv")
data = raw_data[raw_data['birth date'] < 2020].copy()

data['age'] = data['parole board interview date'] - data['birth date']
data['jail duration'] = data['parole board interview date'] - data['year of entry']

others_parole_type =  ['PIE', 'SP CONSDR', 'ECPDO', 'MEDICAL','RESCISSION', 'DEPORT']
data['parole board interview type'] = data['parole board interview type'].replace(others_parole_type, 'OTHERS').replace('SUPP MERIT', 'MERIT TIME').replace('PV REAPP', 'REAPPEAR')

data = data.dropna(axis=0, subset=['crime 1 - class', 'parole eligibility date'])

df_one_hot = pd.get_dummies(data, columns=[
    "sex", "race / ethnicity"], drop_first=True)

df_one_hot = pd.get_dummies(df_one_hot, columns=[
    "crime 1 - class", "crime 2 - class",
    "crime 3 - class", "crime 4 - class",
    "parole board interview type"])

df_one_hot.drop(columns=['release date','birth date', 'year of entry'],inplace=True)

X_train, X_test, y_train, y_test = train_test_split(df_one_hot.drop('y',axis=1), df_one_hot['y'], 
                                                    stratify=df_one_hot['y'], test_size=0.3, random_state=42)

In [3]:
y_train.value_counts()

y
0    16818
1     4291
Name: count, dtype: int64

### Dummy Classifier

In [4]:
# Create a DummyClassifier
dummy_classifier = DummyClassifier(strategy='most_frequent')

# Fit the DummyClassifier on the training data
dummy_classifier.fit(X_train, y_train)

# Make predictions on both the training and test datasets
train_predictions = dummy_classifier.predict(X_train)
test_predictions = dummy_classifier.predict(X_test)

# Calculate evaluation metrics for the training dataset
train_accuracy = accuracy_score(y_train, train_predictions)
train_balanced_accuracy = balanced_accuracy_score(y_train, train_predictions)
train_roc_auc = roc_auc_score(y_train, train_predictions)
train_f1 = f1_score(y_train, train_predictions)

# Calculate evaluation metrics for the test dataset
test_accuracy = accuracy_score(y_test, test_predictions)
test_balanced_accuracy = balanced_accuracy_score(y_test, test_predictions)
test_roc_auc = roc_auc_score(y_test, test_predictions)
test_f1 = f1_score(y_test, test_predictions)

# Print the evaluation metrics
print("Training Metrics:")
print(f"Accuracy: {train_accuracy:.2f}")
print(f"Balanced Accuracy: {train_balanced_accuracy:.2f}")
print(f"ROC AUC: {train_roc_auc:.2f}")
print(f"F1 Score: {train_f1:.2f}")
print("\nTest Metrics:")
print(f"Accuracy: {test_accuracy:.2f}")
print(f"Balanced Accuracy: {test_balanced_accuracy:.2f}")
print(f"ROC AUC: {test_roc_auc:.2f}")
print(f"F1 Score: {test_f1:.2f}")

Training Metrics:
Accuracy: 0.80
Balanced Accuracy: 0.50
ROC AUC: 0.50
F1 Score: 0.00

Test Metrics:
Accuracy: 0.80
Balanced Accuracy: 0.50
ROC AUC: 0.50
F1 Score: 0.00


### XGBoost

In [5]:
# Create and configure the XGBoost classifier with balanced class weights
param_grid = {
    'n_estimators': [50, 100, 200, 500],
    'max_depth': [3, 5, 10],
    'learning_rate': [1e-3, 1e-2, 0.1, 0.5]
}

class_weights = len(y_train) / (2 * np.bincount(y_train))

# Define base estimator (XGB Classifier)
xgb = xgb.XGBClassifier(
    objective='binary:logistic',
    scale_pos_weight=class_weights[1] / class_weights[0],
    random_state=42
)

# Define GridSearchCV
grid_search = GridSearchCV(estimator=xgb, param_grid=param_grid, cv=5, scoring='roc_auc')

# Fit the GridSearch on the training data and select the best estimator
grid_search.fit(X_train, y_train)
xgb_classifier = grid_search.best_estimator_

# Make predictions on both the training and test datasets
train_predictions = xgb_classifier.predict(X_train)
test_predictions = xgb_classifier.predict(X_test)

# Calculate evaluation metrics for the training dataset
train_accuracy = accuracy_score(y_train, train_predictions)
train_balanced_accuracy = balanced_accuracy_score(y_train, train_predictions)
train_roc_auc = roc_auc_score(y_train, train_predictions)
train_f1 = f1_score(y_train, train_predictions)

# Calculate evaluation metrics for the test dataset
test_accuracy = accuracy_score(y_test, test_predictions)
test_balanced_accuracy = balanced_accuracy_score(y_test, test_predictions)
test_roc_auc = roc_auc_score(y_test, test_predictions)
test_f1 = f1_score(y_test, test_predictions)

# Print the evaluation metrics
print("Training Metrics:")
print(f"Accuracy: {train_accuracy:.2f}")
print(f"Balanced Accuracy: {train_balanced_accuracy:.2f}")
print(f"ROC AUC: {train_roc_auc:.2f}")
print(f"F1 Score: {train_f1:.2f}")
print("\nTest Metrics:")
print(f"Accuracy: {test_accuracy:.2f}")
print(f"Balanced Accuracy: {test_balanced_accuracy:.2f}")
print(f"ROC AUC: {test_roc_auc:.2f}")
print(f"F1 Score: {test_f1:.2f}")

Training Metrics:
Accuracy: 0.68
Balanced Accuracy: 0.71
ROC AUC: 0.71
F1 Score: 0.49

Test Metrics:
Accuracy: 0.67
Balanced Accuracy: 0.68
ROC AUC: 0.68
F1 Score: 0.46


In [7]:
import xgboost as xgb
import pickle

# Save the model to a .pkl file
pickle.dump(xgb_classifier, open("model.pkl", "wb"))
