In [1]:
import numpy as np
import pandas as pd

import matplotlib.pyplot as plt
import seaborn as sns

from sklearn.preprocessing import LabelEncoder

import warnings

warnings.filterwarnings("ignore")

In [2]:
# Load dataframes
train = pd.read_csv("data/train.csv", index_col = 0)
test = pd.read_csv("data/test.csv", index_col = 0)

# Get the ids to differentiate them after joining
train_ids = list(train.index)
test_ids = list(test.index)

# Join dataframes vertically
full = pd.concat([train, test], axis = 0)

# Variables to keep
# "ticket_issued_date", "hearing_date", 
to_keep = ["agency_name", "violation_code", "late_fee", "discount_amount", "judgment_amount", "compliance"]
full = full.loc[:, to_keep]

# Datetime variables processing
# full.ticket_issued_date = pd.to_datetime(full.ticket_issued_date)
# full.hearing_date = pd.to_datetime(full.hearing_date)

# full["hearing_month"] = full.hearing_date.dt.month
# full["ticket_issued_month"] = full.ticket_issued_date.dt.month

# full.hearing_month = full.hearing_month.fillna(0)

# full = full.drop(["hearing_date", "ticket_issued_date"], axis = 1)

full.violation_code = LabelEncoder().fit_transform(full.violation_code)
full = pd.get_dummies(full, prefix = ["agency_name"], columns = ["agency_name"])

In [3]:
# First, I split back the full dataframe into train and test

# I add dropna to remove those rows where compliance is null, meaning that the subject
# wasn't responsible after all
train2 = full.loc[train_ids, :].dropna()

# From test I remove compliance as I don't have the info for those rows
# In fact, it is what I'm trying to predict
test2 = full.loc[test_ids, :].drop("compliance", axis = 1)

In [4]:
# Libraries
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import MinMaxScaler
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import confusion_matrix, roc_auc_score

from sklearn.ensemble import RandomForestClassifier

In [5]:
seed = 42
scaler = MinMaxScaler()

X = train2.drop("compliance", axis = 1)
#X = scaler.fit_transform(X)
y = train2.compliance
print("Data succesfuly scaled")

X_train, X_test, y_train, y_test = train_test_split(X, y, random_state = seed)
print("Train-test split was succesful")

Data succesfuly scaled
Train-test split was succesful


In [6]:
# Model
#model = LogisticRegression(n_jobs = -1, random_state = seed)
model = RandomForestClassifier(n_jobs = -1, random_state = seed)

# Training
model.fit(X_train, y_train)
print("Model training completed")

# Predictions
predictions = model.predict(X_test)
cm = confusion_matrix(y_test, predictions)

# Probabilities
probabilities = model.predict_proba(test2)
probabilities = [prob[1] for prob in probabilities]
print("Model predictions completed")
print("-" * 50)

# Metrics
train_score = model.score(X_train, y_train)
test_score = model.score(X_test, y_test)
auc_score = roc_auc_score(y_test, model.predict_proba(X_test)[:, 1])

print("- TRAIN -")
print(f"Score: {train_score}")
print("\n- TEST -")
print(f"AUC Score:{auc_score}")
print(f"Score: {test_score}")
print(cm)

Model training completed
Model predictions completed
--------------------------------------------------
- TRAIN -
Score: 0.9361854724376616

- TEST -
AUC Score:0.7744914408553801
Score: 0.9347760820615462
[[37011    29]
 [ 2578   352]]


In [7]:
probabilities

[0.20115791980935105,
 0.014542348943832339,
 0.06457786042983728,
 0.04924161142045227,
 0.08727709789289487,
 0.04924161142045227,
 0.06597912540608468,
 0.3438608890537798,
 0.014542348943832339,
 0.010004584532978653,
 0.05525555940865515,
 0.014542348943832339,
 0.014542348943832339,
 0.05525555940865515,
 0.08727709789289487,
 0.04924161142045227,
 0.010004584532978653,
 0.010004584532978653,
 0.010004584532978653,
 0.013007159346267111,
 0.0,
 0.014542348943832339,
 0.010004584532978653,
 0.05525555940865515,
 0.04924161142045227,
 0.014542348943832339,
 0.010004584532978653,
 0.04924161142045227,
 0.06457786042983728,
 0.014542348943832339,
 0.04924161142045227,
 0.014542348943832339,
 0.05525555940865515,
 0.05525555940865515,
 0.014542348943832339,
 0.05525555940865515,
 0.014542348943832339,
 0.010004584532978653,
 0.33608087080067806,
 0.010004584532978653,
 0.04924161142045227,
 0.05525555940865515,
 0.04924161142045227,
 0.4662758495710584,
 0.31592196387823107,
 0.201157