In [16]:
import numpy as np
import pandas as pd

import matplotlib.pyplot as plt
import seaborn as sns

from sklearn.preprocessing import LabelEncoder

import warnings

warnings.filterwarnings("ignore")

In [17]:
# Load dataframes
train = pd.read_csv("data/train.csv", index_col = 0)
test = pd.read_csv("data/test.csv", index_col = 0)

# Get the ids to differentiate them after joining
train_ids = list(train.index)
test_ids = list(test.index)

# Join dataframes vertically
full = pd.concat([train, test], axis = 0)

# Variables to keep
to_keep = ["ticket_issued_date", "hearing_date", "violation_code", "judgment_amount", "compliance"]
full = full.loc[:, to_keep]

# Datetime variables processing
full.ticket_issued_date = pd.to_datetime(full.ticket_issued_date)
full.hearing_date = pd.to_datetime(full.hearing_date)

full["hearing_month"] = full.hearing_date.dt.month
full["ticket_issued_month"] = full.ticket_issued_date.dt.month

full.hearing_month = full.hearing_month.fillna(0)

full = full.drop(["hearing_date", "ticket_issued_date"], axis = 1)

full.violation_code = LabelEncoder().fit_transform(full.violation_code)

In [18]:
full.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 311307 entries, 22056 to 369851
Data columns (total 5 columns):
 #   Column               Non-Null Count   Dtype  
---  ------               --------------   -----  
 0   violation_code       311307 non-null  int64  
 1   judgment_amount      311307 non-null  float64
 2   compliance           159880 non-null  float64
 3   hearing_month        311307 non-null  float64
 4   ticket_issued_month  311307 non-null  int64  
dtypes: float64(3), int64(2)
memory usage: 14.3 MB


In [21]:
# First, I split back the full dataframe into train and test

# I add dropna to remove those rows where compliance is null, meaning that the subject
# wasn't responsible after all
train2 = full.loc[train_ids, :].dropna()

# From test I remove compliance as I don't have the info for those rows
# In fact, it is what I'm trying to predict
test2 = full.loc[test_ids, :].drop("compliance", axis = 1)

In [19]:
# Libraries
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import MinMaxScaler
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import confusion_matrix

from sklearn.ensemble import RandomForestClassifier

In [22]:
seed = 42
scaler = MinMaxScaler()

X = train2.drop("compliance", axis = 1)
X = scaler.fit_transform(X)
y = train2.compliance
print("Data succesfuly scaled")

X_train, X_test, y_train, y_test = train_test_split(X, y, random_state = seed)
print("Train-test split was succesful")

Data succesfuly scaled
Train-test split was succesful


In [23]:
#model = LogisticRegression(n_jobs = -1, random_state = seed)
model = RandomForestClassifier(n_jobs = -1, random_state = seed)

model.fit(X_train, y_train)
print("Model training completed")

cm_predictions = model.predict(X_test)
cm = confusion_matrix(y_test, cm_predictions)

predictions = model.predict_proba(test2)
predictions = [prediction[1] for prediction in predictions]
print("Model predictions completed")
print("-" * 50)

train_score = model.score(X_train, y_train)
test_score = model.score(X_test, y_test)

print(f"Train score: {train_score}")
print(f"Test score: {test_score}")
print(cm)

Model training completed
Model predictions completed
--------------------------------------------------
Train score: 0.9392711200066717
Test score: 0.9327495621716287
[[36791   249]
 [ 2439   491]]


In [24]:
predictions

[0.264,
 0.264,
 0.264,
 0.264,
 0.264,
 0.264,
 0.264,
 0.264,
 0.264,
 0.264,
 0.264,
 0.264,
 0.264,
 0.264,
 0.264,
 0.264,
 0.264,
 0.264,
 0.264,
 0.264,
 0.264,
 0.264,
 0.264,
 0.264,
 0.264,
 0.264,
 0.264,
 0.264,
 0.264,
 0.264,
 0.264,
 0.264,
 0.264,
 0.264,
 0.264,
 0.264,
 0.264,
 0.264,
 0.264,
 0.264,
 0.264,
 0.264,
 0.264,
 0.264,
 0.264,
 0.264,
 0.264,
 0.264,
 0.264,
 0.264,
 0.264,
 0.264,
 0.264,
 0.264,
 0.264,
 0.264,
 0.264,
 0.264,
 0.264,
 0.264,
 0.264,
 0.264,
 0.264,
 0.264,
 0.264,
 0.264,
 0.264,
 0.264,
 0.264,
 0.264,
 0.264,
 0.264,
 0.264,
 0.264,
 0.264,
 0.264,
 0.264,
 0.264,
 0.264,
 0.264,
 0.264,
 0.264,
 0.264,
 0.264,
 0.264,
 0.264,
 0.264,
 0.264,
 0.264,
 0.264,
 0.264,
 0.264,
 0.264,
 0.264,
 0.42458823529411766,
 0.264,
 0.264,
 0.264,
 0.264,
 0.264,
 0.264,
 0.264,
 0.264,
 0.264,
 0.264,
 0.264,
 0.264,
 0.264,
 0.264,
 0.264,
 0.264,
 0.264,
 0.264,
 0.264,
 0.264,
 0.264,
 0.264,
 0.264,
 0.264,
 0.264,
 0.264,
 0.264,
 0.264,
 0