In [1]:
import numpy as np
import pandas as pd

import warnings

warnings.filterwarnings("ignore")

# Exploratory data analysis

In [2]:
train = pd.read_csv("data/train.csv", index_col = 0)
test = pd.read_csv("data/test.csv", index_col = 0)

print(f"Train shape: {train.shape}")
print(f"Test shape: {test.shape}")

Train shape: (250306, 33)
Test shape: (61001, 26)


In [3]:
train.head()

Unnamed: 0_level_0,agency_name,inspector_name,violator_name,violation_street_number,violation_street_name,violation_zip_code,mailing_address_str_number,mailing_address_str_name,city,state,...,clean_up_cost,judgment_amount,payment_amount,balance_due,payment_date,payment_status,collection_status,grafitti_status,compliance_detail,compliance
ticket_id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
22056,"Buildings, Safety Engineering & Env Department","Sims, Martinzie","INVESTMENT INC., MIDWEST MORTGAGE",2900.0,TYLER,,3.0,S. WICKER,CHICAGO,IL,...,0.0,305.0,0.0,305.0,,NO PAYMENT APPLIED,,,non-compliant by no payment,0.0
27586,"Buildings, Safety Engineering & Env Department","Williams, Darrin","Michigan, Covenant House",4311.0,CENTRAL,,2959.0,Martin Luther King,Detroit,MI,...,0.0,855.0,780.0,75.0,2005-06-02 00:00:00,PAID IN FULL,,,compliant by late payment within 1 month,1.0
22062,"Buildings, Safety Engineering & Env Department","Sims, Martinzie","SANDERS, DERRON",1449.0,LONGFELLOW,,23658.0,P.O. BOX,DETROIT,MI,...,0.0,0.0,0.0,0.0,,NO PAYMENT APPLIED,,,not responsible by disposition,
22084,"Buildings, Safety Engineering & Env Department","Sims, Martinzie","MOROSI, MIKE",1441.0,LONGFELLOW,,5.0,ST. CLAIR,DETROIT,MI,...,0.0,0.0,0.0,0.0,,NO PAYMENT APPLIED,,,not responsible by disposition,
22093,"Buildings, Safety Engineering & Env Department","Sims, Martinzie","NATHANIEL, NEAL",2449.0,CHURCHILL,,7449.0,CHURCHILL,DETROIT,MI,...,0.0,0.0,0.0,0.0,,NO PAYMENT APPLIED,,,not responsible by disposition,


In [4]:
# To save the variables that are not in the test set
test_missing_features = []

for i in train.columns:
    if i not in test.columns:
        test_missing_features.append(i)

# To remove from the list the target variable "compliance" as I'll need it later
test_missing_features.pop()

test_missing_features

['payment_amount',
 'balance_due',
 'payment_date',
 'payment_status',
 'collection_status',
 'compliance_detail']

In [5]:
train.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 250306 entries, 22056 to 325561
Data columns (total 33 columns):
 #   Column                      Non-Null Count   Dtype  
---  ------                      --------------   -----  
 0   agency_name                 250306 non-null  object 
 1   inspector_name              250306 non-null  object 
 2   violator_name               250272 non-null  object 
 3   violation_street_number     250306 non-null  float64
 4   violation_street_name       250306 non-null  object 
 5   violation_zip_code          0 non-null       float64
 6   mailing_address_str_number  246704 non-null  float64
 7   mailing_address_str_name    250302 non-null  object 
 8   city                        250306 non-null  object 
 9   state                       250213 non-null  object 
 10  zip_code                    250305 non-null  object 
 11  non_us_str_code             3 non-null       object 
 12  country                     250306 non-null  object 
 13  ticket_iss

In [6]:
# Drop variables that I don't have in the test set to avoid data leakage
train = train.drop(test_missing_features, axis = 1)
# Drop variables with too many missing values
train = train.drop(["violation_zip_code", "non_us_str_code", "grafitti_status"], axis = 1)
train.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 250306 entries, 22056 to 325561
Data columns (total 24 columns):
 #   Column                      Non-Null Count   Dtype  
---  ------                      --------------   -----  
 0   agency_name                 250306 non-null  object 
 1   inspector_name              250306 non-null  object 
 2   violator_name               250272 non-null  object 
 3   violation_street_number     250306 non-null  float64
 4   violation_street_name       250306 non-null  object 
 5   mailing_address_str_number  246704 non-null  float64
 6   mailing_address_str_name    250302 non-null  object 
 7   city                        250306 non-null  object 
 8   state                       250213 non-null  object 
 9   zip_code                    250305 non-null  object 
 10  country                     250306 non-null  object 
 11  ticket_issued_date          250306 non-null  object 
 12  hearing_date                237815 non-null  object 
 13  violation_

In [7]:
# Get not null values to filter by target variable
# If they have null values in the target variable, that means the citizen wasn't
# responsible
not_null_indexes = train.loc[:, "compliance"].dropna().index
train = train.loc[not_null_indexes, :]
train.head()

Unnamed: 0_level_0,agency_name,inspector_name,violator_name,violation_street_number,violation_street_name,mailing_address_str_number,mailing_address_str_name,city,state,zip_code,...,violation_description,disposition,fine_amount,admin_fee,state_fee,late_fee,discount_amount,clean_up_cost,judgment_amount,compliance
ticket_id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
22056,"Buildings, Safety Engineering & Env Department","Sims, Martinzie","INVESTMENT INC., MIDWEST MORTGAGE",2900.0,TYLER,3.0,S. WICKER,CHICAGO,IL,60606,...,Failure of owner to obtain certificate of comp...,Responsible by Default,250.0,20.0,10.0,25.0,0.0,0.0,305.0,0.0
27586,"Buildings, Safety Engineering & Env Department","Williams, Darrin","Michigan, Covenant House",4311.0,CENTRAL,2959.0,Martin Luther King,Detroit,MI,48208,...,Failed To Secure Permit For Lawful Use Of Buil...,Responsible by Determination,750.0,20.0,10.0,75.0,0.0,0.0,855.0,1.0
22046,"Buildings, Safety Engineering & Env Department","Sims, Martinzie","KASIMU, UKWELI",6478.0,NORTHFIELD,2755.0,E. 17TH,LOG BEACH,CA,908041512,...,Failure of owner to obtain certificate of comp...,Responsible by Default,250.0,20.0,10.0,25.0,0.0,0.0,305.0,0.0
18738,"Buildings, Safety Engineering & Env Department","Williams, Darrin","Deerwood Development Group Inc, Deer",8027.0,BRENTWOOD,476.0,Garfield,Clinton,MI,48038,...,Failed To Secure Permit For Lawful Use Of Land,Responsible by Default,750.0,20.0,10.0,75.0,0.0,0.0,855.0,0.0
18735,"Buildings, Safety Engineering & Env Department","Williams, Darrin","Rafee Auto Services L.L.C., RAF",8228.0,MT ELLIOTT,8228.0,Mt. Elliott,Detroit,MI,48211,...,Noncompliance/Grant Condition/BZA/BSE,Responsible by Default,100.0,20.0,10.0,10.0,0.0,0.0,140.0,0.0


In [8]:
# Now let's get the categorical and numerical column names
categorical_variables = []
numerical_variables = []

for column in train.columns:
    if train[column].dtype == "object":
        categorical_variables.append(column)
    else:
        numerical_variables.append(column)

## Categorical variables

In [9]:
for column in train.loc[:, categorical_variables].columns:
    print(train[column].value_counts())
    print("-" * 50)

Buildings, Safety Engineering & Env Department    95863
Department of Public Works                        52445
Health Department                                  7107
Detroit Police Department                          4464
Neighborhood City Halls                               1
Name: agency_name, dtype: int64
--------------------------------------------------
Morris, John        11604
Samaan, Neil J       8720
O'Neal, Claude       8075
Steele, Jonathan     6962
Devaney, John        6837
                    ...  
Clark, Marcel           1
Thomas, Duane           1
Coleman, Lanetha        1
Paylor, Ava             1
Malone, Melanie         1
Name: inspector_name, Length: 159, dtype: int64
--------------------------------------------------
INVESTMENT, ACORN                624
INVESTMENT CO., ACORN            343
BANK, WELLS FARGO                253
MILLER, JOHN                     177
STEHLIK, JERRY                   158
                                ... 
MCSHAN W JR, -                

In [10]:
# I'll drop some variables that add no relevant information to the model
to_drop = ["violator_name", "violation_street_name", "mailing_address_str_name", "city", "zip_code", "violation_description"]
train = train.drop(to_drop, axis = 1)

In [11]:
# Transforming datetime variables to the corresponding format
train.ticket_issued_date = pd.to_datetime(train.ticket_issued_date)
train.hearing_date = pd.to_datetime(train.hearing_date)

In [12]:
train.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 159880 entries, 22056 to 284333
Data columns (total 18 columns):
 #   Column                      Non-Null Count   Dtype         
---  ------                      --------------   -----         
 0   agency_name                 159880 non-null  object        
 1   inspector_name              159880 non-null  object        
 2   violation_street_number     159880 non-null  float64       
 3   mailing_address_str_number  157322 non-null  float64       
 4   state                       159796 non-null  object        
 5   country                     159880 non-null  object        
 6   ticket_issued_date          159880 non-null  datetime64[ns]
 7   hearing_date                159653 non-null  datetime64[ns]
 8   violation_code              159880 non-null  object        
 9   disposition                 159880 non-null  object        
 10  fine_amount                 159880 non-null  float64       
 11  admin_fee                   159880 

In [13]:
# Now let's get the categorical and numerical column names
categorical_variables = []
numerical_variables = []

for column in train.columns:
    if train[column].dtype == "object":
        categorical_variables.append(column)
    else:
        numerical_variables.append(column)

In [14]:
train = pd.get_dummies(train, prefix = categorical_variables, columns = categorical_variables)
train.shape

(159880, 433)

## Numerical variables

In [15]:
numerical_variables

['violation_street_number',
 'mailing_address_str_number',
 'ticket_issued_date',
 'hearing_date',
 'fine_amount',
 'admin_fee',
 'state_fee',
 'late_fee',
 'discount_amount',
 'clean_up_cost',
 'judgment_amount',
 'compliance']

In [16]:
train.loc[:, numerical_variables].describe()

Unnamed: 0,violation_street_number,mailing_address_str_number,fine_amount,admin_fee,state_fee,late_fee,discount_amount,clean_up_cost,judgment_amount,compliance
count,159880.0,157322.0,159880.0,159880.0,159880.0,159880.0,159880.0,159880.0,159880.0,159880.0
mean,10713.16,9133.714,357.035295,20.0,10.0,33.651512,0.195959,0.0,420.650218,0.072536
std,36231.59,36577.29,675.65558,0.0,0.0,67.692916,4.290344,0.0,742.555062,0.259374
min,0.0,1.0,0.0,20.0,10.0,0.0,0.0,0.0,0.0,0.0
25%,4920.0,532.0,200.0,20.0,10.0,10.0,0.0,0.0,250.0,0.0
50%,10398.0,2418.0,250.0,20.0,10.0,25.0,0.0,0.0,305.0,0.0
75%,15783.25,12844.0,250.0,20.0,10.0,25.0,0.0,0.0,305.0,0.0
max,14154110.0,5111345.0,10000.0,20.0,10.0,1000.0,350.0,0.0,11030.0,1.0


In [17]:
# I'll drop some variables that add no relevant information to the model
to_drop = ["violation_street_number", "mailing_address_str_number"]
train = train.drop(to_drop, axis = 1)

## Datetime variables

In [60]:
# Datetime variables cannot be fed into the model just like that
# Let's do some preprocessing
# ticket_issued_date
# hearing_date

train["time_diff"] = train.hearing_date - train.ticket_issued_date
train.loc[:, "time_diff"] = train["time_diff"].map(lambda x: pd.Timedelta(x) / np.timedelta64(1, "M"))
train["hearing_month"] = train.hearing_date.dt.month
train["ticket_issued_month"] = train.ticket_issued_date.dt.month

train = train.drop(["hearing_date", "ticket_issued_date"], axis = 1)

In [66]:
train = train.dropna()

# Model selection

In [61]:
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import MinMaxScaler
from sklearn.linear_model import LogisticRegression

In [68]:
seed = 42
scaler = MinMaxScaler()

X = train.drop("compliance", axis = 1)
X = scaler.fit_transform(X)
y = train.compliance

X_train, X_test, y_train, y_test = train_test_split(X, y, random_state = seed)

model = LogisticRegression(n_jobs = -1, random_state = seed)

model.fit(X_train, y_train)

predictions = model.predict(X_test)

train_score = model.score(X_train, y_train)
test_score = model.score(X_test, y_test)

print(f"Train score: {train_score}")
print(f"Test score: {test_score}")

Train score: 0.9324948429500831
Test score: 0.9338327403918425


In [73]:
(unique, counts) = np.unique(predictions, return_counts=True)
frequencies = np.asarray((unique, counts)).T

print(frequencies)

[[0.0000e+00 3.9649e+04]
 [1.0000e+00 2.6500e+02]]
