In [1]:
import pandas as pd
import numpy as np
from data.preprocessing import retrieve_filepaths
from sklearn.preprocessing import StandardScaler
from sklearn.svm import SVC
from sklearn.linear_model import LogisticRegression

In [2]:
categoricals = ['CRASH_DATE_EST_I', 'TRAFFIC_CONTROL_DEVICE', 'DEVICE_CONDITION',
                'WEATHER_CONDITION', 'LIGHTING_CONDITION', 'TRAFFICWAY_TYPE',
                'FIRST_CRASH_TYPE', 'TRAFFICWAY_TYPE', 'ROADWAY_SURFACE_COND',
                'ROAD_DEFECT', 'REPORT_TYPE', 'CRASH_TYPE', 'INTERSECTION_RELATED_I',
                'NOT_RIGHT_OF_WAY_I', 'HIT_AND_RUN_I', 'DAMAGE', 'PRIM_CONTRIBUTORY_CAUSE',
                'SEC_CONTRIBUTORY_CAUSE', 'STREET_DIRECTION', 'STREET_NAME', 'PHOTOS_TAKEN_I',
                'STATEMENTS_TAKEN_I', 'DOORING_I', 'WORK_ZONE_I', 'WORK_ZONE_TYPE', 'WORKERS_PRESENT_I',
                'MOST_SEVERE_INJURY', 'BEAT_OF_OCCURRENCE']
dtypes = dict.fromkeys(categoricals, 'category')

In [3]:
filepaths = retrieve_filepaths('data', ['csv'])
crashes = pd.read_csv(filepaths['csv']['TrafficCrashesChicago.csv'], parse_dates = ['CRASH_DATE',
                                                                                    'DATE_POLICE_NOTIFIED'],dtype=dtypes)
weather = pd.read_csv(filepaths['csv']['ChicagoWeather.csv'], usecols=['dt_iso',
                                                                       'weather_main',
                                                                       'weather_description'],
                     dtype={'weather_main': 'category', 'weather_description': 'category'})


In [4]:
# Merge and Filter Dataset to relevant and processible time horizon
crashes = crashes[crashes.CRASH_DATE > '2018-01-01 00:00:00']
crashes['hourly'] = crashes['CRASH_DATE'].dt.round('H')
weather['dt_iso'] = pd.to_datetime(weather['dt_iso'], format="%Y-%m-%d %H:%M:%S +0000 UTC")
crashes_merged = pd.merge(crashes, weather, how='left', left_on='hourly', right_on = 'dt_iso')

In [5]:
# Replace unknown values with explicit missing values
crashes_merged = crashes_merged.replace('UNKNOWN', np.nan)

In [6]:
# Create Target Variable
def are_there_injuries(total_injuries):
    if (total_injuries != np.nan):
        return True if total_injuries > 0 else False
    else:
        return np.nan

crashes_merged['injuries'] = crashes_merged['INJURIES_TOTAL'].apply(lambda x: are_there_injuries(x))

In [7]:
inactive_variables = ['RD_NO', 'CRASH_DATE_EST_I', 'LANE_CNT', 'NOT_RIGHT_OF_WAY_I', 'HIT_AND_RUN_I', 'PHOTOS_TAKEN_I',
                      'STATEMENTS_TAKEN_I', 'DOORING_I', 'WORK_ZONE_I', 'WORK_ZONE_TYPE', 'WORKERS_PRESENT_I', 'hourly',
                      'dt_iso', 'DATE_POLICE_NOTIFIED', 'LONGITUDE', 'LATITUDE', 'LOCATION']

In [26]:
# Some variables are unusable because 95% + are missing, so we will drop them.
final_set = crashes_merged.drop(inactive_variables, axis=1)
final_set = final_set.dropna()
final_set = final_set.set_index('CRASH_DATE')

In [27]:
final_set.shape

(48372, 35)

In [28]:
from sklearn.preprocessing import OneHotEncoder
from sklearn.compose import ColumnTransformer
from sklearn.pipeline import Pipeline

injuries = final_set[['injuries']]
nums = final_set.select_dtypes(['float64', 'int64'])
onehotencoded = pd.get_dummies(final_set.select_dtypes('category'))
final_set = pd.concat([injuries, nums, onehotencoded], axis=1)
y = final_set['injuries']
X = final_set.drop(['injuries'], axis=1)
X.shape

(48372, 2051)

We now begin with a baseline logistic ridge regression. The binomial distribution with probability 'p' and number of trials 'n' is the maximum entropy distribution for discrete, independent outcomes with known probability; therefore, it is the best starting point for our investigation. Ridge classification is used so that feature selection can be performed in a more continuous fashion. Sklearn does penalized logistic regression by default, so we do not need to pass anything other than an argument for the Inverse Regularization Strength.

The values for the inverse regularization parameter will be sampled using a log-uniform distribution given that the performance of our algorithm will likely not scale linearly with the tuning of this hyperparameter. To see why, note that smaller values for that hyperparameter mean greater regularization, yet the domain of this value is only infinite in the positive direction. By not using logs, we would needlessly reduce the skepticism of our models by giving additional leverage to the optimistic values of the parameter. If skepticism is unnecessary then, with sufficient sampling and fitting, this will bear out in the validation curves.

Furthermore, we will use Leave Future Out Cross Validation given the size and time structure of the data. There are two issues with this. First, there is no guarantee that 

In [41]:
import matplotlib.pyplot as plt
from multiprocessing import cpu_count
from src.tuning import clfHyperFit, logUniform

# Randomized Grid Search with Purged K-Fold Cross Validation
cores = cpu_count() - 1
idx = X.index.to_series()

grid = logUniform(0.01,5).rvs(size=1000)

clfHyperFit(X, y, t1=idx, pipe_clf=LogisticRegression(), param_grid={'C':grid}, n_splits=3, rndSearchIter=1000, n_jobs = cores, pctEmbargo=0.1)   

IndexError: positional indexers are out-of-bounds