#### Notes
F1 scoring was chosen for hyperparameter tuning because it balances precision and recall, penalizing false positives as well as false negatives.

In [1]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import datetime as dt
import scipy.stats as stats
import scipy.sparse
from skopt import BayesSearchCV
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import MinMaxScaler
from sklearn.kernel_approximation import RBFSampler
from sklearn.linear_model import LogisticRegression
from sklearn.ensemble import RandomForestClassifier
from sklearn.cluster import KMeans
from sklearn.metrics import f1_score, confusion_matrix, make_scorer
import pickle

### Random Forest hyperparameters

In [2]:
# Search space handed to the Bayesian search: every key is a
# RandomForestClassifier constructor argument and every value is the list of
# candidates to choose from. Single-element lists pin that argument.
rf_params = dict(
    criterion=['gini', 'entropy'],
    max_depth=list(range(1, 51)),
    min_samples_leaf=list(range(1, 11)),
    n_estimators=list(range(1, 101)),
    class_weight=['balanced'],
    max_features=[None, *range(1, 11)],
    # Evenly spaced thresholds starting at 0.0 in steps of 0.05, stop value excluded.
    min_impurity_decrease=list(np.arange(0.0, 0.35, 0.05)),
    random_state=[42],
    n_jobs=[-1],
    verbose=[2],
)

### Import data

In [3]:
# Load the cleaned crash records and derive the binary modelling target.
print('Importing data...')
data_path = r'../data/clean_df.csv.gz'
df = pd.read_csv(data_path)
print('Done.')

print('Engineering features...')
df['CRASH DATE'] = pd.to_datetime(df['CRASH DATE'])
# NOTE(review): if this column holds time-of-day strings only, to_datetime
# will attach a date component to every value - confirm that is acceptable.
df['CRASH TIME'] = pd.to_datetime(df['CRASH TIME'])

# Target: 1 when the pedestrian casualty count is anything other than 0, else 0.
df['CASUALTIES?'] = (df['TOTAL PEDESTRIAN CASUALTIES'] != 0).astype('int64')
print('Done.')

# Spot-check the flag on rows whose count is not exactly 1 (i.e. the 0 and the
# multi-casualty cases). Kept as the last expression of the cell so the
# notebook actually renders it, and seeded so re-runs show the same rows.
df.loc[df['TOTAL PEDESTRIAN CASUALTIES'] != 1, ['TOTAL PEDESTRIAN CASUALTIES','CASUALTIES?']].sample(5, random_state=42)

Importing data...
Done.
Engineering features...
Done.


### K-Means analysis

In [None]:
# For every borough, cluster crash coordinates with K-Means for each k in
# k_range, one-hot encode the cluster labels, and score a class-balanced
# logistic regression on a 70/30 split. The k with the highest F1 per borough
# is recorded in max_k as {'K': ..., 'Score': ...}.
boroughs = ['MANHATTAN','BROOKLYN','STATEN ISLAND','QUEENS','BRONX']
subplots = [231,232,233,234,235]
k_range = range(2,31)

# 2x3 grid: panels 231-235 hold one F1-vs-k curve per borough, panel 236 is a
# shared scatter of all crash locations.
plt.figure(figsize=(15,10))
max_k = {}
for space, current_borough in zip(subplots, boroughs):
    print(f'{current_borough.title()} K-Means analysis')
    borough = df[df['BOROUGH'] == current_borough]
    # These do not depend on k, so build them once per borough.
    coords = borough[['LATITUDE','LONGITUDE']].values
    y = borough['CASUALTIES?']
    f1_list = []
    for i in k_range:
        kmeans = KMeans(n_clusters=i, random_state=42)
        kmeans.fit(coords)
        # One indicator column per cluster label, stored sparsely for the model.
        df_clusters = pd.Series(kmeans.labels_)
        cluster_dummies = pd.get_dummies(df_clusters)
        X = scipy.sparse.csr_matrix(cluster_dummies)
        X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.3, random_state=42)
        log_reg = LogisticRegression(class_weight='balanced', max_iter=10_000)
        log_reg.fit(X_train, y_train)
        y_pred = log_reg.predict(X_test)
        log_f1 = f1_score(y_test, y_pred)
        print(f'# Clusters: {i}\n    F1 score: {log_f1}')
        f1_list.append(log_f1)
    plt.subplot(space)
    plt.plot(k_range, f1_list, 'k-')
    plt.grid()
    plt.xlabel(f'{current_borough.title()} Clusters', fontsize=12)
    plt.ylabel('f1 Score', fontsize=12)
    plt.xticks(k_range, rotation=60, ha='right', fontsize=6)
    # Map the position of the best score back through k_range itself, so the
    # recorded K stays correct if the range's start or step is ever changed.
    best_score = max(f1_list)
    max_k[current_borough] = {
        'K': k_range[f1_list.index(best_score)],
        'Score': best_score
    }
    # Shared map panel. The centres drawn here come from the last model fitted
    # in the inner loop (the largest k in k_range), not from the best-scoring k.
    plt.subplot(236)
    plt.scatter(coords[:,0], coords[:,1], alpha=0.4)
    plt.scatter(kmeans.cluster_centers_[:,0], kmeans.cluster_centers_[:,1])
    plt.xlabel('Latitude', fontsize=12)
    plt.ylabel('Longitude', fontsize=12)
plt.savefig('K-Means borough analysis.png')

Manhattan K-Means analysis
# Clusters: 2
    F1 score: 0.1541123002492303
# Clusters: 3
    F1 score: 0.1621875839828003
# Clusters: 4
    F1 score: 0.16244468182511826
# Clusters: 5
    F1 score: 0.15882567469000727
# Clusters: 6
    F1 score: 0.1605709471810278
# Clusters: 7
    F1 score: 0.16494845360824742
# Clusters: 8
    F1 score: 0.1634797405083484


In [None]:
# Show the best cluster count and its F1 score for each borough.
for borough_name, best in max_k.items():
    print(f'{borough_name}\n    {best}')

### Fit clusters

In [None]:
# Refit K-Means per borough with the best k found above and store each crash's
# cluster label in a '<BOROUGH> CLUSTERS' column (left as NaN for rows that
# belong to other boroughs).
print('Fitting K-means clusters...')
# Look k up by borough name instead of relying on max_k's iteration order
# matching `boroughs`; a missing borough now fails loudly with a KeyError.
k_clusters = [max_k[name]['K'] for name in boroughs]
for n, borough in zip(k_clusters, boroughs):
    print(f'    Calculating {borough.title()} clusters...')

    in_borough = df['BOROUGH'] == borough
    borough_accidents = df[in_borough]
    kmeans = KMeans(n_clusters=n, random_state=42)
    kmeans.fit(borough_accidents[['LATITUDE','LONGITUDE']].values)

    # labels_ is in the same row order as borough_accidents, i.e. the masked rows.
    df.loc[in_borough, f'{borough} CLUSTERS'] = kmeans.labels_
print('Done.')

### Create feature set

In [None]:
# Build the model's feature frame: one-hot borough indicators plus one-hot
# per-borough cluster indicators, all stored as sparse columns.
print('Creating feature set...')
borough_dummies = pd.get_dummies(df['BOROUGH'], sparse=True)
borough_clusters = [borough+' CLUSTERS' for borough in boroughs]
# Rows outside a borough have NaN in that borough's cluster column; fillna('')
# turns that into its own category. No explicit prefix is passed so every dummy
# is named after its source column ('<BOROUGH> CLUSTERS_<label>'); a single
# shared prefix would give the five boroughs identical, ambiguous column names.
cluster_dummies = pd.get_dummies(df[borough_clusters].fillna(''), sparse=True)
pre_X = cluster_dummies.join(borough_dummies)
print('Done.')

### Split data

In [None]:
# Turn the feature frame into a CSR matrix and hold out 30% of rows for testing.
print('Splitting data...')
y = df['CASUALTIES?']
# NOTE(review): csr_matrix() on a DataFrame presumably materialises a dense
# array first - confirm memory use is acceptable for the full dataset.
X = scipy.sparse.csr_matrix(pre_X)

X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    random_state=42,
    test_size=0.3,
)
print('Done.')

### Random Forest Bayesian search

In [None]:
# Bayesian hyperparameter search for the random forest, scored on F1.
# NOTE(review): the search runs with n_jobs=-1 and rf_params also pins each
# forest's n_jobs to -1, so both levels request every core - confirm this
# nesting is intended.
cv = BayesSearchCV(
    estimator=RandomForestClassifier(),
    search_spaces=rf_params,
    scoring=make_scorer(f1_score),
    n_jobs=-1,
    return_train_score=True,
    # Seed the optimiser so the sequence of sampled candidates is repeatable,
    # matching the fixed seed used everywhere else in this notebook.
    random_state=42,
)
cv.fit(X_train, y_train)

print(f'{cv.best_params_}\n{cv.best_score_}')

# Per-candidate results, best test score first. Kept as the cell's last
# expression so the notebook renders the table.
cv_results = pd.DataFrame(cv.cv_results_)
cv_results[['param_max_depth','param_n_estimators','mean_train_score','mean_test_score','mean_fit_time']].sort_values(by='mean_test_score', ascending=False)

### Random Forest best params

In [None]:
# Train a fresh forest with the winning hyperparameters and report its F1 on
# the held-out test rows.
best_params = cv.best_params_
rf_clf = RandomForestClassifier(**best_params).fit(X_train, y_train)

y_pred = rf_clf.predict(X_test)
test_f1 = f1_score(y_test, y_pred)
print(best_params)
print(test_f1)

### Export fitted tools

In [None]:
# Save the best random-forest hyperparameters, then read the file back and
# compare it with the in-memory value as a round-trip check.
params_path = r'../Predictor tools/rf_params.pickle'

with open(params_path, 'wb') as sink:
    sink.write(pickle.dumps(cv.best_params_))

with open(params_path, 'rb') as source:
    test = pickle.loads(source.read())

# Displayed by the notebook: True when the round trip preserved the parameters.
test == cv.best_params_

In [None]:
# Save the per-borough best-k dictionary, then read the file back and compare
# it with the in-memory value as a round-trip check.
params_path = r'../Predictor tools/k_clusters.pickle'

with open(params_path, 'wb') as sink:
    sink.write(pickle.dumps(max_k))

with open(params_path, 'rb') as source:
    test = pickle.loads(source.read())

# Displayed by the notebook: True when the round trip preserved max_k.
test == max_k