# Prediction Modeling

## 0. Load Dependencies

In [1]:
# Import Relevant Packages
import numpy as np
import pandas as pd
import io
import sklearn.linear_model as lm
import sklearn.model_selection as ms
from sklearn.model_selection import train_test_split
from sklearn import preprocessing
import random
from scipy import stats
from scipy.special import boxcox1p
from sklearn.ensemble import GradientBoostingRegressor as gbr
from sklearn import metrics
from sklearn.svm import SVR


# Data Visualization
import matplotlib
import matplotlib.pyplot as plt
%matplotlib inline
import seaborn as sns

# Display multiple outputs from one single cell
from IPython.core.interactiveshell import InteractiveShell
InteractiveShell.ast_node_interactivity = "all"

## 1. Load Raw Data

In [2]:
# Loading positive and negative events
positive = pd.read_csv("Data/NY_Accidents_June20.csv")
negative = pd.read_csv("Data/NY_Negatives_June20.csv")

# Assign label 'Accident' to each event
positive['Accident'] = 1
negative['Accident'] = 0

In [45]:
positive.columns

Index(['Unnamed: 0', 'ID', 'Source', 'TMC', 'Severity', 'Start_Time',
       'End_Time', 'Start_Lat', 'Start_Lng', 'Distance(mi)', 'Description',
       'Number', 'Street', 'Side', 'City', 'County', 'State', 'Zipcode',
       'Country', 'Timezone', 'Airport_Code', 'Weather_Timestamp',
       'Temperature(F)', 'Wind_Chill(F)', 'Humidity(%)', 'Pressure(in)',
       'Visibility(mi)', 'Wind_Direction', 'Wind_Speed(mph)',
       'Precipitation(in)', 'Weather_Condition', 'Amenity', 'Bump', 'Crossing',
       'Give_Way', 'Junction', 'No_Exit', 'Railway', 'Roundabout', 'Station',
       'Stop', 'Traffic_Calming', 'Traffic_Signal', 'Turning_Loop',
       'Sunrise_Sunset', 'Civil_Twilight', 'Nautical_Twilight',
       'Astronomical_Twilight', 'acc_year', 'acc_month', 'acc_hr_day',
       'new_date', 'day_name', 'Accident'],
      dtype='object')

In [49]:
# Consider using the following line of code to remove all nan entries.
# dataset = dataset.dropna()

positive = positive[~positive['Temperature(F)'].isna()]
positive = positive[~positive['Weather_Condition'].isna()]

In [50]:
positive.shape
negative.columns

(8135, 54)

Index(['Unnamed: 0', 'ID', 'Source', 'TMC', 'Severity', 'Start_Time',
       'End_Time', 'Start_Lat', 'Start_Lng', 'Distance(mi)', 'Description',
       'Number', 'Street', 'Side', 'City', 'County', 'State', 'Zipcode',
       'Country', 'Timezone', 'Airport_Code', 'Weather_Timestamp',
       'Temperature(F)', 'Wind_Chill(F)', 'Humidity(%)', 'Pressure(in)',
       'Visibility(mi)', 'Wind_Direction', 'Wind_Speed(mph)',
       'Precipitation(in)', 'Weather_Condition', 'Amenity', 'Bump', 'Crossing',
       'Give_Way', 'Junction', 'No_Exit', 'Railway', 'Roundabout', 'Station',
       'Stop', 'Traffic_Calming', 'Traffic_Signal', 'Turning_Loop',
       'Sunrise_Sunset', 'Civil_Twilight', 'Nautical_Twilight',
       'Astronomical_Twilight', 'acc_year', 'acc_month', 'acc_hr_day',
       'new_date', 'day_name', 'Accident'],
      dtype='object')

## 2. Generate Working Dataset

In this segment, we will create a working dataset containing equal number of both positive and negative samples.  We will then do a train/test split on this new data set.

In [51]:
# Parameter initialization
random_seed = 42
test_size = 0.2  # 80% train, 20% test
features = ['Temperature(F)',
            'Weather_Condition', 
            'Sunrise_Sunset', 
            'Civil_Twilight', 
            'Nautical_Twilight',
            'Astronomical_Twilight',
            'acc_month',
            'acc_hr_day',
            'day_name']

In [52]:
# Create working dataset
# 1. sample equal number of positive cases from dataset 'negative' without replacement
# 2. merge with dataset 'positive'
# 3. pull X [features] and Y [label], which is the new parameter 'Accident' in Section 1
# 4. one-hot encode 'object' features
# 5. shuffle the new dataset
# 6. train/test split the dataset

# 1
# temp = negative.sample(n=positive.shape[0], random_state=random_seed, replace=True).copy(deep=True)
temp = negative.sample(n=positive.shape[0], random_state=random_seed, replace=False)

# 2
data = pd.concat([positive, temp], axis=0, ignore_index=True)

# 3
X, y = data[features], data.Accident
X.acc_month = X.acc_month.astype('object')
X.acc_hr_day = X.acc_hr_day.astype('object')

# 4
X = pd.get_dummies(X,drop_first=True)

# 5
shuffle = np.random.permutation(np.arange(X.shape[0]))
X, y = X.iloc[shuffle], y.iloc[shuffle]

# 6
# n = round(train_test_split_ratio * X.shape[0])
# train_data, train_labels = X.iloc[:n], Y.iloc[:n]
# test_data, test_labels = X.iloc[n:], Y.iloc[n:]

X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=test_size, random_state=random_seed)

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  self[name] = value


## 3. Modeling

### 3.1 Logistic Regression

In [54]:
from sklearn.linear_model import LogisticRegression

logit = LogisticRegression(random_state=random_seed).fit(X_train, y_train)
logit_pred = logit.predict(X_test)
logit_score = logit.score(X_test, y_test)
logit_score

0.49170251997541486

In [56]:
logit_pred

array([1, 1, 0, ..., 0, 0, 0])

### 3.2 Decision Tree

In [57]:
# from sklearn.model_selection import cross_val_score
from sklearn.tree import DecisionTreeClassifier

DT = DecisionTreeClassifier(random_state=random_seed).fit(X_train, y_train)
DT_pred = DT.predict(X_test)
DT_score = DT.score(X_test, y_test)
DT_score

0.3869084204056546

In [58]:
DT_pred

array([0, 1, 0, ..., 0, 0, 0])

### 3.3 Multi-Layer Perceptron Classifier

In [59]:
from sklearn.neural_network import MLPClassifier

MLP = MLPClassifier(random_state=1, max_iter=300).fit(X_train, y_train)
MLP_pred = MLP.predict(X_test)
MLP_score = MLP.score(X_test, y_test)
MLP_score

0.4794099569760295

### 3.4 Keras Sequence Model

In [67]:
import tensorflow as tf
from tensorflow import keras
from tensorflow.keras import layers
from tensorflow.keras.layers.experimental import preprocessing

In [70]:
X.isna().sum().sum()

0

In [71]:
X.describe().transpose()

Unnamed: 0,count,mean,std,min,25%,50%,75%,max
Temperature(F),16270.0,57.597308,17.397745,3.9,44.0,57.9,72.0,95.0
Weather_Condition_Cloudy,16270.0,0.080455,0.272004,0.0,0.0,0.0,0.0,1.0
Weather_Condition_Cloudy / Windy,16270.0,0.000246,0.015678,0.0,0.0,0.0,0.0,1.0
Weather_Condition_Fair,16270.0,0.177505,0.382107,0.0,0.0,0.0,0.0,1.0
Weather_Condition_Fair / Windy,16270.0,0.000061,0.007840,0.0,0.0,0.0,0.0,1.0
...,...,...,...,...,...,...,...,...
day_name_Saturday,16270.0,0.065642,0.247663,0.0,0.0,0.0,0.0,1.0
day_name_Sunday,16270.0,0.079902,0.271149,0.0,0.0,0.0,0.0,1.0
day_name_Thursday,16270.0,0.173940,0.379069,0.0,0.0,0.0,0.0,1.0
day_name_Tuesday,16270.0,0.170191,0.375812,0.0,0.0,0.0,0.0,1.0


In [72]:
normalizer = preprocessing.Normalization()

In [73]:
normalizer.adapt(np.array(X_train))

In [74]:
print(normalizer.mean.numpy())

[5.7652473e+01 7.9901658e-02 3.0731407e-04 1.7739704e-01 7.6828517e-05
 7.3755379e-03 1.2446220e-02 6.0694530e-03 7.6828519e-04 2.3048556e-04
 3.0731407e-04 6.2154271e-02 3.8414259e-04 3.9182543e-03 3.0731407e-04
 8.4588200e-02 1.4643516e-01 5.6161646e-02 1.2446220e-02 2.3816841e-02
 4.6097112e-04 3.8414259e-04 1.5365703e-04 2.7788875e-01 2.3240627e-01
 1.8392748e-01 1.4566687e-01 7.6367550e-02 8.2360171e-02 9.9262446e-02
 1.0840504e-01 1.0341118e-01 7.4523665e-02 7.7443145e-02 7.4754149e-02
 7.9517514e-02 7.7289492e-02 7.3832206e-02 6.7609097e-03 6.0694530e-03
 6.8377382e-03 1.8669330e-02 3.8337432e-02 6.6149354e-02 7.5983405e-02
 7.0144437e-02 4.8555624e-02 6.1078671e-02 5.3626306e-02 4.9708053e-02
 4.9631223e-02 5.7314076e-02 5.4701906e-02 6.6072524e-02 5.9004303e-02
 5.2089736e-02 5.0015364e-02 3.6954518e-02 2.4277812e-02 1.8976644e-02
 1.5596190e-02 1.7255685e-01 6.6379838e-02 7.7980943e-02 1.7624462e-01
 1.7132759e-01 1.6633375e-01]


In [79]:
# dir(normalizer)

['_TF_MODULE_IGNORED_PROPERTIES',
 '__call__',
 '__class__',
 '__delattr__',
 '__dict__',
 '__dir__',
 '__doc__',
 '__eq__',
 '__format__',
 '__ge__',
 '__getattribute__',
 '__getstate__',
 '__gt__',
 '__hash__',
 '__init__',
 '__init_subclass__',
 '__le__',
 '__lt__',
 '__metaclass__',
 '__module__',
 '__ne__',
 '__new__',
 '__reduce__',
 '__reduce_ex__',
 '__repr__',
 '__setattr__',
 '__setstate__',
 '__sizeof__',
 '__str__',
 '__subclasshook__',
 '__weakref__',
 '_activity_regularizer',
 '_add_inbound_node',
 '_add_state_variable',
 '_add_trackable',
 '_add_variable_with_custom_getter',
 '_attribute_sentinel',
 '_auto_track_sub_layers',
 '_autocast',
 '_broadcast_shape',
 '_build_input_shape',
 '_call_accepts_kwargs',
 '_call_arg_was_passed',
 '_call_fn_args',
 '_call_full_argspec',
 '_callable_losses',
 '_checkpoint_dependencies',
 '_clear_losses',
 '_collect_input_masks',
 '_combiner',
 '_compute_dtype',
 '_dedup_weights',
 '_deferred_dependencies',
 '_dtype',
 '_dtype_defaulted_t

In [86]:
first = np.array(X_train[:1])

with np.printoptions(precision=2, suppress=True):
  print('First example:', first)
  print()
  print('Normalized:', normalizer(first).numpy())

First example: [[82.9  0.   0.   0.   0.   0.   0.   0.   0.   0.   0.   0.   0.   0.
   0.   0.   0.   0.   0.   0.   0.   0.   0.   1.   1.   1.   1.   0.
   0.   0.   0.   0.   0.   0.   1.   0.   0.   0.   0.   0.   0.   0.
   0.   0.   0.   0.   0.   0.   0.   0.   0.   0.   0.   0.   0.   0.
   0.   0.   1.   0.   0.   1.   0.   0.   0.   0.   0. ]]

Normalized: [[ 1.46 -0.29 -0.02 -0.46 -0.01 -0.09 -0.11 -0.08 -0.03 -0.02 -0.02 -0.26
  -0.02 -0.06 -0.02 -0.3  -0.41 -0.24 -0.11 -0.16 -0.02 -0.02 -0.01  1.61
   1.82  2.11  2.42 -0.29 -0.3  -0.33 -0.35 -0.34 -0.28 -0.29  3.52 -0.29
  -0.29 -0.28 -0.08 -0.08 -0.08 -0.14 -0.2  -0.27 -0.29 -0.27 -0.23 -0.26
  -0.24 -0.23 -0.23 -0.25 -0.24 -0.27 -0.25 -0.23 -0.23 -0.2   6.34 -0.14
  -0.13  2.19 -0.27 -0.29 -0.46 -0.45 -0.45]]


In [None]:
model = tf.keras.Sequential([
    normalizer,
    layers.Dense(units=1)
    
])