# Generation of Negative Events

## 0. Load Dependencies

In [1]:
# Load Libraries
import numpy as np 
import pandas as pd

import matplotlib
import matplotlib.pyplot as plt
%matplotlib inline
from matplotlib.ticker import PercentFormatter
import seaborn as sns
import plotly.offline as py
py.init_notebook_mode(connected=True)
import plotly.graph_objs as go
import plotly.tools as tls
import warnings

## 1. Load Data

In [2]:
data = pd.read_csv("Data/NY_Accidents_June20.csv")

In [3]:
data.shape

(8384, 53)

In [4]:
data['acc_year'] = pd.to_datetime(data['Start_Time']).dt.year
data['acc_month'] = pd.to_datetime(data['Start_Time']).dt.month
data['acc_hr_day'] = pd.to_datetime(data['Start_Time']).dt.hour
data['new_date'] = pd.to_datetime(data['Start_Time']).dt.date
data['day_name'] = pd.to_datetime(data['Start_Time']).dt.day_name()

In [5]:
list(data.columns)

['Unnamed: 0',
 'ID',
 'Source',
 'TMC',
 'Severity',
 'Start_Time',
 'End_Time',
 'Start_Lat',
 'Start_Lng',
 'Distance(mi)',
 'Description',
 'Number',
 'Street',
 'Side',
 'City',
 'County',
 'State',
 'Zipcode',
 'Country',
 'Timezone',
 'Airport_Code',
 'Weather_Timestamp',
 'Temperature(F)',
 'Wind_Chill(F)',
 'Humidity(%)',
 'Pressure(in)',
 'Visibility(mi)',
 'Wind_Direction',
 'Wind_Speed(mph)',
 'Precipitation(in)',
 'Weather_Condition',
 'Amenity',
 'Bump',
 'Crossing',
 'Give_Way',
 'Junction',
 'No_Exit',
 'Railway',
 'Roundabout',
 'Station',
 'Stop',
 'Traffic_Calming',
 'Traffic_Signal',
 'Turning_Loop',
 'Sunrise_Sunset',
 'Civil_Twilight',
 'Nautical_Twilight',
 'Astronomical_Twilight',
 'acc_year',
 'acc_month',
 'acc_hr_day',
 'new_date',
 'day_name']

In [6]:
data = data[~data['Temperature(F)'].isna()]
data = data[~data['Weather_Condition'].isna()]

## 2. Generate Negative Events

**Methodology:**

1. Find all accidents that took place on a particular date.
2. Iterate through all possible hours of accidents on that date.
3. For each hour, locate other events that are not co-located with these accidents.
4. Sample these other events without replace; sample number is *(3 x number of accidents in that hour on that date)*.
5. Modify the features (*time* and *weather*, see below) to match those of the accident.
6. Repeat step 1 to 5 for all possible dates.

In [7]:
# randsom state seed number
random_seed = 42

# set the ratio between negative to positive samples
ratio = 3  

# features to be modified on negative samples
modified_features = ['Start_Time', 
                     'End_Time',
                     'Temperature(F)',
                     'Weather_Condition', 
                     'Sunrise_Sunset', 
                     'Civil_Twilight', 
                     'Nautical_Twilight',
                     'Astronomical_Twilight',
                     'acc_year',
                     'acc_month',
                     'acc_hr_day',
                     'new_date',
                     'day_name']

# initialize an empty negative event pandas dataframe
negative_events = pd.DataFrame(columns = data.columns)


In [8]:
# identify unique accident dates
date_list = data.new_date.unique()

# iterate through all possible dates (step 6 of methodology)
for date in date_list:
    
    # find all accidents that took place on that date (step 1 of methodology)
    event_list = data[data.new_date == date]
    
    # find unique hours of accidents on that date
    hour_list = event_list.acc_hr_day.unique()
    
    # iterate through all possible accident hours (step 2 of methodology)
    for hour in hour_list:
        
        exclude_list = event_list[event_list.acc_hr_day == hour]  # accidents on that date in that hour
        number_of_accidents = len(exclude_list)  # used to determine number of samples (step 4 of methodology)
        exclude_lat = exclude_list.Start_Lat.unique() # latitude to avoid
        exclude_lng = exclude_list.Start_Lng.unique() # logitude to avoid
        
        # determine a sample list that excludes the locations of the accidents on that date in that hour
        # this is step 3 of the methodology
        sample_list = data[(~data.Start_Lat.isin(exclude_lat)) | (~data.Start_Lng.isin(exclude_lng))]
        
        # sample at three times more than the number of accidents on that date in that hour
        # this is step 4 of the metholodogy
        negative_samples = sample_list.sample(n=ratio*number_of_accidents, replace=False, random_state=random_seed)
        
        # modify the features of the negative samples to match those of the accident
        # we chose the first accident to be representative of the weather conditions
        # this is step 5 of the methodology
        negative_samples[modified_features] = exclude_list.iloc[0][modified_features].values
        
        # append the negative samples to the overall negative events that can be used to generate our model
        negative_events = negative_events.append(negative_samples)
          

In [9]:
negative_events

Unnamed: 0.1,Unnamed: 0,ID,Source,TMC,Severity,Start_Time,End_Time,Start_Lat,Start_Lng,Distance(mi),...,Turning_Loop,Sunrise_Sunset,Civil_Twilight,Nautical_Twilight,Astronomical_Twilight,acc_year,acc_month,acc_hr_day,new_date,day_name
1334,478976,A-478982,MapQuest,201.0,2,2016-12-01 08:21:11,2016-12-01 08:50:48,40.850063,-73.944794,1.970,...,False,Day,Day,Day,Day,2016,12,8,2016-12-01,Thursday
1592,589230,A-589238,MapQuest,201.0,2,2016-12-01 08:21:11,2016-12-01 08:50:48,40.746658,-73.969345,0.000,...,False,Day,Day,Day,Day,2016,12,8,2016-12-01,Thursday
6174,2624326,A-2624367,Bing,,2,2016-12-01 08:21:11,2016-12-01 08:50:48,40.793840,-73.931880,0.060,...,False,Day,Day,Day,Day,2016,12,8,2016-12-01,Thursday
5449,2425131,A-2425172,MapQuest,201.0,3,2016-12-01 08:21:11,2016-12-01 08:50:48,40.850067,-73.944817,1.650,...,False,Day,Day,Day,Day,2016,12,8,2016-12-01,Thursday
3922,1595383,A-1595416,MapQuest,201.0,2,2016-12-01 08:21:11,2016-12-01 08:50:48,40.782791,-73.957397,0.000,...,False,Day,Day,Day,Day,2016,12,8,2016-12-01,Thursday
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
5581,2524315,A-2524356,Bing,,2,2017-07-31 19:34:51,2017-08-01 01:34:51,40.716430,-73.975590,0.305,...,False,Day,Day,Day,Day,2017,7,19,2017-07-31,Monday
1461,529601,A-529609,MapQuest,201.0,2,2017-07-31 19:34:51,2017-08-01 01:34:51,40.710560,-73.983292,0.000,...,False,Day,Day,Day,Day,2017,7,19,2017-07-31,Monday
6104,2586282,A-2586323,Bing,,2,2019-05-25 14:31:30,2019-05-25 16:02:18,40.846847,-73.932299,0.477,...,False,Day,Day,Day,Day,2019,5,14,2019-05-25,Saturday
5581,2524315,A-2524356,Bing,,2,2019-05-25 14:31:30,2019-05-25 16:02:18,40.716430,-73.975590,0.305,...,False,Day,Day,Day,Day,2019,5,14,2019-05-25,Saturday


In [10]:
# save the negative events to a csv file
negative_events.to_csv("Data/NY_Negatives_June20.csv", index=False)