# Analysis of I75 Dataset

In [1]:
import pandas as pd
from data_preprocessor import DataPreprocessor
from ML_classifier import MLClassifier
import csv

IProgress not found. Please update jupyter and ipywidgets. See https://ipywidgets.readthedocs.io/en/stable/user_install.html


In [2]:
# Read the raw I75 table from disk. low_memory=False makes pandas parse the
# file in a single pass instead of in chunks, which avoids mixed-dtype
# inference on columns.
I75_data = pd.read_csv(filepath_or_buffer='data/I75_data.csv', low_memory=False)
# Preview the first five rows.
I75_data.head(n=5)

Unnamed: 0,incident at sensor (i),road,mile,type,date,incident_time,incident_hour,data_time,weather,light,...,occupancy (i+2),speed (i+3),volume (i+3),occupancy (i+3),speed (i+4),volume (i+4),occupancy (i+4),speed (i+5),volume (i+5),occupancy (i+5)
0,1,00I75S,13.7,Suspected Minor Injury,2020-12-09,14:57,14,14:42:00,--,Daylight,...,9.0,62.25,9.0,3.5,41.666667,9.0,3.75,,,
1,1,00I75S,13.7,Suspected Minor Injury,2020-12-09,14:57,14,14:42:30,--,Daylight,...,10.5,64.0,10.0,3.75,43.333333,12.0,8.0,,,
2,1,00I75S,13.7,Suspected Minor Injury,2020-12-09,14:57,14,14:43:00,--,Daylight,...,7.0,58.0,4.0,1.75,43.333333,12.0,6.0,,,
3,1,00I75S,13.7,Suspected Minor Injury,2020-12-09,14:57,14,14:43:30,--,Daylight,...,9.0,62.75,7.0,2.25,43.333333,8.0,4.75,,,
4,1,00I75S,13.7,Suspected Minor Injury,2020-12-09,14:57,14,14:44:00,--,Daylight,...,6.5,60.333333,10.0,5.5,46.0,23.0,10.75,,,


## Variable type conversion

In [3]:
# Parse 'date' into datetime64, then rebuild 'incident_time' as a full
# timestamp by joining the formatted date with the 'HH:MM' clock string.
I75_data['date'] = pd.to_datetime(I75_data['date'])
I75_data['incident_time'] = pd.to_datetime(
    I75_data['date'].dt.strftime('%Y-%m-%d').str.cat(I75_data['incident_time'], sep=' '),
    format='%Y-%m-%d %H:%M',
)

# Adjust `data_time` for potential rollover to the next or previous day
def adjust_observation_time(row):
    """Return the observation timestamp for ``row``, corrected for day rollover.

    The clock string in ``row['data_time']`` ('HH:MM:SS') is combined with the
    calendar day in ``row['date']``. If the resulting timestamp lies more than
    12 hours before ``row['incident_time']`` it is moved one day forward; if it
    lies more than 12 hours after, it is moved one day back. A gap of exactly
    12 hours is left untouched.

    Parameters
    ----------
    row : mapping-like
        Must provide 'date' (an object with ``strftime``), 'data_time'
        (a string) and 'incident_time' (a timestamp comparable with the
        parsed observation time).

    Returns
    -------
    The parsed, possibly day-shifted, observation timestamp.
    """
    half_day = pd.Timedelta(hours=12)
    one_day = pd.Timedelta(days=1)

    stamp_text = row['date'].strftime('%Y-%m-%d') + ' ' + row['data_time']
    observed = pd.to_datetime(stamp_text, format='%Y-%m-%d %H:%M:%S')

    # Signed distance from the incident; negative means "before the incident".
    offset = observed - row['incident_time']
    if offset < -half_day:
        return observed + one_day
    if offset > half_day:
        return observed - one_day
    return observed

# Replace the 'HH:MM:SS' strings in 'data_time' with full timestamps by
# running the rollover correction once per row.
I75_data['data_time'] = I75_data.apply(adjust_observation_time, axis='columns')

## Filtering the dataset

In [5]:
# Signed offset of each observation from its incident, in minutes
# (negative values mean the observation precedes the incident).
I75_data['time_diff'] = (
    (I75_data['data_time'] - I75_data['incident_time']).dt.total_seconds().div(60)
)

# Keep only observations from 4 minutes before up to 7 minutes after the
# accident/non-accident time; both endpoints are included.
filtered_I75_data = I75_data.loc[I75_data['time_diff'].between(-4, 7)]
filtered_I75_data.head()

Unnamed: 0,incident at sensor (i),road,mile,type,date,incident_time,incident_hour,data_time,weather,light,...,speed (i+3),volume (i+3),occupancy (i+3),speed (i+4),volume (i+4),occupancy (i+4),speed (i+5),volume (i+5),occupancy (i+5),time_diff
22,1,00I75S,13.7,Suspected Minor Injury,2020-12-09,2020-12-09 14:57:00,14,2020-12-09 14:53:00,--,Daylight,...,64.0,15.0,4.5,22.0,5.0,28.75,,,,-4.0
23,1,00I75S,13.7,Suspected Minor Injury,2020-12-09,2020-12-09 14:57:00,14,2020-12-09 14:53:30,--,Daylight,...,63.0,19.0,6.25,21.5,8.0,49.25,,,,-3.5
24,1,00I75S,13.7,Suspected Minor Injury,2020-12-09,2020-12-09 14:57:00,14,2020-12-09 14:54:00,--,Daylight,...,64.666667,12.0,5.5,32.0,19.0,41.0,,,,-3.0
25,1,00I75S,13.7,Suspected Minor Injury,2020-12-09,2020-12-09 14:57:00,14,2020-12-09 14:54:30,--,Daylight,...,64.333333,11.0,5.0,27.5,14.0,34.5,,,,-2.5
26,1,00I75S,13.7,Suspected Minor Injury,2020-12-09,2020-12-09 14:57:00,14,2020-12-09 14:55:00,--,Daylight,...,63.666667,16.0,7.0,33.0,11.0,31.75,,,,-2.0
