## Analysis of I24 Dataset

In [1]:
import pandas as pd
from data_preprocessor import DataPreprocessor
from ML_classifier import MLClassifier

In [2]:
# Read the raw I24 sensor/incident table from disk and display it
I24_data = pd.read_csv(
    'data/I24_data.csv',
    low_memory=False,  # parse the file in one pass rather than in chunks
)
I24_data

Unnamed: 0,incident at sensor (i),road,mile,type,date,incident_time,incident_hour,data_time,weather,light,...,occupancy (i+2),speed (i+3),volume (i+3),occupancy (i+3),speed (i+4),volume (i+4),occupancy (i+4),speed (i+5),volume (i+5),occupancy (i+5)
0,1,00I24E,177.8,Suspected Minor Injury,2021-03-05,15:20,15,15:05:00,--,Daylight,...,3.897500e+02,31.75,2562.0,4065916.00,,,,,,
1,1,00I24E,177.8,Suspected Minor Injury,2021-03-05,15:20,15,15:05:30,--,Daylight,...,1.155292e+05,31.75,6464.0,4550758.75,,,,,,
2,1,00I24E,177.8,Suspected Minor Injury,2021-03-05,15:20,15,15:06:00,--,Daylight,...,3.303106e+06,25.00,4751.0,2848578.00,,,,,,
3,1,00I24E,177.8,Suspected Minor Injury,2021-03-05,15:20,15,15:06:30,--,Daylight,...,2.780782e+06,24.75,4264.0,6395809.75,,,,,,
4,1,00I24E,177.8,Suspected Minor Injury,2021-03-05,15:20,15,15:07:00,--,Daylight,...,1.896250e+03,22.00,4743.0,1593311.75,,,,,,
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
302741,0,00I24E,182.4,,2021-04-16,18:11,18,18:24:30,--,Daylight,...,9.333333e+00,,,,64.000000,27.0,4.666667,,,
302742,0,00I24E,182.4,,2021-04-16,18:11,18,18:25:00,--,Daylight,...,1.200000e+01,,,,61.333333,32.0,10.000000,,,
302743,0,00I24E,182.4,,2021-04-16,18:11,18,18:25:30,--,Daylight,...,1.066667e+01,,,,63.666667,25.0,8.666667,,,
302744,0,00I24E,182.4,,2021-04-16,18:11,18,18:26:00,--,Daylight,...,7.666667e+00,,,,62.333333,25.0,7.333333,,,


In [3]:
# Inspect the dtype pandas inferred for each column before any conversion
I24_data.dtypes

incident at sensor (i)      int64
road                       object
mile                      float64
type                       object
date                       object
incident_time              object
incident_hour               int64
data_time                  object
weather                    object
light                      object
speed (i-5)               float64
volume (i-5)              float64
occupancy (i-5)           float64
speed (i-4)               float64
volume (i-4)              float64
occupancy (i-4)           float64
speed (i-3)               float64
volume (i-3)              float64
occupancy (i-3)           float64
speed (i-2)               float64
volume (i-2)              float64
occupancy (i-2)           float64
speed (i-1)               float64
volume (i-1)              float64
occupancy (i-1)           float64
speed (i)                 float64
volume (i)                float64
occupancy (i)             float64
speed (i+1)               float64
volume (i+1)  

### Convert Variable Types

In [4]:
# Convert 'date' type
# The column is loaded as object (see the dtypes listing); parse it into datetime
# so that the `.dt` accessor and `strftime` can be used on it in the following cell.
I24_data['date'] = pd.to_datetime(I24_data['date'])

In [5]:
# Build full incident timestamps: the calendar day comes from 'date' and the
# clock time (HH:MM) from the original 'incident_time' strings.
I24_data['incident_time'] = pd.to_datetime(
    I24_data['date'].dt.strftime('%Y-%m-%d')
    + ' '
    + I24_data['incident_time'],
    format='%Y-%m-%d %H:%M',
)

# Adjust `data_time` for potential rollover to the next or previous day
def adjust_observation_time(row):
    """Return the observation timestamp of ``row``, corrected for day rollover.

    The clock time in ``row['data_time']`` (``HH:MM:SS``) is first placed on the
    calendar day ``row['date']``. If the result lies more than 12 hours before
    ``row['incident_time']`` it is moved one day forward; if it lies more than
    12 hours after, it is moved one day back. Otherwise it is returned as is.
    """
    half_day = pd.Timedelta(hours=12)
    one_day = pd.Timedelta(days=1)

    stamp_text = ' '.join((row['date'].strftime('%Y-%m-%d'), row['data_time']))
    observed = pd.to_datetime(stamp_text, format='%Y-%m-%d %H:%M:%S')

    # Signed distance from the incident; negative means "before the incident".
    offset = observed - row['incident_time']
    if offset < -half_day:
        return observed + one_day
    if offset > half_day:
        return observed - one_day
    return observed

def _adjust_observation_times(frame):
    """Column-wise version of the day-rollover rule in ``adjust_observation_time``.

    Places every ``data_time`` clock string (``HH:MM:SS``) on its row's ``date``,
    then shifts the result one day forward where it falls more than 12 hours
    before ``incident_time`` and one day back where it falls more than 12 hours
    after. Working on whole columns avoids a Python-level call (with its own
    ``strftime``/``to_datetime``) for every row of the frame.

    Parameters
    ----------
    frame : pandas.DataFrame
        Must contain a datetime 'date' column, a datetime 'incident_time'
        column and a string 'data_time' column.

    Returns
    -------
    pandas.Series
        Adjusted observation timestamps, indexed like ``frame``.
    """
    half_day = pd.Timedelta(hours=12)
    one_day = pd.Timedelta(days=1)

    observed = pd.to_datetime(
        frame['date'].dt.strftime('%Y-%m-%d') + ' ' + frame['data_time'],
        format='%Y-%m-%d %H:%M:%S',
    )
    # Both masks are computed from the unshifted timestamps; they cannot be true
    # for the same row, so the order of the two `mask` calls does not matter.
    too_early = observed < frame['incident_time'] - half_day
    too_late = observed > frame['incident_time'] + half_day
    return (
        observed
        .mask(too_early, observed + one_day)
        .mask(too_late, observed - one_day)
    )


I24_data['data_time'] = _adjust_observation_times(I24_data)

### Filter Dataset

In [6]:
# Minutes elapsed between each observation and its incident (negative = before)
I24_data['time_diff'] = (
    I24_data['data_time']
    .sub(I24_data['incident_time'])
    .dt.total_seconds()
    .div(60)
)

# Keep only observations from 4 min before up to 7 min after the
# accident/non-accident time (both ends inclusive)
filtered_I24_data = I24_data.loc[I24_data['time_diff'].between(-4, 7)]
filtered_I24_data

Unnamed: 0,incident at sensor (i),road,mile,type,date,incident_time,incident_hour,data_time,weather,light,...,speed (i+3),volume (i+3),occupancy (i+3),speed (i+4),volume (i+4),occupancy (i+4),speed (i+5),volume (i+5),occupancy (i+5),time_diff
22,1,00I24E,177.8,Suspected Minor Injury,2021-03-05,2021-03-05 15:20:00,15,2021-03-05 15:16:00,--,Daylight,...,17.25,5586.0,8585709.75,,,,,,,-4.0
23,1,00I24E,177.8,Suspected Minor Injury,2021-03-05,2021-03-05 15:20:00,15,2021-03-05 15:16:30,--,Daylight,...,14.25,1355.0,3716285.25,,,,,,,-3.5
24,1,00I24E,177.8,Suspected Minor Injury,2021-03-05,2021-03-05 15:20:00,15,2021-03-05 15:17:00,--,Daylight,...,24.50,5466.0,2360606.25,,,,,,,-3.0
25,1,00I24E,177.8,Suspected Minor Injury,2021-03-05,2021-03-05 15:20:00,15,2021-03-05 15:17:30,--,Daylight,...,20.25,2546.0,1080457.00,,,,,,,-2.5
26,1,00I24E,177.8,Suspected Minor Injury,2021-03-05,2021-03-05 15:20:00,15,2021-03-05 15:18:00,--,Daylight,...,19.00,3553.0,1636759.00,,,,,,,-2.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
302724,0,00I24E,182.4,,2021-04-16,2021-04-16 18:11:00,18,2021-04-16 18:16:00,--,Daylight,...,,,,53.333333,42.0,16.333333,,,,5.0
302725,0,00I24E,182.4,,2021-04-16,2021-04-16 18:11:00,18,2021-04-16 18:16:30,--,Daylight,...,,,,57.333333,32.0,10.666667,,,,5.5
302726,0,00I24E,182.4,,2021-04-16,2021-04-16 18:11:00,18,2021-04-16 18:17:00,--,Daylight,...,,,,52.333333,34.0,12.000000,,,,6.0
302727,0,00I24E,182.4,,2021-04-16,2021-04-16 18:11:00,18,2021-04-16 18:17:30,--,Daylight,...,,,,51.666667,32.0,9.000000,,,,6.5


In [7]:
# Remove the location, incident-type and timestamp columns that are no longer needed
filtered_I24_data = filtered_I24_data.drop(
    labels=[
        'road',
        'mile',
        'type',
        'date',
        'incident_time',
        'incident_hour',
        'data_time',
    ],
    axis='columns',
)
filtered_I24_data

Unnamed: 0,incident at sensor (i),weather,light,speed (i-5),volume (i-5),occupancy (i-5),speed (i-4),volume (i-4),occupancy (i-4),speed (i-3),...,speed (i+3),volume (i+3),occupancy (i+3),speed (i+4),volume (i+4),occupancy (i+4),speed (i+5),volume (i+5),occupancy (i+5),time_diff
22,1,--,Daylight,68.500000,49.0,33.500000,71.000000,55.0,53.000000,62.000000,...,17.25,5586.0,8585709.75,,,,,,,-4.0
23,1,--,Daylight,68.500000,67.0,27.000000,71.500000,513.0,959.000000,62.500000,...,14.25,1355.0,3716285.25,,,,,,,-3.5
24,1,--,Daylight,67.000000,67.0,36.500000,70.000000,36.0,42.500000,59.500000,...,24.50,5466.0,2360606.25,,,,,,,-3.0
25,1,--,Daylight,69.500000,1314.0,54.000000,70.000000,20.0,5.000000,,...,20.25,2546.0,1080457.00,,,,,,,-2.5
26,1,--,Daylight,67.500000,1119.0,656.000000,70.500000,311.0,607.000000,37.500000,...,19.00,3553.0,1636759.00,,,,,,,-2.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
302724,0,--,Daylight,58.000000,28.0,4.142857,59.666667,44.0,9.333333,65.000000,...,,,,53.333333,42.0,16.333333,,,,5.0
302725,0,--,Daylight,58.333333,36.0,8.571429,60.333333,53.0,11.333333,64.000000,...,,,,57.333333,32.0,10.666667,,,,5.5
302726,0,--,Daylight,59.000000,39.0,8.857143,59.000000,33.0,7.333333,65.166667,...,,,,52.333333,34.0,12.000000,,,,6.0
302727,0,--,Daylight,57.500000,45.0,6.000000,60.833333,26.0,4.833333,62.800000,...,,,,51.666667,32.0,9.000000,,,,6.5


### Handle Missing Values

In [8]:
# Report how many missing values each column of the filtered frame contains
print(
    " \nCount total NaN at each column in a DataFrame : \n\n",
    filtered_I24_data.isna().sum(),
)

 
Count total NaN at each column in a DataFrame : 

 incident at sensor (i)        0
weather                       0
light                         0
speed (i-5)               20049
volume (i-5)              18137
occupancy (i-5)           18137
speed (i-4)               20535
volume (i-4)              18355
occupancy (i-4)           18355
speed (i-3)               19185
volume (i-3)              16331
occupancy (i-3)           16331
speed (i-2)               15103
volume (i-2)              12111
occupancy (i-2)           12111
speed (i-1)               17387
volume (i-1)              14605
occupancy (i-1)           14605
speed (i)                  9069
volume (i)                 7332
occupancy (i)              7332
speed (i+1)               13634
volume (i+1)              11504
occupancy (i+1)           11504
speed (i+2)                9417
volume (i+2)               7608
occupancy (i+2)            7608
speed (i+3)               10894
volume (i+3)               8882
occupancy (i+3)    

In [9]:
# Data Preprocessing
# Hand the filtered frame to the project's DataPreprocessor (imported from
# data_preprocessor); the following cells call its methods step by step.
preprocessor = DataPreprocessor(filtered_I24_data)

In [10]:
# Run the preprocessor's missing-value step, then fetch and display the result.
# NOTE(review): in the displayed frame the cells that were NaN before this step
# show -99999, so handle_missings presumably fills with that sentinel — confirm
# in DataPreprocessor and check that it is appropriate for every model trained later.
preprocessor.handle_missings()
preprocessed_I24_data = preprocessor.get_preprocessed_data()
preprocessed_I24_data

Unnamed: 0,incident at sensor (i),weather,light,speed (i-5),volume (i-5),occupancy (i-5),speed (i-4),volume (i-4),occupancy (i-4),speed (i-3),...,speed (i+3),volume (i+3),occupancy (i+3),speed (i+4),volume (i+4),occupancy (i+4),speed (i+5),volume (i+5),occupancy (i+5),time_diff
22,1.0,--,Daylight,68.500000,49.0,33.500000,71.000000,55.0,53.000000,62.000000,...,17.25,5586.0,8585709.75,-99999.000000,-99999.0,-99999.000000,-99999.0,-99999.0,-99999.0,-4.0
23,1.0,--,Daylight,68.500000,67.0,27.000000,71.500000,513.0,959.000000,62.500000,...,14.25,1355.0,3716285.25,-99999.000000,-99999.0,-99999.000000,-99999.0,-99999.0,-99999.0,-3.5
24,1.0,--,Daylight,67.000000,67.0,36.500000,70.000000,36.0,42.500000,59.500000,...,24.50,5466.0,2360606.25,-99999.000000,-99999.0,-99999.000000,-99999.0,-99999.0,-99999.0,-3.0
25,1.0,--,Daylight,69.500000,1314.0,54.000000,70.000000,20.0,5.000000,-99999.000000,...,20.25,2546.0,1080457.00,-99999.000000,-99999.0,-99999.000000,-99999.0,-99999.0,-99999.0,-2.5
26,1.0,--,Daylight,67.500000,1119.0,656.000000,70.500000,311.0,607.000000,37.500000,...,19.00,3553.0,1636759.00,-99999.000000,-99999.0,-99999.000000,-99999.0,-99999.0,-99999.0,-2.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
302724,0.0,--,Daylight,58.000000,28.0,4.142857,59.666667,44.0,9.333333,65.000000,...,-99999.00,-99999.0,-99999.00,53.333333,42.0,16.333333,-99999.0,-99999.0,-99999.0,5.0
302725,0.0,--,Daylight,58.333333,36.0,8.571429,60.333333,53.0,11.333333,64.000000,...,-99999.00,-99999.0,-99999.00,57.333333,32.0,10.666667,-99999.0,-99999.0,-99999.0,5.5
302726,0.0,--,Daylight,59.000000,39.0,8.857143,59.000000,33.0,7.333333,65.166667,...,-99999.00,-99999.0,-99999.00,52.333333,34.0,12.000000,-99999.0,-99999.0,-99999.0,6.0
302727,0.0,--,Daylight,57.500000,45.0,6.000000,60.833333,26.0,4.833333,62.800000,...,-99999.00,-99999.0,-99999.00,51.666667,32.0,9.000000,-99999.0,-99999.0,-99999.0,6.5


### One-Hot Encoding

In [11]:
# Adjust 'weather' variable
# Relabel the '--' placeholder as 'Unknown' so it reads as a category name.
# NOTE(review): this edits the frame returned by get_preprocessed_data(), not the
# preprocessor directly. The relabel only reaches the next preprocessor step if
# that frame is the same object the preprocessor holds — confirm that
# get_preprocessed_data() does not return a copy.
preprocessed_I24_data['weather'] = preprocessed_I24_data['weather'].replace({'--': 'Unknown'})

In [12]:
# Run the preprocessor's one-hot encoding step, then fetch and display the result.
# NOTE(review): presumably this replaces the categorical 'weather' and 'light'
# columns with indicator columns — confirm against DataPreprocessor.one_hot_encode.
preprocessor.one_hot_encode()
preprocessed_I24_data = preprocessor.get_preprocessed_data()
preprocessed_I24_data

Unnamed: 0,incident at sensor (i),speed (i-5),volume (i-5),occupancy (i-5),speed (i-4),volume (i-4),occupancy (i-4),speed (i-3),volume (i-3),occupancy (i-3),...,occupancy (i+4),speed (i+5),volume (i+5),occupancy (i+5),time_diff,weather_Snow,weather_Unknown,light_Dawn,light_Daylight,light_Dusk
22,1.0,68.500000,49.0,33.500000,71.000000,55.0,53.000000,62.000000,45.0,88.500000,...,-99999.000000,-99999.0,-99999.0,-99999.0,-4.0,False,True,False,True,False
23,1.0,68.500000,67.0,27.000000,71.500000,513.0,959.000000,62.500000,511.0,1010.500000,...,-99999.000000,-99999.0,-99999.0,-99999.0,-3.5,False,True,False,True,False
24,1.0,67.000000,67.0,36.500000,70.000000,36.0,42.500000,59.500000,611.0,1360.500000,...,-99999.000000,-99999.0,-99999.0,-99999.0,-3.0,False,True,False,True,False
25,1.0,69.500000,1314.0,54.000000,70.000000,20.0,5.000000,-99999.000000,0.0,30.000000,...,-99999.000000,-99999.0,-99999.0,-99999.0,-2.5,False,True,False,True,False
26,1.0,67.500000,1119.0,656.000000,70.500000,311.0,607.000000,37.500000,39.0,1209.500000,...,-99999.000000,-99999.0,-99999.0,-99999.0,-2.0,False,True,False,True,False
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
302724,0.0,58.000000,28.0,4.142857,59.666667,44.0,9.333333,65.000000,38.0,7.833333,...,16.333333,-99999.0,-99999.0,-99999.0,5.0,False,True,False,True,False
302725,0.0,58.333333,36.0,8.571429,60.333333,53.0,11.333333,64.000000,35.0,6.666667,...,10.666667,-99999.0,-99999.0,-99999.0,5.5,False,True,False,True,False
302726,0.0,59.000000,39.0,8.857143,59.000000,33.0,7.333333,65.166667,20.0,4.333333,...,12.000000,-99999.0,-99999.0,-99999.0,6.0,False,True,False,True,False
302727,0.0,57.500000,45.0,6.000000,60.833333,26.0,4.833333,62.800000,27.0,10.166667,...,9.000000,-99999.0,-99999.0,-99999.0,6.5,False,True,False,True,False


### Model Training

In [15]:
# Train the project's classifiers to predict the 'incident at sensor (i)' label
# from the remaining columns of the preprocessed frame.
# NOTE(review): each incident/non-incident contributes several consecutive
# 30-second rows. If MLClassifier splits train/test by row rather than by
# incident, near-duplicate rows could land on both sides and inflate the
# tree-model scores — confirm the split strategy.
# NOTE(review): the -99999 fill values seen in the frame may distort a linear
# model; presumably worth checking how MLClassifier scales features — TODO confirm.
classifier = MLClassifier(data=preprocessed_I24_data, target='incident at sensor (i)')
classifier.train_models()

Fitting 5 folds for each of 5 candidates, totalling 25 fits




Best parameters for Logistic Regression: {'classifier__C': 0.1}
AUC-ROC for Logistic Regression: 0.5230
-----
Fitting 5 folds for each of 3 candidates, totalling 15 fits
Best parameters for Random Forest: {'classifier__n_estimators': 100}
AUC-ROC for Random Forest: 0.9956
-----
Fitting 5 folds for each of 9 candidates, totalling 45 fits
Best parameters for XGBoost: {'classifier__learning_rate': 0.01, 'classifier__n_estimators': 1000}
AUC-ROC for XGBoost: 0.9781
-----
