## Analysis of I24 Dataset

In [3]:
import pandas as pd
from utils.data_preprocessor import DataPreprocessor
from utils.ML_classifier import MLClassifier

In [None]:
# Read the raw I24 CSV export. low_memory=False makes pandas parse the file
# in one pass instead of in chunks, which avoids mixed-type column inference.
raw_csv_path = '../data/input/I24_data.csv'
I24_data = pd.read_csv(raw_csv_path, low_memory=False)
I24_data

### Convert Variable Types

In [None]:
# Parse the 'date' column into pandas datetimes so that the .dt accessor can
# be used on it when the full timestamps are assembled.
parsed_dates = pd.to_datetime(I24_data['date'])
I24_data['date'] = parsed_dates

In [None]:
# Convert 'incident_time' and 'data_time' to datetime.
# 'incident_time' holds HH:MM values only, so each one is prefixed with the
# calendar day from the already-parsed 'date' column before parsing.
incident_day = I24_data['date'].dt.strftime('%Y-%m-%d')
incident_stamp = incident_day + ' ' + I24_data['incident_time']
I24_data['incident_time'] = pd.to_datetime(incident_stamp, format='%Y-%m-%d %H:%M')

# Adjust `data_time` for potential rollover to the next or previous day
def adjust_observation_time(row):
    """Return the full observation timestamp for one row, fixing day rollover.

    The row's 'data_time' (an 'HH:MM:SS' string) is combined with the row's
    'date' to form a timestamp. If that timestamp lies more than 12 hours
    before the row's 'incident_time' it is moved one day forward; if it lies
    more than 12 hours after, it is moved one day back. Otherwise it is
    returned unchanged.

    Args:
        row: Mapping-like row with 'date' (datetime-like, supports strftime),
            'data_time' (str) and 'incident_time' (timestamp) entries.

    Returns:
        The adjusted observation timestamp.
    """
    half_day = pd.Timedelta(hours=12)
    one_day = pd.Timedelta(days=1)
    incident = row['incident_time']

    day_text = row['date'].strftime('%Y-%m-%d')
    observed = pd.to_datetime(day_text + ' ' + row['data_time'], format='%Y-%m-%d %H:%M:%S')

    # Observation clock time belongs to the day after the recorded date.
    if observed < incident - half_day:
        return observed + one_day
    # Observation clock time belongs to the day before the recorded date.
    if observed > incident + half_day:
        return observed - one_day
    return observed

# Column-wise form of the day-rollover rule: attach each row's 'date' to its
# HH:MM:SS reading, then move readings that land more than 12 hours before
# (after) the incident one day forward (back). Working on whole columns avoids
# a Python-level function call and a separate pd.to_datetime parse per row.
observed_time = pd.to_datetime(
    I24_data['date'].dt.strftime('%Y-%m-%d') + ' ' + I24_data['data_time'],
    format='%Y-%m-%d %H:%M:%S',
)
offset_from_incident = observed_time - I24_data['incident_time']
half_day = pd.Timedelta(hours=12)
one_day = pd.Timedelta(days=1)

# Both masks are computed before any shift, so a row can move at most once.
reads_too_early = offset_from_incident < -half_day
reads_too_late = offset_from_incident > half_day
observed_time = observed_time.mask(reads_too_early, observed_time + one_day)
observed_time = observed_time.mask(reads_too_late, observed_time - one_day)

I24_data['data_time'] = observed_time

### Filter Dataset

In [None]:
# Minutes elapsed from the incident to each observation
# (negative values are observations taken before the incident).
elapsed = I24_data['data_time'] - I24_data['incident_time']
I24_data['time_diff'] = elapsed.dt.total_seconds().div(60)

# Keep observations from 4 min before up to 7 min after an
# accident/non-accident; both ends of the window are inclusive.
within_window = I24_data['time_diff'].between(-4, 7)
filtered_I24_data = I24_data[within_window]
filtered_I24_data

In [None]:
# Remove the columns that are treated as unnecessary for the remaining steps.
unused_columns = ['road', 'mile', 'type', 'date', 'incident_time', 'incident_hour', 'data_time']
filtered_I24_data = filtered_I24_data.drop(columns=unused_columns)
filtered_I24_data

In [None]:
# Adjust variable names: give the sensor-(i) measurements an explicit
# "(i+0)" suffix, applied in a single in-place rename.
filtered_I24_data.rename(
    columns={
        'speed (i)': 'speed (i+0)',
        'volume (i)': 'volume (i+0)',
        'occupancy (i)': 'occupancy (i+0)',
    },
    inplace=True,
)

### Handle Missing Values

In [None]:
# Report how many missing values each column of the filtered frame contains.
missing_per_column = filtered_I24_data.isnull().sum()
print(" \nCount total NaN at each column in a DataFrame : \n\n", missing_per_column)

In [None]:
# Data Preprocessing: wrap the filtered frame in the project's
# DataPreprocessor (imported from utils.data_preprocessor). The following
# cells call its replace_missings() and one_hot_encode() steps.
preprocessor = DataPreprocessor(filtered_I24_data)

In [None]:
# Run the preprocessor's missing-value step, then fetch and display its result.
# NOTE(review): the replacement strategy is defined in
# DataPreprocessor.replace_missings and is not visible here; confirm it is
# appropriate for these columns.
preprocessor.replace_missings()
preprocessed_I24_data = preprocessor.get_preprocessed_data()
preprocessed_I24_data

### One-Hot Encoding

In [None]:
# Run the preprocessor's one-hot encoding step, then fetch and display the
# updated result.
# NOTE(review): which columns get encoded is decided inside
# DataPreprocessor.one_hot_encode; verify against that class.
preprocessor.one_hot_encode()
preprocessed_I24_data = preprocessor.get_preprocessed_data()
preprocessed_I24_data

In [None]:
# Save preprocessed data (without the index column).
# NOTE(review): this path is relative to the working directory and points at
# 'data/', whereas the input was read from '../data/input/'. Confirm the
# intended output folder; to_csv does not create missing directories.
preprocessed_I24_data.to_csv('data/preprocessed_I24_data.csv', index=False)

### Model Training

In [None]:
# Build the project's MLClassifier (imported from utils.ML_classifier) on the
# preprocessed data, with 'incident at sensor (i)' as the target, and train.
# NOTE(review): the set of models and the train/test split are defined inside
# MLClassifier.train_models; not visible here.
classifier = MLClassifier(data=preprocessed_I24_data, target='incident at sensor (i)')
classifier.train_models()

### Sensitivity Analysis

In [None]:
# Run the classifier's sensitivity analysis and keep its output; the cells
# below pass `results` to the heatmap, summary-table and SHAP helpers.
results = classifier.sensitivity_analysis()

In [None]:
# Plot heatmaps from the sensitivity-analysis results.
classifier.generate_heatmap(results) 

In [None]:
# Results summary: tabulate the sensitivity-analysis results.
classifier.generate_summary_table(results)

### Feature Importance Analysis

In [None]:
# Plot SHAP values for the best performing model.
# NOTE(review): how the "best" model is chosen from `results` is decided
# inside MLClassifier.plot_shap_values; confirm the selection metric there.
classifier.plot_shap_values(results) 