# 4. Model Development

*   Model Selection: For the study, ANN model would be used.
*   Loss function Definition: sparse categorical crossentropy
*   Model Training and Tuning

**Limitation in this project**: Here, due to lack of resources, model tuning will not be performed.

In [1]:
from google.colab import drive
drive.mount('/content/gdrive')

Mounted at /content/gdrive


In [2]:
%cd /content/gdrive/MyDrive/Crash severity

/content/gdrive/.shortcut-targets-by-id/1PPTdMShCmN_ebwQDrXakKkI4ggw-4Ayf/Crash severity


In [3]:
import pandas as pd
import numpy as np

y_train = pd.read_csv('y_train.csv')
chunksize = 10000
chunks = []
for chunk in pd.read_csv('X_train.csv', chunksize=chunksize, dtype=np.float16):
  chunks.append(chunk)
X_train = pd.concat(chunks, ignore_index=True)

In [4]:
# Free memory by deleting variables from system memory
for var in list(globals().keys()):
  if var not in ['X_train','y_train'] and not var.startswith('__'):
    del globals()[var]

import gc
gc.collect()
print(globals().keys())

dict_keys(['__name__', '__doc__', '__package__', '__loader__', '__spec__', '__builtin__', '__builtins__', '__', '___', 'y_train', 'X_train', 'var', 'gc'])


In [5]:
import tensorflow as tf

def create_dataset(X, y, batch_size, is_training=True):
  dataset = tf.data.Dataset.from_tensor_slices((X, y))
  if is_training:
    dataset = dataset.shuffle(buffer_size=len(X))
  dataset = dataset.batch(batch_size)
  dataset = dataset.prefetch(buffer_size=tf.data.experimental.AUTOTUNE)
  return dataset


In [6]:
batch_size = 32
train_size = int(0.7 * len(X_train))
train_dataset = create_dataset(X_train[:train_size], y_train[:train_size], batch_size).apply(tf.data.experimental.prefetch_to_device('/gpu:0'))
val_dataset = create_dataset(X_train[train_size:], y_train[train_size:], batch_size, is_training=False).apply(tf.data.experimental.prefetch_to_device('/gpu:0'))

In [7]:
# Free memory by deleting variables from system memory
for var in list(globals().keys()):
  if var not in ['train_dataset','val_dataset'] and not var.startswith('__'):
    del globals()[var]

import gc
gc.collect()
print(globals().keys())

dict_keys(['__name__', '__doc__', '__package__', '__loader__', '__spec__', '__builtin__', '__builtins__', '__', '___', 'train_dataset', 'val_dataset', 'var', 'gc'])


In [8]:
import tensorflow as tf

for features, labels in train_dataset.take(1):
  input_shape = features.shape[1:]

inputs = tf.keras.Input(shape=input_shape)
x = tf.keras.layers.Dense(64,activation='relu')(inputs)
x = tf.keras.layers.Dense(64,activation='relu')(x)
outputs = tf.keras.layers.Dense(4,activation='softmax')(x)
model = tf.keras.Model(inputs,outputs)
model.compile(
    optimizer = 'adam',
    loss = 'sparse_categorical_crossentropy',
    metrics = ['accuracy']
)

Change initial_epoch if system crashes

In [None]:
model = tf.keras.models.load_model('best_model.keras')

history = model.fit(
    train_dataset,
    validation_data=val_dataset,
    initial_epoch = 8,
    epochs = 10,
    callbacks = [
        tf.keras.callbacks.ModelCheckpoint(
            filepath='best_model.keras',
            monitor='val_loss',
            save_best_only=True,
            mode='min',
            verbose=1
        ),
        tf.keras.callbacks.EarlyStopping(
            monitor='val_loss',
            patience = 3
        )
    ]
)

Epoch 9/10
 135247/Unknown [1m259s[0m 2ms/step - accuracy: 0.8014 - loss: 0.4773

  self.gen.throw(typ, value, traceback)



Epoch 9: val_loss improved from inf to 0.47717, saving model to best_model.keras
[1m135247/135247[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m337s[0m 2ms/step - accuracy: 0.8014 - loss: 0.4773 - val_accuracy: 0.8019 - val_loss: 0.4772
Epoch 10/10
[1m135238/135247[0m [32m━━━━━━━━━━━━━━━━━━━[0m[37m━[0m [1m0s[0m 2ms/step - accuracy: 0.8018 - loss: 0.4767
Epoch 10: val_loss did not improve from 0.47717
[1m135247/135247[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m329s[0m 2ms/step - accuracy: 0.8018 - loss: 0.4767 - val_accuracy: 0.8002 - val_loss: 0.4777


# 5. Evaluation and Interpretation (1)

*   Performance of the Best Model: The best model for ANN is loaded for evaluation.
*   Error Analysis: Inclusion of other road characteristics data such as shoulder width, injury severity, different road characteristics, skid resistance would have enhanced the quality of this study. Additionally, a computer with better computational resources would also have helped in the study to delve deeper through tuning.
*   Outcome Interpretation: ANNs being a "black-box" model incorporated with Shapley Additive exPlanations (SHAP) helps explaining the feature important for prediction of severity.

In [9]:
def create_dataset(X, y, batch_size, is_training=True):
  dataset = tf.data.Dataset.from_tensor_slices((X, y))
  if is_training:
    dataset = dataset.shuffle(buffer_size=len(X))
  dataset = dataset.batch(batch_size)
  dataset = dataset.prefetch(buffer_size=tf.data.experimental.AUTOTUNE)
  return dataset

In [10]:
import pandas as pd
import numpy as np

X_test = pd.read_csv('X_test.csv',dtype=np.float16)
y_test = pd.read_csv('y_test.csv')

In [11]:
test_dataset = create_dataset(X_test, y_test, batch_size=32, is_training=False).apply(tf.data.experimental.prefetch_to_device('/gpu:0'))

In [12]:
model = tf.keras.models.load_model('best_model.keras')
test_loss, test_accuracy = model.evaluate(test_dataset)
print(f'Test Loss: {test_loss}')
print(f'Test Accuracy: {test_accuracy}')

[1m48303/48303[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m83s[0m 2ms/step - accuracy: 0.8018 - loss: 0.4777
Test Loss: 0.4774448275566101
Test Accuracy: 0.8020319938659668


  self.gen.throw(typ, value, traceback)
