# Emergency Vehicle Detection in Philippine Roads using Pretrained Object Detection Model and Classification Model


___
<center> Andam, Jhun Brian | Garcia, Gabriel Gary | Javier, Alexandria | Jimenez, Clifford Jay | Nietes, Jonzee Carel | Ramos, Royette | Sunga, Edmar </center>

# Data Acquisition

The dataset that was used for this study is a combination of Kaggle datasets, web scraped images and Google map steet view screenshots. Refer to this <a href="Make Dataset.ipynb">notebook</a> for the entire data acquisition process.

We also developed modules for web scraping and localization procedures to create the final dataset. Refer to these python files for the <a href="modules/image_scraper.py">web scraping</a> procedure and this file for <a href="modules/image_scraper.py">localizing</a> the scraped images.

# Image Classification Implementation

Image classification training is a computationally expensive task, so we trained the model in a Kaggle environemnt using their dedicated GPUs. For the Image Classification process refer to this <a href="Image Classification Implementation.ipynb">notebook</a>.

# Localization + Classification

In [1]:
import PIL.Image as Image
import numpy as np
import cv2

import os

**Localization Model**

In [2]:
import torch
from torchvision.utils import draw_bounding_boxes
import torchvision
from torchvision.io import read_image

lm = torch.hub.load('ultralytics/yolov5', 'yolov5s', pretrained=True)
vehicles = {
    2:'car',
    5:'bus', 
    6:'train', 
    7:'truck', 
    8:'boat'
}

lm.conf        = 0.30
lm.iou         = 0.45  
lm.agnostic    = False  
lm.multi_label = False  
lm.classes     = list(vehicles.keys())  
lm.max_det     = 10  
lm.amp         = False

Using cache found in C:\Users\brian/.cache\torch\hub\ultralytics_yolov5_master
YOLOv5  2022-12-28 Python-3.9.0 torch-1.13.1+cpu CPU

Fusing layers... 
YOLOv5s summary: 213 layers, 7225885 parameters, 0 gradients
Adding AutoShape... 


**Classification Model**

In [3]:
from torchvision import transforms
import torchvision.models as models

# State dictionary path
model_path = 'best_DenseNet.pth'
model = models.densenet161(pretrained=False, num_classes=2)

# loading state dictionary to DenseNet-161 architecture
display(model.load_state_dict(torch.load(model_path, map_location=torch.device('cpu'))))

def predict(image, display_image=False):
    # Convert image array to PIL Image
    image = Image.fromarray(image)
    pil_img = Image.fromarray(np.array(image))

    # Pre-process the image
    transformation = transforms.Compose([
        transforms.Resize(256),
        transforms.CenterCrop(224),
        transforms.ToTensor(),
        transforms.Normalize(mean=[0.485, 0.456, 0.406], std=[0.229, 0.224, 0.225])
    ])
    image = transformation(image).unsqueeze(0)

    # Put the model in evaluation mode
    model.eval()

    # Use the model to make predictions
    with torch.no_grad():
        logits = model(image)
        probs = torch.nn.functional.softmax(logits, dim=1)
    class_index = torch.argmax(probs)
    class_index = class_index.item()
    class_name = 'emergency' if class_index == 0 else 'non_emergency'
    class_prob = probs[0][class_index].item()
    if display_image:
        display(pil_img)
    return class_name, np.round(class_prob, 2)

<All keys matched successfully>

`lm()` and `predict()` functions can be used separately, but for this project, we will use a function that will bind their functionality as one.

In [17]:
def draw_bb(image):
    """
    Parameter:
    ---------
    image: either path or numpy.ndarray
    
    Return:
    -------
    None: displays the image with bounding boxes.
    """
    
    # check if passed parameter is a path
    if os.path.exists(image):
        cv2_image = cv2.imread(image)
        cv2_image = cv2.cvtColor(cv2_image, cv2.COLOR_BGR2RGB)
        pil_image = Image.fromarray(cv2_image)
    
    # check if passed parameter is an ndarray
    elif type(image) is np.ndarray:
        cv2_image = np.array(image)
        cv2_image = cv2.cvtColor(cv2_image, cv2.COLOR_BGR2RGB)
        pil_image = Image.fromarray(image)
        
    else:
        print('wrong argument type')
        
    result = lm(pil_image)
    
    mins, maxs = [], []
    for idx in result.xyxy[0]:
        xmin, ymin, xmax, ymax = np.asarray(idx)[:4].astype('uint32')
        mins.append([xmin, ymin])
        maxs.append([xmax, ymax])
        
    preds, confs = [], []
    for mi, ma in zip(mins, maxs):
        mini, maxi = tuple(mi), tuple(ma)
        cropped = pil_image.crop(mini + maxi)
        pred, conf = predict(np.array(cropped))
        preds.append(pred)
        confs.append(conf)

        if pred == 'emergency':
            color = (int(255*conf), 0, 255)
            cv2.rectangle(cv2_image,mini,maxi, color, 2)
            cv2.putText(cv2_image, f"{pred} {np.round(conf,2)}", (mini[0], mini[1]-5), cv2.FONT_HERSHEY_SIMPLEX, 0.6, color, 1)
        
        elif pred == 'non_emergency':
            color = (0, int(255*conf), 255)
            cv2.rectangle(cv2_image,mini,maxi,color,2)
            cv2.putText(cv2_image, f"{pred} {np.round(conf,2)}", (mini[0], mini[1]-5), cv2.FONT_HERSHEY_SIMPLEX, 0.6, color, 1)
    
    display(Image.fromarray(cv2_image))

In [18]:
# only one test video file
video_path = ""

# image test files
image_path = []
for i in os.listdir('test'):
    if 'mp4' in os.path.join('test', i):
        video_path = os.path.join('test', i)
    else:
        image_path.append(os.path.join('test', i))

# Image as Input

for i in image_path:
    draw_bb(i)

From the results, it is apparent that the bounding boxes have two distinct color for each detected class, but these colors will change by intensity based on how confident the model is about its prediction. The higher the intensity color, the more confident the model is.

# Video or Real-Time as Input

In [20]:
def get_bb(frame):
    pil_image = Image.fromarray(np.array(frame))
    res = lm(pil_image)
    
    mins, maxs = [], []
    for idx in res.xyxy[0]:
        xmin, ymin, xmax, ymax = np.asarray(idx)[:4].astype('uint32')
        mins.append([xmin, ymin])
        maxs.append([xmax, ymax])
        
    location, preds, confs = [], [], []
    for mi, ma in zip(mins, maxs):
        location.append(mi + ma)
        
        mini, maxi = tuple(mi), tuple(ma)
        cropped = pil_image.crop(mini + maxi)
        pred, conf = predict(np.array(cropped))
        preds.append(pred)
        confs.append(conf)
        print(f"Detected {pred}: {np.round(conf,2) * 100}%")
        
    return location, preds, confs

# Create a window to display the video input
cv2.namedWindow("Video Input", cv2.WINDOW_NORMAL)

# Open the video file
cap = cv2.VideoCapture(video_path)

while True:
    # Read a frame from the video
    ret, frame = cap.read()
    if not ret:
        break
    # Get the bounding boxes, labels, and confidences from your function
    boxes, labels, confidences = get_bb(frame)

    # Iterate through the bounding boxes and draw them on the frame
    for box, label, confidence in zip(boxes, labels, confidences):
        xmin, ymin, xmax, ymax = box
        if label == 'emergency':
            color = (0, int(255*confidence), 255)
        elif label == 'non_emergency':
            color = (int(255*confidence), 0, 255)
        else:
            color = (0, 255, 0)
        cv2.rectangle(frame, (xmin, ymin), (xmax, ymax), color, 2)
        cv2.putText(frame, f"{label} {np.round(confidence,2)}", (xmin, ymin-10), cv2.FONT_HERSHEY_SIMPLEX, 0.5, color, 2)

    # Display the frame in the window
    cv2.imshow("Video Input", frame)

    # Exit the loop if the 'q' key is pressed
    if cv2.waitKey(1) & 0xFF == ord('q'):
        break

# Release the video and close the window
cap.release()
cv2.destroyAllWindows()

Detected emergency: 50.0%
Detected emergency: 51.0%
Detected non_emergency: 55.00000000000001%
Detected emergency: 51.0%
Detected non_emergency: 53.0%
Detected non_emergency: 54.0%
Detected non_emergency: 68.0%
Detected non_emergency: 84.0%
Detected non_emergency: 50.0%
Detected non_emergency: 90.0%
Detected emergency: 55.00000000000001%
Detected non_emergency: 62.0%
Detected emergency: 51.0%
Detected non_emergency: 50.0%
Detected non_emergency: 77.0%
Detected non_emergency: 77.0%
Detected non_emergency: 81.0%
Detected non_emergency: 100.0%
Detected non_emergency: 60.0%
Detected emergency: 60.0%
Detected emergency: 61.0%
Detected non_emergency: 51.0%
Detected non_emergency: 50.0%
Detected non_emergency: 61.0%
Detected non_emergency: 98.0%
Detected non_emergency: 100.0%
Detected non_emergency: 86.0%
Detected emergency: 61.0%
Detected non_emergency: 96.0%
Detected non_emergency: 56.00000000000001%
Detected non_emergency: 52.0%
Detected emergency: 60.0%
Detected non_emergency: 52.0%
Detec

Detected non_emergency: 100.0%
Detected non_emergency: 97.0%
Detected non_emergency: 100.0%
Detected non_emergency: 76.0%
Detected non_emergency: 57.99999999999999%
Detected non_emergency: 85.0%
Detected non_emergency: 99.0%
Detected non_emergency: 70.0%
Detected non_emergency: 75.0%
Detected non_emergency: 100.0%
Detected non_emergency: 53.0%
Detected non_emergency: 71.0%
Detected non_emergency: 100.0%
Detected non_emergency: 99.0%
Detected non_emergency: 87.0%
Detected non_emergency: 63.0%
Detected non_emergency: 72.0%
Detected non_emergency: 99.0%
Detected non_emergency: 56.99999999999999%
Detected non_emergency: 75.0%
Detected non_emergency: 99.0%
Detected non_emergency: 100.0%
Detected non_emergency: 74.0%
Detected non_emergency: 73.0%
Detected non_emergency: 96.0%
Detected non_emergency: 54.0%
Detected non_emergency: 99.0%
Detected non_emergency: 77.0%
Detected non_emergency: 72.0%
Detected non_emergency: 99.0%
Detected non_emergency: 87.0%
Detected non_emergency: 52.0%
Detected 

Detected non_emergency: 93.0%
Detected non_emergency: 97.0%
Detected non_emergency: 100.0%
Detected non_emergency: 83.0%
Detected non_emergency: 99.0%
Detected non_emergency: 75.0%
Detected emergency: 50.0%
Detected non_emergency: 77.0%
Detected non_emergency: 96.0%
Detected non_emergency: 98.0%
Detected non_emergency: 100.0%
Detected non_emergency: 75.0%
Detected non_emergency: 75.0%
Detected non_emergency: 99.0%
Detected emergency: 57.99999999999999%
Detected emergency: 57.99999999999999%
Detected non_emergency: 96.0%
Detected non_emergency: 97.0%
Detected non_emergency: 100.0%
Detected non_emergency: 100.0%
Detected non_emergency: 78.0%
Detected emergency: 56.00000000000001%
Detected non_emergency: 78.0%
Detected non_emergency: 100.0%
Detected non_emergency: 100.0%
Detected non_emergency: 97.0%
Detected non_emergency: 98.0%
Detected non_emergency: 62.0%
Detected non_emergency: 97.0%
Detected non_emergency: 99.0%
Detected non_emergency: 75.0%
Detected emergency: 56.99999999999999%
De