# AI and Deep Learning - midterm_exam (Part 2)

#### Author: Ivan Flores Martinez

## Import libraries

In [9]:
# Import libraries
%matplotlib inline
import numpy as np
import tqdm
import os
import json
import random
import cv2
from matplotlib import pyplot as plt
import time
import imutils

# import detectron2
import detectron2

# import detectron2 utilities
from detectron2.utils.logger import setup_logger
from detectron2 import model_zoo
from detectron2.engine import DefaultPredictor
from detectron2.config import get_cfg
from detectron2.data import MetadataCatalog, DatasetCatalog
from detectron2.structures import BoxMode
from detectron2 import model_zoo
from detectron2.config import get_cfg
from detectron2.utils.video_visualizer import VideoVisualizer
from detectron2.utils.visualizer import ColorMode, Visualizer

## Load the data

In [10]:
# Specify path to images
img_dir = "E:/Pascal VOC 2012.v3-416x416.coco/train"

In [11]:
# Register dataset
from detectron2.data.datasets import register_coco_instances
register_coco_instances("pascal", {}, "E:/Pascal VOC 2012.v3-416x416.coco/train/_annotations.coco.json", "E:/Pascal VOC 2012.v3-416x416.coco/train/")

In [12]:
# Get dataset metadata and load images
from detectron2.data import MetadataCatalog
pascal_metadata = MetadataCatalog.get("pascal")
dataset_dicts = DatasetCatalog.get("pascal")

Category ids in annotations are not in [1, #categories]! We'll apply a mapping for you.

[32m[04/11 15:31:28 d2.data.datasets.coco]: [0mLoaded 13690 images in COCO format from E:/Pascal VOC 2012.v3-416x416.coco/train/_annotations.coco.json


In [13]:
# Print classes in dataset
MetadataCatalog.get("pascal").thing_classes 

['VOC',
 'aeroplane',
 'bicycle',
 'bird',
 'boat',
 'bottle',
 'bus',
 'car',
 'cat',
 'chair',
 'cow',
 'diningtable',
 'dog',
 'horse',
 'motorbike',
 'person',
 'pottedplant',
 'sheep',
 'sofa',
 'train',
 'tvmonitor']

## Train the model

In [14]:
# Import DefaultTrainer from the engine module and config file
from detectron2.engine import DefaultTrainer
from detectron2.config import get_cfg

# Load config file
cfg = get_cfg()

# Modify config file
cfg.merge_from_file(model_zoo.get_config_file("COCO-Detection/faster_rcnn_R_101_FPN_3x.yaml"))
cfg.DATASETS.TRAIN = ("pascal",)
cfg.DATASETS.TEST = ()
cfg.DATALOADER.NUM_WORKERS = 1

# Let training initialize from model zoo
cfg.MODEL.WEIGHTS = model_zoo.get_checkpoint_url("COCO-Detection/faster_rcnn_R_101_FPN_3x.yaml")

# Specify batch size, learning rate and other hyperparameters
cfg.SOLVER.IMS_PER_BATCH = 2
cfg.SOLVER.BASE_LR = 0.001
cfg.SOLVER.MAX_ITER = 500
cfg.MODEL.ROI_HEADS.BATCH_SIZE_PER_IMAGE = 128   # faster, enough for this dataset (default: 512)
cfg.MODEL.ROI_HEADS.NUM_CLASSES = 20  # twenty classes
os.makedirs(cfg.OUTPUT_DIR, exist_ok=True)
trainer = DefaultTrainer(cfg)
trainer.resume_or_load(resume=False)

# Train the model
trainer.train()

[32m[04/11 15:31:31 d2.engine.defaults]: [0mModel:
GeneralizedRCNN(
  (backbone): FPN(
    (fpn_lateral2): Conv2d(256, 256, kernel_size=(1, 1), stride=(1, 1))
    (fpn_output2): Conv2d(256, 256, kernel_size=(3, 3), stride=(1, 1), padding=(1, 1))
    (fpn_lateral3): Conv2d(512, 256, kernel_size=(1, 1), stride=(1, 1))
    (fpn_output3): Conv2d(256, 256, kernel_size=(3, 3), stride=(1, 1), padding=(1, 1))
    (fpn_lateral4): Conv2d(1024, 256, kernel_size=(1, 1), stride=(1, 1))
    (fpn_output4): Conv2d(256, 256, kernel_size=(3, 3), stride=(1, 1), padding=(1, 1))
    (fpn_lateral5): Conv2d(2048, 256, kernel_size=(1, 1), stride=(1, 1))
    (fpn_output5): Conv2d(256, 256, kernel_size=(3, 3), stride=(1, 1), padding=(1, 1))
    (top_block): LastLevelMaxPool()
    (bottom_up): ResNet(
      (stem): BasicStem(
        (conv1): Conv2d(
          3, 64, kernel_size=(7, 7), stride=(2, 2), padding=(3, 3), bias=False
          (norm): FrozenBatchNorm2d(num_features=64, eps=1e-05)
        )
      )
 

Skip loading parameter 'roi_heads.box_predictor.cls_score.weight' to the model due to incompatible shapes: (81, 1024) in the checkpoint but (21, 1024) in the model! You might want to double check if this is expected.
Skip loading parameter 'roi_heads.box_predictor.cls_score.bias' to the model due to incompatible shapes: (81,) in the checkpoint but (21,) in the model! You might want to double check if this is expected.
Skip loading parameter 'roi_heads.box_predictor.bbox_pred.weight' to the model due to incompatible shapes: (320, 1024) in the checkpoint but (80, 1024) in the model! You might want to double check if this is expected.
Skip loading parameter 'roi_heads.box_predictor.bbox_pred.bias' to the model due to incompatible shapes: (320,) in the checkpoint but (80,) in the model! You might want to double check if this is expected.
Some model parameters or buffers are not found in the checkpoint:
[34mroi_heads.box_predictor.bbox_pred.{bias, weight}[0m
[34mroi_heads.box_predictor.c

[32m[04/11 15:31:32 d2.engine.train_loop]: [0mStarting training from iteration 0


  return _VF.meshgrid(tensors, **kwargs)  # type: ignore[attr-defined]


[32m[04/11 15:31:42 d2.utils.events]: [0m eta: 0:01:51  iter: 19  total_loss: 3.854  loss_cls: 3.115  loss_box_reg: 0.6993  loss_rpn_cls: 0.01831  loss_rpn_loc: 0.005493  time: 0.2287  data_time: 0.0738  lr: 3.8962e-05  max_mem: 2352M
[32m[04/11 15:31:46 d2.utils.events]: [0m eta: 0:01:45  iter: 39  total_loss: 3.022  loss_cls: 2.498  loss_box_reg: 0.6348  loss_rpn_cls: 0.009855  loss_rpn_loc: 0.004768  time: 0.2246  data_time: 0.0008  lr: 7.8922e-05  max_mem: 2352M
[32m[04/11 15:31:51 d2.utils.events]: [0m eta: 0:01:41  iter: 59  total_loss: 1.904  loss_cls: 1.09  loss_box_reg: 0.6733  loss_rpn_cls: 0.009768  loss_rpn_loc: 0.008667  time: 0.2254  data_time: 0.0008  lr: 0.00011888  max_mem: 2352M
[32m[04/11 15:31:55 d2.utils.events]: [0m eta: 0:01:37  iter: 79  total_loss: 1.473  loss_cls: 0.7933  loss_box_reg: 0.6502  loss_rpn_cls: 0.004958  loss_rpn_loc: 0.005519  time: 0.2272  data_time: 0.0009  lr: 0.00015884  max_mem: 2352M
[32m[04/11 15:32:00 d2.utils.events]: [0m eta: 

## Save the model

In [15]:
# Save model to folder
from detectron2.checkpoint import DetectionCheckpointer, Checkpointer
checkpointer = DetectionCheckpointer(trainer, save_dir=cfg.OUTPUT_DIR)
checkpointer.save("pascal")  

## Load model for detection in webcam video

In [16]:
# Load saved model
cfg.MODEL.WEIGHTS = os.path.join(cfg.OUTPUT_DIR, "model_final.pth")

# Create predictions for test dataset
cfg.MODEL.ROI_HEADS.SCORE_THRESH_TEST = 0.8   # set the testing threshold for this model
cfg.DATASETS.TEST = ("pascal", )
predictor = DefaultPredictor(cfg)

## Record video with webcam

In [None]:
#Capture video from webcam
vid_capture = cv2.VideoCapture(0)
vid_cod = cv2.VideoWriter_fourcc(*'XVID')
# Create output file
output = cv2.VideoWriter("videos/video_raw.mp4", vid_cod, 20.0, (640,480))
# Record until user hits x to stop the video
while(True):
     # Capture each frame of webcam video
    ret,frame = vid_capture.read()
    cv2.imshow("My webcam video", frame)
     # Write every frame to mp4 file
    output.write(frame)
     # Close and break the loop after pressing "x" key
    if cv2.waitKey(1) &0XFF == ord('x'):
        break
# close the already opened camera
vid_capture.release()
# close the already opened file
output.release()
# close the window and de-allocate any associated memory usage
cv2.destroyAllWindows()

# Pass video through detectron model 

In [18]:
# Load video
video = cv2.VideoCapture('C:/Users/USER/Videos/video_raw.mp4')

# Extract video properties
width = int(video.get(cv2.CAP_PROP_FRAME_WIDTH))
height = int(video.get(cv2.CAP_PROP_FRAME_HEIGHT))
frames_per_second = video.get(cv2.CAP_PROP_FPS)
num_frames = int(video.get(cv2.CAP_PROP_FRAME_COUNT))

# Initialize video writer
video_writer = cv2.VideoWriter('C:/Users/USER/Videos/video_pred.mp4', fourcc=cv2.VideoWriter_fourcc(*"mp4v"), fps=float(frames_per_second), frameSize=(width, height), isColor=True)

# Load saved model
cfg.MODEL.WEIGHTS = os.path.join(cfg.OUTPUT_DIR, "model_final.pth")

# Initialize predictor
cfg.MODEL.ROI_HEADS.SCORE_THRESH_TEST = 0.8   # set the testing threshold for this model
cfg.DATASETS.TEST = ("pascal", )
predictor = DefaultPredictor(cfg)

# Initialize visualizer
v = VideoVisualizer(MetadataCatalog.get(cfg.DATASETS.TRAIN[0]), ColorMode.IMAGE)

# Create function to predict video frames
def runOnVideo(video, maxFrames):
    # Runs the predictor on every frame in the video and returns the frame with the predictions drawn.
    readFrames = 0
    while True:
        hasFrame, frame = video.read()
        if not hasFrame:
            break
        # Get prediction results for this frame
        outputs = predictor(frame)
        # Make sure the frame is colored
        frame = cv2.cvtColor(frame, cv2.COLOR_RGB2BGR)
        # Draw a visualization of the predictions using the video visualizer
        visualization = v.draw_instance_predictions(frame, outputs["instances"].to("cpu"))
        # Convert Matplotlib RGB format to OpenCV BGR format
        visualization = cv2.cvtColor(visualization.get_image(), cv2.COLOR_RGB2BGR)
        # Return visualization
        yield visualization
        # Count number of frames
        readFrames += 1
        if readFrames > maxFrames:
            break

# Create a cut-off for debugging
num_frames = 1412 # This is the max lenght of the video I took with my webcam

# Enumerate the frames of the video
for visualization in tqdm.tqdm(runOnVideo(video, num_frames), total=num_frames):

    # Write to video file
    video_writer.write(visualization)

# Release resources
video.release()
video_writer.release()
cv2.destroyAllWindows()

100%|██████████████████████████████████████████████████████████████████████████████| 1412/1412 [05:38<00:00,  4.17it/s]


## Open two videos simultaneously with cv2

In [19]:
import cv2
import numpy as np 
import imutils

# Load videos using cv
cap1 = cv2.VideoCapture('videos/video_raw.mp4')
cap2 = cv2.VideoCapture('videos/video_pred.mp4')
# Keep streaming videos
while cap1.isOpened() or cap2.isOpened():
    # Capture each frame of webcam videos
    ret1, frame1 = cap1.read()
    ret2, frame2 = cap2.read()
    # Resize window for each videos.
    frame1 = imutils.resize(frame1, width=560, height=640)
    frame2 = imutils.resize(frame2, width=560, height=640)
    # Make sure frame in video1 has color
    if ret1:
        hsv1 = cv2.cvtColor(frame1, cv2.COLOR_BGR2HSV)
        cv2.imshow('raw_video', frame1)
    # Make sure frame in video3 has color
    if ret2:
        hsv2 = cv2.cvtColor(frame2, cv2.COLOR_BGR2HSV)
        cv2.imshow('video_with_predictions', frame2)
    # Print error message if videos can not be streamed
    if not okay1 or not okay2:
        print('Cant read the video , Exit!')
        break
    # Close windows
    if cv2.waitKey(1) & 0xFF == ord('q'):
        break
    # Wait 1 second
    cv2.waitKey(1)
# Release resources
cap1.release()
cap2.release()
cv2.destroyAllWindows()