In [12]:
import torch
import cv2
import numpy as np
from torchvision.io import read_video
from torchvision.transforms import v2 as T
from torchvision.utils import draw_bounding_boxes


# paths
import os
import sys

# Project root = parent of the notebook's working directory. It is put on sys.path
# so the project packages imported below (models, utils) can be resolved.
dirpath = os.path.abspath(os.path.join(os.getcwd(), os.pardir))
# guard the append so re-running this cell does not pile up duplicate entries
if dirpath not in sys.path:
    sys.path.append(dirpath)

# my imports
from models.SoSi_detection import SoSiDetectionModel  # noqa: E402
from utils.plot_utils import inverse_transform_bbox  # noqa: E402


# Enable IPython's autoreload extension so that edits to the project modules
# imported above are picked up without restarting the kernel.
%load_ext autoreload
%autoreload 2

# torch setup
#device = torch.device('cuda') if torch.cuda.is_available() else torch.device('cpu')

The autoreload extension is already loaded. To reload it, use:
  %reload_ext autoreload


In [13]:
# Pick the compute device in order of preference: Apple MPS, then CUDA, then CPU.
# The CUDA check is only evaluated when MPS is unavailable.
device = torch.device(
    "mps" if torch.backends.mps.is_available()
    else "cuda" if torch.cuda.is_available()
    else "cpu"
)

print(f"Using device: {device}")

Using device: cuda


#### Load the model:

In [14]:
# Checkpoint file to load (a file name inside the model_savepoints folder).
# Exactly one line should be active; the trailing notes are the author's run descriptions.
model_file = 'p02_model_Mar-09_18-15-05.pth' # MSE loss, K=1 - good result
# model_file = 'p02_model_Mar-09_19-30-47.pth' # smooth L1 loss, K=4
# model_file = 'p02_model_Mar-09_20-09-10.pth' # CIoU loss, K=4

In [15]:
# Locate the checkpoint. The path is assembled from separate components so the
# separator matches the host OS; a hard-coded '\\' only resolves on Windows.
model_path = os.path.join(dirpath, 'models', 'model_savepoints', model_file)

# Build the network and restore the trained weights.
model = SoSiDetectionModel(final_head_conv_depth = 128)
# weights_only=True limits unpickling to tensors and plain containers, which is
# all a state dict needs, and keeps behaviour the same across torch versions.
sucess = model.load_state_dict(
    torch.load(model_path, map_location=device, weights_only=True)
)
# load_state_dict returns a report of missing / unexpected keys - show it
print(sucess)
# move to the selected device and switch to evaluation mode (';' hides the model repr)
model.to(device).eval();

<All keys matched successfully>


#### Get video file:

In [16]:
# Input clip to run inference on (a file name inside the project's inference folder).
# Exactly one line should be active; the others are alternative clips.
# vid_name = 'kittens_video.mp4'
vid_name = 'cats_wild.mp4'
# vid_name = 'cats_forest.mp4'

In [17]:
# Input clip and annotated output both live in <project root>/inference.
# Paths are joined per component (portable across OSes) and extensions are
# stripped with splitext instead of assuming a 4-character suffix.
video_stem = os.path.splitext(vid_name)[0]
model_stem = os.path.splitext(model_file)[0]
video_file = os.path.join(dirpath, 'inference', vid_name)
video_out_file = os.path.join(dirpath, 'inference', f'{video_stem}_{model_stem}.mp4')

# output video params: frame size and frame rate handed to the video writer
# NOTE(review): assumed to match the source clip - confirm against the actual file
video_h, video_w = 360, 640
video_fps = 30

# time window (in seconds) of the clip to process
video_start_sec = 90
video_end_sec = video_start_sec + 120

# frames per inference batch, and the matching time step in seconds
batch_size = 256
video_jump = batch_size / video_fps

#### Build video writer:

In [18]:
# MPEG-4 ('mp4v') writer for the annotated frames; frame size is given as (width, height)
fourcc = cv2.VideoWriter_fourcc(*'mp4v')
video_writer = cv2.VideoWriter(video_out_file, fourcc, video_fps, (video_w, video_h))

# VideoWriter does not raise when it cannot open its target (bad path, missing codec);
# fail here instead of silently producing an empty file later.
if not video_writer.isOpened():
    raise RuntimeError(f"could not open video writer for {video_out_file!r}")

#### Define transforms for inference

In [19]:
# transform settings (sizes, interpolation, mean / std) exposed by the model
backbone_transforms = model.backbone_transforms()

# pre-processing pipeline applied to every batch of raw frames
preprocess = T.Compose(
    [
        # geometry: resize, then center-crop, using the backbone's sizes
        T.Resize(backbone_transforms.resize_size, interpolation=backbone_transforms.interpolation),
        T.CenterCrop(backbone_transforms.crop_size),
        # values: wrap as an image, convert to float32 with scaling, then normalize
        T.ToImage(),
        T.ToDtype(dtype=torch.float32, scale=True),
        T.Normalize(backbone_transforms.mean, backbone_transforms.std),
    ]
)

In [20]:
# Confidence threshold: frames whose sigmoid confidence is at or below this value
# are written to the output video without a bounding box.
cutoff = 0.3

In [21]:
# Run the detector over the selected time range, one time window (roughly one
# batch of frames) at a time, and write every frame - annotated when a cat is
# detected - to the output video.
video_current_start = video_start_sec

# NOTE(review): consecutive windows share their boundary timestamp; if read_video
# treats end_pts as inclusive, the boundary frame is written twice - confirm.
# NOTE(review): frames are written at their decoded size; the writer was created for
# (video_w, video_h) - confirm the clip really has that size.
try:
    while video_current_start < video_end_sec:
        # read the frames of the current window, clamped to the requested end time
        window_end = min(video_current_start + video_jump, video_end_sec)
        frames, _, _ = read_video(filename=video_file,
                                  start_pts=video_current_start, end_pts=window_end,
                                  output_format="TCHW", pts_unit='sec')
        video_current_start += video_jump

        # if no frames were read, stop
        if frames.numel() == 0:
            break
        frames = frames.to(device)

        # preprocess
        frames_preproces = preprocess(frames).to(device)

        # infer; no_grad avoids building an autograd graph for the whole batch
        with torch.no_grad():
            pred_boxes, pred_labels_logits = model(frames_preproces)

        # Per-frame confidences. reshape(-1) (rather than squeeze()) keeps the result
        # 1-D even when the window holds a single frame.
        # assumes the model emits one logit per frame - TODO confirm
        confidences = torch.sigmoid(pred_labels_logits).reshape(-1)
        # one host transfer; the same Python floats drive both the label text and the
        # draw / no-draw decision below, so the two cannot disagree
        conf_values = confidences.tolist()
        pred_labels_str = [
            f"cat {conf:.2f}" if conf > cutoff else "none"
            for conf in conf_values
        ]

        # scale the bbox to video scale
        bbox_resized = inverse_transform_bbox(pred_boxes, video_w, video_h)

        # write all frames of this window to disk
        for idx in range(len(frames)):
            if conf_values[idx] <= cutoff:
                # below threshold - write the frame untouched
                video_frame = frames[idx]
            else:
                # detection - draw the box and its confidence label
                video_frame = draw_bounding_boxes(frames[idx], bbox_resized[idx, :], fill=False, colors="red", width=3,
                                                  labels=[pred_labels_str[idx]], font_size=25, font='verdana.ttf')

            # convert to a NumPy array with shape [H, W, C]
            frame_np = video_frame.permute(1, 2, 0).cpu().numpy()

            # RGB -> BGR for OpenCV (index-list selection yields a fresh array, not a view)
            frame_np = frame_np[..., [2, 1, 0]]

            # if the frame is normalized float, convert to uint8
            if frame_np.dtype != np.uint8:
                frame_np = (255 * frame_np).clip(0, 255).astype(np.uint8)

            # write the frame to the video file
            video_writer.write(frame_np)
finally:
    # always finalize the output file, even if reading or inference failed midway
    video_writer.release()

#### Re-encode the video to compress it


In [22]:
command = f'ffmpeg -i "{video_out_file}" -c:v libx264 -b:v 1M "{video_out_file[:-4]}_compressed.mp4"' # "{video_out_file}"
!{command}

ffmpeg version 5.1.2 Copyright (c) 2000-2022 the FFmpeg developers
  built with clang version 17.0.1
  configuration: --prefix=/d/bld/ffmpeg_1696213838503/_h_env/Library --cc=clang.exe --cxx=clang++.exe --nm=llvm-nm --ar=llvm-ar --disable-doc --disable-openssl --enable-demuxer=dash --enable-hardcoded-tables --enable-libfreetype --enable-libfontconfig --enable-libopenh264 --enable-libdav1d --ld=lld-link --target-os=win64 --enable-cross-compile --toolchain=msvc --host-cc=clang.exe --extra-libs=ucrt.lib --extra-libs=vcruntime.lib --extra-libs=oldnames.lib --strip=llvm-strip --disable-stripping --host-extralibs= --enable-gpl --enable-libx264 --enable-libx265 --enable-libaom --enable-libsvtav1 --enable-libxml2 --enable-pic --enable-shared --disable-static --enable-version3 --enable-zlib --enable-libopus --pkg-config=/d/bld/ffmpeg_1696213838503/_build_env/Library/bin/pkg-config
  libavutil      57. 28.100 / 57. 28.100
  libavcodec     59. 37.100 / 59. 37.100
  libavformat    59. 27.100 / 59.