# In [None]:
import numpy as np
from tqdm import tqdm

import torch
from sgg_benchmark.config import cfg
from sgg_benchmark.modeling.detector import build_detection_model
from sgg_benchmark.utils.checkpoint import DetectronCheckpointer
from sgg_benchmark.data import make_data_loader
from sgg_benchmark.structures.image_list import to_image_list

def latency_bench(config_file, repetitions=100, warmup=10):
    """Measure GPU latency of the backbone and of the ROI/relation heads.

    The global ``cfg`` is updated from ``config_file``, a validation data
    loader and the detection model are built, the latest checkpoint (or
    ``cfg.MODEL.WEIGHT`` when none is found) is loaded, and then up to
    ``repetitions`` validation batches are timed with CUDA events.  The
    backbone (+ its ``postprocess`` step) and ``model.roi_heads`` are timed
    separately.  Results are printed; nothing is returned.

    Args:
        config_file: Path of the config file merged into the global ``cfg``.
        repetitions: Maximum number of validation batches to time.
        warmup: Number of un-timed forward passes run before measuring.

    Raises:
        ValueError: If ``repetitions`` is not a positive integer.

    Note:
        Timing relies on ``torch.cuda.Event``, so a CUDA device is required.
        The global ``cfg`` object is mutated by this function.
    """
    if repetitions <= 0:
        raise ValueError("repetitions must be a positive integer")

    cfg.merge_from_file(config_file)
    cfg.MODEL.BACKBONE.NMS_THRESH = 0.001
    cfg.MODEL.ROI_HEADS.DETECTIONS_PER_IMG = 80
    cfg.TEST.IMS_PER_BATCH = 1  # one image per batch -> per-image latency
    # cfg is intentionally left unfrozen: CUSTUM_EVAL is still set below.

    # Build the validation data loader; only the first returned loader is used.
    val_data_loader = make_data_loader(
        cfg,
        mode='val',
        is_distributed=False,
    )
    val_data_loader = val_data_loader[0]

    # Set after the data loader has been created, as in the original flow.
    cfg.TEST.CUSTUM_EVAL = True

    model = build_detection_model(cfg)
    checkpointer = DetectronCheckpointer(cfg, model, save_dir=cfg.OUTPUT_DIR)
    last_check = checkpointer.get_checkpoint_file()
    if last_check == "":
        # No checkpoint recorded in OUTPUT_DIR: fall back to the configured weights.
        last_check = cfg.MODEL.WEIGHT
    print("Loading last checkpoint from {}...".format(last_check))
    _ = checkpointer.load(last_check)

    model.to(cfg.MODEL.DEVICE)
    # NOTE(review): only the two sub-modules are switched to eval mode; the
    # top-level module keeps its default mode during warm-up -- confirm this
    # is intended.
    model.roi_heads.eval()
    model.backbone.eval()

    # CUDA events used to time work on the GPU.
    starter = torch.cuda.Event(enable_timing=True)
    ender = torch.cuda.Event(enable_timing=True)
    timings = np.zeros((repetitions, 1))
    timings_relation_head = np.zeros((repetitions, 1))

    input_img, _, _ = next(iter(val_data_loader))
    input_img = input_img.to(cfg.MODEL.DEVICE)

    with torch.no_grad():
        # GPU warm-up: un-timed passes, run without autograd bookkeeping so
        # they do not hold on to activation memory before the measurement.
        for _ in tqdm(range(warmup)):
            _ = model(input_img, None)

        # Measure performance.
        measured = 0
        for rep, (input_img, _, _) in enumerate(tqdm(val_data_loader)):
            if rep == repetitions:
                break
            input_img = input_img.to(cfg.MODEL.DEVICE)
            images = to_image_list(input_img)

            # Stage 1: backbone forward pass + post-processing into proposals.
            starter.record()
            outputs, features = model.backbone(images.tensors, embed=True)
            proposals = model.backbone.postprocess(outputs, images.image_sizes)
            ender.record()
            # Wait for the GPU to finish before reading the elapsed time.
            torch.cuda.synchronize()
            timings[rep] = starter.elapsed_time(ender)

            # Stage 2: ROI / relation heads on the features and proposals.
            starter.record()
            _, _, _ = model.roi_heads(features, proposals, None, None, proposals)
            ender.record()
            torch.cuda.synchronize()
            timings_relation_head[rep] = starter.elapsed_time(ender)

            measured = rep + 1

    if measured == 0:
        print("No batch was measured; the validation loader yielded nothing.")
    else:
        if measured < repetitions:
            print("Only {} of {} requested repetitions were measured.".format(
                measured, repetitions))
        # Keep only the rows that were filled so that unused zero entries do
        # not bias the mean or the standard deviation.
        timings = timings[:measured]
        timings_relation_head = timings_relation_head[:measured]

        mean_syn = np.sum(timings) / measured
        mean_syn_relation_head = np.sum(timings_relation_head) / measured
        std_syn = np.std(timings)  # spread of the backbone timings only
        print("Average time backbone: {} ms".format(mean_syn))
        print("Average time relation head: {} ms".format(mean_syn_relation_head))
        print("Full network latency: {} ms".format(mean_syn + mean_syn_relation_head))
        print("Standard deviation: {} ms".format(std_syn))

    # print total number of params
    total_params = sum(p.numel() for p in model.parameters())
    print(f"Total number of parameters: {total_params}")

# Config file handed to latency_bench (absolute, machine-specific path).
conf = "/home/maelic/Documents/PhD/MyModel/SGG-Benchmark/checkpoints/IndoorVG4/SGDET/penet-yolov8m/config.yml"

# Run the latency benchmark for that configuration.
latency_bench(config_file=conf)