# Multi Model Parallel Inference

OpenVINO provides [Asynchronous Inference Request](https://docs.openvino.ai/2023.2/openvino_docs_ov_plugin_dg_async_infer_request.html) for parallel inference.

Suppose we are processing a video stream in which every frame must be run through three different models, once each.

#### Define Benchmark Function

In [None]:
from time import time

import numpy as np
from openvino import Core
from openvino import properties
from openvino.properties.hint import PerformanceMode, SchedulingCoreType


def benchmark_model(infer_one_frame, sec=10):
    """Call ``infer_one_frame`` repeatedly for about ``sec`` seconds and print the FPS.

    The reported rate is the number of completed calls divided by the time
    that actually elapsed. The loop can only stop after the current call
    returns, so the real duration is usually a little longer than ``sec``;
    dividing by ``sec`` itself would overstate the frame rate.

    Args:
        infer_one_frame: Zero-argument callable that performs all the
            inference work needed for one frame.
        sec: Minimum duration of the measurement in seconds. Defaults to 10.
    """
    # Imported locally so the function is self-contained; perf_counter is a
    # monotonic, high-resolution clock, unlike time.time().
    from time import perf_counter

    count = 0
    start = perf_counter()
    while perf_counter() - start < sec:
        infer_one_frame()
        count += 1
    elapsed = perf_counter() - start

    # Guard against a zero-length measurement (e.g. sec <= 0 on a coarse clock).
    fps = count / elapsed if elapsed > 0 else 0.0
    print(f"FPS={fps:.2f}")

#### Single Model Parallel Inference

Define the paths to the OpenVINO models

In [None]:
# Relative paths to the model .xml files of the three ResNet variants that the
# functions below compile.
# NOTE(review): the "int8" directory suggests quantized models — confirm
# against how the files were exported.
res18 = "models/resnet18/int8/model.xml"
res50 = "models/resnet50/int8/model.xml"
res101 = "models/resnet101/int8/model.xml"

Define infer_one_frame()

In [None]:
def single_res18_sync_infer():
    """Build a per-frame callable that runs resnet18 three times in sequence.

    The model at ``res18`` is compiled for the "CPU" device without extra
    configuration, and one infer request is reused for every inference.

    Returns:
        A zero-argument function that issues three synchronous ``infer``
        calls on the same random input.
    """
    runs_per_frame = 3

    ov_core = Core()
    model = ov_core.compile_model(res18, "CPU")
    frame = np.random.randn(1, 3, 224, 224)
    request = model.create_infer_request()

    def infer_one_frame():
        for _ in range(runs_per_frame):
            request.infer(frame)

    return infer_one_frame


def single_res18_async_infer():
    """Build a per-frame callable that runs resnet18 three times asynchronously.

    A single compiled model provides three infer requests. Each call of the
    returned function starts all three requests and then waits for each one.

    Returns:
        A zero-argument function that performs the three inferences.
    """
    num_requests = 3

    # Why PCORE_ONLY: on a Core i7-13700K, one inference on an E-core was
    # measured to take more than three times as long as on a P-core. Running
    # all three requests on P-cores therefore finishes sooner than running
    # two on P-cores and one on an E-core.
    hint = properties.hint
    compile_config = {
        hint.performance_mode(): PerformanceMode.THROUGHPUT,
        hint.num_requests(): num_requests,
        hint.scheduling_core_type(): SchedulingCoreType.PCORE_ONLY,
    }

    ov_core = Core()
    model = ov_core.compile_model(res18, "CPU", compile_config)
    requests = [model.create_infer_request() for _ in range(num_requests)]
    frame = np.random.randn(1, 3, 224, 224)

    def infer_one_frame():
        # Start every request before waiting on any of them.
        for request in requests:
            request.start_async(frame)

        for request in requests:
            request.wait()

    return infer_one_frame

Benchmark sync inferences

In [None]:
# Measure the FPS when each frame runs three resnet18 inferences one after another.
benchmark_model(single_res18_sync_infer())

Benchmark async inference

In [None]:
# Measure the FPS when each frame's three resnet18 inferences are started asynchronously.
benchmark_model(single_res18_async_infer())

#### Three Models Parallel Inference: all resnet18.

Define infer_one_frame() 

In [None]:
def three_res18_sync_infer():
    """Build a per-frame callable that runs three resnet18 copies in sequence.

    The model at ``res18`` is compiled three separate times for the "CPU"
    device, and one infer request is created per compiled model.

    Returns:
        A zero-argument function that calls synchronous ``infer`` on each
        request in turn with the same random input.
    """
    num_models = 3
    frame = np.random.randn(1, 3, 224, 224)

    ov_core = Core()
    models = [ov_core.compile_model(res18, "CPU") for _ in range(num_models)]
    requests = [model.create_infer_request() for model in models]

    def infer_one_frame():
        for request in requests:
            request.infer(frame)

    return infer_one_frame


def three_res18_async_infer():
    """Build a per-frame callable that runs three resnet18 copies asynchronously.

    The model at ``res18`` is compiled three separate times for the "CPU"
    device, each with the PCORE_ONLY scheduling hint, and one infer request
    is created per compiled model.

    Returns:
        A zero-argument function that starts all three requests and then
        waits for each of them.
    """
    num_models = 3
    pcore_only = {
        properties.hint.scheduling_core_type(): SchedulingCoreType.PCORE_ONLY,
    }

    ov_core = Core()
    models = [
        ov_core.compile_model(res18, "CPU", pcore_only) for _ in range(num_models)
    ]
    requests = [model.create_infer_request() for model in models]
    frame = np.random.randn(1, 3, 224, 224)

    def infer_one_frame():
        # Start every request before waiting on any of them.
        for request in requests:
            request.start_async(frame)

        for request in requests:
            request.wait()

    return infer_one_frame

Benchmark sync inferences

In [None]:
# Measure the FPS when three separately compiled resnet18 models run one after another per frame.
benchmark_model(three_res18_sync_infer())

Benchmark async inference

In [None]:
# Measure the FPS when three separately compiled resnet18 models are started asynchronously per frame.
benchmark_model(three_res18_async_infer())

#### Three Models Parallel Inference: resnet18, resnet50, and resnet101

Define infer_one_frame() 

In [None]:
def three_sync_infer():
    """Build a per-frame callable that runs resnet18, resnet50 and resnet101 in sequence.

    Each model is compiled for the "CPU" device without extra configuration
    and gets its own infer request.

    Returns:
        A zero-argument function that calls synchronous ``infer`` on each
        request in turn with the same random input.
    """
    model_paths = (res18, res50, res101)
    frame = np.random.randn(1, 3, 224, 224)

    ov_core = Core()
    models = [ov_core.compile_model(path, "CPU") for path in model_paths]
    requests = [model.create_infer_request() for model in models]

    def infer_one_frame():
        for request in requests:
            request.infer(frame)

    return infer_one_frame


def three_async_infer():
    """Build a per-frame callable that runs resnet18, resnet50 and resnet101 asynchronously.

    resnet18 is compiled with the ECORE_ONLY scheduling hint, while resnet50
    and resnet101 are compiled with PCORE_ONLY. Each compiled model gets its
    own infer request.

    Returns:
        A zero-argument function that starts all three requests and then
        waits for each of them.
    """
    core_type_key = properties.hint.scheduling_core_type()
    pcore_only = {core_type_key: SchedulingCoreType.PCORE_ONLY}
    ecore_only = {core_type_key: SchedulingCoreType.ECORE_ONLY}

    # (model path, compile configuration) pairs, in compilation order.
    placements = (
        (res18, ecore_only),
        (res50, pcore_only),
        (res101, pcore_only),
    )

    ov_core = Core()
    models = [
        ov_core.compile_model(path, "CPU", config) for path, config in placements
    ]
    requests = [model.create_infer_request() for model in models]
    frame = np.random.randn(1, 3, 224, 224)

    def infer_one_frame():
        # Start every request before waiting on any of them.
        for request in requests:
            request.start_async(frame)

        for request in requests:
            request.wait()

    return infer_one_frame

Benchmark sync inferences

In [None]:
# Measure the FPS when resnet18, resnet50 and resnet101 run one after another per frame.
benchmark_model(three_sync_infer())

Benchmark async inferences

In [None]:
# Measure the FPS when resnet18, resnet50 and resnet101 are started asynchronously per frame.
benchmark_model(three_async_infer())