# PYNQTorch 项目演示

先导入依赖和我们自定义的算子

In [1]:
import torch
import matplotlib
from pytorch_zynq import init_hardware as init, mmult

**AI加速**的基础是矩阵乘法。PyTorch为外部设备预留了`PrivateUse1`后端（设备字符串为`privateuseone`），将其重命名为`zynq`后即可很方便地注册我们想要的功能。我们需要在算子中声明设备的功能和行为，以下是软件包`__init__.py`中导出的设备与算子接口：
```python
from .device import register_zynq_device, is_registered
from .device import enable_full_device, disable_full_device
from .device import enable_implicit_accel, disable_implicit_accel
from .ops import mmult, register_aten_impls
from .linear import ZynqLinear
from .hardware import init as init_hardware, is_hardware_available, deinit as deinit_hardware

__all__ = [
    "register_zynq_device",
    "is_registered",
    "mmult",
    "ZynqLinear",
    "init_hardware",
    "is_hardware_available",
    "register_aten_impls",
    "deinit_hardware",
    "enable_full_device",
    "disable_full_device",
    "enable_implicit_accel",
    "disable_implicit_accel",
    
]
```

上面的代码展示了我们的双管线矩阵乘法（**GeMM**）算子支持的行为和加速操作。如何把张量绑定至算子呢？我们的硬件其实并不支持直接加载张量，因此我们采用了一个巧妙的做法：张量数据仍然存放在**CPU**上，但为其附加一个属于**zynq**的设备属性；算子后端会检查参与运算的张量是否带有该属性，只有当进行运算的两个张量都具有**zynq**的设备属性时，才将运算绑定到硬件加速器。

## 演示1：通用矩阵乘（GeMM）算子基准测试

在这段测试中，我们直接调用算子中实现的`mmult`加速操作。我们依托双管线（*pipeline*）**INT8**类型**256**维度矩阵乘法加速器构建了上述的通用矩阵乘算子，相较于调用**CPU**执行的类**GeMM**算法，我们构建的硬件**GeMM**具有极高的加速比。注意：测试中`FP32`与`FP16`输入会先被转换为`int32`，再分别送入**CPU**与硬件两条路径，因此三组测试比较的都是整数矩阵乘法的结果。以下是测试函数定义：

In [2]:
import time

def benchmark_cpu(a, b, iters=3):
    """Time ``torch.matmul(a, b)`` and return ``(mean_seconds, last_result)``.

    One untimed warm-up product is computed first, then ``iters`` products
    are timed with ``time.perf_counter`` under ``torch.no_grad()``.

    Args:
        a: Left operand passed to ``torch.matmul``.
        b: Right operand passed to ``torch.matmul``.
        iters: Number of timed repetitions; the elapsed time is divided by it.

    Returns:
        A tuple of the mean wall-clock seconds per repetition and the result
        of the last timed product (``None`` never escapes, because a zero
        ``iters`` fails on the division first).
    """
    # Synchronize CUDA up front when a CUDA device is present.
    if torch.cuda.is_available():
        torch.cuda.synchronize()

    result = None
    with torch.no_grad():
        torch.matmul(a, b)  # warm-up; result intentionally discarded
        started = time.perf_counter()
        for _step in range(iters):
            result = torch.matmul(a, b)
        finished = time.perf_counter()
    return (finished - started) / iters, result

def benchmark_fpga(a, b, iters=3):
    """Time the accelerator ``mmult(a, b)`` and return ``(mean_seconds, last_result)``.

    Mirrors ``benchmark_cpu``: one untimed warm-up call, then ``iters`` timed
    calls measured with ``time.perf_counter`` under ``torch.no_grad()``.

    Args:
        a: Left operand passed to ``mmult``.
        b: Right operand passed to ``mmult``.
        iters: Number of timed repetitions; the elapsed time is divided by it.

    Returns:
        A tuple of the mean wall-clock seconds per repetition and the result
        of the last timed call.
    """
    result = None
    with torch.no_grad():
        mmult(a, b)  # warm-up; result intentionally discarded
        started = time.perf_counter()
        for _step in range(iters):
            result = mmult(a, b)
        finished = time.perf_counter()
    return (finished - started) / iters, result

def make_inputs(n, m, p, kind):
    """Create a random ``(n, m)`` and ``(m, p)`` operand pair for the benchmark.

    Args:
        n: Row count of the left operand.
        m: Shared inner dimension.
        p: Column count of the right operand.
        kind: ``"FP32"`` or ``"FP16"`` for normally distributed float tensors
            of that dtype; ``"INT8"`` (and any other value) for integer
            tensors.

    Returns:
        ``(a, b)``. For the integer case the values cover the full signed
        8-bit range ``[-128, 127]`` but are stored as ``torch.int32``.
    """
    if kind == "FP32":
        float_dtype = torch.float32
    elif kind == "FP16":
        float_dtype = torch.float16
    else:
        float_dtype = None

    if float_dtype is not None:
        a = torch.randn((n, m), dtype=float_dtype)
        b = torch.randn((m, p), dtype=float_dtype)
        return a, b

    # torch.randint excludes its upper bound, so 128 is required for 127 to
    # be drawn; the previous bound of 127 silently capped values at 126.
    a = torch.randint(-128, 128, (n, m), dtype=torch.int32)
    b = torch.randint(-128, 128, (m, p), dtype=torch.int32)
    return a, b

def test1():
    """Benchmark CPU ``torch.matmul`` against the accelerator ``mmult`` and print a report.

    For each of the kinds ``FP32``, ``FP16`` and ``INT8`` a 1024x1024 by
    1024x1024 operand pair is generated with ``make_inputs``. Operands whose
    dtype is float32, float16 or int16 are cast to ``torch.int32`` before
    either path runs, so both paths always receive the same integer tensors.
    The report lists mean times, the speed-up, the L2 error, the largest
    absolute difference and the share of exactly matching elements.
    """
    rows, inner, cols = 1024, 1024, 1024
    repeats = 1
    # Operand dtypes that are converted to int32 before benchmarking.
    cast_to_int32 = (torch.float32, torch.float16, torch.int16)

    init()  # the return value is not inspected here
    print("\n==================== MMULT BENCH ====================")
    print(f"Size: ({rows} x {inner}) @ ({inner} x {cols})")

    for kind in ("FP32", "FP16", "INT8"):
        lhs, rhs = make_inputs(rows, inner, cols, kind)
        if lhs.dtype in cast_to_int32:
            lhs = lhs.to(torch.int32)
        if rhs.dtype in cast_to_int32:
            rhs = rhs.to(torch.int32)

        cpu_time, cpu_out = benchmark_cpu(lhs, rhs, repeats)
        fpga_time, fpga_out = benchmark_fpga(lhs, rhs, repeats)

        delta = (cpu_out - fpga_out).abs()
        l2_err = torch.norm(cpu_out.float() - fpga_out.float()).item()
        max_abs = delta.max().item()
        eq_ratio = (delta == 0).float().mean().item()
        if fpga_time > 0:
            speedup = cpu_time / fpga_time
        else:
            speedup = float('inf')

        print(f"\n[{kind}] CPU avg:   {cpu_time:.6f} s")
        print(f"[{kind}] ZYNQ avg:  {fpga_time:.6f} s")
        print(f"[{kind}] Speedup:   {speedup:.2f}x")
        print(f"[{kind}] L2 error:  {l2_err:.6f}")
        print(f"[{kind}] Max diff:  {max_abs}")
        print(f"[{kind}] Exact %:   {eq_ratio*100:.2f}%")

    print("====================================================")


现在开始测试：

In [3]:
test1()


Size: (1024 x 1024) @ (1024 x 1024)

[FP32] CPU avg:   17.803478 s
[FP32] ZYNQ avg:  0.846731 s
[FP32] Speedup:   21.03x
[FP32] L2 error:  0.000000
[FP32] Max diff:  0
[FP32] Exact %:   100.00%

[FP16] CPU avg:   20.213671 s
[FP16] ZYNQ avg:  0.843857 s
[FP16] Speedup:   23.95x
[FP16] L2 error:  0.000000
[FP16] Max diff:  0
[FP16] Exact %:   100.00%

[INT8] CPU avg:   20.214558 s
[INT8] ZYNQ avg:  0.847224 s
[INT8] Speedup:   23.86x
[INT8] L2 error:  0.000000
[INT8] Max diff:  0
[INT8] Exact %:   100.00%


## 演示2：基于GeMM算子加速的Conv2D与F.linear运算操作进行CNN前向推理基准测试

在PyTorch平台上，我们可以部署许多的前端应用。这里我们选择了著名的**SpeechBrain**作为前端工具集，运行**ASR CN AIShell语音识别**模型。**SpeechBrain**有着用户友好的模型部署方式。为了充分发挥硬件加速效果，我们使用**PyTorch**的量化后端`qnnpack`对模型进行动态量化，将除了`clc_in`外的权重层转换为`int8`数据格式，在保留较高精度的同时可以评估我们加速器的性能。

以下是测试框架代码：

In [4]:
import os
import speechbrain
from speechbrain.inference.ASR import EncoderDecoderASR
from pytorch_zynq import (
    register_zynq_device,
    init_hardware as init,
    deinit_hardware,
    is_hardware_available,
    enable_full_device,
    disable_full_device,
)

# Select QNNPACK as the quantized engine before any model is quantized below.
torch.backends.quantized.engine = 'qnnpack'
# NOTE(review): presumably makes torch.load fall back to weights_only=False so
# the pickled SpeechBrain checkpoints can be loaded -- confirm against the
# PyTorch serialization docs, and only use this with trusted checkpoint files.
os.environ["TORCH_FORCE_NO_WEIGHTS_ONLY_LOAD"] = "yes"

def _apply_dynamic_quant(m):
    """Dynamically quantize the encoder sub-modules of an ASR model in place.

    Two optional sub-modules are looked up on ``m.mods.encoder``:
    ``transformer_encoder.transformer`` and ``enc``. Each one that exists is
    passed to ``torch.quantization.quantize_dynamic`` with ``dtype=torch.qint8``
    and ``inplace=True``. A missing sub-module is skipped silently; a
    quantization call that raises is reported with a ``RuntimeWarning`` instead
    of being swallowed, and the remaining sub-module is still attempted.

    Args:
        m: Model object exposing ``mods.encoder``.

    Returns:
        The same object ``m`` (quantization happens in place).
    """
    import warnings

    targets = {
        torch.nn.Linear,
        torch.nn.LSTM,
        torch.nn.GRU,
        torch.nn.RNNCell,
        torch.nn.GRUCell,
        torch.nn.LSTMCell,
        torch.nn.Embedding,
        torch.nn.EmbeddingBag,
    }

    # Probe with getattr so an absent sub-module is a normal, quiet skip
    # rather than an exception that has to be caught and discarded.
    encoder = getattr(getattr(m, "mods", None), "encoder", None)
    transformer = getattr(
        getattr(encoder, "transformer_encoder", None), "transformer", None
    )
    candidates = (
        ("transformer_encoder.transformer", transformer),
        ("enc", getattr(encoder, "enc", None)),
    )

    for label, submodule in candidates:
        if submodule is None:
            continue
        try:
            torch.quantization.quantize_dynamic(
                submodule,
                targets,
                dtype=torch.qint8,
                inplace=True,
            )
        except Exception as exc:
            # Best effort: keep going with the float sub-module, but say so.
            warnings.warn(
                f"dynamic quantization of encoder.{label} failed: {exc!r}",
                RuntimeWarning,
                stacklevel=2,
            )
    return m


def run_asr(source, device_str, wav, hparams_file, do_quant):
    """Load the ASR model on one device, transcribe a file and time the transcription.

    Full-device mode is enabled when ``device_str`` is ``"zynq"`` or
    ``"privateuseone"`` and disabled otherwise. The model is loaded with
    ``EncoderDecoderASR.from_hparams`` using ``source`` as both the source and
    the save directory. Dynamic quantization is applied only when ``do_quant``
    is truthy, a ``qnnpack`` or ``fbgemm`` engine is listed as supported, and
    the device is one of the two accelerator names.

    Args:
        source: Model directory, also used as ``savedir``.
        device_str: Device name forwarded through ``run_opts``.
        wav: Path of the audio file to transcribe.
        hparams_file: Hyper-parameter file name forwarded to ``from_hparams``.
        do_quant: Whether dynamic quantization may be applied.

    Returns:
        ``(transcription, seconds)`` where ``seconds`` covers only the
        ``transcribe_file`` call.
    """
    on_accelerator = device_str in ("zynq", "privateuseone")
    toggle_full_device = enable_full_device if on_accelerator else disable_full_device
    toggle_full_device()

    model = EncoderDecoderASR.from_hparams(
        source=source,
        savedir=source,
        run_opts={"device": device_str},
        hparams_file=hparams_file,
    )

    # Quantization is best effort: any failure here leaves the float model.
    try:
        engines = getattr(torch.backends.quantized, "supported_engines", [])
        if do_quant:
            if "qnnpack" in engines or "fbgemm" in engines:
                if on_accelerator:
                    _apply_dynamic_quant(model)
    except Exception:
        pass

    with torch.no_grad():
        started = time.perf_counter()
        transcript = model.transcribe_file(wav)
        finished = time.perf_counter()
    return transcript, finished - started


def test2(source="./ASR_CN", wav="./test2.wav", hparams="", quantized=True):
    """Transcribe one audio file on the CPU and, if available, on the accelerator.

    Registers the zynq device, initialises the hardware, runs ``run_asr`` on
    ``"cpu"`` and (when ``is_hardware_available()`` is true) on ``"zynq"``,
    then prints the timings, the speed-up and both transcriptions.

    Args:
        source: Model directory. Replaced by the directory part of ``hparams``
            when ``hparams`` is an absolute path or contains a path separator.
        wav: Path of the audio file to transcribe.
        hparams: Optional hyper-parameter file. Only its base name is passed
            on, because the loader is given ``source`` as the directory. An
            empty value selects ``"hyperparams.yaml"``.
        quantized: Whether dynamic quantization is requested. When no
            ``qnnpack``/``fbgemm`` engine is listed as supported, a warning is
            printed and ``"hyperparams.yaml"`` is used.
    """
    register_zynq_device()
    init()
    hw = is_hardware_available()
    print(f"Hardware available: {hw}")

    # Split an explicit path into directory (source) and file name; earlier
    # this branch referenced an undefined ``args`` object left over from a CLI.
    if hparams and (os.path.isabs(hparams) or os.path.sep in hparams):
        source = os.path.dirname(hparams)
        hparams_file = os.path.basename(hparams)
    else:
        hparams_file = hparams or "hyperparams.yaml"

    engines = getattr(torch.backends.quantized, "supported_engines", [])
    if quantized and "qnnpack" not in engines and "fbgemm" not in engines:
        print("[WARN] No quantized engine available on this platform; falling back to float model")
        hparams_file = "hyperparams.yaml"

    # Report the values that are actually handed to the loader.
    print(f"Using source: {source}")
    print(f"Using hparams: {hparams_file}")

    text_cpu, t_cpu = run_asr(source, "cpu", wav, hparams_file, quantized)
    text_fpga, t_fpga = ("", 0.0)
    if hw:
        text_fpga, t_fpga = run_asr(source, "zynq", wav, hparams_file, quantized)

    print("\n==================== ASR RESULTS ====================")
    print(f"CPU time:  {t_cpu:.6f}s")
    if hw:
        print(f"ZYNQ time: {t_fpga:.6f}s")
        print(f"Speedup:   {t_cpu / t_fpga:.2f}x" if t_fpga > 0 else "Speedup:   inf")
    print(f"CPU text:  {text_cpu}")
    if hw:
        print(f"ZYNQ text: {text_fpga}")
    print("====================================================")

  available_backends = torchaudio.list_audio_backends()


由于板端算力有限，执行基准测试大约需要**5分钟**，请视情况执行测试。

In [12]:
test2()

Hardware available: True
Using source: ./ASR_CN
Using hparams: 


  stats = torch.load(path, map_location=device)
  state_dict = torch.load(path, map_location=device)
For migrations of users: 
1. Eager mode quantization (torch.ao.quantization.quantize, torch.ao.quantization.quantize_dynamic), please migrate to use torchao eager mode quantize_ API instead 
2. FX graph mode quantization (torch.ao.quantization.quantize_fx.prepare_fx,torch.ao.quantization.quantize_fx.convert_fx, please migrate to use torchao pt2e quantization API instead (prepare_pt2e, convert_pt2e) 
3. pt2e quantization has been migrated to torchao (https://github.com/pytorch/ao/tree/main/torchao/quantization/pt2e) 
see https://github.com/pytorch/ao/issues/2259 for more details
  torch.quantization.quantize_dynamic(



CPU time:  294.368227s
ZYNQ time: 227.967185s
Speedup:   1.29x
CPU text:  嵌入 式 高层 次 综合 赛道
ZYNQ text: 嵌入 式 高层 次 综合 赛道
