# 5 线性层量化
在前面的章节中，我们已经学习了对称量化和非对称量化；现在尝试对模型中的线性层（`nn.Linear`）进行量化。

In [36]:
import torch
import copy
import torch.nn as nn
import torch.nn.functional as F
import util.quant_tool as quant_tool

## 5.1 自定义量化模块

In [37]:
# 定义权重量化模块（per-channel 对称 int8 量化）
class QuantLinear(nn.Module):
    """
    Simplified weight-only quantized linear layer.

    - Only ``weight`` is quantized; it is stored as an int8 buffer.
    - ``scale`` and ``zero_point`` are stored with one entry per output
      channel (shape ``[out_features, 1]``).
    - ``bias`` stays in FP32.
    - ``forward`` dequantizes the weight on the fly and then calls
      ``F.linear``.
    """

    def __init__(self, in_features: int, out_features: int, bias: bool = True):
        """
        Allocate the quantized-weight buffers and the optional FP32 bias.

        Args:
            in_features: size of each input sample.
            out_features: size of each output sample.
            bias: when False, the layer has no bias (``self.bias`` is None).
        """
        super().__init__()
        self.in_features = in_features
        self.out_features = out_features

        # qweight: int8, shape [out_features, in_features]. Left
        # uninitialized here; from_linear() fills it in.
        self.register_buffer(
            "qweight",
            torch.empty(out_features, in_features, dtype=torch.int8),
        )
        # scale: one value per output channel, shape [out_features, 1]
        self.register_buffer(
            "scale",
            torch.ones(out_features, 1, dtype=torch.float32),
        )
        # zero_point: stays 0 for symmetric quantization; the asymmetric
        # path in from_linear() overwrites it.
        self.register_buffer(
            "zero_point",
            torch.zeros(out_features, 1, dtype=torch.float32),
        )

        if bias:
            self.bias = nn.Parameter(torch.zeros(out_features, dtype=torch.float32))
        else:
            self.bias = None

    @classmethod
    def from_linear(cls, linear: nn.Linear, per_channel: bool=False, is_symmetric: bool=True) -> "QuantLinear":
        """
        Build a QuantLinear from an ``nn.Linear`` and quantize its weight.

        Args:
            linear: the source layer; its weight and bias are copied, the
                layer itself is not modified.
            per_channel: forwarded to the ``quant_tool`` qparams helper.
            is_symmetric: use ``quant_tool.get_symmetric_qparams`` when True,
                ``quant_tool.get_asymmetric_qparams`` otherwise.

        Returns:
            A new QuantLinear holding the quantized weight, its scale and
            zero point, and an FP32 copy of the bias (if any).
        """
        qlinear = cls(
            in_features=linear.in_features,
            out_features=linear.out_features,
            bias=linear.bias is not None,
        )

        with torch.no_grad():
            # Quantization parameters are computed on an FP32 CPU copy.
            weight = linear.weight.detach().float().cpu()

            # Both schemes share the same quantize step; only the way the
            # quantization parameters are derived differs.
            if is_symmetric:
                qparams = quant_tool.get_symmetric_qparams(weight, per_channel)
            else:
                qparams = quant_tool.get_asymmetric_qparams(weight, per_channel)
            qweight = quant_tool.quantize_tensor(weight, qparams)

            # Store the quantized weight together with its parameters.
            # NOTE(review): assumes qparams.scale / qparams.zero_point are
            # tensors broadcastable to [out_features, 1] - confirm against
            # quant_tool.
            qlinear.qweight.copy_(qweight)
            qlinear.scale.copy_(qparams.scale)
            qlinear.zero_point.copy_(qparams.zero_point)

            # The bias is copied as-is (not quantized).
            if linear.bias is not None:
                qlinear.bias.copy_(linear.bias.detach().float().cpu())
        return qlinear

    def forward(self, x: torch.Tensor) -> torch.Tensor:
        """
        Apply the linear transform using the dequantized weight.

        The weight is reconstructed as ``w_hat = (q - z) * scale`` with
        qweight of shape [out, in] and scale / zero_point of shape [out, 1].
        Subtracting the zero point is a no-op for symmetric quantization
        (z == 0) and is required for the asymmetric case.
        """
        w_hat = (self.qweight.float() - self.zero_point) * self.scale
        return F.linear(x, w_hat, self.bias)

## 5.2 定义模型量化入口
输入一个模型，输出量化后的模型（注意：线性层的替换是原地进行的，返回的就是传入的模型对象）。

In [38]:
from typing import Optional, List
# 对权重进行量化
def quantize_model_weights(
    model: nn.Module,
    is_symmetric:bool=True,
    modules_to_exclude: Optional[List[str]] = None,  # optional: layers to leave unquantized
    per_channel: bool = True,
    _prefix: str = "",
) -> nn.Module:
    """
    Recursively replace every ``nn.Linear`` in ``model`` with a QuantLinear.

    The replacement happens in place: ``model`` itself is modified and
    returned. Pass a deep copy if the original must be preserved.

    Args:
        model: module whose (nested) linear layers are quantized.
        is_symmetric: passed to ``QuantLinear.from_linear`` to choose between
            symmetric and asymmetric quantization.
        modules_to_exclude: names of layers to skip. A layer is skipped when
            either its own attribute name (e.g. ``"layer1"``) or its dotted
            path from the root (e.g. ``"encoder.layer1"``) is listed.
        per_channel: passed to ``QuantLinear.from_linear``.
        _prefix: internal; dotted path of ``model`` relative to the root,
            used while recursing.

    Returns:
        The same ``model`` object, with its linear layers replaced.
    """
    if modules_to_exclude is None:
        modules_to_exclude = []

    # Snapshot the children first: setattr() below mutates the module dict.
    for name, child in list(model.named_children()):
        full_name = f"{_prefix}.{name}" if _prefix else name

        if isinstance(child, nn.Linear):
            excluded = name in modules_to_exclude or full_name in modules_to_exclude
            if not excluded:
                # Keyword arguments keep is_symmetric from being bound to
                # from_linear's per_channel parameter by position.
                setattr(
                    model,
                    name,
                    QuantLinear.from_linear(
                        child,
                        per_channel=per_channel,
                        is_symmetric=is_symmetric,
                    ),
                )
            continue

        # Propagate every option so nested layers are quantized the same way.
        quantize_model_weights(
            child,
            is_symmetric=is_symmetric,
            modules_to_exclude=modules_to_exclude,
            per_channel=per_channel,
            _prefix=full_name,
        )
    return model

## 5.3 对模型进行量化

In [39]:
# 定义一个多层线性层用于验证
class FourLayerModel(nn.Module):
    """
    Small fully-connected network used to validate weight quantization.

    Despite the name, it stacks five ``nn.Linear`` layers
    (input_size -> 128 -> 256 -> 128 -> 1 -> 9). ReLU follows each of the
    first four layers; the last layer's output is returned as-is.
    """

    def __init__(self, input_size=64):
        super().__init__()
        # Consecutive pairs of widths give (in_features, out_features) for
        # layer1 .. layer5, registered in that order.
        widths = (input_size, 128, 256, 128, 1, 9)
        for idx, (fan_in, fan_out) in enumerate(zip(widths, widths[1:]), start=1):
            setattr(self, f"layer{idx}", nn.Linear(fan_in, fan_out))

    def forward(self, x):
        # Hidden layers are each followed by ReLU.
        for hidden in (self.layer1, self.layer2, self.layer3, self.layer4):
            x = torch.relu(hidden(x))
        # Output layer: no activation.
        return self.layer5(x)

In [40]:
# Instantiate the FP32 reference model with its default input size (64).
# The bare name on the last line makes the notebook display its structure.
model = FourLayerModel()
model

FourLayerModel(
  (layer1): Linear(in_features=64, out_features=128, bias=True)
  (layer2): Linear(in_features=128, out_features=256, bias=True)
  (layer3): Linear(in_features=256, out_features=128, bias=True)
  (layer4): Linear(in_features=128, out_features=1, bias=True)
  (layer5): Linear(in_features=1, out_features=9, bias=True)
)

In [41]:
# Quantize a deep copy of the model, leaving 'layer1' and 'layer5' as plain
# nn.Linear. Working on a copy keeps `model` itself unquantized.
base_model = copy.deepcopy(model)
q_model = quantize_model_weights(model=base_model,modules_to_exclude=['layer1','layer5'])
q_model

FourLayerModel(
  (layer1): Linear(in_features=64, out_features=128, bias=True)
  (layer2): QuantLinear()
  (layer3): QuantLinear()
  (layer4): QuantLinear()
  (layer5): Linear(in_features=1, out_features=9, bias=True)
)

In [42]:
import copy
# Start again from a fresh deep copy so `model` stays in FP32 for the
# comparison below.
base_model = copy.deepcopy(model)
# Quantize the model without excluding any layer.
q_model = quantize_model_weights(base_model)
q_model

FourLayerModel(
  (layer1): QuantLinear()
  (layer2): QuantLinear()
  (layer3): QuantLinear()
  (layer4): QuantLinear()
  (layer5): QuantLinear()
)

In [43]:
# Create a random input: one sample with 64 features, matching the model's
# default input size.
input_data = torch.randn(1,64)
input_data

tensor([[-1.9708,  0.5421,  1.0840, -0.0047,  0.1527,  0.7801,  2.0204, -0.1570,
          1.4731,  1.1514, -0.9850,  1.7130, -1.8669, -1.0876,  1.4048, -0.4458,
          0.1398, -0.4947,  0.5127, -0.4818, -0.8740,  1.6020,  0.4836, -0.8855,
         -0.6367, -0.4623, -1.4764, -1.1121,  0.0138, -0.8910, -1.4673,  0.1777,
          0.6252, -0.8003, -0.0221,  0.5497, -1.0890,  1.2884,  0.6866,  1.0233,
          0.6059,  1.3729,  0.1450,  1.2973, -0.5043,  0.0220,  0.6940, -0.1463,
         -0.0866,  1.4762, -0.0066, -0.0279, -0.1981,  0.9113, -0.2384,  0.6656,
         -0.1552,  0.8052,  0.2919, -1.1599, -0.1969,  0.2914, -0.2966, -0.0989]])

In [44]:
# Output of the model before quantization.
model.eval()
# Disable gradient tracking for inference.
with torch.no_grad():
    output = model(input_data)
output

tensor([[-0.4874, -0.8665,  0.0766, -0.7997,  0.3351, -0.6136, -0.2721, -0.9806,
          0.1873]])

In [45]:
# Output of the model after quantization.
q_model.eval()
# Disable gradient tracking for inference.
with torch.no_grad():
    sym_output = q_model(input_data)
sym_output

tensor([[-0.4878, -0.8664,  0.0770, -0.7996,  0.3353, -0.6140, -0.2726, -0.9802,
          0.1872]])

In [46]:
# Compute the element-wise error between the FP32 and quantized outputs.
# Small differences may be shown truncated, so the error is multiplied by
# 1000 for display.
err = output - sym_output
print(f"err: \n {err * 1000} ")

err: 
 tensor([[ 0.3647, -0.1088, -0.3959, -0.0354, -0.2068,  0.4053,  0.4546, -0.4076,
          0.1279]]) 


### 以上方法已封装进 quant_tool，可直接通过 quant_tool 调用。