# 5 线性层量化
在前面的过程中我们已经学习了对称量化和非对称量化，现在我们要尝试对模型中的线性层进行量化。

In [1]:
import torch
import copy
import torch.nn as nn
import torch.nn.functional as F
import util.quant_tool as quant_tool

## 5.1 自定义量化模块

In [2]:
# Weight-only int8 quantized linear layer (per-tensor or per-channel,
# symmetric or asymmetric, depending on how `from_linear` is called).
class QuantLinear(nn.Module):
    """Linear layer that stores its weight quantized and dequantizes on the fly.

    - Only the weight is quantized; it is kept in the ``qweight`` buffer.
    - ``scale`` and ``zero_point`` hold the quantization parameters. They must
      broadcast against ``qweight`` (``[out_features, in_features]``), e.g. a
      scalar for per-tensor or ``[out_features, 1]`` for per-output-channel.
    - The bias is left unquantized.
    - ``forward`` rebuilds an approximate float weight
      ``w_hat = (qweight - zero_point) * scale`` and calls ``F.linear``.

    NOTE(review): the dequantization formula assumes ``quant_tool`` quantizes
    as ``q = round(w / scale) + zero_point`` -- confirm against ``quant_tool``.
    """

    def __init__(self, in_features: int, out_features: int, bias: bool = True,dtype=torch.float32):
        """Allocate placeholder buffers; real values are filled by ``from_linear``.

        Args:
            in_features: Size of each input sample.
            out_features: Size of each output sample.
            bias: Whether the layer carries a bias term.
            dtype: Floating dtype of the placeholder ``scale``, ``zero_point``
                and ``bias`` buffers.
        """
        super().__init__()

        # Quantized weight, int8, shape [out_features, in_features] (uninitialized).
        self.register_buffer(
            "qweight",
            torch.empty(out_features, in_features, dtype=torch.int8),
        )
        # Placeholder per-output-channel scale, shape [out_features, 1].
        self.register_buffer(
            "scale",
            torch.ones(out_features, 1, dtype=dtype),
        )
        # Zero point is 0 for symmetric quantization; non-zero when asymmetric.
        self.register_buffer(
            "zero_point",
            torch.zeros(out_features, 1, dtype=dtype),
        )

        if bias:
            # Placeholder values only; `from_linear` overwrites them.
            self.register_buffer("bias",
                                 torch.randn((1, out_features),
                                             dtype=dtype))
        else:
            self.bias = None

    @classmethod
    def from_linear(cls, linear: nn.Linear, per_channel: bool=False, is_symmetric: bool=True,dtype=torch.float32,channel_dim=0) -> "QuantLinear":
        """Build a ``QuantLinear`` from an ``nn.Linear`` and quantize its weight.

        Args:
            linear: Source layer whose weight (and bias, if any) is taken over.
            per_channel: Forwarded to the ``quant_tool`` qparams helper.
            is_symmetric: Use ``quant_tool.get_symmetric_qparams`` when true,
                otherwise ``quant_tool.get_asymmetric_qparams``.
            dtype: Dtype of the placeholder buffers created by ``__init__``.
            channel_dim: Forwarded to the ``quant_tool`` qparams helper.

        Returns:
            A new ``QuantLinear`` holding the quantized weight, its
            quantization parameters and a copy of the bias.
        """
        qlinear = cls(
            in_features=linear.in_features,
            out_features=linear.out_features,
            bias=linear.bias is not None,
            dtype=dtype
        )

        with torch.no_grad():
            weight = linear.weight
            # Both schemes share the same quantize step; only the way the
            # parameters are derived differs.
            if is_symmetric:
                qparams = quant_tool.get_symmetric_qparams(weight,per_channel,channel_dim)
            else:
                qparams = quant_tool.get_asymmetric_qparams(weight,per_channel,channel_dim)

            # Store the quantized weight together with its parameters.
            qlinear.qweight = quant_tool.quantize_tensor(weight,qparams)
            qlinear.scale = qparams.scale
            qlinear.zero_point = qparams.zero_point

            # The bias is not quantized. Store a detached copy so it stays a
            # buffer: assigning the nn.Parameter itself would re-register
            # `bias` as a trainable parameter shared with the source layer.
            if linear.bias is not None:
                qlinear.bias = linear.bias.detach().clone()
        return qlinear

    def forward(self, x: torch.Tensor) -> torch.Tensor:
        """Dequantize the weight and apply the linear transform to ``x``.

        Args:
            x: Input whose last dimension is ``in_features``.

        Returns:
            ``F.linear(x, w_hat)`` plus the bias when one is present.
        """
        compute_dtype = x.dtype
        # Dequantize on the weight itself, w_hat = (q - z) * scale, so that
        # scale / zero_point broadcast along the weight's axes (scalar or
        # per-channel) rather than against the [batch, out] output.
        w_hat = (
            self.qweight.to(compute_dtype) - self.zero_point.to(compute_dtype)
        ) * self.scale.to(compute_dtype)
        output = F.linear(x, w_hat)
        if self.bias is not None:
            output = output + self.bias
        return output

## 5.2 定义模型量化入口
输入一个模型，将其中的 nn.Linear 替换为 QuantLinear（原地修改），并返回量化后的同一个模型对象。

In [3]:
from typing import Optional, List
# 对权重进行量化
def quantize_model_weights(
    model: nn.Module,
    per_channel:bool=False,
    is_symmetric:bool=True,
    channel_dim:int=0,
    modules_to_exclude: Optional[List[str]] = None,  # optional: layers to leave unquantized
) -> nn.Module:
    """Replace every ``nn.Linear`` inside ``model`` with a ``QuantLinear``.

    The module tree is walked recursively and modified in place.

    Args:
        model: Root module to convert.
        per_channel: Forwarded to ``QuantLinear.from_linear``.
        is_symmetric: Forwarded to ``QuantLinear.from_linear``.
        channel_dim: Forwarded to ``QuantLinear.from_linear`` for every layer,
            including nested ones.
        modules_to_exclude: Names of linear layers to keep as ``nn.Linear``.
            An entry matches either the layer's local attribute name (at any
            depth) or its dotted path from ``model`` (e.g. ``"encoder.fc1"``).

    Returns:
        The same ``model`` object, after conversion.
    """
    excluded = frozenset(modules_to_exclude or ())

    def _convert(module: nn.Module, prefix: str) -> None:
        # Snapshot the children first: setattr below mutates the registry.
        for name, child in list(module.named_children()):
            full_name = f"{prefix}.{name}" if prefix else name

            if not isinstance(child, nn.Linear):
                _convert(child, full_name)
                continue

            if name in excluded or full_name in excluded:
                continue

            setattr(
                module,
                name,
                QuantLinear.from_linear(
                    child,
                    per_channel=per_channel,
                    is_symmetric=is_symmetric,
                    dtype=child.weight.dtype,
                    channel_dim=channel_dim,
                ),
            )

    _convert(model, "")
    return model

## 5.3 对模型进行量化

In [4]:
# 定义一个多层线性层用于验证
class FourLayerModel(nn.Module):
    """Small stack of linear layers used to sanity-check weight quantization.

    Despite the name, the stack holds five ``nn.Linear`` layers
    (``layer1`` .. ``layer5``) with widths
    ``input_size -> 128 -> 256 -> 128 -> 1 -> 9``.
    """

    def __init__(self, input_size=64):
        super().__init__()
        widths = (input_size, 128, 256, 128, 1, 9)
        # Layers are created in order layer1..layer5, one per adjacent width pair.
        for index, (fan_in, fan_out) in enumerate(zip(widths, widths[1:]), start=1):
            setattr(self, f"layer{index}", nn.Linear(fan_in, fan_out))

    def forward(self, x):
        """Apply ReLU after the first four layers; the last layer is left linear."""
        hidden = x
        for block in (self.layer1, self.layer2, self.layer3, self.layer4):
            hidden = torch.relu(block(hidden))
        return self.layer5(hidden)

In [5]:
# Build the float reference model; the bare name displays its structure.
model = FourLayerModel()
model

FourLayerModel(
  (layer1): Linear(in_features=64, out_features=128, bias=True)
  (layer2): Linear(in_features=128, out_features=256, bias=True)
  (layer3): Linear(in_features=256, out_features=128, bias=True)
  (layer4): Linear(in_features=128, out_features=1, bias=True)
  (layer5): Linear(in_features=1, out_features=9, bias=True)
)

In [6]:
# Quantize a deep copy of the model, leaving 'layer1' and 'layer5' as plain
# nn.Linear layers; the original `model` is kept untouched for comparison.
base_model = copy.deepcopy(model)
q_model = quantize_model_weights(model=base_model,modules_to_exclude=['layer1','layer5'])
q_model

FourLayerModel(
  (layer1): Linear(in_features=64, out_features=128, bias=True)
  (layer2): QuantLinear()
  (layer3): QuantLinear()
  (layer4): QuantLinear()
  (layer5): Linear(in_features=1, out_features=9, bias=True)
)

In [7]:
import copy
# Work on a fresh deep copy so the float `model` stays available as reference.
base_model = copy.deepcopy(model)
# Quantize the model without excluding any layer.
q_model = quantize_model_weights(base_model)
q_model

FourLayerModel(
  (layer1): QuantLinear()
  (layer2): QuantLinear()
  (layer3): QuantLinear()
  (layer4): QuantLinear()
  (layer5): QuantLinear()
)

In [8]:
# Create a random input: one sample with 64 features.
input_data = torch.randn(1,64)
input_data

tensor([[-1.0460,  0.6507, -0.0024, -0.0910,  0.7244,  1.1987, -1.5149, -0.5180,
         -0.0269,  0.0533,  0.1340, -1.1349, -2.2192,  0.2176, -0.2943, -1.0202,
         -1.4823,  1.9782,  0.5639,  0.0028,  0.9610,  1.2390,  0.3270,  0.8854,
          1.7788,  0.5284,  0.3881, -0.4657,  1.2609, -2.3039, -0.1030,  0.8527,
         -0.4935, -0.4974,  0.3793, -0.8307,  1.7702,  0.3372,  0.7004,  1.6341,
          0.8201, -0.5599, -0.5949,  0.5014, -0.7524, -0.1732, -1.8799, -0.8076,
         -0.6814,  1.3539, -0.1717,  1.5322,  0.4549,  1.2054,  0.5036, -0.6115,
          0.8582,  0.0821, -0.2673, -1.4161, -0.2485, -1.6907, -0.3361, -0.3935]])

In [9]:
# Output of the model before quantization.
model.eval()
# Disable gradient tracking for inference.
with torch.no_grad():
    output = model(input_data)
output

tensor([[-0.7181,  0.1854, -0.6549,  0.7976, -0.1812,  0.8611,  0.6819, -0.9689,
          0.6257]])

In [10]:
# Output of the model after quantization.
q_model.eval()
# Disable gradient tracking for inference.
with torch.no_grad():
    sym_output = q_model(input_data)
sym_output

tensor([[-0.7180,  0.1855, -0.6549,  0.7975, -0.1810,  0.8612,  0.6821, -0.9688,
          0.6256]])

In [11]:
# Element-wise error between the float and quantized outputs. Small values
# may be elided when printed, so the error is scaled by 1000 for display.
err = output - sym_output
print(f"err: \n {err * 1000} ")

err: 
 tensor([[-0.0807, -0.1463, -0.0560,  0.1019, -0.1965, -0.0707, -0.2319, -0.0903,
          0.0679]]) 


### 以上方法已封装进 quant_tool，可直接通过 quant_tool 调用