In [32]:
import torch
import torch.nn as nn
import torch.optim as optim
import torchvision
import torchvision.transforms as transforms
from torchvision.models import densenet121
import numpy as np
import torch.nn.functional as F
import math

def get_dataloaders(batch_size=128):
    """Build the CIFAR-10 training and test data loaders.

    The training pipeline applies a padded random 32x32 crop and a random
    horizontal flip before tensor conversion and normalization; the test
    pipeline only converts and normalizes.

    Args:
        batch_size: Number of samples per batch for both loaders.

    Returns:
        A ``(trainloader, testloader)`` tuple. The training loader shuffles,
        the test loader does not.
    """
    # Per-channel mean / std handed to transforms.Normalize.
    norm_mean = (0.4914, 0.4822, 0.4465)
    norm_std = (0.2470, 0.2435, 0.2616)

    def _to_normalized_tensor():
        # Fresh transform objects for each pipeline.
        return [
            transforms.ToTensor(),
            transforms.Normalize(norm_mean, norm_std),
        ]

    augmentation = [
        transforms.RandomCrop(32, padding=4),
        transforms.RandomHorizontalFlip(),
    ]
    pipeline_for = {
        True: transforms.Compose(augmentation + _to_normalized_tensor()),
        False: transforms.Compose(_to_normalized_tensor()),
    }

    loaders = []
    for is_train in (True, False):
        dataset = torchvision.datasets.CIFAR10(
            root='./data',
            train=is_train,
            download=True,
            transform=pipeline_for[is_train],
        )
        loaders.append(
            torch.utils.data.DataLoader(
                dataset,
                batch_size=batch_size,
                shuffle=is_train,  # shuffle only the training split
                num_workers=2,
            )
        )

    trainloader, testloader = loaders
    return trainloader, testloader


def build_model(num_classes=10):
    """Create an untrained DenseNet-121 adapted to 32x32 inputs.

    Args:
        num_classes: Output size of the replacement classifier layer.

    Returns:
        The modified ``densenet121`` module (no pretrained weights).
    """
    net = densenet121(weights=None)

    # Replace the classification head so it emits `num_classes` logits.
    net.classifier = nn.Linear(net.classifier.in_features, num_classes)

    # CIFAR-10 images are 32x32, smaller than the 224x224 DenseNet expects,
    # so the stem convolution is swapped for a 3x3 / stride-1 one
    # (instead of kernel 7 / stride 2).
    net.features.conv0 = nn.Conv2d(
        in_channels=3,
        out_channels=64,
        kernel_size=3,
        stride=1,
        padding=1,
        bias=False,
    )

    return net

def int_to_bin_str(val, bits):
    """Return `val` as a `bits`-wide two's-complement binary string.

    `val` is converted to float, rounded to the nearest integer and reduced
    modulo ``2**bits``, so negative numbers map to their two's-complement
    pattern and out-of-range values wrap around.
    """
    rounded = int(round(float(val)))
    # Python's modulo is non-negative for a positive modulus, which is
    # exactly the two's-complement bit pattern.
    wrapped = rounded % (1 << bits)
    return "{:0{width}b}".format(wrapped, width=bits)

# ==========================================
# 2. 硬件仿真与导出类
# ==========================================
# class HardwareExporter:
#     def __init__(self, layer, input_tensor, bit_width=4, array_size=8):
#         """
#         layer: 训练好的 Conv2d 层
#         input_tensor: 输入该层的 tensor (batch=1)
#         bit_width: 量化比特数
#         array_size: 脉动阵列大小 (8x8)
#         """
#         self.layer = layer
#         self.input = input_tensor.detach()
#         self.bits = bit_width
#         self.arr_size = array_size
        
#         # 获取量化参数 (假设使用对称量化，或者如果有特定alpha可在此修改)
#         # 这里模拟计算 Scale，保证数据在 int 范围内
#         self.act_max = self.input.abs().max()
#         self.wgt_max = self.layer.weight.data.abs().max()
        
#         self.scale_a = (2**(self.bits-1) - 1) / self.act_max
#         self.scale_w = (2**(self.bits-1) - 1) / self.wgt_max
        
#         print(f"Quantization Info: Act_Scale={self.scale_a:.4f}, Wgt_Scale={self.scale_w:.4f}")

#     def get_quantized_int(self):
#         """获取量化后的整数 Tensor"""
#         # Quantize Input
#         inp_int = torch.round(self.input * self.scale_a).clamp(-(2**(self.bits-1)), 2**(self.bits-1)-1)
        
#         # Quantize Weight
#         wgt_int = torch.round(self.layer.weight.data * self.scale_w).clamp(-(2**(self.bits-1)), 2**(self.bits-1)-1)
        
#         return inp_int, wgt_int

#     def run_software_verification(self, kij_idx=0):
#         """
#         纯软件模拟硬件行为，计算预期输出结果。
#         仅计算 Slice 出的 8x8 区域。
#         kij_idx: 3x3卷积核中的第几个点 (0~8)
#         """
#         inp_int, wgt_int = self.get_quantized_int()
        
#         # 1. Slice: 取出 Input Channel 0-7, Output Channel 0-7
#         # Input shape: [1, 64, 32, 32] -> Slice -> [1, 8, 32, 32]
#         inp_slice = inp_int[:, :self.arr_size, :, :]
        
#         # Weight shape: [64, 64, 3, 3] -> Slice -> [8, 8, 3, 3] (Out, In, k, k)
#         wgt_slice = wgt_int[:self.arr_size, :self.arr_size, :, :]
        
#         # 2. 模拟 Padding
#         padding = self.layer.padding[0]
#         inp_pad = F.pad(inp_slice, (padding, padding, padding, padding)) # [1, 8, 34, 34]
        
#         # 3. 提取特定 kij 对应的权重 [8, 8]
#         # kij_idx 映射到 (ki, kj): 0->(0,0), 1->(0,1)... 4->(1,1 Center)
#         ki = kij_idx // 3
#         kj = kij_idx % 3
#         w_matrix = wgt_slice[:, :, ki, kj] # [8, 8] -> (Out_Ch, In_Ch)
        
#         # 4. Flatten Input (按照 Im2Col 或 简单的 Time Series)
#         # 硬件通常读取 Flatten 后的输入流
#         # inp_pad shape: [1, 8, H_pad, W_pad] -> [8, Total_Pixels]
#         inp_stream = inp_pad.squeeze(0).reshape(self.arr_size, -1)
        
#         # 5. 矩阵乘法模拟 (Systolic Array 核心逻辑: Weight * Input)
#         # Output [8, Time] = Weight [8, 8] @ Input [8, Time]
#         output_int = torch.matmul(w_matrix, inp_stream)
        
#         print(f"Software Verification (kij={kij_idx}): Output Shape {output_int.shape}")
#         print(f"Sample Output (First 5 values of Ch0): {output_int[0, :5].tolist()}")
        
#         return inp_stream, w_matrix, output_int
        
#     def calculate_final_output(self, prefix="test"):
#         """
#         计算并导出 9 个 kij Partial Sums 累加后的最终输出，
#         并应用 Bias 和 Activation Function。
#         """
#         final_output = None
        
#         # 1. 累加 9 个 kij 的 Partial Sums
#         print("Calculating final accumulated output...")
        
#         for kij_idx in range(9):
#             # run_software_verification 已经返回了量化后的 W * A 结果 (Partial Sum)
#             inp_stream, w_matrix, current_partial_sum = self.run_software_verification(kij_idx)
            
#             # 累加 Partial Sums
#             if final_output is None:
#                 final_output = current_partial_sum
#             else:
#                 final_output += current_partial_sum
                
#         # 2. 处理 Bias 和 Activation Function
        
#         # 获取 Bias (DenseNet 121 for CIFAR-10 通常是 Conv + BN，所以 Conv 层无 Bias)
#         bias = self.layer.bias
#         if bias is not None:
#             # Bias 是 [Out_Ch] 维度。final_output 是 [Out_Ch, Time]
#             # 需要将 bias 扩展为 [Out_Ch, Time]
#             bias_slice = bias[:self.arr_size].unsqueeze(1).expand_as(final_output)
#             final_output += bias_slice
#             print("Bias applied.")
#         else:
#             # 检查 DenseNet 架构：通常 Conv 后面紧跟 BatchNorm，所以 Conv 没有 Bias。
#             # 如果是这种情况，需要确保在 Verilog 端或后续层处理 BatchNorm。
#             print("Note: Target layer has no bias. Assuming BatchNorm is handled externally/later.")
        
#         # 3. 应用 Activation Function (DenseNet 的 Conv2 后面通常是 ReLU)
#         # 在 Verilog 验证中，通常只验证 MAC/累加部分。但为了软件对比的完整性，我们应用 ReLU。
        
#         # Clamp 到 4-bit 范围 (可选，取决于硬件的输出位宽和后续层处理)
#         # final_output = final_output.clamp(-(2**(self.bits-1)), 2**(self.bits-1)-1)
        
#         # 转换为 16-bit 整数 (假设累加结果位宽增加到 16-bit)
#         final_output_int = torch.round(final_output)
#         final_output_int = torch.nn.functional.relu(final_output_int)
    
#         # 4. 导出 Final Output
#         out_filename = f"{prefix}_final_output_ref.txt"
#         with open(out_filename, 'w') as f:
#             f.write('# [Final Accumulated Output - Sum of 9 Partial Sums + Bias (if present)] Bits: Row7(MSB)..Row0(LSB) #\n')
            
#             # 同样按照 Time Major, Row (Out Ch) High to Low 导出
#             for t in range(final_output_int.shape[1]):
#                 line_bin = ""
#                 # Rows (Output Channels) 7 down to 0
#                 for r in range(final_output_int.shape[0] - 1, -1, -1):
#                     val = final_output_int[r, t].item()
#                     # 累加后的结果通常需要更大的位宽 (例如 16 bit)
#                     line_bin += int_to_bin_str(val, 16) 
#                 f.write(line_bin + '\n')
                
#         print(f"Exported Final Output Reference: {out_filename}")
#         return final_output_int
        
#     def export_files(self, kij_idx=0, prefix="test"):
#         """导出 .txt 文件"""
#         inp_stream, w_matrix, out_ref = self.run_software_verification(kij_idx)
        
#         # === 1. 导出 Activation (Input) ===
#         # 格式: #time0row7...time0row0#
#         # inp_stream shape: [8 (Rows), Time]
#         act_filename = f"{prefix}_activation_kij{kij_idx}.txt"
#         with open(act_filename, 'w') as f:
#             f.write('# [Time Step Major] Each line is one time step. Bits: Row7(MSB)..Row0(LSB) #\n')
#             time_steps = inp_stream.shape[1]
#             rows = inp_stream.shape[0] # 8
            
#             for t in range(time_steps):
#                 line_bin = ""
#                 # 注意：硬件通常要求高位在左 (Row 7)，低位在右 (Row 0)
#                 for r in range(rows - 1, -1, -1): 
#                     val = inp_stream[r, t].item()
#                     line_bin += int_to_bin_str(val, self.bits)
#                 f.write(line_bin + '\n')
#         print(f"Exported: {act_filename}")

#         # === 2. 导出 Weight ===
#         # 格式: #col0row7...col0row0# (假设列代表Output Ch, 行代表Input Ch)
#         # w_matrix shape: [8 (Out/Col), 8 (In/Row)]
#         wgt_filename = f"{prefix}_weight_kij{kij_idx}.txt"
#         with open(wgt_filename, 'w') as f:
#             f.write('# [Col Major] Each line is one Output Channel (Col). Bits: Row7(InCh7)..Row0(InCh0) #\n')
#             num_cols = w_matrix.shape[0] # Output Channels
#             num_rows = w_matrix.shape[1] # Input Channels
            
#             # 你的 Reference 代码(HW_Code6) 似乎并没有遍历 Col，而是直接取了一个 Tile。
#             # 通常 Systolic Array Weight Loading 是一列一列 Load 或者一次性 Load。
#             # 这里我们假设文件的一行对应 Array 的一列 (One Output Channel's weights across inputs)
            
#             for c in range(num_cols): # Output Channel Loop
#                 line_bin = ""
#                 for r in range(num_rows - 1, -1, -1): # Input Channel Loop (High to Low)
#                     val = w_matrix[c, r].item()
#                     line_bin += int_to_bin_str(val, self.bits)
#                 f.write(line_bin + '\n')
#         print(f"Exported: {wgt_filename}")
        
#         # === 3. 导出 Golden Output (用于对比 Verilog 输出) ===
#         out_filename = f"{prefix}_output_ref_kij{kij_idx}.txt"
#         with open(out_filename, 'w') as f:
#             # 同样按照 Time Major, Row (Out Ch) High to Low 导出
#             for t in range(out_ref.shape[1]):
#                 line_bin = ""
#                 for r in range(out_ref.shape[0] - 1, -1, -1):
#                     val = out_ref[r, t].item()
#                     # Output 通常位宽会变大 (例如 16bit)，防止溢出
#                     line_bin += int_to_bin_str(val, 16) 
#                 f.write(line_bin + '\n')
#         print(f"Exported: {out_filename}")

class HardwareExporter:
    """Quantize one Conv2d layer's input and weights and export test vectors.

    The exporter keeps the first ``array_size`` input/output channels of the
    layer, quantizes activations and weights symmetrically to signed
    ``bit_width``-bit integers and writes binary text files: the activation
    stream, the weights of one kernel position, the matching partial-sum
    reference, and the accumulated output over all nine kernel positions.

    All tensor math runs on the CPU, so the class works whether or not the
    layer lives on a GPU and whether or not CUDA is available.
    """

    def __init__(self, layer, input_tensor, bit_width=4, array_size=8):
        """
        layer: trained Conv2d layer whose weights are exported
        input_tensor: tensor fed to that layer (batch size 1)
        bit_width: number of quantization bits
        array_size: systolic array size (number of channels kept per side)
        """
        self.layer = layer
        self.input = input_tensor.detach().cpu()
        self.bits = bit_width
        self.arr_size = array_size

        # Symmetric quantization: map the largest magnitude to the largest
        # positive code. Statistics are taken on CPU copies so the scales can
        # be combined with the CPU input regardless of the layer's device.
        self.act_max = self.input.abs().max()
        self.wgt_max = self.layer.weight.detach().cpu().abs().max()

        self.scale_a = (2**(self.bits-1) - 1) / self.act_max
        self.scale_w = (2**(self.bits-1) - 1) / self.wgt_max

        print(f"Quantization Info: Act_Scale={self.scale_a:.4f}, Wgt_Scale={self.scale_w:.4f}")

    def int_to_bin_str(self, val, bits):
        """Convert a number to a `bits`-wide two's-complement binary string."""
        val = int(round(val))
        if val < 0:
            val = (1 << bits) + val
        mask = (1 << bits) - 1
        val = val & mask
        return f"{val:0{bits}b}"

    def get_quantized_int(self):
        """Return the quantized (integer-valued) input and weight tensors.

        Both tensors are on the CPU and are clamped to the signed
        ``self.bits``-bit range.
        """
        q_min = -(2 ** (self.bits - 1))
        q_max = 2 ** (self.bits - 1) - 1
        inp_int = torch.round(self.input * self.scale_a).clamp(q_min, q_max)
        weight = self.layer.weight.detach().cpu()
        wgt_int = torch.round(weight * self.scale_w).clamp(q_min, q_max)
        return inp_int, wgt_int

    def _write_bit_rows(self, filename, lines, bits, header=None):
        """Write one text line per row of `lines`, last element first.

        lines: 2-D tensor; each row becomes one output line, with the
               highest-index element leftmost and element 0 rightmost.
        bits: width of each two's-complement field.
        header: optional first line written before the data.
        """
        with open(filename, 'w', encoding='utf-8') as f:
            if header is not None:
                f.write(header + '\n')
            for values in lines.tolist():
                fields = (self.int_to_bin_str(v, bits) for v in reversed(values))
                f.write("".join(fields) + '\n')

    def run_software_verification(self, kij_idx=0, input_tile_size=6, output_tile_size=4):
        """Simulate one kernel position on a tile of the input.

        The top-left ``input_tile_size`` x ``input_tile_size`` window of the
        quantized input is flattened row by row into a stream of time steps,
        multiplied by the weights of kernel position `kij_idx`, and the
        result is cropped to an ``output_tile_size`` x ``output_tile_size``
        window that starts at tile row 1 / column 1.

        kij_idx: position inside the 3x3 kernel (0..8, row-major)
        input_tile_size: side of the input tile (6 -> 36 time steps)
        output_tile_size: side of the kept output window (4 -> 16 values)

        Returns (input stream [C_in, T*T], weight matrix [C_out, C_in],
        cropped partial sums [C_out, O*O]), all CPU tensors.

        Raises ValueError if the input is not a single sample, or if the
        tile sizes do not fit.
        """
        inp_int, wgt_int = self.get_quantized_int()

        # 1. Keep only the channels that fit the array.
        inp_slice = inp_int.squeeze(0)[:self.arr_size, :, :]
        wgt_slice = wgt_int[:self.arr_size, :self.arr_size, :, :]

        if inp_slice.dim() != 3:
            raise ValueError(
                "input_tensor must hold a single sample shaped [1, C, H, W]")
        height, width = inp_slice.shape[-2:]
        if input_tile_size > min(height, width):
            raise ValueError(
                f"input_tile_size={input_tile_size} exceeds the "
                f"{height}x{width} feature map")
        if output_tile_size + 1 > input_tile_size:
            raise ValueError(
                "output_tile_size must be smaller than input_tile_size")

        # 2. Top-left input tile: [C_in, T, T].
        inp_tile_chw = inp_slice[:, :input_tile_size, :input_tile_size]

        # 3. Weights of the requested kernel position: [C_out, C_in].
        ki, kj = divmod(kij_idx, 3)
        w_matrix = wgt_slice[:, :, ki, kj]

        # 4. Row-major flattening gives one column per time step
        #    (R0C0..R0C{T-1}, R1C0, ...): [C_in, T*T].
        inp_stream_full = inp_tile_chw.reshape(inp_tile_chw.shape[0], -1)

        # 5. One partial sum per output channel and time step, computed on
        #    the CPU so no GPU is required.
        full_output_int = torch.matmul(w_matrix, inp_stream_full)

        # 6. Keep the window rows/cols 1..output_tile_size of the T x T grid,
        #    i.e. flat indices (r + 1) * T + (c + 1).
        # NOTE(review): the same window is used for every kij_idx (it is not
        # shifted by (ki, kj)), so summing the nine cropped results equals a
        # true 3x3 convolution only if the shift is applied elsewhere, e.g.
        # by the accumulator addressing - confirm against the testbench.
        grid = full_output_int.reshape(
            full_output_int.shape[0], input_tile_size, input_tile_size)
        window = grid[:, 1:1 + output_tile_size, 1:1 + output_tile_size]
        output_cropped = window.reshape(window.shape[0], -1)

        print(f"Software Verification (kij={kij_idx}): Input Stream Shape {inp_stream_full.shape}, Cropped Output Shape {output_cropped.shape}")

        return inp_stream_full, w_matrix, output_cropped

    def export_files(self, kij_idx=0, prefix="test", export_activation=True):
        """Write the activation, weight and partial-sum files for one kij.

        kij_idx: kernel position passed to run_software_verification
        prefix: leading part of every generated file name
        export_activation: when False the activation file is not written
            (its name does not depend on kij_idx, so it is identical for
            every kernel position)
        """
        # Tile sizes come from run_software_verification's defaults.
        inp_stream, w_matrix, out_ref = self.run_software_verification(kij_idx)

        tile_size = inp_stream.shape[1]
        out_size = out_ref.shape[1]

        # === 1. Activation stream: one line per time step ===
        if export_activation:
            act_filename = f"{prefix}_activation_in{tile_size}.txt"
            self._write_bit_rows(
                act_filename, inp_stream.t(), self.bits,
                header='# [Time Step Major] Bits: Row7(MSB)..Row0(LSB) #')
            print(f"Exported: {act_filename} ({tile_size} lines)")

        # === 2. Weights: one line per output channel ===
        wgt_filename = f"{prefix}_weight_kij{kij_idx}.txt"
        self._write_bit_rows(
            wgt_filename, w_matrix, self.bits,
            header='# [Col Major] Bits: Row7(InCh7)..Row0(InCh0) #')
        print(f"Exported: {wgt_filename}")

        # === 3. Partial-sum reference: 16-bit fields, no header line ===
        out_filename = f"{prefix}_output_ref_kij{kij_idx}_out{out_size}.txt"
        self._write_bit_rows(out_filename, out_ref.t(), 16)
        print(f"Exported: {out_filename}")

    def calculate_final_output(self, prefix="test"):
        """Accumulate the nine partial sums, apply bias and ReLU, and export.

        prefix: leading part of the generated file name

        Returns the rounded, ReLU-activated output tensor
        [C_out, output values] that was written to the file.
        """
        final_output = None

        print("Calculating final accumulated output...")

        # Nine positions of the 3x3 kernel.
        for kij_idx in range(9):
            _, _, current_partial_sum = self.run_software_verification(kij_idx)

            if final_output is None:
                # Copy so the accumulator never aliases a returned tensor.
                final_output = current_partial_sum.clone()
            else:
                final_output = final_output + current_partial_sum

        bias = self.layer.bias
        if bias is not None:
            # NOTE(review): the bias is added as stored in the layer, without
            # being scaled into the quantized domain - confirm this matches
            # what the hardware expects.
            bias_slice = bias.detach().cpu()[:self.arr_size].unsqueeze(1).expand_as(final_output)
            final_output = final_output + bias_slice
            print("Bias applied.")
        else:
            print("Note: Target layer has no bias.")

        # Round to integers, then apply ReLU to the tensor that is actually
        # exported (previously the ReLU result was computed but discarded).
        final_output_int = torch.nn.functional.relu(torch.round(final_output))
        out_size = final_output_int.shape[1]

        out_filename = f"{prefix}_final_output_ref_out{out_size}.txt"
        self._write_bit_rows(
            out_filename, final_output_int.t(), 16,
            header='# [Final Accumulated Output] Bits: Row7(MSB)..Row0(LSB) #')

        print(f"Exported Final Output Reference: {out_filename} ({final_output_int.shape[1]} lines)")
        return final_output_int

def train(model, trainloader, criterion, optimizer, device):
    """Run one training pass over `trainloader`.

    Each batch is moved to `device`, the gradients are reset, and one
    optimizer step is taken on the loss from `criterion`.

    Returns:
        The sum of the per-batch loss values divided by ``len(trainloader)``.
    """
    model.train()
    batch_losses = []

    for batch_inputs, batch_labels in trainloader:
        batch_inputs = batch_inputs.to(device)
        batch_labels = batch_labels.to(device)

        optimizer.zero_grad()
        batch_loss = criterion(model(batch_inputs), batch_labels)
        batch_loss.backward()
        optimizer.step()

        batch_losses.append(batch_loss.item())

    return sum(batch_losses) / len(trainloader)


def test(model, testloader, criterion, device):
    """Evaluate `model` on `testloader` without tracking gradients.

    Returns:
        ``(mean_loss, accuracy)`` where ``mean_loss`` is the summed per-batch
        loss divided by ``len(testloader)`` and ``accuracy`` is the fraction
        of samples whose highest-scoring class equals the label.
    """
    model.eval()
    loss_sum = 0
    hits = 0
    seen = 0

    with torch.no_grad():
        for batch_inputs, batch_labels in testloader:
            batch_inputs = batch_inputs.to(device)
            batch_labels = batch_labels.to(device)

            logits = model(batch_inputs)
            loss_sum += criterion(logits, batch_labels).item()

            # Index of the largest score along the class dimension.
            guesses = logits.max(1)[1]
            seen += batch_labels.size(0)
            hits += guesses.eq(batch_labels).sum().item()

    mean_loss = loss_sum / len(testloader)
    accuracy = hits / seen
    return mean_loss, accuracy

def get_testloader(batch_size=128):
    """Build only the CIFAR-10 test loader.

    Images are converted to tensors and normalized; no augmentation is
    applied and the loader does not shuffle.

    Args:
        batch_size: Number of samples per batch.

    Returns:
        The test ``DataLoader``.
    """
    # Per-channel mean / std handed to transforms.Normalize.
    norm_mean = (0.4914, 0.4822, 0.4465)
    norm_std = (0.2470, 0.2435, 0.2616)

    preprocessing = transforms.Compose([
        transforms.ToTensor(),
        transforms.Normalize(norm_mean, norm_std),
    ])

    return torch.utils.data.DataLoader(
        torchvision.datasets.CIFAR10(
            root='./data',
            train=False,
            download=True,
            transform=preprocessing,
        ),
        batch_size=batch_size,
        shuffle=False,
        num_workers=2,
    )


# -----------------------------------
# 3. Hook：保存中间特征
# -----------------------------------
# Module-level store filled by the hooks created below: name -> tensor.
saved_features = {}


def save_hook(name):
    """Return a forward hook that records a module's output under `name`.

    The recorded tensor is detached from the autograd graph and moved to the
    CPU before it is stored in `saved_features`.
    """
    def _store(_module, _inputs, result):
        saved_features[name] = result.detach().cpu()

    return _store


print('All preparations finished')

All preparations finished


In [29]:
# ==========================================
# 第一步：复用你的旧代码加载模型和数据
# ==========================================
import torch
import torch.nn as nn
from torchvision import datasets, transforms

# 1. Pick the device: CUDA when available, otherwise CPU.
device = 'cuda' if torch.cuda.is_available() else 'cpu'

# 2. Build the model with build_model() and move it to the device.
model = build_model().to(device)

# 3. Load the trained weights.
weight_path = "result/dense/best_densenet_cifar10.pth"
# Uncomment the next line once a trained checkpoint exists at weight_path.
# model.load_state_dict(torch.load(weight_path, map_location=device))
# NOTE: while the line above stays commented out, the model keeps its random
# initialization even though the message below says "Model loaded.".
print("Model loaded.")

# 4. Register a forward hook on the layer to export:
# conv2 (the 3x3 convolution) of denselayer1 inside denseblock1.
target_layer = model.features.denseblock1.denselayer1.conv2
capture_data = {}

def hook_fn(module, input, output):
    # Forward hooks receive the positional inputs as a tuple; keep the first.
    capture_data['input'] = input[0].detach() # stays on the model's device; moved later if needed
    capture_data['output'] = output.detach()

handle = target_layer.register_forward_hook(hook_fn)

# ==========================================
# Step 2: run one inference pass to capture the data
# ==========================================
# Same test-time preprocessing as get_testloader().
transform_test = transforms.Compose([
    transforms.ToTensor(),
    transforms.Normalize((0.4914, 0.4822, 0.4465), (0.2470, 0.2435, 0.2616)),
])
testset = datasets.CIFAR10(root='./data', train=False, download=True, transform=transform_test)

# Take the first test image and add a batch dimension.
img, label = testset[0]
img = img.unsqueeze(0).to(device)

# Forward pass (this triggers hook_fn on target_layer).
model.eval()
with torch.no_grad():
    _ = model(img)

# Remove the hook so later forward passes do not overwrite capture_data.
handle.remove()
# The recorded run below prints torch.Size([1, 128, 16, 16]) here.
print(f"Captured input shape: {capture_data['input'].shape}")

# ==========================================
# Step 3: export with HardwareExporter
# ==========================================
# The HardwareExporter class must already be defined (see the cell above).

# Create the exporter from the captured input and the target layer.
exporter = HardwareExporter(
    layer=target_layer, 
    input_tensor=capture_data['input'], 
    bit_width=4, 
    array_size=8
)

# Export the files.
# kij_idx=4 is the centre point of the 3x3 kernel.
# exporter.export_files(kij_idx=4, prefix="dense_layer1_conv2")
# Export all nine kernel positions. The activation file name does not contain
# kij, so each iteration rewrites the same activation file.
for i in range(0, 9):
    exporter.export_files(kij_idx=i, prefix="dense_layer1_conv2_new")
final_output_tensor = exporter.calculate_final_output(prefix="dense_layer1_conv2_relu")
print("\nAll files, including the final accumulated output reference, have been generated.")

Model loaded.
Files already downloaded and verified
Captured input shape: torch.Size([1, 128, 16, 16])
Quantization Info: Act_Scale=1.8229, Wgt_Scale=38.5760
Software Verification (kij=0): Input Stream Shape torch.Size([8, 36]), Cropped Output Shape torch.Size([8, 16])
Exported: dense_layer1_conv2_new_activation_in36.txt (36 lines)
Exported: dense_layer1_conv2_new_weight_kij0.txt
Exported: dense_layer1_conv2_new_output_ref_kij0_out16.txt
Software Verification (kij=1): Input Stream Shape torch.Size([8, 36]), Cropped Output Shape torch.Size([8, 16])
Exported: dense_layer1_conv2_new_activation_in36.txt (36 lines)
Exported: dense_layer1_conv2_new_weight_kij1.txt
Exported: dense_layer1_conv2_new_output_ref_kij1_out16.txt
Software Verification (kij=2): Input Stream Shape torch.Size([8, 36]), Cropped Output Shape torch.Size([8, 16])
Exported: dense_layer1_conv2_new_activation_in36.txt (36 lines)
Exported: dense_layer1_conv2_new_weight_kij2.txt
Exported: dense_layer1_conv2_new_output_ref_kij2_

In [33]:
# Re-run only the accumulation/export step, reusing the `exporter` object
# created in the export cell.
final_output_tensor = exporter.calculate_final_output(prefix="dense_layer1_conv2_relu")
print('Finished')

Calculating final accumulated output...
Software Verification (kij=0): Input Stream Shape torch.Size([8, 36]), Cropped Output Shape torch.Size([8, 16])
Software Verification (kij=1): Input Stream Shape torch.Size([8, 36]), Cropped Output Shape torch.Size([8, 16])
Software Verification (kij=2): Input Stream Shape torch.Size([8, 36]), Cropped Output Shape torch.Size([8, 16])
Software Verification (kij=3): Input Stream Shape torch.Size([8, 36]), Cropped Output Shape torch.Size([8, 16])
Software Verification (kij=4): Input Stream Shape torch.Size([8, 36]), Cropped Output Shape torch.Size([8, 16])
Software Verification (kij=5): Input Stream Shape torch.Size([8, 36]), Cropped Output Shape torch.Size([8, 16])
Software Verification (kij=6): Input Stream Shape torch.Size([8, 36]), Cropped Output Shape torch.Size([8, 16])
Software Verification (kij=7): Input Stream Shape torch.Size([8, 36]), Cropped Output Shape torch.Size([8, 16])
Software Verification (kij=8): Input Stream Shape torch.Size([8,

In [14]:
import math

def generate_full_acc_address_file(output_width=32, output_height=32, address_bits=11, filename="full_acc_address.txt"):
    """Write the full accumulator address sequence to a text file.

    For every output pixel ``k`` (row-major, ``output_width * output_height``
    pixels in total) nine addresses are written, one per 3x3 kernel position
    ``j``::

        address(k, j) = S_k + R_j
        S_k = k + 2 * floor(k / 4)

    ``R_j`` are the nine relative addresses taken from the first nine lines
    of the reference ``acc_address.txt`` sample. Each address is written on
    its own line as a zero-padded ``address_bits``-wide binary string.

    NOTE(review): the S_k rule skips two addresses after every four pixels;
    this presumably mirrors the tiling of the partial-sum memory - confirm
    against the hardware.

    Args:
        output_width (int): Width of the output feature map.
        output_height (int): Height of the output feature map.
        address_bits (int): Width of each address in bits.
        filename (str): Path of the file to write.

    Raises:
        ValueError: If the largest address does not fit in `address_bits`.
    """
    # Relative addresses of the nine kernel positions, kept as the binary
    # values from the reference file so they cannot drift from it. (The
    # previous decimal constants did not match these bit patterns and made
    # the default run overflow 11 bits.)
    relative_addresses = (
        0b00000000000,  # 0
        0b00000100101,  # 37
        0b00001001010,  # 74
        0b00001110010,  # 114
        0b00010010111,  # 151
        0b00010111100,  # 188
        0b00011100100,  # 228
        0b00100001001,  # 265
        0b00100101110,  # 302
    )

    def get_starting_address(k):
        """Return the starting accumulator address S_k of output pixel k."""
        return k + 2 * (k // 4)

    total_pixels = output_width * output_height
    total_cycles = total_pixels * len(relative_addresses)

    # Validate before touching the file so a bad configuration never leaves
    # a partially written or over-wide address file behind.
    if total_pixels > 0:
        largest = get_starting_address(total_pixels - 1) + max(relative_addresses)
        if largest >= (1 << address_bits):
            raise ValueError(
                f"largest address {largest} does not fit in {address_bits} bits")

    print(f"Generating {filename} for {total_pixels} output pixels ({total_cycles} total cycles)...")

    with open(filename, 'w') as f:
        for k in range(total_pixels):
            base = get_starting_address(k)
            # Nine lines per pixel: starting address plus each relative offset.
            f.writelines(
                f'{base + offset:0{address_bits}b}\n'
                for offset in relative_addresses
            )

    print(f"Generation complete. The file '{filename}' contains {total_cycles} {address_bits}-bit addresses.")

# Run the generator with its default arguments to produce the address file.
generate_full_acc_address_file()

Generating full_acc_address.txt for 1024 output pixels (9216 total cycles)...
Generation complete. The file 'full_acc_address.txt' contains 9216 11-bit addresses.


In [None]:
import torch
import torch.nn.functional as F
import math

