## Connect to CAS

In [5]:
import swat
import os
import numpy as np
import pandas as pd
import sys

In [None]:
cashost = 'sas-cas-server-default-client'
conn = swat.CAS(cashost, 5570, password=os.environ.get('ACCESS_TOKEN'))

In [None]:
conn.about()['About']['Viya Version']

In [None]:
conn.caslibInfo()

In [None]:
conn.fileInfo(caslib='casuser')

In [None]:
conn.tableInfo(caslib='casuser')

## Explore the CAS Table

In [None]:
castb = conn.CASTable('train_data', caslib='casuser')
display(type(castb), castb)     # display type & value

#### Table Preview

In [None]:
castb.tableDetails()

In [None]:
df = castb.head()


#### Table Info

In [None]:
castb.shape

In [None]:
castb.columnInfo()

In [None]:
df = (castb.SFR_Result.value_counts())
display(type(df), df)
df.plot(kind='bar', figsize=(8, 6))

## sas.ipynb

In [None]:
!pip install git+https://github.com/sassoftware/python-dlpy.git

### data prep

In [None]:
import swat
import os
import numpy as np
import pandas as pd
import sys
import dlpy
from dlpy import Sequential
from dlpy import *
from dlpy.model import TextParms
from dlpy.blocks import Bidirectional
from dlpy.applications import TextClassification
from dlpy.network import *
from dlpy.utils import *
from dlpy.applications import *
from dlpy.model import *
from dlpy.images import *
from dlpy.layers import *
cashost = 'sas-cas-server-default-client'
conn = swat.CAS(cashost, 5570, password=os.environ.get('ACCESS_TOKEN'))

castb = conn.CASTable('train_data', caslib='casuser')
SFR = castb.iloc[:, 2:46].values
blocks = [SFR[:, i:i+4].reshape(-1, 2, 2) for i in range(0, 45, 5)] # 将36列中的每4列合并成一个2*2矩阵，得到9个块
x_train = np.concatenate([np.concatenate(blocks[i:i+3], axis=2) for i in range(0, 9, 3)], axis=1) # 将9个块按3*3的方式拼成一个大矩阵
# 修改train_data
# 将 numpy 数组转换为 pandas DataFrame
X_train_flat = x_train.reshape(8773, -1)  # 展平图像数据
df_train = pd.DataFrame(X_train_flat)
df_train['label'] = castb.SFR_Result
df_train['label'] = df_train['label'].map({'OK': 1, 'NG': 0})
train_data = conn.upload_frame(df_train, casout={'name':'train_data', 'replace':True})

# 读取测试数据
test_castb = conn.CASTable('test_data', caslib='casuser')

# 处理测试数据
SFR_test = test_castb.iloc[:, 2:46].values
blocks_test = [SFR_test[:, i:i+4].reshape(-1, 2, 2) for i in range(0, 45, 5)] 
x_test = np.concatenate([np.concatenate(blocks_test[i:i+3], axis=2) for i in range(0, 9, 3)], axis=1)

# 将 numpy 数组转换为 pandas DataFrame
X_test_flat = x_test.reshape(test_castb.shape[0], -1)  # 展平图像数据
df_test = pd.DataFrame(X_test_flat)
df_test['label'] = test_castb.SFR_Result
df_test['label'] = df_test['label'].map({'OK': 1, 'NG': 0})

# 上传测试数据到CAS
test_data = conn.upload_frame(df_test, casout={'name': 'test_data', 'replace': True})

### train model

In [None]:
import functools
import time
import types
import pprint
from tqdm import tqdm
import swat
import dlpy
from dlpy import Sequential
from dlpy.model import *
from dlpy.layers import *
from dlpy.utils import *
        
# 模型定义和训练
model = Sequential(conn, model_table='Simple_CNN')
model.add(InputLayer(1, 6, 6))
model.add(Conv2d(n_filters=16, width=2, height=2, stride=2, act='relu'))
model.add(BatchNormalization())
model.add(Pooling(1))
model.add(Dense(16*3*3, act='relu'))
model.add(Dense(256, act='relu'))
model.add(Dense(64, act='relu'))
model.add(Dense(32, act='relu'))
model.add(Dense(1))
model.add(OutputLayer(act='sigmoid', n=1))

input_vars = train_data.columns[:-1].tolist()  # 除去 'label' 列的所有列名
target_var = 'label'
model.fit(
    data=train_data,
    inputs=input_vars,
    target=target_var,
    mini_batch_size=128,
    max_epochs=100,
    lr = 0.1,
    n_threads=1,
    log_level=2
)


### profile

In [None]:
import functools
import time
import types
import pprint
from tqdm import tqdm
import swat
import dlpy
from dlpy import Sequential
from dlpy.model import *
from dlpy.layers import *
from dlpy.utils import *

# 创建一个字典来存储调用次数
call_count = {}

def count_calls(func, module_name=None):
    @functools.wraps(func)
    def wrapper_count_calls(*args, **kwargs):
        full_name = module_name + '.' + func.__name__
        start_time = time.time()
        result = func(*args, **kwargs)
        end_time = time.time()
        
        elapsed_time = end_time - start_time
        
        if full_name not in call_count:
            call_count[full_name] = {"count": 0, "total_time": 0.0}
        
        call_count[full_name]["count"] += 1
        call_count[full_name]["total_time"] += elapsed_time
        
        return result
    wrapper_count_calls._is_decorated = True
    return wrapper_count_calls

def set_new_attr(module, attr_name, attr):
    if not hasattr(attr, "_is_decorated"):
        decorated_attr = count_calls(attr, module.__name__)
        decorated_attr._is_decorated = True
        setattr(module, attr_name, decorated_attr)

# 递归封装所有的包
def auto_decorate_module(module, visited=None):
    if visited is None:
        visited = set()
    
    module_name = module.__name__
    if module_name in visited:
        return
    visited.add(module_name)
    for attr_name in dir(module):
        try:
            attr = getattr(module, attr_name)
            if isinstance(attr, types.FunctionType):
                set_new_attr(module, attr_name, attr)
            elif isinstance(attr, types.ModuleType) and attr.__name__.startswith('dlpy'):
                auto_decorate_module(attr, visited)
            elif isinstance(attr, type):
                auto_decorate_class(attr)
            elif callable(attr):
                set_new_attr(module, attr_name, attr)
        except AttributeError:
            continue

def auto_decorate_class(cls):
    for attr_name in dir(cls):
        try:
            attr = getattr(cls, attr_name)
            if isinstance(attr, types.FunctionType):
                set_new_attr(cls, attr_name, attr)
            elif attr_name in ['__add__', '__mul__', '__sub__', '__truediv__', '__matmul__', '__pow__', '__mod__']:
                set_new_attr(cls, attr_name, attr)
        except (AttributeError, TypeError):
            continue

# 自动装饰 dlpy 模块及其子模块
auto_decorate_module(dlpy)
# 装饰 Layer 类的 __call__ 方法
Layer.__call__ = count_calls(Layer.__call__, 'Layer')

In [None]:
# 遍历 test_data 中的64行进行前向计算
for index, row in df_test.head(64).iterrows():
    # 从 test_data 中获取当前行的前 36 个元素，并将其重塑为 6x6 矩阵
    input_data = row.values[:36].reshape((1, 1, 6, 6))
    
    # 创建输入张量
    input_tensor = Tensor(InputLayer(1, 6, 6))
    input_tensor.shape = (1, 1, 6, 6)
    input_tensor._value = input_data

    # 前向计算并记录时间
    conv_output = model.layers[1](input_tensor)
    batch_norm_output = model.layers[2](conv_output)
    pooling_output = model.layers[3](batch_norm_output)
    dense_output1 = model.layers[4](pooling_output)
    dense_output2 = model.layers[5](dense_output1)
    dense_output3 = model.layers[6](dense_output2)
    dense_output4 = model.layers[7](dense_output3)
    dense_output5 = model.layers[8](dense_output4)
    output = model.layers[9](dense_output5)

In [None]:
pprint.pprint(call_count)

### backup

In [None]:
# 获取预测的浮点数值
predicted_probs = pred['P_label'].values

# 将预测的浮点数值转换为0或1
predicted_labels = np.where(predicted_probs > 0.5, 1.0, 0.0)

# 获取实际标签
actual_labels = df_test['label'].values

# 计算准确率
accuracy = np.mean(predicted_labels == actual_labels)
print(f'Accuracy: {accuracy}')

In [None]:
# 遍历 test_data 中的每一行进行前向计算
for index, row in df_test.iterrows():
    # 从 test_data 中获取当前行的前 36 个元素，并将其重塑为 6x6 矩阵
    input_data = row.values[:36].reshape((1, 1, 6, 6))
    
    # 创建输入张量
    input_tensor = Tensor(input_layer)
    input_tensor.shape = (1, 1, 6, 6)
    input_tensor._value = input_data

    # 前向计算并记录时间
    conv_output = conv_layer(input_tensor)
    batch_norm_output = batch_norm_layer(conv_output)
    pooling_output = pooling_layer(batch_norm_output)
    dense_output1 = dense_layer1(pooling_output)
    dense_output2 = dense_layer2(dense_output1)
    dense_output3 = dense_layer3(dense_output2)
    dense_output4 = dense_layer4(dense_output3)
    dense_output5 = dense_layer5(dense_output4)
    output = output_layer(dense_output5)

In [None]:
# 遍历 test_data 中的前 128 行进行前向计算
for index, row in df_test.head(128).iterrows():
    # 从 test_data 中获取当前行的前 36 个元素，并将其重塑为 6x6 矩阵
    input_data = row.values[:36].reshape((1, 1, 6, 6))
    
    # 创建输入张量
    input_tensor = Tensor(input_layer)
    input_tensor.shape = (1, 1, 6, 6)
    input_tensor._value = input_data

    # 前向计算并记录时间
    conv_output = conv_layer(input_tensor)
    batch_norm_output = batch_norm_layer(conv_output)
    pooling_output = pooling_layer(batch_norm_output)
    dense_output1 = dense_layer1(pooling_output)
    dense_output2 = dense_layer2(dense_output1)
    dense_output3 = dense_layer3(dense_output2)
    dense_output4 = dense_layer4(dense_output3)
    dense_output5 = dense_layer5(dense_output4)
    output = output_layer(dense_output5)

## quant.ipynb

In [1]:
import torch
import torch.nn as nn
import torch.nn.functional as F
from torchvision import datasets, transforms
import torch.utils.data
import math
from copy import deepcopy
import numpy as np
import cv2 as cv
import pandas as pd
from torch.utils.data import DataLoader, Dataset
import torch.optim as optim     # for constructing optimizer
import torchvision.models as models
from module import *
from function import *

# 1. preparing the dataset
class GDdataset(Dataset):
    def __init__(self, path):
        self.data = pd.read_csv(path)
        SFR = torch.tensor(self.data.iloc[:, 2:46].values)
        blocks = [SFR[:, i:i+4].reshape(-1, 2, 2) for i in range(0, 45, 5)] # 将36列中的每4列合并成一个2*2矩阵，得到9个块
        
        self.value = torch.cat([torch.cat(blocks[i:i+3], dim=2) for i in range(0, 9, 3)], dim=1) # 将9个块按3*3的方式拼成一个大矩阵
        self.value = self.value.unsqueeze(1).to(torch.float32)
        
        self.target = torch.tensor([1.0 if x == 'OK' else 0.0 for x in self.data.iloc[:, 48].values])
        self.target = self.target.unsqueeze(1)
        
    def __getitem__(self, index):
        return self.value[index], self.target[index]
        
        
    def __len__(self):
        return len(self.data)
    
from module import *
# 2. define the model
class Net(nn.Module):
    def __init__(self):
        super(Net, self).__init__()
        
        self.conv = nn.Conv2d(in_channels=1, out_channels=16, kernel_size=2, stride=2)
        self.bn = nn.BatchNorm2d(num_features=16)
        self.fc1 = nn.Linear(in_features=16*3*3, out_features=256)
        self.fc2 = nn.Linear(in_features=256, out_features=64)
        self.fc3 = nn.Linear(in_features=64, out_features=32)
        self.fc4 = nn.Linear(in_features=32, out_features=1)
        self.sigmoid = nn.Sigmoid()

        
        
    def forward(self, x):
        x = self.conv(x)
        x = self.bn(x)
        x = F.relu(x)
        x = F.max_pool2d(x, 1, 1)
        x = x.view(x.shape[0], -1)
        x = self.fc1(x)
        x = F.relu(x)
        x = self.fc2(x)
        x = F.relu(x)
        # x = F.dropout(x, p=0.5)
        x = self.fc3(x)
        x = F.relu(x)
        x = self.fc4(x)
        x = self.sigmoid(x)
        return x
    
    def quantize(self, num_bits=8):
        self.qconv = QConvBNReLU(self.conv, self.bn, qi=True, qo=True, num_bits=num_bits)
        self.qmaxpool2d = QMaxPooling2d(kernel_size=1)
        self.qfc1 = QLinear(self.fc1, qi=False, qo=True, num_bits=num_bits)
        self.qfc2 = QLinear(self.fc2, qi=False, qo=True, num_bits=num_bits)
        self.qfc3 = QLinear(self.fc3, qi=False, qo=True, num_bits=num_bits)
        self.qfc4 = QLinear(self.fc4, qi=False, qo=True, num_bits=num_bits)
        self.qsigmoid = QSigmoid(qi=False, qo=True, num_bits=num_bits)

    def quantize_forward(self, x):
        x = self.qconv(x)
        x = self.qmaxpool2d(x)
        x = x.view(x.shape[0], -1)
        x = self.qfc1(x)
        x = self.qfc2(x)
        x = self.qfc3(x)
        x = self.qfc4(x)
        x = self.qsigmoid(x)
        return x

    def freeze(self):
        self.qconv.freeze()
        self.qmaxpool2d.freeze(self.qconv.qo)
        self.qfc1.freeze(qi=self.qconv.qo)
        self.qfc2.freeze(qi=self.qfc1.qo)
        self.qfc3.freeze(qi=self.qfc2.qo)
        self.qfc4.freeze(qi=self.qfc3.qo)

    def quantize_inference(self, x):
        qx = self.qconv.qi.quantize_tensor(x)
        qx = self.qconv.quantize_inference(qx)
        qx = self.qmaxpool2d.quantize_inference(qx)
        qx = qx.view(qx.shape[0], -1)
        qx = self.qfc1.quantize_inference(qx)
        qx = self.qfc2.quantize_inference(qx)
        qx = self.qfc3.quantize_inference(qx)
        qx = self.qfc4.quantize_inference(qx)
        
        out = self.qfc4.qo.dequantize_tensor(qx)
        return out
    
batch_size = 128
learning_rate = 0.1
device = torch.device("cuda:6" if torch.cuda.is_available() else "cpu")

train_dataset = GDdataset("./train_data.csv")
train_loader = DataLoader(train_dataset, shuffle=True, batch_size=batch_size)
test_dataset = GDdataset("./test_data.csv")
test_loader = DataLoader(test_dataset, shuffle=False, batch_size=64)

In [2]:
def inference(model):   # 全精度推理
    model.eval()
    with torch.no_grad():
        correct = 0         # 分类正确个数
        test_loss = 0
        for value, target in test_loader:
            value, target = value.to(device), target.to(device)    # 扔给GPU
            output = model(value)       # (batch_size, 1)
            predicted = (output > 0.5).float()
            correct += (predicted == target).sum().item()
            test_loss += loss_function(output, target).item()
            
        test_loss /= len(test_loader.dataset)
        
    print("\nTest: average loss: {:.4f}, test_accuracy: {}/{} ({:.0f}%)".format(test_loss, correct, len(test_loader.dataset), 100. * correct / len(test_loader.dataset)))

def qinference(model):  # 量化推理
    model.eval()
    with torch.no_grad(): 
        correct = 0         # 分类正确个数
        test_loss = 0
        for value, target in test_loader:
            value, target = value.to(device), target.to(device)    # 扔给GPU
            output = model.quantize_inference(value)       # (batch_size, 1)
            predicted = (output > 0.5).float()
            correct += (predicted == target).sum().item()
    print("\ntest_accuracy: {}/{} ({:.0f}%)".format(correct, len(test_loader.dataset), 100. * correct / len(test_loader.dataset)))
        
            
    
def direct_quantize(model, test_loader):
    for i, (value, target) in enumerate(test_loader, 1):
        value, target = value.to(device), target.to(device)    # 扔给GPU
        output = model.quantize_forward(value)
        if i % 500 == 0:
            break
    print('direct quantization finish')

In [3]:
import functools
import torch
import torch.nn.functional as F
import types
import time
import pprint
# 创建一个字典来存储调用次数
call_count = {}

def count_calls(func, module_name=None):
    @functools.wraps(func)
    def wrapper_count_calls(*args, **kwargs):
        # module_name = func.__module__ if hasattr(func, '__module__') and func.__module__ else 'torch'
        full_name = module_name + '.' + func.__name__
        # call_count[full_name] = call_count.get(full_name, 0) + 1
        # print(f"Function {full_name} called {call_count[full_name]} times")
        # return func(*args, **kwargs)
        # 记录开始时间
        start_time = time.time()
        result = func(*args, **kwargs)
        # 记录结束时间
        end_time = time.time()
        
        # 计算调用时间
        elapsed_time = end_time - start_time
        
        if full_name not in call_count:
            call_count[full_name] = {"count": 0, "total_time": 0.0}
        
        call_count[full_name]["count"] += 1
        call_count[full_name]["total_time"] += elapsed_time
        
        return result
    wrapper_count_calls._is_decorated = True
    return wrapper_count_calls

def set_new_attr(module, attr_name, attr):
    if not hasattr(attr, "_is_decorated"):
        decorated_attr = count_calls(attr, module.__name__)
        decorated_attr._is_decorated = True
        setattr(module, attr_name, decorated_attr)

# 递归封装所有的包
def auto_decorate_module(module, visited=None):
    if visited is None:
        visited = set()
    
    module_name = module.__name__
    if module_name in visited:
        return
    visited.add(module_name)
    for attr_name in dir(module):
        try:
            attr = getattr(module, attr_name)
            # if isinstance(attr, types.FunctionType):
            if isinstance(attr, types.FunctionType):
                set_new_attr(module, attr_name, attr)
                # print(f"Decorated function: {module_name}.{attr_name}")
            elif isinstance(attr, types.ModuleType) and attr.__name__.startswith('torch'):
                # print(f"Descending into module: {attr.__name__}")
                auto_decorate_module(attr, visited)
            elif isinstance(attr, type):
                # print(f"Descending into class: {attr.__name__} in {module_name}")
                auto_decorate_class(attr)
            elif callable(attr):
                set_new_attr(module, attr_name, attr)
        except AttributeError:
            continue


def auto_decorate_class(cls):
    for attr_name in dir(cls):
        # if attr_name.startswith('__') and attr_name.endswith('__'):
        #     continue  # Skip special attributes
        try:
            attr = getattr(cls, attr_name)
            if isinstance(attr, types.FunctionType):
                set_new_attr(cls, attr_name, attr)
            elif attr_name in ['__add__', '__mul__', '__sub__', '__truediv__', '__matmul__', '__pow__', '__mod__']:
                # 特殊处理运算符重载方法
                set_new_attr(cls, attr_name, attr)
        except (AttributeError, TypeError) as e:
            continue
            
model = Net().to(device)
model.load_state_dict(torch.load('qmodel.pt', map_location='cpu'))sas
model.eval()
num_bits = 4
model.quantize(num_bits=num_bits)
direct_quantize(model, train_loader)
model.freeze()
value, _ = next(iter(test_loader))
value = value.to(device)
auto_decorate_module(torch)

direct quantization finish




In [None]:
model.quantize_inference(value)
pprint.pprint(call_count)

## model.ipynb

In [3]:
import torch
import torch.nn as nn
import torch.nn.functional as F
from torchvision import datasets, transforms
import torch.utils.data
import math
from copy import deepcopy
import numpy as np
import cv2 as cv
import pandas as pd
from torch.utils.data import DataLoader, Dataset
import torch.optim as optim     # for constructing optimizer
import torchvision.models as models
from module import *
from function import *

# 1. preparing the dataset
class GDdataset(Dataset):
    def __init__(self, path):
        self.data = pd.read_csv(path)
        SFR = torch.tensor(self.data.iloc[:, 2:46].values)
        blocks = [SFR[:, i:i+4].reshape(-1, 2, 2) for i in range(0, 45, 5)] # 将36列中的每4列合并成一个2*2矩阵，得到9个块
        
        self.value = torch.cat([torch.cat(blocks[i:i+3], dim=2) for i in range(0, 9, 3)], dim=1) # 将9个块按3*3的方式拼成一个大矩阵
        self.value = self.value.unsqueeze(1).to(torch.float32)
        
        self.target = torch.tensor([1.0 if x == 'OK' else 0.0 for x in self.data.iloc[:, 48].values])
        self.target = self.target.unsqueeze(1)
        
    def __getitem__(self, index):
        return self.value[index], self.target[index]
        
        
    def __len__(self):
        return len(self.data)
    
from module import *
# 2. define the model
class Net(nn.Module):
    def __init__(self):
        super(Net, self).__init__()
        
        self.conv = nn.Conv2d(in_channels=1, out_channels=16, kernel_size=2, stride=2)
        self.bn = nn.BatchNorm2d(num_features=16)
        self.fc1 = nn.Linear(in_features=16*3*3, out_features=256)
        self.fc2 = nn.Linear(in_features=256, out_features=64)
        self.fc3 = nn.Linear(in_features=64, out_features=32)
        self.fc4 = nn.Linear(in_features=32, out_features=1)
        self.sigmoid = nn.Sigmoid()

        
        
    def forward(self, x):
        x = self.conv(x)
        x = self.bn(x)
        x = F.relu(x)
        x = F.max_pool2d(x, 1, 1)
        x = x.view(x.shape[0], -1)
        x = self.fc1(x)
        x = F.relu(x)
        x = self.fc2(x)
        x = F.relu(x)
        # x = F.dropout(x, p=0.5)
        x = self.fc3(x)
        x = F.relu(x)
        x = self.fc4(x)
        x = self.sigmoid(x)
        return x
    
    def quantize(self, num_bits=8):
        self.qconv = QConvBNReLU(self.conv, self.bn, qi=True, qo=True, num_bits=num_bits)
        self.qmaxpool2d = QMaxPooling2d(kernel_size=1)
        self.qfc1 = QLinear(self.fc1, qi=False, qo=True, num_bits=num_bits)
        self.qfc2 = QLinear(self.fc2, qi=False, qo=True, num_bits=num_bits)
        self.qfc3 = QLinear(self.fc3, qi=False, qo=True, num_bits=num_bits)
        self.qfc4 = QLinear(self.fc4, qi=False, qo=True, num_bits=num_bits)
        self.qsigmoid = QSigmoid(qi=False, qo=True, num_bits=num_bits)

    def quantize_forward(self, x):
        x = self.qconv(x)
        x = self.qmaxpool2d(x)
        x = x.view(x.shape[0], -1)
        x = self.qfc1(x)
        x = self.qfc2(x)
        x = self.qfc3(x)
        x = self.qfc4(x)
        x = self.qsigmoid(x)
        return x

    def freeze(self):
        self.qconv.freeze()
        self.qmaxpool2d.freeze(self.qconv.qo)
        self.qfc1.freeze(qi=self.qconv.qo)
        self.qfc2.freeze(qi=self.qfc1.qo)
        self.qfc3.freeze(qi=self.qfc2.qo)
        self.qfc4.freeze(qi=self.qfc3.qo)

    def quantize_inference(self, x):
        qx = self.qconv.qi.quantize_tensor(x)
        qx = self.qconv.quantize_inference(qx)
        qx = self.qmaxpool2d.quantize_inference(qx)
        qx = qx.view(qx.shape[0], -1)
        qx = self.qfc1.quantize_inference(qx)
        qx = self.qfc2.quantize_inference(qx)
        qx = self.qfc3.quantize_inference(qx)
        qx = self.qfc4.quantize_inference(qx)
        
        out = self.qfc4.qo.dequantize_tensor(qx)
        return out
    
batch_size = 128
learning_rate = 0.1
device = torch.device("cuda:0" if torch.cuda.is_available() else "cpu")

train_dataset = GDdataset("./train_data.csv")
train_loader = DataLoader(train_dataset, shuffle=True, batch_size=batch_size)
test_dataset = GDdataset("./test_data.csv")
test_loader = DataLoader(test_dataset, shuffle=False, batch_size=batch_size)

In [4]:
model = Net().to(device)
value, _ = next(iter(test_loader))
model(value.to(device))

torch.Size([128, 144])


tensor([[0.4950],
        [0.4941],
        [0.4928],
        [0.4835],
        [0.4927],
        [0.4919],
        [0.4939],
        [0.4883],
        [0.4921],
        [0.4839],
        [0.4934],
        [0.4975],
        [0.4917],
        [0.4992],
        [0.4958],
        [0.4949],
        [0.4932],
        [0.4919],
        [0.4951],
        [0.4915],
        [0.4923],
        [0.4952],
        [0.4876],
        [0.4935],
        [0.4937],
        [0.4928],
        [0.4855],
        [0.4946],
        [0.4960],
        [0.4909],
        [0.4970],
        [0.4928],
        [0.4920],
        [0.4900],
        [0.4851],
        [0.4882],
        [0.4925],
        [0.4935],
        [0.4934],
        [0.4925],
        [0.4921],
        [0.4928],
        [0.4987],
        [0.4938],
        [0.4947],
        [0.4942],
        [0.4930],
        [0.4925],
        [0.4885],
        [0.4926],
        [0.4867],
        [0.4893],
        [0.4892],
        [0.4945],
        [0.4915],
        [0

In [None]:
import functools
import torch
import torch.nn.functional as F
import types
import time
import pprint
# 创建一个字典来存储调用次数
call_count = {}

def count_calls(func, module_name=None):
    @functools.wraps(func)
    def wrapper_count_calls(*args, **kwargs):
        # module_name = func.__module__ if hasattr(func, '__module__') and func.__module__ else 'torch'
        full_name = module_name + '.' + func.__name__
        # call_count[full_name] = call_count.get(full_name, 0) + 1
        # print(f"Function {full_name} called {call_count[full_name]} times")
        # return func(*args, **kwargs)
        # 记录开始时间
        start_time = time.time()
        result = func(*args, **kwargs)
        # 记录结束时间
        end_time = time.time()
        
        # 计算调用时间
        elapsed_time = end_time - start_time
        
        if full_name not in call_count:
            call_count[full_name] = {"count": 0, "total_time": 0.0}
        
        call_count[full_name]["count"] += 1
        call_count[full_name]["total_time"] += elapsed_time
        
        return result
    wrapper_count_calls._is_decorated = True
    return wrapper_count_calls

def set_new_attr(module, attr_name, attr):
    if not hasattr(attr, "_is_decorated"):
        decorated_attr = count_calls(attr, module.__name__)
        decorated_attr._is_decorated = True
        setattr(module, attr_name, decorated_attr)

# 递归封装所有的包
def auto_decorate_module(module, visited=None):
    if visited is None:
        visited = set()
    
    module_name = module.__name__
    if module_name in visited:
        return
    visited.add(module_name)
    for attr_name in dir(module):
        try:
            attr = getattr(module, attr_name)
            # if isinstance(attr, types.FunctionType):
            if isinstance(attr, types.FunctionType):
                set_new_attr(module, attr_name, attr)
                # print(f"Decorated function: {module_name}.{attr_name}")
            elif isinstance(attr, types.ModuleType) and attr.__name__.startswith('torch'):
                # print(f"Descending into module: {attr.__name__}")
                auto_decorate_module(attr, visited)
            elif isinstance(attr, type):
                # print(f"Descending into class: {attr.__name__} in {module_name}")
                auto_decorate_class(attr)
            elif callable(attr):
                set_new_attr(module, attr_name, attr)
        except AttributeError:
            continue


def auto_decorate_class(cls):
    for attr_name in dir(cls):
        # if attr_name.startswith('__') and attr_name.endswith('__'):
        #     continue  # Skip special attributes
        try:
            attr = getattr(cls, attr_name)
            if isinstance(attr, types.FunctionType):
                set_new_attr(cls, attr_name, attr)
            elif attr_name in ['__add__', '__mul__', '__sub__', '__truediv__', '__matmul__', '__pow__', '__mod__']:
                # 特殊处理运算符重载方法
                set_new_attr(cls, attr_name, attr)
        except (AttributeError, TypeError) as e:
            continue
            
model = Net().to(device)
model.load_state_dict(torch.load('model.pt', map_location='cpu'))
model.eval()
value, _ = next(iter(test_loader))

auto_decorate_module(torch)

In [None]:
model(value)
pprint.pprint(call_count)

## result

### model

In [None]:
{'ABCMeta.__instancecheck__': {'count': 11,
                               'total_time': 2.1457672119140625e-05},
 'BatchNorm2d.__getattr__': {'count': 5, 'total_time': 4.5299530029296875e-06},
 'BatchNorm2d._call_impl': {'count': 1, 'total_time': 0.00021457672119140625},
 'BatchNorm2d._check_input_dim': {'count': 1,
                                  'total_time': 4.291534423828125e-06},
 'BatchNorm2d._wrapped_call_impl': {'count': 1,
                                    'total_time': 0.00021958351135253906},
 'BatchNorm2d.forward': {'count': 1, 'total_time': 0.00020074844360351562},
 'ContextProp.__get__': {'count': 1, 'total_time': 4.76837158203125e-06},
 'Conv2d.__getattr__': {'count': 2, 'total_time': 1.6689300537109375e-06},
 'Conv2d._call_impl': {'count': 1, 'total_time': 0.0011267662048339844},
 'Conv2d._conv_forward': {'count': 1, 'total_time': 0.0011107921600341797},
 'Conv2d._wrapped_call_impl': {'count': 1, 'total_time': 0.0011310577392578125},
 'Conv2d.forward': {'count': 1, 'total_time': 0.0011196136474609375},
 'FunctionMeta.__getattribute__': {'count': 694,
                                   'total_time': 0.0006601810455322266},
 'FunctionMeta.__setattr__': {'count': 17,
                              'total_time': 1.5020370483398438e-05},
 'Linear.__getattr__': {'count': 8, 'total_time': 6.9141387939453125e-06},
 'Linear._call_impl': {'count': 4, 'total_time': 0.0005695819854736328},
 'Linear._wrapped_call_impl': {'count': 4, 'total_time': 0.0005776882171630859},
 'Linear.forward': {'count': 4, 'total_time': 0.00054168701171875},
 'Mapping.get': {'count': 2, 'total_time': 1.621246337890625e-05},
 'Module.__getattr__': {'count': 7, 'total_time': 1.2159347534179688e-05},
 'Module._call_impl': {'count': 2, 'total_time': 0.0024499893188476562},
 'Module._wrapped_call_impl': {'count': 2, 'total_time': 0.002457857131958008},
 'OpOverloadPacket.__getattr__': {'count': 1031,
                                  'total_time': 0.0009160041809082031},
 'OpOverloadPacket.__str__': {'count': 2062,
                              'total_time': 0.0016379356384277344},
 'Queue._get': {'count': 3, 'total_time': 2.384185791015625e-06},
 'Queue._put': {'count': 3, 'total_time': 2.384185791015625e-06},
 'Queue._qsize': {'count': 78, 'total_time': 3.0040740966796875e-05},
 'Queue.empty': {'count': 75, 'total_time': 0.00018930435180664062},
 'Queue.get': {'count': 3, 'total_time': 1.6450881958007812e-05},
 'Queue.put': {'count': 3, 'total_time': 2.4318695068359375e-05},
 'Sigmoid.forward': {'count': 1, 'total_time': 6.079673767089844e-05},
 'Tensor.__format__': {'count': 64, 'total_time': 0.0001811981201171875},
 'Tensor.__iter__': {'count': 2, 'total_time': 0.00019240379333496094},
 'Tensor.__repr__': {'count': 1, 'total_time': 0.0019884109497070312},
 'Tensor.__truediv__': {'count': 1, 'total_time': 3.6716461181640625e-05},
 'UnpackedDualTensor.__new__': {'count': 1,
                                'total_time': 1.430511474609375e-06},
 '_ClassPropertyDescriptor.__get__': {'count': 36,
                                      'total_time': 6.818771362304688e-05},
 '_Formatter.__init__': {'count': 1, 'total_time': 0.0010960102081298828},
 '_Formatter.format': {'count': 64, 'total_time': 9.608268737792969e-05},
 '_Formatter.width': {'count': 64, 'total_time': 1.3828277587890625e-05},
 '_GenericAlias.__getattr__': {'count': 14,
                               'total_time': 6.4373016357421875e-06},
 '_GenericAlias.__hash__': {'count': 70, 'total_time': 6.008148193359375e-05},
 '_ModeStackStateForPreDispatch.count': {'count': 1,
                                         'total_time': 1.9073486328125e-06},
 '_NoParamDecoratorContextManager.__new__': {'count': 2,
                                             'total_time': 3.0994415283203125e-06},
 '_ParameterMeta.__instancecheck__': {'count': 1,
                                      'total_time': 3.0994415283203125e-06},
 '_lazy_property_and_property.__init__': {'count': 8,
                                          'total_time': 5.0067901611328125e-06},
 '_reduce_op.__getattribute__': {'count': 6, 'total_time': 0.00070953369140625},
 'cached_property.__get__': {'count': 13, 'total_time': 6.198883056640625e-06},
 'lazy_property.__get__': {'count': 64, 'total_time': 5.3882598876953125e-05},
 'no_grad.__enter__': {'count': 2, 'total_time': 2.3365020751953125e-05},
 'no_grad.__exit__': {'count': 2, 'total_time': 1.2874603271484375e-05},
 'no_grad.__init__': {'count': 2, 'total_time': 9.5367431640625e-06},
 'set_grad_enabled.__init__': {'count': 4, 'total_time': 2.002716064453125e-05},
 'torch._C._functorch.is_functorch_wrapped_tensor': {'count': 1,
                                                     'total_time': 2.6226043701171875e-06},
 'torch._C._get_default_device': {'count': 1,
                                  'total_time': 1.6689300537109375e-06},
 'torch._C._get_tracing_state': {'count': 10,
                                 'total_time': 2.2172927856445312e-05},
 'torch._C._set_grad_enabled': {'count': 4,
                                'total_time': 4.5299530029296875e-06},
 'torch._is_functional_tensor': {'count': 1, 'total_time': 5.7220458984375e-06},
 'torch._jit_internal.is_scripting': {'count': 2,
                                      'total_time': 1.430511474609375e-06},
 'torch._ops._len_torch_dispatch_stack_pre_dispatch': {'count': 1,
                                                       'total_time': 8.106231689453125e-06},
 'torch._ops.mode_stack_state_for_pre_dispatch': {'count': 1,
                                                  'total_time': 9.5367431640625e-07},
 'torch._tensor._has_torch_function_unary': {'count': 65,
                                             'total_time': 1.5974044799804688e-05},
 'torch._tensor_str._add_suffixes': {'count': 1,
                                     'total_time': 3.5762786865234375e-06},
 'torch._tensor_str._str': {'count': 1, 'total_time': 0.0019779205322265625},
 'torch._tensor_str._str_intern': {'count': 1,
                                   'total_time': 0.0018842220306396484},
 'torch._tensor_str._tensor_str': {'count': 1,
                                   'total_time': 0.001772165298461914},
 'torch._tensor_str._tensor_str_with_formatter': {'count': 65,
                                                  'total_time': 0.0011162757873535156},
 'torch._tensor_str._vector_str': {'count': 64,
                                   'total_time': 0.00038051605224609375},
 'torch._tensor_str.tensor_totype': {'count': 3,
                                     'total_time': 1.9073486328125e-05},
 'torch.autograd.forward_ad.unpack_dual': {'count': 1,
                                           'total_time': 5.4836273193359375e-06},
 'torch.autograd.function._warn_traceable_deprecated': {'count': 6,
                                                        'total_time': 0.0002455711364746094},
 'torch.batch_norm': {'count': 1, 'total_time': 0.00015163421630859375},
 'torch.ceil': {'count': 1, 'total_time': 2.9325485229492188e-05},
 'torch.get_default_dtype': {'count': 2, 'total_time': 1.430511474609375e-06},
 'torch.is_grad_enabled': {'count': 6, 'total_time': 2.384185791015625e-06},
 'torch.isfinite': {'count': 1, 'total_time': 0.0001468658447265625},
 'torch.masked_select': {'count': 1, 'total_time': 9.846687316894531e-05},
 'torch.max_pool2d': {'count': 1, 'total_time': 0.00012755393981933594},
 'torch.nn.functional._has_torch_function_unary': {'count': 5,
                                                   'total_time': 1.9073486328125e-06},
 'torch.nn.functional._has_torch_function_variadic': {'count': 1,
                                                      'total_time': 7.152557373046875e-07},
 'torch.nn.functional.batch_norm': {'count': 1,
                                    'total_time': 0.0001735687255859375},
 'torch.nn.functional.conv2d': {'count': 1,
                                'total_time': 0.0010974407196044922},
 'torch.nn.functional.linear': {'count': 4,
                                'total_time': 0.0005085468292236328},
 'torch.nn.functional.max_pool2d': {'count': 1,
                                    'total_time': 0.00013709068298339844},
 'torch.nn.functional.relu': {'count': 4, 'total_time': 0.0001201629638671875},
 'torch.relu': {'count': 4, 'total_time': 0.00010156631469726562},
 'torch.sigmoid': {'count': 1, 'total_time': 5.650520324707031e-05},
 'torch.utils._python_dispatch._disable_current_modes': {'count': 1,
                                                         'total_time': 3.337860107421875e-06},
 'torch.utils._python_dispatch._len_torch_dispatch_stack': {'count': 1,
                                                            'total_time': 7.152557373046875e-07}}

### sas

In [None]:
{'BN.__call__': {'count': 64, 'total_time': 0.0003211498260498047},
 'BN._assert_inputs': {'count': 64, 'total_time': 1.8596649169921875e-05},
 'Conv2d.__call__': {'count': 64, 'total_time': 0.0007078647613525391},
 'Conv2d._assert_inputs': {'count': 64, 'total_time': 2.2172927856445312e-05},
 'Dense.__call__': {'count': 320, 'total_time': 0.0014071464538574219},
 'Dense._assert_inputs': {'count': 320, 'total_time': 7.319450378417969e-05},
 'InputLayer.__init__': {'count': 64, 'total_time': 0.0007097721099853516},
 'Layer.__call__': {'count': 256, 'total_time': 0.0012547969818115234},
 'Layer.__init__': {'count': 64, 'total_time': 7.200241088867188e-05},
 'Layer._assert_inputs': {'count': 128, 'total_time': 3.0517578125e-05},
 'Node.__init__': {'count': 576, 'total_time': 0.00013875961303710938},
 'Tensor.__init__': {'count': 704, 'total_time': 0.00018477439880371094},
 'dlpy.layers._unpack_config': {'count': 64,
                                'total_time': 0.00021314620971679688},
 'dlpy.layers.get_color': {'count': 64, 'total_time': 5.412101745605469e-05}}

### qmodel

In [None]:
{'ABCMeta.__instancecheck__': {'count': 23,
                               'total_time': 0.00011324882507324219},
 'ABCMeta.__subclasscheck__': {'count': 31,
                               'total_time': 0.00019693374633789062},
 'Conv2d.__getattr__': {'count': 2, 'total_time': 1.430511474609375e-06},
 'Conv2d._call_impl': {'count': 1, 'total_time': 0.0004940032958984375},
 'Conv2d._conv_forward': {'count': 1, 'total_time': 0.00046372413635253906},
 'Conv2d._wrapped_call_impl': {'count': 1, 'total_time': 0.0004985332489013672},
 'Conv2d.forward': {'count': 1, 'total_time': 0.0004734992980957031},
 'FunctionMeta.__getattribute__': {'count': 694,
                                   'total_time': 0.0006117820739746094},
 'FunctionMeta.__setattr__': {'count': 17,
                              'total_time': 1.3828277587890625e-05},
 'Linear.__getattr__': {'count': 8, 'total_time': 6.4373016357421875e-06},
 'Linear._call_impl': {'count': 4, 'total_time': 0.00022482872009277344},
 'Linear._wrapped_call_impl': {'count': 4,
                               'total_time': 0.00023365020751953125},
 'Linear.forward': {'count': 4, 'total_time': 0.0001964569091796875},
 'Mapping.get': {'count': 2, 'total_time': 1.6450881958007812e-05},
 'Module.__getattr__': {'count': 44, 'total_time': 5.269050598144531e-05},
 'OpOverloadPacket.__getattr__': {'count': 1031,
                                  'total_time': 0.0009312629699707031},
 'OpOverloadPacket.__str__': {'count': 2062, 'total_time': 0.09182119369506836},
 'Queue._get': {'count': 4, 'total_time': 2.384185791015625e-06},
 'Queue._put': {'count': 4, 'total_time': 3.337860107421875e-06},
 'Queue._qsize': {'count': 129, 'total_time': 4.601478576660156e-05},
 'Queue.empty': {'count': 125, 'total_time': 0.00028228759765625},
 'Queue.get': {'count': 4, 'total_time': 2.1696090698242188e-05},
 'Queue.put': {'count': 4, 'total_time': 2.956390380859375e-05},
 'Tensor.__add__': {'count': 6, 'total_time': 6.175041198730469e-05},
 'Tensor.__format__': {'count': 64, 'total_time': 0.0001678466796875},
 'Tensor.__iter__': {'count': 2, 'total_time': 0.00019860267639160156},
 'Tensor.__mul__': {'count': 6, 'total_time': 6.914138793945312e-05},
 'Tensor.__repr__': {'count': 1, 'total_time': 0.001844167709350586},
 'Tensor.__sub__': {'count': 6, 'total_time': 5.7220458984375e-05},
 'Tensor.__truediv__': {'count': 2, 'total_time': 0.00017309188842773438},
 'UnpackedDualTensor.__new__': {'count': 1,
                                'total_time': 1.430511474609375e-06},
 '_ClassPropertyDescriptor.__get__': {'count': 36,
                                      'total_time': 6.341934204101562e-05},
 '_Formatter.__init__': {'count': 1, 'total_time': 0.0009264945983886719},
 '_Formatter.format': {'count': 64, 'total_time': 7.081031799316406e-05},
 '_Formatter.width': {'count': 64, 'total_time': 1.7642974853515625e-05},
 '_GenericAlias.__getattr__': {'count': 14,
                               'total_time': 6.4373016357421875e-06},
 '_GenericAlias.__hash__': {'count': 112, 'total_time': 5.698204040527344e-05},
 '_ModeStackStateForPreDispatch.count': {'count': 1,
                                         'total_time': 2.1457672119140625e-06},
 '_NoParamDecoratorContextManager.__new__': {'count': 2,
                                             'total_time': 2.6226043701171875e-06},
 '_ParameterMeta.__instancecheck__': {'count': 1,
                                      'total_time': 3.337860107421875e-06},
 '_lazy_property_and_property.__init__': {'count': 8,
                                          'total_time': 3.337860107421875e-06},
 '_reduce_op.__getattribute__': {'count': 6,
                                 'total_time': 0.0006513595581054688},
 'cached_property.__get__': {'count': 13, 'total_time': 8.106231689453125e-06},
 'lazy_property.__get__': {'count': 64, 'total_time': 4.673004150390625e-05},
 'no_grad.__enter__': {'count': 2, 'total_time': 2.384185791015625e-05},
 'no_grad.__exit__': {'count': 2, 'total_time': 1.239776611328125e-05},
 'no_grad.__init__': {'count': 2, 'total_time': 9.775161743164062e-06},
 'set_grad_enabled.__init__': {'count': 4,
                               'total_time': 2.1457672119140625e-05},
 'torch._C._functorch.is_functorch_wrapped_tensor': {'count': 1,
                                                     'total_time': 2.86102294921875e-06},
 'torch._C._get_default_device': {'count': 1,
                                  'total_time': 1.9073486328125e-06},
 'torch._C._get_tracing_state': {'count': 7,
                                 'total_time': 1.5735626220703125e-05},
 'torch._C._set_grad_enabled': {'count': 4,
                                'total_time': 5.245208740234375e-06},
 'torch._is_functional_tensor': {'count': 1,
                                 'total_time': 9.775161743164062e-06},
 'torch._jit_internal.is_scripting': {'count': 2,
                                      'total_time': 1.430511474609375e-06},
 'torch._ops._len_torch_dispatch_stack_pre_dispatch': {'count': 1,
                                                       'total_time': 8.106231689453125e-06},
 'torch._ops.mode_stack_state_for_pre_dispatch': {'count': 1,
                                                  'total_time': 9.5367431640625e-07},
 'torch._tensor._has_torch_function_unary': {'count': 65,
                                             'total_time': 1.7404556274414062e-05},
 'torch._tensor_str._add_suffixes': {'count': 1,
                                     'total_time': 4.0531158447265625e-06},
 'torch._tensor_str._str': {'count': 1, 'total_time': 0.0018336772918701172},
 'torch._tensor_str._str_intern': {'count': 1,
                                   'total_time': 0.0017368793487548828},
 'torch._tensor_str._tensor_str': {'count': 1,
                                   'total_time': 0.001621246337890625},
 'torch._tensor_str._tensor_str_with_formatter': {'count': 65,
                                                  'total_time': 0.0011370182037353516},
 'torch._tensor_str._vector_str': {'count': 64,
                                   'total_time': 0.00037741661071777344},
 'torch._tensor_str.tensor_totype': {'count': 3,
                                     'total_time': 1.7642974853515625e-05},
 'torch.autograd.forward_ad.unpack_dual': {'count': 1,
                                           'total_time': 4.76837158203125e-06},
 'torch.autograd.function._warn_traceable_deprecated': {'count': 6,
                                                        'total_time': 0.0002243518829345703},
 'torch.ceil': {'count': 1, 'total_time': 2.3126602172851562e-05},
 'torch.get_default_dtype': {'count': 2, 'total_time': 1.6689300537109375e-06},
 'torch.is_grad_enabled': {'count': 6, 'total_time': 2.86102294921875e-06},
 'torch.isfinite': {'count': 1, 'total_time': 0.00011134147644042969},
 'torch.masked_select': {'count': 1, 'total_time': 8.20159912109375e-05},
 'torch.max_pool2d': {'count': 1, 'total_time': 3.910064697265625e-05},
 'torch.nn.functional._has_torch_function_unary': {'count': 1,
                                                   'total_time': 4.76837158203125e-07},
 'torch.nn.functional.conv2d': {'count': 1,
                                'total_time': 0.00045418739318847656},
 'torch.nn.functional.linear': {'count': 4,
                                'total_time': 0.00016880035400390625},
 'torch.nn.functional.max_pool2d': {'count': 1,
                                    'total_time': 4.982948303222656e-05},
 'torch.utils._python_dispatch._disable_current_modes': {'count': 1,
                                                         'total_time': 2.86102294921875e-06},
 'torch.utils._python_dispatch._len_torch_dispatch_stack': {'count': 1,
                                                            'total_time': 9.5367431640625e-07}}