In [1]:
import torch
import torch.nn as nn
import torch.nn.functional as F
from torchvision import datasets, transforms
import torch.utils.data
import math
from copy import deepcopy
import numpy as np
import cv2 as cv
import pandas as pd
from torch.utils.data import DataLoader, Dataset
import torch.optim as optim     # for constructing optimizer
import torchvision.models as models
from module import *
from function import *

# 1. preparing the dataset
class GDdataset(Dataset):
    def __init__(self, path):
        self.data = pd.read_csv(path)
        SFR = torch.tensor(self.data.iloc[:, 2:46].values)
        blocks = [SFR[:, i:i+4].reshape(-1, 2, 2) for i in range(0, 45, 5)] # 将36列中的每4列合并成一个2*2矩阵，得到9个块
        
        self.value = torch.cat([torch.cat(blocks[i:i+3], dim=2) for i in range(0, 9, 3)], dim=1) # 将9个块按3*3的方式拼成一个大矩阵
        self.value = self.value.unsqueeze(1).to(torch.float32)
        
        self.target = torch.tensor([1.0 if x == 'OK' else 0.0 for x in self.data.iloc[:, 48].values])
        self.target = self.target.unsqueeze(1)
        
    def __getitem__(self, index):
        return self.value[index], self.target[index]
        
        
    def __len__(self):
        return len(self.data)

In [2]:
from module import *
# 2. define the model
class Net(nn.Module):
    def __init__(self):
        super(Net, self).__init__()
        
        self.conv = nn.Conv2d(in_channels=1, out_channels=16, kernel_size=2, stride=2)
        self.bn = nn.BatchNorm2d(num_features=16)
        self.fc1 = nn.Linear(in_features=16*3*3, out_features=256)
        self.fc2 = nn.Linear(in_features=256, out_features=64)
        self.fc3 = nn.Linear(in_features=64, out_features=32)
        self.fc4 = nn.Linear(in_features=32, out_features=1)
        self.sigmoid = nn.Sigmoid()

        
        
    def forward(self, x):
        x = self.conv(x)
        x = self.bn(x)
        x = F.relu(x)
        x = F.max_pool2d(x, 1, 1)
        x = x.view(x.shape[0], -1)
        x = self.fc1(x)
        x = F.relu(x)
        x = self.fc2(x)
        x = F.relu(x)
        # x = F.dropout(x, p=0.5)
        x = self.fc3(x)
        x = F.relu(x)
        x = self.fc4(x)
        x = self.sigmoid(x)
        return x
    
    def quantize(self, num_bits=8):
        self.qconv = QConvBNReLU(self.conv, self.bn, qi=True, qo=True, num_bits=num_bits)
        self.qmaxpool2d = QMaxPooling2d(kernel_size=1)
        self.qfc1 = QLinear(self.fc1, qi=False, qo=True, num_bits=num_bits)
        self.qfc2 = QLinear(self.fc2, qi=False, qo=True, num_bits=num_bits)
        self.qfc3 = QLinear(self.fc3, qi=False, qo=True, num_bits=num_bits)
        self.qfc4 = QLinear(self.fc4, qi=False, qo=True, num_bits=num_bits)
        self.qsigmoid = QSigmoid(qi=False, qo=True, num_bits=num_bits)

    def quantize_forward(self, x):
        x = self.qconv(x)
        x = self.qmaxpool2d(x)
        x = x.view(x.shape[0], -1)
        x = self.qfc1(x)
        x = self.qfc2(x)
        x = self.qfc3(x)
        x = self.qfc4(x)
        x = self.qsigmoid(x)
        return x

    def freeze(self):
        self.qconv.freeze()
        self.qmaxpool2d.freeze(self.qconv.qo)
        self.qfc1.freeze(qi=self.qconv.qo)
        self.qfc2.freeze(qi=self.qfc1.qo)
        self.qfc3.freeze(qi=self.qfc2.qo)
        self.qfc4.freeze(qi=self.qfc3.qo)

    def quantize_inference(self, x):
        qx = self.qconv.qi.quantize_tensor(x)
        qx = self.qconv.quantize_inference(qx)
        qx = self.qmaxpool2d.quantize_inference(qx)
        qx = qx.view(qx.shape[0], -1)
        qx = self.qfc1.quantize_inference(qx)
        qx = self.qfc2.quantize_inference(qx)
        qx = self.qfc3.quantize_inference(qx)
        qx = self.qfc4.quantize_inference(qx)
        
        out = self.qfc4.qo.dequantize_tensor(qx)
        return out
    
batch_size = 128
learning_rate = 0.1
device = torch.device("cuda:0" if torch.cuda.is_available() else "cpu")

train_dataset = GDdataset("./train_data.csv")
train_loader = DataLoader(train_dataset, shuffle=True, batch_size=batch_size)
test_dataset = GDdataset("./test_data.csv")
test_loader = DataLoader(test_dataset, shuffle=False, batch_size=batch_size)

In [3]:
from torch.utils.tensorboard import SummaryWriter   
tb = SummaryWriter()

model = Net()
model.to(device)

# 3. Construct Loss and Optimizer
loss_function = nn.BCELoss()
optimizer = optim.SGD(model.parameters(), lr = learning_rate)


# 4. define training cycle
def train(epoch):
    model.train()
    total_loss = 0
    total_correct = 0
    for batch_idx, (value, target) in enumerate(train_loader):
        value, target = value.to(device), target.to(device)    # 扔给GPU
        optimizer.zero_grad()
        # forward + backward + update
        output = model(value)
        loss = loss_function(output, target)
        loss.backward()
        optimizer.step()

        predicted = (output > 0.5).float()
        total_correct += (predicted == target).sum().item()
        total_loss += loss.item()
        
        progress = math.ceil(batch_idx / len(train_loader) * 50)
        print("\rTrain epoch %d: %d/%d, [%-51s] % d%%" % (epoch, len(train_dataset), len(train_loader.dataset), '-' * progress + '>', progress * 2), end="")
        
    # 输出每轮的loss
    # print("\n\n[epoch %d] loss: %.3f train_accuracy: %d / %d=%.3f" % (epoch+1, running_loss, total_correct, len(train_dataset), total_correct / len(train_dataset)))
    
    tb.add_scalar("Loss", total_loss, epoch)              # scalar标量，即添加一个数字
    tb.add_scalar("Number Correct", total_correct, epoch)
    tb.add_scalar("Accuracy", total_correct / len(train_dataset), epoch)
    
def test(epoch):
    model.eval()
    with torch.no_grad(): 
        correct = 0         # 分类正确个数
        # total = 0           # 总数
        test_loss = 0
        for value, target in test_loader:
            value, target = value.to(device), target.to(device)    # 扔给GPU
            output = model(value)       # (batch_size, 1)
            predicted = (output > 0.5).float()
            # total += target.size(0)     # 加batch_size
            correct += (predicted == target).sum().item()
            test_loss += loss_function(output, target).item()
            
        test_loss /= len(test_loader.dataset)
        
        print("\nTest: average loss: {:.4f}, test_accuracy: {}/{} ({:.0f}%)".format(test_loss, correct, len(test_loader.dataset), 100. * correct / len(test_loader.dataset)))
        tb.add_scalar("Test_accuracy", correct / len(test_loader.dataset), epoch)
        
    # print("Accuracy on test set: %.1f%%" % (100 * correct / len(test_loader.dataset)))

## Pre-train

In [5]:
for epoch in range(30):
    train(epoch)
    test(epoch)
torch.save(model.state_dict(), "model1.pt")

Train epoch 0: 8773/8773, [-------------------------------------------------->]  100%
Test: average loss: 0.0010, test_accuracy: 2086/2194 (95%)
Train epoch 1: 8773/8773, [-------------------------------------------------->]  100%
Test: average loss: 0.0010, test_accuracy: 2097/2194 (96%)
Train epoch 2: 8773/8773, [-------------------------------------------------->]  100%
Test: average loss: 0.0010, test_accuracy: 2105/2194 (96%)
Train epoch 3: 8773/8773, [-------------------------------------------------->]  100%
Test: average loss: 0.0010, test_accuracy: 2119/2194 (97%)
Train epoch 4: 8773/8773, [-------------------------------------------------->]  100%
Test: average loss: 0.0010, test_accuracy: 2118/2194 (97%)
Train epoch 5: 8773/8773, [-------------------------------------------------->]  100%
Test: average loss: 0.0010, test_accuracy: 2123/2194 (97%)
Train epoch 6: 8773/8773, [-------------------------------------------------->]  100%
Test: average loss: 0.0011, test_accuracy: 2

## Post training quantization

In [6]:
def inference(model):   # 全精度推理
    model.eval()
    with torch.no_grad():
        correct = 0         # 分类正确个数
        test_loss = 0
        for value, target in test_loader:
            value, target = value.to(device), target.to(device)    # 扔给GPU
            output = model(value)       # (batch_size, 1)
            predicted = (output > 0.5).float()
            correct += (predicted == target).sum().item()
            test_loss += loss_function(output, target).item()
            
        test_loss /= len(test_loader.dataset)
        
    print("\nTest: average loss: {:.4f}, test_accuracy: {}/{} ({:.0f}%)".format(test_loss, correct, len(test_loader.dataset), 100. * correct / len(test_loader.dataset)))

def qinference(model):  # 量化推理
    model.eval()
    with torch.no_grad(): 
        correct = 0         # 分类正确个数
        test_loss = 0
        for value, target in test_loader:
            value, target = value.to(device), target.to(device)    # 扔给GPU
            output = model.quantize_inference(value)       # (batch_size, 1)
            predicted = (output > 0.5).float()
            correct += (predicted == target).sum().item()
    print("\ntest_accuracy: {}/{} ({:.0f}%)".format(correct, len(test_loader.dataset), 100. * correct / len(test_loader.dataset)))
        
            
    
def direct_quantize(model, test_loader):
    for i, (value, target) in enumerate(test_loader, 1):
        value, target = value.to(device), target.to(device)    # 扔给GPU
        output = model.quantize_forward(value)
        if i % 500 == 0:
            break
    print('direct quantization finish')

In [7]:
model = Net().to(device)
model.load_state_dict(torch.load('model1.pt', map_location='cpu'))
save_file = "model1.pt"

loss_function = nn.BCELoss()
optimizer = optim.SGD(model.parameters(), lr = learning_rate)

In [8]:
inference(model)


Test: average loss: 0.0009, test_accuracy: 2124/2194 (97%)


In [9]:
num_bits = 8
model.quantize(num_bits=num_bits)
direct_quantize(model, train_loader)
# torch.save(model.state_dict(), save_file)
model.freeze()
qinference(model)

direct quantization finish

test_accuracy: 1935/2194 (88%)


## Profiler

### Before

In [4]:
import torch.profiler
from tqdm import tqdm
model = Net().to(device)
model.load_state_dict(torch.load('model1.pt', map_location='cpu'))
model.eval()

profiler = torch.profiler.profile(
    schedule=torch.profiler.schedule(wait=2, warmup=2, active=6, repeat=1),
    on_trace_ready=torch.profiler.tensorboard_trace_handler(dir_name='./performance/'),
    activities=[
        torch.profiler.ProfilerActivity.CPU,
        torch.profiler.ProfilerActivity.CUDA
    ],
    with_stack=True,
)
profiler.start()
with torch.no_grad():
    # for value, target in test_loader:
    #     value, target = value.to(device), target.to(device)    # 扔给GPU
    #     output = model(value)       # (batch_size, 1)
        # predicted = (output > 0.5).float()
        # # total += target.size(0)     # 加batch_size
        # correct += (predicted == target).sum().item()
        # test_loss += loss_function(output, target).item()
    for batch_idx in tqdm(range(100), desc='Profiling ...'):
        value, _ = next(iter(train_loader))
        model(value.to(device))
        profiler.step()
profiler.stop()

Profiling ...:   0%|          | 0/100 [00:00<?, ?it/s]

Profiling ...:   1%|          | 1/100 [00:00<00:18,  5.25it/s]STAGE:2024-06-11 07:28:43 1978000:1978000 ActivityProfilerController.cpp:314] Completed Stage: Warm Up
STAGE:2024-06-11 07:28:43 1978000:1978000 ActivityProfilerController.cpp:320] Completed Stage: Collection
STAGE:2024-06-11 07:28:43 1978000:1978000 ActivityProfilerController.cpp:324] Completed Stage: Post Processing
Profiling ...: 100%|██████████| 100/100 [00:00<00:00, 151.77it/s]


### After

In [9]:
model = Net().to(device)
model.load_state_dict(torch.load('qmodel.pt', map_location='cpu'))
model.eval()
num_bits = 4
model.quantize(num_bits=num_bits)
direct_quantize(model, train_loader)
model.freeze()

direct quantization finish


In [10]:
import torch.profiler
from tqdm import tqdm
profiler = torch.profiler.profile(
    schedule=torch.profiler.schedule(wait=2, warmup=2, active=6, repeat=1),
    on_trace_ready=torch.profiler.tensorboard_trace_handler(dir_name='./performance/'),
    activities=[
        torch.profiler.ProfilerActivity.CPU,
        torch.profiler.ProfilerActivity.CUDA
    ],
    with_stack=True,
)
profiler.start()
with torch.no_grad():
    for batch_idx in tqdm(range(100), desc='Profiling ...'):
        value, _ = next(iter(train_loader))
        model.quantize_inference(value.to(device))
        profiler.step()
profiler.stop()

Profiling ...:   0%|          | 0/100 [00:00<?, ?it/s]STAGE:2024-06-11 07:29:23 1978000:1978000 ActivityProfilerController.cpp:314] Completed Stage: Warm Up
STAGE:2024-06-11 07:29:23 1978000:1978000 ActivityProfilerController.cpp:320] Completed Stage: Collection
STAGE:2024-06-11 07:29:23 1978000:1978000 ActivityProfilerController.cpp:324] Completed Stage: Post Processing
Profiling ...: 100%|██████████| 100/100 [00:00<00:00, 259.29it/s]
