## Структурная   Спарсификация (filter/channel wise)

- сначала применяем свертку к фильтрам затем к каналам


In [1]:
import os
import random

import numpy as np
import torch

# from torchvision.io import decode_image
# from torchvision.models import ResNet18_Weights, resnet18
import torch.nn as nn
import torch.nn.functional as F
import torch.optim as optim
from torch.utils.data import DataLoader
from torchvision import datasets, transforms

# Set the global seed for NumPy's random number generator
seed_value = 42
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")


def set_seed(seed: int = 42) -> None:
    np.random.seed(seed)
    random.seed(seed)
    torch.manual_seed(seed)
    torch.cuda.manual_seed(seed)
    # When running on the CuDNN backend, two further options must be set
    torch.backends.cudnn.deterministic = True
    torch.backends.cudnn.benchmark = False
    # Set a fixed value for the hash seed
    os.environ["PYTHONHASHSEED"] = str(seed)
    print(f"Random seed set as {seed}")


set_seed(42)

Random seed set as 42


In [None]:
from structural_sparsification import StructuredSConv
from classify_single import evaluate_model, classify_single_image

In [None]:
import torch.nn as nn


class SparseLeNet(nn.Module):
    def __init__(self, sparsity_ratio=0.7):
        super().__init__()
        self.conv1 = StructuredSConv(
            sparsity_ratio,
            in_channels=3,
            out_channels=6,
            kernel_size=5,
        )
        self.conv2 = StructuredSConv(
            sparsity_ratio, in_channels=6, out_channels=16, kernel_size=5
        )
        self.fc1 = nn.Linear(16 * 5 * 5, 120)
        self.fc2 = nn.Linear(120, 84)
        self.fc3 = nn.Linear(84, 10)

    def forward(self, x):
        x = F.max_pool2d(F.relu(self.conv1(x)), 2)
        x = F.max_pool2d(F.relu(self.conv2(x)), 2)
        x = x.view(x.size(0), -1)
        x = F.relu(self.fc1(x))
        x = F.relu(self.fc2(x))
        x = self.fc3(x)
        return x


def weights_init(m):
    if isinstance(m, nn.Conv2d) or isinstance(m, nn.Linear):
        nn.init.kaiming_normal_(m.weight, mode="fan_out", nonlinearity="relu")
        if m.bias is not None:
            nn.init.constant_(m.bias, 0)


model = SparseLeNet(0.8)
model.apply(weights_init)
model.to(device)


SparseLeNet(
  (conv1): StructuredSConv(
    (s_filter): StructuredSparsifier()
    (s_channel): StructuredSparsifier()
  )
  (conv2): StructuredSConv(
    (s_filter): StructuredSparsifier()
    (s_channel): StructuredSparsifier()
  )
  (fc1): Linear(in_features=400, out_features=120, bias=True)
  (fc2): Linear(in_features=120, out_features=84, bias=True)
  (fc3): Linear(in_features=84, out_features=10, bias=True)
)

In [4]:
import ssl

ssl._create_default_https_context = ssl._create_unverified_context

transform = transforms.Compose(
    [transforms.ToTensor(), transforms.Normalize((0.5, 0.5, 0.5), (0.5, 0.5, 0.5))]
)


train_dataset = datasets.CIFAR10(
    root="./data",
    train=True,
    download=True,
    transform=transform,
)
test_dataset = datasets.CIFAR10(
    root="./data",
    train=False,
    download=True,
    transform=transform,
)

class_names = [
    "airplane",
    "automobile",
    "bird",
    "cat",
    "deer",
    "dog",
    "frog",
    "horse",
    "ship",
    "truck",
]

train_loader = DataLoader(train_dataset, batch_size=64, shuffle=True)
test_loader = DataLoader(test_dataset, batch_size=64, shuffle=False)

100%|██████████| 170M/170M [00:04<00:00, 42.5MB/s]


In [6]:
criterion = nn.CrossEntropyLoss()
# optimizer = optim.Adam(model.parameters(), lr=0.01)
optimizer = optim.SGD(model.parameters(), lr=0.01, momentum=0.9)
num_epochs = 10

In [None]:
import time

model.train()

for epoch in range(num_epochs):
    running_loss = 0.0
    correct = 0
    total = 0

    total_forward_time = 0.0
    total_backward_time = 0.0
    total_forward_memory = 0.0
    total_backward_memory = 0.0

    for batch_idx, (inputs, labels) in enumerate(train_loader):
        inputs, labels = inputs.to(device), labels.to(device)

        optimizer.zero_grad()

        torch.cuda.reset_peak_memory_stats()

        torch.cuda.synchronize()
        mem_before_forward = torch.cuda.memory_allocated()

        forward_start = time.perf_counter()
        outputs = model(inputs)
        loss = criterion(outputs, labels)
        torch.cuda.synchronize()
        forward_end = time.perf_counter()

        forward_time = forward_end - forward_start
        total_forward_time += forward_time

        mem_after_forward = torch.cuda.memory_allocated()
        forward_memory = (mem_after_forward - mem_before_forward) / 1024**2
        total_forward_memory += forward_memory

        torch.cuda.synchronize()

        # reset peak memory stats for isolated backward measurement
        torch.cuda.reset_peak_memory_stats()

        backward_start = time.perf_counter()
        loss.backward()
        torch.cuda.synchronize()
        backward_end = time.perf_counter()

        backward_time = backward_end - backward_start
        total_backward_time += backward_time

        # peak memory used inside backward
        backward_peak = torch.cuda.max_memory_allocated() / 1024**2
        total_backward_memory += backward_peak

        optimizer.step()

        # stats
        running_loss += loss.item()
        _, predicted = outputs.max(1)
        total += labels.size(0)
        correct += predicted.eq(labels).sum().item()

    # epoch metrics
    avg_forward = total_forward_time / len(train_loader)
    avg_backward = total_backward_time / len(train_loader)
    avg_forward_mem = total_forward_memory / len(train_loader)
    avg_backward_mem = total_backward_memory / len(train_loader)
    peak_memory = torch.cuda.max_memory_allocated() / 1024**2

    print(
        f"Epoch [{epoch + 1}/{num_epochs}] "
        f"Loss: {running_loss / len(train_loader):.4f}, "
        f"Acc: {100 * correct / total:.2f}%, "
        f"Fwd: {avg_forward * 1000:.2f} ms ({avg_forward_mem:.2f} MB), "
        f"Bwd: {avg_backward * 1000:.2f} ms ({avg_backward_mem:.2f} MB), "
        f"Peak: {peak_memory:.2f} MB"
    )


Epoch [1/10] Loss: 2.3030, Acc: 9.80%, Fwd: 2.42 ms (2.78 MB), Bwd: 2.64 ms (25.11 MB), Peak: 20.75 MB
Epoch [2/10] Loss: 2.3031, Acc: 9.98%, Fwd: 1.06 ms (2.77 MB), Bwd: 2.53 ms (25.11 MB), Peak: 20.75 MB
Epoch [3/10] Loss: 2.3030, Acc: 9.99%, Fwd: 1.00 ms (2.77 MB), Bwd: 2.46 ms (25.11 MB), Peak: 20.75 MB
Epoch [4/10] Loss: 2.3030, Acc: 9.82%, Fwd: 1.03 ms (2.77 MB), Bwd: 2.48 ms (25.11 MB), Peak: 20.75 MB
Epoch [5/10] Loss: 2.3031, Acc: 9.82%, Fwd: 1.02 ms (2.77 MB), Bwd: 2.46 ms (25.11 MB), Peak: 20.75 MB
Epoch [6/10] Loss: 2.3030, Acc: 9.88%, Fwd: 1.01 ms (2.77 MB), Bwd: 2.47 ms (25.11 MB), Peak: 20.75 MB
Epoch [7/10] Loss: 2.3030, Acc: 9.70%, Fwd: 1.00 ms (2.77 MB), Bwd: 2.46 ms (25.11 MB), Peak: 20.75 MB
Epoch [8/10] Loss: 2.3030, Acc: 9.85%, Fwd: 1.02 ms (2.77 MB), Bwd: 2.46 ms (25.11 MB), Peak: 20.75 MB
Epoch [9/10] Loss: 2.3030, Acc: 9.92%, Fwd: 0.99 ms (2.77 MB), Bwd: 2.45 ms (25.11 MB), Peak: 20.75 MB
Epoch [10/10] Loss: 2.3030, Acc: 10.06%, Fwd: 0.97 ms (2.77 MB), Bwd: 2.4

In [None]:
import json

acc = evaluate_model(model, test_loader, device)

report = {
    "name": "sparse_struct_lenet_kernel7_bs8",
    "batch_size": 64,
    "avg_forward": avg_forward,
    "avg_backward": avg_backward,
    "avg_forward_mem": avg_forward_mem,
    "avg_backward_mem": avg_backward_mem,
    "kernel_size": 5,
    "acc": acc,
}

file_path = "sparse_struct_lenet_kernel7_bs4.json"

with open(file_path, "w") as f:
    json.dump(report, f, indent=4)  # indent=4 makes the JSON file human-readable

Test Accuracy: 10.00%


In [None]:
from PIL import Image

transform = transforms.Compose(
    [
        transforms.Resize((32, 32)),
        transforms.ToTensor(),
        transforms.Normalize((0.5, 0.5, 0.5), (0.5, 0.5, 0.5)),
    ]
)

image = Image.open("/content/cat_watermelon.jpg").convert("RGB")
sample_image = transform(image).unsqueeze(0).to(device)


In [11]:
# torch.save(model.state_dict(), "lenet_cifar10_img2col_sparse.pth")


In [12]:
from torch.profiler import ProfilerActivity, profile

with profile(activities=[ProfilerActivity.CPU, ProfilerActivity.CUDA]) as prof:
    classify_single_image(model, sample_image, device, class_names)

print(prof.key_averages().table(sort_by="cuda_time_total", row_limit=10))


-------------------------------------------------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  
                                                   Name    Self CPU %      Self CPU   CPU total %     CPU total  CPU time avg     Self CUDA   Self CUDA %    CUDA total  CUDA time avg    # of Calls  
-------------------------------------------------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  
                                           aten::conv2d         0.03%       9.115us        14.04%       4.022ms       2.011ms       0.000us         0.00%      53.022us      26.511us             2  
                                      aten::convolution         0.06%      16.329us        14.01%       4.013ms       2.007ms       0.000us         0.00%      53.022us      26.511us             2  
         