In [15]:
import time
from termcolor import colored
import torch
import torch.autograd.profiler as profiler

from modules.Swc2d import Swc2d
from modules.Dcls2dFull import Dcls2dFull



# Every layer and tensor in this notebook is created on the GPU, so stop
# immediately, with an explanation, when PyTorch cannot see a CUDA device.
assert torch.cuda.is_available(), "A CUDA-capable GPU is required to run this notebook."
cuda_device = torch.device("cuda")  # device object representing GPU

In [16]:
# Hyper-parameters for the small hand-checkable case: one input channel, one
# output channel, a 2x2 kernel dilated by 2, unit stride and no padding.
in_channels, out_channels = 1, 1
kernel_size, dilation = (2, 2), (2, 2)
stride, padding = (1, 1), (0, 0)
groups, bias = 1, False

# Both layers are built from one shared set of keyword arguments so that they
# receive exactly the same configuration.
layer_kwargs = dict(
    in_channels=in_channels,
    out_channels=out_channels,
    kernel_size=kernel_size,
    dilation=dilation,
    stride=stride,
    padding=padding,
    groups=groups,
    bias=bias,
)

m = torch.nn.Conv2d(**layer_kwargs).to(cuda_device)  # PyTorch's built-in convolution
n = Swc2d(**layer_kwargs).to(cuda_device)  # project-local Swc2d module



# The same 4x4 ramp (values 1..16) is fed to both layers. Each layer gets its
# own Parameter so that the gradients are accumulated separately.
input_values = [[[[1., 2., 3., 4.],
                  [5., 6., 7., 8.],
                  [9., 10., 11., 12.],
                  [13., 14., 15., 16.]]]]
X1 = torch.nn.Parameter(torch.tensor(input_values, device=cuda_device),
                        requires_grad=True)
X2 = torch.nn.Parameter(torch.tensor(input_values, device=cuda_device),
                        requires_grad=True)

# Identical 2x2 kernels, again held in two independent Parameters.
kernel_values = [[[[20., 40.],
                   [60., 80.]]]]
m.weight = torch.nn.Parameter(torch.tensor(kernel_values, device=cuda_device),
                              requires_grad=True)
n.weight = torch.nn.Parameter(torch.tensor(kernel_values, device=cuda_device),
                              requires_grad=True)

# Target that is subtracted from each layer's output before the norm is taken.
# NOTE(review): back_truth is itself a Parameter with requires_grad=True, so it
# takes part in autograd just like the inputs - confirm this is intended.
back_truth = torch.nn.Parameter(
    torch.tensor([[[[1., 2.],
                    [4., 5.]]]], device=cuda_device),
    requires_grad=True)

# Only the Swc2d forward pass runs inside the profiler context; the Conv2d
# forward pass and both backward passes are executed outside of it.
with profiler.profile(use_cuda=True, profile_memory=True) as prof:
    var2 = (n(X2) - back_truth).norm()

var1 = (m(X1) - back_truth).norm()

# Populate .grad on the inputs and weights of both layers.
var1.backward()
var2.backward()

In [17]:
# Shapes of the Conv2d input and of both kernels.
for shape in (X1.shape, m.weight.shape, n.weight.shape):
    print(shape)

# Forward results, Conv2d first and Swc2d second: output shape, then values.
for layer, sample in ((m, X1), (n, X2)):
    print(layer(sample).size())
    print(layer(sample))

# Weight gradients followed by input gradients, each in (Conv2d, Swc2d) order.
# NOTE(review): in the recorded output the two forward results agree, but both
# Swc2d gradients print as all zeros while the Conv2d ones are non-zero -
# check Swc2d's backward pass.
for grad in (m.weight.grad, n.weight.grad, X1.grad, X2.grad):
    print(grad)



torch.Size([1, 1, 4, 4])
torch.Size([1, 1, 2, 2])
torch.Size([1, 1, 2, 2])
torch.Size([1, 1, 2, 2])
tensor([[[[1560., 1760.],
          [2360., 2560.]]]], device='cuda:0',
       grad_fn=<CudnnConvolutionBackward>)
torch.Size([1, 1, 2, 2])
tensor([[[[1560., 1760.],
          [2360., 2560.]]]], device='cuda:0', grad_fn=<swc2dBackward>)
tensor([[[[ 7.6718, 11.5944],
          [23.3621, 27.2847]]]], device='cuda:0')
tensor([[[[0., 0.],
          [0., 0.]]]], device='cuda:0')
tensor([[[[ 7.4323,  8.3810, 14.8646, 16.7620],
          [11.2319, 12.1806, 22.4637, 24.3611],
          [22.2968, 25.1429, 29.7291, 33.5239],
          [33.6956, 36.5417, 44.9274, 48.7222]]]], device='cuda:0')
tensor([[[[0., 0., 0., 0.],
          [0., 0., 0., 0.],
          [0., 0., 0., 0.],
          [0., 0., 0., 0.]]]], device='cuda:0')


In [18]:
# Percentage of the entries of n.weight that are non-zero.
nonzero_count = n.weight.nonzero().size(0)
100 * nonzero_count / n.weight.numel()

100.0

In [19]:
# Benchmark-sized configuration: 16 images of 200x200 with 512 input and 1024
# output channels, and a 3x3 kernel dilated by 8.
batch = 16
in_channels = 2**9
out_channels = 2**10
kernel_size = (3, 3)
dilation = (8, 8)
stride = (1, 1)
padding = (0, 0)
groups = 1
bias = False
h = 200
w = 200


def conv_output_size(size, kernel, dil, strd, pad):
    """Return the output length of a convolution along one spatial axis.

    Implements floor((size + 2*pad - dil*(kernel - 1) - 1) / strd) + 1 using
    integer floor division, so the result is an exact ``int`` and never goes
    through a float intermediate.

    Args:
        size: input length along the axis.
        kernel: kernel length along the axis.
        dil: dilation along the axis.
        strd: stride along the axis.
        pad: zero padding applied to each side of the axis.

    Returns:
        The number of output positions along the axis.
    """
    effective_kernel = dil * (kernel - 1) + 1  # extent covered by the dilated kernel
    return (size + 2 * pad - effective_kernel) // strd + 1


# Spatial size of the layer output; used to shape the target tensor.
h_o = conv_output_size(h, kernel_size[0], dilation[0], stride[0], padding[0])
w_o = conv_output_size(w, kernel_size[1], dilation[1], stride[1], padding[1])

# Swc2d instance sized by the benchmark hyper-parameters defined above.
n = Swc2d(
    in_channels=in_channels, out_channels=out_channels,
    kernel_size=kernel_size, dilation=dilation,
    stride=stride, padding=padding,
    groups=groups, bias=bias,
).to(cuda_device)

# Uniform-random input and target, created directly on the GPU. Both are
# wrapped as Parameters with requires_grad=True, so both receive gradients.
input_shape = (batch, in_channels, h, w)
target_shape = (batch, out_channels, h_o, w_o)
X2 = torch.nn.Parameter(torch.rand(input_shape, device=cuda_device),
                        requires_grad=True)
back_truth = torch.nn.Parameter(torch.rand(target_shape, device=cuda_device),
                                requires_grad=True)

# Record CUDA timings and memory usage for one complete Swc2d forward pass
# followed by its backward pass.
with profiler.profile(use_cuda=True, profile_memory=True) as prof:
    var2 = (n(X2) - back_truth).norm()
    var2.backward()

In [20]:
# Abbreviated CUDA allocator statistics for the device used by this notebook.
memory_report = torch.cuda.memory_summary(device=cuda_device, abbreviated=True)
print(memory_report)

|                  PyTorch CUDA memory summary, device ID 0                 |
|---------------------------------------------------------------------------|
|            CUDA OOMs: 0            |        cudaMalloc retries: 0         |
|        Metric         | Cur Usage  | Peak Usage | Tot Alloc  | Tot Freed  |
|---------------------------------------------------------------------------|
| Allocated memory      |    6768 MB |    9732 MB |   45696 MB |   38928 MB |
|---------------------------------------------------------------------------|
| Active memory         |    6768 MB |    9732 MB |   45696 MB |   38928 MB |
|---------------------------------------------------------------------------|
| GPU reserved memory   |    9734 MB |    9734 MB |    9734 MB |       0 B  |
|---------------------------------------------------------------------------|
| Non-releasable memory |     849 MB |     867 MB |    2636 MB |    1786 MB |
|---------------------------------------------------------------

In [21]:
# Per-operator averages from the most recent profiler run (at most 1000 rows).
op_table = prof.key_averages().table(row_limit=1000)
print(op_table)
# Optional: uncomment to also write a Chrome trace file.
#prof.export_chrome_trace("trace.json")

-----------------------------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  
                               Name    Self CPU %      Self CPU   CPU total %     CPU total  CPU time avg     Self CUDA   Self CUDA %    CUDA total  CUDA time avg       CPU Mem  Self CPU Mem      CUDA Mem  Self CUDA Mem    # of Calls  
-----------------------------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  
                              swc2d         1.81%      35.057us         2.47%      47.811us      47.811us      51.328us         0.12%      51.328us      51.328us           0 b           0 b       2.07 Gb           0 b             1  
                         aten::view         2.61%      50.557u

In [None]:
-----------------------------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  
                               Name    Self CPU %      Self CPU   CPU total %     CPU total  CPU time avg     Self CUDA   Self CUDA %    CUDA total  CUDA time avg       CPU Mem  Self CPU Mem      CUDA Mem  Self CUDA Mem    # of Calls  
-----------------------------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  
                              swc2d         1.81%      35.057us         2.47%      47.811us      47.811us      51.328us         0.12%      51.328us      51.328us           0 b           0 b       2.07 Gb           0 b             1  
                         aten::view         2.61%      50.557us         2.61%      50.557us      10.111us       0.000us         0.00%       0.000us       0.000us           0 b           0 b           0 b           0 b             5  
                        aten::empty         5.65%     109.522us         5.65%     109.522us      10.952us       0.000us         0.00%       0.000us       0.000us           0 b           0 b       8.27 Gb       8.27 Gb            10  
                          aten::sub         1.60%      31.078us         1.85%      35.862us      35.862us      11.582ms        28.00%      11.582ms      11.582ms           0 b           0 b       2.07 Gb           0 b             1  
               aten::frobenius_norm         0.67%      13.056us         5.76%     111.609us     111.609us       5.504us         0.01%       3.684ms       3.684ms           0 b           0 b       1.00 Kb           0 b             1  
                         aten::norm         3.24%      62.703us         3.54%      68.657us      68.657us       3.672ms         8.88%       3.672ms       3.672ms           0 b           0 b         512 b           0 b             1  
                   aten::as_strided         0.08%       1.471us         0.08%       1.471us       1.471us       0.000us         0.00%       0.000us       0.000us           0 b           0 b           0 b           0 b             1  
                      aten::resize_         2.46%      47.604us         2.46%      47.604us      15.868us       0.000us         0.00%       0.000us       0.000us           0 b           0 b       2.07 Gb       2.07 Gb             3  
                        aten::copy_         1.15%      22.211us         1.15%      22.211us      22.211us       6.112us         0.01%       6.112us       6.112us           0 b           0 b           0 b           0 b             1  
                    aten::ones_like         0.41%       7.996us         1.31%      25.292us      25.292us       3.456us         0.01%       7.552us       7.552us           0 b           0 b         512 b           0 b             1  
                   aten::empty_like         1.28%      24.808us         4.54%      87.984us      21.996us       0.000us         0.00%       0.000us       0.000us           0 b           0 b       1.24 Gb           0 b             4  
                aten::empty_strided         3.26%      63.176us         3.26%      63.176us      15.794us       0.000us         0.00%       0.000us       0.000us           0 b           0 b       1.24 Gb       1.24 Gb             4  
                        aten::fill_         6.85%     132.681us         6.85%     132.681us      26.536us       2.249ms         5.44%       2.249ms     449.728us           0 b           0 b           0 b           0 b             5  
         torch::autograd::GraphRoot         0.16%       3.055us         0.16%       3.055us       3.055us       2.048us         0.00%       2.048us       2.048us           0 b           0 b           0 b           0 b             1  
     torch::autograd::CopyBackwards         2.56%      49.679us         3.32%      64.323us      64.323us       2.111us         0.01%       4.160us       4.160us           0 b           0 b           0 b           0 b             1  
                           aten::to         0.76%      14.644us         0.76%      14.644us      14.644us       2.049us         0.00%       2.049us       2.049us           0 b           0 b           0 b           0 b             1  
                      NormBackward1         4.69%      90.810us        26.74%     517.888us     517.888us      10.143us         0.02%       7.956ms       7.956ms           0 b           0 b       2.07 Gb      -1.00 Kb             1  
                          aten::div         4.40%      85.285us         5.73%     111.043us     111.043us       8.064us         0.02%       8.064us       8.064us           0 b           0 b         512 b           0 b             1  
                           aten::eq         6.62%     128.152us        11.40%     220.838us     110.419us      12.960us         0.03%      17.536us       8.768us           0 b           0 b       1.00 Kb           0 b             2  
                 aten::masked_fill_         3.03%      58.597us         3.03%      58.597us      58.597us       6.305us         0.02%       6.305us       6.305us           0 b           0 b           0 b           0 b             1  
                          aten::mul         9.68%     187.394us        11.50%     222.707us     111.354us      15.806ms        38.20%      15.806ms       7.903ms           0 b           0 b       4.13 Gb           0 b             2  
                       SubBackward0         2.12%      41.081us        18.10%     350.638us     350.638us       6.051us         0.01%      15.776ms      15.776ms           0 b           0 b       2.07 Gb      -2.07 Gb             1  
                          aten::neg         8.47%     164.000us        13.74%     266.156us     133.078us       7.883ms        19.05%      15.760ms       7.880ms           0 b           0 b       4.13 Gb           0 b             2  
    torch::autograd::AccumulateGrad         4.47%      86.639us        10.05%     194.741us      48.685us      13.738us         0.03%      30.340us       7.585us           0 b           0 b           0 b           0 b             4  
                       aten::detach         3.49%      67.622us         5.58%     108.102us      27.026us      13.176us         0.03%      16.602us       4.150us           0 b           0 b           0 b           0 b             4  
                             detach         2.09%      40.480us         2.09%      40.480us      10.120us       3.426us         0.01%       3.426us       0.856us           0 b           0 b           0 b           0 b             4  
                      swc2dBackward         8.58%     166.152us        30.24%     585.618us     585.618us       7.844us         0.02%       2.277ms       2.277ms           0 b           0 b       1.24 Gb    -132.50 Kb             1  
                   aten::zeros_like         3.46%      66.969us        14.73%     285.204us      95.068us      13.887us         0.03%       2.259ms     753.108us           0 b           0 b       1.24 Gb           0 b             3  
                        aten::zero_         3.01%      58.339us         7.15%     138.447us      46.149us       7.039us         0.02%       2.245ms     748.479us           0 b           0 b           0 b           0 b             3  
                         aten::ones         1.34%      26.019us         4.63%      89.663us      89.663us       4.098us         0.01%      10.242us      10.242us           0 b           0 b     132.50 Kb           0 b             1  
-----------------------------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  
Self CPU time total: 1.937ms
CUDA time total: 41.371ms