I tried to replace layers in a trained network with lightweight layers.

### Converting output
Use PCA on the layer outputs: find the eigenvalues of the covariance matrix and count how many of them are at least 0.001 of the largest eigenvalue.
So instead of ConvWide(x) we will have UpCh(DownCh(ConvWide(x))), where 'DownCh' decreases the channel count and 'UpCh' increases it back.
After that I combine DownCh(ConvWide(x)): it is a linear operation, so it can be recalculated as a single ConvNotSoWide(x).
So the result will be UpCh(ConvNotSoWide(x)).

### Converting input: not implemented yet
Same as previous, but for layer inputs.

So ConvWide(x) will be replaced by ConvWide(UpCh(DownCh(x))) and then recalculated as ConvNotSoWide(DownCh(x)).

In PyTorch terms, the two ideas above (output and input conversion) together lead to the following replacement for `ConvWide`:

```Python
nn.Sequential(
    nn.Conv2d(in_ch, in_less_ch, kernel_size=1, bias=False),
    nn.Conv2d(in_less_ch, out_less_ch, kernel_size=3, bias=False),
    nn.Conv2d(out_less_ch, out_ch, kernel_size=1, bias=True),
)
```

In [1]:
import itertools
import time
from typing import Dict, Optional, Tuple, List, Optional, Iterable
from dataclasses import dataclass

import numpy as np
import torch
import torch.nn as nn
from torch.autograd import Variable
from torch.utils.data import DataLoader
from myutil import CovarianceAccumulator

In [2]:
torch.cuda.is_available()

True

In [3]:
torch.__version__

'1.13.1+cu117'

In [4]:
from torchvision import datasets
from torchvision.transforms import ToTensor

# Both MNIST splits share the storage directory, the tensor conversion and the
# download flag; only the `train` switch differs between them.
_mnist_common = dict(
    root='../models/mnist',
    transform=ToTensor(),
    download=True,
)

train_data = datasets.MNIST(train=True, **_mnist_common)

test_data = datasets.MNIST(train=False, **_mnist_common)

In [5]:
class NpAccumulator:
    """Collects tensors one at a time and exposes them as a single NumPy array."""

    def __init__(self):
        # One NumPy array per `add` call, in insertion order.
        self.arrays: List[np.ndarray] = []

    def add(self, tensor: torch.Tensor):
        """Detach ``tensor``, bring it to the CPU and store it as a NumPy array."""
        as_numpy = tensor.detach().cpu().numpy()
        self.arrays.append(as_numpy)

    @property
    def np_arr(self) -> np.ndarray:
        """Every stored array concatenated along axis 0."""
        joined = np.concatenate(self.arrays, axis=0)
        return joined

In [46]:
class TrainHelper:
    """Static helpers for training, evaluating and comparing the notebook's models."""

    @staticmethod
    def train(cnn: nn.Module,
              *,
              epochs: int,
              train_dataset: torch.utils.data.Dataset,
              test_dataset: Optional[torch.utils.data.Dataset] = None,
              print_results: bool = True,
              batch_size: int,
              device_name: str = 'cuda') -> List[float]:
        """
        Train ``cnn`` with Adam (lr=0.001) and cross-entropy loss.

        :param cnn: model to train; it is moved to ``device_name`` in place
        :param epochs: number of passes over ``train_dataset``
        :param train_dataset: dataset yielding ``(images, labels)`` pairs
        :param test_dataset: if given, accuracy on it is measured after every epoch
        :param print_results: print accuracy and the last batch loss after every epoch
        :param batch_size: training batch size
        :param device_name: device the model and batches are moved to
        :return: per-epoch test accuracies (empty when ``test_dataset`` is None)
        """
        train_loader = torch.utils.data.DataLoader(train_dataset,
                                                   batch_size=batch_size,
                                                   shuffle=True,
                                                   num_workers=1)

        device = torch.device(device_name)

        cnn.to(device)
        cnn.train()

        optimizer = torch.optim.Adam(cnn.parameters(), lr=0.001)
        loss_func = nn.CrossEntropyLoss()

        eval_results: List[float] = []

        for epoch in range(epochs):
            for images, labels in train_loader:
                # Plain tensors are enough here; wrapping them in the
                # deprecated `Variable` adds nothing.
                images = images.to(device)
                labels = labels.to(device)

                output = cnn(images)
                loss = loss_func(output, labels)

                optimizer.zero_grad()
                loss.backward()
                optimizer.step()

            if test_dataset is not None:
                eval_result = TrainHelper.test(cnn, test_dataset, device)
                eval_results.append(eval_result)
                if print_results:
                    print(f"epoch {epoch}, accuracy = {eval_result}, loss = {loss.detach()}")
                # `test` switches the model to eval mode; switch back for the next epoch.
                cnn.train()

        return eval_results

    @staticmethod
    def test(cnn: nn.Module, test_dataset: torch.utils.data.Dataset, device=None) -> float:
        """
        Return the classification accuracy of ``cnn`` on ``test_dataset``.

        The model is left in eval mode. Images are moved to ``device`` when it
        is given; the model itself is not moved.
        """
        cnn.eval()
        loader = torch.utils.data.DataLoader(test_dataset, batch_size=128, shuffle=False, num_workers=1)
        correct = 0
        total = 0

        # Inference only: skip building the autograd graph.
        with torch.no_grad():
            for images, labels in loader:
                if device is not None:
                    images = images.to(device)

                results = cnn(images)
                predictions = results.detach().cpu().numpy().argmax(axis=1)
                correct += int((predictions == labels.numpy()).sum())
                total += len(predictions)

        return correct / total

    @staticmethod
    def train_models(models: List[nn.Module], device_name: str) -> Iterable[Tuple[int, float]]:
        """
        generator yields pair (trainable parameters count, best accuracy) for each network

        Every model is trained for 20 epochs with batch size 2048 on the
        notebook-level ``train_data`` and evaluated on ``test_data``.
        Because this is a generator, nothing (including the emptiness check)
        runs until it is iterated.

        :param models: networks to train, one after another
        :param device_name: 'cuda' or 'cpu'
        """
        assert len(models) > 0

        for model in models:
            start = time.time()
            eval_results = TrainHelper.train(
                cnn=model,
                epochs=20,
                train_dataset=train_data,
                test_dataset=test_data,
                batch_size=2048,
                device_name=device_name,
                print_results=False
            )
            end = time.time()
            best_acc = max(eval_results)
            params_count = TrainHelper.total_parameters_count(model)
            print(f"best accuracy = {best_acc}, parameters = {params_count}, training time = {end - start}")
            yield params_count, best_acc

    @staticmethod
    def total_parameters_count(model: nn.Module) -> int:
        """Return the total number of scalar values across all parameters of ``model``."""
        return sum(p.numel() for p in model.parameters())

    @staticmethod
    def print_parameters(model: nn.Module):
        """Print the total parameter count followed by the size of every parameter tensor."""
        print(f"total parameters = {TrainHelper.total_parameters_count(model)}")
        for p in model.parameters():
            print(f"size {p.numel()}: {p.size()}")

    @staticmethod
    def eval_layer(cnn: nn.Module, x: np.ndarray, batch_size: int, device: str = 'cuda') -> np.ndarray:
        """
        Run ``cnn`` over ``x`` in batches and return the concatenated outputs.

        :param cnn: module to evaluate; it must already live on ``device``
        :param x: input samples, batched along axis 0
        :param batch_size: number of samples per forward pass
        :param device: device the batches are moved to
        """
        acc = NpAccumulator()
        # Outputs are detached anyway, so there is no need to record gradients.
        with torch.no_grad():
            for tensor in TrainHelper.cuda_tensors_from_numpy(x, batch_size, device):
                acc.add(cnn(tensor))
        return acc.np_arr

    @staticmethod
    def compare_layers(layer1: nn.Module, layer2: nn.Module, x: np.ndarray, batch_size: int,
                       device: str = 'cuda') -> float:
        """Return the mean squared difference between the outputs of two layers on ``x``."""
        y1 = TrainHelper.eval_layer(layer1, x, batch_size, device)
        y2 = TrainHelper.eval_layer(layer2, x, batch_size, device)
        return ((y1 - y2) ** 2).mean()

    @staticmethod
    def cuda_tensors_from_numpy(arr: np.ndarray, batch_size: int,
                                device: str = 'cuda') -> Iterable[torch.Tensor]:
        """
        Yield consecutive slices of ``arr`` (along axis 0) as tensors on ``device``.

        The final batch is shorter when ``arr.shape[0]`` is not a multiple of
        ``batch_size``; no trailing samples are dropped.
        """
        for start in range(0, arr.shape[0], batch_size):
            yield torch.from_numpy(arr[start: start + batch_size]).to(device)

In [7]:
class MyParallelLayer(nn.Module):
    """
    Wrapper around a layer that can route the forward pass through a
    substitute ("mirror") layer instead of the original one.
    """

    def __init__(self, real_layer: nn.Module):
        super().__init__()
        # True: forward through `real_layer`; False: forward through `mirror_layer`.
        self.use_real_layer: bool = True
        self.real_layer: nn.Module = real_layer
        # Substitute layer; stays None until one is assigned from outside.
        self.mirror_layer: Optional[nn.Module] = None

    def forward(self, x: torch.Tensor) -> torch.Tensor:
        """
        Apply the currently selected layer to ``x``.

        :raises RuntimeError: if the mirror is selected but none has been assigned
        """
        if self.use_real_layer:
            return self.real_layer(x)
        if self.mirror_layer is None:
            # Without this check the call fails with an opaque
            # "'NoneType' object is not callable".
            raise RuntimeError("use_real_layer is False but mirror_layer has not been set")
        return self.mirror_layer(x)

In [8]:
class MyConvModel(nn.Module):
    """
    All-convolutional classifier: six unpadded convolutions, each wrapped in a
    `MyParallelLayer` and followed by LeakyReLU, with two max-pools in between
    and a final 1x1 convolution producing 10 outputs.
    """

    def __init__(self, channels: int):
        """
        :param channels: width of the first two convolutions; later stages use
                         2x and 4x this value
        """
        super().__init__()

        c = channels
        # The trailing comments give the spatial size before -> after each
        # stage for a 28x28 input.
        self.layers = nn.Sequential(
            *self.conv(1, c, kernel_size=3),  # 28 -> 26
            *self.conv(c, c, kernel_size=3),  # 26 -> 24
            nn.MaxPool2d(2),  # 24 -> 12

            *self.conv(c, c * 2, kernel_size=3),  # 12 -> 10
            *self.conv(c * 2, c * 2, kernel_size=3),  # 10 -> 8
            nn.MaxPool2d(2),  # 8 -> 4

            *self.conv(c * 2, c * 4, kernel_size=3),  # 4 -> 2
            *self.conv(c * 4, c * 4, kernel_size=2),  # 2 -> 1

            nn.Conv2d(c * 4, 10, kernel_size=1, padding='valid', bias=True),
            nn.Flatten(),
        )

    def conv(self, in_ch: int, out_ch: int, *, kernel_size) -> List[nn.Module]:
        """Return ``[MyParallelLayer(Sequential(Conv2d)), LeakyReLU(0.1)]`` for one stage."""
        return [
            MyParallelLayer(nn.Sequential(
                nn.Conv2d(in_ch, out_ch, kernel_size=kernel_size, padding='valid', bias=True),
                # nn.Conv2d(in_ch, out_ch, kernel_size=kernel_size, padding='valid', bias=False),
                # nn.BatchNorm2d(out_ch),
            )),
            nn.LeakyReLU(0.1),
        ]

    @property
    def paraller_layers(self) -> List[MyParallelLayer]:
        """All `MyParallelLayer` instances of the model, in forward order."""
        return [layer for layer in self.layers if isinstance(layer, MyParallelLayer)]

    @property
    def alt_losses(self) -> List[float]:
        """
        The ``last_loss`` value of every parallel layer, in forward order.

        `MyParallelLayer` does not define ``last_loss`` itself, so a plain
        attribute access raised AttributeError; layers on which no such value
        has been stored report NaN instead.
        """
        return [getattr(layer, 'last_loss', float('nan')) for layer in self.paraller_layers]

    def forward(self, x: torch.Tensor):
        """Apply the whole layer stack to ``x``."""
        return self.layers(x)

In [9]:
model = MyConvModel(32).to('cuda')

In [10]:
TrainHelper.train(
    model,
    epochs=10,
    train_dataset=train_data,
    test_dataset=test_data,
    batch_size=2048,
    print_results=True,
)

epoch 0, accuracy = 0.8929, loss = 0.438852459192276
epoch 1, accuracy = 0.9435, loss = 0.2200377881526947
epoch 2, accuracy = 0.9665, loss = 0.11827639490365982
epoch 3, accuracy = 0.9742, loss = 0.06652519851922989
epoch 4, accuracy = 0.9821, loss = 0.09297072887420654
epoch 5, accuracy = 0.9793, loss = 0.04297684505581856
epoch 6, accuracy = 0.9821, loss = 0.0517389252781868
epoch 7, accuracy = 0.9856, loss = 0.05742797255516052
epoch 8, accuracy = 0.9885, loss = 0.03669394180178642
epoch 9, accuracy = 0.9891, loss = 0.05633099004626274


[0.8929,
 0.9435,
 0.9665,
 0.9742,
 0.9821,
 0.9793,
 0.9821,
 0.9856,
 0.9885,
 0.9891]

In [11]:
TrainHelper.test(model, test_data, device='cuda')

0.9891

In [12]:
model.paraller_layers[0]

MyParallelLayer(
  (real_layer): Sequential(
    (0): Conv2d(1, 32, kernel_size=(3, 3), stride=(1, 1), padding=valid)
  )
)

In [None]:
def get_x():
    """Return the images of the first 60000-sample batch of ``train_data`` as a NumPy array."""
    single_batch_loader = torch.utils.data.DataLoader(train_data, batch_size=60000)
    # Only the first batch is needed; its labels are ignored.
    for images, _labels in single_batch_loader:
        return images.numpy()

@dataclass
class InOut:
    """Recorded input and output of one parallel layer together with their covariance accumulators."""
    layer: MyParallelLayer
    inp: np.ndarray
    out: np.ndarray
    inp_cov: CovarianceAccumulator
    out_cov: CovarianceAccumulator

    @staticmethod
    def calc() -> Dict[MyParallelLayer, 'InOut']:
        """
        Push the training images through the global ``model`` layer by layer
        and record an `InOut` for every `MyParallelLayer`.

        The model is put into eval mode for the pass and into train mode
        afterwards.
        """
        activations = get_x()
        model.eval()

        collected = {}
        with torch.no_grad():
            for module in model.layers:
                produced = TrainHelper.eval_layer(module, activations, batch_size=6000)
                if isinstance(module, MyParallelLayer):
                    # NOTE(review): relies on `add_samples` returning the
                    # accumulator itself - confirm in `myutil`.
                    collected[module] = InOut(
                        module,
                        activations,
                        produced,
                        inp_cov=CovarianceAccumulator().add_samples(activations, axis=1),
                        out_cov=CovarianceAccumulator().add_samples(produced, axis=1),
                    )
                    print(f"{activations.shape} -> {produced.shape}")
                # The output of this layer is the input of the next one.
                activations = produced

        model.train()
        return collected


inouts = InOut.calc()    

In [19]:
# `inouts` maps layers to InOut dataclass instances, which are not
# subscriptable: the recorded output has to be read through `.out`.
example = inouts[model.paraller_layers[0]].out[0:1, :, 0, 0]
# 9 components: the first convolution sees only 1 * 3 * 3 = 9 input values.
shift, m_to, m_back = inouts[model.paraller_layers[0]].out_cov.to_eigenvalues_and_back(9)

# Project onto the leading components and back, around the mean `shift`.
restored = ((example - shift) @ m_to) @ m_back + shift
np.sum((example - restored)**2)

5.672196456127747e-18

In [22]:
# `inouts` maps layers to InOut dataclass instances, which are not
# subscriptable: the recorded output has to be read through `.out`.
example = inouts[model.paraller_layers[0]].out[0:1, :, 0, 0]
# `acc` is not defined in this notebook; use the layer's output covariance accumulator.
shift, m_to, m_back = inouts[model.paraller_layers[0]].out_cov.to_eigenvalues_and_back(9)

# Fold the centering into one additive term so that the projection can be
# applied to the raw (uncentered) sample.
shift_after = shift - (shift @ m_to) @ m_back

restored = example @ m_to @ m_back + shift_after
np.sum((example - restored)**2)

5.672196492102728e-18

In [None]:
def combine_convs(conv0: nn.Conv2d, conv1: nn.Conv2d) -> nn.Conv2d:
    """
    Fuse ``conv1(conv0(x))`` into a single convolution.

    Along each spatial axis at most one of the two kernels may be larger than
    1, and ``conv0`` must have no bias. Only the weights and the bias of
    ``conv1`` are used: stride, padding, dilation and groups of the inputs are
    not inspected, and the result is created with the `nn.Conv2d` defaults.

    :param conv0: convolution applied first (bias-free)
    :param conv1: convolution applied second
    :return: a new `nn.Conv2d` whose parameters live on the CPU and do not
             share storage with ``conv0`` or ``conv1``
    """
    assert conv0.bias is None, 'not supported yet'
    w0 = conv0.weight.detach().cpu()  # (mid, in, h0, w0)
    w1 = conv1.weight.detach().cpu()  # (out, mid, h1, w1)
    assert w0.size(2) == 1 or w1.size(2) == 1, 'no more than one non point-wise convolution'
    assert w0.size(3) == 1 or w1.size(3) == 1, 'no more than one non point-wise convolution'

    with torch.no_grad():
        # Contract the shared middle channels and keep both kernels' height
        # axes (h, k) and width axes (w, l) adjacent. In each pair one size is
        # 1, so merging the pair yields the fused kernel height and width.
        w01 = torch.einsum('mihw,omkl->oihkwl', w0, w1)
        w01 = w01.reshape(
            w01.size(0),
            w01.size(1),
            w0.size(2) * w1.size(2),
            w0.size(3) * w1.size(3),
        ).contiguous()

    bias = conv1.bias
    conv01 = nn.Conv2d(
        in_channels=conv0.in_channels,
        out_channels=conv1.out_channels,
        kernel_size=(w01.size(2), w01.size(3)),
        bias=bias is not None,
    )

    conv01.weight = nn.Parameter(w01)
    if bias is not None:
        # Copy to the CPU: keeps the bias on the same device as the weight
        # and prevents updates of the fused layer from leaking into conv1.
        conv01.bias = nn.Parameter(bias.detach().cpu().clone())

    return conv01

In [None]:
def make_parallel_layer_v3(layer: MyParallelLayer, mid_ch: int):
    """
    Build a reduced-width mirror for ``layer`` and install it as
    ``layer.mirror_layer``.

    The mirror is ``UpCh(ConvNotSoWide(x))``: the real convolution fused with
    a 1x1 projection to ``mid_ch`` channels, followed by a 1x1 convolution
    back to the original channel count. The global ``model`` is switched to
    eval mode and the mean squared difference between the real and the mirror
    layer on the recorded inputs is printed.

    :param layer: parallel layer whose statistics are stored in the global ``inouts``
    :param mid_ch: number of channels kept between the two mirror convolutions
    """
    inout = inouts[layer]
    real_conv: nn.Conv2d = inout.layer.real_layer[0]
    # The weights are overwritten below; the constructor arguments only have
    # to describe the same shapes so that the module attributes stay truthful.
    conv_wide = nn.Conv2d(real_conv.in_channels, real_conv.out_channels,
                          kernel_size=real_conv.kernel_size, bias=False)
    conv_downch = nn.Conv2d(real_conv.out_channels, mid_ch, kernel_size=1, bias=False)
    conv_upch = nn.Conv2d(mid_ch, real_conv.out_channels, kernel_size=1, bias=True)

    # NOTE(review): presumably `shift` is the per-channel mean, `m_to` maps
    # channels -> mid_ch and `m_back` maps mid_ch -> channels; confirm in `myutil`.
    shift, m_to, m_back = inout.out_cov.to_eigenvalues_and_back(mid_ch)

    conv_wide.weight = nn.Parameter(real_conv.weight.detach())

    conv_downch.weight = nn.Parameter(torch.from_numpy(m_to.astype(np.float32).T[:, :, np.newaxis, np.newaxis]))

    conv_upch.weight = nn.Parameter(torch.from_numpy(m_back.astype(np.float32).T[:, :, np.newaxis, np.newaxis]))
    # The fused convolution is bias-free, so the real bias is pushed through
    # the projection and folded into the bias of the final 1x1 convolution
    # together with the centering term.
    shift_after = shift - shift @ m_to @ m_back
    if real_conv.bias is not None:
        shift_after = shift_after + real_conv.bias.cpu().detach().numpy() @ m_to @ m_back
    conv_upch.bias = nn.Parameter(torch.from_numpy(shift_after.astype(np.float32)))

    layer.mirror_layer = nn.Sequential(
        combine_convs(conv_wide, conv_downch),
        conv_upch
    ).to('cuda')

    model.eval()
    diff = TrainHelper.compare_layers(layer.real_layer, layer.mirror_layer, inout.inp, batch_size=1000)
    print(f"diff = {diff}")

In [None]:
# `mid_ch` is a required argument. 9 matches the component count used for this
# layer in the reconstruction checks above (its input is 1 * 3 * 3 values).
make_parallel_layer_v3(model.paraller_layers[0], mid_ch=9)

In [43]:
model.paraller_layers[0].use_real_layer = True
acc_real = TrainHelper.test(model, test_data, device='cuda')
model.paraller_layers[0].use_real_layer = False
acc_fake = TrainHelper.test(model, test_data, device='cuda')
acc_real, acc_fake

(0.9891, 0.9891)

In [59]:
eig2 = inouts[model.paraller_layers[1]].out_cov.covariance_eigenvalues_normalized
eig2

array([7.21801990e-01, 1.91455742e-01, 5.65268987e-02, 1.85796809e-02,
       5.34610336e-03, 2.01267471e-03, 1.20641796e-03, 6.34483205e-04,
       4.62602565e-04, 4.08309667e-04, 3.27326782e-04, 2.22613143e-04,
       1.95019735e-04, 1.33247302e-04, 1.16834259e-04, 9.62859578e-05,
       6.82935015e-05, 5.93736429e-05, 5.77363589e-05, 4.82616588e-05,
       3.95258291e-05, 3.52306002e-05, 3.29214845e-05, 2.69340636e-05,
       2.03147854e-05, 1.69029318e-05, 1.61334960e-05, 1.35787066e-05,
       1.13907037e-05, 1.06677627e-05, 9.24518569e-06, 7.25946034e-06])

In [66]:
np.sum(eig2[0 : 24])

0.9998945069677841

In [68]:
make_parallel_layer_v3(model.paraller_layers[1], mid_ch=24)

diff = 3.7337347748689353e-05


In [70]:
def test_layers(mirror_count: int = 2) -> float:
    """
    Measure test accuracy with the first ``mirror_count`` parallel layers
    switched to their mirrors and all remaining ones using the real layer.
    """
    for position, parallel in enumerate(model.paraller_layers):
        mirrored = position < mirror_count
        parallel.use_real_layer = not mirrored
    return TrainHelper.test(model, test_data, device='cuda')

[test_layers(i) for i in range(3)]

[0.9891, 0.9891, 0.989]

In [71]:
eig3 = inouts[model.paraller_layers[2]].out_cov.covariance_eigenvalues_normalized
eig3

array([3.98621153e-01, 3.17266506e-01, 1.57635545e-01, 5.98474561e-02,
       3.03713952e-02, 1.48863100e-02, 8.98813380e-03, 3.59621111e-03,
       2.08005192e-03, 1.52518295e-03, 1.17355143e-03, 7.21837250e-04,
       6.33482410e-04, 4.77868832e-04, 3.14994616e-04, 2.35329196e-04,
       1.64776569e-04, 1.34083996e-04, 1.25370560e-04, 1.08282432e-04,
       9.66021730e-05, 8.56046465e-05, 7.40780945e-05, 6.69511216e-05,
       6.10044431e-05, 5.96564364e-05, 5.04671822e-05, 4.99504145e-05,
       4.47515962e-05, 4.11101845e-05, 3.92834768e-05, 3.14957778e-05,
       3.06943620e-05, 2.87876921e-05, 2.67469199e-05, 2.33836573e-05,
       2.28876734e-05, 2.15009870e-05, 2.05588511e-05, 2.01399175e-05,
       1.82099134e-05, 1.61466538e-05, 1.60470239e-05, 1.26072783e-05,
       1.18291653e-05, 1.11950646e-05, 1.02468106e-05, 9.83551262e-06,
       9.37500363e-06, 9.06421014e-06, 8.25862275e-06, 7.31429400e-06,
       6.72568918e-06, 6.32245518e-06, 6.17946466e-06, 5.63239661e-06,
      

In [76]:
np.sum(eig3[0:48])

0.9999092961663699

In [77]:
make_parallel_layer_v3(model.paraller_layers[2], 48)

diff = 0.0001967307471204549


In [78]:
[test_layers(i) for i in range(4)]

[0.9891, 0.9891, 0.989, 0.9892]

In [85]:
eig4 = inouts[model.paraller_layers[3]].out_cov.covariance_eigenvalues_normalized
print(np.sum(eig4[0: 48]))
eig4

0.9999152771902584


array([3.66725551e-01, 2.51428744e-01, 1.42088955e-01, 9.05062631e-02,
       6.10640768e-02, 2.69974589e-02, 2.02198277e-02, 1.57382069e-02,
       1.04477853e-02, 4.42133867e-03, 2.43355359e-03, 1.65977160e-03,
       1.27007021e-03, 1.10580801e-03, 7.96767569e-04, 4.95258566e-04,
       3.81705991e-04, 3.28707081e-04, 2.89658646e-04, 2.09699784e-04,
       1.61241347e-04, 1.49700827e-04, 1.27559809e-04, 1.06041094e-04,
       9.25884390e-05, 7.89887976e-05, 7.04337573e-05, 5.94126190e-05,
       5.52904411e-05, 4.84235764e-05, 4.19588006e-05, 3.71376810e-05,
       2.97058810e-05, 2.94681449e-05, 2.73750223e-05, 2.28806645e-05,
       2.18899477e-05, 2.09659036e-05, 1.86513905e-05, 1.67675853e-05,
       1.41510087e-05, 1.34653397e-05, 1.20864496e-05, 1.18601276e-05,
       1.08535615e-05, 9.64599321e-06, 9.15571786e-06, 8.36891977e-06,
       8.31144036e-06, 7.72269454e-06, 7.30881634e-06, 6.53522131e-06,
       6.38056828e-06, 5.89283849e-06, 5.54023236e-06, 5.42166084e-06,
      

In [86]:
make_parallel_layer_v3(model.paraller_layers[3], 48)
[test_layers(i) for i in range(5)]

diff = 0.0013124644756317139


[0.9891, 0.9891, 0.989, 0.9892, 0.9891]

In [98]:
eig5 = inouts[model.paraller_layers[4]].out_cov.covariance_eigenvalues_normalized
print(np.sum(eig5[0: 120]))
eig5

0.9998843121887715


array([2.43690771e-01, 1.73586383e-01, 1.11664893e-01, 8.90731260e-02,
       6.74945956e-02, 5.12942415e-02, 4.17527746e-02, 3.92276549e-02,
       3.08176542e-02, 2.52081717e-02, 1.97083733e-02, 1.79131832e-02,
       1.53828615e-02, 1.23680698e-02, 8.94442923e-03, 7.08214315e-03,
       5.80210136e-03, 4.52001756e-03, 3.69312546e-03, 3.31474831e-03,
       2.79704253e-03, 2.44696772e-03, 2.01795981e-03, 1.53979313e-03,
       1.36174411e-03, 1.08393098e-03, 9.86231254e-04, 9.16796300e-04,
       8.74226954e-04, 7.70646738e-04, 7.16569045e-04, 6.88808719e-04,
       6.42453809e-04, 5.94571769e-04, 4.74879256e-04, 4.60719565e-04,
       4.25142997e-04, 4.03016530e-04, 3.56473735e-04, 3.39513038e-04,
       3.22476362e-04, 2.97842728e-04, 2.94394693e-04, 2.77438548e-04,
       2.52162699e-04, 2.39222168e-04, 2.34437800e-04, 2.25015493e-04,
       2.12013510e-04, 2.05108608e-04, 1.88855425e-04, 1.86559554e-04,
       1.77778412e-04, 1.75313666e-04, 1.62507249e-04, 1.56938537e-04,
      

In [99]:
make_parallel_layer_v3(model.paraller_layers[4], 120)
[test_layers(i) for i in range(6)]

diff = 0.0026109495665878057


[0.9891, 0.9891, 0.989, 0.9892, 0.9891, 0.9891]

In [102]:
eig6 = inouts[model.paraller_layers[5]].out_cov.covariance_eigenvalues_normalized
print(np.sum(eig6[0: 120]))
eig6

0.9998760280751985


array([2.85275797e-01, 2.15411545e-01, 1.42798639e-01, 1.08577120e-01,
       7.29416720e-02, 4.39001407e-02, 2.47902287e-02, 2.07809419e-02,
       1.44208003e-02, 9.53332544e-03, 8.56598224e-03, 7.15373152e-03,
       6.43379337e-03, 5.26924263e-03, 3.45413519e-03, 2.87227830e-03,
       2.38438452e-03, 2.14197565e-03, 1.75961052e-03, 1.55965337e-03,
       1.35720235e-03, 1.22384886e-03, 1.00196688e-03, 9.60871824e-04,
       8.82959406e-04, 7.87697579e-04, 7.34013454e-04, 7.05988872e-04,
       6.67132330e-04, 5.87881473e-04, 5.54149380e-04, 5.14772326e-04,
       4.84256414e-04, 4.44633819e-04, 3.86901264e-04, 3.59703973e-04,
       3.53053688e-04, 3.11276462e-04, 3.05259649e-04, 2.87851134e-04,
       2.77231244e-04, 2.53441006e-04, 2.49156882e-04, 2.41988271e-04,
       2.30228862e-04, 2.21777048e-04, 2.04725398e-04, 1.97476063e-04,
       1.85942864e-04, 1.82984929e-04, 1.78949376e-04, 1.66212053e-04,
       1.60145383e-04, 1.54094244e-04, 1.48235394e-04, 1.44568857e-04,
      

In [103]:
make_parallel_layer_v3(model.paraller_layers[5], 120)
[test_layers(i) for i in range(7)]

diff = 0.002402564976364374


[0.9891, 0.9891, 0.989, 0.9892, 0.9891, 0.9891, 0.989]