In [1]:
import numpy as np
from scipy import signal
import matplotlib.pyplot as plt
import pathlib 
import os
import pickle

import torch
from torch import nn
from torch.utils.data import DataLoader
device = "cuda" if torch.cuda.is_available() else "cpu"

import PT_files.save_load as sl
from DnCNN_NP.layers  import relu, np_BatchNorm2d

import time 
from collections import OrderedDict
import pdb

**The goal of this notebook is to reduce the runtime of the NumPy forward implementation of the PyTorch denoising algorithm. It does this by calling `get_indices` only 3 times (once per distinct layer shape), converting each result into flat indices with `np.ravel_multi_index()`, and saving those flat indices so that `im2col` can reuse them.**

This reduces the runtime considerably because we only call `get_indices` 3 times instead of once for every `conv` layer the model calls. In our case, that is 20 times. It then saves further time because `im2col` gathers the data with those 3 saved flat indices (built with `np.ravel_multi_index()`) instead of redoing the multi-dimensional indexing of the older version on every call.

**NOTE: This is for a 2020x2020 patch due to memory issues trying to use the full 6k by 6k image.**
- The patches are 2020x2020 instead of 2000x2000 because we found that there are artifacts with the 2000x2000 patches. If we use patches of 2020x2020 (i.e. a patch with a 10 pixel border) and pad the full FVC image with a 10 pixel border, we are able to use these larger patches and then crop the extra 10 pixel border, which leaves no artifacts at all! This reduces the runtime/scale of the afterburner function, which is what we want to do.
- If you need to re-save any of the intermediate layer indices for the NumPy implementation of the code, you'll need to reshape the image data, as well as update all the `np.reshape` calls found within the subsequent code.
    - The reason I don't do this is because I do not suspect I'll ever need to do this more than maybe once after the code is on `fpoffline`.
        - That said I'll probably hate myself for this, but the code is pretty much all written anyway.

In [2]:
# Locate the weights file under $PSCRATCH and load the trained weights dict.
# Fail with a readable message when the variable is missing instead of the
# TypeError that pathlib.Path(None) would raise.
scratch_dir = os.getenv('PSCRATCH')
if scratch_dir is None:
    raise RuntimeError('The PSCRATCH environment variable is not set; '
                       'cannot locate the model weights.')
PATH = pathlib.Path(scratch_dir)
DATA = PATH / 'DESI_dn' / 'Model_params'
assert DATA.exists(), f'Model parameter directory not found: {DATA}'
name = '2k_model_bs64_e800_ps50_Adam.pth'
# map_location remaps the stored tensors onto whatever device is available
# here, so weights saved from a GPU run still load on a CPU-only node.
weights = torch.load(str(DATA / name), map_location=device)


# Load the actual data that we're working on & print the shape of this data
test_data = sl.NERSC_load('test_data_40%_6000.npy')
sample = test_data[0]
print('Shape of test set=', sample.shape)

# Cut a single 2020 x 2020 patch out of the first image and give it the
# (N, C, H, W) layout that the convolution functions below expect.
samp = sample[0][0][1000:3020, 1000:3020]
samp = samp.reshape((1, 1, 2020, 2020))

Shape of test set= (108, 1, 6000, 6000)


Need to call this three times and save the outputs:
1. First for the untransformed input. (1 channel -> 64 channels)
2. For the middle layers (64 channels -> 64 channels)
3. For the last layer (64 channels -> 1 channel)

**NOTE:** Code/Code Blog where I got this numpy im2col conversion is [here](https://hackmd.io/@machine-learning/blog-post-cnnumpy-fast)

In [3]:
def get_indices(input_data, weights_dict, prefix, stride=1, padding=1):
    """
        Builds the index matrices that im2col uses for one convolution layer.

        Only the shape of `input_data` and the kernel size of the selected
        layer's weight are used; no pixel values are read.

        Parameters:
        -----------
        input_data: nd.array
            The input image(s), shaped (N, Cin, H, W), (Cin, H, W) or (H, W).
        weights_dict: OrderedDict
            Dictionary containing the PyTorch trained weights for every
            layer of the model
        prefix: str
            The prefix that picks out the specific layer's weights to be used
            E.g. prefix='layers.0.0.' would be the first layers convolutional
            weights and bias's
        stride: int
            Step between neighbouring kernel positions.
        padding: int
            Number of pixels added on every side of H and W before the
            kernel is applied.

        Returns:
        --------
        i: nd.array of shape (Cin*kH*kW, Hout*Wout)
            Row index into the padded input for every kernel element
            and output position.
        j: nd.array of shape (Cin*kH*kW, Hout*Wout)
            Column index into the padded input.
        d: nd.array of shape (Cin*kH*kW, 1)
            Channel index of every kernel element.

        Raises:
        -------
        ValueError: if input_data is not 2-, 3- or 4-dimensional.
    """
    get_indices_start = time.perf_counter()

    # Accept a batch, a single multi-channel sample or a bare image by
    # left-padding the shape with 1s up to (N, Cin, H, W). The previous
    # hard-coded reshape to 2020 x 2020 only worked for that one patch size.
    shape = tuple(input_data.shape)
    if len(shape) not in (2, 3, 4):
        raise ValueError('input_data must have 2, 3 or 4 dimensions, got shape '
                         + str(shape))
    _, input_channels, height, width = (1,) * (4 - len(shape)) + shape

    # The kernel size is the trailing two dimensions of the layer's weight
    # (Cout, Cin, kH, kW). Nothing else from the layer is needed here.
    kernel_h, kernel_w = weights_dict[str(prefix) + 'weight'].shape[-2:]
    kernel_size = (kernel_h, kernel_w)

    # Calculations for the output H and W dimensions.
    height_out = (height + 2 * padding - kernel_size[0]) // stride + 1
    width_out = (width + 2 * padding - kernel_size[1]) // stride + 1

    # ----Compute matrix of index i----

    # Row offset of every kernel element, e.g. [0,0,0,1,1,1,2,2,2] for 3x3.
    level1 = np.repeat(np.arange(kernel_size[0]), kernel_size[1])
    # Duplicate for the other channels.
    level1 = np.tile(level1, input_channels)
    # Top row of the kernel window for every output position.
    everyLevels = stride * np.repeat(np.arange(height_out), width_out)
    # Create matrix of index i at every levels for each channel.
    i = level1.reshape(-1, 1) + everyLevels.reshape(1, -1)

    # ----Compute matrix of index j----

    # Column offset of every kernel element, e.g. [0,1,2,0,1,2,0,1,2] for 3x3.
    slide1 = np.tile(np.arange(kernel_size[1]), kernel_size[0])
    # Duplicate for the other channels.
    slide1 = np.tile(slide1, input_channels)
    # Left column of the kernel window for every output position.
    everySlides = stride * np.tile(np.arange(width_out), height_out)
    # Create matrix of index j at every slides for each channel.
    j = slide1.reshape(-1, 1) + everySlides.reshape(1, -1)

    # ----Compute matrix of index d----

    # This is to mark delimitation for each channel
    # during multi-dimensional arrays indexing.
    d = np.repeat(np.arange(input_channels), kernel_size[0] * kernel_size[1]).reshape(-1, 1)

    get_indices_end = time.perf_counter()
    print('get_indices takes:', get_indices_end-get_indices_start, 'seconds')

    return i, j, d

def im2col(input_data, weights_dict, prefix, stride=1, padding=1):
    """
        Transforms our input image into a matrix.

        Parameters:
        -----------
        input_data: nd.array
            The input image(s), shaped (N, Cin, H, W), (Cin, H, W) or (H, W).
        weights_dict: OrderedDict
            Dictionary containing the PyTorch trained weights for every 
            layer of the model
        prefix: str
            The prefix that picks out the specific layer's weights to be used
            E.g. prefix='layers.0.0.' would be the first layers convolutional
            weights and bias's
        stride: int
            Step between neighbouring kernel positions.
        padding: int
            Number of zero pixels added on every side of H and W.

        Returns:
        --------
        cols: output matrix of shape (Cin*kH*kW, N*Hout*Wout).

        Raises:
        -------
        ValueError: if input_data is not 2-, 3- or 4-dimensional.
    """
    im2col_start = time.perf_counter()

    # Bring a single sample or a bare image into (N, Cin, H, W) layout
    # whatever its size, rather than assuming a 2020 x 2020 patch.
    input_data = np.asarray(input_data)
    if input_data.ndim not in (2, 3, 4):
        raise ValueError('input_data must have 2, 3 or 4 dimensions, got shape '
                         + str(input_data.shape))
    input_data = input_data.reshape((1,) * (4 - input_data.ndim) + input_data.shape)

    # Padding
    input_padded = np.pad(input_data, ((0,0), (0,0), (padding, padding), (padding, padding)), mode='constant')
    # stride and padding must be forwarded: the indices are only valid for
    # the padded array when both sides agree on them.
    i, j, d = get_indices(input_data=input_data, weights_dict=weights_dict, prefix=prefix,
                          stride=stride, padding=padding)
    # Multi-dimensional arrays indexing -> (N, Cin*kH*kW, Hout*Wout).
    cols = input_padded[:, d, i, j]
    # Lay the samples of the batch side by side along the last axis.
    cols = np.concatenate(cols, axis=-1)
    
    im2col_end = time.perf_counter()
    print('Im2col takes:', im2col_end-im2col_start, 'seconds')
    
    return cols


def np_Conv2d(input_data, weights_dict, prefix, stride=1, padding=1):
    """
        Performs a forward convolution as one matrix multiplication.

        Parameters:
        -----------
        input_data: nd.array
            Output of the previous layer, shaped (N, Cin, H, W),
            (Cin, H, W) or (H, W).
        weights_dict: OrderedDict
            Dictionary containing the PyTorch trained weights for every
            layer of the model
        prefix: str
            The prefix that picks out the specific layer's weights to be used
            E.g. prefix='layers.0.0.'
        stride: int
            Step between neighbouring kernel positions.
        padding: int
            Number of zero pixels added on every side of H and W.

        Returns:
        --------
        out: nd.array of shape (N, Cout, Hout, Wout), the previous layer
            convolved.

        Raises:
        -------
        ValueError: if input_data is not 2-, 3- or 4-dimensional.
    """
    
    conv_start = time.perf_counter()

    # Bring a single sample or a bare image into (N, Cin, H, W) layout
    # whatever its size, rather than assuming a 2020 x 2020 patch.
    input_data = np.asarray(input_data)
    if input_data.ndim not in (2, 3, 4):
        raise ValueError('input_data must have 2, 3 or 4 dimensions, got shape '
                         + str(input_data.shape))
    input_data = input_data.reshape((1,) * (4 - input_data.ndim) + input_data.shape)
    batch_size, _, height, width = input_data.shape

    # Move the layer's weight off the GPU (if it is there) and into NumPy.
    weight = weights_dict[str(prefix) + 'weight'].detach().cpu().numpy()
    output_channels = len(weight) # num_of_filters
    kernel_h, kernel_w = weight.shape[-2:]
    # Output size derived from the actual kernel instead of assuming 3x3.
    height_out = (height + 2 * padding - kernel_h) // stride + 1
    width_out = (width + 2 * padding - kernel_w) // stride + 1

    X_col = im2col(input_data=input_data, weights_dict=weights_dict, prefix=prefix,
                   stride=stride, padding=padding)
    w_col = weight.reshape((output_channels, -1))
    b_col = weights_dict[str(prefix) + 'bias'].detach().cpu().numpy().reshape(-1, 1)
    
    print('X_col.shape = ', X_col.shape)
    print('w_col.shape = ', w_col.shape)
    # Perform matrix multiplication.
    out = w_col @ X_col + b_col
    # Reshape back matrix to image: split the columns per sample, then
    # unfold every sample's columns into (Cout, Hout, Wout).
    out = np.array(np.hsplit(out, batch_size)).reshape((batch_size, output_channels, height_out, width_out))
    
    conv_end = time.perf_counter()
    print('Conv takes:', conv_end-conv_start, 'seconds')
    return out

We need the baseline `im2col` and `np_Conv2d` defined above because the outputs of `np_Conv2d` for the first layer and an intermediate layer give us correctly shaped inputs, from which `get_indices` builds the index matrices for the first, intermediate, and last layers (the ones later used in `im2col` and thus `np_Conv2d`). `im2col2` and `np_Conv2d2` are versions of these functions that, instead of creating the `i, j, d` index matrices on every call, use the precomputed indices. This saves ~4 seconds per conv layer in the intermediate layers of the model, which would otherwise call `get_indices` each time.

In [4]:
from collections import OrderedDict


def _strip_param_name(key):
    """Blank out the parameter-name part of a state-dict key.

    E.g. both 'layers.0.0.weight' and 'layers.0.0.bias' become 'layers.0.0.'.
    """
    for token in ('weight', 'bias', 'running_mean', 'running_var', 'num_batches_tracked'):
        key = key.replace(token, '')
    return key


# Several parameters share one layer prefix, so stripping the parameter
# names yields repeated prefixes. Passing them through an OrderedDict keeps
# only the first occurrence of each while preserving the layer order.
layers_list = list(OrderedDict.fromkeys(_strip_param_name(key) for key in weights.keys()))

We run the model through the baseline NumPy functions, so we get the correct shape of every layer. **BUT,** we only need the shape of the output from the first layer (1C -> 64C), an intermediate layer (64C->64C), and the last layer (64C->1C). We just need to instantiate the 1st, 2nd, and last layer without having to worry about the other layers of the model because those layers don't change the dimensionality of the actual data.

In [5]:
# Run the slow baseline convolution once per distinct layer type. The
# resulting arrays provide the input shapes from which the intermediate and
# final index matrices are built further down.

# 1st layer: applied to the raw patch
conv0 = np_Conv2d(samp, weights, 'layers.0.0.')
# 2nd layer, standing in for every intermediate layer
conv1 = np_Conv2d(conv0, weights, 'layers.1.0.')
# Final layer
conv = np_Conv2d(conv1, weights, 'layers.19.')

get_indices takes: 0.09712557913735509 seconds
Im2col takes: 0.28065531607717276 seconds
X_col.shape =  (9, 4080400)
w_col.shape =  (64, 9)
Conv takes: 0.6142878120299429 seconds
get_indices takes: 4.695406832033768 seconds
Im2col takes: 16.266699539031833 seconds
X_col.shape =  (576, 4080400)
w_col.shape =  (64, 576)
Conv takes: 18.508120679995045 seconds
get_indices takes: 4.827232877025381 seconds
Im2col takes: 16.416074029169977 seconds
X_col.shape =  (576, 4080400)
w_col.shape =  (1, 576)
Conv takes: 16.77784837805666 seconds


In [6]:
# Build one (i, j, d) index triple per distinct layer shape.
#
# The intermediate and last layers see the output of an earlier layer, not
# the raw patch, so their index matrices are built from conv0 / conv1
# (computed with the baseline functions) rather than from samp.

# 1st layer: indices for the raw patch
index_mat_start = get_indices(input_data=samp, weights_dict=weights, prefix='layers.0.0.')
i_start, j_start, d_start = index_mat_start

# 2nd layer: indices for the output of the first layer
index_mat_mid = get_indices(input_data=conv0, weights_dict=weights, prefix='layers.1.0.')
i_mid, j_mid, d_mid = index_mat_mid

# Final layer: indices for the output of an intermediate layer
index_mat_last = get_indices(input_data=conv1, weights_dict=weights, prefix='layers.19.')
i_last, j_last, d_last = index_mat_last

get_indices takes: 0.09534529317170382 seconds
get_indices takes: 4.82147181709297 seconds
get_indices takes: 4.38091963599436 seconds


In [7]:
# Group the three (i, j, d) triples under the keys used to look them up below.
index_matrices = dict(start=index_mat_start, mid=index_mat_mid, last=index_mat_last)
# Saving the raw index triples is currently disabled:
# sl.NERSC_save(data=index_matrices, name='index_matrices_2k.pkl')

In [8]:
def im2col2_save(input_data, layer_matrices,  stride=1, padding=1):
    """
        Converts the (i, j, d) index matrices of one layer into flat
        indices into the flattened, padded input.

        Only the shape of `input_data` is used. The flat indices address
        the first sample of the batch (batch index 0) only.

        Parameters:
        -----------
        input_data: nd.array
            The input image(s), shaped (N, Cin, H, W), (Cin, H, W) or (H, W).
        layer_matrices: tuple
            The (i, j, d) index matrices for this layer, as returned by
            get_indices.
        stride: int
            Unused here; kept so the signature matches the other im2col
            functions.
        padding: int
            Number of pixels added on every side of H and W. Must be the
            value the index matrices were built with.

        Returns:
        --------
        idx: nd.array of flat indices, with the broadcast shape of
            (d, i, j), i.e. (Cin*kH*kW, Hout*Wout).

        Raises:
        -------
        ValueError: if input_data is not 2-, 3- or 4-dimensional.
    """
    # Left-pad the shape with 1s up to (N, Cin, H, W). This accepts a bare
    # image or a single sample of any size (the 2-D branch used to raise a
    # NameError because of a misspelt variable).
    shape = tuple(input_data.shape)
    if len(shape) not in (2, 3, 4):
        raise ValueError('input_data must have 2, 3 or 4 dimensions, got shape '
                         + str(shape))
    batch_size, input_channels, height, width = (1,) * (4 - len(shape)) + shape

    # Only the shape of the padded array is needed, so compute it directly
    # rather than materialising a padded copy of the data.
    padded_shape = (batch_size, input_channels, height + 2 * padding, width + 2 * padding)

    i, j, d = layer_matrices
    # Flat equivalent of the multi-dimensional index [0, d, i, j].
    idx = np.ravel_multi_index(([0], d, i, j), padded_shape)
    
    return idx

Saving the arrays as `np.int32` instead of `np.int64` (done by default) to reduce the memory size of the index arrays.

In [9]:
def _as_int32_indices(idx):
    """Downcast flat indices to int32, refusing to wrap around silently."""
    # A plain astype(np.int32) wraps values above 2**31 - 1 without any
    # warning, which would corrupt the gather for larger padded inputs.
    if idx.size and idx.max() > np.iinfo(np.int32).max:
        raise OverflowError('Flat index ' + str(idx.max()) + ' does not fit in int32; '
                            'keep these indices as int64.')
    return idx.astype(np.int32)


idx_start = _as_int32_indices(im2col2_save(input_data=samp, layer_matrices=index_matrices['start']))
idx_mid = _as_int32_indices(im2col2_save(input_data=conv0, layer_matrices=index_matrices['mid']))
idx_last = _as_int32_indices(im2col2_save(input_data=conv1, layer_matrices=index_matrices['last']))


# One flat-index array per distinct layer shape, keyed the way im2col2 looks them up.
im2col_layer_dict = {'start': idx_start, 'mid':idx_mid, 'last': idx_last}
sl.NERSC_save(name='im2col_2k_indices.pkl', data=im2col_layer_dict)

You are not on NERSC?


In [10]:
def im2col2(input_data, im2col_mat, col_prefix, stride=1, padding=1):
    """
        Transforms our input image into a matrix using precomputed flat
        indices instead of rebuilding the (i, j, d) index matrices.

        The flat indices address the flattened, padded input, so they are
        only valid for the input shape and padding they were built with.

        Parameters:
        -----------
        input_data: nd.array
            The input image(s), shaped (N, Cin, H, W), (Cin, H, W) or (H, W).
        im2col_mat: dict
            Maps a layer key to that layer's array of flat indices.
        col_prefix: str
            Key into im2col_mat that selects the indices to use
            (e.g. 'start', 'mid' or 'last').
        stride: int
            Unused here; the stride is already baked into the indices.
        padding: int
            Number of zero pixels added on every side of H and W.

        Returns:
        --------
        cols: output matrix, with the same shape as the selected index array.

        Raises:
        -------
        ValueError: if input_data is not 2-, 3- or 4-dimensional.
    """
    im2col_start = time.perf_counter()

    # Bring a single sample or a bare image into (N, Cin, H, W) layout
    # whatever its size. The old reshape assumed 2000 x 2000 here, while
    # the rest of the notebook works on 2020 x 2020 patches.
    input_data = np.asarray(input_data)
    if input_data.ndim not in (2, 3, 4):
        raise ValueError('input_data must have 2, 3 or 4 dimensions, got shape '
                         + str(input_data.shape))
    input_data = input_data.reshape((1,) * (4 - input_data.ndim) + input_data.shape)

    # Padding
    input_padded = np.pad(input_data, ((0,0), (0,0), (padding, padding), (padding, padding)), mode='constant')
    # Gather from the flattened padded input with the precomputed flat indices.
    idx = im2col_mat[str(col_prefix)]
    cols2 = input_padded.reshape(-1)[idx]  
    
    im2col_end = time.perf_counter()
    print('Im2col takes:', im2col_end-im2col_start, 'seconds')
    
    return cols2

def np_Conv2d2(input_data, weights_dict, prefix, im2col_mat, col_prefix, stride=1, padding=1):
    """
        Performs a forward convolution using precomputed im2col indices.

        Parameters:
        -----------
        input_data: nd.array
            Output of the previous layer, shaped (N, Cin, H, W),
            (Cin, H, W) or (H, W).
        weights_dict: OrderedDict
            Dictionary containing the PyTorch trained weights for every
            layer of the model
        prefix: str
            The prefix that picks out the specific layer's weights to be used
            E.g. prefix='layers.0.0.'
        im2col_mat: dict
            Maps a layer key to that layer's array of flat indices.
        col_prefix: str
            Key into im2col_mat that selects the indices to use
            (e.g. 'start', 'mid' or 'last').
        stride: int
            Stride the indices were built with; used for the output size.
        padding: int
            Padding the indices were built with.

        Returns:
        --------
        out: nd.array of shape (N, Cout, Hout, Wout), the previous layer
            convolved.

        Raises:
        -------
        ValueError: if input_data is not 2-, 3- or 4-dimensional.
    """
    
    conv_start = time.perf_counter()

    # Bring a single sample or a bare image into (N, Cin, H, W) layout
    # whatever its size. The old reshape assumed 2000 x 2000 here, while
    # the rest of the notebook works on 2020 x 2020 patches.
    input_data = np.asarray(input_data)
    if input_data.ndim not in (2, 3, 4):
        raise ValueError('input_data must have 2, 3 or 4 dimensions, got shape '
                         + str(input_data.shape))
    input_data = input_data.reshape((1,) * (4 - input_data.ndim) + input_data.shape)
    batch_size, _, height, width = input_data.shape

    # Move the layer's weight off the GPU (if it is there) and into NumPy.
    weight = weights_dict[str(prefix) + 'weight'].detach().cpu().numpy()
    output_channels = len(weight) # num_of_filters
    kernel_h, kernel_w = weight.shape[-2:]
    # Output size derived from the actual kernel instead of assuming 3x3.
    height_out = (height + 2 * padding - kernel_h) // stride + 1
    width_out = (width + 2 * padding - kernel_w) // stride + 1

    X_col = im2col2(input_data=input_data, im2col_mat=im2col_mat, col_prefix=str(col_prefix),
                    stride=stride, padding=padding)
    w_col = weight.reshape((output_channels, -1))
    b_col = weights_dict[str(prefix) + 'bias'].detach().cpu().numpy().reshape(-1, 1)
    # Perform matrix multiplication.
    out = w_col @ X_col + b_col
    # Reshape back matrix to image: split the columns per sample, then
    # unfold every sample's columns into (Cout, Hout, Wout).
    out = np.array(np.hsplit(out, batch_size)).reshape((batch_size, output_channels, height_out, width_out))
    
    conv_end = time.perf_counter()
    print('Conv takes:', conv_end-conv_start, 'seconds')
    return out

In [11]:
# Run the first layer through the precomputed-index path.
conv2 = np_Conv2d2(input_data=samp,
                   weights_dict=weights,
                   prefix='layers.0.0.',
                   im2col_mat=im2col_layer_dict,
                   col_prefix='start')

# The fast path must reproduce the baseline first-layer output (conv0, from
# np_Conv2d). The absolute tolerance is scaled to the data's magnitude so
# that values near zero do not trip the relative check.
_conv_atol = 1e-6 * max(float(np.abs(conv0).max()), 1.0)
np.testing.assert_allclose(conv2, conv0, rtol=1e-5, atol=_conv_atol)

Im2col takes: 0.06225603702478111 seconds
Conv takes: 0.3629320561885834 seconds


In [12]:
# First layer: convolution on the raw patch, followed by ReLU
output = relu(np_Conv2d2(input_data=samp,
                         weights_dict=weights,
                         prefix=layers_list[0],
                         im2col_mat=im2col_layer_dict,
                         col_prefix='start'))

# Everything between the first and the last entry of layers_list:
# prefixes ending in '0.' are convolutions, prefixes ending in '1.' are
# batch norms (each followed by ReLU).
for layer_prefix in layers_list[1:-1]:

    if layer_prefix.endswith('0.'):
        output = np_Conv2d2(input_data=output,
                            weights_dict=weights,
                            prefix=layer_prefix,
                            im2col_mat=im2col_layer_dict,
                            col_prefix='mid')

    elif layer_prefix.endswith('1.'):
        normalised = np_BatchNorm2d(x=output,
                                    weights_dict=weights,
                                    prefix=layer_prefix)
        output = relu(normalised)

# Last layer: a single convolution using the 'last' indices
output1 = np_Conv2d2(input_data=output,
                     weights_dict=weights,
                     prefix=layers_list[-1],
                     im2col_mat=im2col_layer_dict,
                     col_prefix='last')

# Subtract the network output from the input patch.
resid_img = samp - output1

Im2col takes: 0.0671454300172627 seconds
Conv takes: 0.369020669022575 seconds
Im2col takes: 3.9114704439416528 seconds
Conv takes: 5.694785895990208 seconds
Batch takes 0.5338732560630888 seconds
Im2col takes: 3.9073991880286485 seconds
Conv takes: 5.773855175124481 seconds
Batch takes 0.5317959820386022 seconds
Im2col takes: 3.9137064630631357 seconds
Conv takes: 5.842254780931398 seconds
Batch takes 0.5307894931174815 seconds
Im2col takes: 3.909976133843884 seconds
Conv takes: 5.806535869836807 seconds
Batch takes 0.5313932751305401 seconds
Im2col takes: 3.9199831520672888 seconds
Conv takes: 5.945462896022946 seconds
Batch takes 0.5291646278928965 seconds
Im2col takes: 3.9153064938727766 seconds
Conv takes: 5.970251590013504 seconds
Batch takes 0.5310004490893334 seconds
Im2col takes: 3.908295843983069 seconds
Conv takes: 5.932337681995705 seconds
Batch takes 0.5303960929159075 seconds
Im2col takes: 3.9216194341424853 seconds
Conv takes: 5.980398446088657 seconds
Batch takes 0.5300