In [1]:
import numpy as np
from scipy import signal
import matplotlib.pyplot as plt
import pathlib 
import os

import torch
from torch import nn
from torch.utils.data import DataLoader
device = "cuda" if torch.cuda.is_available() else "cpu"

import PT_files.save_load as sl
from DnCNN_NP.layers  import relu

import time 
from collections import OrderedDict

In [2]:
PATH = pathlib.Path(os.getenv('PSCRATCH'))
DATA = PATH / 'DESI_dn' /'Model_params'
assert DATA.exists()
name = '6k_model_wb_e800_lys20_58feat.pth'

# weights = np.load(DATA / name)
weights = torch.load(str(DATA / name))


#Load the actual data that we're working on & print the shape of this data
test_data = sl.NERSC_load('test_data_40%_6000.npy')
sample = test_data[0]
print('Shape of test set=', sample.shape)

Shape of test set= (108, 1, 6000, 6000)


# **Testing the fftconvolve broadcasting version. Just want to see if it's faster and then see if the results are any better...**

Might be interesting to look at the [numpy-ml package](https://github.com/ddbourgin/numpy-ml/blob/master/numpy_ml/neural_nets/utils/utils.py) to see how they create a 2D Convolution layer.

- They take inspiration from Andrej Karpathy's `im2col.py` file which can be found in these [slides here](http://cs231n.stanford.edu/slides/2016/winter1516_lecture11.pdf). 

- An article that I believe talks about the speed of `im2col.py` is [linked here](https://petewarden.com/2015/04/20/why-gemm-is-at-the-heart-of-deep-learning/). Need to read.

- Lastly, there's a nice package that implements convolutions in a slow (ie. nested for loops) and a fast way (ie. uses im2col idea). Seems like it's fast? That's linked [here](https://github.com/3outeille/CNNumpy/blob/master/src/slow/layers.py)
    - [Blog post](https://hackmd.io/@machine-learning/blog-post-cnnumpy-slow) discussing the `slow` version 
    - [Blog post](https://hackmd.io/@machine-learning/blog-post-cnnumpy-fast) discussing the `fast` version that uses im2col

In [3]:
def np_Conv2d(input_data, weights_dict, prefix, stride=1, padding="same", dilation=1):
    """
    Numpy implementation of the PyTorch Conv2d layer that uses the 
    learned PyTorch weights in the model.
    
    Parameters:
    -----------
    input_data: nd.array
        Input data of shape '(batch_size, in_channels, height, width)'
    weights_dict: OrderedDict
        weights_dict['weight']: torch.Tensor
            Weights tensor of shape '(out_channels, in_channels, kernel_size[0], kernel_size[1])'
        weights_dict['bias']: torch.Tensor
            Bias tensor of shape '(out_channels)'
    stride: int, optional
        The number of entries by which the filter is moved at each step.
        Defaults to 1
    padding: str, optional
        What padding strategy to use for this conv layer. Defaults to "same",
        which pads the layer such that the output has the same height and width
        as the input when the stride = 1. Specifically makes output of
        scipy.correlate2d have same shape as in1. An alternative option is "valid",
        which means no padding is done and the output has smaller dimensions
        than the input.
    dilation: int, optional
        Spacing between kernel elements.
        Defaults to 1.
     
        
    Returns:
    --------
    output: nd.array
        Array output of the convolution step with shape
        `(batch_size, out_channels, out_height, out_width)`.
    
    """
    
    # Checking to see if a single sample or a batch of samples is given.
    # If batch take the batch_size, in_channels, H, and W
    # If single sample is given reshape so the values above can be calculated
    dimensions_start = time.perf_counter()
    if len(input_data.shape) == 4:
    
        batch_size, input_channels, height, width = input_data.shape # (N, Cin, Hin, Win)
        
    elif len(input_data.shape) == 3:
        
        input_data = input_data.reshape((1, 1, 6000 , 6000))
        batch_size, input_channels, height, width = input_data.shape # (N, Cin, Hin, Win)
        
    dimensions_end = time.perf_counter()
    print('Getting input dimensions takes', dimensions_end-dimensions_start, 'seconds')

    # Load the weights and biases needed for a convolution
    # then take off gpu memory, move to CPU memory,
    # and lastly transform to numpy
    loading_start = time.perf_counter()
    weight = weights_dict[str(prefix) + 'weight']
    weight = weight.detach().cpu().numpy()
    
    bias = weights_dict[str(prefix) + 'bias']
    bias = bias.detach().cpu().numpy()
    
    # Calculate the kernel size and output channels from
    # the loaded weights from above
    kernel_size = weight[0][0].shape
    output_channels = len(weight)
    loading_end = time.perf_counter()
    print('Loading the weights takes', loading_end-loading_start, 'seconds')
    
    # Convert string padding into numerical padding
    # Using strings allow for one variable to account for padding & mode (see signal.correlated2d)
    out_dimensions_start = time.perf_counter()
    mode = padding
    if mode == "same":
        padding = 1
    elif mode == "valid":
        padding = 0
    
    # Calculations for the output H and W dimensions
    height_out = ((height + (2*padding) - dilation * (kernel_size[0] - 1) - 1) / stride) + 1
    height_out = int(height_out)
    width_out = ((width + (2*padding) - dilation * (kernel_size[1] - 1) - 1) / stride) + 1
    width_out = int(width_out)

    # Create empty array of correct output dimensions
    # output = np.empty((batch_size, output_channels, height_out, width_out))
    output = np.zeros((batch_size, output_channels, height_out, width_out))
    out_dimensions_end = time.perf_counter()
    print('Getting output dimensions takes', out_dimensions_end-out_dimensions_start, 'seconds')
    
    # Place the cross correlated elements into the newly created 
    # empty array of correct output dimensions
    loop_start = time.perf_counter()
    
    for i in range(batch_size):
        for j in range(output_channels):
            for k in range(input_channels):
                # See PyTorch docs for this eqn: https://pytorch.org/docs/stable/generated/torch.nn.Conv2d.html
                output[i, j, :, :] = bias[j] + signal.fftconvolve(input_data[i][k], weight[j][k][::-1, ::-1], mode=mode)
                # output[i, j, :, :] = bias[j] + signal.correlate2d(input_data[i][k], weight[j][k], mode=mode)

    loop_end = time.perf_counter()
    print('Convolution loop takes', loop_end-loop_start, 'seconds')
    
    return output

In [4]:
np_conv_out = np_Conv2d(input_data=sample[0], weights_dict=weights, prefix='layers.0.0.')

Getting input dimensions takes 5.330002750270069e-06 seconds
Loading the weights takes 0.0003740029933396727 seconds
Getting output dimensions takes 0.0001379629975417629 seconds
Convolution loop takes 46.13671796300332 seconds


In [13]:
weight = weights['layers.0.0.weight'].detach().cpu().numpy()
bias = weights['layers.0.0.bias'].detach().cpu().numpy()
samples = sample[:2]
print(samples.shape)
print(weight.shape)

samples = samples.reshape((2, 1, 1, 6000, 6000))
weight = weight.reshape((1, 1, 58, 3, 3))

broadcasting_start = time.perf_counter()
speed = signal.fftconvolve(samples, weight, mode='same', axes=(3, 4))
broadcasting_end = time.perf_counter()
print('Convolution via broadcasting takes', broadcasting_end-broadcasting_start, 'seconds')

(2, 1, 6000, 6000)
(58, 1, 3, 3)
Convolution via broadcasting takes 49.164740362990415 seconds


In [14]:
speed.shape

(2, 1, 1, 6000, 6000)

# BatchNorm

In [None]:


def np_BatchNorm2d(input_data, prefix, weights_dict, epsilon=1e-5):
    
    x = input_data
    
    gamma = weights_dict[str(prefix) + 'weight'].detach().cpu().numpy().reshape(-1, 1, 1)
    beta = weights_dict[str(prefix) + 'bias'].detach().cpu().numpy().reshape(-1, 1, 1)
    mean = weights_dict[str(prefix) + 'running_mean'].detach().cpu().numpy().reshape(-1, 1, 1)
    var = weights_dict[str(prefix) + 'running_var'].detach().cpu().numpy().reshape(-1, 1, 1)
        
        
    output = ((x - mean) / np.sqrt(var + epsilon)) * gamma + beta
    return output