In [3]:
import numpy as np
from scipy import signal
import matplotlib.pyplot as plt
import pathlib 
import os

import torch
from torch import nn
from torch.utils.data import DataLoader
device = "cuda" if torch.cuda.is_available() else "cpu"

import PT_files.save_load as sl
from DnCNN_NP.layers  import relu, np_BatchNorm2d

import time 
from collections import OrderedDict
import pdb

In [4]:
PATH = pathlib.Path(os.getenv('PSCRATCH'))
DATA = PATH / 'DESI_dn' /'Model_params'
assert DATA.exists()
# name = '6k_model_wb_e800_lys20_58feat.pth'
name = '2k_model_bs64_e800_ps50_Adam.pth'
# weights = np.load(DATA / name)
weights = torch.load(str(DATA / name))


#Load the actual data that we're working on & print the shape of this data
test_data = sl.NERSC_load('test_data_40%_6000.npy')
sample = test_data[0]
print('Shape of test set=', sample.shape)

samp = sample[0][0][1000:3000, 1000:3000]
samp = samp.reshape((1, 1, 2000, 2000))

Shape of test set= (108, 1, 6000, 6000)


In [5]:
def get_indices(input_data, weights_dict, prefix, stride=1, padding=1):
    get_indices_start = time.perf_counter()

    # Get input size
    
    # Checking to see if a single sample or a batch of samples is given.
    # If batch take the batch_size, in_channels, H, and W
    # If single sample is given reshape so the values above can be calculated
    if len(input_data.shape) == 4:
    
        batch_size, input_channels, height, width = input_data.shape # (N, Cin, Hin, Win)
        
    elif len(input_data.shape) == 3:
        
        input_data = input_data.reshape((1, 1, 2000 , 2000))
        batch_size, input_channels, height, width = input_data.shape # (N, Cin, Hin, Win)
        
    # Load the weights and biases needed for a convolution
    # then take off gpu memory, move to CPU memory,
    # and lastly transform to numpy
    weight = weights_dict[str(prefix) + 'weight']
    weight = weight.detach().cpu().numpy()
    
    bias = weights_dict[str(prefix) + 'bias']
    bias = bias.detach().cpu().numpy()
    
    # Calculate the kernel size and output channels from
    # the loaded weights from above
    kernel_size = weight[0][0].shape
    output_channels = len(weight)
    
    # Calculations for the output H and W dimensions
    height_out = ((height + (2*padding) - (kernel_size[0] - 1) - 1) / stride) + 1
    height_out = int(height_out)
    width_out = ((width + (2*padding) - (kernel_size[1] - 1) - 1) / stride) + 1
    width_out = int(width_out)
    
    
    # ----Compute matrix of index i----

    # Level 1 vector.
    level1 = np.repeat(np.arange(kernel_size[0]), kernel_size[1])
    # Duplicate for the other channels.
    level1 = np.tile(level1, input_channels)
    # Create a vector with an increase by 1 at each level.
    everyLevels = stride * np.repeat(np.arange(height_out), width_out)
    # Create matrix of index i at every levels for each channel.
    i = level1.reshape(-1, 1) + everyLevels.reshape(1, -1)
    
    # ----Compute matrix of index j----
    
    # Slide 1 vector.
    slide1 = np.tile(np.arange(kernel_size[1]), kernel_size[0])
    # Duplicate for the other channels.
    slide1 = np.tile(slide1, input_channels)
    # Create a vector with an increase by 1 at each slide.
    everySlides = stride * np.tile(np.arange(width_out), height_out)
    # Create matrix of index j at every slides for each channel.
    j = slide1.reshape(-1, 1) + everySlides.reshape(1, -1)
    
    # ----Compute matrix of index d----

    # This is to mark delimitation for each channel
    # during multi-dimensional arrays indexing.
    d = np.repeat(np.arange(input_channels), kernel_size[0] * kernel_size[1]).reshape(-1, 1)
    
    get_indices_end = time.perf_counter()
    print('get_indices takes:', get_indices_end-get_indices_start, 'seconds')
    
    return i, j, d

def im2col(input_data, weights_dict, prefix, stride=1, padding=1):
    """
        Transforms our input image into a matrix.

        Parameters:
        - X: input image.
        - HF: filter height.
        - WF: filter width.
        - stride: stride value.
        - pad: padding value.

        Returns:
        -cols: output matrix.
    """
    im2col_start = time.perf_counter()

    if len(input_data.shape) == 4:
    
        batch_size, input_channels, height, width = input_data.shape # (N, Cin, Hin, Win)
        
    elif len(input_data.shape) == 3:
        
        input_data = input_data.reshape((1, 1, 2000 , 2000))
        batch_size, input_channels, height, width = input_data.shape # (N, Cin, Hin, Win)

    # Padding
    input_padded = np.pad(input_data, ((0,0), (0,0), (padding, padding), (padding, padding)), mode='constant')
    i, j, d = get_indices(input_data=input_data, weights_dict=weights_dict, prefix=prefix)
    # Multi-dimensional arrays indexing.
    cols = input_padded[:, d, i, j]
    cols = np.concatenate(cols, axis=-1)
    
    im2col_end = time.perf_counter()
    print('Im2col takes:', im2col_end-im2col_start, 'seconds')
    
    return cols

def np_Conv2d(input_data, weights_dict, prefix):
    """
        Performs a forward convolution.

        Parameters:
        - X : Last conv layer of shape (m, n_C_prev, n_H_prev, n_W_prev).
        Returns:
        - out: previous layer convolved.
    """
    
    conv_start = time.perf_counter()
    if len(input_data.shape) == 4:
    
        batch_size, input_channels, height, width = input_data.shape # (N, Cin, Hin, Win)
        
    elif len(input_data.shape) == 3:
        
        input_data = input_data.reshape((1, 1, 2000 , 2000))
        batch_size, input_channels, height, width = input_data.shape # (N, Cin, Hin, Win)


    output_channels = len(weights_dict[str(prefix) + 'weight']) # num_of_filters
    height_out = int((height + 2 * 1 - 3)/ 1) + 1
    width_out = int((width + 2 * 1 - 3)/ 1) + 1

    X_col = im2col(input_data=input_data, weights_dict=weights_dict, prefix=prefix)
    w_col = weights_dict[str(prefix) + 'weight'].detach().cpu().numpy().reshape((output_channels, -1))
    b_col = weights_dict[str(prefix) + 'bias'].detach().cpu().numpy().reshape(-1, 1)
    # Perform matrix multiplication.
    out = w_col @ X_col + b_col
    # Reshape back matrix to image.
    out = np.array(np.hsplit(out, batch_size)).reshape((batch_size, output_channels, height_out, width_out))
    
    conv_end = time.perf_counter()
    print('Conv takes:', conv_end-conv_start, 'seconds')
    return out

In [6]:
# First layer convolution

# Note: The time for im2col is the time for im2col as well as get_indices
# due to im2col calling get_indices

# Note: This is the same for conv, except it calls im2col, which then calls
# get_indices, thus you need to subtract the time of the previous function
# against the current function to get the correct time
get_indices(input_data=samp, weights_dict=weights, prefix='layers.0.0.');
print()
im2col(input_data=samp, weights_dict=weights, prefix='layers.0.0.');
print()
out = np_Conv2d(input_data=samp, weights_dict=weights, prefix='layers.0.0.');

get_indices takes: 0.0936635509970074 seconds

get_indices takes: 0.09243214500020258 seconds
Im2col takes: 0.27699330099858344 seconds

get_indices takes: 0.09232533899921691 seconds
Im2col takes: 0.3036277600003814 seconds
Conv takes: 0.6068648460022814 seconds


In [5]:
# Second layer convolution
get_indices(input_data=out, weights_dict=weights, prefix='layers.1.0.');
print()
im2col(input_data=out, weights_dict=weights, prefix='layers.1.0.');
print()
out2 = np_Conv2d(input_data=out, weights_dict=weights, prefix='layers.1.0.');

get_indices takes: 4.5244469749995915 seconds

get_indices takes: 4.50654714799748 seconds
Im2col takes: 15.913829680997878 seconds

get_indices takes: 4.469645105997188 seconds
Im2col takes: 15.874881094001466 seconds
Conv takes: 17.774021723998885 seconds


Testing to find the bottleneck.

THe bottleneck is in the slicing of the multi-dimensional arrays indexing

In [6]:
def im2col_testing(input_data, weights_dict, prefix, stride=1, padding=1):
    """
        Transforms our input image into a matrix.

        Parameters:
        - X: input image.
        - HF: filter height.
        - WF: filter width.
        - stride: stride value.
        - pad: padding value.

        Returns:
        -cols: output matrix.
    """
    im2col_start = time.perf_counter()

    if len(input_data.shape) == 4:
    
        batch_size, input_channels, height, width = input_data.shape # (N, Cin, Hin, Win)
        
    elif len(input_data.shape) == 3:
        
        input_data = input_data.reshape((1, 1, 2000 , 2000))
        batch_size, input_channels, height, width = input_data.shape # (N, Cin, Hin, Win)
    
    padding_start = time.perf_counter()
    # Padding
    input_padded = np.pad(input_data, ((0,0), (0,0), (padding, padding), (padding, padding)), mode='constant')
    i, j, d = get_indices(input_data=input_data, weights_dict=weights_dict, prefix=prefix)

    padding_end = time.perf_counter()
    print('Im2col padding takes:', padding_end-padding_start, 'seconds')
    print('This includes get_indices')
    print()
    
    array_indexing_start = time.perf_counter()
    # Multi-dimensional arrays indexing.
    array_padding_start = time.perf_counter()
    cols = input_padded[:, d, i, j] # BOTTLENECK IS HERE
    array_padding_end = time.perf_counter()
    print('Im2col array padding takes:', array_padding_end-array_padding_start, 'seconds')
    print()
    
    col_concat_start = time.perf_counter()
    cols = np.concatenate(cols, axis=-1)
    col_concat_end = time.perf_counter()
    print('Im2col array column concatenation takes:', col_concat_end-col_concat_start, 'seconds')
    print()
    
    array_indexing_end = time.perf_counter()
    print('Im2col array indexing takes:', array_indexing_end-array_indexing_start, 'seconds')
    print()
    
    im2col_end = time.perf_counter()
    print('Im2col takes:', im2col_end-im2col_start, 'seconds')
    
    return cols

In [7]:
im2col_testing(input_data=out, weights_dict=weights, prefix='layers.1.0.');

get_indices takes: 4.626220476999151 seconds
Im2col padding takes: 4.756974407999223 seconds
This includes get_indices

Im2col array padding takes: 10.207006034001097 seconds

Im2col array column concatenation takes: 1.0699494200016488 seconds

Im2col array indexing takes: 11.277642648998153 seconds

Im2col takes: 16.034689907999564 seconds


In [7]:
def im2col_testing(input_data, weights_dict, prefix, stride=1, padding=1):
    """
        Transforms our input image into a matrix.

        Parameters:
        - X: input image.
        - HF: filter height.
        - WF: filter width.
        - stride: stride value.
        - pad: padding value.

        Returns:
        -cols: output matrix.
    """
    im2col_start = time.perf_counter()

    if len(input_data.shape) == 4:
    
        batch_size, input_channels, height, width = input_data.shape # (N, Cin, Hin, Win)
        
    elif len(input_data.shape) == 3:
        
        input_data = input_data.reshape((1, 1, 2000 , 2000))
        batch_size, input_channels, height, width = input_data.shape # (N, Cin, Hin, Win)
    
    # Padding
    input_padded = np.pad(input_data, ((0,0), (0,0), (padding, padding), (padding, padding)), mode='constant')
    i, j, d = get_indices(input_data=input_data, weights_dict=weights_dict, prefix=prefix)
    
    array_indexing_start = time.perf_counter()
    # Multi-dimensional arrays indexing.
    array_slicing_start = time.perf_counter()
    transposed_input_padded = input_padded.T
    cols = transposed_input_padded[:, d, i, j].T
    array_slicing_end = time.perf_counter()
    print('Im2col array slicing takes:', array_slicing_end-array_slicing_start, 'seconds')
    print()    
    cols = np.concatenate(cols, axis=-1)
    
    im2col_end = time.perf_counter()
    print('Im2col takes:', im2col_end-im2col_start, 'seconds')
    
    return cols

In [8]:
im2col_testing(input_data=out, weights_dict=weights, prefix='layers.1.0.');

get_indices takes: 4.650804878998315 seconds


IndexError: index 64 is out of bounds for axis 2 with size 64

# Testing to see how many times 'get_indices' will be called

In [10]:
def get_indices_testing(input_data, weights_dict, prefix, stride=1, padding=1):
    get_indices_start = time.perf_counter()

    # Get input size
    
    # Checking to see if a single sample or a batch of samples is given.
    # If batch take the batch_size, in_channels, H, and W
    # If single sample is given reshape so the values above can be calculated
    if len(input_data.shape) == 4:
    
        batch_size, input_channels, height, width = input_data.shape # (N, Cin, Hin, Win)
        
    elif len(input_data.shape) == 3:
        
        input_data = input_data.reshape((1, 1, 2000 , 2000))
        batch_size, input_channels, height, width = input_data.shape # (N, Cin, Hin, Win)
        
    # Load the weights and biases needed for a convolution
    # then take off gpu memory, move to CPU memory,
    # and lastly transform to numpy
    weight = weights_dict[str(prefix) + 'weight']
    weight = weight.detach().cpu().numpy()
    
    bias = weights_dict[str(prefix) + 'bias']
    bias = bias.detach().cpu().numpy()
    
    # Calculate the kernel size and output channels from
    # the loaded weights from above
    kernel_size = weight[0][0].shape
    output_channels = len(weight)
    
    # Calculations for the output H and W dimensions
    height_out = ((height + (2*padding) - (kernel_size[0] - 1) - 1) / stride) + 1
    height_out = int(height_out)
    width_out = ((width + (2*padding) - (kernel_size[1] - 1) - 1) / stride) + 1
    width_out = int(width_out)
    
    mati_start = time.perf_counter()

    # ----Compute matrix of index i----

    # Level 1 vector.
    level1 = np.repeat(np.arange(kernel_size[0]), kernel_size[1])
    # Duplicate for the other channels.
    level1 = np.tile(level1, input_channels)
    # Create a vector with an increase by 1 at each level.
    everyLevels = stride * np.repeat(np.arange(height_out), width_out)
    # Create matrix of index i at every levels for each channel.
    i = level1.reshape(-1, 1) + everyLevels.reshape(1, -1)
    
    mati_end = time.perf_counter()
    print('get_indices matrix i computation takes:', mati_end-mati_start, 'seconds')

    matj_start = time.perf_counter()
    # ----Compute matrix of index j----
    
    # Slide 1 vector.
    slide1 = np.tile(np.arange(kernel_size[1]), kernel_size[0])
    # Duplicate for the other channels.
    slide1 = np.tile(slide1, input_channels)
    # Create a vector with an increase by 1 at each slide.
    everySlides = stride * np.tile(np.arange(width_out), height_out)
    # Create matrix of index j at every slides for each channel.
    j = slide1.reshape(-1, 1) + everySlides.reshape(1, -1)
    
    matj_end = time.perf_counter()
    print('get_indices matrix j computation takes:', matj_end-matj_start, 'seconds')

    matd_start = time.perf_counter()
    # ----Compute matrix of index d----

    # This is to mark delimitation for each channel
    # during multi-dimensional arrays indexing.
    d = np.repeat(np.arange(input_channels), kernel_size[0] * kernel_size[1]).reshape(-1, 1)
    
    matd_end = time.perf_counter()
    print('get_indices matrix d computation takes:', matd_end-matd_start, 'seconds')
    print()
    
    get_indices_end = time.perf_counter()
    print('get_indices takes:', get_indices_end-get_indices_start, 'seconds')
    
    return i, j, d

In [11]:
last = np_Conv2d(input_data=out2, weights_dict=weights, prefix='layers.18.0.')

get_indices takes: 4.493157625001913 seconds
Im2col takes: 15.974034223996568 seconds
Conv takes: 17.68410379800116 seconds


In [12]:
i,j,d = get_indices(input_data=samp, weights_dict=weights, prefix='layers.0.0.');
i1, j1, d1=get_indices(input_data=out, weights_dict=weights, prefix='layers.1.0.');
i2, j2, d2=get_indices(input_data=out2, weights_dict=weights, prefix='layers.2.0.');

get_indices takes: 0.09129129400025704 seconds
get_indices takes: 4.535640594000142 seconds
get_indices takes: 4.608535564002523 seconds


In [13]:
ilast, jlast, dlast = get_indices(input_data=last, weights_dict=weights, prefix='layers.19.')

get_indices takes: 4.304084053001134 seconds


In [14]:
# print(np.allclose(i,i1))
print(np.allclose(i1,i2))

# print(np.allclose(j,j1))
print(np.allclose(j1,j2))

# print(np.allclose(d,d1))
print(np.allclose(d1,d2))

True
True
True


In [15]:
np.allclose(i,i1)

ValueError: operands could not be broadcast together with shapes (9,4000000) (576,4000000) 

In [18]:
last.shape

(1, 64, 2000, 2000)

In [16]:
np.allclose(i, ilast)

ValueError: operands could not be broadcast together with shapes (9,4000000) (576,4000000) 