In [1]:
import numpy as np
from scipy import signal
import matplotlib.pyplot as plt
import pathlib 
import os
import pickle

import torch
from torch import nn
from torch.utils.data import DataLoader
device = "cuda" if torch.cuda.is_available() else "cpu"

import PT_files.save_load as sl
from DnCNN_NP.layers  import relu, np_BatchNorm2d

import time 
from collections import OrderedDict
import pdb

**The goal of this notebook is to implement the optimization we found in notebook `11B_testing_im2col_times`: we call `get_indices` 3 times and save the 3 resulting index matrices, then use `np.ravel_multi_index()` in `im2col`.**

**NOTE: This is for a 2k by 2k image instead of the full 6k by 6k image.**

This notebook creates the respective index matrices so that another notebook can load and reuse them instead of re-creating them on every call. This is expected to save a lot of time.

In [2]:
# Loading data & weights dictionary

# os.environ[...] raises KeyError('PSCRATCH') when the variable is unset,
# which is easier to diagnose than pathlib's TypeError on Path(None).
PATH = pathlib.Path(os.environ['PSCRATCH'])
DATA = PATH / 'DESI_dn' /'Model_params'
assert DATA.exists(), f'Model parameter directory not found: {DATA}'
# name = '6k_model_wb_e800_lys20_58feat.pth'
name = '2k_model_bs64_e800_ps50_Adam.pth'
# weights = np.load(DATA / name)
# Every consumer in this notebook converts the tensors with
# .detach().cpu().numpy(), so map them to the CPU at load time; this also lets
# a checkpoint saved on a GPU be opened on a CPU-only kernel.
weights = torch.load(str(DATA / name), map_location='cpu')


#Load the actual data that we're working on & print the shape of this data
test_data = sl.NERSC_load('test_data_40%_6000.npy')
sample = test_data[0]
print('Shape of test set=', sample.shape)

# Work on a 2000 x 2000 crop of the first image, reshaped to (N, C, H, W).
samp = sample[0][0][1000:3000, 1000:3000]
samp = samp.reshape((1, 1, 2000, 2000))

Shape of test set= (108, 1, 6000, 6000)


Need to call this three times:
1. First for the untransformed input. (1 channel -> 64 channels)
2. For the middle layers (64 channels -> 64 channels)
3. For the last layer (64 channels -> 1 channel)

In [3]:
def get_indices(input_data, weights_dict, prefix, stride=1, padding=1):
    """
        Builds the index matrices that im2col uses to gather every
        receptive field of a 2D convolution with one indexing operation.

        Parameters:
        -----------
        input_data: nd.array
            Input of shape (N, Cin, Hin, Win), or a single sample of shape
            (Cin, Hin, Win). Only its shape is read.
        weights_dict: OrderedDict
            Dictionary containing the PyTorch trained weights for every
            layer of the model
        prefix: str
            The prefix that picks out the specific layer's weights to be used
            E.g. prefix='layers.0.0.'. Only the kernel size (last two
            dimensions) of `prefix + 'weight'` is read.
        stride: int
            Step between neighbouring receptive fields.
        padding: int
            Padding the caller adds on each side of the H and W axes.

        Returns:
        --------
        i: row indices into the padded input, shape (Cin*kH*kW, Hout*Wout).
        j: column indices into the padded input, same shape as i.
        d: channel indices, shape (Cin*kH*kW, 1).

        Raises:
        -------
        ValueError: if input_data is neither 3D nor 4D.
    """
    get_indices_start = time.perf_counter()

    # A single sample (Cin, Hin, Win) is promoted to a batch of one instead of
    # being forced into a fixed 2000 x 2000 shape.
    if input_data.ndim == 3:
        input_data = input_data[np.newaxis]
    elif input_data.ndim != 4:
        raise ValueError(
            'input_data must have shape (N, C, H, W) or (C, H, W), got '
            + str(input_data.shape))
    batch_size, input_channels, height, width = input_data.shape  # (N, Cin, Hin, Win)

    # Only the kernel size is needed, so read it from the weight's shape
    # without copying the tensor to host memory.
    kernel_h, kernel_w = tuple(weights_dict[str(prefix) + 'weight'].shape[-2:])

    # Calculations for the output H and W dimensions.
    height_out = (height + 2 * padding - (kernel_h - 1) - 1) // stride + 1
    width_out = (width + 2 * padding - (kernel_w - 1) - 1) // stride + 1

    # ----Compute matrix of index i----

    # Row offset of every kernel element, repeated for each input channel.
    level1 = np.tile(np.repeat(np.arange(kernel_h), kernel_w), input_channels)
    # Top row of every receptive field, one entry per output pixel.
    everyLevels = stride * np.repeat(np.arange(height_out), width_out)
    i = level1.reshape(-1, 1) + everyLevels.reshape(1, -1)

    # ----Compute matrix of index j----

    # Column offset of every kernel element, repeated for each input channel.
    slide1 = np.tile(np.tile(np.arange(kernel_w), kernel_h), input_channels)
    # Left column of every receptive field, one entry per output pixel.
    everySlides = stride * np.tile(np.arange(width_out), height_out)
    j = slide1.reshape(-1, 1) + everySlides.reshape(1, -1)

    # ----Compute matrix of index d----

    # Channel that each kernel element reads from; the trailing axis of
    # length 1 lets it broadcast against i and j.
    d = np.repeat(np.arange(input_channels), kernel_h * kernel_w).reshape(-1, 1)

    get_indices_end = time.perf_counter()
    print('get_indices takes:', get_indices_end-get_indices_start, 'seconds')

    return i, j, d

We need to define `im2col` & `np_Conv2d` because we need the output of `np_Conv2d` for the first layer, intermediate layers, and last layer, so that we have the correct shapes for the indices (via `get_indices`) that will be used in `im2col` and thus `np_Conv2d`. `im2col2` and `np_Conv2d2` are versions of these functions that, instead of creating the `i, j, d` index matrices, take the already-created ones, thus saving ~4 seconds in each intermediate layer of the model, which would otherwise call `get_indices` for every Conv layer.

In [4]:
def im2col(input_data, weights_dict, prefix, stride=1, padding=1):
    """
        Transforms our input image into a matrix whose columns are the
        flattened receptive fields of the convolution.

        Parameters:
        -----------
        input_data: nd.array
            The input image(s), shape (N, Cin, Hin, Win) or a single
            sample of shape (Cin, Hin, Win)
        weights_dict: OrderedDict
            Dictionary containing the PyTorch trained weights for every 
            layer of the model
        prefix: str
            The prefix that picks out the specific layer's weights to be used
            E.g. prefix='layers.0.0.' would be the first layers convolutional
            weights and bias's
        stride: int
            Step between neighbouring receptive fields.
        padding: int
            Zero padding added on each side of the H and W axes.

        Returns:
        --------
        cols: output matrix, the per-sample column blocks concatenated
            along the last axis.

        Raises:
        -------
        ValueError: if input_data is neither 3D nor 4D.
    """
    im2col_start = time.perf_counter()

    # Promote a single sample to a batch of one rather than forcing a fixed
    # 2000 x 2000 shape.
    if input_data.ndim == 3:
        input_data = input_data[np.newaxis]
    elif input_data.ndim != 4:
        raise ValueError(
            'input_data must have shape (N, C, H, W) or (C, H, W), got '
            + str(input_data.shape))

    # Padding
    pad_width = ((0, 0), (0, 0), (padding, padding), (padding, padding))
    input_padded = np.pad(input_data, pad_width, mode='constant')
    # Forward stride and padding so the indices always match the padded array;
    # previously get_indices silently fell back to its own defaults.
    i, j, d = get_indices(input_data=input_data, weights_dict=weights_dict,
                          prefix=prefix, stride=stride, padding=padding)
    # Multi-dimensional arrays indexing: one (Cin*kH*kW, Hout*Wout) block
    # per sample, joined along the last axis.
    cols = input_padded[:, d, i, j]
    cols = np.concatenate(cols, axis=-1)
    
    im2col_end = time.perf_counter()
    print('Im2col takes:', im2col_end-im2col_start, 'seconds')
    
    return cols

def im2col2(input_data, weights_dict, prefix, layer_matrices,  stride=1, padding=1):
    """
        Transforms our input image into a matrix, using index matrices that
        were computed ahead of time instead of calling get_indices.

        Parameters:
        -----------
        input_data: nd.array
            The input image(s), shape (N, Cin, Hin, Win) or a single
            sample of shape (Cin, Hin, Win)
        weights_dict: OrderedDict
            Unused here; kept so the signature parallels im2col.
        prefix: str
            Unused here; kept so the signature parallels im2col.
        layer_matrices: tuple
            The (i, j, d) index matrices, in the order get_indices returns
            them, built for an input with this channel count and size.
        stride: int
            Unused here; the stride is already baked into layer_matrices.
        padding: int
            Zero padding added on each side of the H and W axes. Must match
            the padding layer_matrices was built for.

        Returns:
        --------
        cols2: output matrix, the per-sample column blocks concatenated
            along the last axis.

        Raises:
        -------
        ValueError: if input_data is neither 3D nor 4D.
    """
    im2col_start = time.perf_counter()

    # Promote a single sample to a batch of one rather than forcing a fixed
    # 2000 x 2000 shape.
    if input_data.ndim == 3:
        input_data = input_data[np.newaxis]
    elif input_data.ndim != 4:
        raise ValueError(
            'input_data must have shape (N, C, H, W) or (C, H, W), got '
            + str(input_data.shape))
    batch_size = input_data.shape[0]

    # Padding
    pad_width = ((0, 0), (0, 0), (padding, padding), (padding, padding))
    input_padded = np.pad(input_data, pad_width, mode='constant')
    i, j, d = layer_matrices
    # Flat offsets of every gathered element inside ONE padded sample
    # (batch index fixed at 0).
    idx = np.ravel_multi_index(([0], d, i, j), input_padded.shape)
    per_sample = input_padded.reshape(batch_size, -1)
    if batch_size == 1:
        # Single gather, no extra copy.
        cols2 = per_sample[0][idx]
    else:
        # Apply the same offsets to every sample; the old code gathered
        # sample 0 only, whatever the batch size.
        cols2 = np.concatenate([flat[idx] for flat in per_sample], axis=-1)
    
    im2col_end = time.perf_counter()
    print('Im2col takes:', im2col_end-im2col_start, 'seconds')
    
    return cols2

def np_Conv2d(input_data, weights_dict, prefix):
    """
        Performs a forward convolution (stride 1, padding 1) as a single
        matrix multiplication on the im2col matrix.

        Parameters:
        -----------
        input_data: nd.array
            Previous layer's output, shape (N, Cin, Hin, Win) or a single
            sample of shape (Cin, Hin, Win)
        weights_dict: OrderedDict
            Dictionary containing the PyTorch trained weights for every
            layer of the model
        prefix: str
            The prefix that picks out the specific layer's weights to be used
            E.g. prefix='layers.0.0.'

        Returns:
        --------
        out: previous layer convolved, shape (N, Cout, Hout, Wout).

        Raises:
        -------
        ValueError: if input_data is neither 3D nor 4D.
    """
    
    conv_start = time.perf_counter()
    # Promote a single sample to a batch of one rather than forcing a fixed
    # 2000 x 2000 shape.
    if input_data.ndim == 3:
        input_data = input_data[np.newaxis]
    elif input_data.ndim != 4:
        raise ValueError(
            'input_data must have shape (N, C, H, W) or (C, H, W), got '
            + str(input_data.shape))
    batch_size, input_channels, height, width = input_data.shape # (N, Cin, Hin, Win)

    # Convert the layer's tensors once and reuse them below.
    weight = weights_dict[str(prefix) + 'weight'].detach().cpu().numpy()
    bias = weights_dict[str(prefix) + 'bias'].detach().cpu().numpy()
    output_channels = weight.shape[0] # num_of_filters
    kernel_h, kernel_w = weight.shape[-2:]

    # im2col is called with its defaults (stride=1, padding=1); the kernel
    # size comes from the weights instead of being assumed to be 3 x 3.
    height_out = height + 2 * 1 - kernel_h + 1
    width_out = width + 2 * 1 - kernel_w + 1

    X_col = im2col(input_data=input_data, weights_dict=weights_dict, prefix=prefix)
    w_col = weight.reshape((output_channels, -1))
    b_col = bias.reshape(-1, 1)
    # Perform matrix multiplication.
    out = w_col @ X_col + b_col
    # Reshape back matrix to image: X_col holds one column block per sample.
    out = np.array(np.hsplit(out, batch_size)).reshape((batch_size, output_channels, height_out, width_out))
    
    conv_end = time.perf_counter()
    print('Conv takes:', conv_end-conv_start, 'seconds')
    return out

def np_Conv2d2(input_data, weights_dict, prefix, layers_matrices):
    """
        Performs a forward convolution (stride 1, padding 1) using
        index matrices computed ahead of time.

        Parameters:
        -----------
        input_data: nd.array
            Previous layer's output, shape (N, Cin, Hin, Win) or a single
            sample of shape (Cin, Hin, Win)
        weights_dict: OrderedDict
            Dictionary containing the PyTorch trained weights for every
            layer of the model
        prefix: str
            The prefix that picks out the specific layer's weights to be used
            E.g. prefix='layers.0.0.'
        layers_matrices: tuple
            The (i, j, d) index matrices handed to im2col2.

        Returns:
        --------
        out: previous layer convolved, shape (N, Cout, Hout, Wout).

        Raises:
        -------
        ValueError: if input_data is neither 3D nor 4D.
    """
    
    conv_start = time.perf_counter()
    # Promote a single sample to a batch of one rather than forcing a fixed
    # 2000 x 2000 shape.
    if input_data.ndim == 3:
        input_data = input_data[np.newaxis]
    elif input_data.ndim != 4:
        raise ValueError(
            'input_data must have shape (N, C, H, W) or (C, H, W), got '
            + str(input_data.shape))
    batch_size, input_channels, height, width = input_data.shape # (N, Cin, Hin, Win)

    # Convert the layer's tensors once and reuse them below.
    weight = weights_dict[str(prefix) + 'weight'].detach().cpu().numpy()
    bias = weights_dict[str(prefix) + 'bias'].detach().cpu().numpy()
    output_channels = weight.shape[0] # num_of_filters
    kernel_h, kernel_w = weight.shape[-2:]

    # im2col2 is called with its defaults (stride=1, padding=1); the kernel
    # size comes from the weights instead of being assumed to be 3 x 3.
    height_out = height + 2 * 1 - kernel_h + 1
    width_out = width + 2 * 1 - kernel_w + 1

    X_col = im2col2(input_data=input_data, weights_dict=weights_dict, prefix=prefix, layer_matrices=layers_matrices)
    w_col = weight.reshape((output_channels, -1))
    b_col = bias.reshape(-1, 1)
    # Perform matrix multiplication.
    out = w_col @ X_col + b_col
    # Reshape back matrix to image: X_col holds one column block per sample.
    out = np.array(np.hsplit(out, batch_size)).reshape((batch_size, output_channels, height_out, width_out))
    
    conv_end = time.perf_counter()
    print('Conv takes:', conv_end-conv_start, 'seconds')
    return out

Creating the correct shapes/values of the intermediate arrays that are necessary
for creating the intermediate and final index matrices.

These use the original functions that take a long time to process
(ie. the unoptimized versions)


In [5]:
# Run the first three layers and the last conv layer with the original
# (unoptimized) np_Conv2d, which rebuilds the index matrices on every call.
# The resulting arrays are used by later cells only for their shapes (as
# inputs to get_indices / im2col2_save) and as test inputs.
#
# These calls are slow: see the recorded timings below (~16-19 s per
# 64-channel convolution).

# First layer: conv + ReLU on the 1-channel crop.
conv0 = np_Conv2d(input_data=samp, weights_dict=weights, prefix='layers.0.0.')
out0 = relu(conv0)

# Second layer: conv + batch norm + ReLU.
conv1 = np_Conv2d(input_data=out0, weights_dict=weights, prefix='layers.1.0.')
batch1 = np_BatchNorm2d(x=conv1, weights_dict=weights, prefix='layers.1.1.')
out1 = relu(batch1)

# Third layer: conv + batch norm + ReLU.
conv2 = np_Conv2d(input_data=out1, weights_dict=weights, prefix='layers.2.0.')
batch2 = np_BatchNorm2d(x=conv2, weights_dict=weights, prefix='layers.2.1.')
out2 = relu(batch2)

# Last layer: get_indices reads only the input's shape (and the kernel size),
# not its values, so any intermediate output with the right shape can stand in
# for the real input of 'layers.19.'; out2 is used here instead of running
# the layers in between.
conv19 = np_Conv2d(input_data=out2, weights_dict=weights, prefix='layers.19.')


get_indices takes: 0.20318904699524865 seconds
Im2col takes: 0.38422512699617073 seconds
Conv takes: 0.6892922529950738 seconds
get_indices takes: 4.506992716982495 seconds
Im2col takes: 16.17006884800503 seconds
Conv takes: 18.831207142968196 seconds
Batch takes 0.5197017129976302 seconds
get_indices takes: 4.58227440295741 seconds
Im2col takes: 16.320511292957235 seconds
Conv takes: 18.291405173018575 seconds
Batch takes 0.5131230020197108 seconds
get_indices takes: 4.383848374010995 seconds
Im2col takes: 15.872206142987125 seconds
Conv takes: 16.21529218700016 seconds


In [10]:
# Compare the last layer's input (out2) and output (conv19) shapes with the
# shape of its weight tensor and its number of filters.
print(out2.shape)
print(conv19.shape)
print(weights['layers.19.weight'].shape)
print(len(weights['layers.19.weight']))

(1, 64, 2000, 2000)
(1, 1, 2000, 2000)
torch.Size([1, 64, 3, 3])
1


Creation of the first index matrix (1 C -> 64 C), the intermediate
index matrix (64 C -> 64 C), and the final index matrix (64 C -> 1 C).

For the intermediate index matrix we need the shape of the input data, but
because the first layer transforms the shape of the input data, we need 
to run the first layer of the model to get the correct shape of the data
that will be used for creating the index matrix.

NOTE: If the values of the last index matrix don't work, we'll need to create the whole model and use the final output to create the correct `i_last, j_last, d_last` index matrices.

**NOTE: It seems it does NOT work. Thus we'll need to run the whole model toward the end except for the last layer to get the correct indices**

In [6]:
# Creation of the first index matrix (1 C -> 64 C), the intermediate
# index matrix (64 C -> 64 C) and the last index matrix.
#
# get_indices takes its channel count and spatial size from the shape of
# input_data, so each call must be given an array shaped like the INPUT of
# the layer it is meant for. That is why the first layer has to be run
# before the intermediate matrices can be built.


# First layer: input is the 1-channel crop.
i_start, j_start, d_start = get_indices(input_data=samp, weights_dict=weights, prefix='layers.0.0.')
index_mat_start = (i_start, j_start, d_start)

# Second layer: input is the 64-channel output of the first layer.
i_mid, j_mid, d_mid = get_indices(input_data=out0, weights_dict=weights, prefix='layers.1.0.')
index_mat_mid = (i_mid, j_mid, d_mid)

# Last layer
# NOTE(review): conv19 is the OUTPUT of 'layers.19.' and has 1 channel (its
# printed shape is (1, 1, 2000, 2000)), while the layer's weight has shape
# [1, 64, 3, 3], i.e. it consumes 64 channels. The indices built here
# therefore cover a single input channel, which is presumably why the "last"
# matrices were found not to work. Passing out2 (shape (1, 64, 2000, 2000))
# looks like the intended input, but the later cells that pair conv19 with the
# 'last' indices would have to change together with this line -- confirm.
i_last, j_last, d_last = get_indices(input_data=conv19, weights_dict=weights, prefix='layers.19.')
index_mat_last = (i_last, j_last, d_last)

get_indices takes: 0.09487872099271044 seconds
get_indices takes: 4.8536589840077795 seconds
get_indices takes: 0.09628924203570932 seconds


In [11]:
# Bundle the three (i, j, d) tuples under one key per layer group.
index_matrices = {'start': index_mat_start, 'mid': index_mat_mid, 'last': index_mat_last}

In [12]:
# NOTE(review): no visible cell writes 'index_matrices_2k.pkl' and `ind` is
# not read afterwards -- presumably a leftover check of a previously saved
# copy of index_matrices; confirm or remove.
ind = sl.NERSC_load(name='index_matrices_2k.pkl')

Creating a new version of `im2col` that takes the saved index matrices created by `get_indices` and returns the output of `np.ravel_multi_index()`. It is run 3 times (once per `get_indices` call: 1C->64C, 64C->64C, 64C->1C), and the three results are combined into a dictionary that is then saved to the `Data` directory.

In [13]:
# def im2col2_save(input_data, prefix, layer_matrices,  stride=1, padding=1):
#     """
#         Transforms our input image into a matrix.

#         Parameters:
#         -----------
#         input_data: nd.array
#             The input image(s)
#         weights_dict: OrderedDict
#             Dictionary containing the PyTorch trained weights for every 
#             layer of the model
#         prefix: str
#             Prefix to use to identify which multi-dimensional array indexing 
#             array we're saving (ie. first, mid, last). Similar to the naming
#             convetion we have for the individual matrix indices from 
#             get_indices

#         Returns:
#         --------
#         cols: output matrix.
#     """
#     im2col_start = time.perf_counter()

#     if len(input_data.shape) == 4:
    
#         batch_size, input_channels, height, width = input_data.shape # (N, Cin, Hin, Win)
        
#     elif len(input_data.shape) == 3:
        
#         input_data = input_data.reshape((1, 1, 2000 , 2000))
#         batch_size, input_channels, height, width = input_data.shape # (N, Cin, Hin, Win)

#     # Padding
#     input_padded = np.pad(input_data, ((0,0), (0,0), (padding, padding), (padding, padding)), mode='constant')
#     i, j, d = layer_matrices
#     # Multi-dimensional arrays indexing.
#     idx = np.ravel_multi_index(([0], d, i, j), input_padded.shape)
    
    
#     sl.NERSC_save(name='array_idx_' +str(prefix), data=idx)
    
    
#     cols2 = input_padded.reshape(-1)[idx]  
    
#     im2col_end = time.perf_counter()
#     print('Im2col takes:', im2col_end-im2col_start, 'seconds')
    
#     return cols2

In [None]:
# cols2 = im2col2_save(input_data=samp, prefix='start_2k.pkl', layer_matrices=index_matrices['start'])

In [22]:
def im2col2_save(input_data, layer_matrices,  stride=1, padding=1):
    """
        Converts the (i, j, d) index matrices of one layer into flat
        indices into the zero-padded, flattened input, so that im2col can
        later gather with a single 1D lookup.

        Parameters:
        -----------
        input_data: nd.array
            Array shaped like the layer's input, (N, Cin, Hin, Win) or a
            single sample (Cin, Hin, Win). Only its shape is read.
        layer_matrices: tuple
            The (i, j, d) index matrices, in the order get_indices returns
            them.
        stride: int
            Unused here; the stride is already baked into layer_matrices.
        padding: int
            Zero padding on each side of the H and W axes. Must match the
            padding layer_matrices was built for.

        Returns:
        --------
        idx: integer array with the broadcast shape of (d, i, j), holding
            offsets into one flattened padded sample (batch index 0).

        Raises:
        -------
        ValueError: if input_data is neither 3D nor 4D.
    """
    # Promote a single sample to a batch of one rather than forcing a fixed
    # 2000 x 2000 shape.
    if input_data.ndim == 3:
        input_data = input_data[np.newaxis]
    elif input_data.ndim != 4:
        raise ValueError(
            'input_data must have shape (N, C, H, W) or (C, H, W), got '
            + str(input_data.shape))
    batch_size, input_channels, height, width = input_data.shape # (N, Cin, Hin, Win)

    # Only the padded SHAPE is needed, so compute it directly instead of
    # materialising a padded copy of the whole array.
    padded_shape = (batch_size, input_channels, height + 2 * padding, width + 2 * padding)
    i, j, d = layer_matrices
    # Multi-dimensional arrays indexing, flattened to 1D offsets.
    idx = np.ravel_multi_index(([0], d, i, j), padded_shape)
    
    return idx

In [29]:
# Flatten each layer group's (i, j, d) matrices into 1D gather indices.
idx_start = im2col2_save(input_data=samp, layer_matrices=index_matrices['start'])
idx_mid = im2col2_save(input_data=out0, layer_matrices=index_matrices['mid'])
# NOTE(review): conv19 is the 1-channel OUTPUT of 'layers.19.'; that layer's
# input has 64 channels, so 'last' as built here covers one input channel
# only. Presumably both the index matrices and input_data should come from a
# 64-channel array such as out2 -- confirm, and change together with the cells
# that build and consume the 'last' entry.
idx_last = im2col2_save(input_data=conv19, layer_matrices=index_matrices['last'])

im2col_layer_dict = {'start': idx_start, 'mid':idx_mid, 'last': idx_last}

# NOTE(review): the output recorded for this cell reads "You are not on
# NERSC?", which suggests the save may not have been written in this
# environment -- verify the file exists before relying on it.
sl.NERSC_save(name='im2col_layer_dict_2k.pkl', data=im2col_layer_dict)

You are not on NERSC?


In [30]:
# Read the flat-index dictionary back from disk; the comparison cell further
# down passes this loaded copy (not the in-memory im2col_layer_dict) to
# np_Conv2d2.
im2col_mat = sl.NERSC_load(name='im2col_layer_dict_2k.pkl')

Testing the outputs of the original functions for a 2D Convolution (ie. `get_indices`, `im2col`, and `np_Conv2d`) and comparing them to the modified versions of the functions that take the saved index matrices and saved sliced matrices from `im2col2` and `np_Conv2d2`.

In [45]:
def im2col(input_data, weights_dict, prefix, stride=1, padding=1):
    """
        Transforms our input image into a matrix whose columns are the
        flattened receptive fields of the convolution.

        Parameters:
        -----------
        input_data: nd.array
            The input image(s), shape (N, Cin, Hin, Win) or a single
            sample of shape (Cin, Hin, Win)
        weights_dict: OrderedDict
            Dictionary containing the PyTorch trained weights for every 
            layer of the model
        prefix: str
            The prefix that picks out the specific layer's weights to be used
            E.g. prefix='layers.0.0.' would be the first layers convolutional
            weights and bias's
        stride: int
            Step between neighbouring receptive fields.
        padding: int
            Zero padding added on each side of the H and W axes.

        Returns:
        --------
        cols: output matrix, the per-sample column blocks concatenated
            along the last axis.

        Raises:
        -------
        ValueError: if input_data is neither 3D nor 4D.
    """
    im2col_start = time.perf_counter()

    # Promote a single sample to a batch of one rather than forcing a fixed
    # 2000 x 2000 shape.
    if input_data.ndim == 3:
        input_data = input_data[np.newaxis]
    elif input_data.ndim != 4:
        raise ValueError(
            'input_data must have shape (N, C, H, W) or (C, H, W), got '
            + str(input_data.shape))

    # Padding
    pad_width = ((0, 0), (0, 0), (padding, padding), (padding, padding))
    input_padded = np.pad(input_data, pad_width, mode='constant')
    # Forward stride and padding so the indices always match the padded array;
    # previously get_indices silently fell back to its own defaults.
    i, j, d = get_indices(input_data=input_data, weights_dict=weights_dict,
                          prefix=prefix, stride=stride, padding=padding)
    # Multi-dimensional arrays indexing: one (Cin*kH*kW, Hout*Wout) block
    # per sample, joined along the last axis.
    cols = input_padded[:, d, i, j]
    cols = np.concatenate(cols, axis=-1)
    
    im2col_end = time.perf_counter()
    print('Im2col takes:', im2col_end-im2col_start, 'seconds')
    
    return cols

def im2col2(input_data, im2col_mat, col_prefix, stride=1, padding=1):
    """
        Transforms our input image into a matrix, gathering with flat
        indices that were computed ahead of time.

        Parameters:
        -----------
        input_data: nd.array
            The input image(s), shape (N, Cin, Hin, Win) or a single
            sample of shape (Cin, Hin, Win)
        im2col_mat: dict
            Maps a layer-group key to the flat index array for that group,
            each holding offsets into ONE flattened padded sample.
        col_prefix: str
            Key into im2col_mat, e.g. 'start', 'mid' or 'last'.
        stride: int
            Unused here; the stride is already baked into the indices.
        padding: int
            Zero padding added on each side of the H and W axes. Must match
            the padding the indices were built for.

        Returns:
        --------
        cols2: output matrix, the per-sample column blocks concatenated
            along the last axis.

        Raises:
        -------
        ValueError: if input_data is neither 3D nor 4D.
    """
    im2col_start = time.perf_counter()

    # Promote a single sample to a batch of one rather than forcing a fixed
    # 2000 x 2000 shape.
    if input_data.ndim == 3:
        input_data = input_data[np.newaxis]
    elif input_data.ndim != 4:
        raise ValueError(
            'input_data must have shape (N, C, H, W) or (C, H, W), got '
            + str(input_data.shape))
    batch_size = input_data.shape[0]

    # Padding
    pad_width = ((0, 0), (0, 0), (padding, padding), (padding, padding))
    input_padded = np.pad(input_data, pad_width, mode='constant')
    # Multi-dimensional arrays indexing, already flattened to 1D offsets.
    idx = im2col_mat[str(col_prefix)]
    per_sample = input_padded.reshape(batch_size, -1)
    if batch_size == 1:
        # Single gather, no extra copy.
        cols2 = per_sample[0][idx]
    else:
        # Apply the same offsets to every sample; the old code gathered
        # sample 0 only, whatever the batch size.
        cols2 = np.concatenate([flat[idx] for flat in per_sample], axis=-1)
    
    im2col_end = time.perf_counter()
    print('Im2col takes:', im2col_end-im2col_start, 'seconds')
    
    return cols2

In [46]:
# Time the gather for each layer group using the precomputed flat indices.
cols_start = im2col2(input_data=samp,
                     im2col_mat=im2col_layer_dict,
                     col_prefix='start')

# out1 stands in for any 64-channel intermediate input; the 'mid' indices were
# built from out0.
# assumes out1 has the same shape as out0 -- TODO confirm
cols_mid = im2col2(input_data=out1,
                     im2col_mat=im2col_layer_dict,
                     col_prefix='mid')


# NOTE(review): conv19 is the 1-channel OUTPUT of 'layers.19.', not its
# 64-channel input, so this exercises a 1-channel gather (its recorded time
# matches 'start', not 'mid'). Presumably a 64-channel input such as out2 with
# matching 'last' indices is what the real last layer needs -- confirm.
cols_last = im2col2(input_data=conv19,
                     im2col_mat=im2col_layer_dict,
                     col_prefix='last')

Im2col takes: 0.039900068019051105 seconds
Im2col takes: 2.5748618410434574 seconds
Im2col takes: 0.04051882104249671 seconds


In [47]:
def np_Conv2d(input_data, weights_dict, prefix):
    """
        Performs a forward convolution (stride 1, padding 1) as a single
        matrix multiplication on the im2col matrix.

        Parameters:
        -----------
        input_data: nd.array
            Previous layer's output, shape (N, Cin, Hin, Win) or a single
            sample of shape (Cin, Hin, Win)
        weights_dict: OrderedDict
            Dictionary containing the PyTorch trained weights for every
            layer of the model
        prefix: str
            The prefix that picks out the specific layer's weights to be used
            E.g. prefix='layers.0.0.'

        Returns:
        --------
        out: previous layer convolved, shape (N, Cout, Hout, Wout).

        Raises:
        -------
        ValueError: if input_data is neither 3D nor 4D.
    """
    
    conv_start = time.perf_counter()
    # Promote a single sample to a batch of one rather than forcing a fixed
    # 2000 x 2000 shape.
    if input_data.ndim == 3:
        input_data = input_data[np.newaxis]
    elif input_data.ndim != 4:
        raise ValueError(
            'input_data must have shape (N, C, H, W) or (C, H, W), got '
            + str(input_data.shape))
    batch_size, input_channels, height, width = input_data.shape # (N, Cin, Hin, Win)

    # Convert the layer's tensors once and reuse them below.
    weight = weights_dict[str(prefix) + 'weight'].detach().cpu().numpy()
    bias = weights_dict[str(prefix) + 'bias'].detach().cpu().numpy()
    output_channels = weight.shape[0] # num_of_filters
    kernel_h, kernel_w = weight.shape[-2:]

    # im2col is called with its defaults (stride=1, padding=1); the kernel
    # size comes from the weights instead of being assumed to be 3 x 3.
    height_out = height + 2 * 1 - kernel_h + 1
    width_out = width + 2 * 1 - kernel_w + 1

    X_col = im2col(input_data=input_data, weights_dict=weights_dict, prefix=prefix)
    w_col = weight.reshape((output_channels, -1))
    b_col = bias.reshape(-1, 1)
    # Perform matrix multiplication.
    out = w_col @ X_col + b_col
    # Reshape back matrix to image: X_col holds one column block per sample.
    out = np.array(np.hsplit(out, batch_size)).reshape((batch_size, output_channels, height_out, width_out))
    
    conv_end = time.perf_counter()
    print('Conv takes:', conv_end-conv_start, 'seconds')
    return out

def np_Conv2d2(input_data, weights_dict, prefix, im2col_mat, col_prefix):
    """
        Performs a forward convolution (stride 1, padding 1) using
        flat gather indices computed ahead of time.

        Parameters:
        -----------
        input_data: nd.array
            Previous layer's output, shape (N, Cin, Hin, Win) or a single
            sample of shape (Cin, Hin, Win)
        weights_dict: OrderedDict
            Dictionary containing the PyTorch trained weights for every
            layer of the model
        prefix: str
            The prefix that picks out the specific layer's weights to be used
            E.g. prefix='layers.0.0.'
        im2col_mat: dict
            Maps a layer-group key to its flat index array (see im2col2).
        col_prefix: str
            Key into im2col_mat, e.g. 'start', 'mid' or 'last'.

        Returns:
        --------
        out: previous layer convolved, shape (N, Cout, Hout, Wout).

        Raises:
        -------
        ValueError: if input_data is neither 3D nor 4D.
    """
    
    conv_start = time.perf_counter()
    # Promote a single sample to a batch of one rather than forcing a fixed
    # 2000 x 2000 shape.
    if input_data.ndim == 3:
        input_data = input_data[np.newaxis]
    elif input_data.ndim != 4:
        raise ValueError(
            'input_data must have shape (N, C, H, W) or (C, H, W), got '
            + str(input_data.shape))
    batch_size, input_channels, height, width = input_data.shape # (N, Cin, Hin, Win)

    # Convert the layer's tensors once and reuse them below.
    weight = weights_dict[str(prefix) + 'weight'].detach().cpu().numpy()
    bias = weights_dict[str(prefix) + 'bias'].detach().cpu().numpy()
    output_channels = weight.shape[0] # num_of_filters
    kernel_h, kernel_w = weight.shape[-2:]

    # im2col2 is called with its defaults (stride=1, padding=1); the kernel
    # size comes from the weights instead of being assumed to be 3 x 3.
    height_out = height + 2 * 1 - kernel_h + 1
    width_out = width + 2 * 1 - kernel_w + 1

    X_col = im2col2(input_data=input_data, im2col_mat=im2col_mat, col_prefix=str(col_prefix))
    w_col = weight.reshape((output_channels, -1))
    b_col = bias.reshape(-1, 1)
    # Perform matrix multiplication.
    out = w_col @ X_col + b_col
    # Reshape back matrix to image: X_col holds one column block per sample.
    out = np.array(np.hsplit(out, batch_size)).reshape((batch_size, output_channels, height_out, width_out))
    
    conv_end = time.perf_counter()
    print('Conv takes:', conv_end-conv_start, 'seconds')
    return out

In [48]:
# Check that the precomputed-index path reproduces the reference convolution
# for the first layer (1 C -> 64 C).
# NOTE(review): this rebinds conv1 and conv2, which an earlier cell used for
# the second- and third-layer conv outputs; re-running cells out of order will
# see the first-layer results under those names.
conv1 = np_Conv2d(input_data=samp, weights_dict=weights, prefix='layers.0.0.')
conv2 = np_Conv2d2(input_data=samp,
                   weights_dict=weights,
                   prefix='layers.0.0.',
                   im2col_mat=im2col_mat,
                   col_prefix='start')

# Only the 'start' group is verified here; 'mid' and 'last' are not compared.
np.allclose(conv1, conv2)

get_indices takes: 0.08942202798789367 seconds
Im2col takes: 0.2713753590360284 seconds
Conv takes: 0.5717813199735247 seconds
Im2col takes: 0.04423292598221451 seconds
Conv takes: 0.3409007480368018 seconds


True