In [3]:
import numpy as np
# import joblib
# from sklearn.model_selection import train_test_split

import torch
from torch import nn
from torch.utils.data import DataLoader, TensorDataset

from torchinfo import summary
# import torchmetrics

# import utils
# import engine

device = 'cuda' if torch.cuda.is_available() else 'cpu'
device

'cpu'

In [4]:
class CRNN(nn.Module):
    def __init__(self, # bug in dropout_specification of gru, do contiguous memory after permutate
                 
                # weight initialization was using their ref 46

                 # for input they had 40 mel bands, time steps varied
                 input_shape, # (Frequecies, Timesteps, Channels)

                 # cnn layers, do grid search 1, 2, 3, 4
                 #      the kernel, best 5, next 3
                 #      the filters used by them 96
                 #      pool sizes 2x1 non overlapping stride
                 #      dropout after cnn was used with 0.25
                 cnn_layers=3,kernels=(5, 5, 5), filters=(96, 96, 96), pool_sizes=(2, 2, 2),
                 cnn_dropout=(0.25, 0.25, 0.25),
                # rnn layers, do grid search 1, 2, 3
                #       their dropout was done in custom with ref 35
                #           I have just used the same constant for prob, and implemented the inbuilt pytorch dropout for rnns
                 rnn_layers=2, rnn_hidden=(256, 256),
                 rnn_dropout=(0.25,),

                # fnn layer hidden units is taken from their ref 21, where it shares a lot of simularity, 
                #           no real basis for the hidden units though, as the arch in ref 21 is very different
                #           do grid search then 128, 256, 512, 1024, 2048, 4096(overkill)
                #       dropout i just used the same constant
                 fnn_layers=1, fnn_hidden=(1024,), fnn_dropout=(0.25,),
                 output_shape=5):
        
        super(CRNN, self).__init__()

        self.time_steps = input_shape[1]

        # =============================================================================

        self.cnn_blocks = nn.Sequential()
        in_channels = input_shape[2]
        freq = input_shape[0]

        for i in range(cnn_layers):
            self.cnn_blocks.add_module(f'cnn_{i}', nn.Conv2d(in_channels, filters[i], kernel_size=kernels[i], padding="same"))
            self.cnn_blocks.add_module(f'relu_{i}', nn.ReLU())
            self.cnn_blocks.add_module(f'pool_{i}', nn.MaxPool2d(kernel_size=(pool_sizes[i], 1), stride=(pool_sizes[i], 1)))
            self.cnn_blocks.add_module(f'batchnorm_{i}', nn.BatchNorm2d(filters[i]))
            self.cnn_blocks.add_module(f'dropout_{i}', nn.Dropout(cnn_dropout[i]))

            in_channels = filters[i]
            freq = (freq - (pool_sizes[i] - 1) -1)//pool_sizes[i] + 1

        self.freq = freq
        self.channels = filters[-1]

        # ==============================================================================

        self.rnn_input_size = self.freq * self.channels

        self.rnn = nn.Sequential()
        input_size = self.rnn_input_size
        for i in range(rnn_layers):
            self.rnn.add_module(f'gru_{i}', nn.GRU(input_size, rnn_hidden[i], batch_first=True, dropout=rnn_dropout[i] if i < rnn_layers - 1 else 0))
            input_size = rnn_hidden[i]

        # ===============================================================================

        input_size = self.time_steps*rnn_hidden[-1]

        self.fnn_blocks = nn.Sequential()
        for i in range(fnn_layers):
            self.fnn_blocks.add_module(f'fc_{i}', nn.Linear(input_size, fnn_hidden[i]))
            self.fnn_blocks.add_module(f'relu_{i}', nn.ReLU())
            self.fnn_blocks.add_module(f'batchnorm_{i}', nn.BatchNorm1d(fnn_hidden[i]))
            self.fnn_blocks.add_module(f'dropout_{i}', nn.Dropout(fnn_dropout[i]))
            input_size = fnn_hidden[i]

        self.output_layer = nn.Linear(input_size, output_shape)

    def forward(self, x): 

        x = x.permute(0, 3, 1, 2) # (B, C_in, F, T)
        
          
        # print("cnn input: ", x.shape)
        for name, layer in self.cnn_blocks.named_children(): # outs = # (B, C_out, F', T)
            x = layer(x)
            # print(f"{name}: ", x.shape)
        
        # print("after conv: ", x.shape)

        B, C_out, F_prime, T = x.size()
        x = x.permute(0, 3, 1, 2)  # (B, T, C_out, F')

        x = x.reshape(B, T, F_prime * C_out)  # (B, T, F' * C_out)

        # print("for rnn: ", x.shape)

        for name, layer in self.rnn.named_children(): # outs: (B, T, H_out)
            x, _ = layer(x)
            # print(f"{name} ", x.shape)

        B, T, H_out = x.size()
        # print("after rnn: ", x.shape)

        x = x.reshape(B, T*H_out)

        # print("for fnn: ", x.shape)

        x = self.fnn_blocks(x)

        # print("after fnn: ", x.shape)

        x = self.output_layer(x)

        # print("outs: ", x.shape)

        return x

In [5]:
# taking 40 bands, arbitary time steps
test = CRNN(input_shape=(40, 128, 1), output_shape=5)
summary(test, input_size=(64, 40, 128, 1), col_names=['input_size', 'output_size', 'num_params'])

# NOTE: main contributor to the parameter number is: concatenation of outputs from gru and feeding to linear layer
# need reduction methodology here



Layer (type:depth-idx)                   Input Shape               Output Shape              Param #
CRNN                                     [64, 40, 128, 1]          [64, 5]                   --
├─Sequential: 1-1                        --                        --                        --
│    └─Conv2d: 2-1                       [64, 1, 40, 128]          [64, 96, 40, 128]         2,496
│    └─ReLU: 2-2                         [64, 96, 40, 128]         [64, 96, 40, 128]         --
│    └─MaxPool2d: 2-3                    [64, 96, 40, 128]         [64, 96, 20, 128]         --
│    └─BatchNorm2d: 2-4                  [64, 96, 20, 128]         [64, 96, 20, 128]         192
│    └─Dropout: 2-5                      [64, 96, 20, 128]         [64, 96, 20, 128]         --
│    └─Conv2d: 2-6                       [64, 96, 20, 128]         [64, 96, 20, 128]         230,496
│    └─ReLU: 2-7                         [64, 96, 20, 128]         [64, 96, 20, 128]         --
│    └─MaxPool2d: 2-8     

In [None]:
# <class 'torch.Tensor'>
# after conv:  torch.Size([64, 256, 12, 88])
# for rnn:  torch.Size([64, 88, 3072])
# after rnn:  torch.Size([64, 88, 256])
# ---------------------------------------------------------------------------
# RuntimeError    

In [10]:
class CRNN2(nn.Module):
    def __init__(self, # bug in dropout_specification of gru, do contiguous memory after permutate
                 
                # weight initialization was using their ref 46

                 # for input they had 40 mel bands, time steps varied
                 input_shape, # (Frequecies, Timesteps, Channels)

                 # cnn layers, do grid search 1, 2, 3, 4
                 #      the kernel, best 5, next 3
                 #      the filters used by them 96
                 #      pool sizes 2x1 non overlapping stride
                 #      dropout after cnn was used with 0.25
                 cnn_layers=3,kernels=(5, 5, 5), filters=(96, 96, 96), pool_sizes=(2, 2, 2),
                 cnn_dropout=(0.25, 0.25, 0.25),
                # rnn layers, do grid search 1, 2, 3
                #       their dropout was done in custom with ref 35
                #           I have just used the same constant for prob, and implemented the inbuilt pytorch dropout for rnns
                 rnn_layers=2, rnn_hidden=(256, 256),
                 rnn_dropout=(0.25,),

                # fnn layer hidden units is taken from their ref 21, where it shares a lot of simularity, 
                #           no real basis for the hidden units though, as the arch in ref 21 is very different
                #           do grid search then 128, 256, 512, 1024, 2048, 4096(overkill)
                #       dropout i just used the same constant
                # encoder out nodes: refers to the output nodes of the encoder layer
                #           the encoder layer is applied on each of the [time_steps] outputs of the rnn layer
                #           effectively transforms from the rnn output from (time_steps, rnn_hidden[-1]) to (time_steps, enc_out_nodes)
                encoder_out_nodes = 16,
                fnn_layers=1, fnn_hidden=(1024,), fnn_dropout=(0.25,),
                output_shape=5):
        
        super(CRNN2, self).__init__()

        self.time_steps = input_shape[1]

        # =============================================================================

        self.cnn_blocks = nn.Sequential()
        in_channels = input_shape[2]
        freq = input_shape[0]

        for i in range(cnn_layers):
            self.cnn_blocks.add_module(f'cnn_{i}', nn.Conv2d(in_channels, filters[i], kernel_size=kernels[i], padding="same"))
            self.cnn_blocks.add_module(f'relu_{i}', nn.ReLU())
            self.cnn_blocks.add_module(f'pool_{i}', nn.MaxPool2d(kernel_size=(pool_sizes[i], 1), stride=(pool_sizes[i], 1)))
            self.cnn_blocks.add_module(f'batchnorm_{i}', nn.BatchNorm2d(filters[i]))
            self.cnn_blocks.add_module(f'dropout_{i}', nn.Dropout(cnn_dropout[i]))

            in_channels = filters[i]
            freq = (freq - (pool_sizes[i] - 1) -1)//pool_sizes[i] + 1

        self.freq = freq
        self.channels = filters[-1]

        # ==============================================================================

        self.rnn_input_size = self.freq * self.channels

        self.rnn = nn.Sequential()
        input_size = self.rnn_input_size
        for i in range(rnn_layers):
            self.rnn.add_module(f'gru_{i}', nn.GRU(input_size, rnn_hidden[i], batch_first=True, dropout=rnn_dropout[i] if i < rnn_layers - 1 else 0))
            input_size = rnn_hidden[i]

        # ===============================================================================
        # current shape: (time_steps, h_out)

        input_size = (self.time_steps, rnn_hidden[-1])

        self.encoder_layer = nn.Linear(input_size[1], encoder_out_nodes)

        input_size = self.time_steps*encoder_out_nodes

        self.fnn_blocks = nn.Sequential()
        for i in range(fnn_layers):
            self.fnn_blocks.add_module(f'fc_{i}', nn.Linear(input_size, fnn_hidden[i]))
            self.fnn_blocks.add_module(f'relu_{i}', nn.ReLU())
            self.fnn_blocks.add_module(f'batchnorm_{i}', nn.BatchNorm1d(fnn_hidden[i]))
            self.fnn_blocks.add_module(f'dropout_{i}', nn.Dropout(fnn_dropout[i]))
            input_size = fnn_hidden[i]

        self.output_layer = nn.Linear(input_size, output_shape)

    def forward(self, x): 

        x = x.permute(0, 3, 1, 2) # (B, C_in, F, T)
        
          
        # print("cnn input: ", x.shape)
        for name, layer in self.cnn_blocks.named_children(): # outs = # (B, C_out, F', T)
            x = layer(x)
            # print(f"{name}: ", x.shape)
        
        # print("after conv: ", x.shape)

        B, C_out, F_prime, T = x.size()
        x = x.permute(0, 3, 1, 2)  # (B, T, C_out, F')

        x = x.reshape(B, T, F_prime * C_out)  # (B, T, F' * C_out)

        # print("for rnn: ", x.shape)

        for name, layer in self.rnn.named_children(): # outs: (B, T, H_out)
            x, _ = layer(x)
            # print(f"{name} ", x.shape)

        B, T, H_out = x.size()
        # print("after rnn: ", x.shape)


        # =====================================

        x = self.encoder_layer(x)

        B, enc_nodes, H_out = x.size()
        print(x.size())


        x = x.reshape(B, enc_nodes*H_out)

        # print("for fnn: ", x.shape)

        x = self.fnn_blocks(x)

        # print("after fnn: ", x.shape)

        x = self.output_layer(x)

        # print("outs: ", x.shape)

        return x

In [14]:
# taking 40 bands, arbitary time steps
test = CRNN2(input_shape=(40, 128, 1), fnn_layers=2, fnn_hidden=(1024,1024), fnn_dropout=(0.25,0.25), output_shape=5)
summary(test, input_size=(64, 40, 128, 1), col_names=['input_size', 'output_size', 'num_params'])

# NOTE: main contributor to the parameter number is: concatenation of outputs from gru and feeding to linear layer
# need reduction methodology here

# NOTE: That ^ has been reduced as follows: use an encoder layer that goes over each time_step's the gru outputs
# to reduce the hidden units parameter to encoder_units

torch.Size([64, 128, 16])


Layer (type:depth-idx)                   Input Shape               Output Shape              Param #
CRNN2                                    [64, 40, 128, 1]          [64, 5]                   --
├─Sequential: 1-1                        --                        --                        --
│    └─Conv2d: 2-1                       [64, 1, 40, 128]          [64, 96, 40, 128]         2,496
│    └─ReLU: 2-2                         [64, 96, 40, 128]         [64, 96, 40, 128]         --
│    └─MaxPool2d: 2-3                    [64, 96, 40, 128]         [64, 96, 20, 128]         --
│    └─BatchNorm2d: 2-4                  [64, 96, 20, 128]         [64, 96, 20, 128]         192
│    └─Dropout: 2-5                      [64, 96, 20, 128]         [64, 96, 20, 128]         --
│    └─Conv2d: 2-6                       [64, 96, 20, 128]         [64, 96, 20, 128]         230,496
│    └─ReLU: 2-7                         [64, 96, 20, 128]         [64, 96, 20, 128]         --
│    └─MaxPool2d: 2-8     