In [1]:
import torch.nn as nn
import torch.nn.functional as F
import torchaudio
import torch

In [35]:
class Wav2vec(nn.Module):

    def __init__(self):
        super(Wav2vec, self).__init__()
        # 1 input image channel, 6 output channels, 3x3 square convolution
        # kernel
        activation = nn.ReLU()
        dropout = 0.0
        self.encoder = Encoder()
        self.context = Context(10,10,3)

    def forward(self, x):
        z = self.encoder(x)
        #c = self.context(x)
        # x = x.view(-1, self.num_flat_features(x))
        return z#, c
    
class Encoder(nn.Module):
    def __init__(self):
        super(Encoder, self).__init__()
        dropout=0.0
        self.in_c = 10
        activation = nn.ReLU()
        self.encoder = nn.Sequential(nn.Conv1d(in_channels=1, out_channels=self.in_c, kernel_size=10, stride=5),
                                     nn.Dropout(p=dropout),
                                     nn.GroupNorm(1, self.in_c),  # Affine, what to do?
                                     activation,
                                     # 2nd layer
                                     nn.Conv1d(in_channels=self.in_c, out_channels=self.in_c, kernel_size=8, stride=4),
                                     nn.Dropout(p=dropout),
                                     ## See norm_block - FB_repo
                                     nn.GroupNorm(1, self.in_c),  # Affine, what to do?
                                     activation,
                                     # 3rd layer
                                     nn.Conv1d(in_channels=self.in_c, out_channels=self.in_c, kernel_size=4, stride=2),
                                     nn.Dropout(p=dropout),
                                     nn.GroupNorm(1, self.in_c),  # Affine, what to do?
                                     activation,
                                     # Fourth layer
                                     nn.Conv1d(in_channels=self.in_c, out_channels=self.in_c, kernel_size=4, stride=2),
                                     nn.Dropout(p=dropout),
                                     nn.GroupNorm(1, self.in_c),  # Affine, what to do?
                                     activation,
                                     # Fifth layer
                                     nn.Conv1d(in_channels=self.in_c, out_channels=self.in_c, kernel_size=4, stride=2),
                                     nn.Dropout(p=dropout),
                                     nn.GroupNorm(1, self.in_c),  # Affine, what to do?
                                     activation)
    def log_compression(self, x):
        # https://www.edn.com/log-1-x-compression/
        x = x.abs()
        x = x + 1
        return x.log()

    def forward(self, x):
        x = self.encoder(x)
        x = self.log_compression(x)
        # TODO implement skipped connections?
        return x

class Context(nn.Module):
    def __init__(self, n_in, n_out, k,dropout, activation, layers=5):
        super(Context, self).__init__()

        def conv_block(n_in, n_out, k, dropout, activation):
            return nn.Sequential(
                nn.Conv1d(n_in, n_out, k),
                nn.Dropout(p=dropout),
                nn.GroupNorm(1, n_out),
                activation
            )

        self.conv = nn.ModuleList()
        for i in range(0, layers):
            self.conv.append(conv_block(n_in, n_out, k, dropout, activation))
        self.conv = nn.Sequential(*self.conv)

    def forward(self, x):
        return self.conv(x)


In [36]:
waveform, sample_rate = torchaudio.load("wav_16k_example.wav")

In [37]:
torch.unsqueeze(waveform, 1).shape

torch.Size([1, 1, 31440])

In [38]:
m = Wav2vec()
m(waveform.unsqueeze(1))

TypeError: __init__() missing 4 required positional arguments: 'n_in', 'n_out', 'dropout', and 'activation'

In [3]:
a = nn.ModuleList()

In [6]:
a.append(nn.Conv1d(5,5,5))

ModuleList(
  (0): Conv1d(5, 5, kernel_size=(5,), stride=(1,))
)

In [8]:
nn.Sequential(*a)

Sequential(
  (0): Conv1d(5, 5, kernel_size=(5,), stride=(1,))
)

In [17]:
def conv_block(n_in, n_out, k, dropout, activation): 
            return nn.Sequential(
                nn.Conv1d(n_in, n_out, k),
                nn.Dropout(p=dropout),
                nn.GroupNorm(1, n_out), 
                activation
            )

In [20]:
layers = n_in = n_out = k =5
dropout = 0.5
activation = nn.ReLU()
conv = nn.ModuleList()
for i in range(0, layers): 
    conv.append(conv_block(n_in, n_out, k, dropout, activation))
conv = nn.Sequential(*conv)
            

In [21]:
conv

Sequential(
  (0): Sequential(
    (0): Conv1d(5, 5, kernel_size=(5,), stride=(1,))
    (1): Dropout(p=0.5, inplace=False)
    (2): GroupNorm(1, 5, eps=1e-05, affine=True)
    (3): ReLU()
  )
  (1): Sequential(
    (0): Conv1d(5, 5, kernel_size=(5,), stride=(1,))
    (1): Dropout(p=0.5, inplace=False)
    (2): GroupNorm(1, 5, eps=1e-05, affine=True)
    (3): ReLU()
  )
  (2): Sequential(
    (0): Conv1d(5, 5, kernel_size=(5,), stride=(1,))
    (1): Dropout(p=0.5, inplace=False)
    (2): GroupNorm(1, 5, eps=1e-05, affine=True)
    (3): ReLU()
  )
  (3): Sequential(
    (0): Conv1d(5, 5, kernel_size=(5,), stride=(1,))
    (1): Dropout(p=0.5, inplace=False)
    (2): GroupNorm(1, 5, eps=1e-05, affine=True)
    (3): ReLU()
  )
  (4): Sequential(
    (0): Conv1d(5, 5, kernel_size=(5,), stride=(1,))
    (1): Dropout(p=0.5, inplace=False)
    (2): GroupNorm(1, 5, eps=1e-05, affine=True)
    (3): ReLU()
  )
)