In [1]:
import torch
import torch.nn as nn
import torch.optim as optim
import torch.nn.functional as F
from torch.autograd import Variable
import math

"""Time Delay Neural Network as mentioned in the 1989 paper by Waibel et al. (Hinton) and the 2015 paper by Peddinti et al. (Povey)"""

class TDNN(nn.Module):
    def __init__(self, context, input_dim, output_dim, full_context = True):
        """
        Definition of context is the same as the way it's defined in the Peddinti paper. It's a list of integers, eg: [-2,2]
        By deault, full context is chosen, which means: [-2,2] will be expanded to [-2,-1,0,1,2] i.e. range(-2,3)
        """
        super(TDNN,self).__init__()
        self.input_dim = input_dim
        self.output_dim = output_dim
        self.check_valid_context(context)
        self.kernel_width, context = self.get_kernel_width(context,full_context)
        self.register_buffer('context',torch.LongTensor(context))
        self.full_context = full_context
        stdv = 1./math.sqrt(input_dim)
        self.kernel = nn.Parameter(torch.Tensor(output_dim, input_dim, self.kernel_width).normal_(0,stdv))
        self.bias = nn.Parameter(torch.Tensor(output_dim).normal_(0,stdv))
        # self.cuda_flag = False

    def forward(self,x):
        """
        x is one batch of data
        x.size(): [batch_size, sequence_length, input_dim]
        sequence length is the length of the input spectral data (number of frames) or if already passed through the convolutional network, it's the number of learned features
        output size: [batch_size, output_dim, len(valid_steps)]
        """
        # Check if parameters are cuda type and change context
        # if type(self.bias.data) == torch.cuda.FloatTensor and self.cuda_flag == False:
        #     self.context = self.context.cuda()
        #     self.cuda_flag = True
        conv_out = self.special_convolution(x, self.kernel, self.context, self.bias)
        return F.relu(conv_out)

    def special_convolution(self, x, kernel, context, bias):
        """
        This function performs the weight multiplication given an arbitrary context. Cannot directly use convolution because in case of only particular frames of context,
        one needs to select only those frames and perform a convolution across all batch items and all output dimensions of the kernel.
        """
        input_size = x.size()
        assert len(input_size) == 3, 'Input tensor dimensionality is incorrect. Should be a 3D tensor'
        [batch_size, input_sequence_length, input_dim] = input_size
        x = x.transpose(1,2).contiguous()

        # Allocate memory for output
        valid_steps = self.get_valid_steps(self.context, input_sequence_length)
        xs = Variable(self.bias.data.new(batch_size, kernel.size()[0], len(valid_steps)))

        # Perform the convolution with relevant input frames
        for c, i in enumerate(valid_steps):
            features = torch.index_select(x, 2, context+i)
            xs[:,:,c] = F.conv1d(features, kernel, bias = bias)[:,:,0]
        return xs

    @staticmethod
    def check_valid_context(context):
        # here context is still a list
        assert context[0] <= context[-1], 'Input tensor dimensionality is incorrect. Should be a 3D tensor'

    @staticmethod
    def get_kernel_width(context, full_context):
        if full_context:
            context = range(context[0],context[-1]+1)
        return len(context), context

    @staticmethod
    def get_valid_steps(context, input_sequence_length):
        start = 0 if context[0] >= 0 else -1*context[0]
        end = input_sequence_length if context[-1] <= 0 else input_sequence_length - context[-1]
        return range(start, end)

In [51]:
from pytorch_tdnn.tdnn import TDNN as TDNNLayer

tdnn1 = TDNNLayer(
  16, # input dim
  8, # output dim
  [-1,0,1], # context
)

tdnn2 = TDNNLayer(
  8, # input dim
  3, # output dim
  [-2,0,2], # context
)

layer3 = nn.Conv1d(3, 50, 1)
sigmoid = nn.Sigmoid()

import random
import numpy as np
input1 = []
for i in range(50):
    curr = []
    for j in range(16):
        currow = []
        for j in range(15):
            currow.append(random.uniform(0, 1))
        curr.append(currow)
    input1.append(curr)
# input1 = np.array(input1)
# input1 = np.zeros((16, 16, 16))
input1 = torch.Tensor(input1)
print(input1.shape)

output1 = tdnn1(input1)
print(output1.shape)
output2 = tdnn2(output1)
print(output2.shape)

res = layer3(output2)
res = sigmoid(res)
print(res.shape)
#flatten layer
print(res.min (), res.max ())


torch.Size([50, 16, 15])
torch.Size([50, 8, 15])
torch.Size([50, 3, 15])
torch.Size([50, 50, 15])
tensor(0.3142, grad_fn=<MinBackward1>) tensor(0.6859, grad_fn=<MaxBackward1>)


In [46]:
#layer 1
context = [0, 2]
input_dim = 16
output_dim = 8
net = TDNN(context, input_dim, output_dim, full_context=True)

In [11]:
import random
import numpy as np
input1 = []
for i in range(50):
    curr = []
    for j in range(15):
        currow = []
        for j in range(16):
            currow.append(random.uniform(0, 1))
        curr.append(currow)
    input1.append(curr)
# input1 = np.array(input1)
# input1 = np.zeros((16, 16, 16))
input1 = torch.Tensor(input1)
print(input1.shape)
output = net(input1)
print(output.shape)

torch.Size([50, 15, 16])
torch.Size([50, 8, 13])


In [13]:
#layer 2
context = [0,5]
input_dim = 8
output_dim = 3
net = TDNN(context, input_dim, output_dim, full_context=True)

In [14]:
output2 = net(output)
print(output2.shape)

RuntimeError: Given groups=1, weight of size [3, 8, 6], expected input[50, 13, 6] to have 8 channels, but got 13 channels instead

In [129]:
# print(output2)

In [139]:
m = nn.Sigmoid()
res = m(output2)
def sigmoid(Module):
    def __init__(self, M):
        # M is the dimension of input feature
        super(network, self).__init__()
        self.layer1 = nn.Linear(M, 3)

    def forward(self,x):
        return F.sigmoid(self.out(self.layer1(x)))

In [140]:
print(res)

tensor([[[0.5000, 0.9683, 0.6393, 0.5000, 0.5000],
         [0.5790, 0.5000, 0.9997, 0.5000, 0.7838],
         [0.9749, 0.5000, 0.9076, 0.9925, 0.5000]],

        [[0.5000, 0.9349, 0.7289, 0.5000, 0.5546],
         [0.6372, 0.5000, 0.9644, 0.8038, 0.6664],
         [0.9940, 0.5000, 0.7488, 0.9680, 0.5000]],

        [[0.5000, 0.5000, 0.7320, 0.5000, 0.6013],
         [0.7973, 0.9545, 0.6180, 0.5000, 0.5970],
         [0.8972, 0.5000, 0.6526, 0.9733, 0.5000]],

        [[0.5000, 0.8734, 0.7013, 0.5000, 0.9024],
         [0.6133, 0.5000, 0.9815, 0.5269, 0.6266],
         [0.9104, 0.5000, 0.8537, 0.9972, 0.7114]],

        [[0.5000, 0.5000, 0.5000, 0.5000, 0.5000],
         [0.5931, 0.5000, 0.9670, 0.5000, 0.6617],
         [0.9873, 0.5000, 0.5867, 0.9951, 0.5000]],

        [[0.5000, 0.5000, 0.7412, 0.5000, 0.5000],
         [0.6234, 0.5000, 0.8017, 0.5000, 0.5339],
         [0.5178, 0.5000, 0.5000, 0.9408, 0.5000]],

        [[0.5000, 0.5569, 0.5940, 0.5000, 0.5000],
         [0.7058, 0