In [1]:
import torch
import torch.utils.data as data
import torch.optim as optim
import torch.nn.functional as F
import torch.nn as nn
import os
import numpy as np 
import torchaudio

import mapping
import Decoder.Greedy as Greedy
import dataPreprocessing.preprocessing as data_preprocess


## Build the text and int Mapping 

In [2]:
class TextMapping():
    
    def __init__(self):
        self.chars = ['\'', ' ',
             'a', 'b', 
             'c', 'd', 
             'e', 'f', 
             'g', 'h', 
             'i', 'j', 
             'k', 'l', 
             'm', 'n', 
             'o', 'p', 
             'q', 'r', 
             's', 't', 
             'u', 'v', 
             'w', 'x', 
             'y', 'z']
        self.index = [i for i in range(len(self.chars))]
        self.char2Int = {}
        self.int2Char = {}
        for idx in range(len(self.index)):
            self.char2Int[self.chars[idx]] = idx
            self.int2Char[idx] = self.chars[idx]
            
        
    def convert_TextToInt(self, text_sequence):
        
        text_sequence = text_sequence.lower()
        sequence_index = []
        for char in text_sequence:
            char_idx = self.char2Int[char]
            sequence_index.append(char_idx)
            
        return sequence_index
    
    def convert_IntToText(self, label_indexes):
        
        sequence_text = []
        for index in label_indexes:
            sequence_text.append(self.int2Char[index])
            
        return ''.join(sequence_text)
    
    
text_mapping = TextMapping()
text = "This is a test demo for text Mapping"
indexes = text_mapping.convert_TextToInt(text)
print(indexes)
conv_to_text = text_mapping.convert_IntToText(indexes)
print(conv_to_text)

            
        


[21, 9, 10, 20, 1, 10, 20, 1, 2, 1, 21, 6, 20, 21, 1, 5, 6, 14, 16, 1, 7, 16, 19, 1, 21, 6, 25, 21, 1, 14, 2, 17, 17, 10, 15, 8]
this is a test demo for text mapping


## Load Dataset 

In [3]:

#using librispeech dataset first

train_set_API = "train-clean-100"

valid_set_API = "test-clean"

train_set = torchaudio.datasets.LIBRISPEECH("../Data", url = train_set_API, download = True)

valid_set = torchaudio.datasets.LIBRISPEECH("../Data", url = valid_set_API, download = True)

num_workers = 8


#batch_size = 20
batch_size = 12



trainLoader = data.DataLoader(dataset = train_set, batch_size = batch_size, shuffle = True,
                              collate_fn = lambda data : data_preprocess.data_preprocessing(data),
                             num_workers = num_workers)

validLoader = data.DataLoader(dataset = valid_set, batch_size = batch_size, shuffle = True, 
                             collate_fn = lambda data: data_preprocess.valid_preprocessing(data), num_workers = num_workers)



## Design the model for ASR system

In [23]:
class LayerNormalization(nn.Module):
    
    def __init__(self, num_features):
        super(LayerNormalization, self).__init__()
        self.norm = nn.LayerNorm(num_features)
        
    def forward(self, x):
        x = x.transpose(2,3).contiguous()
        x = self.norm(x)
        return x.transpose(2,3).contiguous()
    

In [24]:
#Define Resnet and LSTM Model 
class ResNet(nn.Module):
    
    def __init__(self, input_channel, output_channel, kernel, stride, num_features,dropout):
        
        super(ResNet,self).__init__()
        
        self.conv_1 = nn.Conv2d(input_channel, output_channel, kernel, stride, padding = kernel // 2)
        self.conv_2 = nn.Conv2d(output_channel, output_channel, kernel, stride, padding = kernel // 2)
        #apply layer normolization to input
        self.layerNorm_1 = LayerNormalization(num_features)
        self.layerNorm_2 = LayerNormalization(num_features)
        self.dropout_1 = nn.Dropout(dropout)
        self.dropout_2 = nn.Dropout(dropout)
        
    def forward(self, x):
        
        residual = x 
        x = self.layerNorm_1(x)
        x = F.gelu(x) # perform better than Relu
        x = self.dropout_1(x)
        print(x.shape)
        print(x)
        x = self.conv_1(x)
        
        x = self.layerNorm_2(x)
        x = F.gelu(x)
        x = self.dropout_2(x)
        x = self.conv_2(x)
        
        #concat the residual to the output for skip connection
        output = x + residual
        return output
    
        
    
    
class LSTM(nn.Module):
    
    def __init__(self, input_size, hidden_size, num_layers, dropout, num_classes = 29, batch_first = True):
        super(LSTM, self).__init__()
        
        self.input_size = input_size
        
        self.hidden_size = hidden_size
        self.num_layers = num_layers
        self.lstm = nn.LSTM(input_size, hidden_size, num_layers, batch_first = True, 
                           bidirectional = True)
        
        self.norm = nn.LayerNorm(input_size)
        self.dropout = nn.Dropout(dropout)
        
    def forward(self, x):
        #print("here")
        
        
        print("rnn dim :", self.input_size)
        h0 = torch.zeros(self.num_layers * 2, x.size(0), self.hidden_size).to(device)
        c0 = torch.zeros(self.num_layers * 2, x.size(0), self.hidden_size).to(device)
        print("the shape:",x.shape)
        x = self.norm(x)
        print("shape after norm", x.shape)
        x = F.gelu(x)
        
        output, _ = self.lstm(x, (h0, c0))
        output = self.dropout(output)
        print("output shape:", output.shape)
        
        return output

In [25]:
#Build the whole ASR model
    
class ASRModel(nn.Module):
    
    def __init__(self, num_resnet, num_lstm, num_class, num_features, input_size, stride, num_kernels,dropout = 0.1):
        super(ASRModel, self).__init__()
        num_features = num_features // 2
        
        self.conv_1 = nn.Conv2d(1, 32, 3, stride = stride, padding = 1)
        
        #apply residual nets 
        self.residual_nets = nn.Sequential(*[ResNet(32, 32, kernel = 3, 
                                                   stride = 1, dropout = dropout, num_features = num_features)
                                             for num in range(num_resnet)
        ])
        self.fc = nn.Linear(num_features * num_kernels, input_size)
        
        self.lstm_nets = nn.Sequential(*[LSTM(input_size = input_size if num == 0 else input_size * 2 , hidden_size = input_size,num_layers =1, batch_first = num == 0,dropout = dropout) for num in range(num_lstm)
        ])
        
        self.activation = nn.GELU()
        self.Dropout = nn.Dropout(dropout)
        self.lstm_to_linear = nn.Linear(input_size * 2, input_size)
        self.classifier = nn.Linear(input_size, num_class)
        
    def forward(self, x):
        
        print(x.shape)
        x = self.conv_1(x)
        x = self.residual_nets(x)
        x = x.view(x.size(0), x.size(1) * x.size(2), x.size(3)) # get the size of (batch, features, time)
        x = x.transpose(1, 2) # transpose to (batch, time, feature) for lstm 
        x = self.fc(x)
        print("shape:", x.shape)
        x = self.lstm_nets(x)
        x = self.lstm_to_linear(x)
        x = self.activation(x)
        x = self.Dropout(x)
        x = self.classifier(x)
        
        return x
        
        
        
        
    
    
        
    
    
        
    
        


## Train the Model and Evaluate the trained Model

In [26]:

cuda_available = torch.cuda.is_available()
device = torch.device("cuda" if cuda_available else "cpu")
print(device)
ASR_model = ASRModel(num_resnet = 3, num_lstm = 3, input_size = 512, num_class = 29, num_features = 128, stride = 2, num_kernels = 32, dropout = 0.1).to(device)
ASR_model.to(device)

loss_function = nn.CTCLoss(blank = 28).to(device)

optimizer = optim.AdamW(ASR_model.parameters(), lr = 0.001)

lambda_1 = lambda epoch: 0.7 ** epoch

scheduler = torch.optim.lr_scheduler.LambdaLR(optimizer, lr_lambda = lambda_1)

cuda


In [27]:
def train(model, train_loader, loss_func, optimizer, scheduler, epochs, device):
    
    model.train()
    
    
    for epoch in range(epochs):
        
        print("current epoch: ", epoch)
        total_loss = 0
        num_batch = 0
        
        for idx, data in enumerate(train_loader):

            optimizer.zero_grad()

            melspecs, labels, melspec_length, label_length = data

            melspecs, labels = melspecs.to(device), labels.to(device)

            print("melsepc shape", melspecs.shape)

            print("labels shape", labels.shape)
            print("melspecs data : ", melspecs)

            output_matrix = model(melspecs) #shape is [batch, time, n_classes]
            print("output shape:", output_matrix.shape)
            print("output data :", output_matrix)

            output_matrix = F.log_softmax(output, dim = 2)
            print("output : ", output_matrix)



            output_matrix = output_matrix.transpose(0, 1)
            seq_len = output_matrix.shape[0]

            loss = loss_func(output_matrix, labels, melspec_length, label_length)
            
            loss += loss.item()
            num_batch += 1

            loss.backward()

            optimizer.step()

            scheduler.step()
            
        avg_loss = total_loss/num_batch
        print(f"EPOCH : {epoch} | Avg Loss: {avg_loss}")
        
#         with open ('loss_info.txt', 'a') as f:
#             f.write(f"EPOCH : {epoch} | Avg Loss: {avg_loss}")
            
        #call the evalutaion function to run valid set
        
        evaluation(model, valid_loader, loss_func, device, epoch)
        
            
        
        
        
        
        
        
        
        
        

In [28]:
def evaluation(model, valid_loader, loss_func, device, epoch_index):
    
    model.eval()
    total_loss = 0
    num_batch = 0
    WER, CER = [], []
    
    with torch.no_grad():
        
        for idx, data in enumerate(valid_loader):

            melspecs, labels, melspec_length, label_length = data

            melspecs, labels = melspecs.to(device), labels.to(device)
            print("melsepc shape", melspecs.shape)
            output_matrix = model(melspecs) #shape is [batch, time, n_classes]
            print("output shape:", output_matrix.shape)
            print("output data :", output_matrix)
            output_matrix = F.log_softmax(output_matrix, dim = 2)
            loss = loss_func(output_matrix, labels, melspec_length, label_length)
            
            
            output_matrix = output_matrix.transpose(0,1)
            pred_decoded, label_decoded = Greedy.SequenceDecoder(output_matrix, labels, label_length)
            print("prediction:", pred_decoded)
            print("labels:", label_decoded)
            
            total_loss += loss.item()
            
            num_batch += 1
            
            for i in range(len(pred_decoded)):
                WER.append(Greedy.calc_word_error_rate(pred_decoded[i], label_decoded[i]))
                CER.append(Greedy.calc_char_error_rate(pred_decoded[i], label_decoded[i]))
                
        WER_avg.append(sum(WER)/ len(WER)) #get average WER
        CER_avg.append(sum(CER)/ len(CER)) # get average CER
        valid_loss = total_loss / num_batch
        loss_eval.append(valid_loss)
        
        
        
#         with open('evaluation.txt', 'a') as f:
#             f.write(f"Epoch : {epoch_index} | WER: {sum(WER)/ len(WER)} | CER: {sum(CER) / len(CER)} | valid Loss: {valid_loss} ")
            
                
        
            
            

            
    

In [29]:
WER_avg = []
CER_avg = []
loss_eval = []
loss_train = []

lr = 0.001
epochs = 15

train(ASR_model, trainLoader, loss_function, optimizer, scheduler, 1, device)


current epoch:  0


  "At least one mel filterbank has all zero values. "
  "At least one mel filterbank has all zero values. "
  "At least one mel filterbank has all zero values. "
  "At least one mel filterbank has all zero values. "
  "At least one mel filterbank has all zero values. "
  "At least one mel filterbank has all zero values. "
  "At least one mel filterbank has all zero values. "
  "At least one mel filterbank has all zero values. "


melsepc shape torch.Size([12, 1, 128, 1253])
labels shape torch.Size([12, 256])
melspecs data :  tensor([[[[0.0000e+00, 0.0000e+00, 0.0000e+00,  ..., 0.0000e+00,
           0.0000e+00, 0.0000e+00],
          [3.3428e-03, 1.4761e-03, 1.2705e-03,  ..., 0.0000e+00,
           0.0000e+00, 0.0000e+00],
          [1.7999e-02, 7.9475e-03, 6.8408e-03,  ..., 0.0000e+00,
           0.0000e+00, 0.0000e+00],
          ...,
          [2.3657e-08, 5.3112e-07, 1.3724e-07,  ..., 0.0000e+00,
           0.0000e+00, 0.0000e+00],
          [4.9053e-08, 2.3955e-07, 1.8263e-07,  ..., 0.0000e+00,
           0.0000e+00, 0.0000e+00],
          [1.3505e-07, 5.8300e-07, 1.3230e-07,  ..., 0.0000e+00,
           0.0000e+00, 0.0000e+00]]],


        [[[0.0000e+00, 0.0000e+00, 0.0000e+00,  ..., 0.0000e+00,
           0.0000e+00, 0.0000e+00],
          [4.9798e-06, 3.5619e-06, 7.0976e-06,  ..., 0.0000e+00,
           0.0000e+00, 0.0000e+00],
          [2.6813e-05, 1.9178e-05, 3.8215e-05,  ..., 0.0000e+00,
           

tensor([[[[-1.6613e-01,  6.7429e+00,  8.2372e+00,  ...,  0.0000e+00,
            0.0000e+00,  0.0000e+00],
          [-5.4210e-03, -1.9336e-02, -6.9564e-03,  ...,  0.0000e+00,
            0.0000e+00,  0.0000e+00],
          [ 7.8860e+00,  0.0000e+00,  9.5595e-02,  ...,  0.0000e+00,
            0.0000e+00,  0.0000e+00],
          ...,
          [ 1.5439e-02, -1.1005e-02, -2.3862e-02,  ...,  0.0000e+00,
            0.0000e+00,  0.0000e+00],
          [-4.5629e-02, -3.3764e-02, -6.3657e-02,  ...,  0.0000e+00,
            0.0000e+00,  0.0000e+00],
          [-5.6261e-02, -3.8596e-02, -6.9732e-02,  ...,  0.0000e+00,
            0.0000e+00,  0.0000e+00]],

         [[-1.6483e-01, -4.0455e-11, -2.1322e-02,  ...,  0.0000e+00,
            0.0000e+00,  0.0000e+00],
          [-2.3745e-12, -7.5120e-04, -4.3794e-05,  ...,  0.0000e+00,
            0.0000e+00,  0.0000e+00],
          [ 1.2694e+00,  9.2758e-01, -5.9347e-04,  ...,  0.0000e+00,
            0.0000e+00,  0.0000e+00],
          ...,
     

NameError: name 'output' is not defined