# Frame-Level Speech Recognition

# Libraries

In [None]:
!pip install torchsummaryX wandb --quiet

[K     |████████████████████████████████| 1.8 MB 14.9 MB/s 
[K     |████████████████████████████████| 162 kB 73.3 MB/s 
[K     |████████████████████████████████| 181 kB 74.8 MB/s 
[K     |████████████████████████████████| 63 kB 1.8 MB/s 
[K     |████████████████████████████████| 158 kB 90.2 MB/s 
[K     |████████████████████████████████| 157 kB 90.8 MB/s 
[K     |████████████████████████████████| 157 kB 87.5 MB/s 
[K     |████████████████████████████████| 157 kB 88.0 MB/s 
[K     |████████████████████████████████| 157 kB 87.3 MB/s 
[K     |████████████████████████████████| 157 kB 81.1 MB/s 
[K     |████████████████████████████████| 157 kB 75.2 MB/s 
[K     |████████████████████████████████| 157 kB 59.1 MB/s 
[K     |████████████████████████████████| 156 kB 88.8 MB/s 
[?25h  Building wheel for pathtools (setup.py) ... [?25l[?25hdone


In [None]:
import torch
import numpy as np
from torchsummaryX import summary
import sklearn
import gc
import zipfile
import pandas as pd
from tqdm import tqdm
import os
import datetime
import wandb
import sklearn.metrics
# Prefer the GPU when PyTorch can see one; otherwise run on the CPU.
if torch.cuda.is_available():
    device = 'cuda'
else:
    device = 'cpu'
print("Device: ", device)

Device:  cuda


In [None]:

# Mount Google Drive at /content/drive (Colab-only helper).
from google.colab import drive
drive.mount('/content/drive')

Mounted at /content/drive


In [None]:
### PHONEME LIST
# The position of a symbol in this list is its integer class id
# ('SIL' -> 0, ..., 'ZH' -> 39, followed by the two sequence markers).
PHONEMES = (
    "SIL AA AE AH AO AW AY "
    "B CH D DH EH ER EY "
    "F G HH IH IY JH K "
    "L M N NG OW OY P "
    "R S SH T TH UH UW "
    "V W Y Z ZH <sos> <eos>"
).split()

# Kaggle

This section contains code that installs Kaggle's API and writes the credentials file it needs.

In [None]:
!pip install --upgrade --force-reinstall --no-deps kaggle==1.5.8
!mkdir /root/.kaggle

with open("/root/.kaggle/kaggle.json", "w+") as f:
    f.write('{"username":"jingrugongruby","key":""}') 
    # Put your kaggle username & key here

!chmod 600 /root/.kaggle/kaggle.json

Looking in indexes: https://pypi.org/simple, https://us-python.pkg.dev/colab-wheels/public/simple/
Collecting kaggle==1.5.8
  Downloading kaggle-1.5.8.tar.gz (59 kB)
[K     |████████████████████████████████| 59 kB 4.8 MB/s 
[?25hBuilding wheels for collected packages: kaggle
  Building wheel for kaggle (setup.py) ... [?25l[?25hdone
  Created wheel for kaggle: filename=kaggle-1.5.8-py3-none-any.whl size=73275 sha256=8a2c69ef7229757100c57327ca5fc64a1e23e02b206e9a29f1d674c85f393f56
  Stored in directory: /root/.cache/pip/wheels/de/f7/d8/c3902cacb7e62cb611b1ad343d7cc07f42f7eb76ae3a52f3d1
Successfully built kaggle
Installing collected packages: kaggle
  Attempting uninstall: kaggle
    Found existing installation: kaggle 1.5.12
    Uninstalling kaggle-1.5.12:
      Successfully uninstalled kaggle-1.5.12
Successfully installed kaggle-1.5.8


In [None]:
# commands to download data from kaggle

!kaggle competitions download -c 11-785-f22-hw1p2
!mkdir '/content/data'

!unzip -qo '11-785-f22-hw1p2.zip' -d '/content/data'

Downloading 11-785-f22-hw1p2.zip to /content
 99% 2.12G/2.13G [00:08<00:00, 253MB/s]
100% 2.13G/2.13G [00:08<00:00, 263MB/s]


# Dataset

In [None]:
# Dataset class to load train and validation data

class AudioDataset(torch.utils.data.Dataset):
    """Frame-level MFCC dataset with phoneme labels for the train / dev partitions.

    Each utterance's MFCC matrix is mean-normalized per coefficient, all
    utterances are concatenated along the time axis, and the result is
    zero-padded with `context` frames on both ends. Item `i` is the flattened
    window of `2 * context + 1` frames centred on frame `i`, together with the
    integer label of frame `i`.

    Args:
        data_path: Root directory that contains the partition folders.
        context: Number of neighbouring frames taken on each side of a frame.
        offset: Stored on the instance; not otherwise used by this class.
        partition: "train" selects `train-clean-100`; any other value selects `dev-clean`.
        limit: When positive, only the first `limit` files (in sorted order)
            are loaded; any other value (default -1) loads every file.

    Raises:
        ValueError: If a transcript contains a symbol missing from `self.phonemes`.
    """

    def __init__(self, data_path, context, offset=0, partition= "train", limit=-1):

        self.context = context
        self.offset = offset
        self.data_path = data_path

        if partition == "train":
            self.mfcc_dir = "/train-clean-100/mfcc"
            self.transcript_dir = "/train-clean-100/transcript"
        else:
            self.mfcc_dir = "/dev-clean/mfcc"
            self.transcript_dir = "/dev-clean/transcript"

        # Sorted listings keep the i-th mfcc file paired with the i-th transcript file.
        mfcc_names = sorted(os.listdir(self.data_path + self.mfcc_dir))
        transcript_names = sorted(os.listdir(self.data_path + self.transcript_dir))

        # Making sure that we have the same no. of mfcc and transcripts
        assert len(mfcc_names) == len(transcript_names)

        # Optional subset for quick experiments.
        if limit is not None and limit > 0:
            mfcc_names = mfcc_names[:limit]
            transcript_names = transcript_names[:limit]

        # These are the available phonemes in the transcript; the position in
        # this list is the integer class id (0 means 'SIL').
        self.phonemes = [
            'SIL',   'AA',    'AE',    'AH',    'AO',    'AW',    'AY',
            'B',     'CH',    'D',     'DH',    'EH',    'ER',    'EY',
            'F',     'G',     'HH',    'IH',    'IY',    'JH',    'K',
            'L',     'M',     'N',     'NG',    'OW',    'OY',    'P',
            'R',     'S',     'SH',    'T',     'TH',    'UH',    'UW',
            'V',     'W',     'Y',     'Z',     'ZH', '<sos>', '<eos>']
        phoneme_to_index = {symbol: idx for idx, symbol in enumerate(self.phonemes)}

        self.mfccs, self.transcripts = [], []

        for mfcc_name, transcript_name in zip(mfcc_names, transcript_names):
            # NOTE(review): allow_pickle=True can execute arbitrary code on
            # untrusted .npy files; only load data from a trusted source.
            mfcc = np.load(self.data_path + self.mfcc_dir + "/" + mfcc_name, allow_pickle=True)
            # Cepstral mean normalization: subtract each coefficient's mean over time.
            mfcc -= (np.mean(mfcc, axis=0) + 1e-8)

            transcript = np.load(self.data_path + self.transcript_dir + "/" + transcript_name, allow_pickle=True)
            # Drop the <sos>/<eos> markers with a boolean mask; this works however
            # many markers a file contains (including none).
            keep = (transcript != '<sos>') & (transcript != '<eos>')
            transcript = transcript[keep]

            self.mfccs.append(mfcc)
            self.transcripts.append(transcript)

        # NOTE:
        # Each mfcc is a (T_i x n_coefficients) matrix; each transcript is a 1-D
        # array of symbols with the <sos>/<eos> markers already removed.

        self.mfccs = np.concatenate(self.mfccs, axis = 0)
        # Dataset length is the number of real (unpadded) frames.
        self.length = len(self.mfccs)

        self.transcripts = np.concatenate(self.transcripts, axis = 0)

        # Zero-pad along time only, so every frame has a full context window.
        padding_width = [(self.context, self.context), (0, 0)]
        self.mfccs = np.pad(self.mfccs, padding_width, mode='constant', constant_values=0)

        # The network cannot predict strings, so map symbols to integers.
        # Encode the distinct symbols once and broadcast through the inverse
        # index instead of calling list.index() for every frame.
        symbols, inverse = np.unique(self.transcripts, return_inverse=True)
        unknown = [str(s) for s in symbols if s not in phoneme_to_index]
        if unknown:
            raise ValueError("Unknown phoneme symbol(s) in transcripts: {}".format(unknown))
        lookup = np.array([phoneme_to_index[s] for s in symbols])
        self.transcripts = lookup[inverse.reshape(-1)].astype(int)

    def __len__(self):
        return self.length

    def __getitem__(self, ind):
        # Because of the left padding, padded rows [ind, ind + 2*context] are
        # the window centred on original frame `ind`.
        frames = self.mfccs[ind:ind+2*self.context+1]
        # The MLP needs 1-D input, so flatten the (2*context+1) x n_coefficients window.
        frames = frames.flatten()

        frames = torch.FloatTensor(frames) # Convert to tensors
        phoneme = torch.tensor(self.transcripts[ind])

        return frames, phoneme

In [None]:
class AudioTestDataset(torch.utils.data.Dataset):
    """Frame-level MFCC dataset for the unlabeled test partition.

    Each utterance's MFCC matrix is mean-normalized per coefficient, all
    utterances are concatenated along the time axis, and the result is
    zero-padded with `context` frames on both ends. Item `i` is the flattened
    window of `2 * context + 1` frames centred on frame `i`.

    Args:
        data_path: Root directory that contains the `test-clean` folder.
        context: Number of neighbouring frames taken on each side of a frame.
        offset: Stored on the instance; not otherwise used by this class.
        limit: When positive, only the first `limit` files (in sorted order)
            are loaded; any other value (default -1) loads every file.
    """

    def __init__(self, data_path, context, offset=0, limit=-1):

        self.context = context
        self.offset = offset
        self.data_path = data_path

        self.mfcc_dir = "/test-clean/mfcc"
        # Sorted order keeps the frame order (and therefore prediction ids) deterministic.
        mfcc_names = sorted(os.listdir(self.data_path + self.mfcc_dir))

        # Optional subset for quick experiments.
        if limit is not None and limit > 0:
            mfcc_names = mfcc_names[:limit]

        self.mfccs = []
        for mfcc_name in mfcc_names:
            # NOTE(review): allow_pickle=True can execute arbitrary code on
            # untrusted .npy files; only load data from a trusted source.
            mfcc = np.load(self.data_path + self.mfcc_dir + "/" + mfcc_name, allow_pickle=True)
            # Cepstral mean normalization: subtract each coefficient's mean over time.
            mfcc -= (np.mean(mfcc, axis=0) + 1e-8)
            self.mfccs.append(mfcc)

        # Concatenate all utterances along time: final shape is T x n_coefficients
        # (where T = T1 + T2 + ...).
        self.mfccs = np.concatenate(self.mfccs, axis = 0)
        # Dataset length is the number of real (unpadded) frames.
        self.length = len(self.mfccs)

        # Zero-pad along time only, so every frame has a full context window.
        padding_width = [(self.context, self.context), (0, 0)]
        self.mfccs = np.pad(self.mfccs, padding_width, mode='constant', constant_values=0)

    def __len__(self):
        return self.length

    def __getitem__(self, ind):
        # Because of the left padding, padded rows [ind, ind + 2*context] are
        # the window centred on original frame `ind`.
        frames = self.mfccs[ind:ind+2*self.context+1]
        # The MLP needs 1-D input, so flatten the (2*context+1) x n_coefficients window.
        frames = frames.flatten()

        frames = torch.FloatTensor(frames) # Convert to tensors

        return frames
    

# Parameters Configuration

Storing your parameters and hyperparameters in a single configuration dictionary makes it easier to keep track of them during each experiment. It can also be used with weights and biases to log your parameters for each experiment and keep track of them across multiple experiments. 

In [None]:
# Single source of truth for the run's hyperparameters; the same dict is
# passed to wandb so each experiment records the values it ran with.
config = dict(
    epochs=90,
    batch_size=16384,
    context=50,
    offset=0,
    learning_rate=0.001,
    architecture='high-cutoff1',
    dropout=0.25,
    weight_decay=0.0001,
)

# Create Datasets

In [None]:
train_data = AudioDataset(data_path='/content/data', context = config['context'], offset=config['offset'], partition= "train", limit=-1) # Training set built with the AudioDataset class ("train" partition)
val_data = AudioDataset(data_path='/content/data', context = config['context'], offset=config['offset'], partition= "dev", limit=-1) # Validation set built with the AudioDataset class ("dev" partition)
test_data = AudioTestDataset(data_path='/content/data', context = config['context'], offset=config['offset'], limit=-1) # Unlabeled test set built with the AudioTestDataset class

In [None]:
# Build one DataLoader per split. Every loader yields batches of
# config['batch_size'] items; only the training loader shuffles.
train_loader = torch.utils.data.DataLoader(
    train_data,
    batch_size=config['batch_size'],
    shuffle=True,
    num_workers=4,
    pin_memory=True,
)

val_loader = torch.utils.data.DataLoader(
    val_data,
    batch_size=config['batch_size'],
    shuffle=False,
    num_workers=4,
    pin_memory=True,
)

test_loader = torch.utils.data.DataLoader(
    test_data,
    batch_size=config['batch_size'],
    shuffle=False,
    num_workers=4,
    pin_memory=True,
)

# Summary of the data pipeline configuration.
print("Batch size: ", config['batch_size'])
print("Context: ", config['context'])
print("Input size: ", (2*config['context']+1)*15)
print("Output symbols: ", len(PHONEMES))

# Sample and batch counts for each split.
print("Train dataset samples = {}, batches = {}".format(len(train_data), len(train_loader)))
print("Validation dataset samples = {}, batches = {}".format(len(val_data), len(val_loader)))
print("Test dataset samples = {}, batches = {}".format(len(test_data), len(test_loader)))

Batch size:  16384
Context:  50
Input size:  1515
Output symbols:  42
Train dataset samples = 36191134, batches = 2209
Validation dataset samples = 1937496, batches = 119
Test dataset samples = 1943253, batches = 119


In [None]:
# Smoke test: pull a single batch from the training loader and show its shapes.
for frames, phoneme in train_loader:
    print(frames.shape, phoneme.shape)
    break

torch.Size([16384, 1515]) torch.Size([16384])


# Network Architecture


In [None]:
from torch.nn.modules.dropout import Dropout
from torch.nn.modules.batchnorm import BatchNorm1d

class Network(torch.nn.Module):
    """MLP that maps a flattened window of MFCC frames to 40 phoneme logits.

    Every hidden stage is Linear -> Softplus -> BatchNorm1d -> Dropout, followed
    by a final Linear projection to the output classes.

    Args:
        context: Frames of context on each side; the input width is
            (2 * context + 1) * 15.
        dropout: Dropout probability for every hidden stage. When None, the
            value is read from the module-level `config['dropout']`.
        hidden_dims: Widths of the hidden layers. When None, the default stack
            [2048, 2048, 2048, 2048, 1024, 1024, 1024] is used.
    """

    def __init__(self, context, dropout=None, hidden_dims=None):

        super(Network, self).__init__()

        if dropout is None:
            # Backward-compatible fallback to the notebook-level configuration.
            dropout = config['dropout']
        if hidden_dims is None:
            hidden_dims = [2048, 2048, 2048, 2048, 1024, 1024, 1024]

        input_size = (2*context + 1) * 15
        output_size = 40
        dimension_list = [input_size, *hidden_dims, output_size]

        # Hidden stages: one per consecutive pair of widths, excluding the output layer.
        layers = []
        for in_dim, out_dim in zip(dimension_list[:-2], dimension_list[1:-1]):
            layers.append(torch.nn.Linear(in_dim, out_dim))
            layers.append(torch.nn.Softplus())
            layers.append(torch.nn.BatchNorm1d(out_dim))
            layers.append(torch.nn.Dropout(p = dropout))

        # The input width of the output layer is known, so use a regular Linear:
        # its parameters exist immediately, without needing a forward pass first.
        layers.append(torch.nn.Linear(dimension_list[-2], dimension_list[-1]))
        self.model = torch.nn.Sequential(*layers)

    def forward(self, x):
        # Collapse everything after the batch dimension into one feature axis.
        out = x.reshape((x.shape[0], -1))
        return self.model(out)

# Define Model, Loss Function and Optimizer

Here we define the model, loss function, optimizer and optionally a learning rate scheduler. 

In [None]:
def init_weights(m):
    """Apply Kaiming-normal (fan-in) initialization to the weight of a Linear layer.

    Intended for `model.apply(init_weights)`; modules of any other type are
    left untouched, and biases are not modified.

    Args:
        m: A module visited by `torch.nn.Module.apply`.
    """
    # Exact type match on purpose: subclasses such as LazyLinear are skipped,
    # since their weight may not be materialized yet.
    if type(m) is torch.nn.Linear:
        # In-place variant; the non-underscore `kaiming_normal` is deprecated.
        torch.nn.init.kaiming_normal_(m.weight, mode = 'fan_in')
      
   


In [None]:
# Width of one flattened input window: (2*context + 1) frames x 15 coefficients.
input_size = 15*(2*config['context'] + 1)
model = Network(config['context']).to(device)
# Applying initialization to our net
model.apply(init_weights)
# Checkpoint location on the mounted Drive. The previous value had literal double
# quotes embedded in the string and a relative "./content" prefix, so it did not
# name a real path.
checkpoint_path = '/content/drive/MyDrive/11785/HW1/model_checkpoint.pth'
# One real batch is used as the example input for the summary.
frames,phoneme = next(iter(train_loader))
# Check number of parameters of your network - Remember, you are limited to 20 million parameters for HW1 (including ensembles)
summary(model, frames.to(device))

  after removing the cwd from sys.path.


                         Kernel Shape   Output Shape     Params  Mult-Adds
Layer                                                                     
0_model.Linear_0         [1515, 2048]  [16384, 2048]  3.104768M   3.10272M
1_model.Softplus_1                  -  [16384, 2048]          -          -
2_model.BatchNorm1d_2          [2048]  [16384, 2048]     4.096k     2.048k
3_model.Dropout_3                   -  [16384, 2048]          -          -
4_model.Linear_4         [2048, 2048]  [16384, 2048]  4.196352M  4.194304M
5_model.Softplus_5                  -  [16384, 2048]          -          -
6_model.BatchNorm1d_6          [2048]  [16384, 2048]     4.096k     2.048k
7_model.Dropout_7                   -  [16384, 2048]          -          -
8_model.Linear_8         [2048, 2048]  [16384, 2048]  4.196352M  4.194304M
9_model.Softplus_9                  -  [16384, 2048]          -          -
10_model.BatchNorm1d_10        [2048]  [16384, 2048]     4.096k     2.048k
11_model.Dropout_11      

  df_sum = df.sum()


Unnamed: 0_level_0,Kernel Shape,Output Shape,Params,Mult-Adds
Layer,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
0_model.Linear_0,"[1515, 2048]","[16384, 2048]",3104768.0,3102720.0
1_model.Softplus_1,-,"[16384, 2048]",,
2_model.BatchNorm1d_2,[2048],"[16384, 2048]",4096.0,2048.0
3_model.Dropout_3,-,"[16384, 2048]",,
4_model.Linear_4,"[2048, 2048]","[16384, 2048]",4196352.0,4194304.0
5_model.Softplus_5,-,"[16384, 2048]",,
6_model.BatchNorm1d_6,[2048],"[16384, 2048]",4096.0,2048.0
7_model.Dropout_7,-,"[16384, 2048]",,
8_model.Linear_8,"[2048, 2048]","[16384, 2048]",4196352.0,4194304.0
9_model.Softplus_9,-,"[16384, 2048]",,


In [None]:
# Loss function: cross-entropy over the phoneme classes.
criterion = torch.nn.CrossEntropyLoss()

# Optimizer: AdamW with the learning rate and weight decay taken from `config`.
optimizer = torch.optim.AdamW(
    model.parameters(),
    lr=config['learning_rate'],
    weight_decay=config['weight_decay'],
)

# Scheduler: reduce the learning rate when the monitored value stops decreasing.
# Other options include StepLR, MultiStepLR, CosineAnnealingLR, etc.; see the
# PyTorch documentation for how to use them.
scheduler = torch.optim.lr_scheduler.ReduceLROnPlateau(
    optimizer,
    mode='min',
    patience=5,
    threshold=0.001,
)

# Training and Validation Functions

In [None]:
# Release cached CUDA memory held by PyTorch, then run a Python garbage-collection pass.
# As the last expression of the cell, the return value of gc.collect() is displayed.
torch.cuda.empty_cache()
gc.collect()

2048

In [None]:
def train(model, optimizer, criterion, dataloader, scheduler=None):
    """Train `model` for one epoch and return the mean batch loss.

    Args:
        model: Network to optimize; batches are moved to the device of its parameters.
        optimizer: Optimizer that updates `model`'s parameters.
        criterion: Loss function called as `criterion(logits, targets)`.
        dataloader: Iterable yielding `(mfccs, phonemes)` batches.
        scheduler: Optional scheduler whose `step(metric)` is called once per
            epoch. When None, a module-level `scheduler` is used if one exists,
            which preserves the behaviour of calls that omit this argument.

    Returns:
        Mean training loss over the batches as a float, or 0.0 when the
        dataloader yields no batches (the scheduler is not stepped in that case).
    """
    if scheduler is None:
        scheduler = globals().get('scheduler')

    model.train()

    # Run on whatever device the model lives on; fall back to the module-level
    # `device` (or CPU) for a model without parameters.
    first_param = next(model.parameters(), None)
    run_device = first_param.device if first_param is not None else globals().get('device', 'cpu')

    running_loss = 0.0 # Monitoring Loss
    num_batches = 0

    for mfccs, phonemes in dataloader:

        ### Move Data to Device (Ideally GPU)
        mfccs = mfccs.to(run_device)
        phonemes = phonemes.to(run_device)

        ### Forward Propagation
        logits = model(mfccs)

        ### Loss Calculation
        loss = criterion(logits, phonemes)
        running_loss += loss.item()
        num_batches += 1

        ### Initialize Gradients
        optimizer.zero_grad()

        ### Backward Propagation
        loss.backward()

        ### Gradient Descent
        optimizer.step()

    if num_batches == 0:
        return 0.0

    train_loss = running_loss / num_batches

    # Step the plateau scheduler on the epoch-mean loss rather than on the noisy
    # loss of whichever batch happened to come last.
    if scheduler is not None:
        scheduler.step(train_loss)

    return train_loss

In [None]:
def eval(model, dataloader):
    """Return the frame-level accuracy of `model` on `dataloader`, in percent.

    Args:
        model: Network to evaluate; it is switched to evaluation mode.
        dataloader: Iterable yielding `(frames, phonemes)` batches.

    Returns:
        Percentage (0-100) of frames whose arg-max prediction equals the label,
        or 0.0 when the dataloader yields no samples.
    """
    model.eval() # set model in evaluation mode

    # Run on whatever device the model lives on; fall back to the module-level
    # `device` (or CPU) for a model without parameters.
    first_param = next(model.parameters(), None)
    run_device = first_param.device if first_param is not None else globals().get('device', 'cpu')

    # Keep running counts instead of accumulating every prediction in Python lists.
    correct = 0
    total = 0

    # No gradients are needed during evaluation.
    with torch.inference_mode():
        for frames, phonemes in dataloader:
            ### Move data to device (ideally GPU)
            frames, phonemes = frames.to(run_device), phonemes.to(run_device)

            ### Forward Propagation
            logits = model(frames)

            ### Get Predictions
            predicted_phonemes = torch.argmax(logits, dim=1)

            ### Count matches for this batch
            correct += (predicted_phonemes == phonemes).sum().item()
            total += phonemes.numel()

    ### Calculate Accuracy
    if total == 0:
        return 0.0
    return correct / total * 100

# Weights and Biases Setup

In [None]:
# Never hardcode the API key in the notebook: read it from the WANDB_API_KEY
# environment variable. When the variable is unset, key=None is passed and wandb
# falls back to its own credential lookup.
# NOTE(review): the key that was previously committed here should be revoked at wandb.ai/settings.
wandb.login(key=os.environ.get("WANDB_API_KEY"))



True

In [None]:
# Start the wandb run that the training loop logs to.
run = wandb.init(
    project="hw1p2",        # must already exist in your wandb account
    name="high-cutoff2",    # explicit run name; wandb picks a random one if omitted
    config=config,          # hyperparameters recorded with the run
    reinit=True,            # allow re-running this cell to start a fresh run
)

VBox(children=(Label(value='0.003 MB of 0.003 MB uploaded (0.000 MB deduped)\r'), FloatProgress(value=1.0, max…

0,1
train loss,▁
validation accuracy,▁

0,1
train loss,0.83526
validation accuracy,80.80063


In [None]:
### Save your model architecture as a string with str(model)
model_arch = str(model)

### Save it in a txt file; the context manager closes the file even if the write fails
with open("model_arch.txt", "w") as arch_file:
    arch_file.write(model_arch)

### log it in your wandb run with wandb.save()
wandb.save('model_arch.txt')

['/content/wandb/run-20220925_155151-2vplklsn/files/model_arch.txt']

# Experiment

Now, it is time to finally run your ablations! Have fun!

In [None]:
# Iterate over number of epochs to train and evaluate your model
torch.cuda.empty_cache()

best_acc = 0.0 ### Best validation accuracy seen so far in this run
checkpoint_file = './model_checkpoint.pth' ### Written locally, then uploaded to wandb

for epoch in range(config['epochs']):
    print("\nEpoch {}/{}".format(epoch+1, config['epochs']))

    train_loss = train(model, optimizer, criterion, train_loader)
    accuracy = eval(model, val_loader)

    print("\tTrain Loss: {:.4f}".format(train_loss))
    print("\tValidation Accuracy: {:.2f}%".format(accuracy))


    ### Log metrics at each epoch in your run - Optionally, you can log at each batch inside train/eval functions (explore wandb documentation/wandb recitation)
    wandb.log({"train loss": train_loss, "validation accuracy": accuracy})

    ### Save checkpoint if accuracy is better than your current best
    if accuracy >= best_acc:
      ### Record the new best; without this update every epoch would overwrite
      ### the "best" checkpoint regardless of its accuracy.
      best_acc = accuracy

      ### Save checkpoint with information you want
      torch.save({'epoch': epoch,
              'model_state_dict': model.state_dict(),
              'optimizer_state_dict': optimizer.state_dict(),
              'loss': train_loss,
              'acc': accuracy},
        checkpoint_file)

      ### Save checkpoint in wandb (must be the same file that was just written)
      wandb.save(checkpoint_file)

    # Is your training time very high? Look into mixed precision training if your GPU (Tesla T4, V100, etc) can make use of it
    # Refer - https://pytorch.org/docs/stable/notes/amp_examples.html

### Finish your wandb run
run.finish()


Epoch 1/90
	Train Loss: 0.7882
	Validation Accuracy: 81.38%

Epoch 2/90
	Train Loss: 0.5754
	Validation Accuracy: 83.44%

Epoch 3/90
	Train Loss: 0.5209
	Validation Accuracy: 84.34%

Epoch 4/90
	Train Loss: 0.4899
	Validation Accuracy: 84.91%

Epoch 5/90
	Train Loss: 0.4690
	Validation Accuracy: 85.32%

Epoch 6/90
	Train Loss: 0.4533
	Validation Accuracy: 85.63%

Epoch 7/90
	Train Loss: 0.4410
	Validation Accuracy: 85.86%

Epoch 8/90
	Train Loss: 0.4311
	Validation Accuracy: 86.01%

Epoch 9/90
	Train Loss: 0.4228
	Validation Accuracy: 86.14%

Epoch 10/90
	Train Loss: 0.4155
	Validation Accuracy: 86.28%

Epoch 11/90
	Train Loss: 0.4095
	Validation Accuracy: 86.41%

Epoch 12/90
	Train Loss: 0.4040
	Validation Accuracy: 86.49%

Epoch 13/90
	Train Loss: 0.3993
	Validation Accuracy: 86.59%

Epoch 14/90
	Train Loss: 0.3947
	Validation Accuracy: 86.64%

Epoch 15/90
	Train Loss: 0.3909
	Validation Accuracy: 86.74%

Epoch 16/90
	Train Loss: 0.3874
	Validation Accuracy: 86.77%

Epoch 17/90
	Tra

VBox(children=(Label(value='0.002 MB of 0.002 MB uploaded (0.000 MB deduped)\r'), FloatProgress(value=1.0, max…

0,1
train loss,█▄▃▃▂▂▂▂▂▂▂▂▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁
validation accuracy,▁▄▅▆▇▇▇▇▇▇▇▇████████████████████████████

0,1
train loss,0.32779
validation accuracy,87.58315


# Testing and submission to Kaggle

In [None]:
def test(model, test_loader):
  """Predict a phoneme index for every frame yielded by `test_loader`.

  The model is put in evaluation mode and run under `torch.inference_mode()`.
  Each batch is cast to float and moved to the module-level `device`.

  Returns:
      A flat Python list with the arg-max class index of every frame, in
      loader order.
  """
  model.eval()

  test_predictions = []

  # Inference only: no gradient tracking.
  with torch.inference_mode():
      for batch in tqdm(test_loader):
          logits = model(batch.float().to(device))
          # Most likely class per frame, brought back to the host as plain ints.
          batch_labels = torch.argmax(logits, dim=1).cpu().tolist()
          test_predictions += batch_labels

  return test_predictions

In [None]:
# Run the `test` helper over the test loader and keep the predicted class indices.
predictions = test(model, test_loader)

100%|██████████| 119/119 [00:10<00:00, 11.59it/s]


In [None]:
### Write the predictions as a two-column CSV: row id (position in `predictions`) and label
with open("./submission.csv", "w+") as f:
    f.write("id,label\n")
    f.writelines("{},{}\n".format(row_id, label) for row_id, label in enumerate(predictions))

In [None]:
### Submit to kaggle competition using kaggle API
### (uploads ./submission.csv to the 11-785-f22-hw1p2 competition with the message "Test Submission")
!kaggle competitions submit -c 11-785-f22-hw1p2 -f ./submission.csv -m "Test Submission"

100% 18.6M/18.6M [00:02<00:00, 8.10MB/s]
Successfully submitted to Frame-Level Speech Recognition