# Frame-Level Speech Recognition with Multilayer Perceptron

In this project, we work with MFCC data consisting of 28 features at each time step (frame). The model is trained to recognize the phoneme occurring in each frame. After completion, its predictions are submitted to the ["11785 HW1P2 Fall 2024" competition](https://www.kaggle.com/competitions/11785-hw1p2-f24/overview).



This project was completed as the first homework of Carnegie Mellon University's 11-785: Deep Learning course.

# Libraries

In [1]:
!pip install torchsummaryX==1.1.0 wandb --quiet

In [2]:
import torch
import numpy as np
from torchsummaryX import summary
import sklearn
import gc
import zipfile
import pandas as pd
from tqdm.auto import tqdm
import os
import datetime
import wandb
# Prefer the GPU whenever PyTorch can see one; otherwise run on the CPU.
if torch.cuda.is_available():
    device = 'cuda'
else:
    device = 'cpu'
print("Device: ", device)

Device:  cuda


In [3]:
### PHONEME LIST
# Ordered label inventory: a symbol's position in this list is its class id.
PHONEMES = [
    '[SIL]',
    'AA', 'AE', 'AH', 'AO', 'AW', 'AY',
    'B', 'CH', 'D', 'DH',
    'EH', 'ER', 'EY',
    'F', 'G', 'HH',
    'IH', 'IY',
    'JH', 'K', 'L', 'M', 'N', 'NG',
    'OW', 'OY',
    'P', 'R', 'S', 'SH', 'T', 'TH',
    'UH', 'UW',
    'V', 'W', 'Y', 'Z', 'ZH',
    '[SOS]', '[EOS]',
]

In [5]:
# Never commit API keys to the notebook: take the key from the environment.
# When WANDB_API_KEY is unset, key=None leaves the credential lookup to wandb
# itself (presumably stored credentials or an interactive prompt - see the
# wandb.login documentation).
# NOTE(review): the key that used to be hard-coded here has been exposed and
# should be revoked in the wandb account settings.
wandb.login(key=os.environ.get("WANDB_API_KEY"))

[34m[1mwandb[0m: Currently logged in as: [33mmobin-roohi[0m ([33mmobin-roohi-university-of-tehran[0m). Use [1m`wandb login --relogin`[0m to force relogin
[34m[1mwandb[0m: Appending key for api.wandb.ai to your netrc file: /root/.netrc


True

# Dataset Class

In [6]:
# Dataset class to load train and validation data

class AudioDataset(torch.utils.data.Dataset):
    """Frame-level dataset of MFCC context windows paired with phoneme labels.

    Every utterance found under ``<root>/<partition>/mfcc`` is loaded,
    normalized per utterance, and stacked into one long frame matrix. Item
    ``i`` is frame ``i`` together with ``context`` frames on each side,
    flattened to a 1-D vector, plus the integer id of that frame's phoneme.

    Utterances are concatenated before padding, so a window close to an
    utterance boundary also contains frames of the neighbouring utterance;
    only the two ends of the whole corpus are zero-padded.

    Args:
        root: Directory that contains the partition folders.
        phonemes: Ordered phoneme symbols; a symbol's position is its class id.
        context: Number of neighbouring frames taken on each side of a frame.
        partition: Sub-directory of ``root`` to load.
    """

    def __init__(self, root, phonemes=PHONEMES, context=0, partition="train-clean-100"):
        self.context = context
        self.phonemes = phonemes

        # MFCC directory
        self.mfcc_dir = os.path.join(root, partition, 'mfcc')

        # Transcripts directory
        self.transcript_dir = os.path.join(root, partition, 'transcript')

        # Both listings are sorted so that files are paired by position
        # (assumes matching file names in the two directories).
        mfcc_names = sorted(os.listdir(self.mfcc_dir))
        transcript_names = sorted(os.listdir(self.transcript_dir))
        assert len(mfcc_names) == len(transcript_names)

        self.mfccs, self.transcripts = [], []

        for i, (mfcc_name, transcript_name) in enumerate(zip(mfcc_names, transcript_names)):
            if i % 1000 == 0:
                print(f"Data {i} / {len(mfcc_names)}")

            # Load a single utterance and normalize it on its own statistics
            mfcc = np.load(os.path.join(self.mfcc_dir, mfcc_name))
            mfcc_norm = self.cepstral_normalization(mfcc)

            # Load the corresponding transcript, dropping the first and last
            # entries ([SOS] and [EOS])
            transcript = np.load(os.path.join(self.transcript_dir, transcript_name))[1:-1]

            # Save the preprocessed MFCCs and transcripts
            self.mfccs.append(mfcc_norm)
            self.transcripts.append(transcript)

        # Concatenate all utterances into one frame matrix / one label vector
        self.mfccs = np.vstack(self.mfccs)
        self.transcripts = np.concatenate(self.transcripts, axis=0)

        # The dataset length is the number of real (unpadded) frames
        self.length = len(self.mfccs)

        # Pad `context` zero frames at both ends so every frame has a full
        # window. The padding takes its width and dtype from the data: the
        # float64 default of np.zeros would upcast a float32 feature matrix
        # in vstack and double its memory footprint.
        zero_padding = np.zeros((self.context, self.mfccs.shape[1]), dtype=self.mfccs.dtype)
        self.mfccs = np.vstack([zero_padding, self.mfccs, zero_padding])

        # Map transcript symbols to integer ids using the phoneme list
        encoding_dict = {string: idx for idx, string in enumerate(phonemes)}
        self.transcripts = np.array([encoding_dict[t] for t in self.transcripts])

    def __len__(self):
        """Return the number of frames (samples) in the dataset."""
        return self.length

    def __getitem__(self, ind):
        """Return ``(window, label)`` for frame ``ind``.

        ``window`` is a float tensor holding the ``2 * context + 1`` frames
        centred on ``ind``, flattened; ``label`` is the phoneme id tensor.
        """
        # Because of the leading padding, padded row `ind + context` is frame
        # `ind`, so this slice is centred on the requested frame.
        frames = self.mfccs[ind : ind + 2 * self.context + 1, :]

        # After slicing, the array has shape (2*context+1, n_features), but
        # the MLP needs 1-D input.
        frames = frames.flatten()

        # Convert to tensors
        frames = torch.FloatTensor(frames)
        phonemes = torch.tensor(self.transcripts[ind])

        return frames, phonemes

    def cepstral_normalization(self, mfcc):
        """Standardize each feature column of one utterance.

        Subtracts the per-column mean and divides by the per-column standard
        deviation; a small epsilon keeps constant columns from dividing by
        zero.
        """
        mean = np.mean(mfcc, axis=0)
        std = np.std(mfcc, axis=0) + 1e-8

        return (mfcc - mean) / std

In [7]:
# Dataset class to load test data (without transcripts)

class AudioTestDataset(torch.utils.data.Dataset):
    """Frame-level dataset of MFCC context windows without labels.

    Every utterance found under ``<root>/<partition>/mfcc`` is loaded,
    normalized per utterance, and stacked into one long frame matrix. Item
    ``i`` is frame ``i`` together with ``context`` frames on each side,
    flattened to a 1-D vector.

    Utterances are concatenated before padding, so a window close to an
    utterance boundary also contains frames of the neighbouring utterance;
    only the two ends of the whole corpus are zero-padded.

    Args:
        root: Directory that contains the partition folders.
        phonemes: Ordered phoneme symbols; stored on the instance only.
        context: Number of neighbouring frames taken on each side of a frame.
        partition: Sub-directory of ``root`` to load.
    """

    def __init__(self, root, phonemes=PHONEMES, context=0, partition="test-clean"):
        self.context = context
        self.phonemes = phonemes

        # MFCC directory
        self.mfcc_dir = os.path.join(root, partition, 'mfcc')

        # Sorted so the frame order is deterministic across runs
        mfcc_names = sorted(os.listdir(self.mfcc_dir))

        self.mfccs = []

        for mfcc_name in mfcc_names:
            # Load a single utterance and normalize it on its own statistics
            mfcc = np.load(os.path.join(self.mfcc_dir, mfcc_name))
            mfcc_norm = self.cepstral_normalization(mfcc)

            # Save the preprocessed MFCCs
            self.mfccs.append(mfcc_norm)

        # Concatenate all utterances into one frame matrix
        self.mfccs = np.vstack(self.mfccs)

        # The dataset length is the number of real (unpadded) frames
        self.length = len(self.mfccs)

        # Pad `context` zero frames at both ends so every frame has a full
        # window. The padding takes its width and dtype from the data: the
        # float64 default of np.zeros would upcast a float32 feature matrix
        # in vstack and double its memory footprint.
        zero_padding = np.zeros((self.context, self.mfccs.shape[1]), dtype=self.mfccs.dtype)
        self.mfccs = np.vstack([zero_padding, self.mfccs, zero_padding])

    def __len__(self):
        """Return the number of frames (samples) in the dataset."""
        return self.length

    def __getitem__(self, ind):
        """Return the flattened ``2 * context + 1`` frame window centred on ``ind``."""
        # Because of the leading padding, padded row `ind + context` is frame
        # `ind`, so this slice is centred on the requested frame.
        frames = self.mfccs[ind : ind + 2 * self.context + 1, :]

        # After slicing, the array has shape (2*context+1, n_features), but
        # the MLP needs 1-D input.
        frames = frames.flatten()

        # Convert to tensor
        return torch.FloatTensor(frames)

    def cepstral_normalization(self, mfcc):
        """Standardize each feature column of one utterance.

        Subtracts the per-column mean and divides by the per-column standard
        deviation; a small epsilon keeps constant columns from dividing by
        zero.
        """
        mean = np.mean(mfcc, axis=0)
        std = np.std(mfcc, axis=0) + 1e-8

        return (mfcc - mean) / std

# Parameters Configuration

We store the parameters and hyperparameters in a single configuration dictionary to make it easier to keep track of them during each experiment. It can also be used with weights and biases to log the parameters for each experiment and keep track of them across multiple experiments.

In [8]:
# Hyperparameters of the current experiment, kept together in one dictionary.
config = dict(
    epochs=5,
    batch_size=1024,
    context=40,                    # neighbouring frames on each side of a frame
    init_lr=1e-3,                  # starting learning rate for the optimizer
    architecture='initial-stage',
    gamma=0.5,
    # Add more as needed - e.g dropout values, weight decay, scheduler parameters
)

# Create Datasets

In [9]:
# All three partitions are read from the same Kaggle input directory and share
# the phoneme inventory and the context size.
DATA_ROOT = "/kaggle/input/11785-hw1p2-f24/11785-f24-hw1p2"
dataset_kwargs = {"phonemes": PHONEMES, "context": config["context"]}

# Train/validation data (MFCCs with transcripts)
train_data = AudioDataset(DATA_ROOT, partition="train-clean-100", **dataset_kwargs)
val_data = AudioDataset(DATA_ROOT, partition="dev-clean", **dataset_kwargs)

# Test data (MFCCs only)
test_data = AudioTestDataset(DATA_ROOT, partition="test-clean", **dataset_kwargs)

Data 0 / 28539
Data 1000 / 28539
Data 2000 / 28539
Data 3000 / 28539
Data 4000 / 28539
Data 5000 / 28539
Data 6000 / 28539
Data 7000 / 28539
Data 8000 / 28539
Data 9000 / 28539
Data 10000 / 28539
Data 11000 / 28539
Data 12000 / 28539
Data 13000 / 28539
Data 14000 / 28539
Data 15000 / 28539
Data 16000 / 28539
Data 17000 / 28539
Data 18000 / 28539
Data 19000 / 28539
Data 20000 / 28539
Data 21000 / 28539
Data 22000 / 28539
Data 23000 / 28539
Data 24000 / 28539
Data 25000 / 28539
Data 26000 / 28539
Data 27000 / 28539
Data 28000 / 28539
Data 0 / 2703
Data 1000 / 2703
Data 2000 / 2703


In [10]:
# Dataloaders for train, val and test datasets


# Settings shared by every loader; only the training split is shuffled.
loader_common = dict(batch_size=config['batch_size'], pin_memory=True)

train_loader = torch.utils.data.DataLoader(
    train_data, num_workers=4, shuffle=True, **loader_common
)
val_loader = torch.utils.data.DataLoader(
    val_data, num_workers=2, shuffle=False, **loader_common
)
test_loader = torch.utils.data.DataLoader(
    test_data, num_workers=2, shuffle=False, **loader_common
)

# Summary of the data pipeline
context_frames = config['context']
print("Batch size     : ", config['batch_size'])
print("Context        : ", context_frames)
print("Input size     : ", (2*context_frames+1)*28)
print("Output symbols : ", len(PHONEMES))

print("Train dataset samples = {}, batches = {}".format(len(train_data), len(train_loader)))
print("Validation dataset samples = {}, batches = {}".format(len(val_data), len(val_loader)))
print("Test dataset samples = {}, batches = {}".format(len(test_data), len(test_loader)))

Batch size     :  1024
Context        :  40
Input size     :  2268
Output symbols :  42
Train dataset samples = 36091157, batches = 35246
Validation dataset samples = 1928204, batches = 1884
Test dataset samples = 1934138, batches = 1889


In [11]:
# Sanity check: fetch a single training batch and show its tensor shapes.
# `frames` stays defined afterwards and is reused as the example input for the
# model summary.
for frames, phoneme in train_loader:
    print(frames.shape, phoneme.shape)
    break

torch.Size([1024, 2268]) torch.Size([1024])


# Network Architecture


This section defines the network architecture for the homework. 

In [12]:
class Network(torch.nn.Module):
    """Multilayer perceptron mapping a flattened frame window to class logits.

    Four hidden layers (1024, 512, 256, 128 units) with ReLU activations;
    dropout is applied after the last two hidden layers only.
    """

    def __init__(self, input_size, output_size):
        super().__init__()

        nn = torch.nn
        layers = [
            nn.Linear(input_size, 1024), nn.ReLU(),
            nn.Linear(1024, 512), nn.ReLU(),
            nn.Linear(512, 256), nn.ReLU(), nn.Dropout(0.3),
            nn.Linear(256, 128), nn.ReLU(), nn.Dropout(0.5),
            nn.Linear(128, output_size),
        ]
        self.model = nn.Sequential(*layers)

    def forward(self, x):
        """Return the raw (unnormalized) logits for the batch ``x``."""
        return self.model(x)

# Define Model, Loss Function and Optimizer

Here we define the model, loss function, optimizer and optionally a learning rate scheduler.

In [13]:
# One sample is (2 * context + 1) frames of 28 MFCC features, flattened.
window_frames = 2 * config['context'] + 1
INPUT_SIZE = window_frames * 28
print(INPUT_SIZE)

# One output logit per phoneme symbol.
num_classes = len(train_data.phonemes)
model = Network(INPUT_SIZE, num_classes).to(device)

# `frames` is the batch fetched by the data-loader sanity check.
summary(model, frames.to(device))

2268
----------------------------------------------------------------------------------------------------
Layer                   Kernel Shape         Output Shape         # Params (K)      # Mult-Adds (M)
0_Linear                [2268, 1024]         [1024, 1024]             2,323.46                 2.32
1_ReLU                             -         [1024, 1024]                    -                    -
2_Linear                 [1024, 512]          [1024, 512]               524.80                 0.52
3_ReLU                             -          [1024, 512]                    -                    -
4_Linear                  [512, 256]          [1024, 256]               131.33                 0.13
5_ReLU                             -          [1024, 256]                    -                    -
6_Dropout                          -          [1024, 256]                    -                    -
7_Linear                  [256, 128]          [1024, 128]                32.90                

In [14]:
# Optimizer: Adam over every model parameter, starting at the configured rate.
optimizer = torch.optim.Adam(params=model.parameters(), lr=config['init_lr'])

# Loss function: cross-entropy computed directly on the network's raw logits.
criterion = torch.nn.CrossEntropyLoss()

# Defining Scheduler for Learning Rate
# scheduler = torch.optim.lr_scheduler.ExponentialLR(optimizer, gamma = config['gamma'])

# # Later: Mixed Precision Training
# !nvidia-smi

# Training and Validation Functions

In [16]:
# Collect unreachable Python objects first so that any tensors they still hold
# are freed; only then can empty_cache() release those blocks from PyTorch's
# caching allocator.
gc.collect()
torch.cuda.empty_cache()

0

In [17]:
from torch.amp import autocast, GradScaler
from tqdm import tqdm

def train(model, dataloader, optimizer, criterion, update_interval=100):
    """Run one training epoch with mixed precision and return its metrics.

    Batches are moved to the module-level ``device``.

    Args:
        model: Network to optimize; it is switched to training mode here.
        dataloader: Yields ``(frames, phonemes)`` batches.
        optimizer: Optimizer stepped once per batch.
        criterion: Loss, called as ``criterion(logits, phonemes)``.
        update_interval: Refresh the progress bar every this many batches.

    Returns:
        ``(loss, accuracy)``, each the mean of the per-batch values; accuracy
        is a fraction in [0, 1].
    """
    model.train()
    tloss, tacc = 0, 0  # Running sums of per-batch loss and accuracy
    num_batches = len(dataloader)
    batch_bar = tqdm(total=num_batches, dynamic_ncols=True, leave=False, position=0, desc='Train')

    # The scaler is created per call, so its loss scale restarts every epoch.
    scaler = GradScaler()

    for i, (frames, phonemes) in enumerate(dataloader):
        ### Initialize Gradients
        optimizer.zero_grad()

        ### Move Data to Device (Ideally GPU)
        frames = frames.to(device)
        phonemes = phonemes.to(device)

        ### Forward Propagation and Loss Calculation under autocast
        with autocast(device_type=device, dtype=torch.float16):
            logits = model(frames)
            loss = criterion(logits, phonemes)

        ### Backward Propagation
        # The loss is scaled before backward; the scaler then steps the
        # optimizer and updates its scale factor.
        scaler.scale(loss).backward()
        scaler.step(optimizer)
        scaler.update()

        tloss += loss.item()
        tacc += torch.sum(torch.argmax(logits, dim=1) == phonemes).item() / logits.shape[0]

        # Update progress bar less frequently
        if (i + 1) % update_interval == 0 or (i + 1) == num_batches:
            batch_bar.set_postfix(
                loss="{:.04f}".format(float(tloss / (i + 1))),
                acc="{:.04f}%".format(float(tacc * 100 / (i + 1)))
            )
            # Advance by the batches actually processed since the last
            # refresh; a fixed `update_interval` step would overshoot the
            # total on the final, shorter interval.
            batch_bar.update(i + 1 - batch_bar.n)

        ### Release references to this batch's tensors
        # No torch.cuda.empty_cache() here: calling it every batch only slows
        # training down, since the caching allocator reuses freed blocks
        # without it.
        del frames, phonemes, logits

    batch_bar.close()
    tloss /= num_batches
    tacc /= num_batches

    return tloss, tacc


In [18]:
def eval(model, dataloader, update_interval=100):
    """Evaluate ``model`` on ``dataloader`` without updating its weights.

    The loss is computed with the module-level ``criterion`` and batches are
    moved to the module-level ``device``. The function name shadows the
    ``eval`` builtin; it is kept because existing callers use it.

    Args:
        model: Network to evaluate; it is switched to evaluation mode here.
        dataloader: Yields ``(frames, phonemes)`` batches.
        update_interval: Refresh the progress bar every this many batches.

    Returns:
        ``(loss, accuracy)``, each the mean of the per-batch values; accuracy
        is a fraction in [0, 1].
    """
    model.eval()  # set model in evaluation mode
    vloss, vacc = 0, 0  # Running sums of per-batch loss and accuracy

    # Size everything from the loader that was passed in, not from the global
    # `val_loader`, so the function is correct for any dataloader.
    num_batches = len(dataloader)
    batch_bar = tqdm(total=num_batches, dynamic_ncols=True, position=0, leave=False, desc='Val')

    for i, (frames, phonemes) in enumerate(dataloader):

        ### Move data to device (ideally GPU)
        frames = frames.to(device)
        phonemes = phonemes.to(device)

        # No gradients are needed because the model is not being trained here
        with torch.inference_mode():
            ### Forward Propagation
            logits = model(frames)
            ### Loss Calculation
            loss = criterion(logits, phonemes)

        vloss += loss.item()
        vacc += torch.sum(torch.argmax(logits, dim=1) == phonemes).item() / logits.shape[0]

        # Update progress bar less frequently
        if (i + 1) % update_interval == 0 or (i + 1) == num_batches:
            batch_bar.set_postfix(
                loss="{:.04f}".format(float(vloss / (i + 1))),
                acc="{:.04f}%".format(float(vacc * 100 / (i + 1)))
            )
            # Advance by the batches actually processed since the last
            # refresh; a fixed `update_interval` step would overshoot the
            # total on the final, shorter interval.
            batch_bar.update(i + 1 - batch_bar.n)

        ### Release references to this batch's tensors
        # No torch.cuda.empty_cache() here: calling it every batch only slows
        # evaluation down, since the caching allocator reuses freed blocks
        # without it.
        del frames, phonemes, logits

    batch_bar.close()
    vloss /= num_batches
    vacc /= num_batches

    return vloss, vacc

# Weights and Biases Setup

In [19]:
# wandb.login(key=input("Please enter your wandb API key: ")) 

In [22]:
# Create your wandb run: the display name, the project it is filed under, and
# the hyperparameter dictionary recorded with it.
run_settings = {
    "name": "more context with dropout in later layers w/o scheduling",
    "project": "hw1p2-CMU",
    "config": config,
    # "reinit": True,
}
run = wandb.init(**run_settings)

In [23]:
# Write the printed model architecture to a text file and attach it to the
# wandb run.
model_arch = str(model)

# The context manager closes the file even if the write raises.
with open("model_arch.txt", "w") as arch_file:
    arch_file.write(model_arch)

wandb.save('model_arch.txt')

['/kaggle/working/wandb/run-20250121_123856-eckfd27r/files/model_arch.txt']

# Experiment

Finally, we run our ablations!

In [24]:
# Train and evaluate the model once per epoch, logging the metrics to wandb.
torch.cuda.empty_cache()
gc.collect()
wandb.watch(model, log="all")

total_epochs = config['epochs']

for epoch in range(total_epochs):

    print("\nEpoch {}/{}".format(epoch+1, total_epochs))

    # Read the learning rate before training so the reported value is the one
    # used during this epoch.
    curr_lr = float(optimizer.param_groups[0]['lr'])

    train_loss, train_acc = train(model, train_loader, optimizer, criterion)
    val_loss, val_acc = eval(model, val_loader)

    # Learning-rate scheduling is currently disabled:
    # scheduler.step()

    train_pct = train_acc*100
    val_pct = val_acc*100
    print("\tTrain Acc {:.04f}%\tTrain Loss {:.04f}\t Learning Rate {:.07f}".format(train_pct, train_loss, curr_lr))
    print("\tVal Acc {:.04f}%\tVal Loss {:.04f}".format(val_pct, val_loss))

    ### Log metrics at each epoch
    epoch_metrics = {
        'train_acc': train_pct,
        'train_loss': train_loss,
        'val_acc': val_pct,
        'valid_loss': val_loss,
        'lr': curr_lr,
    }
    wandb.log(epoch_metrics)



Epoch 1/5


                                                                                       

	Train Acc 69.0653%	Train Loss 1.0597	 Learning Rate 0.0010000
	Val Acc 72.5205%	Val Loss 0.8842

Epoch 2/5


                                                                                       

	Train Acc 73.2684%	Train Loss 0.9196	 Learning Rate 0.0010000
	Val Acc 73.4671%	Val Loss 0.8526

Epoch 3/5


                                                                                       

	Train Acc 74.3437%	Train Loss 0.8867	 Learning Rate 0.0010000
	Val Acc 74.2590%	Val Loss 0.8348

Epoch 4/5


                                                                                       

	Train Acc 74.9076%	Train Loss 0.8699	 Learning Rate 0.0010000
	Val Acc 74.6118%	Val Loss 0.8269

Epoch 5/5


                                                                                       

	Train Acc 75.3034%	Train Loss 0.8585	 Learning Rate 0.0010000
	Val Acc 74.9362%	Val Loss 0.8221


