# **Speech Recognition**
**Spoken Digits Dataset - https://github.com/Jakobovski/free-spoken-digit-dataset**

In [1]:
import torch 
import torch.nn as nn
import torch.nn.functional as F
import torch.optim as optim
from torch.utils.data import Dataset, DataLoader

import glob
import numpy as np
import librosa
import random
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import LabelBinarizer

import tqdm.auto as tqdm
from torchsummary import summary

import warnings
warnings.filterwarnings('ignore')

In [2]:
if torch.cuda.is_available():
    device="cuda"
else:
    device="cpu"
print(f"Using {device}")

Using cuda


## **Data Preparation**

In [3]:
class SpokenDigiitsDataset(Dataset):
    def __init__(self, audio_dir) -> None:
        super(SpokenDigiitsDataset, self).__init__()
        self.labels = []
        self.audio_dir = audio_dir
        self.sample_size = 22050
        
    
    def __len__(self):
        return len(glob.glob(f'{self.audio_dir}/*.wav'))

    def _mel_spectrogram(self, audio):
        mel_spec = librosa.feature.melspectrogram(audio, sr=22050, n_fft=1024, hop_length=512, n_mels=64)
        mel_spec = librosa.power_to_db(mel_spec, ref=np.max)
        return mel_spec

    def _right_pad(self, signal):
        if signal.shape[0] < self.sample_size:
            missing_samples = self.sample_size - signal.shape[0]
            last_dim_padding = (0, missing_samples)
            signal = F.pad(torch.tensor(signal), last_dim_padding)
            signal = signal.numpy()
        return signal

    def _truncate(self, signal):
        if signal.shape[0] > self.sample_size:
            signal = signal[:self.sample_size]
        return signal

    

    def __getitem__(self, index):
        audio_path = glob.glob(f'{self.audio_dir}/*.wav')[index]
        audio, sr = librosa.load(audio_path)
        audio = librosa.resample(audio, sr, 22050)
        audio = librosa.util.normalize(audio)
        audio = self._truncate(audio)
        audio = self._right_pad(audio)
        audio = self._mel_spectrogram(audio)
        audio = torch.tensor(audio, dtype=torch.float32, device=device)
        
        label = audio_path.split('/')[-1].split('_')[1]
        if label not in self.labels:
            num = len(self.labels)
            self.labels.append(label)
        else:
            num = self.labels.index(label)
        
        label = num
        #label = torch.tensor(label, dtype=torch.int, device=device)
        return audio, label


In [4]:
datapath = "./data/spoken-digits/"
dataset = SpokenDigiitsDataset(datapath)
dataten, label = dataset[0]
print(dataten)

tensor([[-60.7656, -39.9687, -33.3869,  ..., -80.0000, -80.0000, -80.0000],
        [-54.5900, -21.0452, -15.9190,  ..., -80.0000, -80.0000, -80.0000],
        [-51.0485, -16.5689, -19.2177,  ..., -80.0000, -80.0000, -80.0000],
        ...,
        [-69.5762, -80.0000, -80.0000,  ..., -80.0000, -80.0000, -80.0000],
        [-69.7309, -80.0000, -80.0000,  ..., -80.0000, -80.0000, -80.0000],
        [-69.8274, -80.0000, -80.0000,  ..., -80.0000, -80.0000, -80.0000]],
       device='cuda:0')


## **PyTorch Model**

In [5]:
#Simple lstm model for speec recognition
class SpeechRecognitionModel(nn.Module):
    def __init__(self, input_size=44, hidden_size=64, num_layers=2, num_classes=6):
        super(SpeechRecognitionModel, self).__init__()
        self.num_layers = num_layers
        self.hidden_size = hidden_size
        self.lstm = nn.LSTM(input_size, hidden_size, num_layers, batch_first=True)
        self.fc = nn.Linear(hidden_size*64, num_classes)
    
    def forward(self, x):
        h0 = torch.zeros(self.num_layers, x.size(0), self.hidden_size).to(device)
        c0 = torch.zeros(self.num_layers, x.size(0), self.hidden_size).to(device)

        out, _ = self.lstm(x, (h0, c0))
        out = out.reshape(out.shape[0], -1)
        out = self.fc(out)
        return out

In [6]:
signal_shape = dataset[0][0].shape
print(signal_shape)
model = SpeechRecognitionModel().to(device)

torch.Size([64, 44])


## **Training**

In [10]:
def train(model, train_loader, loss_fn, optimizer, device, epochs):
    for i in range(epochs):
        model.train()
        for batch in tqdm.tqdm(train_loader):
            optimizer.zero_grad()
            x, y = batch
            x = x.to(device)
            y = y.to(device)
            y_hat = model(x)
            loss = loss_fn(y_hat, y)
            loss.backward()
            optimizer.step()
        print(f"Epoch {i} Loss: {loss.item()}")
    return model


In [11]:
batch_size = 64
train_dataloader = DataLoader(dataset, batch_size)

loss_fn = nn.CrossEntropyLoss()
optimizer = torch.optim.Adam(model.parameters(), lr=0.001)

device="cuda"
epochs=10



In [12]:
model = train(model, train_dataloader, loss_fn, optimizer, device, epochs)

torch.save(model.state_dict(), "Speech-Recognition.pth")
print("Trained Feed Forward Network")

  0%|          | 0/47 [00:00<?, ?it/s]

Epoch 0 Loss: 0.2653054893016815


  0%|          | 0/47 [00:00<?, ?it/s]

Epoch 1 Loss: 0.21847163140773773


  0%|          | 0/47 [00:00<?, ?it/s]

Epoch 2 Loss: 0.12817074358463287


  0%|          | 0/47 [00:00<?, ?it/s]

Epoch 3 Loss: 0.15764343738555908


  0%|          | 0/47 [00:00<?, ?it/s]

Epoch 4 Loss: 0.13405515253543854


  0%|          | 0/47 [00:00<?, ?it/s]

Epoch 5 Loss: 0.12168055772781372


  0%|          | 0/47 [00:00<?, ?it/s]

Epoch 6 Loss: 0.08614056557416916


  0%|          | 0/47 [00:00<?, ?it/s]

Epoch 7 Loss: 0.07463821023702621


  0%|          | 0/47 [00:00<?, ?it/s]

Epoch 8 Loss: 0.06552808731794357


  0%|          | 0/47 [00:00<?, ?it/s]

Epoch 9 Loss: 0.04016384854912758
Trained Feed Forward Network


## **Evaluation**

In [14]:
batch_size = 128
train_dataloader = DataLoader(dataset, batch_size)
model.eval()
predictions = []
for _, (X, y) in enumerate(train_dataloader):
    X = X.to(device)
    y = y.to(device)

    with torch.no_grad():
        pred = model(X)
        for i in range(len(pred)):
            arr = pred[i].to("cpu")
            predictions.append(np.argmax(arr))

    del pred
    torch.cuda.empty_cache()

In [None]:
count=0
i=0
print(len(predictions))
for _, (_, y) in enumerate(train_dataloader):
    y = y.to("cpu").numpy()
    for j in range(len(y)):
        if(predictions[i] == y[j]):
            print(predictions[i], y[j])
            count+=1
        i+=1

In [16]:
print(count/len(predictions))

0.9646666666666667
