In [None]:
import io
import math
import multiprocessing
import os
import pickle
import sys
import tarfile
import time
from pathlib import Path
from typing import Tuple, Union

import IPython.display as ipd
import librosa
import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
import requests
import scipy
import tensorflow as tf
import torch
import torch.nn as nn
import torch.nn.functional as F
import torch.optim as optim
import torchaudio
import torchaudio.transforms as T
from torchaudio.datasets import SPEECHCOMMANDS
from tqdm import tqdm

In [None]:
!pip3 install torchaudio

Collecting torchaudio
  Downloading torchaudio-0.9.0-cp37-cp37m-manylinux1_x86_64.whl (1.9 MB)
[?25l[K     |▏                               | 10 kB 23.9 MB/s eta 0:00:01[K     |▍                               | 20 kB 29.8 MB/s eta 0:00:01[K     |▌                               | 30 kB 33.0 MB/s eta 0:00:01[K     |▊                               | 40 kB 22.5 MB/s eta 0:00:01[K     |▉                               | 51 kB 11.2 MB/s eta 0:00:01[K     |█                               | 61 kB 12.5 MB/s eta 0:00:01[K     |█▏                              | 71 kB 13.8 MB/s eta 0:00:01[K     |█▍                              | 81 kB 15.0 MB/s eta 0:00:01[K     |█▌                              | 92 kB 10.7 MB/s eta 0:00:01[K     |█▊                              | 102 kB 11.5 MB/s eta 0:00:01[K     |██                              | 112 kB 11.5 MB/s eta 0:00:01[K     |██                              | 122 kB 11.5 MB/s eta 0:00:01[K     |██▎                             | 

# Introduction to PyTorch.
## 1. Prepare the data.

*  ### Apply transformations on the data.

## 2. Define the Network.
## 3. Define a Loss function and optimizer.
## 4. Train the network.
## 5. Make predictions.

### 1. Prepare the data.

In [None]:
import torchaudio
from torch.utils.data import Dataset
from torch import Tensor
from torchaudio.datasets.utils import (
    download_url,
    extract_archive,
)

# Directory name (below the dataset root) that the extracted archive lives in.
FOLDER_IN_ARCHIVE = "SpeechCommands"
# Default dataset version; SPEECHCOMMANDS.__init__ expands it into a download URL.
URL = "speech_commands_v0.02"
# Separator between speaker id and utterance number inside each wav file name.
HASH_DIVIDER = "_nohash_"
# Folder whose wav files are skipped when the dataset file list is built.
EXCEPT_FOLDER = "_background_noise_"
# MD5 checksums of the known archives, keyed by their full download URL.
_CHECKSUMS = {"https://storage.googleapis.com/download.tensorflow.org/data/speech_commands_v0.01.tar.gz":
    "3cd23799cb2bbdec517f1cc028f8d43c",
    "https://storage.googleapis.com/download.tensorflow.org/data/speech_commands_v0.02.tar.gz":
    "6b74f3901214cb2c2934e98196829835",
}

def _load_list(root, *filenames):
    """Read one or more split files and return the normalised paths they list.

    Args:
        root: Directory that contains the split files and the audio folders.
        *filenames: Names of text files (one relative wav path per line).

    Returns:
        A list of ``os.path.normpath``-ed paths, each joined onto ``root``.
    """
    output = []
    for filename in filenames:
        filepath = os.path.join(root, filename)
        with open(filepath) as fileobj:
            # Skip blank lines so they do not turn into a bogus entry equal to `root`.
            output += [
                os.path.normpath(os.path.join(root, line.strip()))
                for line in fileobj
                if line.strip()
            ]
    return output


def load_speechcommands_item(filepath, path) -> Tuple[Tensor, int, str, str, int]:
    """Load a single Speech Commands sample.

    Args:
        filepath: Path of the wav file to load.
        path: Dataset directory; the label is the wav file's folder relative to it.

    Returns:
        ``(waveform, sample_rate, label, speaker_id, utterance_number)``.
    """
    relpath = os.path.relpath(filepath, path)
    label, filename = os.path.split(relpath)
    speaker, _ = os.path.splitext(filename)
    speaker, _ = os.path.splitext(speaker)

    # File names look like "<speaker_id>_nohash_<utterance_number>.wav".
    speaker_id, utterance_number = speaker.split(HASH_DIVIDER)
    utterance_number = int(utterance_number)

    # TODO: resampling would go here.
    waveform, sample_rate = torchaudio.load(filepath)
    return waveform, sample_rate, label, speaker_id, utterance_number


class SPEECHCOMMANDS(Dataset):
    """Speech Commands dataset with optional training/validation/testing split.

    Args:
        root: Directory where the archive is stored (and downloaded to).
        url: Either a known version name ("speech_commands_v0.01" /
            "speech_commands_v0.02") or a full archive URL.
        folder_in_archive: Top-level folder, below ``root``, holding the data.
        download: Download and extract the archive when it is not present.
        subset: ``None`` (all files), ``"training"``, ``"validation"`` or
            ``"testing"``.

    Raises:
        ValueError: If ``subset`` is not one of the supported values.
    """

    _SUBSETS = (None, "training", "validation", "testing")

    def __init__(self,
                 root: Union[str, Path],
                 url=URL,
                 folder_in_archive=FOLDER_IN_ARCHIVE,
                 download=False,
                 subset=None,
                 ):
        if subset not in self._SUBSETS:
            # A typo here would otherwise silently load the whole dataset.
            raise ValueError(
                f"subset must be one of {self._SUBSETS}, got {subset!r}"
            )

        if url in [
            "speech_commands_v0.01",
            "speech_commands_v0.02",
        ]:
            base_url = "https://storage.googleapis.com/download.tensorflow.org/data/"
            ext_archive = ".tar.gz"

            # Plain concatenation: os.path.join would insert "\\" on Windows
            # and the result would no longer match the _CHECKSUMS keys.
            url = base_url + url + ext_archive

        root = os.fspath(root)

        basename = os.path.basename(url)
        archive = os.path.join(root, basename)

        # Strip the ".tar.gz" suffix to obtain the extraction folder name.
        basename = basename.rsplit(".", 2)[0]
        folder_in_archive = os.path.join(folder_in_archive, basename)

        self._path = os.path.join(root, folder_in_archive)

        if download:
            if not os.path.isdir(self._path):
                if not os.path.isfile(archive):
                    checksum = _CHECKSUMS.get(url, None)
                    download_url(url, root, hash_value=checksum, hash_type="md5")
                extract_archive(archive, self._path)

        if subset == "validation":
            self._walker = _load_list(self._path, "validation_list.txt")
        elif subset == "testing":
            self._walker = _load_list(self._path, "testing_list.txt")
        elif subset == "training":
            # Training = every utterance not listed in the other two splits.
            excludes = set(_load_list(self._path, "validation_list.txt", "testing_list.txt"))
            walker = sorted(str(p) for p in Path(self._path).glob('*/*.wav'))
            self._walker = [
                w for w in walker
                if HASH_DIVIDER in w
                and EXCEPT_FOLDER not in w
                and os.path.normpath(w) not in excludes
            ]
        else:
            walker = sorted(str(p) for p in Path(self._path).glob('*/*.wav'))
            self._walker = [w for w in walker if HASH_DIVIDER in w and EXCEPT_FOLDER not in w]

    def __getitem__(self, n: int) -> Tuple[Tensor, int, str, str, int]:
        """Return ``(waveform, sample_rate, label, speaker_id, utterance_number)`` for item ``n``."""
        fileid = self._walker[n]
        return load_speechcommands_item(fileid, self._path)

    def __len__(self) -> int:
        """Number of audio files in the selected subset."""
        return len(self._walker)

  


In [None]:
# Directory that holds (or will receive) the Speech Commands archive.
DATA_ROOT = "./data"
os.makedirs(DATA_ROOT, exist_ok=True)

# The first positional argument is the storage root; the split is chosen with `subset`.
train_set = SPEECHCOMMANDS(DATA_ROOT, download=True, subset="training")
test_set = SPEECHCOMMANDS(DATA_ROOT, download=True, subset="testing")
val_set = SPEECHCOMMANDS(DATA_ROOT, download=True, subset="validation")

# Each item is a tuple: (waveform, sample_rate, label, speaker_id, utterance_number).
waveform, sample_rate, label, speaker_id, utterance_number = train_set[0]

## Apply transformations on the data.

PyTorch transforms


In [None]:
# Feature pipeline applied to each training waveform: a 64-band mel
# spectrogram followed by frequency masking and time masking.
# `sample_rate` is the value unpacked from train_set[0] in the cell above.
train_audio_transforms = nn.Sequential(
    torchaudio.transforms.MelSpectrogram(sample_rate=sample_rate, n_fft=1024, hop_length=512, n_mels=64),
    torchaudio.transforms.FrequencyMasking(freq_mask_param=30),
    torchaudio.transforms.TimeMasking(time_mask_param=100)
)
# The TextTransform class lives in the project's notebook.
# NOTE(review): TextTransform is not defined or imported in this notebook; it
# must be in scope before this cell runs — confirm where it comes from.
text_transform = TextTransform()

def transform_data(data):
    '''Collate dataset items into padded mel-spectrogram and label batches.

    Args:
        data: Iterable of (waveform, sample_rate, label, speaker_id,
            utterance_number) tuples, as produced by the dataset.

    Returns:
        (melspectrograms, labels): melspectrograms is a 4-D tensor
        (batch, 1, n_mels, time) zero-padded along time; labels is a 2-D
        tensor (batch, max_label_length) of the integer-encoded label text.
    '''
    melspectrograms = []
    labels = []

    for (waveform, sample_rate, label, speaker_id, utterance_number) in data:
        # Standardise the waveform; clamp the std so a silent clip does not
        # divide by zero and fill the batch with NaNs.
        waveform = (waveform - waveform.mean()) / torch.clamp(waveform.std(), min=1e-8)
        # (1, n_mels, time) -> (time, n_mels) so pad_sequence pads along time.
        spec = train_audio_transforms(waveform).squeeze(0).transpose(0, 1)
        melspectrograms.append(spec)
        labels.append(torch.Tensor(text_transform.text_to_int(label.lower())))

    # Pad once, after the whole batch has been collected.
    melspectrograms = nn.utils.rnn.pad_sequence(melspectrograms, batch_first=True).unsqueeze(1).transpose(2, 3)
    labels = nn.utils.rnn.pad_sequence(labels, batch_first=True)

    return melspectrograms, labels

## 2. Define the Network.

In [None]:
from torch import nn
from torchsummary import summary


class CNNNetwork(nn.Module):
    """Four conv blocks -> flatten -> linear -> softmax classifier.

    The linear layer's input size (128 * 5 * 4) matches inputs shaped
    (batch, 1, 64, 44).

    Args:
        num_classes: Number of output classes (defaults to 10).
    """

    def __init__(self, num_classes: int = 10):
        super().__init__()
        # 4 conv blocks / flatten / linear / softmax
        self.conv1 = self._conv_block(1, 16)
        self.conv2 = self._conv_block(16, 32)
        self.conv3 = self._conv_block(32, 64)
        self.conv4 = self._conv_block(64, 128)
        self.flatten = nn.Flatten()
        self.linear = nn.Linear(128 * 5 * 4, num_classes)
        self.softmax = nn.Softmax(dim=1)

    @staticmethod
    def _conv_block(in_channels: int, out_channels: int) -> nn.Sequential:
        """Conv2d(3x3, stride 1, padding 2) -> ReLU -> MaxPool2d(2)."""
        return nn.Sequential(
            nn.Conv2d(
                in_channels=in_channels,
                out_channels=out_channels,
                kernel_size=3,
                stride=1,
                padding=2
            ),
            nn.ReLU(),
            nn.MaxPool2d(kernel_size=2)
        )

    def forward(self, input_data):
        """Return per-class probabilities of shape (batch, num_classes)."""
        x = self.conv1(input_data)
        x = self.conv2(x)
        x = self.conv3(x)
        x = self.conv4(x)
        x = self.flatten(x)
        logits = self.linear(x)
        predictions = self.softmax(logits)
        return predictions


if __name__ == "__main__":
    # torchsummary creates its dummy input on `device`, so the model has to
    # live on the same device; fall back to the CPU when no GPU is present.
    summary_device = "cuda" if torch.cuda.is_available() else "cpu"
    cnn = CNNNetwork().to(summary_device)
    summary(cnn, (1, 64, 44), device=summary_device)

In [None]:
from torchsummary import summary
import torch.nn.functional as F
import torch.nn as nn


class CNNNetwork(nn.Module):
    """Four conv/ReLU/max-pool stages followed by a linear log-softmax head.

    The linear layer's input size (128 * 5 * 4) matches inputs shaped
    (batch, 1, 64, 44).

    Args:
        num_classes: Number of output classes (defaults to 10).
    """

    def __init__(self, num_classes: int = 10):
        super().__init__()
        # conv1 must emit 16 channels because conv2 consumes 16.
        self.conv1 = nn.Conv2d(1, 16, kernel_size=3, padding=1)
        self.maxpool = nn.MaxPool2d(kernel_size=2)
        self.conv2 = nn.Conv2d(in_channels=16, out_channels=32, kernel_size=3, stride=1, padding=2)
        self.maxpool2 = nn.MaxPool2d(kernel_size=2)
        self.conv3 = nn.Conv2d(in_channels=32, out_channels=64, kernel_size=3, stride=1, padding=2)
        self.maxpool3 = nn.MaxPool2d(kernel_size=2)
        self.conv4 = nn.Conv2d(in_channels=64, out_channels=128, kernel_size=3, stride=1, padding=2)
        self.maxpool4 = nn.MaxPool2d(kernel_size=2)

        self.flatten = nn.Flatten()
        self.linear = nn.Linear(128 * 5 * 4, num_classes)

    def forward(self, input_data):
        """Return log-probabilities of shape (batch, num_classes)."""
        x = self.maxpool(F.relu(self.conv1(input_data)))
        x = self.maxpool2(F.relu(self.conv2(x)))
        x = self.maxpool3(F.relu(self.conv3(x)))
        x = self.maxpool4(F.relu(self.conv4(x)))
        x = self.flatten(x)
        x = self.linear(x)
        # Normalise over the class dimension explicitly; the implicit-dim
        # form of log_softmax is deprecated.
        predictions = F.log_softmax(x, dim=1)
        return predictions


if __name__ == "__main__":
    cnn = CNNNetwork()
    # torchsummary defaults to device="cuda"; say "cpu" explicitly so its
    # dummy input is created on the same device as the CPU model.
    summary(cnn.cpu(), (1, 64, 44), device="cpu")


## 3. Define a Loss function and optimizer.

In [None]:
# Both names are needed here, before the training cell defines them.
device = "cuda" if torch.cuda.is_available() else "cpu"
LEARNING_RATE = 0.001

# NOTE(review): nn.CTCLoss is called as
# loss(log_probs, targets, input_lengths, target_lengths) on sequence-shaped
# log-probabilities; confirm it is the intended criterion for this CNN
# classifier. blank=28 presumably matches TextTransform's alphabet — verify.
loss_fn = nn.CTCLoss(blank=28).to(device)
optimiser = torch.optim.Adam(cnn.parameters(), lr=LEARNING_RATE)

## 4. Train the network.

In [None]:
from torch.utils.data import DataLoader

# Training hyperparameters (candidates for a Ray Tune search).
BATCH_SIZE = 128
EPOCHS = 10
LEARNING_RATE = 0.001

# Batches are assembled by transform_data, which turns raw dataset items
# into padded (melspectrograms, labels) tensors. `train_set` is the training
# split created in section 1.
train_dataloader = DataLoader(
    dataset=train_set,
    batch_size=BATCH_SIZE,
    collate_fn=transform_data,
)

def train_single_epoch(model, train_dataloader, loss_fn, optimiser, device):
    """Run one optimisation pass over ``train_dataloader``.

    Args:
        model: Network to train; it is switched to training mode.
        train_dataloader: Iterable yielding ``(inputs, targets)`` batches.
        loss_fn: Callable invoked as ``loss_fn(prediction, targets)``.
        optimiser: Optimiser bound to ``model``'s parameters.
        device: Device the batches are moved to before the forward pass.

    Returns:
        The mean batch loss as a float, or ``None`` if no batch was produced.
    """
    model.train()
    total_loss = 0.0
    num_batches = 0

    # Batches arrive already collated, so they are unpacked directly.
    for inputs, targets in train_dataloader:
        inputs, targets = inputs.to(device), targets.to(device)

        prediction = model(inputs)
        loss = loss_fn(prediction, targets)

        optimiser.zero_grad()
        loss.backward()
        optimiser.step()

        total_loss += loss.item()
        num_batches += 1

    if num_batches == 0:
        return None
    return total_loss / num_batches


def train(model, train_dataloader, loss_fn, optimiser, device, epochs, batch_size=None):
    """Train ``model`` for ``epochs`` passes over ``train_dataloader``.

    Args:
        model: Network to train.
        train_dataloader: Iterable of training batches.
        loss_fn: Loss criterion handed to ``train_single_epoch``.
        optimiser: Optimiser handed to ``train_single_epoch``.
        device: Device the batches are moved to.
        epochs: Number of passes over the data.
        batch_size: Unused (batching is done by the dataloader); optional so
            that calls with or without it both work.
    """
    for i in range(epochs):
        print(f"Epoch {i+1}")
        epoch_loss = train_single_epoch(model, train_dataloader, loss_fn, optimiser, device)
        # Report the epoch loss when train_single_epoch provides one.
        if epoch_loss is not None:
            print(f"loss: {epoch_loss}")
        print("---------------------------")
    print("Finished training")


if __name__ == "__main__":
    # Use the GPU when one is available, otherwise the CPU.
    device = "cuda" if torch.cuda.is_available() else "cpu"
    print(f"Using {device}")

    # Reuse the splits built in section 1 instead of constructing new ones.
    train_data, test_data, val_data = train_set, test_set, val_set

    train_dataloader = torch.utils.data.DataLoader(dataset=train_data,
                                                   batch_size=BATCH_SIZE,
                                                   collate_fn=transform_data)

    # construct model and assign it to device
    cnn = CNNNetwork().to(device)
    print(cnn)

    # initialise loss function + optimiser
    # NOTE(review): nn.CTCLoss takes (log_probs, targets, input_lengths,
    # target_lengths), whereas train_single_epoch calls loss_fn with two
    # arguments — confirm the intended criterion.
    loss_fn = nn.CTCLoss(blank=28).to(device)
    optimiser = torch.optim.Adam(cnn.parameters(),
                                 lr=LEARNING_RATE)

    # train model
    train(cnn, train_dataloader, loss_fn, optimiser, device, EPOCHS, BATCH_SIZE)

    # save model under the name the prediction cell loads
    torch.save(cnn.state_dict(), "cnnnet.pth")
    print("Trained CNN saved at cnnnet.pth")

## 5. Make predictions.

In [None]:
# Word labels in alphabetical order; a class index is the position in this list.
class_mapping = [
    'backward', 'bed', 'bird', 'cat', 'dog',
    'down', 'eight', 'five', 'follow', 'forward',
    'four', 'go', 'happy', 'house', 'learn',
    'left', 'marvin', 'nine', 'no', 'off',
    'on', 'one', 'right', 'seven', 'sheila',
    'six', 'stop', 'three', 'tree', 'two',
    'up', 'visual', 'wow', 'yes', 'zero',
]


def predict(model, input, target, class_mapping):
    """Classify one batch and return the predicted and expected class names.

    Args:
        model: Network to run; it is switched to evaluation mode.
        input: Batch tensor fed to ``model``; only the first row of the
            output is used.
        target: Expected class, either as an index into ``class_mapping``
            (int or 0-dim tensor) or directly as the class name.
        class_mapping: Sequence mapping class indices to class names.

    Returns:
        ``(predicted, expected)`` class names.
    """
    model.eval()
    # Inference only: no_grad() disables gradient tracking, which reduces
    # memory use when backward() will never be called.
    with torch.no_grad():
        predictions = model(input)
        predicted_index = int(predictions[0].argmax(0))

    predicted = class_mapping[predicted_index]
    # Dataset items carry the label as text, so accept a name as well as an index.
    expected = target if isinstance(target, str) else class_mapping[int(target)]
    return predicted, expected


if __name__ == "__main__":
    # load back the model
    cnn = CNNNetwork()
    state_dict = torch.load("cnnnet.pth", map_location="cpu")
    cnn.load_state_dict(state_dict)

    # get a sample from the SpeechCommands Dataset
    # items are (waveform, sample_rate, label, speaker_id, utterance_number)
    sample_waveform, sample_sr, sample_label = test_set[0][:3]

    # Build the same features used for training batches (normalised mel
    # spectrogram, no masking) and add the batch dimension without mutating
    # the dataset tensor in place.
    to_mel = torchaudio.transforms.MelSpectrogram(
        sample_rate=sample_sr, n_fft=1024, hop_length=512, n_mels=64
    )
    sample_waveform = (sample_waveform - sample_waveform.mean()) / torch.clamp(sample_waveform.std(), min=1e-8)
    input = to_mel(sample_waveform).unsqueeze(0)
    # The dataset yields the label as text; predict() expects a class index.
    target = class_mapping.index(sample_label)

    # NOTE(review): CNNNetwork's linear layer is sized for (1, 64, 44) inputs
    # and 10 outputs, while class_mapping lists 35 words — confirm the clip
    # length and class count before relying on this cell.

    # make an inference
    predicted, expected = predict(cnn, input, target,
                                  class_mapping)
    print(f"Predicted: '{predicted}', expected: '{expected}'")
 

Plus features

In [None]:
# view() returns a tensor that shares storage with the original and needs a
# compatible memory layout; reshape() returns a view when it can and
# otherwise copies the data.
example = torch.arange(6)
viewed = example.view(2, 3)
reshaped = example.reshape(3, 2)
viewed, reshaped