# PyTorch SpecAugment
This notebook demonstrates the data augmentation techniques for automatic speech recognition described in Google's SpecAugment paper, which is summarized in this Google AI blog post:
https://ai.googleblog.com/2019/04/specaugment-new-data-augmentation.html

In [1]:
import numpy as np 
import pandas as pd
import os 
import math
import torch 
import torchaudio 
from torch.utils.data import Dataset, dataloader
import matplotlib.pyplot as plt 
#%matplotlib_inline
import seaborn as sns
import librosa 
import librosa.display

import IPython.display as ipd

In [4]:
# Folder holding the audio clips (absolute path on the author's machine).
audio_dir = '/Users/stephen/Desktop/Speech_Commands/Data/ASVP-ESD_UPDATE/Audio'

# Annotation table for those clips, read from the CSV stored next to the audio folder.
metadata = pd.read_csv(
    '/Users/stephen/Desktop/Speech_Commands/Data/ASVP-ESD_UPDATE/metadata.csv'
)

In [5]:
# Preview the first five rows to check the column layout of the annotation table.
metadata.head(n=5)

Unnamed: 0,Emotions,Path,Folder
0,neutral,/03-01-02-01-02-16-03-03-01.wav,actor_16
1,angry,/03-01-05-02-07-16-03-03-01.wav,actor_16
2,neutral,/03-01-02-01-13-16-03-03-01.wav,actor_16
3,neutral,/03-01-02-01-05-16-03-03-01.wav,actor_16
4,angry,/03-01-05-02-06-16-03-03-01.wav,actor_16


In [None]:
# Pytorch Dataset class 

class EmotionalSpeechDataset(Dataset):
    """Dataset that yields fixed-length, single-channel audio tensors with labels.

    For every item the audio file is loaded with ``torchaudio.load``, moved to
    ``device``, resampled when its sample rate differs from
    ``target_sample_rate``, averaged over dimension 0 when that dimension is
    larger than one, and finally truncated or zero-padded on the right so that
    dimension 1 has exactly ``num_samples`` entries.  An optional
    ``transformation`` is applied to the result.

    Args:
        annotations_file: Despite the name, this is the annotation *table*
            (an object supporting ``len()`` and ``.iloc``, e.g. a pandas
            DataFrame), not a path.  Columns named ``"Emotions"``, ``"Path"``
            and ``"Folder"`` are used when present; otherwise the label, file
            name and folder are read from positions 0, 1 and 2.
        audio_dir: Root directory that contains the per-folder audio files.
        target_sample_rate: Sample rate every signal is converted to.
        num_samples: Length (dimension 1) of every returned signal.
        device: Device the signal is moved to.
        transformation: Optional callable applied to the prepared signal.
    """

    # (column name, positional fallback) for each field read from the table.
    # Looking columns up by name keeps the dataset correct when the table
    # carries extra leading columns such as the "Unnamed: 0" produced by
    # reading a CSV that was saved together with its index.
    _LABEL_COLUMN = ("Emotions", 0)
    _PATH_COLUMN = ("Path", 1)
    _FOLDER_COLUMN = ("Folder", 2)

    # constructor
    def __init__(self, annotations_file, audio_dir, target_sample_rate, num_samples, device, transformation=None):
        self.annotations_file = annotations_file
        self.audio_dir = audio_dir
        self.target_sample_rate = target_sample_rate
        self.num_samples = num_samples
        self.device = device
        self.transformation = transformation

    def __len__(self):
        """Return the number of rows in the annotation table."""
        return len(self.annotations_file)

    def __getitem__(self, index):
        """Load, normalise and return ``(signal, label)`` for row ``index``."""
        audio_sample_path = self._get_audio_sample_path(index)
        label = self._get_audio_sample_label(index)
        signal, sr = torchaudio.load(audio_sample_path)
        signal = signal.to(self.device)
        signal = self._resample_if_necessary(signal, sr)
        signal = self._mix_down_if_necessary(signal)
        signal = self._cut_if_necessary(signal)
        signal = self._right_pad_if_necessary(signal)

        # if a transformation is stated, apply to audio tensor
        if self.transformation:
            signal = self.transformation(signal).to(self.device)
        return signal, label

    def _cut_if_necessary(self, signal):
        """Truncate ``signal`` along dimension 1 to at most ``num_samples``."""
        if signal.shape[1] > self.num_samples:
            signal = signal[:, :self.num_samples]
        return signal

    def _right_pad_if_necessary(self, signal):
        """Zero-pad the end of the last dimension up to ``num_samples``."""
        length_signal = signal.shape[1]
        if length_signal < self.num_samples:    # compare each file with desired length
            num_missing_samples = self.num_samples - length_signal
            # (left, right) padding of the last dimension: nothing in front,
            # the missing samples appended at the end.
            last_dim_padding = (0, num_missing_samples)
            signal = torch.nn.functional.pad(signal, last_dim_padding)
        return signal

    def _resample_if_necessary(self, signal, sr):
        """Resample ``signal`` from ``sr`` to ``target_sample_rate`` if they differ."""
        if sr != self.target_sample_rate:
            resampler = torchaudio.transforms.Resample(sr, self.target_sample_rate)
            # The signal has already been moved to ``self.device``; the
            # transform must live on the same device or the call fails on
            # anything other than the CPU.
            resampler = resampler.to(signal.device)
            signal = resampler(signal)
        return signal

    def _mix_down_if_necessary(self, signal):  # if file is Stereo, convert to mono by using the mean
        """Average over dimension 0 (kept as size 1) when it holds more than one entry."""
        if signal.shape[0] > 1:
            signal = torch.mean(signal, dim=0, keepdim=True)
        return signal

    def _get_annotation(self, index, column):
        """Return the value of one field for row ``index``.

        ``column`` is a ``(name, position)`` pair: the named column is used
        when the table has it, otherwise the positional fallback is read.
        """
        name, position = column
        table = self.annotations_file
        columns = getattr(table, "columns", None)
        if columns is not None and name in columns:
            return table[name].iloc[index]
        return table.iloc[index, position]

    def _get_audio_sample_path(self, index):
        """Build ``audio_dir/<folder>/<file name>`` for row ``index``."""
        fold = str(self._get_annotation(index, self._FOLDER_COLUMN))
        file_name = str(self._get_annotation(index, self._PATH_COLUMN))
        # Leading separators must be removed: os.path.join discards every
        # earlier component as soon as it meets an absolute one, and the
        # stored file names start with "/".
        separators = "/" + os.sep
        return os.path.join(
            self.audio_dir,
            fold.lstrip(separators),
            file_name.lstrip(separators),
        )

    def _get_audio_sample_label(self, index):
        """Return the label stored for row ``index``."""
        return self._get_annotation(index, self._LABEL_COLUMN)  # grab from "label" column

