In [None]:
!pip install torch torchaudio librosa

### **Audio augmentation by adding noise and pitch shifting**


In [2]:
import torch
import torchaudio
import torchaudio.functional as F
import librosa
import matplotlib.pyplot as plt
from IPython.display import Audio
import torchaudio.transforms as T
from torchaudio.utils import download_asset

In [3]:
def plot_melspectrogram(specgram, title=None, ylabel=None):
    """Draw a mel / log-mel spectrogram as a dB-scaled image with a colorbar.

    :param specgram: spectrogram values, converted with ``librosa.power_to_db``
        before being displayed
    :param title: figure title; "Mel Spectrogram" is used when falsy
    :param ylabel: label for the y axis
    """
    figure, axis = plt.subplots(1, 1)

    # Axis decorations.
    axis.set_title(title if title else "Mel Spectrogram")
    axis.set_ylabel(ylabel=ylabel)
    axis.set_xlabel("frames")

    # Convert to decibels, then draw with the first row at the bottom.
    specgram_db = librosa.power_to_db(specgram)
    image = axis.imshow(specgram_db, origin="lower", aspect="auto")
    figure.colorbar(image, ax=axis)

    plt.show(block=False)

### **Get the noise files from Google Drive** (`MyDrive/nguyenanh-projects/streaming-asr/noises`)

In [4]:
# Mount Google Drive at /content/drive; the noise directory used below lives
# under this mount point, so this cell must run before any noise file is read.
from google.colab import drive
drive.mount('/content/drive')

Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).


In [5]:
import os

In [6]:
# Directory on the mounted Drive that is scanned for .wav noise recordings.
noises_path = '/content/drive/MyDrive/nguyenanh-projects/streaming-asr/noises'

In [7]:
def get_noise_files(path: str) -> list:
  """Return the paths of the ``.wav`` files located directly inside ``path``.

  :param path: directory to scan (not recursive)
  :return: list of ``os.path.join(path, filename)`` strings, sorted
    alphabetically; empty when the directory holds no ``.wav`` file
  """
  # os.listdir() yields entries in arbitrary order. Sort so that indexing the
  # result (e.g. ``get_noise_files(p)[0]``) picks the same file on every run.
  return sorted(
      os.path.join(path, filename)
      for filename in os.listdir(path)
      if filename.endswith(".wav")
  )

# Show the .wav paths found in the noise directory.
get_noise_files(noises_path)

['/content/drive/MyDrive/nguyenanh-projects/streaming-asr/noises/re_radio.wav',
 '/content/drive/MyDrive/nguyenanh-projects/streaming-asr/noises/re_tam.wav',
 '/content/drive/MyDrive/nguyenanh-projects/streaming-asr/noises/re_tam_10.wav']

In [8]:
# Listen to the first noise file returned by get_noise_files().
Audio(get_noise_files(noises_path)[0])

### **Add noise function**

In [9]:
def add_noise(sample_array: torch.Tensor, noise_array: torch.Tensor, snr_dbs=None):
  """Mix a noise clip into an audio sample at one or more signal-to-noise ratios.

  SNR explained: https://www.linkedin.com/pulse/signal-to-noise-ratio-snr-explained-leonid-ayzenshtat/

  :param sample_array: torch.Tensor, the clean audio; its size along dim 1 is
    the target length
  :param noise_array: torch.Tensor, 2-D noise; trimmed along dim 1 to the
    sample length, and repeated first when it is shorter than the sample
  :param snr_dbs: target SNR values in dB, as a 1-D sequence or tensor;
    defaults to [20, 10, 3]
  :return: augmented audio with noise, as returned by ``F.add_noise``
  :raises ValueError: if the noise is empty while the sample is not
  """
  num_samples = sample_array.size(1)
  noise_len = noise_array.size(1)

  if noise_len < num_samples:
    if noise_len == 0:
      raise ValueError("noise_array must contain at least one sample")
    # A noise clip shorter than the sample cannot be mixed as-is (the lengths
    # must match), so loop it until it covers the whole sample.
    repeats = -(-num_samples // noise_len)  # ceiling division
    noise_array = noise_array.repeat(1, repeats)
  trimmed_noise = noise_array[:, :num_samples]

  if snr_dbs is None:
    snr_dbs = torch.tensor([20, 10, 3])
  elif not isinstance(snr_dbs, torch.Tensor):
    snr_dbs = torch.tensor(snr_dbs)

  return F.add_noise(sample_array, trimmed_noise, snr_dbs)

In [10]:
# Load the speech sample and the first noise file as tensors.
# NOTE(review): the second value returned by each load (presumably the sample
# rate) is discarded into `_`, so nothing checks that the two files share a
# sample rate before they are mixed — confirm both match the playback rate.
audio_array, _ = torchaudio.load("test2.wav")
noise_array, _ = torchaudio.load(get_noise_files(noises_path)[0])

In [11]:
# Inspect the shape of the loaded speech sample.
audio_array.shape

torch.Size([1, 63840])

In [12]:
# Keep row 0 of the noise (presumably its first channel) and trim it to the
# length of the speech sample.
# NOTE(review): not idempotent — `noise_array` is rebound to a 1-D tensor, so
# running this cell twice in a row fails on the two-index lookup.
noise_array = noise_array[0, :audio_array.size(1)]
noise_array.shape

torch.Size([63840])

In [13]:
# Add a leading dimension so the noise is 2-D again; add_noise() slices it
# with two indices.
# NOTE(review): not idempotent — every re-run adds another leading dimension.
noise_array = noise_array.unsqueeze(0)
noise_array.shape

torch.Size([1, 63840])

### **Noise audio processed**

In [14]:
# Mix the trimmed noise into the speech sample.
aug = add_noise(sample_array=audio_array, noise_array=noise_array)

torch.Size([1, 63840])
torch.Size([1, 63840])


In [15]:
# Inspect the result: expected to hold one row per SNR level used by add_noise().
aug.shape

torch.Size([3, 63840])

In [16]:
# `aug` holds one noisy waveform per SNR level (one per row). Passing the whole
# 2-D tensor to Audio would encode the rows as channels of a single clip, so
# play each SNR variant as its own mono clip instead.
from IPython.display import display

for noisy_variant in aug:
    display(Audio(data=noisy_variant, rate=16000))