# Audio Recognition using CNN

## Google Colab, Drive Configuration & Imports

In [None]:
from google.colab import drive

import torch 
import torch.nn as nn
import torchvision
import torchvision.transforms as tf

import matplotlib.pyplot as plt
import matplotlib.image as img

from PIL import Image
import scipy.io as sio

import os
import numpy as np
import numpy

#for loading and visualizing audio files
import librosa
import librosa.display

#to play audio
import IPython.display as ipd
import skimage.io

# Mount Google Drive at /content/drive. force_remount=True asks Colab to mount
# again even if the drive is already mounted in this session.
drive.mount('/content/drive', force_remount=True)
# Project folders on the shared drive. Each path ends with '/' because the
# cells below build file names from them by plain string concatenation.
# Root of the input data (the audio clips are read from data_path + "exported_audios/").
data_path = '/content/drive/Shareddrives/DeepLearning/Projecte_Final/Data/'
# Destination of the generated spectrogram images.
# NOTE(review): 'Spectograms' presumably matches the folder name on Drive; do not
# correct the spelling here without renaming the folder too.
spec_path = '/content/drive/Shareddrives/DeepLearning/Projecte_Final/Spectograms/'
# NOTE(review): not referenced by the cells visible here; presumably used later for outputs.
results_path = '/content/drive/Shareddrives/DeepLearning/Projecte_Final/Results/'

Mounted at /content/drive


## Building the spectrograms

In [None]:
def scale_minmax(X, min=0.0, max=1.0):
    """Linearly rescale the values of ``X`` to the range ``[min, max]``.

    Parameters
    ----------
    X : array-like exposing ``.min()`` / ``.max()`` (e.g. ``numpy.ndarray``)
        Values to rescale. Must be non-empty.
    min : float, optional
        Value the smallest element of ``X`` is mapped to. (The name shadows
        the builtin inside this function; it is kept for backward
        compatibility with keyword callers.)
    max : float, optional
        Value the largest element of ``X`` is mapped to.

    Returns
    -------
    Array with the same shape as ``X``. If every element of ``X`` is equal
    (zero value range), every output element is ``min`` instead of the NaN
    that a 0/0 division would produce.
    """
    lowest = X.min()
    span = X.max() - lowest
    if span == 0:
        # Constant input (e.g. a silent clip): dividing by the zero range
        # would give NaN everywhere, so pin the normalised values to 0.
        X_std = (X - lowest) * 0.0
    else:
        X_std = (X - lowest) / span
    X_scaled = X_std * (max - min) + min
    return X_scaled
    
def spectrogram_image(y, sr, out, hop_length, n_mels):
    """Save the log-mel spectrogram of an audio signal as an 8-bit image.

    Parameters
    ----------
    y : audio samples, passed to ``librosa.feature.melspectrogram``.
    sr : sampling rate of ``y``.
    out : destination path handed to ``skimage.io.imsave``.
    hop_length : hop between analysis frames; the FFT size is twice this.
    n_mels : number of mel bands requested from librosa.

    The image is written to ``out``; nothing is returned.
    """
    # Mel spectrogram with an FFT window of two hops.
    mel_power = librosa.feature.melspectrogram(
        y=y,
        sr=sr,
        n_mels=n_mels,
        n_fft=hop_length * 2,
        hop_length=hop_length,
    )
    # Log scale; the tiny offset keeps log() away from zero.
    log_mel = numpy.log(mel_power + 1e-9)

    # Stretch to 0..255 and truncate to unsigned bytes.
    pixels = scale_minmax(log_mel, 0, 255).astype(numpy.uint8)
    # Flip along axis 0 so low frequencies end up at the bottom of the image,
    # then invert the intensities so that darker means more energy.
    pixels = 255 - numpy.flip(pixels, axis=0)

    # Write the image to disk.
    skimage.io.imsave(out, pixels)

In [None]:
words = ['catapulta', 'hola', 'iu', 'mar', 'taula', 'victor']

# Report, per word, how many audio clips are available and how many entries
# already exist in the matching spectrogram folder.
# The loop defines `word` itself: relying on a `word` variable left behind by
# another cell raises NameError on a fresh kernel (Restart & Run All).
for word in words:
    print(word)

    audio_fpath = data_path + "exported_audios/"+ word +"/"
    audio_clips = os.listdir(audio_fpath)
    print("Number of .wav files in audio folder = ",len(audio_clips))

    # Number of entries (files or sub-folders) already present in the
    # spectrogram folder. It was used in order not to convert all audios
    # every time more were added; an empty folder simply yields 0.
    last_number = len(os.listdir(spec_path + word +"/"))
    print(last_number)

Number of .wav files in audio folder =  1014
1014


In [None]:
# settings
hop_length = 512 # number of samples per time-step in spectrogram
n_mels = 128 # number of bins in spectrogram. Height of image
time_steps = 384 # number of time-steps. Width of image

words = ['catapulta', 'hola', 'iu', 'mar', 'taula', 'victor']

# Every clip is cropped to the same window, so compute it once.
# Slicing does not pad: a clip shorter than length_samples keeps its own
# (shorter) length. NOTE(review): such clips presumably produce narrower
# images - confirm downstream code resizes or pads them.
start_sample = 0 # starting at beginning
length_samples = time_steps*hop_length

for word in words:
    for dt in ['Train', 'Test']:

        audio_fpath = data_path + "exported_audios/"+ word + "/" + dt +"/"
        audio_clips = os.listdir(audio_fpath)

        # Make sure the destination folder exists; writing the image would
        # otherwise fail on a spectrogram tree that has not been created yet.
        out_dir = spec_path + word + "/" + dt
        os.makedirs(out_dir, exist_ok=True)

        # Clips are expected to be named <word>1.mp3 ... <word>N.mp3, where N
        # is the number of entries found in the folder.
        for i in range(1, len(audio_clips)+1):
            # load audio, resampled to 44.1 kHz
            y, sr = librosa.load(audio_fpath + word + '%d.mp3'%(i), sr=44100)
            out = out_dir + '/%d.png'%i

            # extract a fixed length window
            window = y[start_sample:start_sample+length_samples]

            # convert to PNG
            spectrogram_image(window, sr=sr, out=out, hop_length=hop_length, n_mels=n_mels)
            print('Wrote file...', out)

Output hidden; open in https://colab.research.google.com to view.