In [None]:
#https://medium.com/mlearning-ai/parsing-the-esc50-audio-dataset-with-tensorflow-2ad4ae96f6b0

In [17]:
import math
from utils import *
import tensorflow as tf
from sklearn.preprocessing import StandardScaler

In [22]:
import librosa
import scipy

In [1]:
import pandas as pd
import numpy as np
import os

In [31]:
def getCleanSpeech(mozilla_basepath, val_dataset_size, dataframe_name, sep):
    """Return shuffled clean-speech clip paths listed in a Common Voice metadata file.

    Args:
        mozilla_basepath: Dataset root containing the metadata file and a
            ``clips`` directory.
        val_dataset_size: Number of clips to hold out for validation. Ignored
            when ``dataframe_name`` is ``'test.tsv'``.
        dataframe_name: Metadata file name inside ``mozilla_basepath``; it must
            have a ``path`` column.
        sep: Column separator of the metadata file.

    Returns:
        A single list of paths when ``dataframe_name == 'test.tsv'``; otherwise
        a ``(train_files, val_files)`` tuple of disjoint lists.
    """
    mozilla_metadata = pd.read_csv(os.path.join(mozilla_basepath, dataframe_name), sep=sep)

    # Shuffle a private copy: the array backing the DataFrame column may be
    # read-only (pandas Copy-on-Write) and should not be reordered in place.
    clip_names = mozilla_metadata['path'].to_numpy(copy=True)
    np.random.shuffle(clip_names)
    clean_files = [os.path.join(mozilla_basepath, 'clips', name) for name in clip_names]

    if dataframe_name == 'test.tsv':
        return clean_files

    # Split at an explicit index. The previous code cut the validation slice
    # out of the already-truncated training list, so every validation clip was
    # also a training clip; negative slicing also broke for a size of 0.
    split_at = max(len(clean_files) - val_dataset_size, 0)
    return clean_files[:split_at], clean_files[split_at:]

In [13]:
def get_filenames_by_class_id(basepath, metadata):
    """Build audio file paths for every metadata row, grouped by class id.

    Classes are visited in the sorted order produced by ``np.unique``; rows of
    the same class keep their order from ``metadata``. Each path has the form
    ``<basepath>/audio/fold<fold>/<slice_file_name>``.

    Args:
        basepath: Dataset root directory.
        metadata: DataFrame with ``classID``, ``slice_file_name`` and ``fold``
            columns.

    Returns:
        A flat list of file paths.
    """
    audio_root = os.path.join(basepath, 'audio')
    collected = []
    for class_id in np.unique(metadata['classID'].values):
        in_class = metadata['classID'] == class_id
        rows = metadata.loc[in_class, ['slice_file_name', 'fold']].values
        for slice_name, fold in rows:
            collected.append(os.path.join(audio_root, 'fold' + str(fold), slice_name))
    return collected

In [35]:
def getNoisySpeech(urbansound_basepath, val_dataset_size, test=False):
    """Return shuffled UrbanSound8K noise file paths for one dataset split.

    Fold 10 is held out as the test split; all other folds are used for
    training/validation.

    Args:
        urbansound_basepath: Dataset root containing
            ``metadata/UrbanSound8K.csv``.
        val_dataset_size: Number of files to hold out for validation. Ignored
            when ``test`` is true.
        test: When true, return only the fold-10 files.

    Returns:
        A single list of paths when ``test`` is true; otherwise a
        ``(train_files, val_files)`` tuple of disjoint lists.
    """
    urbansound_metadata = pd.read_csv(
        os.path.join(urbansound_basepath, 'metadata', 'UrbanSound8K.csv'))

    # The old `urbansound_metadata.reindex(permutation)` call discarded its
    # result, so it never shuffled anything. Randomisation is done once, on the
    # final file list below.
    if test:
        selected = urbansound_metadata[urbansound_metadata.fold == 10]
    else:
        selected = urbansound_metadata[urbansound_metadata.fold != 10]

    filenames = get_filenames_by_class_id(urbansound_basepath, selected)
    np.random.shuffle(filenames)

    if test:
        return filenames

    # Explicit split index: `[:-0]` / `[-0:]` would give an empty training
    # list and put everything in validation when val_dataset_size is 0.
    split_at = max(len(filenames) - val_dataset_size, 0)
    return filenames[:split_at], filenames[split_at:]

In [36]:
class _AttrDict(dict):
    """A dict whose keys can also be read as attributes (``cfg.fs == cfg['fs']``)."""

    def __getattr__(self, name):
        # Only reached when normal attribute lookup fails; translate a missing
        # key into AttributeError so hasattr()/copy keep working as expected.
        try:
            return self[name]
        except KeyError:
            raise AttributeError(name) from None


# STFT / audio settings shared by the processing functions.
windowLength = 256
config = _AttrDict({
    'windowLength': windowLength,
    # snake_case alias so both spellings resolve to the same window length
    'window_length': windowLength,
    # used as the STFT hop length (a quarter of the window)
    'overlap': round(0.25 * windowLength),
    'fs': 16000,
    'audio_max_duration': 0.8,
})

In [37]:
# Dataset roots. Set the MOZILLA_BASEPATH / URBANSOUND_BASEPATH environment
# variables to point at your own copies; the fallbacks are the locations this
# notebook was originally run with.
mozilla_basepath = os.environ.get(
    'MOZILLA_BASEPATH',
    'C:/Users/B989/Downloads/Compressed/cv-corpus-10.0-2022-07-04-bn/bn')
urbansound_basepath = os.environ.get(
    'URBANSOUND_BASEPATH',
    'C:/Users/B989/OneDrive - Brain Station 23 Ltd/Documents/UrbanSound8K/UrbanSound8K')

# Train / validation file lists (clean speech and noise).
clean_train_filenames, clean_val_filenames = getCleanSpeech(
    mozilla_basepath, val_dataset_size=1000, dataframe_name='train.tsv', sep='\t')
noise_train_filenames, noise_val_filenames = getNoisySpeech(
    urbansound_basepath, val_dataset_size=200)

# Test file lists. Both helpers return a single list here and do not use
# val_dataset_size on this path.
clean_test_filenames = getCleanSpeech(
    mozilla_basepath, val_dataset_size=1000, dataframe_name='test.tsv', sep='\t')
noise_test_filenames = getNoisySpeech(
    urbansound_basepath, val_dataset_size=200, test=True)




In [30]:
def read_audio(filepath, sample_rate, normalize=True):
    """Load an audio file with librosa and optionally peak-normalise it.

    Args:
        filepath: Path of the audio file.
        sample_rate: Target sampling rate passed to ``librosa.load``.
        normalize: When truthy, scale the signal so its absolute peak is 1/3.

    Returns:
        ``(audio, sr)`` as returned by ``librosa.load``, with ``audio`` scaled
        when normalisation applies. Empty or all-zero audio is returned
        unscaled.
    """
    audio, sr = librosa.load(filepath, sr=sample_rate)
    if normalize:
        # Guard the peak: an empty clip makes np.max raise, and a silent clip
        # would divide by zero and fill the signal with NaN/inf.
        peak = np.max(np.abs(audio)) if audio.size else 0.0
        if peak > 0:
            audio = audio * (1 / peak / 3.0)
    return audio, sr

In [19]:
def removeSilentFrames(audio):
    """Drop silent stretches from a signal and join what remains.

    Non-silent intervals are found with ``librosa.effects.split`` using a
    20 dB threshold and ``config['overlap']`` as the hop length.

    Args:
        audio: Audio samples (array-like, indexed along the first axis).

    Returns:
        A NumPy array with the non-silent intervals concatenated in order;
        an empty array when no interval is reported.
    """
    # `config` is a dict, so the hop length must be read by key.
    intervals = librosa.effects.split(audio, hop_length=config['overlap'], top_db=20)
    if len(intervals) == 0:
        return np.asarray(audio)[:0]
    # Concatenate whole slices rather than copying sample by sample through a
    # Python list.
    return np.concatenate([audio[start:end] for start, end in intervals])

In [20]:
def cropAudio(audio):
    """Randomly crop a signal to at most ``config['audio_max_duration']`` seconds.

    Lengths are handled as sample counts at ``config['fs']`` samples per
    second. Assumes ``audio`` is a 1-D (mono) sample array.

    Args:
        audio: Audio samples.

    Returns:
        ``audio`` itself when it is not longer than the maximum duration;
        otherwise a contiguous slice of exactly the maximum length starting at
        a uniformly random offset.
    """
    # Work in samples: `config` is a dict (key access), and computing the
    # length directly avoids librosa.get_duration, whose arguments are
    # keyword-only in newer librosa releases.
    max_samples = math.floor(config['audio_max_duration'] * config['fs'])
    total_samples = len(audio)

    if total_samples <= max_samples:
        return audio

    # randint's upper bound is exclusive; +1 lets the final window be chosen.
    start = np.random.randint(0, total_samples - max_samples + 1)
    return audio[start: start + max_samples]

In [21]:
def mixCleanNoisyAudio(clean_audio, noise_signal):
    """Add a random noise segment to clean audio at equal power (0 dB SNR).

    The noise is repeated until it is at least as long as the clean signal, a
    random segment of matching length is cut from it, and that segment is
    scaled so its energy equals the clean signal's energy before being added.

    Args:
        clean_audio: 1-D NumPy array of clean samples.
        noise_signal: 1-D NumPy array of noise samples.

    Returns:
        Array of the same length as ``clean_audio`` holding the mixture. When
        the chosen noise segment has zero energy a copy of ``clean_audio`` is
        returned.

    Raises:
        ValueError: If ``noise_signal`` is empty.
    """
    if len(noise_signal) == 0:
        # Doubling an empty array never grows it; fail instead of looping forever.
        raise ValueError("noise_signal is empty; cannot mix noise into the clean audio")

    while len(noise_signal) < len(clean_audio):
        noise_signal = np.append(noise_signal, noise_signal)

    # randint's upper bound is exclusive; +1 makes every valid offset reachable.
    ind = np.random.randint(0, noise_signal.size - clean_audio.size + 1)
    noiseSegment = noise_signal[ind: ind + clean_audio.size]

    speech_power = np.sum(clean_audio ** 2)
    noise_power = np.sum(noiseSegment ** 2)
    if noise_power == 0:
        # A silent segment cannot be scaled to the speech power (division by
        # zero would yield NaN); adding it changes nothing.
        return np.array(clean_audio, copy=True)

    return clean_audio + np.sqrt(speech_power / noise_power) * noiseSegment

In [23]:
def getSTFTspectrogram(audio, ffT_length=None):
    """Compute the complex STFT of ``audio`` with a periodic Hamming window.

    Window length and hop length come from ``config['windowLength']`` and
    ``config['overlap']``.

    Args:
        audio: 1-D array of audio samples.
        ffT_length: FFT size. Defaults to the configured window length, so the
            function can be called with the signal alone.

    Returns:
        The complex spectrogram returned by ``librosa.stft``.
    """
    # `hamming` lives in scipy.signal.windows; the old scipy.signal.hamming
    # alias is gone from current SciPy. The local import also guarantees the
    # submodule is loaded (a bare `import scipy` does not on older versions).
    from scipy.signal.windows import hamming

    window_length = config['windowLength']
    if ffT_length is None:
        ffT_length = window_length

    window = hamming(window_length, sym=False)
    return librosa.stft(audio, n_fft=ffT_length, win_length=window_length,
                        hop_length=config['overlap'], window=window, center=True)

In [24]:
def getPhase(clean_spectral_magnitude, clean_phase, noise_phase):
    """Scale the clean magnitude by the cosine of the clean/noisy phase difference.

    Args:
        clean_spectral_magnitude: Magnitude array of the clean signal.
        clean_phase: Phase array of the clean signal.
        noise_phase: Phase array of the noisy signal; must have the same shape
            as ``clean_phase``.

    Returns:
        ``clean_spectral_magnitude * cos(clean_phase - noise_phase)``.

    Raises:
        AssertionError: If the two phase arrays differ in shape.
    """
    shapes_agree = clean_phase.shape == noise_phase.shape
    assert shapes_agree, "Shapes must match."

    phase_offset = clean_phase - noise_phase
    alignment = np.cos(phase_offset)
    return clean_spectral_magnitude * alignment

In [25]:
def audio_processing(clean_filename, noise_filenames):
    """Turn one clean clip into a (noisy magnitude, clean target, noisy phase) triple.

    Steps: load and trim the clean clip, pick a random noise file, load and
    trim it, crop the clean clip, mix the two, take the STFT of both the
    mixture and the clean signal, weight the clean magnitude by the phase
    difference, and standardise both magnitudes with statistics fitted on the
    noisy magnitude.

    Args:
        clean_filename: Path of the clean speech clip.
        noise_filenames: Sequence of noise file paths to sample one from.

    Returns:
        ``(noise_magnitude, clean_magnitude, noise_phase)`` as 2-D arrays.
    """
    # `config` is a dict, so settings are read by key; the FFT size is passed
    # explicitly because getSTFTspectrogram takes it as its second argument.
    fs = config['fs']
    fft_length = config['windowLength']

    clean_audio, _ = read_audio(clean_filename, fs)
    clean_audio = removeSilentFrames(clean_audio)

    noise_filename = np.random.choice(noise_filenames)
    noise_audio, _ = read_audio(noise_filename, fs)
    noise_audio = removeSilentFrames(noise_audio)

    clean_audio = cropAudio(clean_audio)
    noiseInput = mixCleanNoisyAudio(clean_audio, noise_audio)

    noise_spectrogram = getSTFTspectrogram(noiseInput, fft_length)
    noise_phase = np.angle(noise_spectrogram)
    noise_magnitude = np.abs(noise_spectrogram)

    clean_spectrogram = getSTFTspectrogram(clean_audio, fft_length)
    clean_phase = np.angle(clean_spectrogram)
    clean_magnitude = np.abs(clean_spectrogram)

    # Phase-weighted clean target (see getPhase).
    clean_magnitude = getPhase(clean_magnitude, clean_phase, noise_phase)

    # The scaler is fitted on the noisy magnitude only and then reused for the
    # clean target; copy=False asks it to scale the arrays in place.
    scaler = StandardScaler(copy=False, with_mean=True, with_std=True)
    noise_magnitude = scaler.fit_transform(noise_magnitude)
    clean_magnitude = scaler.transform(clean_magnitude)

    return noise_magnitude, clean_magnitude, noise_phase

In [26]:
def prepare_input_features(stft_features, numSegments, numFeatures):
    """Stack sliding windows of ``numSegments`` consecutive frames.

    The first ``numSegments - 1`` frames are prepended to the input so that
    one window is produced per input frame (when the input has at least
    ``numSegments - 1`` frames).

    Args:
        stft_features: 2-D array laid out as (features, frames).
        numSegments: Number of consecutive frames per window.
        numFeatures: Number of feature rows in ``stft_features``.

    Returns:
        Float array of shape ``(numFeatures, numSegments, num_windows)`` where
        ``out[:, k, i]`` is frame ``i + k`` of the padded input.
    """
    lead_in = stft_features[:, 0:numSegments - 1]
    padded = np.concatenate([lead_in, stft_features], axis=1)
    window_count = padded.shape[1] - numSegments + 1

    windows = np.zeros((numFeatures, numSegments, window_count))
    # Fill one window position at a time: position k of every window is the
    # padded signal shifted by k frames.
    for offset in range(numSegments):
        windows[:, offset, :] = padded[:, offset:offset + window_count]
    return windows


In [27]:
def create_tf_record(clean_filenames, *, prefix, subset_size, noise_filenames=None):
    """Write TFRecord shards of (noisy features, clean target, noisy phase) examples.

    ``clean_filenames`` is processed in chunks of ``subset_size``; chunk ``n``
    is written to ``<prefix>_<n>.tfrecords`` in the current directory. A shard
    whose file already exists is skipped.

    Args:
        clean_filenames: Sequence of clean speech file paths.
        prefix: File-name prefix of the shards.
        subset_size: Number of clean files per shard.
        noise_filenames: Noise file paths to sample from when mixing. When
            omitted, the notebook-level ``noise_train_filenames`` list is used,
            so pass the validation/test noise list explicitly for those splits.
    """
    if noise_filenames is None:
        # audio_processing needs a noise list; fall back to the notebook-level
        # training noise list so existing calls keep working.
        noise_filenames = noise_train_filenames

    for counter, i in enumerate(range(0, len(clean_filenames), subset_size)):
        tfrecord_filename = prefix + '_' + str(counter) + '.tfrecords'
        if os.path.isfile(tfrecord_filename):
            print(f"Skipping {tfrecord_filename}")
            continue

        clean_filenames_sublist = clean_filenames[i:i + subset_size]
        print(f"Processing files from: {i} to {i + subset_size}")

        # Process the audio before the shard file is created: a failure here
        # must not leave an empty shard behind that later runs would skip.
        out = [audio_processing(filename, noise_filenames)
               for filename in clean_filenames_sublist]

        # The context manager closes the writer even if writing fails.
        with tf.io.TFRecordWriter(tfrecord_filename) as writer:
            for noise_stft_magnitude, clean_stft_magnitude, noise_stft_phase in out:
                noise_stft_mag_features = prepare_input_features(
                    noise_stft_magnitude, numSegments=8, numFeatures=129)

                # Move the frame/window axis first so zip() below yields one
                # example per frame.
                noise_stft_mag_features = np.transpose(noise_stft_mag_features, (2, 0, 1))
                clean_stft_magnitude = np.transpose(clean_stft_magnitude, (1, 0))
                noise_stft_phase = np.transpose(noise_stft_phase, (1, 0))

                noise_stft_mag_features = np.expand_dims(noise_stft_mag_features, axis=3)
                clean_stft_magnitude = np.expand_dims(clean_stft_magnitude, axis=2)

                for x_, y_, p_ in zip(noise_stft_mag_features, clean_stft_magnitude, noise_stft_phase):
                    y_ = np.expand_dims(y_, 2)
                    example = get_tf_feature(x_, y_, p_)
                    writer.write(example.SerializeToString())

In [28]:
# Write the validation shards (val_<n>.tfrecords, up to 2000 clean clips each);
# shard files that already exist on disk are skipped.
create_tf_record(clean_val_filenames, prefix='val', subset_size=2000)

Skipping val_0.tfrecords


In [29]:
# Write the training shards (train_<n>.tfrecords, up to 4000 clean clips each);
# shard files that already exist on disk are skipped.
create_tf_record(clean_train_filenames ,prefix='train', subset_size=4000)

Skipping train_0.tfrecords
Skipping train_1.tfrecords
Skipping train_2.tfrecords
Skipping train_3.tfrecords


In [38]:
# Write the test shards (test_<n>.tfrecords, up to 1000 clean clips each);
# shard files that already exist on disk are skipped.
create_tf_record(clean_test_filenames, prefix='test', subset_size=1000)

Skipping test_0.tfrecords
Skipping test_1.tfrecords
Skipping test_2.tfrecords
Skipping test_3.tfrecords
Skipping test_4.tfrecords
Skipping test_5.tfrecords
Skipping test_6.tfrecords
Skipping test_7.tfrecords
Skipping test_8.tfrecords
