In [9]:
import pandas as pd
import numpy as np
import matplotlib.pylab as plt
import seaborn as sns
import os
import soundfile as sf
import random
import pickle

from glob import glob

import librosa
import librosa.display
import IPython.display as ipd

In [2]:
# Seed Python's `random` module so the random noise-file selection is reproducible.
# The seed is named so it can be found and changed in one place.
SEED = 1234
random.seed(SEED)

In [8]:
# Constants shared by the audio-loading and spectrogram helpers.
HOP_LENGTH = 256      # hop length (samples) used for silence trimming and the STFT
# `MONO` is forwarded to librosa.load(mono=...), which expects a boolean flag.
# It was previously 22050 (truthy, so it behaved like True); the explicit
# boolean states the intent.
MONO = True
SAMPLE_RATE = 16000   # target sampling rate passed to librosa.load(sr=...)
FRAME_SIZE = 512      # n_fft passed to librosa.stft
OVERLAP_RATIO = 0.3   # scale applied to the noise before it is added to the clean speech

In [13]:
# Input folders holding the clean speech and the noise recordings
clean_audio_dir = 'clean_audio'
noise_audio_dir = 'noise_audio'

# Output folders for the spectrograms of the noised and the clean speech
X_train_spec_dir = 'x_train_noised_speech'
Y_train_spec_dir = 'y_train_clean_audio'

# Output folder for the pickled min/max dictionary
min_max_value_dir = 'min_max_value_save'

# Ensure the output folders exist. `exist_ok=True` replaces the separate
# "check, then create" steps, so re-running the cell is safe and there is no
# window between the existence check and the creation.
for _output_dir in (X_train_spec_dir, Y_train_spec_dir, min_max_value_dir):
    os.makedirs(_output_dir, exist_ok=True)

In [14]:
"""
1. load clean and noise audio files

for each file in the clean audio folder, perform the following steps:
2. trim clean and noise audio files
3. mix clean audio with random noise
4. generate log spectrogram of noisy speech
5. min-max normalization of noisy speech
6. save normalized log spectrogram in the X_train_spec folder
7. store original min/max value of each log spectrogram as dictionary {'save_path': {'min': ..., 'max': ...}}

finally, save the min/max values as a pickle file
"""

"\n1. load clean and noise audio files\n\nfor each file in the clean audio folder, perform the following steps:\n2. trim clean and noise audio files\n3. mix clean audio with random noise\n4. generate log spectrogram of noisy speech\n5. min-max normalization of noisy speech\n6. save normalized log spectrogram in the X_train_spec folder\n7. store original min/max value of each log spectrogram as dictionary {'save_path': {'min': ..., 'max': ...}}\n\nfinally, save the min/max values as a pickle file\n"

In [21]:
# 1. Load clean and noise audio files
def load(file_path):
    """Load an audio file as a waveform resampled to ``SAMPLE_RATE``.

    Parameters
    ----------
    file_path : str or path-like
        Audio file to read with ``librosa.load``.

    Returns
    -------
    numpy.ndarray
        The decoded samples. The sampling rate that ``librosa.load`` also
        returns is discarded.
    """
    # librosa's `mono` argument is a boolean flag. Coerce the module constant
    # so that a truthy non-boolean value (e.g. an integer) is never forwarded
    # as-is.
    signal, _ = librosa.load(file_path, sr=SAMPLE_RATE, mono=bool(MONO))
    return signal

# 2. Trim clean and noise audio files
def trim_audio(audio, top_db=20):
    """Strip leading and trailing silence from ``audio``.

    Delegates to ``librosa.effects.trim`` with ``top_db`` as the silence
    threshold and ``HOP_LENGTH`` as the hop length. Only the trimmed signal is
    returned; the second element of librosa's result is dropped.
    """
    trim_result = librosa.effects.trim(
        audio,
        top_db=top_db,
        hop_length=HOP_LENGTH,
    )
    return trim_result[0]

# 3. Mix clean audio with random noise
def mix_audio(clean_audio, noise_audio, noise_ratio=None):
    """Add scaled noise to a clean signal, sample by sample.

    The noise is fitted to the length of ``clean_audio``: a longer noise clip
    is truncated, and a shorter one is repeated (looped) until it is long
    enough. Previously a noise clip shorter than the clean clip caused a
    broadcasting ``ValueError``.

    Parameters
    ----------
    clean_audio : array-like
        1-D clean signal; its length determines the output length.
    noise_audio : array-like
        1-D noise signal.
    noise_ratio : float, optional
        Gain applied to the noise before mixing. Defaults to the module-level
        ``OVERLAP_RATIO`` when omitted.

    Returns
    -------
    numpy.ndarray
        ``clean_audio + fitted_noise * noise_ratio`` with the same length as
        ``clean_audio``.

    Raises
    ------
    ValueError
        If ``noise_audio`` is empty while ``clean_audio`` is not.
    """
    if noise_ratio is None:
        noise_ratio = OVERLAP_RATIO

    clean_audio = np.asarray(clean_audio)
    noise_audio = np.asarray(noise_audio)
    target_len = len(clean_audio)

    if noise_audio.size == 0 and target_len > 0:
        raise ValueError("noise_audio is empty; cannot mix it into the clean audio")

    # np.resize truncates when the noise is longer than needed and fills with
    # repeated copies when it is shorter, so both cases give `target_len` samples.
    fitted_noise = np.resize(noise_audio, target_len)
    return clean_audio + fitted_noise * noise_ratio

# 4. Generate log spectrogram of noisy speech
def extractor(signal):
    """Return the log-amplitude (dB) magnitude spectrogram of ``signal``.

    The STFT is computed with ``FRAME_SIZE`` as ``n_fft`` and ``HOP_LENGTH`` as
    the hop length. Its magnitude is then converted to decibels with
    ``librosa.amplitude_to_db`` using that function's default settings.
    """
    magnitude = np.abs(
        librosa.stft(signal, n_fft=FRAME_SIZE, hop_length=HOP_LENGTH)
    )
    return librosa.amplitude_to_db(magnitude)

# 5. Min-max normalization of noisy speech
def normalize(spectrogram, min_max_dict):
    """Min-max scale ``spectrogram`` using the given minimum and maximum.

    Parameters
    ----------
    spectrogram : numpy.ndarray
        Values to rescale.
    min_max_dict : dict
        Mapping with the keys ``'min'`` and ``'max'``.

    Returns
    -------
    numpy.ndarray
        ``(spectrogram - min) / (max - min)``. When ``max == min`` (for example
        a constant spectrogram) an all-zero array of the same shape is returned
        instead of the NaN values a 0/0 division would produce.
    """
    min_value = min_max_dict['min']
    max_value = min_max_dict['max']
    shifted = spectrogram - min_value
    value_range = max_value - min_value

    if value_range == 0:
        # Degenerate range: avoid dividing by zero, and keep a floating dtype
        # consistent with what the division would have produced.
        return np.zeros_like(shifted, dtype=np.result_type(shifted, np.float32))

    return shifted / value_range

# 6. Save normalized log spectrogram in X_train_spec folder
def save_normalized_spectrogram(spectrogram, output_folder, filename):
    """Write ``spectrogram`` to ``<output_folder>/<filename>.npy``.

    Returns the path the array was saved to.
    """
    npy_name = "{}.npy".format(filename)
    target_path = os.path.join(output_folder, npy_name)
    np.save(target_path, spectrogram)
    return target_path

# 7. Save original min max value of each log spectrogram as a dictionary
def store_min_max_value(save_path, min_val, max_val, min_max_values):
    """Record a min/max pair in ``min_max_values`` under ``save_path``.

    The dictionary is modified in place; nothing is returned. An existing
    entry for the same ``save_path`` is overwritten.
    """
    entry = {'min': min_val, 'max': max_val}
    min_max_values[save_path] = entry

# 8. Save the min_max_values dictionary as a pickle file
def save_pickle(save_path, data):
    """Serialize ``data`` with pickle into the file at ``save_path``.

    The file is opened in binary write mode, so an existing file is replaced.
    """
    with open(save_path, 'wb') as handle:
        pickle.Pickler(handle).dump(data)


In [22]:
# Build one normalized noisy-speech spectrogram per clean .wav file and record
# the pre-normalization min/max of each one.
min_max_values = {}

# List the noise folder once instead of on every iteration. Both listings are
# sorted because os.listdir returns entries in arbitrary order; without
# sorting, the seeded random noise selection would depend on the filesystem.
noise_filenames = sorted(
    name for name in os.listdir(noise_audio_dir)
    if os.path.isfile(os.path.join(noise_audio_dir, name))
)
if not noise_filenames:
    raise FileNotFoundError(f"No noise files found in '{noise_audio_dir}'")

for filename in sorted(os.listdir(clean_audio_dir)):
    if not filename.endswith(".wav"):
        continue

    # Load and trim the clean audio
    clean_audio_path = os.path.join(clean_audio_dir, filename)
    clean_audio = load(clean_audio_path)
    trimmed_clean_audio = trim_audio(clean_audio)

    # Mix the clean audio with a randomly chosen, trimmed noise recording
    noise_audio_path = os.path.join(noise_audio_dir, random.choice(noise_filenames))
    noise_audio = load(noise_audio_path)
    trimmed_noise_audio = trim_audio(noise_audio)
    mixed_audio = mix_audio(trimmed_clean_audio, trimmed_noise_audio)

    # Generate log spectrogram of noisy speech
    log_spec = extractor(mixed_audio)

    # Min-max normalization of noisy speech. The extremes are computed once
    # and reused for both the normalization and the stored record.
    spec_min, spec_max = log_spec.min(), log_spec.max()
    normalized_spec = normalize(log_spec, {'min': spec_min, 'max': spec_max})

    # Save normalized log spectrogram in X_train_spec folder.
    # NOTE: the saved name keeps the ".wav" part (e.g. "p234_001.wav_spec.npy");
    # it is left unchanged so existing files and dictionary keys stay valid.
    save_path = save_normalized_spectrogram(normalized_spec, X_train_spec_dir, f'{filename}_spec')

    # Save original min max value of each log spectrogram as a dictionary
    store_min_max_value(save_path, spec_min, spec_max, min_max_values)

    print(f"Processed file {save_path}")


# saving min_max_values dictionary
save_path = os.path.join(min_max_value_dir, "min_max_values.pkl")
save_pickle(save_path, min_max_values)


Processed file x_train_noised_speech\p234_001.wav_spec.npy
Processed file x_train_noised_speech\p234_009.wav_spec.npy
Processed file x_train_noised_speech\p234_010.wav_spec.npy
Processed file x_train_noised_speech\p234_012.wav_spec.npy
Processed file x_train_noised_speech\p234_013.wav_spec.npy
Processed file x_train_noised_speech\p234_014.wav_spec.npy
Processed file x_train_noised_speech\p234_015.wav_spec.npy


In [23]:
# Display the recorded min/max values, keyed by each spectrogram's save path
min_max_values

{'x_train_noised_speech\\p234_001.wav_spec.npy': {'min': -52.605247,
  'max': 27.39475},
 'x_train_noised_speech\\p234_009.wav_spec.npy': {'min': -53.486153,
  'max': 26.513845},
 'x_train_noised_speech\\p234_010.wav_spec.npy': {'min': -49.832893,
  'max': 30.167105},
 'x_train_noised_speech\\p234_012.wav_spec.npy': {'min': -54.21559,
  'max': 25.784407},
 'x_train_noised_speech\\p234_013.wav_spec.npy': {'min': -51.44835,
  'max': 28.551651},
 'x_train_noised_speech\\p234_014.wav_spec.npy': {'min': -52.17472,
  'max': 27.82528},
 'x_train_noised_speech\\p234_015.wav_spec.npy': {'min': -47.565178,
  'max': 32.434822}}