In [1]:
import librosa
import numpy as np
import os
import re
from scipy.linalg import norm
import soundfile as sf

In [2]:
def audio_to_audio_frame_stack(sound_data, frame_length, hop_length_frame):
    """Cut a signal into fixed-size sliding windows stacked row-wise.

    Parameters
    ----------
    sound_data : numpy.ndarray
        Signal to slice along its first axis.
    frame_length : int
        Number of samples in every window.
    hop_length_frame : int
        Step, in samples, between the starts of consecutive windows.

    Returns
    -------
    numpy.ndarray
        One window per row. A trailing remainder that cannot fill a whole
        window is dropped. If the signal is shorter than ``frame_length``
        an empty ``(0, frame_length)`` array is returned.
    """
    sequence_sample_length = sound_data.shape[0]
    window_starts = range(0, sequence_sample_length - frame_length + 1, hop_length_frame)

    # np.vstack refuses an empty sequence, so return an explicitly empty stack
    # when not even one full window fits.
    if len(window_starts) == 0:
        return np.empty((0, frame_length), dtype=sound_data.dtype)

    return np.vstack([sound_data[start:start + frame_length] for start in window_starts])

In [3]:
def audio_files_to_numpy(audio_dir, list_audio_files, sample_rate, frame_length, hop_length_frame, min_duration):
    """Load several audio files and stack their sliding-window frames.

    Parameters
    ----------
    audio_dir : str
        Directory that contains the files.
    list_audio_files : iterable of str
        File names relative to ``audio_dir``.
    sample_rate : int
        Sampling rate passed to ``librosa.load``.
    frame_length, hop_length_frame : int
        Window size and hop forwarded to ``audio_to_audio_frame_stack``.
    min_duration : float
        Files whose duration (as reported by ``librosa.get_duration``) is
        below this value are skipped, and a message is printed for each.

    Returns
    -------
    numpy.ndarray
        Frames of all accepted files stacked vertically.

    Raises
    ------
    ValueError
        If no file was accepted, so there is nothing to stack.
    """
    list_sound_array = []

    for file in list_audio_files:
        file_path = os.path.join(audio_dir, file)
        y, sr = librosa.load(file_path, sr=sample_rate)
        total_duration = librosa.get_duration(y=y, sr=sr)

        if total_duration >= min_duration:
            list_sound_array.append(audio_to_audio_frame_stack(y, frame_length, hop_length_frame))
        else:
            print(f"The following file {file_path} is below the min duration")

    # np.vstack([]) would also raise ValueError, but with a message that does
    # not point at the real cause; fail with an explicit explanation instead.
    if not list_sound_array:
        raise ValueError(
            f"No audio file from {audio_dir!r} reached min_duration={min_duration}; nothing to stack"
        )

    return np.vstack(list_sound_array)

In [4]:
def audio_files_to_numpy2(audio_dir, audio_file, sample_rate, frame_length, hop_length_frame, min_duration):
    """Load a single audio file and return its sliding-window frames.

    Unlike ``audio_files_to_numpy`` this takes one file name and applies no
    duration filter; ``min_duration`` is accepted but not used.

    Returns
    -------
    numpy.ndarray
        The frames produced by ``audio_to_audio_frame_stack``, one per row.
    """
    file_path = os.path.join(audio_dir, audio_file)
    samples, _ = librosa.load(file_path, sr=sample_rate)
    frames = audio_to_audio_frame_stack(samples, frame_length, hop_length_frame)
    return np.vstack([frames])

In [5]:
def predicted_data(audio_dir, audio_file, sample_rate):
    """Load audio file(s) and return the samples of the last one.

    Parameters
    ----------
    audio_dir : str
        Directory that contains the file(s).
    audio_file : str or iterable of str
        A single file name or a collection of file names. Every file is
        loaded, but only the samples of the last one are returned.
    sample_rate : int
        Sampling rate passed to ``librosa.load``.

    Returns
    -------
    numpy.ndarray
        Samples of the last file in ``audio_file``.

    Raises
    ------
    ValueError
        If ``audio_file`` is empty.
    """
    # A bare string would otherwise be iterated one character at a time.
    files = [audio_file] if isinstance(audio_file, str) else list(audio_file)
    if not files:
        raise ValueError("predicted_data needs at least one audio file name")

    # NOTE(review): earlier files are loaded and then discarded; confirm that
    # returning only the last file is the intended behaviour.
    for file in files:
        y, sr = librosa.load(os.path.join(audio_dir, file), sr=sample_rate)
    return y

In [6]:
def blend_noise_randomly(voice, noise, nb_samples, frame_length, SNR):
    """Mix voice frames with noise frames at a fixed signal-to-noise ratio.

    Row ``i`` of ``noise`` is rescaled so that its norm equals the norm of
    row ``i`` of ``voice`` multiplied by ``10**(-SNR/20)``, then added to the
    voice row. Despite the name, rows are paired by index; no shuffling
    happens here.

    Parameters
    ----------
    voice, noise : numpy.ndarray
        2-D arrays indexed as ``[i, :]`` for ``i`` in ``range(nb_samples)``.
    nb_samples : int
        Number of rows to process.
    frame_length : int
        Number of columns of the output arrays.
    SNR : float
        Target signal-to-noise ratio in dB.

    Returns
    -------
    tuple of numpy.ndarray
        ``(prod_voice, prod_noise, prod_noisy_voice)``, each of shape
        ``(nb_samples, frame_length)``.
    """
    prod_voice = np.zeros((nb_samples, frame_length))
    prod_noise = np.zeros((nb_samples, frame_length))
    prod_noisy_voice = np.zeros((nb_samples, frame_length))

    gain = 10**(-SNR/20)

    for i in range(nb_samples):
        prod_voice[i, :] = voice[i, :]

        noise_norm = norm(noise[i, :])
        # An all-zero noise row cannot be normalised (0/0 gives NaN); leave
        # its contribution at zero so the mixture stays finite.
        if noise_norm != 0:
            prod_noise[i, :] = noise[i, :]/noise_norm*gain*norm(voice[i, :])

        prod_noisy_voice[i, :] = prod_voice[i, :] + prod_noise[i, :]

    return prod_voice, prod_noise, prod_noisy_voice

In [7]:
def audio_to_magnitude_db_and_phase(n_fft, hop_length_fft, audio):
    """Compute the STFT of ``audio`` and split it into real and imaginary parts.

    Note that, despite the function name, no magnitude, dB conversion or
    phase is computed here.

    Returns
    -------
    tuple
        ``(stft, stft.real, stft.imag)`` where ``stft`` is the result of
        ``librosa.stft`` with the given ``n_fft`` and ``hop_length_fft``.
    """
    spectrum = librosa.stft(audio, n_fft=n_fft, hop_length=hop_length_fft)
    return spectrum, spectrum.real, spectrum.imag

In [8]:
def numpy_audio_to_matrix_spectrogram(numpy_audio, dim_square_spec, n_fft, hop_length_fft):
    """Run ``audio_to_magnitude_db_and_phase`` on every row of ``numpy_audio``.

    Parameters
    ----------
    numpy_audio : numpy.ndarray
        Array whose first axis indexes the individual audio frames.
    dim_square_spec : int
        Side length of the square matrices that hold each result.
    n_fft, hop_length_fft : int
        STFT parameters forwarded to the per-frame helper.

    Returns
    -------
    tuple of numpy.ndarray
        ``(complex_mat, m_mag_db, m_phase)``, each of shape
        ``(nb_audio, dim_square_spec, dim_square_spec)``. They hold the three
        values returned by the helper, in order; the first and third arrays
        are complex-typed, the second uses numpy's default float type.
    """
    nb_audio = numpy_audio.shape[0]
    stack_shape = (nb_audio, dim_square_spec, dim_square_spec)

    complex_mat = np.zeros(stack_shape, dtype=complex)
    m_mag_db = np.zeros(stack_shape)
    m_phase = np.zeros(stack_shape, dtype=complex)

    for idx in range(nb_audio):
        first, second, third = audio_to_magnitude_db_and_phase(n_fft, hop_length_fft, numpy_audio[idx])
        complex_mat[idx, :, :] = first
        m_mag_db[idx, :, :] = second
        m_phase[idx, :, :] = third

    return complex_mat, m_mag_db, m_phase

In [9]:
def magnitude_db_and_phase_to_audio(frame_length, hop_length_fft, mag, phase):
    """Rebuild a time-domain signal from two spectrogram factors.

    The element-wise product ``mag * phase`` is handed to the inverse STFT,
    and the output is fixed to ``frame_length`` samples.

    NOTE(review): this presumably expects ``mag`` to be a linear magnitude and
    ``phase`` a unit-modulus complex factor - confirm against the caller.
    """
    spectrum = mag * phase
    return librosa.core.istft(spectrum, hop_length=hop_length_fft, length=frame_length)

In [10]:
def matrix_spectrogram_to_numpy_audio(m_mag_db, m_phase, frame_length, hop_length_fft):
    """Convert a stack of spectrograms back to a stack of audio frames.

    Entry ``i`` of ``m_mag_db`` and ``m_phase`` is passed to
    ``magnitude_db_and_phase_to_audio``; the reconstructed frames are stacked
    vertically, one per row.
    """
    nb_spec = m_mag_db.shape[0]
    rebuilt_frames = [
        magnitude_db_and_phase_to_audio(frame_length, hop_length_fft, m_mag_db[idx], m_phase[idx])
        for idx in range(nb_spec)
    ]
    return np.vstack(rebuilt_frames)

In [11]:
def new_path(path, SNR):
    """Append the sub-directory that belongs to a known SNR level.

    For ``SNR`` equal to -10, -5, 0, 5 or 10 the matching ``'<level>/'``
    suffix is appended to ``path``. Any other value returns ``path``
    unchanged.
    """
    known_levels = (
        (-10, '-10/'),
        (-5, '-5/'),
        (0, '0/'),
        (5, '5/'),
        (10, '10/'),
    )
    for level, suffix in known_levels:
        if SNR == level:
            return path + suffix
    return path

In [12]:
def mixing(voice_dir, noise_dir, list_speech, list_noise, SNR, sample_rate, path):
    """Write noisy mixtures for every (SNR, noise file, speech file) combination.

    The noise is rescaled so that its norm equals the norm of the clean
    speech multiplied by ``10**(-snr/20)``, added to the speech, and written
    to ``new_path(path, snr) + '<speech> <noise>.wav'`` (file stems, without
    extensions).

    Parameters
    ----------
    voice_dir, noise_dir : str
        Directories that contain the speech and noise files.
    list_speech, list_noise : sequence of str
        File names relative to the respective directory.
    SNR : sequence of numbers
        Signal-to-noise ratios in dB; one set of files is written per value.
    sample_rate : int
        Rate used both for loading and for writing.
    path : str
        Output prefix handed to ``new_path``.
    """
    for snr in SNR:
        out_path = new_path(path, snr)
        gain = 10**(-snr/20)

        for noise_file in list_noise:
            n = os.path.splitext(noise_file)[0]
            # Load and normalise the noise once per noise file instead of once
            # per speech file; the result does not depend on the speech.
            noise_raw, noise_sr = librosa.load(os.path.join(noise_dir, noise_file), sr=sample_rate)
            noise_unit = noise_raw/norm(noise_raw)

            for speech_file in list_speech:
                s = os.path.splitext(speech_file)[0]
                clean, clean_sr = librosa.load(os.path.join(voice_dir, speech_file), sr=sample_rate)
                noise = noise_unit*gain*norm(clean)
                # NOTE(review): assumes clean and noise hold the same number of
                # samples; otherwise this addition cannot broadcast - confirm
                # the inputs are trimmed beforehand.
                noisy = clean + noise
                sf.write(out_path+s+' '+n+'.wav', noisy, sample_rate)

In [13]:
def mixing2(voice_dir, noise_dir, list_speech, list_noise, SNR, sample_rate, path):
    """Write noisy mixtures, naming each file ``'<noise> <speech>.wav'``.

    Same mixing as ``mixing``: the noise is rescaled so that its norm equals
    the norm of the clean speech multiplied by ``10**(-snr/20)`` and then
    added to the speech. The only difference is the order of the two file
    stems in the output file name (noise first, speech second).

    Parameters
    ----------
    voice_dir, noise_dir : str
        Directories that contain the speech and noise files.
    list_speech, list_noise : sequence of str
        File names relative to the respective directory.
    SNR : sequence of numbers
        Signal-to-noise ratios in dB; one set of files is written per value.
    sample_rate : int
        Rate used both for loading and for writing.
    path : str
        Output prefix handed to ``new_path``.
    """
    for snr in SNR:
        out_path = new_path(path, snr)
        gain = 10**(-snr/20)

        for noise_file in list_noise:
            noise_stem = os.path.splitext(noise_file)[0]
            # Load and normalise the noise once per noise file instead of once
            # per speech file; the result does not depend on the speech.
            noise_raw, noise_sr = librosa.load(os.path.join(noise_dir, noise_file), sr=sample_rate)
            noise_unit = noise_raw/norm(noise_raw)

            for speech_file in list_speech:
                speech_stem = os.path.splitext(speech_file)[0]
                clean, clean_sr = librosa.load(os.path.join(voice_dir, speech_file), sr=sample_rate)
                noise = noise_unit*gain*norm(clean)
                # NOTE(review): assumes clean and noise hold the same number of
                # samples; otherwise this addition cannot broadcast - confirm
                # the inputs are trimmed beforehand.
                noisy = clean + noise
                sf.write(out_path+noise_stem+' '+speech_stem+'.wav', noisy, sample_rate)

In [14]:
def DTCWT(data, num_levels):
    """Apply the forward 1-D transform from the ``dtcwt`` package.

    Parameters
    ----------
    data : array-like
        Input handed to ``dtcwt.Transform1d().forward``.
    num_levels : int
        Passed as ``nlevels`` to the transform.

    Returns
    -------
    object
        Whatever ``Transform1d.forward`` returns.
    """
    # The notebook's import cell does not bring `dtcwt` into scope, so import
    # it here; otherwise a fresh kernel fails with NameError on the first call.
    import dtcwt

    transform = dtcwt.Transform1d()
    return transform.forward(data, nlevels=num_levels)

In [15]:
def Recon_wave(obj):
    """Invert a 1-D ``dtcwt`` transform.

    ``obj`` is handed unchanged to ``dtcwt.Transform1d().inverse`` and the
    result of that call is returned.
    """
    # The notebook's import cell does not bring `dtcwt` into scope, so import
    # it here; otherwise a fresh kernel fails with NameError on the first call.
    import dtcwt

    return dtcwt.Transform1d().inverse(obj)

In [16]:
def scaled_in(matrix_spec):
    """Rescale input values with the fixed affine map ``(x + 46) / 50``.

    ``inv_scaled_in`` applies the inverse map.
    """
    offset = 46
    span = 50
    return (matrix_spec + offset) / span

In [17]:
def scaled_ou(matrix_spec):
    """Rescale output values with the fixed affine map ``(x - 6) / 82``.

    ``inv_scaled_ou`` applies the inverse map.
    """
    offset = 6
    span = 82
    return (matrix_spec - offset) / span

In [18]:
def inv_scaled_in(matrix_spec):
    """Undo ``scaled_in`` by applying ``x * 50 - 46``."""
    span = 50
    offset = 46
    return matrix_spec * span - offset

In [19]:
def inv_scaled_ou(matrix_spec):
    """Undo ``scaled_ou`` by applying ``x * 82 + 6``."""
    span = 82
    offset = 6
    return matrix_spec * span + offset

In [20]:
def sorted_nicely(l):
    """Return the strings in ``l`` sorted in natural (human) order.

    Runs of ASCII digits are compared as integers, so ``'file2'`` sorts
    before ``'file10'``. All other text is compared as plain strings.

    Parameters
    ----------
    l : iterable of str
        Strings to sort.

    Returns
    -------
    list of str
        A new sorted list; the input is not modified.
    """
    def natural_key(name):
        # With one capturing group, re.split alternates text and digit runs:
        # odd positions always hold a match of [0-9]+, so int() cannot fail
        # there. Deciding by position rather than str.isdigit() keeps
        # characters such as '²', which isdigit() accepts but int() rejects,
        # as ordinary text.
        pieces = re.split('([0-9]+)', name)
        return [int(piece) if position % 2 else piece
                for position, piece in enumerate(pieces)]

    return sorted(l, key=natural_key)