In [2]:
import pandas as pd
import numpy as np
import matplotlib.pylab as plt
import seaborn as sns
import os
import soundfile as sf

from glob import glob

import librosa
import librosa.display
import IPython.display as ipd

from itertools import cycle

# Apply seaborn's "white" style, then capture matplotlib's active colour cycle
# both as a plain list and as an endless iterator over the same colours.
sns.set_theme(palette=None, style="white")
color_pal = plt.rcParams["axes.prop_cycle"].by_key()["color"]
color_cycle = cycle(list(color_pal))

In [3]:
# Shared audio / STFT settings used by the helper functions below.
HOP_LENGTH = 256     # hop_length (in samples) passed to the STFT and its inverse
# Passed as librosa.load(mono=...), which is a boolean flag. The previous value
# (22050) only worked because any non-zero integer is truthy.
MONO = True
SAMPLE_RATE = 16000  # sampling rate (Hz) requested when loading and used when writing audio
FRAME_SIZE = 512     # n_fft passed to the STFT

# Audio Player

In [4]:
def play(signal, rate=None):
    """Return an IPython audio player for a file path or a raw waveform.

    Parameters
    ----------
    signal : str, bytes, os.PathLike, or array-like
        A filename/URL or encoded audio bytes, which are handed to
        ``ipd.Audio`` unchanged, or a sequence of audio samples.
    rate : int, optional
        Sampling rate of ``signal`` when it is a sequence of samples.
        Defaults to ``SAMPLE_RATE``. Ignored for paths and bytes.

    Returns
    -------
    IPython.display.Audio
    """
    # Paths, URLs and already-encoded bytes carry their own sample rate.
    if isinstance(signal, (str, bytes, os.PathLike)):
        return ipd.Audio(signal)
    # Raw samples: ipd.Audio raises ValueError unless a rate is supplied.
    return ipd.Audio(signal, rate=SAMPLE_RATE if rate is None else rate)

# Loader

In [5]:
def load(file_path):
    """Read an audio file and return only its samples.

    The file is decoded with ``librosa.load`` using ``sr=SAMPLE_RATE`` and
    ``mono=MONO``; the sample rate that librosa returns alongside the
    samples is discarded.
    """
    samples, _ = librosa.load(file_path, sr=SAMPLE_RATE, mono=MONO)
    return samples

# Spectrogram Extractor

In [6]:
def extractor(signal):
    """Convert a waveform into a log-scaled (dB) magnitude spectrogram.

    An STFT is taken with ``n_fft=FRAME_SIZE`` and ``hop_length=HOP_LENGTH``.
    Only the magnitude is kept (``np.abs``), so phase is discarded before the
    amplitude-to-dB conversion.
    """
    magnitude = np.abs(
        librosa.stft(signal, n_fft=FRAME_SIZE, hop_length=HOP_LENGTH)
    )
    return librosa.amplitude_to_db(magnitude)

# Spectrogram to audio

In [7]:
def convert_spectrograms_to_audio(spectrogram, n_iter=32):
    """Reconstruct a waveform from a log-magnitude (dB) spectrogram.

    Parameters
    ----------
    spectrogram : np.ndarray
        dB-scaled magnitude spectrogram, as produced by
        ``librosa.amplitude_to_db``.
    n_iter : int, optional
        Number of Griffin-Lim iterations (default 32).

    Returns
    -------
    np.ndarray
        Time-domain signal estimated from the magnitudes.
    """
    # log spectrogram -> linear-amplitude spectrogram
    magnitude = librosa.db_to_amplitude(spectrogram)
    # The spectrogram has no phase, so a plain inverse STFT would treat every
    # bin as zero-phase. Griffin-Lim estimates a consistent phase iteratively.
    # A fixed random_state makes the random phase initialisation repeatable.
    return librosa.griffinlim(
        magnitude,
        n_iter=n_iter,
        hop_length=HOP_LENGTH,
        random_state=0,
    )

In [8]:
# Dataset folders, relative to the notebook's working directory.
clean_audio_folder = 'clean_audio'
noise_audio_folder = 'noise_audio'
trimmed_clean_folder = 'trimmed_clean_audio'
trimmed_noise_folder = 'trimmed_noise_audio'
output_folder = 'noised_speech'

# glob() returns matches in an unspecified, filesystem-dependent order.
# Sorting keeps positional lookups such as files[3] stable across machines.
clean_audio_files = sorted(glob(os.path.join(clean_audio_folder, '*.wav')))
noise_audio_files = sorted(glob(os.path.join(noise_audio_folder, '*.wav')))
trim_clean_audio_files = sorted(glob(os.path.join(trimmed_clean_folder, '*.wav')))
trim_noise_audio_files = sorted(glob(os.path.join(trimmed_noise_folder, '*.wav')))
noise_speech = sorted(glob(os.path.join(output_folder, '*.wav')))

In [17]:
# Show the trimmed clean-speech WAV paths collected by glob.
trim_clean_audio_files

['trimmed_clean_audio\\p234_001.wav',
 'trimmed_clean_audio\\p234_009.wav',
 'trimmed_clean_audio\\p234_010.wav',
 'trimmed_clean_audio\\p234_012.wav',
 'trimmed_clean_audio\\p234_013.wav',
 'trimmed_clean_audio\\p234_014.wav',
 'trimmed_clean_audio\\p234_015.wav']

In [21]:
# Listen to one example clip: play() receives the file path, not samples.
# NOTE(review): index 3 is a positional pick and depends on the list order.
play(trim_clean_audio_files[3])

In [26]:
# Load the same example clip (index 3) as a waveform via load(), i.e. with sr=SAMPLE_RATE.
clean_speech = load(trim_clean_audio_files[3])

In [27]:
# Log-magnitude spectrogram of the example clip; the shape is displayed to
# check the number of frequency bins and time frames.
clean_spec = extractor(clean_speech)
clean_spec.shape

(257, 201)

In [28]:
# Invert the log spectrogram back to a waveform and save it at the same
# sample rate the clip was loaded with.
contructed_signal = convert_spectrograms_to_audio(clean_spec)
sf.write('stft_constructed_clean.wav', contructed_signal, samplerate=SAMPLE_RATE)
# Sorted so the listing does not depend on the filesystem's enumeration order.
constructed_audio = sorted(glob('*.wav'))
constructed_audio

['constructed.wav',
 'melamp_constructed.wav',
 'meldb_constructed.wav',
 'mel_constructed.wav',
 'normmel_constructed.wav',
 'norm_mel_constructed2.wav',
 'stft_constructed.wav',
 'stft_constructed_clean.wav']

In [29]:
# Play the reconstruction written in the previous cell by name, instead of
# relying on it being the last entry of a directory listing.
play('stft_constructed_clean.wav')

The log spectrogram works better for reconstruction.