In [1]:
import pandas as pd
import numpy as np
import matplotlib.pylab as plt
import seaborn as sns
import os
import soundfile as sf
import random
import pickle

from glob import glob

import librosa
import librosa.display
import IPython.display as ipd

In [5]:
# constants
HOP_LENGTH = 256
MONO = 22050
SAMPLE_RATE = 16000
FRAME_SIZE = 512
OVERLAP_RATIO=0.3

In [None]:
"""
Generating sound from the spectogram
steps:
1. load a .npy file from the x_train_noised_speech
2. obtain the corresponding min max value of the loaded spectogram
3. perform denormaliztion to obtain denormalized log_spectrogram
4. convert spectogram to audio signal
"""

In [2]:
file_path = 'min_max_value_save\min_max_values.pkl'

In [4]:
with open(file_path, 'rb') as f:
        min_max_values = pickle.load(f)
min_max_values

{'x_train_noised_speech\\p234_001.wav_spec.npy': {'min': -52.605247,
  'max': 27.39475},
 'x_train_noised_speech\\p234_009.wav_spec.npy': {'min': -53.486153,
  'max': 26.513845},
 'x_train_noised_speech\\p234_010.wav_spec.npy': {'min': -49.832893,
  'max': 30.167105},
 'x_train_noised_speech\\p234_012.wav_spec.npy': {'min': -54.21559,
  'max': 25.784407},
 'x_train_noised_speech\\p234_013.wav_spec.npy': {'min': -51.44835,
  'max': 28.551651},
 'x_train_noised_speech\\p234_014.wav_spec.npy': {'min': -52.17472,
  'max': 27.82528},
 'x_train_noised_speech\\p234_015.wav_spec.npy': {'min': -47.565178,
  'max': 32.434822}}

In [12]:
# 1. Load a .npy file from the X_train_noised_speech
def load_spectrogram(file_path):
    spectrogram = np.load(file_path)
    return spectrogram

# 2. Obtain the corresponding min max value of the loaded spectrogram
def get_min_max_values(file_path, min_max_values):
    min_val = min_max_values[file_path]['min']
    max_val = min_max_values[file_path]['max']
    return min_val, max_val

def denormalize(spectrogram, min_val, max_val):
    denormalized_spectrogram = spectrogram * (max_val - min_val) + min_val
    return denormalized_spectrogram

# 4. Convert spectrogram to audio signal
def spectrogram_to_audio(spectrogram):
    spectrogram = librosa.db_to_amplitude(spectrogram)
    audio_signal = librosa.istft(spectrogram, hop_length=HOP_LENGTH)
    return audio_signal

In [7]:
spec_path = 'x_train_noised_speech\p234_001.wav_spec.npy'

In [14]:
spectrogram = load_spectrogram(spec_path)
min_val, max_val = get_min_max_values(spec_path, min_max_values)
denorm_spectrogram =denormalize(spectrogram, min_val, max_val)
audio = spectrogram_to_audio(denorm_spectrogram)
audio

array([ 2.6641144e-05,  5.3223412e-06, -3.2794018e-05, ...,
       -3.4597807e-03, -3.5848604e-03, -4.0358463e-03], dtype=float32)

In [18]:
sf.write('constructed.wav', audio, samplerate= SAMPLE_RATE)
constructed_audio = glob('*.wav')
constructed_audio

['constructed.wav',
 'melamp_constructed.wav',
 'meldb_constructed.wav',
 'mel_constructed.wav',
 'normmel_constructed.wav',
 'norm_mel_constructed2.wav',
 'stft_constructed.wav',
 'stft_constructed_clean.wav']

In [20]:
ipd.Audio(constructed_audio[0])