In [None]:
import pandas as pd
import numpy as np
import matplotlib.pylab as plt
import seaborn as sns
import os
import soundfile as sf
import random
import pickle

from glob import glob

import librosa
import librosa.display
import IPython.display as ipd

In [None]:
# constants
HOP_LENGTH = 256
MONO = 22050
SAMPLE_RATE = 16000
FRAME_SIZE = 512
OVERLAP_RATIO=0.3

In [None]:
"""
Loading x_train 
1. Load every .npy file present in x_train_noised_speech dir and assign it to x_train
2. check the shape of each numpy array and if less then desired shape apply padding, if more then remove extra columns
"""

In [None]:
x_train_dir = 'x_train_noised_speech'
y_train_dir = 'y_train_clean_audio'

In [None]:
# getting the maximum shape of the numpy array

def get_max_shape(dir):
    max_y = 0
    max_x = 0
    for filename in os.listdir(dir):
        if filename.endswith(".npy"):
            file_path = os.path.join(dir, filename)

            # Load the .npy file and append it to the x_train list
            spectrogram = np.load(file_path)
            if spectrogram.shape[1] > max_y:
                max_y = spectrogram.shape[1]

            if spectrogram.shape[0] > max_x:
                max_x = spectrogram.shape[0]

    return (max_x, max_y)

In [None]:
print(get_max_shape(x_train_dir))

(257, 251)


In [None]:

final_shape = (256, 256)

def get_array(dir, final_shape):
    # Initialize an empty list to store the loaded spectrograms
    arr = []
    # Iterate through each file in the directory
    for filename in os.listdir(dir):
        if filename.endswith(".npy"):
            file_path = os.path.join(dir, filename)
            print(file_path)
            # Load the .npy file and append it to the x_train list
            spectrogram = np.load(file_path)
            # Pad the spectrogram to match the desired final shape
            if spectrogram.shape[1] < final_shape[1]:
                # Pad the spectrogram to match the desired final shape
                pad_width = ((0, 0), (0, final_shape[1] - spectrogram.shape[1]))
                processed_spectrogram = np.pad(spectrogram, pad_width, mode='constant', constant_values=0)
            elif spectrogram.shape[1] > final_shape[1]:
                # Trim the spectrogram to match the desired final shape
                processed_spectrogram = spectrogram[:final_shape[0], :final_shape[1]]
            else:
                processed_spectrogram = spectrogram  # No change needed if the shape is already as desired
            # Append the padded spectrogram to the x_train list
            processed_spectrogram = processed_spectrogram[:final_shape[0], :]
            arr.append(processed_spectrogram)

    # Convert the list to a NumPy array if required
    arr = np.array(arr)

    return arr


In [None]:
x_train = get_array(x_train_dir, final_shape)
# x_train.shape

(7, 256, 256)

In [14]:
x_train[3].shape

(256, 256)

In [48]:
# 2. Obtain the corresponding min max value of the loaded spectrogram
def get_min_max_values(file_path, min_max_values):
    min_val = min_max_values[file_path]['min']
    max_val = min_max_values[file_path]['max']
    return min_val, max_val

def denormalize(spectrogram, min_val, max_val):
    denormalized_spectrogram = spectrogram * (max_val - min_val) + min_val
    return denormalized_spectrogram

# 4. Convert spectrogram to audio signal
def spectrogram_to_audio(spectrogram):
    spectrogram = librosa.db_to_amplitude(spectrogram)
    audio_signal = librosa.istft(spectrogram, hop_length=HOP_LENGTH)
    return audio_signal

In [49]:
file_path = 'min_max_value_save\min_max_values.pkl'
with open(file_path, 'rb') as f:
        min_max_values = pickle.load(f)
min_max_values

{'x_train_noised_speech\\p234_001.wav_spec.npy': {'min': -52.605247,
  'max': 27.39475},
 'x_train_noised_speech\\p234_009.wav_spec.npy': {'min': -53.486153,
  'max': 26.513845},
 'x_train_noised_speech\\p234_010.wav_spec.npy': {'min': -49.832893,
  'max': 30.167105},
 'x_train_noised_speech\\p234_012.wav_spec.npy': {'min': -54.21559,
  'max': 25.784407},
 'x_train_noised_speech\\p234_013.wav_spec.npy': {'min': -51.44835,
  'max': 28.551651},
 'x_train_noised_speech\\p234_014.wav_spec.npy': {'min': -52.17472,
  'max': 27.82528},
 'x_train_noised_speech\\p234_015.wav_spec.npy': {'min': -47.565178,
  'max': 32.434822}}

# Clean audio generation

In [56]:
y_train = get_array(y_train_dir, final_shape)
y_train.shape

(7, 257, 251)

In [58]:
spec_path = 'x_train_noised_speech\p234_001.wav_spec.npy'

In [59]:
min_val, max_val = get_min_max_values(spec_path, min_max_values)
denorm_spectrogram =denormalize(y_train[0], min_val, max_val)
audio = spectrogram_to_audio(denorm_spectrogram)
audio

array([ 2.4167290e-05,  4.8435854e-06, -3.0679854e-05, ...,
        0.0000000e+00,  0.0000000e+00,  0.0000000e+00], dtype=float32)

In [60]:
sf.write('clean_constructed.wav', audio, samplerate= SAMPLE_RATE)
constructed_audio = glob('*.wav')
constructed_audio

['clean_constructed.wav',
 'constructed.wav',
 'constructed1.wav',
 'lessthan_constructed.wav',
 'padded_constructed.wav',
 'padded_constructed1.wav',
 'stft_constructed_clean.wav']

In [61]:
ipd.Audio(constructed_audio[0])