In [1]:
from google.colab import drive
# Mount Google Drive through the public API. `_mount` is a private helper that may
# change without notice; the Colab message printed for this cell refers to `drive.mount`.
drive.mount('/content/drive')

Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).


In [22]:
import tensorflow as tf
from tensorflow.keras import layers, models

# Downsampling block (encoder)
def downsample_block(input_layer, filters, kernel_size, padding='same', activation='relu'):
    """Encoder stage: two stacked convolutions, a ReLU, then 2x2 max pooling.

    Args:
        input_layer: Keras tensor fed into the first convolution.
        filters: number of output channels for both convolutions.
        kernel_size: kernel size shared by both convolutions.
        padding: padding mode passed to both convolutions.
        activation: activation passed to both convolutions.

    Returns:
        A ``(features, pooled)`` pair. ``features`` is the output of the second
        convolution (taken before the standalone ReLU layer) and serves as the
        skip connection; ``pooled`` is the max-pooled ReLU output that feeds the
        next stage.
    """
    conv_options = dict(activation=activation, padding=padding, kernel_initializer='HeNormal')

    first_conv = layers.Conv2D(filters, kernel_size, **conv_options)(input_layer)
    features = layers.Conv2D(filters, kernel_size, **conv_options)(first_conv)

    # With the default activation='relu' this extra ReLU leaves the values unchanged.
    rectified = layers.ReLU()(features)
    pooled = layers.MaxPooling2D(pool_size=(2, 2))(rectified)
    return features, pooled

# Upsampling block
def upsample_block(input_layer, skip_connection, filters, kernel_size, padding='same', activation='relu'):
    """Decoder stage: 2x upsampling, merge with the encoder skip, two convolutions.

    Args:
        input_layer: Keras tensor coming from the previous (coarser) stage.
        skip_connection: encoder tensor concatenated on the channel axis; it must
            have the same height and width as the upsampled input.
        filters: number of output channels for every layer in the block.
        kernel_size: kernel size used by the transposed convolution and by both
            refinement convolutions.
        padding: padding mode passed to every convolution.
        activation: activation passed to every convolution.

    Returns:
        The output tensor of the second refinement convolution.
    """
    # Nearest-neighbour upsampling doubles height and width; the stride-1
    # transposed convolution that follows only mixes channels/neighbours.
    up = layers.UpSampling2D(size=(2, 2))(input_layer)
    up = layers.Conv2DTranspose(filters, kernel_size, activation=activation, padding=padding)(up)
    merge = layers.concatenate([up, skip_connection], axis=3)
    # Use the caller's kernel_size here as well; it was previously hard-coded to 3,
    # so the argument was silently ignored for these two layers.
    conv = layers.Conv2D(filters, kernel_size, activation=activation, padding=padding, kernel_initializer='HeNormal')(merge)
    conv = layers.Conv2D(filters, kernel_size, activation=activation, padding=padding, kernel_initializer='HeNormal')(conv)
    return conv

# Define the U-Net architecture
def unet(input_shape):
    """Build a two-level U-Net with a single-channel sigmoid output.

    The encoder applies two downsampling stages (64 and 128 filters), followed by
    a 1024-filter bottleneck and two upsampling stages that reuse the encoder
    outputs as skip connections. The deeper 256/512-filter levels are disabled
    in this version.

    Args:
        input_shape: shape of one sample without the batch axis, as
            ``(height, width, channels)``. Known height and width values must be
            divisible by 4 because the input is max-pooled by 2 twice and then
            upsampled by 2 twice before being concatenated with the skips.

    Returns:
        An uncompiled ``tf.keras`` model mapping the input to a one-channel map
        of the same height and width with values in (0, 1).

    Raises:
        ValueError: if a known height or width is not divisible by 4.
    """
    # Fail early with a clear message instead of a shape mismatch inside concatenate.
    for size in tuple(input_shape)[:2]:
        if size is not None and size % 4 != 0:
            raise ValueError(
                "unet needs height and width divisible by 4 (two 2x2 poolings), "
                f"got input_shape={tuple(input_shape)}"
            )

    inputs = tf.keras.Input(input_shape)

    # Encoder
    conv1, pool1 = downsample_block(inputs, 64, (3, 3))
    conv2, pool2 = downsample_block(pool1, 128, (3, 3))

    # Bottleneck; same He-normal initialisation as the Conv2D layers in the
    # encoder and decoder blocks, which also use ReLU.
    conv5 = layers.Conv2D(1024, (3, 3), activation='relu', padding='same', kernel_initializer='HeNormal')(pool2)
    conv5 = layers.Conv2D(1024, (3, 3), activation='relu', padding='same', kernel_initializer='HeNormal')(conv5)

    # Decoder
    conv8 = upsample_block(conv5, conv2, 128, (3, 3))
    conv9 = upsample_block(conv8, conv1, 64, (3, 3))

    # 1x1 convolution collapses the 64 feature maps to one sigmoid-activated channel.
    outputs = layers.Conv2D(1, 1, activation='sigmoid')(conv9)

    model = models.Model(inputs=inputs, outputs=outputs)
    return model

In [2]:
import numpy as np
import os


# getting the maximum shape of the numpy array

def get_max_shape(dir):
    """Return the largest size seen on each of the first two axes of the saved arrays.

    Every file in ``dir`` whose name ends with ".npy" is loaded and its
    ``shape[0]`` / ``shape[1]`` are compared against the running maxima.

    Args:
        dir: directory to scan (not recursive).

    Returns:
        ``(max_x, max_y)`` where ``max_x`` is the largest ``shape[0]`` and
        ``max_y`` the largest ``shape[1]``; ``(0, 0)`` if no ".npy" file exists.
    """
    max_x = max_y = 0
    for filename in os.listdir(dir):
        if not filename.endswith(".npy"):
            continue

        loaded = np.load(os.path.join(dir, filename))
        max_y = max(max_y, loaded.shape[1])
        max_x = max(max_x, loaded.shape[0])

    return (max_x, max_y)

def get_array(dir, final_shape):
    """Load every ".npy" spectrogram in ``dir`` and stack them at a common shape.

    Each 2-D array is cropped to at most ``final_shape`` and then zero-padded at
    the bottom/right so that it is exactly ``final_shape``.

    Files are visited in sorted name order. ``os.listdir`` returns names in
    arbitrary order, so without sorting two folders loaded separately (inputs
    and targets) could end up with mismatched sample order even when their file
    names correspond.

    Args:
        dir: directory to scan (not recursive).
        final_shape: ``(rows, columns)`` every spectrogram is fitted to.

    Returns:
        A NumPy array of shape ``(n_files, rows, columns)``; an empty 1-D array
        if the directory holds no ".npy" file.
    """
    target_rows, target_cols = final_shape[0], final_shape[1]
    arr = []

    for filename in sorted(os.listdir(dir)):
        if not filename.endswith(".npy"):
            continue

        spectrogram = np.load(os.path.join(dir, filename))

        # Crop anything larger than the target on either axis.
        processed_spectrogram = spectrogram[:target_rows, :target_cols]

        # Zero-pad whatever is still missing. Rows are padded too, otherwise a
        # short spectrogram would make the stacked result ragged.
        missing_rows = target_rows - processed_spectrogram.shape[0]
        missing_cols = target_cols - processed_spectrogram.shape[1]
        if missing_rows or missing_cols:
            pad_width = ((0, missing_rows), (0, missing_cols))
            processed_spectrogram = np.pad(processed_spectrogram, pad_width, mode='constant', constant_values=0)

        arr.append(processed_spectrogram)

    return np.array(arr)

In [3]:
import numpy as np
import tensorflow as tf
from tensorflow.keras import backend as K

In [4]:
# Drive folders with the saved spectrograms: noisy speech (model input) and
# clean audio (training target), per the folder names.
x_train_dir = '/content/drive/MyDrive/Colab Notebooks/speech denoiser/x_train_noised_speech'
y_train_dir = '/content/drive/MyDrive/Colab Notebooks/speech denoiser/y_train_clean_audio'

# Largest (rows, columns) over all .npy files in the noisy-speech folder.
shape = get_max_shape(x_train_dir)

In [37]:
# Show the maximum (rows, columns) found in x_train_dir.
shape

(257, 496)

In [6]:
# Replace the measured maximum with a fixed (256, 128) target shape that
# get_array fits every spectrogram to. Both values are multiples of 4, which the
# two 2x2 pooling stages in unet() need so the upsampled tensors match the skips.
# NOTE(review): this reuses the name `shape`; re-running the get_max_shape cell
# afterwards overwrites it again — consider a separate name for the target shape.
shape = (256, 128)
x_train = get_array(dir= x_train_dir, final_shape=shape)
y_train = get_array(dir=y_train_dir, final_shape=shape)

In [7]:
# Sanity check: (number of files, rows, columns) of the stacked targets.
y_train.shape

(29, 256, 128)

In [8]:
# reshaping to make it suitable for training
# Appends a trailing axis of size 1 (the channel axis) to both arrays, giving
# (samples, rows, columns, 1). Re-running the cell keeps that same shape.
x_train = x_train.reshape(x_train.shape[0], x_train.shape[1], x_train.shape[2], 1 )
y_train = y_train.reshape(y_train.shape[0], y_train.shape[1], y_train.shape[2], 1 )

In [9]:
# Build the network for the shape of one sample (batch axis dropped).
# NOTE(review): the training cell further down builds a fresh model again before
# compiling, so this instance is never trained — consider removing one of the two.
model = unet(input_shape=x_train.shape[1:])

In [20]:
# Loss passed to model.compile. Binary cross-entropy is paired with the sigmoid
# output layer of unet().
# NOTE(review): this presumes the target spectrograms are scaled to [0, 1] —
# confirm against the preprocessing that produced the .npy files.
# mae = tf.keras.losses.MeanAbsoluteError()
binary_loss = tf.keras.losses.BinaryCrossentropy()
# sparse_loss=tf.keras.losses.SparseCategoricalCrossentropy()

# adam = tf.keras.optimizers.Adam(learning_rate=0.1)

In [11]:
# Define the IoU metric function
def iou(y_true, y_pred, smooth=1):
    """Smoothed intersection-over-union, averaged over the batch axis.

    Per sample, the intersection is the sum of ``|y_true * y_pred|`` over axes
    1-3 and the union is ``sum(y_true) + sum(y_pred) - intersection`` over the
    same axes. ``smooth`` is added to both numerator and denominator.

    Args:
        y_true: target tensor with at least four axes (batch first).
        y_pred: predicted tensor, same shape as ``y_true``.
        smooth: constant added to numerator and denominator.

    Returns:
        The mean over axis 0 of the per-sample ratios.
    """
    sample_axes = [1, 2, 3]
    overlap = tf.reduce_sum(tf.abs(y_true * y_pred), axis=sample_axes)
    combined = tf.reduce_sum(y_true, axis=sample_axes) + tf.reduce_sum(y_pred, axis=sample_axes)
    per_sample_ratio = (overlap + smooth) / (combined - overlap + smooth)
    return tf.reduce_mean(per_sample_ratio, axis=0)

In [23]:
# Adam optimizer with a learning rate of 1e-3.
adam = tf.keras.optimizers.Adam(learning_rate=0.001)

# A new model is built here, so re-running this cell starts training from freshly
# initialised weights instead of continuing a previous fit.
model = unet(input_shape=x_train.shape[1:])
model.compile(optimizer=adam, loss=binary_loss, metrics=[iou])

# No validation data is passed, so the logged loss and IoU are training-set values.
history = model.fit(
    x_train, y_train,
    batch_size= 3,
    epochs=100,
)

Epoch 1/100
Epoch 2/100
Epoch 3/100
Epoch 4/100
Epoch 5/100
Epoch 6/100
Epoch 7/100
Epoch 8/100
Epoch 9/100
Epoch 10/100
Epoch 11/100
Epoch 12/100
Epoch 13/100
Epoch 14/100
Epoch 15/100
Epoch 16/100
Epoch 17/100
Epoch 18/100
Epoch 19/100
Epoch 20/100
Epoch 21/100
Epoch 22/100
Epoch 23/100
Epoch 24/100
Epoch 25/100
Epoch 26/100
Epoch 27/100
Epoch 28/100
Epoch 29/100
Epoch 30/100
Epoch 31/100
Epoch 32/100
Epoch 33/100
Epoch 34/100
Epoch 35/100
Epoch 36/100
Epoch 37/100
Epoch 38/100
Epoch 39/100
Epoch 40/100
Epoch 41/100
Epoch 42/100
Epoch 43/100
Epoch 44/100
Epoch 45/100
Epoch 46/100
Epoch 47/100
Epoch 48/100
Epoch 49/100
Epoch 50/100
Epoch 51/100
Epoch 52/100
Epoch 53/100
Epoch 54/100
Epoch 55/100
Epoch 56/100
Epoch 57/100
Epoch 58/100
Epoch 59/100
Epoch 60/100
Epoch 61/100
Epoch 62/100
Epoch 63/100
Epoch 64/100
Epoch 65/100
Epoch 66/100
Epoch 67/100
Epoch 68/100
Epoch 69/100
Epoch 70/100
Epoch 71/100
Epoch 72/100
Epoch 73/100
Epoch 74/100
Epoch 75/100
Epoch 76/100
Epoch 77/100
Epoch 78

In [48]:
# Run the model on the training inputs themselves; no separate evaluation set is
# used in the visible cells.
pred = model.predict(x_train)



In [49]:
# Sanity check: predictions come back as (samples, rows, columns, 1).
pred.shape

(29, 256, 256, 1)

In [50]:
# Drop the trailing channel axis so each prediction is a 2-D spectrogram again.
pred = pred.reshape(pred.shape[0], pred.shape[1], pred.shape[2])
pred.shape

(29, 256, 256)

In [51]:
import librosa
import librosa.display
import IPython.display as ipd
from glob import glob

import soundfile as sf
import pickle

In [52]:
# Load the per-file {'min', 'max'} values, keyed by spectrogram path (see the
# printed dict); they are used below to undo the normalisation of a prediction.
# NOTE(review): the path is relative to the working directory, unlike the Drive
# paths used for the training data — confirm the file is present there.
# NOTE(review): pickle.load can execute arbitrary code; only load this file if it
# comes from a trusted source.
file_path = 'min_max_values.pkl'
with open(file_path, 'rb') as f:
        min_max_values = pickle.load(f)
min_max_values

{'x_train_noised_speech\\p234_001.wav_spec.npy': {'min': -52.605247,
  'max': 27.39475},
 'x_train_noised_speech\\p234_002.wav_spec.npy': {'min': -52.048027,
  'max': 27.951973},
 'x_train_noised_speech\\p234_003.wav_spec.npy': {'min': -49.93129,
  'max': 30.06871},
 'x_train_noised_speech\\p234_004.wav_spec.npy': {'min': -49.38927,
  'max': 30.61073},
 'x_train_noised_speech\\p234_005.wav_spec.npy': {'min': -48.617085,
  'max': 31.382915},
 'x_train_noised_speech\\p234_006.wav_spec.npy': {'min': -51.260406,
  'max': 28.739595},
 'x_train_noised_speech\\p234_007.wav_spec.npy': {'min': -51.865627,
  'max': 28.134373},
 'x_train_noised_speech\\p234_008.wav_spec.npy': {'min': -50.052658,
  'max': 29.94734},
 'x_train_noised_speech\\p234_009.wav_spec.npy': {'min': -51.896908,
  'max': 28.103092},
 'x_train_noised_speech\\p234_010.wav_spec.npy': {'min': -49.662415,
  'max': 30.337584},
 'x_train_noised_speech\\p234_012.wav_spec.npy': {'min': -52.02375,
  'max': 27.976248},
 'x_train_noised_

In [53]:
# constants
HOP_LENGTH = 256  # hop length passed to librosa.istft in spectrogram_to_audio
MONO = 22050  # NOTE(review): not referenced in the visible cells; the value looks like a sample rate — confirm or remove
SAMPLE_RATE = 16000  # sample rate used when writing constructed.wav
FRAME_SIZE = 512  # presumably the STFT frame size used in preprocessing (512 // 2 + 1 = 257 rows, matching the max shape shown earlier) — confirm
OVERLAP_RATIO=0.3  # NOTE(review): not referenced in the visible cells — confirm it is still needed

In [54]:
# Key of the spectrogram to reconstruct. The keys in min_max_values contain a
# literal backslash, so a raw string is used: a plain '\p' is an invalid escape
# sequence that newer Python versions warn about.
spec_path = r'x_train_noised_speech\p234_001.wav_spec.npy'

In [55]:
# 2. Obtain the corresponding min max value of the loaded spectrogram
def get_min_max_values(file_path, min_max_values):
    """Return the ``(min, max)`` pair stored for one spectrogram file.

    Args:
        file_path: key identifying the spectrogram in ``min_max_values``.
        min_max_values: mapping from file key to a dict with 'min' and 'max'.

    Returns:
        ``(min_val, max_val)`` for ``file_path``.

    Raises:
        KeyError: if ``file_path`` or one of the 'min'/'max' entries is missing.
    """
    entry = min_max_values[file_path]
    return entry['min'], entry['max']

def denormalize(spectrogram, min_val, max_val):
    """Map values from the [0, 1] range back onto ``[min_val, max_val]``.

    Args:
        spectrogram: scalar or array of normalised values.
        min_val: value that 0 maps to.
        max_val: value that 1 maps to.

    Returns:
        ``spectrogram * (max_val - min_val) + min_val``.
    """
    value_range = max_val - min_val
    return spectrogram * value_range + min_val

# 4. Convert spectrogram to audio signal
def spectrogram_to_audio(spectrogram):
    """Convert a dB-scaled magnitude spectrogram into a waveform.

    The dB values are converted to amplitudes and inverted with
    ``librosa.istft`` using ``HOP_LENGTH``. No phase information is supplied,
    so the inversion works on magnitudes only.

    ``librosa.istft`` infers the frame length from the number of frequency rows
    as ``2 * (rows - 1)``. Spectrograms trimmed to fewer than
    ``FRAME_SIZE // 2 + 1`` rows would therefore be inverted with the wrong
    frame length (e.g. 510 instead of 512 for 256 rows), so the missing
    highest-frequency rows are zero-padded back first.
    NOTE(review): this assumes the spectrograms were produced with an STFT frame
    size of FRAME_SIZE — confirm against the preprocessing code.

    Args:
        spectrogram: 2-D array of dB values, frequency bins on axis 0.

    Returns:
        1-D array with the reconstructed audio signal.
    """
    amplitude = librosa.db_to_amplitude(spectrogram)

    expected_bins = FRAME_SIZE // 2 + 1
    missing_bins = expected_bins - amplitude.shape[0]
    if missing_bins > 0:
        # Zero amplitude for the bins that were trimmed away before training.
        amplitude = np.pad(amplitude, ((0, missing_bins), (0, 0)), mode='constant')

    audio_signal = librosa.istft(amplitude, hop_length=HOP_LENGTH)
    return audio_signal

In [56]:
# Look up the normalisation range stored for spec_path, undo the scaling on the
# first prediction and convert the result to a waveform.
# NOTE(review): pred[0] follows the file order get_array used when loading
# x_train; confirm that its first file is the one named in spec_path.
min_val, max_val = get_min_max_values(spec_path, min_max_values)
denorm_spectrogram =denormalize(pred[0], min_val, max_val)
audio = spectrogram_to_audio(denorm_spectrogram)
audio

array([ 4.0376672e-04,  3.6384142e-04,  3.6791092e-04, ...,
       -2.8832448e-05, -7.5441167e-05, -3.3589822e-05], dtype=float32)

In [57]:
# Write the reconstructed waveform at SAMPLE_RATE, then list every .wav file in
# the working directory.
# NOTE(review): glob('*.wav') also matches any other .wav file there, so index 0
# is only 'constructed.wav' while it is the sole match.
sf.write('constructed.wav', audio, samplerate= SAMPLE_RATE)
constructed_audio = glob('*.wav')
constructed_audio

['constructed.wav']

In [58]:
# Embed an audio player for the first .wav file found above.
ipd.Audio(constructed_audio[0])