**Speech Denoising using 2D CNN**

In [1]:
!pip install librosa # in colab, you'll need to install this
import librosa



In [0]:
import tensorflow as tf
from tensorflow.examples.tutorials.mnist import input_data

import matplotlib.pyplot as plt
import pylab as pl
import pandas as pd
import numpy as np

from IPython.display import display, clear_output
from __future__ import print_function, absolute_import, division
from ipywidgets import interact, interactive, fixed
import ipywidgets as widgets
from math import ceil
from IPython.display import Audio
from scipy.io import wavfile
import math

%matplotlib inline

In [0]:
from IPython.core.interactiveshell import InteractiveShell
InteractiveShell.ast_node_interactivity = "all"

In [4]:
from google.colab import files
uploaded = files.upload()

Saving test_x_01.wav to test_x_01.wav
Saving test_x_02.wav to test_x_02.wav
Saving train_clean_male.wav to train_clean_male.wav
Saving train_dirty_male.wav to train_dirty_male.wav


In [5]:
for fn in uploaded.keys():
  print('User uploaded file "{name}" with length {length} bytes'.format(
      name=fn, length=len(uploaded[fn])))

User uploaded file "test_x_01.wav" with length 145284 bytes
User uploaded file "test_x_02.wav" with length 388752 bytes
User uploaded file "train_clean_male.wav" with length 2522886 bytes
User uploaded file "train_dirty_male.wav" with length 2522898 bytes


In [0]:
s, sr=librosa.load('train_clean_male.wav', sr=None)
S_inp=librosa.stft(s, n_fft=1024, hop_length=512)
sn, sr=librosa.load('train_dirty_male.wav', sr=None)
X_inp=librosa.stft(sn, n_fft=1024, hop_length=512)
sx1, sr=librosa.load('test_x_01.wav', sr=None)
test1 = librosa.stft(sx1, n_fft=1024, hop_length=512)
sx2, sr=librosa.load('test_x_02.wav', sr=None)
test2 = librosa.stft(sx2, n_fft=1024, hop_length=512)

In [7]:
S_inp.shape
X_inp.shape
test1.shape
test2.shape

(513, 2459)

(513, 2459)

(513, 142)

(513, 380)

In [8]:
#Getting magnitudes as S and X are complex valued

S_mod = np.transpose(np.abs(S_inp))
X_mod = np.transpose(np.abs(X_inp))
test1_mod = np.transpose(np.abs(test1))
test2_mod = np.transpose(np.abs(test2))

S_mod.shape
X_mod.shape
test1_mod.shape
test2_mod.shape

(2459, 513)

(2459, 513)

(142, 513)

(380, 513)

In [9]:
#2D CNN Architecture

X = tf.placeholder(tf.float32, [None, 20, 513], name="X")
y = tf.placeholder(tf.float32, [None, 513], name="y")
dropout_var = tf.placeholder("float")

def cnn2d_neural_net(X, dp):
    #Reshape input for CNN 
    X_cnn = tf.reshape(X, (-1, 20, 513, 1))

    #Convolution Layer 1
    conv1 = tf.layers.conv2d(
            inputs=X_cnn,
            filters=32,
            kernel_size=[4,4],
            padding="same",
            activation=tf.nn.relu)
    #Pooling Layer 1
    pool1 = tf.layers.max_pooling2d(
            inputs=conv1,
            pool_size=[2,2],
            strides=[2,2],
            padding="same")
    #Convolution Layer 2
    conv2 = tf.layers.conv2d(
            inputs=pool1,
            filters=32,
            kernel_size=[2,2],
            padding="same",
            activation=tf.nn.relu)
    #Pooling Layer 2
    pool2 = tf.layers.max_pooling2d(
            inputs=conv2,
            pool_size=[2,2],
            strides=[2,2])
    #Flatten layer
    pool2_flat = tf.layers.flatten(pool2)
    
    #Fully connected layers
    layer1 = tf.layers.dense(
                    inputs=pool2_flat,
                    units=1024,
                    activation=tf.nn.relu)
    
    dropout = tf.layers.dropout(
            inputs=layer1,
            rate=dp)
    
    output = tf.layers.dense(
            inputs=dropout,
            units=513,
            activation=tf.nn.relu)

    return (output)

#Output 
output = cnn2d_neural_net(X, dropout_var)

#Optimization
cost = tf.reduce_mean(tf.losses.mean_squared_error(output, y)) 
train_optimizer = tf.train.AdamOptimizer(0.001).minimize(cost)

#Initializer
init = tf.global_variables_initializer()

Instructions for updating:
Use keras.layers.conv2d instead.
Instructions for updating:
Colocations handled automatically by placer.
Instructions for updating:
Use keras.layers.max_pooling2d instead.
Instructions for updating:
Use keras.layers.flatten instead.
Instructions for updating:
Use keras.layers.dense instead.
Instructions for updating:
Use keras.layers.dropout instead.
Instructions for updating:
Use tf.cast instead.


In [0]:
#Function to calculate SNR

def SNR_function(s, s_pred):
    
    nlen = min(len(s), len(s_pred))
    SNR = 10*math.log10((np.sum(s[:nlen]**2))/(np.sum((s[:nlen] - s_pred[:nlen])**2)))
    
    return SNR

In [16]:
#Running model on training speech signal dataset and checking (in batches)

max_epochs = 1000
batch_size = 128
step = 5

sess = tf.Session()
sess.run(init)

X_shifted = np.array([np.reshape(X_mod[i:i+20],(20,513)) for i in range(2440)])
y_shifted = S_mod[19:,:]

for epoch in range(max_epochs):
    avg_cost = 0.
    random = np.arange(0, 2440, 128)
    np.random.shuffle(random)
    
    for i in range(len(random)):
        start = int(random[i])
        end = int(start + batch_size)
        b_x, b_y = np.array(X_shifted[start:end,:,:]), np.array(y_shifted[start:end])
        data = {X: b_x, y: b_y, dropout_var : 0.2}
        sess.run(train_optimizer, feed_dict=data)
        avg_cost += sess.run(cost, feed_dict=data)
     
    avg_cost = avg_cost / len(random)
        
    if (epoch+1) % step == 0:
        print ("Epoch: %03d/%03d cost: %.9f" % (epoch, max_epochs, avg_cost))
        data = {X: b_x, y: b_y, dropout_var : 0.2}
        train_output = sess.run(output, feed_dict=data)
        
        data = {X: X_shifted, y: y_shifted, dropout_var : 0.2}
        full_train_output = sess.run(output, feed_dict=data)
        
print ("=========================Model Optimization Complete============================")

Epoch: 004/1000 cost: 0.029571954
Epoch: 009/1000 cost: 0.011974132
Epoch: 014/1000 cost: 0.007820888
Epoch: 019/1000 cost: 0.005970108
Epoch: 024/1000 cost: 0.004620082
Epoch: 029/1000 cost: 0.005082565
Epoch: 034/1000 cost: 0.004294476
Epoch: 039/1000 cost: 0.005770985
Epoch: 044/1000 cost: 0.003335274
Epoch: 049/1000 cost: 0.003208236
Epoch: 054/1000 cost: 0.002570640
Epoch: 059/1000 cost: 0.002267038
Epoch: 064/1000 cost: 0.002317241
Epoch: 069/1000 cost: 0.002525717
Epoch: 074/1000 cost: 0.002285717
Epoch: 079/1000 cost: 0.002087444
Epoch: 084/1000 cost: 0.004194754
Epoch: 089/1000 cost: 0.002416093
Epoch: 094/1000 cost: 0.001916490
Epoch: 099/1000 cost: 0.001633182
Epoch: 104/1000 cost: 0.001814092
Epoch: 109/1000 cost: 0.001721575
Epoch: 114/1000 cost: 0.001682994
Epoch: 119/1000 cost: 0.001417158
Epoch: 124/1000 cost: 0.001451881
Epoch: 129/1000 cost: 0.002791699
Epoch: 134/1000 cost: 0.002201653
Epoch: 139/1000 cost: 0.001549214
Epoch: 144/1000 cost: 0.001232424
Epoch: 149/100

In [17]:
#Adding noise frames with small values

full_train_output_pad = np.vstack((np.full((19,513),0.0000000000000000015), full_train_output))
full_train_output_pad.shape
print(full_train_output_pad)

(2459, 513)

[[1.50000000e-18 1.50000000e-18 1.50000000e-18 ... 1.50000000e-18
  1.50000000e-18 1.50000000e-18]
 [1.50000000e-18 1.50000000e-18 1.50000000e-18 ... 1.50000000e-18
  1.50000000e-18 1.50000000e-18]
 [1.50000000e-18 1.50000000e-18 1.50000000e-18 ... 1.50000000e-18
  1.50000000e-18 1.50000000e-18]
 ...
 [0.00000000e+00 0.00000000e+00 0.00000000e+00 ... 0.00000000e+00
  0.00000000e+00 0.00000000e+00]
 [0.00000000e+00 0.00000000e+00 0.00000000e+00 ... 0.00000000e+00
  0.00000000e+00 0.00000000e+00]
 [0.00000000e+00 0.00000000e+00 1.83241218e-02 ... 0.00000000e+00
  0.00000000e+00 0.00000000e+00]]


In [18]:
#SNR Calculation on whole dataset

s_hat = (X_inp / np.abs(X_inp)) * np.transpose(full_train_output_pad)
s_pred = librosa.istft(s_hat, win_length = 1024, hop_length = 512)

s.shape
s_pred.shape

print (SNR_function(s,s_pred))

(1258899,)

(1258496,)

17.486399235373355


In [0]:
librosa.output.write_wav('train_recovered_2dcnn.wav', s_pred, sr)

files.download('train_recovered_2dcnn.wav')

In [27]:
#On test signal 1

test1_shifted = np.array([np.reshape(test1_mod[i:i+20],(20,513)) for i in range(123)])

data = {X: test1_shifted, y : y_shifted, dropout_var : 0.2}
test1_output = sess.run(output, feed_dict=data)
test1_output_pad = np.vstack((np.full((19,513),0.0000000000000000015), test1_output))
test1_output_pad.shape

test1_hat = (test1 / np.abs(test1)) * np.transpose(test1_output_pad)
test1_pred = librosa.istft(test1_hat, win_length = 1024, hop_length = 512)


librosa.output.write_wav('test_s_01_recons_2dcnn.wav', test1_pred, sr)

files.download('test_s_01_recons_2dcnn.wav')

(142, 513)

In [26]:
#On test signal 2

test2_shifted = np.array([np.reshape(test2_mod[i:i+20],(20,513)) for i in range(361)])

data = {X: test2_shifted, y : y_shifted, dropout_var : 0.2}
test2_output = sess.run(output, feed_dict=data)
test2_output_pad = np.vstack((np.full((19,513),0.0000000000000000015), test2_output))
test2_output_pad.shape

test2_hat = (test2 / np.abs(test2)) * np.transpose(test2_output_pad)
test2_pred = librosa.istft(test2_hat, win_length = 1024, hop_length = 512)


librosa.output.write_wav('test_s_02_recons_2dcnn.wav', test2_pred, sr)

files.download('test_s_02_recons_2dcnn.wav')

(380, 513)