**Speech Denoising using 1D CNN**

In [2]:
!pip install librosa # in colab, you'll need to install this
import librosa



In [0]:
import tensorflow as tf
from tensorflow.examples.tutorials.mnist import input_data

import matplotlib.pyplot as plt
import pylab as pl
import pandas as pd
import numpy as np

from IPython.display import display, clear_output
from __future__ import print_function, absolute_import, division
from ipywidgets import interact, interactive, fixed
import ipywidgets as widgets
from math import ceil
from IPython.display import Audio
from scipy.io import wavfile
import math

%matplotlib inline

In [0]:
from IPython.core.interactiveshell import InteractiveShell
# Display the value of every bare expression in a cell, not only the last one.
# Later cells list several `.shape` expressions in a single cell and rely on
# this setting for all of them to be shown.
InteractiveShell.ast_node_interactivity = "all"

In [5]:
# Colab-only step: bring the four WAV files used below into the runtime.
# NOTE(review): the next cell iterates `uploaded.keys()` and reports
# `len(uploaded[name])` as a byte count, so `uploaded` is presumably a mapping
# of file name -> raw file bytes; confirm against the Colab documentation.
from google.colab import files
uploaded = files.upload()

Saving test_x_01.wav to test_x_01 (1).wav
Saving test_x_02.wav to test_x_02 (1).wav
Saving train_clean_male.wav to train_clean_male (1).wav
Saving train_dirty_male.wav to train_dirty_male (1).wav


In [6]:
# Report the name and size of every file received from the upload step.
for filename, payload in uploaded.items():
    message = 'User uploaded file "{name}" with length {length} bytes'.format(
        name=filename, length=len(payload))
    print(message)

User uploaded file "test_x_01.wav" with length 145284 bytes
User uploaded file "test_x_02.wav" with length 388752 bytes
User uploaded file "train_clean_male.wav" with length 2522886 bytes
User uploaded file "train_dirty_male.wav" with length 2522898 bytes


In [0]:
# STFT settings shared by every recording: 1024-sample frames, 512-sample hop.
stft_args = dict(n_fft=1024, hop_length=512)

# sr=None asks librosa to keep each file's own sampling rate.
# `sr` is reassigned by every load, so after this cell it holds the rate of
# the last file loaded (test_x_02.wav).

# Training pair: clean speech (network target) ...
s, sr = librosa.load('train_clean_male.wav', sr=None)
S_inp = librosa.stft(s, **stft_args)
# ... and the noisy version of it (network input).
sn, sr = librosa.load('train_dirty_male.wav', sr=None)
X_inp = librosa.stft(sn, **stft_args)

# Two noisy test recordings to be denoised after training.
sx1, sr = librosa.load('test_x_01.wav', sr=None)
test1 = librosa.stft(sx1, **stft_args)
sx2, sr = librosa.load('test_x_02.wav', sr=None)
test2 = librosa.stft(sx2, **stft_args)

In [8]:
# Sanity check of the four spectrogram shapes. Each bare expression is shown
# separately because ast_node_interactivity was set to "all" earlier.
# The first dimension is 513 for every array, which matches 1 + n_fft/2 for
# n_fft=1024; the second dimension differs per recording (presumably the
# number of STFT frames — confirm against the librosa.stft documentation).
S_inp.shape
X_inp.shape
test1.shape
test2.shape

(513, 2459)

(513, 2459)

(513, 142)

(513, 380)

In [9]:
# The STFT arrays are complex-valued; the network is trained on magnitudes only.
# Each array is transposed so that the 513-wide axis comes last, matching the
# [None, 513] placeholders of the model.

S_mod = np.abs(S_inp).T
X_mod = np.abs(X_inp).T
test1_mod = np.abs(test1).T
test2_mod = np.abs(test2).T

# Display the transposed shapes for a quick check.
S_mod.shape
X_mod.shape
test1_mod.shape
test2_mod.shape

(2459, 513)

(2459, 513)

(142, 513)

(380, 513)

In [10]:
#1D CNN Architecture

# Network input: one row per STFT frame, 513 magnitude values per row.
X = tf.placeholder(tf.float32, [None, 513], name="X")
# Regression target: the clean magnitudes for the same frames.
y = tf.placeholder(tf.float32, [None, 513], name="y")
# Dropout rate, fed at run time (see the NOTE in cnn1d_neural_net).
dropout_var = tf.placeholder("float")

def cnn1d_neural_net(X, dp, training=False):
    """Build the 1-D CNN that maps noisy magnitude frames to denoised ones.

    Layer stack (shapes for a batch of N frames):
        reshape             -> (N, 513, 1)
        conv1d, 32 x 16     -> (N, 513, 32)   'same' padding, ReLU
        max-pool 2, stride 2 -> (N, 257, 32)  'same' padding
        conv1d, 32 x 8      -> (N, 257, 32)   'same' padding, ReLU
        max-pool 2, stride 2 -> (N, 128, 32)  default ('valid') padding
        flatten             -> (N, 4096)
        dense 1024, ReLU
        dropout(rate=dp)
        dense 513, ReLU     -> (N, 513)

    Args:
        X: float tensor of shape [N, 513] holding magnitude frames.
        dp: dropout rate (scalar or scalar tensor) for the dropout layer.
        training: forwarded to ``tf.layers.dropout``. Defaults to False,
            which reproduces the original call that omitted the argument.

    Returns:
        Float tensor of shape [N, 513]. The final ReLU keeps every predicted
        magnitude non-negative.

    NOTE: ``tf.layers.dropout`` only drops units when ``training`` is true.
    With the default ``training=False`` the layer is an identity and the value
    fed for ``dp`` has no effect on the output. To really use dropout, pass
    ``training=True`` here and feed a rate of 0.0 whenever the network is
    evaluated rather than trained.
    """
    #Reshape input for CNN: add a trailing channel axis
    X_cnn = tf.reshape(X, [-1,513,1])
    #Convolution Layer 1
    conv1 = tf.layers.conv1d(
            inputs=X_cnn,
            filters=32,
            strides = 1,
            kernel_size=16,
            padding="same",
            activation=tf.nn.relu)
    #Pooling Layer 1
    pool1 = tf.layers.max_pooling1d(
            inputs=conv1,
            pool_size=2,
            strides=2,
            padding="same")
    #Convolution Layer 2
    conv2 = tf.layers.conv1d(
            inputs=pool1,
            filters=32,
            strides=1,
            kernel_size=8,
            padding="same",
            activation=tf.nn.relu)
    #Pooling Layer 2 (no padding argument, unlike pooling layer 1)
    pool2 = tf.layers.max_pooling1d(
            inputs=conv2,
            pool_size=2,
            strides=2)
    #Flatten layer
    pool2_flat = tf.layers.flatten(pool2)

    #Fully connected layers
    layer1 = tf.layers.dense(
                    inputs=pool2_flat,
                    units=1024,
                    activation=tf.nn.relu)

    dropout = tf.layers.dropout(
            inputs=layer1,
            rate=dp,
            training=training)

    output = tf.layers.dense(
            inputs=dropout,
            units=513,
            activation=tf.nn.relu)

    return (output)

#Output
output = cnn1d_neural_net(X, dropout_var)

#Optimization
# mean_squared_error takes (labels, predictions) and already returns the mean
# over all elements as a scalar, so no extra reduce_mean is needed. The value
# is unchanged from the original call because squared error is symmetric.
cost = tf.losses.mean_squared_error(labels=y, predictions=output)
train_optimizer = tf.train.AdamOptimizer(0.001).minimize(cost)

#Initializer
init = tf.global_variables_initializer()

Instructions for updating:
Use keras.layers.conv1d instead.
Instructions for updating:
Colocations handled automatically by placer.
Instructions for updating:
Use keras.layers.max_pooling1d instead.
Instructions for updating:
Use keras.layers.flatten instead.
Instructions for updating:
Use keras.layers.dense instead.
Instructions for updating:
Use keras.layers.dropout instead.
Instructions for updating:
Use tf.cast instead.


In [0]:
#Function to calculate SNR

def SNR_function(s, s_pred):
    """Return the signal-to-noise ratio, in dB, of ``s_pred`` against ``s``.

    Computes ``10 * log10(sum(s**2) / sum((s - s_pred)**2))`` over the first
    ``min(len(s), len(s_pred))`` samples, so signals of different length are
    compared over their common prefix.

    Args:
        s: reference (clean) signal; any 1-D sequence of real numbers.
        s_pred: estimated signal; any 1-D sequence of real numbers.

    Returns:
        The SNR as a Python float. Degenerate inputs follow IEEE rules
        instead of raising:
        ``inf`` when the estimate matches the reference exactly,
        ``-inf`` when the reference is all zeros but the estimate is not,
        ``nan`` when both powers are zero (including empty input).
    """
    # float64 accepts plain lists and avoids accumulating long sums in float32.
    reference = np.asarray(s, dtype=np.float64)
    estimate = np.asarray(s_pred, dtype=np.float64)

    nlen = min(len(reference), len(estimate))
    reference = reference[:nlen]
    residual = reference - estimate[:nlen]

    signal_power = np.sum(reference ** 2)
    noise_power = np.sum(residual ** 2)

    # np.log10 yields +/-inf or nan for the degenerate cases, where
    # math.log10 would raise "math domain error" for a zero ratio.
    with np.errstate(divide="ignore", invalid="ignore"):
        SNR = 10.0 * np.log10(signal_power / noise_power)

    return float(SNR)

In [12]:
#Running model on training speech signal dataset and checking (in batches)

max_epochs = 1000
batch_size = 128
step = 5  # print the average cost every `step` epochs

sess = tf.Session()
sess.run(init)

# Number of training frames, taken from the data instead of being hard-coded.
n_frames = X_mod.shape[0]

for epoch in range(max_epochs):
    avg_cost = 0.
    # Start offsets of the mini-batches. Only the order in which batches are
    # visited is shuffled; frames inside a batch stay contiguous.
    batch_starts = np.arange(0, n_frames, batch_size)
    np.random.shuffle(batch_starts)

    for start in batch_starts:
        end = start + batch_size  # the last batch may be shorter
        b_x, b_y = X_mod[start:end], S_mod[start:end]
        data = {X: b_x, y: b_y, dropout_var : 0.2}
        # One run call performs the update and returns the cost of this batch,
        # instead of a second forward pass just to read the cost.
        _, batch_cost = sess.run([train_optimizer, cost], feed_dict=data)
        avg_cost += batch_cost

    avg_cost = avg_cost / len(batch_starts)

    if (epoch+1) % step == 0:
        # Report the 1-based number of completed epochs.
        print ("Epoch: %03d/%03d cost: %.9f" % (epoch + 1, max_epochs, avg_cost))

# Denoised magnitudes for the whole training set, computed once after training
# (only the final value is used by the SNR cell). `output` does not depend on
# `y`, so only X is fed; the dropout rate is 0.0 because this is inference.
# If training is interrupted by hand, run these two lines to get the output.
data = {X: X_mod, dropout_var : 0.0}
full_train_output = sess.run(output, feed_dict=data)

print ("=========================Model Optimization Complete============================")

Epoch: 004/1000 cost: 0.010809428
Epoch: 009/1000 cost: 0.006042133
Epoch: 014/1000 cost: 0.006882394
Epoch: 019/1000 cost: 0.003691017
Epoch: 024/1000 cost: 0.003365514
Epoch: 029/1000 cost: 0.003150398
Epoch: 034/1000 cost: 0.002428060
Epoch: 039/1000 cost: 0.002408685
Epoch: 044/1000 cost: 0.002232445
Epoch: 049/1000 cost: 0.003446339
Epoch: 054/1000 cost: 0.002059321
Epoch: 059/1000 cost: 0.001743725
Epoch: 064/1000 cost: 0.003030069
Epoch: 069/1000 cost: 0.004698553
Epoch: 074/1000 cost: 0.001734368
Epoch: 079/1000 cost: 0.001449856
Epoch: 084/1000 cost: 0.001238381
Epoch: 089/1000 cost: 0.001124857
Epoch: 094/1000 cost: 0.001075878
Epoch: 099/1000 cost: 0.001124738
Epoch: 104/1000 cost: 0.001097991
Epoch: 109/1000 cost: 0.003058501
Epoch: 114/1000 cost: 0.001441706
Epoch: 119/1000 cost: 0.001655763
Epoch: 124/1000 cost: 0.003858744
Epoch: 129/1000 cost: 0.001033761
Epoch: 134/1000 cost: 0.001081006
Epoch: 139/1000 cost: 0.000895004
Epoch: 144/1000 cost: 0.000728444
Epoch: 149/100

In [13]:
#SNR Calculation on whole dataset
# Rebuild a complex spectrogram from the predicted magnitudes and the phase of
# the noisy input. exp(1j*angle(X)) equals X/|X| wherever |X| > 0, but stays
# finite (phase 0) for zero-magnitude bins, where X/|X| would be 0/0 = NaN and
# could corrupt the inverse STFT.
noisy_phase = np.exp(1j * np.angle(X_inp))
s_hat = noisy_phase * np.transpose(full_train_output)
s_pred = librosa.istft(s_hat, win_length = 1024, hop_length = 512)

# The reconstruction can be slightly shorter than the original signal;
# SNR_function compares only the common prefix.
s.shape
s_pred.shape

print (SNR_function(s,s_pred))

(1258899,)

(1258496,)

19.358574382825605


In [0]:
# librosa.output.write_wav was removed in librosa 0.8.0, and the install cell
# does not pin a version. scipy's wavfile (imported at the top of the notebook)
# writes the same float WAV; note its argument order is (path, rate, data).
# NOTE(review): `sr` holds the rate of the last file loaded (test_x_02.wav);
# confirm it matches the training recordings.
wavfile.write('train_recovered_1dcnn.wav', sr, s_pred)

files.download('train_recovered_1dcnn.wav')

In [0]:
#On test signal 1

# `output` depends only on X, so the training targets are not fed here.
# The dropout rate is 0.0 because this is inference, not training.
data = {X: test1_mod, dropout_var : 0.0}
test1_output = sess.run(output, feed_dict=data)

# Combine the predicted magnitudes with the phase of the noisy input.
# exp(1j*angle(.)) equals test1/|test1| for non-zero bins and stays finite for
# zero-magnitude bins, where the plain division would give NaN.
test1_phase = np.exp(1j * np.angle(test1))
test1_hat = test1_phase * np.transpose(test1_output)
test1_pred = librosa.istft(test1_hat, win_length = 1024, hop_length = 512)

# librosa.output.write_wav was removed in librosa 0.8.0; scipy's wavfile
# (imported at the top of the notebook) takes (path, rate, data).
wavfile.write('test_s_01_recons_1dcnn.wav', sr, test1_pred)

files.download('test_s_01_recons_1dcnn.wav')

In [0]:
#On test signal 2

# `output` depends only on X, so the training targets are not fed here.
# The dropout rate is 0.0 because this is inference, not training.
data = {X: test2_mod, dropout_var : 0.0}
test2_output = sess.run(output, feed_dict=data)

# Combine the predicted magnitudes with the phase of the noisy input.
# exp(1j*angle(.)) equals test2/|test2| for non-zero bins and stays finite for
# zero-magnitude bins, where the plain division would give NaN.
test2_phase = np.exp(1j * np.angle(test2))
test2_hat = test2_phase * np.transpose(test2_output)
test2_pred = librosa.istft(test2_hat, win_length = 1024, hop_length = 512)

# librosa.output.write_wav was removed in librosa 0.8.0; scipy's wavfile
# (imported at the top of the notebook) takes (path, rate, data).
wavfile.write('test_s_02_recons_1dcnn.wav', sr, test2_pred)

files.download('test_s_02_recons_1dcnn.wav')