In [1]:
# Speech Libraries
from python_speech_features import mfcc, get_filterbanks
from python_speech_features.sigproc import deframesig
from psf_supplement import specdecomp

## Wave libraries
from IPython.display import Audio
from IPython.display import display
import scipy.io.wavfile as wav

## Standard python libraries
import numpy as np
import sys
import matplotlib.pylab as plt
%matplotlib inline

## Load three source recordings and mix them

In [2]:
# Read the three isolated source recordings that will be mixed together.
audio_dir = "/data/fs4/datasets/magnolia/sisec/dev/"
fs, snda = wav.read(audio_dir+"dev_Ca1_Ce_A_src.wav")
fs_b, sndb = wav.read(audio_dir+"dev_Ca1_Ce_B_src.wav")
fs_c, sndc = wav.read(audio_dir+"dev_Sq1_Ce_B_src.wav")

# A single `fs` is used for all framing below, so the three files must
# share one sample rate; fail loudly instead of silently keeping the last one.
if not (fs == fs_b == fs_c):
    raise ValueError("Source files have different sample rates: %s, %s, %s"
                     % (fs, fs_b, fs_c))

# wav.read yields integer PCM samples for integer-encoded files (commonly
# int16). Summing three such arrays in their native dtype wraps around on
# loud passages, so the mixture is accumulated in float64 instead.
sndabc = snda.astype(np.float64) + sndb + sndc

## Preprocessing Spectral and MFCC features

In [3]:
# Analysis settings shared by every feature-extraction call in this cell.
# winlen / winstep are in seconds (they are multiplied by fs to get samples).
nfilt, numcep = 64, 64
nfft = 512
winlen, winstep = 0.01, 0.005
ceplifter = 0

# Keyword arguments common to all specdecomp calls
frame_opts = dict(samplerate=fs, nfft=nfft, winlen=winlen, winstep=winstep)

# MFCC features of the A + B + C mixture
mfcc_feat = mfcc(sndabc, fs, numcep=numcep, nfilt=nfilt, nfft=nfft,
                 winlen=winlen, winstep=winstep, ceplifter=ceplifter,
                 appendEnergy=False)

# Magnitude and phase of the mixture. specdecomp comes from
# psf_supplement.py rather than python_speech_features itself.
mfcc_magni = specdecomp(sndabc, decomp='abs', **frame_opts)
mfcc_phase = specdecomp(sndabc, decomp='phase', **frame_opts)

# Magnitude of each isolated source, needed to label which source
# dominates each entry of the mixture.
magni_A, magni_B, magni_C = [
    specdecomp(source, decomp='abs', **frame_opts)
    for source in (snda, sndb, sndc)
]

# Boolean masks: an entry is True for a source only where its magnitude is
# strictly greater than both other sources (ties leave every mask False).
mask_A = (magni_A > magni_B) & (magni_A > magni_C)
mask_B = (magni_B > magni_A) & (magni_B > magni_C)
mask_C = (magni_C > magni_A) & (magni_C > magni_B)

## Forked neural network model

In [4]:
import os
# os.environ['KERAS_BACKEND']= "tensorflow"
os.environ['KERAS_BACKEND']= "theano"
import keras
from keras.models import Sequential, load_model
from keras.optimizers import Adam
from keras.layers.core import Dense, Activation, Dropout, Lambda
from keras.objectives import binary_crossentropy

# Shared trunk: two 2048-unit ReLU layers fed with the MFCC features.
model = Sequential()
model.add(Dense(input_dim=mfcc_feat.shape[1], output_dim=2048, init="uniform"))
model.add(Activation("relu"))
model.add(Dense(input_dim=2048, output_dim=2048))
model.add(Activation("relu"))

# Three forks, one per source. Every fork wraps the same `model` object and
# then gets its own 257-unit sigmoid output layer.
modelA, modelB, modelC = Sequential(), Sequential(), Sequential()
forks = (modelA, modelB, modelC)

# Each stage is applied to A, B, C in turn before moving to the next stage,
# so layers are created in the same order as writing the calls out by hand.
for fork in forks:
    fork.add(model)

for fork in forks:
    fork.add(Dense(input_dim=2048, output_dim=257))

for fork in forks:
    fork.add(Activation("sigmoid"))

# Every fork is compiled with its own Adam instance (kept on the model as
# `optimizer_init`) and a binary cross-entropy loss.
for fork in forks:
    fork.optimizer_init = Adam(lr=0.001, beta_1=0.9, beta_2=0.999, epsilon=1e-08)
    fork.compile(loss=binary_crossentropy, optimizer=fork.optimizer_init)

Using Theano backend.
Using gpu device 0: GeForce GTX TITAN X (CNMeM is disabled, cuDNN 5110)


## Iterate through training

In [5]:
# Each epoch runs a single Keras epoch on every fork in the order A, B, C,
# then rewrites one status line with the three losses.
training_pairs = ((modelA, mask_A), (modelB, mask_B), (modelC, mask_C))

for epoch in range(150):
    modelAloss, modelBloss, modelCloss = [
        net.fit(mfcc_feat, target, nb_epoch=1, batch_size=32, verbose=0, shuffle=1)
        for net, target in training_pairs
    ]
    mAl, mBl, mCl = [
        run.history['loss'][0]
        for run in (modelAloss, modelBloss, modelCloss)
    ]

    # '\r' returns to the start of the line so the status overwrites itself
    losses_text = ','.join(str(value) for value in (mAl, mBl, mCl))
    sys.stdout.write('\rEpoch ' + str(epoch) + ', loss=(' + losses_text + ')')
    sys.stdout.flush()

Epoch 149, loss=(0.0013783499418,0.000612506327556,0.00147323150306))

## Test on training data

In [6]:
# Predict a mask from each fork, using the same MFCC features the forks
# were trained on.
mask_A_recon, mask_B_recon, mask_C_recon = [
    net.predict(mfcc_feat) for net in (modelA, modelB, modelC)
]

## Reconstruct the three source signals from the predicted masks

In [7]:
def _reconstruct(mask, magnitude, phase, frame_len, frame_step):
    """Turn a predicted mask into a time-domain signal.

    Parameters
    ----------
    mask : array
        Predicted mask, multiplied element-wise with `magnitude`.
    magnitude, phase : array
        Magnitude and phase of the mixture, same shape as `mask`.
    frame_len, frame_step : int
        Frame length and hop, in samples.

    Returns
    -------
    The overlap-added signal produced by `deframesig`.
    """
    # Masked magnitude recombined with the mixture phase -> complex spectrum
    spectrum = (mask * magnitude) * np.exp(1j * phase)
    # Inverse real FFT along the last axis, one row per frame. Only the first
    # frame_len samples are kept; presumably the rest is FFT zero-padding.
    frames = np.fft.irfft(spectrum)[:, :frame_len]
    # siglen=0 is passed through unchanged from the original call
    return deframesig(frames, 0, frame_len, frame_step)


# Frame sizes in samples. python_speech_features rounds these to the nearest
# sample, so round here too; int() truncation can land one sample short when
# fs*winlen is not exactly representable in floating point.
frame_len = int(np.floor(fs * winlen + 0.5))
frame_step = int(np.floor(fs * winstep + 0.5))

snda_recon = _reconstruct(mask_A_recon, mfcc_magni, mfcc_phase, frame_len, frame_step)
sndb_recon = _reconstruct(mask_B_recon, mfcc_magni, mfcc_phase, frame_len, frame_step)
sndc_recon = _reconstruct(mask_C_recon, mfcc_magni, mfcc_phase, frame_len, frame_step)

In [8]:
# Render one audio player per reconstructed source, in the order A, B, C
for recon in (snda_recon, sndb_recon, sndc_recon):
    display(Audio(recon, rate=fs))