# MFCC Features for Masks

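Simple example of predicting time-frequency masks for speech denoising. The original description refers to linear regression on MFCCs; the cells below instead train a feed-forward neural network on STFT magnitude frames to predict a binary signal-versus-noise mask.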

In [2]:
# Speech Libraries
from python_speech_features import mfcc, get_filterbanks
from python_speech_features.sigproc import deframesig
import soundfile

## Wave libraries
from IPython.display import Audio
from IPython.display import display
import scipy.io.wavfile as wav

## Standard python libraries
import os,sys
import numpy as np
import matplotlib.pylab as plt
%matplotlib inline

## Data Preparation

In [11]:
sys.path.append('../../src/io')
from hdf5_iterator import Hdf5Iterator


## Noise Training

In [None]:
# Load every FLAC utterance found two directory levels below `audio_dir`
# (speaker/chapter in LibriSpeech's layout) and keep its complex spectrogram.
#
# NOTE(review): `specdecomp`, `nfft`, `winlen` and `winstep` are not defined in
# any cell visible in this notebook -- they must already exist in the kernel.
audio_dir = '/data/fs4/datasets/magnolia/librispeech/LibriSpeech/dev-clean/'
noise_dirs = os.listdir(audio_dir)

# Directory left out of the noise pool.
# NOTE(review): presumably the speaker reserved for `signals` -- confirm.
excluded_speaker = '1272'

# Maps utterance file name -> spectrogram returned by specdecomp(decomp='complex').
noises = {}
for iter_dir in noise_dirs:
    if iter_dir == excluded_speaker:
        continue
    audio_dir_iter = os.path.join(audio_dir, iter_dir)
    if not os.path.isdir(audio_dir_iter):
        # A stray file next to the speaker folders would make os.listdir raise.
        continue
    audio_dir_iter_dirs = os.listdir(audio_dir_iter)
    for audio_dir_iter_dir_iter in audio_dir_iter_dirs:
        chapter_dir = os.path.join(audio_dir_iter, audio_dir_iter_dir_iter)
        if not os.path.isdir(chapter_dir):
            continue
        audio_files = os.listdir(chapter_dir)
        for audio_file in audio_files:
            # Only audio is wanted; anything without a .flac extension is skipped.
            if not audio_file.endswith('.flac'):
                continue
            # `fs` from the last file read stays in scope and is reused by the
            # reconstruction cell further down.
            nsetime, fs = soundfile.read(os.path.join(chapter_dir, audio_file))
            nsespec = specdecomp(nsetime,samplerate=fs,nfft=nfft,
                                 winlen=winlen,winstep=winstep,decomp='complex')
            noises[audio_file] = nsespec

# Stable list of keys so getbatch can pick a noise utterance by random index.
noises_keys = list( noises.keys() )

In [None]:
def getbatch(numsamps, trainsplit=10, nfft=257):
    """Draw a random batch of signal frames, noise frames and binary masks.

    Reads the notebook-level collections ``signals``/``signal_keys`` and
    ``noises``/``noises_keys``; each value is indexed by frame along its
    first axis.

    Parameters
    ----------
    numsamps : int
        Number of frames (rows) to draw.
    trainsplit : int, optional
        Signals are drawn only from the first ``trainsplit`` entries of
        ``signal_keys``.
    nfft : int, optional
        Number of values per frame, i.e. the column count of every
        returned array.

    Returns
    -------
    sigbatch, nsebatch : complex ndarray, shape (numsamps, nfft)
        Randomly chosen signal and noise frames.
    mskbatch : float ndarray, shape (numsamps, nfft)
        1.0 where ``|signal| > |noise|`` and 0.0 elsewhere.
    """
    shape = (numsamps, nfft)
    sigbatch = np.zeros(shape, dtype=complex)
    nsebatch = np.zeros(shape, dtype=complex)

    for row in range(numsamps):
        # Draw order per row: signal key, signal frame, noise key, noise frame.
        utterance = signals[signal_keys[np.random.choice(trainsplit)]]
        sigbatch[row] = utterance[np.random.choice(len(utterance))]

        noise = noises[noises_keys[np.random.choice(len(noises_keys))]]
        nsebatch[row] = noise[np.random.choice(len(noise))]

    # The mask only depends on the finished rows, so build it in one shot.
    mskbatch = (np.abs(sigbatch) > np.abs(nsebatch)).astype(float)

    return sigbatch, nsebatch, mskbatch

# Smoke-test the sampler by drawing one batch of 10 * 1024 frames.
batch_frames = 10 * 1024
sigbatch, nsebatch, mskbatch = getbatch(batch_frames)

## Deep Neural Network Solution

In [None]:
# Run-mode switches for this cell.
train_from_scratch = False   # build and compile a brand-new network
continue_to_train = True     # run the training loop on `model`
save_trained_model = 'dnn-1024-2048-2048-2048.h5'   # output path, or False to skip saving
load_trained_model = False   # path of a saved model to load, or False

import os
# The backend has to be selected before keras is imported.
# os.environ['KERAS_BACKEND']= "tensorflow"
os.environ['KERAS_BACKEND']= "theano"
import keras
from keras.models import Sequential, load_model
from keras.optimizers import Adam
from keras.layers.core import Dense, Activation, Dropout, Lambda
from keras.objectives import binary_crossentropy

if train_from_scratch:
    # 257 inputs -> 1024 -> 2048 -> 2048 -> 2048 -> 257 sigmoid outputs,
    # trained with binary cross-entropy against the 0/1 mask.
    model = Sequential()
    model.add(Dense(input_dim=257, output_dim=1024, init="uniform"))
    model.add(Activation("relu"))
    model.add(Dense(input_dim=1024, output_dim=2048, init="uniform"))
    model.add(Activation("relu"))
    model.add(Dense(input_dim=2048, output_dim=2048, init="uniform"))
    model.add(Activation("relu"))
    model.add(Dense(input_dim=2048, output_dim=2048, init="uniform"))
    model.add(Activation("relu"))
    model.add(Dense(input_dim=2048, output_dim=257))
    model.add(Activation("sigmoid"))
    optimizer_init=Adam(lr=0.001, beta_1=0.9, beta_2=0.999, epsilon=1e-08)
    model.compile(loss=binary_crossentropy, optimizer=optimizer_init)
elif load_trained_model:
    # Resume from a model saved by an earlier run.
    model = load_model(load_trained_model)
elif 'model' not in globals():
    # On a fresh kernel nothing has created `model` yet; fail here with an
    # actionable message instead of an opaque NameError inside the loop.
    raise NameError(
        "No `model` available: set train_from_scratch = True or point "
        "load_trained_model at a saved .h5 file before running this cell."
    )

if continue_to_train:
    for epoch in range(1000):
        # Every outer iteration draws a fresh random batch and fits a single
        # Keras epoch on it: mixture magnitude in, binary mask out.
        sys.stdout.write('\r Getting batch ')
        sigbatch, nsebatch, mskbatch = getbatch(10*1024)
        sys.stdout.write('\r Starting to train')
        modelloss = model.fit(abs(sigbatch+nsebatch), mskbatch, nb_epoch=1, batch_size=32,verbose=0,shuffle=True)
        sys.stdout.write( '\r Epoch: '+str(epoch)+', '+ str(modelloss.history['loss'][0])+'\n' )

if save_trained_model:
    # overwrite=False: an existing file is not silently replaced.
    model.save(save_trained_model, overwrite=False)

## Recover FFT magnitude

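Predict a mask from the magnitude of the noisy spectrum, apply it to that magnitude while keeping the noisy phase, and invert the result frame by frame to recover the time-domain signal. The unmasked mixture is reconstructed the same way for comparison.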

In [None]:
def _frames_to_audio(spectra):
    """Invert each spectral frame, trim it to the window length and
    stitch the frames back into one waveform with deframesig."""
    framelen = int(fs*winlen)
    frames = np.fft.irfft( spectra )[:, :framelen]
    return deframesig(frames, 0, framelen, int(fs*winstep))


# Demo material: one signal spectrogram (scaled by 1.5) and one noise spectrogram.
signaldemo = 1.5*signals[signal_keys[5]]
noisedemo = list( noises.values() )[2000]

# Mix the two over the frames they have in common.
minsamps = min(signaldemo.shape[0], noisedemo.shape[0])
signoise = signaldemo[:minsamps] + noisedemo[:minsamps]

# Mask predicted from the magnitude of the mixture.
prediction = model.predict(abs(signoise))

# Reference: the untouched mixture taken back to the time domain.
signoiserecon = _frames_to_audio(signoise)

# Masked magnitude recombined with the mixture's own phase, then inverted.
maskedspec = prediction * abs(signoise) * np.exp( 1j * np.angle(signoise) )
sigrecon = _frames_to_audio(maskedspec)

# Play the mixture first, then the masked reconstruction.
for clip in (signoiserecon, sigrecon):
    display(Audio(clip, rate=fs))


In [None]:
# NOTE(review): `snda_recon`, `mfcc_magni` and `magni_A` are not defined in any
# cell visible in this notebook -- this cell relies on leftover kernel state.
snda_recon = snda_recon.astype(np.int16)

# Side-by-side log spectra: original on the left, reconstruction on the right.
panels = (
    (121, mfcc_magni, 'Original spectrum'),
    (122, magni_A, 'Reconstructed spectrum'),
)
for position, spectrum, heading in panels:
    plt.subplot(position)
    plt.imshow(np.log(spectrum), aspect=0.4)
    plt.title(heading)
    plt.colorbar()
    plt.ylabel('Time (Sample)')

display(Audio(snda_recon, rate=fs))


In [None]:
# Scratch cell: add up the entries of a small tuple (the cell displays 6).
the_tuple = tuple(range(1, 4))
sum(the_tuple)