In [146]:
import os
import json
import numpy as np
import matplotlib.pyplot as plt
from scipy.io import wavfile
from scipy.signal import fftconvolve
import IPython
import pyroomacoustics as pra
from pyroomacoustics.denoise import apply_spectral_sub
from scipy import signal as sps


# Root folder of the assignment data (absolute, machine-specific path).
_DATASET_ROOT = "/home/karim/Desktop/Sonos_Assignment/"

# Both constants keep their trailing slash: the processing cell builds paths
# from them by plain string concatenation (e.g. TRAIN_DIR + "audio/").
TRAIN_DIR = _DATASET_ROOT + "vad_train_set/"
# NOTE(review): TEST_DIR resolves to the same folder as TRAIN_DIR -- confirm
# whether a separate test-set folder was intended here.
TEST_DIR = _DATASET_ROOT + "vad_train_set/"

## Load two channels/spectrograms

In [None]:
# Beamform, denoise and band-pass filter every two-channel recording of the
# training set, then write the enhanced mono signal to TRAIN_DIR/beamformed/.
#
# Names that later cells read after the loop finishes: `fs`, `signal`,
# `filtered_denoised_beamform_signal` (they hold the values of the LAST file).

# --- Room geometry ----------------------------------------------------------
room_dim = [3, 3, 3]  # meters
room_center = [dim / 2 for dim in room_dim]
# Put the two-microphone array in the center of the room: both mics share the
# center's y and z and are offset symmetrically along x.
mic_spacing = 0.071  # meters between the two microphones
R = np.array([[room_center[0] - (mic_spacing / 2), room_center[0] + (mic_spacing / 2)],
              [room_center[1], room_center[1]],
              [room_center[2], room_center[2]]])  # [[x], [y], [z]]

# --- Beamformer parameters --------------------------------------------------
Lg_t = 0.100       # time-domain filter size in seconds
sigma2_n = 5e-7    # scales the identity matrix handed to the MVDR filter design below
fft_len = 512
# The filter length in samples (Lg) depends on the sampling rate, which is only
# known once a file has been read, so it is computed inside the loop. Computing
# it here referenced `fs` before assignment (NameError on a fresh kernel).

# --- Paths ------------------------------------------------------------------
directory = TRAIN_DIR + "audio/"
metadata_directory = TRAIN_DIR + "metadata/"
output_directory = TRAIN_DIR + "beamformed/"

# The first 500 directory entries are skipped.
# NOTE(review): os.listdir() returns entries in arbitrary order, so which files
# are skipped is only stable as long as the filesystem listing is -- confirm
# this is the intended way to resume/partition the run.
pending_files = os.listdir(directory)[500:]

# wavfile.write does not create missing folders, so make sure the target exists.
os.makedirs(output_directory, exist_ok=True)

# Iterate through our data files
for filename in pending_files:
    if not filename.endswith(".wav"):
        continue

    # Load the recording and its metadata (same base name, .json extension)
    audio_file = os.path.join(directory, filename)
    meta_file = os.path.join(metadata_directory, os.path.splitext(filename)[0] + ".json")
    fs, signal = wavfile.read(audio_file)
    signal = pra.normalize(signal.astype(float))
    with open(meta_file) as json_file:
        meta_data = json.load(json_file)

    # Filter length in samples for this file's sampling rate (integer sample count)
    Lg = int(np.ceil(Lg_t * fs))

    # Load azimuth and elevation of the speech source and of the noise source.
    # NOTE(review): the angles are passed straight to np.cos/np.sin, i.e. they
    # are treated as radians -- confirm the metadata is not stored in degrees.
    source_azimuth, source_elevation = meta_data['source_doa']
    noise_azimuth, noise_elevation = meta_data['noise_doa']

    # Setting up room based on rt60
    rt60_tgt = float(meta_data['rt60'])
    e_absorption, max_order = pra.inverse_sabine(rt60_tgt, room_dim)
    room_bf = pra.ShoeBox(room_dim, fs=fs, materials=pra.Material(e_absorption), max_order=max_order)

    # Putting sources in room: spherical -> Cartesian, relative to the room center
    r = 1  # Assume all sources are 1 meter away from the center
    speech_source = [r*np.cos(source_azimuth)*np.cos(source_elevation) + room_center[0],
                     r*np.sin(source_azimuth)*np.cos(source_elevation) + room_center[1],
                     r*np.sin(source_elevation) + room_center[2]]  # [x,y,z]
    noise_source = [r*np.cos(noise_azimuth)*np.cos(noise_elevation) + room_center[0],
                    r*np.sin(noise_azimuth)*np.cos(noise_elevation) + room_center[1],
                    r*np.sin(noise_elevation) + room_center[2]]  # [x,y,z]

    # Here a dummy signal is passed to each source just so the simulation can
    # run; the simulated microphone signals are overwritten further down.
    first_channel = signal.T[0, :]
    room_bf.add_source(speech_source, delay=0., signal=first_channel)
    room_bf.add_source(noise_source, delay=0, signal=np.zeros_like(first_channel))

    # Define our beamformer and attach it to the room as the microphone array
    mics = pra.Beamformer(R, room_bf.fs, N=fft_len, Lg=Lg)
    room_bf.add_microphone_array(mics)

    # Choose beamformer algorithm: MVDR designed from the speech source
    # (sources[0]) against the noise source (sources[1]).
    mics.rake_mvdr_filters(room_bf.sources[0][:1], room_bf.sources[1][:1],
                           sigma2_n * np.eye(mics.Lg * mics.M))
    # Alternatives that were tried:
    #mics.rake_delay_and_sum_weights(room_bf.sources[0][:1],room_bf.sources[1][:1])
    #mics.rake_perceptual_filters(room_bf.sources[0][0:1] , room_bf.sources[1][0:1] , sigma2_n * np.eye(mics.Lg * mics.M))

    # Run simulation
    room_bf.compute_rir()
    room_bf.simulate()

    # Replace microphone signals with the recorded signals in our dataset,
    # i.e. ignoring the dummy sources
    room_bf.mic_array.record(signal.T, fs)

    # Get enhanced signal (time-domain processing)
    beamform_signal = mics.process(FD=False)
    beamform_signal = pra.normalize(beamform_signal)

    # Apply spectral subtraction
    denoised_beamform_signal = apply_spectral_sub(beamform_signal, nfft=512,
                                                  db_reduc=12, lookback=15, beta=20, alpha=3)
    denoised_beamform_signal = pra.normalize(denoised_beamform_signal)

    # Apply narrowband speech filtering: 10th-order Butterworth band-pass, 30-3000 Hz
    sos = sps.butter(10, [30, 3000], 'bandpass', fs=fs, output='sos')
    filtered_denoised_beamform_signal = sps.sosfilt(sos, denoised_beamform_signal)
    filtered_denoised_beamform_signal = pra.normalize(filtered_denoised_beamform_signal)

    # Downsampling to 6kHz [This is very slow, I can do it with librosa when reading the file later]
    #number_of_samples = round(filtered_denoised_beamform_signal.shape[0] * float(6000) / fs)
    #downsampled_filtered_denoised_beamform_signal = sps.resample(filtered_denoised_beamform_signal,
    #                                                             number_of_samples)

    wavfile.write(output_directory + "[filtered_denoised_beamformed]" + filename,
                  fs, filtered_denoised_beamform_signal)

    # plot the room and resulting beamformer before simulation
    #fig, ax = room_bf.plot(freq=[500, 1000, 2000, 4000], img_order=0)
    #fig.set_size_inches(20, 8)
    #ax.legend(['500', '1000', '2000', '4000'])

    # Listen to output
    #print("Beamformed Signal:")
    #IPython.display.Audio(beamform_signal, rate=fs)

## Remove Noise from 3 seconds 

In [None]:
### Add band-pass filter
### Add echo cancelation
### Preemphasis?? 
### Add beamforming using angles and time delay

## Use location info
    ### Either determine location yourself
    ### Or use the groundtruth straight
    ### How to do this Beamforming thing? 

## Use vocals-specific features to enhance the speech

## Merge and output mono enhanced

In [153]:
# Play back the enhanced (beamformed, denoised, band-passed) output of the
# last file handled by the processing loop; relies on `fs` set in that loop.
IPython.display.Audio(data=filtered_denoised_beamform_signal, rate=fs)

In [148]:
# Baseline for comparison: plain average of the two recorded channels of the
# last file handled by the processing loop (no beamforming or denoising).
channel_0, channel_1 = signal[:, 0], signal[:, 1]
IPython.display.Audio(data=(channel_0 + channel_1) / 2, rate=fs)