# Intel N-DNS solution evaluation example (Track 1)

> Copyright (C) 2021-22 Intel Corporation: MIT

This notebook provides a guideline for evaluating the Intel N-DNS solution for track 1. We will use the example SDNN baseline solution to guide the evaluation workflow. The evaluation metrics of interest are as follows.
1. **SI-SNR** of the solution
2. **SI-SNRi** of the solution (improvement against both _noisy data_ and _encode+decode_ processing).
3. **Latency** of encode+decode processing
4. **Power** proxy
5. **Power Delay Product (PDP)** proxy

Please refer to the Intel N-DNS challenge meanifesto for more details about the metrics.

In [1]:
import numpy as np
import matplotlib.pyplot as plt
from datetime import datetime
import yaml
import librosa
import torch
import torch.nn.functional as F
from torch.utils.data import DataLoader
import IPython.display as ipd

from lava.lib.dl import slayer
from audio_dataloader import DNSAudio
from snr import si_snr
from dnsmos import DNSMOS

In [2]:
from train_sdnn import collate_fn, stft_splitter, stft_mixer, nop_stats, Network

# Gather the network statistics
## 1. Overload N-DNS `Network` definition

Modify network's `forward` method to log spiking events in each layer.

In [3]:
class InferenceNet(Network):
    def forward(self, noisy):
        x = noisy - self.stft_mean

        counts = []
        for block in self.blocks:
            x = block(x)
            count = torch.mean((torch.abs(x) > 0).to(x.dtype))
            counts.append(count.item())

        mask = torch.relu(x + 1)
        return slayer.axon.delay(noisy, self.out_delay) * mask, torch.tensor(counts)

## 2. Read the training hyperparameters

In [4]:
trained_folder = 'Trained'
args = yaml.safe_load(open(trained_folder + '/args.txt', 'rt'))
if 'out_delay' not in args.keys():
    args['out_delay'] = 0
if 'n_fft' not in args.keys():
    args['n_fft'] = 512
device = torch.device('cuda:0')
root = args['path']
out_delay = args['out_delay']
n_fft = args['n_fft']
win_length = n_fft
hop_length = n_fft // 4
stats = slayer.utils.LearningStats(accuracy_str='SI-SNR', accuracy_unit='dB')

## 3. Create dataset and dataloader instances

In [5]:
train_set = DNSAudio(root=root + 'training_set/')
validation_set = DNSAudio(root=root + 'validation_set/')

train_loader = DataLoader(train_set,
                          batch_size=32,
                          shuffle=True,
                          collate_fn=collate_fn,
                          num_workers=4,
                          pin_memory=True)
validation_loader = DataLoader(validation_set,
                               batch_size=32,
                               shuffle=True,
                               collate_fn=collate_fn,
                               num_workers=4,
                               pin_memory=True)


## 4. Instantiate N-DNS network

In [6]:
net = InferenceNet(args['threshold'],
                   args['tau_grad'],
                   args['scale_grad'],
                   args['dmax'],
                   args['out_delay']).to(device)

## 5. Load trained network

In [7]:
noisy, clean, noise, metadata = train_set[0]
noisy = torch.unsqueeze(torch.FloatTensor(noisy), dim=0).to(device)
noisy_abs, noisy_arg = stft_splitter(noisy, n_fft)
net(noisy_abs)
net.load_state_dict(torch.load(trained_folder + '/network.pt'))

<All keys matched successfully>

## 6. Gather DNSMOS, SI-SNR, SynOPs and NeuronOPs statistics

In [8]:
dnsmos = DNSMOS()
dnsmos_noisy = np.zeros(3)
dnsmos_clean = np.zeros(3)
dnsmos_noise = np.zeros(3)
dnsmos_cleaned  = np.zeros(3)
train_event_counts = []

t_st = datetime.now()
for i, (noisy, clean, noise) in enumerate(train_loader):
    net.eval()
    with torch.no_grad():
        noisy = noisy.to(device)
        clean = clean.to(device)

        noisy_abs, noisy_arg = stft_splitter(noisy, n_fft)
        clean_abs, clean_arg = stft_splitter(clean, n_fft)

        denoised_abs, count = net(noisy_abs)
        train_event_counts.append(count.cpu().data.numpy())
        noisy_arg = slayer.axon.delay(noisy_arg, out_delay)
        clean_abs = slayer.axon.delay(clean_abs, out_delay)
        clean = slayer.axon.delay(clean, win_length * out_delay)

        loss = F.mse_loss(denoised_abs, clean_abs)
        clean_rec = stft_mixer(denoised_abs, noisy_arg, n_fft)
        score = si_snr(clean_rec, clean)

        dnsmos_noisy += np.sum(dnsmos(noisy.cpu().data.numpy()), axis=0)
        dnsmos_clean += np.sum(dnsmos(clean.cpu().data.numpy()), axis=0)
        dnsmos_noise += np.sum(dnsmos(noise.cpu().data.numpy()), axis=0)
        dnsmos_cleaned += np.sum(dnsmos(clean_rec.cpu().data.numpy()), axis=0)

        stats.training.correct_samples += torch.sum(score).item()
        stats.training.loss_sum += loss.item()
        stats.training.num_samples += noisy.shape[0]

        processed = i * train_loader.batch_size
        total = len(train_loader.dataset)
        time_elapsed = (datetime.now() - t_st).total_seconds()
        samples_sec = time_elapsed / (i + 1) / train_loader.batch_size
        header_list = [f'Train: [{processed}/{total} '
                        f'({100.0 * processed / total:.0f}%)]']
        header_list.append(f'Event rate: {[c.item() for c in count]}')
        print(f'\r{header_list[0]}', end='')

dnsmos_clean /= len(train_loader.dataset)
dnsmos_noisy /= len(train_loader.dataset)
dnsmos_noise /= len(train_loader.dataset)
dnsmos_cleaned /= len(train_loader.dataset)

print()
stats.print(0, i, samples_sec, header=header_list)
print('Avg DNSMOS clean   [ovrl, sig, bak]: ', dnsmos_clean)
print('Avg DNSMOS noisy   [ovrl, sig, bak]: ', dnsmos_noisy)
print('Avg DNSMOS noise   [ovrl, sig, bak]: ', dnsmos_noise)
print('Avg DNSMOS cleaned [ovrl, sig, bak]: ', dnsmos_cleaned)

mean_events = np.mean(train_event_counts, axis=0)

neuronops = []
for block in net.blocks[:-1]:
    neuronops.append(np.prod(block.neuron.shape))

synops = []
for events, block in zip(mean_events, net.blocks[1:]):
    synops.append(events * np.prod(block.synapse.shape))
print(f'SynOPS: {synops}')
print(f'Total SynOPS: {sum(synops)}')
print(f'Total NeuronOPS: {sum(neuronops)}')
print(f'Time-step per sample: {noisy_abs.shape[-1]}')

[0A
[2KEvent rate: [0.30871349573135376, 0.2193162888288498, 0.04606522247195244, 0.9994450211524963]
Epoch    0: i =  1874 ,     102.7517 ms elapsed        
Train loss =     0.00380                          SI-SNR = 12.31859 dB
Avg DNSMOS clean   [ovrl, sig, bak]:  [3.29563445 3.57449779 4.08365889]
Avg DNSMOS noisy   [ovrl, sig, bak]:  [2.43390976 3.17245699 2.68303469]
Avg DNSMOS noise   [ovrl, sig, bak]:  [1.34082896 1.57199918 1.92072504]
Avg DNSMOS cleaned [ovrl, sig, bak]:  [2.70185238 3.20471684 3.44368897]
SynOPS: [40514.81315612793, 57311.546875, 6520.753440856934]
Total SynOPS: 104347.11347198486
Total NeuronOPS: 1281
Time-step per sample: 3751


In [9]:
ipd.Audio(noisy[0].cpu(), rate=16000)

In [10]:
ipd.Audio(clean_rec[0].cpu(), rate=16000)

In [11]:
ipd.Audio(clean[0].cpu(), rate=16000)

In [12]:
dnsmos_noisy = np.zeros(3)
dnsmos_clean = np.zeros(3)
dnsmos_noise = np.zeros(3)
dnsmos_cleaned  = np.zeros(3)
valid_event_counts = []

t_st = datetime.now()
for i, (noisy, clean, noise) in enumerate(validation_loader):
    net.eval()
    with torch.no_grad():
        noisy = noisy.to(device)
        clean = clean.to(device)

        noisy_abs, noisy_arg = stft_splitter(noisy, n_fft)
        clean_abs, clean_arg = stft_splitter(clean, n_fft)

        denoised_abs, count = net(noisy_abs)
        valid_event_counts.append(count.cpu().data.numpy())
        noisy_arg = slayer.axon.delay(noisy_arg, out_delay)
        clean_abs = slayer.axon.delay(clean_abs, out_delay)
        clean = slayer.axon.delay(clean, win_length * out_delay)

        loss = F.mse_loss(denoised_abs, clean_abs)
        clean_rec = stft_mixer(denoised_abs, noisy_arg, n_fft)
        score = si_snr(clean_rec, clean)

        dnsmos_noisy += np.sum(dnsmos(noisy.cpu().data.numpy()), axis=0)
        dnsmos_clean += np.sum(dnsmos(clean.cpu().data.numpy()), axis=0)
        dnsmos_noise += np.sum(dnsmos(noise.cpu().data.numpy()), axis=0)
        dnsmos_cleaned += np.sum(dnsmos(clean_rec.cpu().data.numpy()), axis=0)

        stats.validation.correct_samples += torch.sum(score).item()
        stats.validation.loss_sum += loss.item()
        stats.validation.num_samples += noisy.shape[0]

        processed = i * validation_loader.batch_size
        total = len(validation_loader.dataset)
        time_elapsed = (datetime.now() - t_st).total_seconds()
        samples_sec = time_elapsed / (i + 1) / validation_loader.batch_size
        header_list = [f'Valid: [{processed}/{total} '
                        f'({100.0 * processed / total:.0f}%)]']
        header_list.append(f'Event rate: {[c.item() for c in count]}')
        print(f'\r{header_list[0]}', end='')

dnsmos_clean /= len(validation_loader.dataset)
dnsmos_noisy /= len(validation_loader.dataset)
dnsmos_noise /= len(validation_loader.dataset)
dnsmos_cleaned /= len(validation_loader.dataset)

print()
stats.print(0, i, samples_sec, header=header_list)
print('Avg DNSMOS clean   [ovrl, sig, bak]: ', dnsmos_clean)
print('Avg DNSMOS noisy   [ovrl, sig, bak]: ', dnsmos_noisy)
print('Avg DNSMOS noise   [ovrl, sig, bak]: ', dnsmos_noise)
print('Avg DNSMOS cleaned [ovrl, sig, bak]: ', dnsmos_cleaned)

mean_events = np.mean(valid_event_counts, axis=0)

neuronops = []
for block in net.blocks[:-1]:
    neuronops.append(np.prod(block.neuron.shape))

synops = []
for events, block in zip(mean_events, net.blocks[1:]):
    synops.append(events * np.prod(block.synapse.shape))
print(f'SynOPS: {synops}')
print(f'Total SynOPS: {sum(synops)} per time-step')
print(f'Total NeuronOPS: {sum(neuronops)} per time-step')
print(f'Time-step per sample: {noisy_abs.shape[-1]}')

[5A
[2KEvent rate: [0.3181550204753876, 0.21679367125034332, 0.04949130862951279, 0.999442458152771]
Epoch    0: i =  1874 ,     102.5422 ms elapsed        
Train loss =     0.00380                          SI-SNR = 12.31859 dB 
Valid loss =     0.00373                          SI-SNR = 12.50489 dB
Avg DNSMOS clean   [ovrl, sig, bak]:  [3.29491551 3.57421666 4.08289938]
Avg DNSMOS noisy   [ovrl, sig, bak]:  [2.45707635 3.18887228 2.71982605]
Avg DNSMOS noise   [ovrl, sig, bak]:  [1.34840847 1.58526078 1.95771366]
Avg DNSMOS cleaned [ovrl, sig, bak]:  [2.71498773 3.2131026  3.46046059]
SynOPS: [39874.007247924805, 57208.47265625, 6436.094928741455]
Total SynOPS: 103518.57483291626 per time-step
Total NeuronOPS: 1281 per time-step
Time-step per sample: 3751


In [13]:
ipd.Audio(noisy[0].cpu(), rate=16000)

In [14]:
ipd.Audio(clean_rec[0].cpu(), rate=16000)

In [15]:
ipd.Audio(clean[0].cpu(), rate=16000)

# Latency

$\text{latency} = \text{latency}_\text{buffer} + \text{latency}_\text{enc+dec} + \text{latency}_\text{N-DNS}$

## 1. Buffer latency

It is the time required to collect data samples needed by the DNS sample at every time-step. For STFT encoder, it is the `window_length` of STFT processing.

In [16]:
dt = hop_length / metadata['fs']
buffer_latency = dt
print(f'Buffer latency: {buffer_latency * 1000} ms')

Buffer latency: 8.0 ms


## 2. Encode+Decode latency

It is the additional processing time introduced by the encoder+decoder blocks. We will measure the actual computation time on a CPU.

In [17]:
t_st = datetime.now()
for i in range(noisy.shape[0]):
    audio = noisy[i].cpu().data.numpy()
    stft = librosa.stft(audio, n_fft=n_fft, win_length=win_length, hop_length=hop_length)
    istft = librosa.istft(stft, n_fft=n_fft, win_length=win_length, hop_length=hop_length)

time_elapsed = (datetime.now() - t_st).total_seconds()

enc_dec_latency = time_elapsed / noisy.shape[0] / 16000 / 30 * hop_length
print(f'STFT + ISTFT latency: {enc_dec_latency * 1000} ms')

STFT + ISTFT latency: 0.015155475000000002 ms


## 3. N-DNS latency

It is the algorithmic time shift introduced by the N-DNS network (if any). This can be calculated as the peak cross-correlation between the noisy and clean reconstruction audio. This latency should be the desired output delay (`out_delay * dt`) of the network. The cross correlation calculation below is an alternate way of evaluating the N-DNS latency.

In [18]:
dns_delays = []
max_len = 50000  # Only evaluate for first clip of audio
for i in range(noisy.shape[0]):
    delay = np.argmax(np.correlate(noisy[i, :max_len].cpu().data.numpy(),
                                   clean_rec[i, :max_len].cpu().data.numpy(),
                                   'full')) - max_len + 1
    dns_delays.append(delay)
dns_latency = np.mean(dns_delays) / metadata['fs']
print(f'N-DNS latency: {dns_latency * 1000} ms')

N-DNS latency: 0.0 ms


# Audio Quality Metrics

The audio quality metric is measured by
1. $\text{SI-SNR}$ on the validation set.
2. $\text{SI-SNR}$ improvement on the raw audio ($\text{SI-SNRi}_\text{data}$) and encode+decode operation ($\text{SI-SNRi}_\text{enc+dec}$)

> Note: when the testing dataset is released, we shall use $\text{SI-SNR}$ on testing set as the eventual audio quality metric.

In [19]:
base_stats = slayer.utils.LearningStats(accuracy_str='SI-SNR',
                                        accuracy_unit='dB')
nop_stats(validation_loader, base_stats, base_stats.validation, print=False)

Here, $\text{SI-SNRi}_\text{data}$ and $\text{SI-SNRi}_\text{encode+decode}$ are same as STFT-ISTFT is a lossless transformation.

In [20]:
si_snr_i = stats.validation.accuracy - base_stats.validation.accuracy
print(f'SI-SNR  (validation set): {stats.validation.accuracy: .2f} dB')
print(f'SI-SNRi (validation set): {si_snr_i: .2f} dB')

SI-SNR  (validation set):  12.50 dB
SI-SNRi (validation set):  4.88 dB


# Computational Metrics

For Track 1, we will use proxies for **power** and **energy-delay-product** as these require access to actual neuromorphic hardware.

1. __Power proxy:__ To estimate proxy for the power of the N-DNS solution, we use the weighted sum of $\text{NeuronOPS}$ and $\text{SynOPS}$ which typically consume the majority of the power in a neuromorphic system. Based on our silicon characterization of Loihi and Loihi 2, NeuronOps consume approximately $10\times$ energy than SynOps. Therefore, we use the following power proxy:

    $P_\text{proxy} = \text{Effective SynOPS} = \text{SynOPS} + 10 \times \text{NeuronOPS}$

2. __Power delay product proxy:__ Power delay product provides a combined metric of power and latency of the solution. The proxy is defined as:

    $\text{PDP}_\text{proxy} = \text{SynOPS delay product} = P_\text{proxy} \times \text{latency}$

In [21]:
latency = buffer_latency + enc_dec_latency + dns_latency
effective_synops_rate = (sum(synops) + 10 * sum(neuronops)) / dt
synops_delay_product = effective_synops_rate * latency

print(f'Solution Latency                 : {latency * 1000: .3f} ms')
print(f'Power proxy (Effective SynOPS)   : {effective_synops_rate:.3f} ops/s')
print(f'PDP proxy (SynOPS-delay product) : {synops_delay_product: .3f} ops')

Solution Latency                 :  8.015 ms
Power proxy (Effective SynOPS)   : 14541071.854 ops/s
PDP proxy (SynOPS-delay product) :  116548.952 ops
