In [52]:
import torchaudio
import noisereduce as nr
import torch as tr
import matplotlib.pyplot as plt

In [53]:
# Load the three normalized sensor recordings. Each call returns a
# (waveform, sample_rate) pair.
x1, sr1 = torchaudio.load('../normalized/sensor_1.wav')
x2, sr2 = torchaudio.load('../normalized/sensor_2.wav')
x3, sr3 = torchaudio.load('../normalized/sensor_3.wav')

# Every later cell uses one shared `sr`, so the recordings must agree on it;
# fail loudly here instead of silently keeping whichever rate was read last.
assert sr1 == sr2 == sr3, f"sensor sample rates differ: {sr1}, {sr2}, {sr3}"
sr = sr1

# Stack the sensors along dim 0 so that iterating over X visits one row per
# loaded channel.
X = tr.concat([x1, x2, x3], dim=0)

N = len(x1[0])  # number of samples in the first channel of sensor 1
n = 3  # number of evenly spaced index marks generated in `ii`
tt = tr.arange(N) / sr  # time stamp of each sample (sample index / sample rate)
ii = tr.linspace(0, N, n, dtype=tr.int32)  # n sample indices spread from 0 to N

In [54]:
def long_or(a, w):
    """Widen the truthy regions of a 1-D mask using overlapping windows.

    Windows of width ``2 * w`` are centred every ``w // 4`` samples (at least
    every sample), starting at index ``w`` and stopping before
    ``len(a) - w``. Every window that contains at least one truthy element of
    ``a`` is filled with 1 in the result.

    Args:
        a: 1-D mask tensor (the caller in this notebook passes a bool tensor).
        w: Half-width of each window in samples; truncated with ``int``.

    Returns:
        A tensor of ``a``'s shape in torch's default float dtype, holding 1.0
        inside every window that contained a hit and 0.0 elsewhere. Samples
        that no window covers (possible near the end of the signal) stay 0.
    """
    w = int(w)
    y = tr.zeros(a.shape)
    # w // 4 collapses to 0 for w < 4, and range() rejects a zero step.
    step = max(1, w // 4)
    for i in range(w, a.shape[0] - w, step):
        # Only ever set, never clear: the windows overlap, so assigning the
        # per-window result directly would let a later window without a hit
        # erase samples that an earlier window had already accepted.
        if a[i - w:i + w].any():
            y[i - w:i + w] = 1
    return y

In [55]:
def moving_average(samples, window_size):
    """Fuse all rows of ``samples`` into one smoothed trace.

    Each row is rolled (circularly, via ``tr.roll``) by every shift in
    ``range(-window_size // 2, window_size // 2)``; all shifted copies of all
    rows are stacked and averaged together, so the result mixes both across
    rows and across neighbouring samples.

    Args:
        samples: tensor indexed by row along dim 0; each row is rolled along
            its own first axis. Assumes a 2-D (rows, time) layout -- TODO
            confirm against callers.
        window_size: number of shifts applied to each row.

    Returns:
        The mean over the stacked shifted rows, with the reduced dimension
        kept (shape ``(1, time)`` for 2-D input).
    """
    shifts = range(-window_size // 2, window_size // 2)
    shifted_rows = [
        tr.roll(row, shift, dims=0)
        for row in samples
        for shift in shifts
    ]
    stacked = tr.row_stack(shifted_rows)
    return stacked.mean(dim=0, keepdim=True)

In [56]:
def windows(length, window_size):
    """Yield ``(start, end)`` index pairs for consecutive windows.

    Starts are ``0, window_size, 2 * window_size, ...`` below ``length``;
    each ``end`` is ``start + window_size - 1``. The last pair is not clipped,
    so its ``end`` can exceed ``length - 1`` when ``length`` is not a multiple
    of ``window_size``.
    """
    yield from (
        (start, start + window_size - 1)
        for start in range(0, length, window_size)
    )

# Frequency Range Preprocessing

In [57]:
# https://seaindia.in/blogs/human-voice-frequency-range/
# Keep only the 80 Hz - 3500 Hz band: low-pass at 3500 Hz first, then
# high-pass the result at 80 Hz.
# NOTE: X is overwritten with its filtered version, so re-running this cell
# filters the signal again.
X = torchaudio.functional.highpass_biquad(
    torchaudio.functional.lowpass_biquad(X, sample_rate=sr, cutoff_freq=3500),
    sample_rate=sr,
    cutoff_freq=80,
)

# Amplitude Classifier

In [58]:

C = []
ease_l = 10000   # half-width, in samples, of the linear fade around each band edge
threshold = 1.3  # a sample counts as loud when |x| exceeds threshold * RMS

for x in X: # for each sensor
    E = tr.sqrt(tr.mean(x**2)) # Calculate a baseline (RMS over the whole signal)
    c = tr.abs(x) > E * threshold
    c = long_or(c, .25 * sr) # expand acceptance bands (window half-width: a quarter of sr)

    # Locate the band edges once, before any ramp is written into c:
    # a positive step is a rising edge, a negative step a falling edge.
    edges = tr.diff(c)
    ease_in = tr.where(edges > 0)[0]
    ease_out = tr.where(edges < 0)[0]

    # Slice bounds are used instead of an index tensor named `ii`, which
    # would overwrite the notebook-level `ii` defined in the loading cell.
    for i in ease_in:
        lo, hi = max(0, int(i) - ease_l), min(int(i) + ease_l, len(c))
        c[lo:hi] = tr.arange(hi - lo) / (hi - lo)  # linear fade-in, 0 -> just under 1

    for i in ease_out:
        lo, hi = max(0, int(i) - ease_l), min(int(i) + ease_l, len(c))
        c[lo:hi] = 1 - tr.arange(hi - lo) / (hi - lo)  # linear fade-out, 1 -> just above 0

    C.append(c)
    # plt.fill_between(tt, c, alpha=.3)
    # plt.yticks([])
C = tr.vstack(C)  # one mask row per row of X

# Fusion
Use the volume discriminator to separate high-volume and low-volume sounds (segmentation).

In [59]:
# Segmentation: weight every sample of X by the mask C (X_high) and by its
# complement 1 - C (X_low).
X_high, X_low = X * C, X * (1 - C)

Cross-apply the noise segments and use a moving average as the fusion function.

In [61]:
# Write both segmentations to WAV files at the shared sample rate.
for wav_path, segment in (('high_segment.wav', X_high), ('low_segment.wav', X_low)):
    torchaudio.save(wav_path, segment, sr)