In [66]:
import torchaudio
import noisereduce as nr
import torch as tr
import matplotlib.pyplot as plt

In [67]:
x1, sr = torchaudio.load('../normalized/sensor_1.wav')
x2, sr = torchaudio.load('../normalized/sensor_2.wav')
x3, sr = torchaudio.load('../normalized/sensor_3.wav')
X = tr.concat([x1, x2, x3], dim=0)

N = len(x1[0])
n = 3
tt = tr.arange(N) / sr
ii = tr.linspace(0, N, n, dtype=tr.int32)

In [68]:
def long_or(a, w):
    w = int(w)
    y = tr.zeros(a.shape)
    for i in range(w, a.shape[0]-w , w//4):
        y[i-w:i+w] = True in a[i-w:i+w]
    return y

In [None]:
rms = lambda x: tr.sqrt(tr.mean(x**2))

In [69]:
def moving_average(samples, window_size):
    # samples: (..., time)
    y = []
    for j in range(samples.shape[0]):
        for i in range(-window_size//2, window_size//2):
            y.append(tr.roll(samples[j], i, dims=0))
    return tr.mean(tr.row_stack(y), dim=0, keepdim=True)

In [70]:
def windows(length, window_size):
    for j in range(0, length, window_size):
        yield j, j + window_size - 1

# Frequency Range Preprocessing

In [71]:
# https://seaindia.in/blogs/human-voice-frequency-range/
X = torchaudio.functional.lowpass_biquad(X, sr, 3500)
X = torchaudio.functional.highpass_biquad(X, sr, 80)

# Amplitude Classifier

In [72]:

C = []
ease_l = 10000
threshold = 1.2

for x in X: # for each sensor
    E = tr.sqrt(tr.mean(x**2)) # Calculate a baseline
    c = tr.abs(x) > E * threshold
    c = long_or(c, .1 * sr) # expand acceptance bands

    ease_in = tr.where(tr.diff(c) > 0)[0]
    ease_out = tr.where(tr.diff(c) < 0)[0]
    
    for i in ease_in:
        ii = tr.arange(max(0, i-ease_l), min(i+ease_l, len(c)))
        c[ii] = (ii - ii[0]) / len(ii)
    
    for i in ease_out:
        ii = tr.arange(max(0, i-ease_l), min(i+ease_l, len(c)))
        c[ii] = 1 - (ii - ii[0]) / len(ii)

    C.append(c)
    # plt.fill_between(tt, c, alpha=.3)
    # plt.yticks([])
C = tr.vstack(C)

# Fusion
Use volume discriminator to separate high volume and low volume sounds (segmentation).

In [77]:
# Segmentation
X_high = X * C
iC = 1 - C
X_low = X * iC

X_low /= rms(X_low) * 10

X_noise = []
for i in range(3):
    x = X_low[i][iC[i] > .5]
    x /= rms(x) * 10
    X_noise.append(x)

In [78]:
torchaudio.save('high_segment.wav', X_high, sr)
torchaudio.save('low_segment.wav', tr.unsqueeze(tr.cat(X_noise), 0), sr)

Cross apply noise segments and use moving average as fusion function.

In [None]:
X_clean = X_high.clone()

for i in range(3):
    # Cross apply each noise sample onto each vocal sample. Noise applied sequentially
    for j in range(3):
        X_clean[i] = tr.tensor(nr.reduce_noise(y=X_clean[i], y_noise=X_noise[j],
                                               sr=sr,
                                               prop_decrease=.6 if i == j else .4,
                                               thresh_n_mult_nonstationary=2,
                                               stationary=False))

In [None]:
W = tr.reshape(tr.sqrt(tr.mean(X_clean**2, axis=1)), (3,1))
W /= W.sum()
print(W)
print(X_clean.shape)
x_clean = moving_average(X_clean * W, 7)

tensor([[0.5465],
        [0.3570],
        [0.0966]])
torch.Size([3, 9595771])


Attempt volume regularization. i.e. making the quiet segments of the cleaned signal louder.

In [None]:
y = x_clean.clone()
for i, j in windows(N, int(.1 * sr)):
    i -= int(.05 * sr)
    j += int(.05 * sr)
    y[i:j] /= rms(y[i:j])
x_clean = y

In [None]:
torchaudio.save('output.wav', x_clean * .1, sr)