## インストール(初回のみ)
事前に以下を済ませておくこと。
- pyenv をインストール
- `pyenv install 3.8.13` を実行

In [None]:
# First-time setup only: run install.sh from the current directory via the shell.
# Prerequisites (see the notes above): pyenv is installed and `pyenv install 3.8.13` has been run.
!sh install.sh

---

In [None]:
import torch
import matplotlib.pyplot as plt
from IPython.display import Audio

## parameters

In [None]:
# Shared STFT settings: unpacked into the separation model's constructor, and
# `n_fft` is also handed to `show_spec` when drawing spectrograms.
# `window` holds the window *function* itself, not a precomputed window tensor.
stft_dict = {
    'n_fft': 1024,
    'hop_length': 256,
    'window': torch.hann_window,
}

## IO

In [None]:
from batools.utils.audio.wave import load_wave, metadata_wave
from batools.utils.audio.plot import show_spec, show_wave

In [None]:
def show(znt, fs):
    """Render an audio player plus a waveform/spectrogram figure for a signal.

    Args:
        znt: Audio signal. Its last axis is treated as the sample axis when
            computing the x-axis limit. Presumably a tensor/array shaped
            (channels, samples) or (samples,) -- TODO confirm against
            ``show_wave`` / ``show_spec``.
        fs: Sampling rate. Used as the playback rate and to convert the
            sample count into the upper x-axis limit.

    Returns:
        None. The player and the figure are displayed as side effects.
    """
    # Inline audio player for listening to the signal.
    player = Audio(data=znt, rate=fs)
    display(player)

    # Two stacked panels: waveform on top, spectrogram below.
    fig, (ax_wave, ax_spec) = plt.subplots(
        2, 1,
        figsize=(16, 4),
        tight_layout=True,
    )

    show_wave(znt, fs, ax=ax_wave, color='b')

    # Applied after show_wave so these settings win over anything it configured:
    # hide the top panel's x ticks/label and clamp it to the signal's extent.
    ax_wave.tick_params(axis='x', bottom=False, labelbottom=False)
    ax_wave.set_xlim([0, znt.shape[-1] / fs])
    ax_wave.set_xlabel('')

    # The spectrogram uses the notebook-wide FFT size from `stft_dict`.
    show_spec(znt, fs, stft_dict['n_fft'], ax=ax_spec)

    plt.show()

In [None]:
# Input recording to process.
filename = 'data/test.wav'

# Print the file's metadata before loading the samples.
wave_info = metadata_wave(filename)
print(wave_info)

# Load the samples and the sampling rate, then listen to / inspect the input.
data, fs = load_wave(filename)
show(data, fs)

# `xnt` is the name the separation and silence-removal cells below read from.
xnt = data

## BSS
分離パラメータの詳細は pyroomacoustics のドキュメントを参照
> https://pyroomacoustics.readthedocs.io/en/pypi-release/pyroomacoustics.bss.html
- AuxIVA
- FastMNMF
- ILRMA

以下，使用は非推奨

自己実装
- ~~NMFLGM~~

GPUテンソルでの動作は未検証(CPU上では動作する)
> https://github.com/onolab-tmu/overiva

- ~~OverIVA~~
- ~~OGIVE~~

In [None]:
from batools.utils.audio.bss.auxiva import AuxIVA
from batools.utils.audio.bss.fastmnmf import FastMNMF
from batools.utils.audio.bss.ilrma import ILRMA

# from utils.audio.bss.lgm import NMFLGM
# from utils.audio.bss.overiva import OverIVA
# from utils.audio.bss.ogive import OGIVE

GPUを用いた計算も可能

In [None]:
# Device the separation model and its input are moved to. The note above says
# GPU computation is also possible (presumably by setting this to 'cuda' -- confirm).
device = 'cpu'
# Build the AuxIVA separator from the shared STFT settings and move it to `device`.
model = AuxIVA(**stft_dict).to(device)

In [None]:
# Run separation on the mixture, with the input moved to the model's device.
# `n_iter` is forwarded to the model (presumably the number of update
# iterations -- confirm against the AuxIVA implementation).
ynt = model(xnt.to(device), n_iter=20)

# Play and plot every separated signal. A plain loop is used instead of a
# throwaway list comprehension bound to `_`, which would shadow IPython's
# "last output" variable.
for yt in ynt:
    # The audio player and matplotlib need host memory, so detach and move each
    # signal to the CPU; this is a no-op when `device` is already 'cpu'.
    # Assumes each `yt` is a torch.Tensor -- confirm against the model's output.
    show(yt.detach().cpu(), fs)

## Remove silence

In [None]:
from batools.utils.audio import silence_pyaudioanalysis
from batools.utils.audio.transform import extract_from_section

特に雑音が多い環境下では，必要な帯域のみを使用して無音区間除去を行ったほうが精度が向上することがある

In [None]:
from batools.utils.audio.transform import apply_freq_mask

In [None]:
# Band limits for the mask; both are left as None here and are reused by the
# silence-removal call below.
freq_low = None
freq_high = None

# NOTE(review): this rebinds `ynt`, replacing the separated signals produced by
# the BSS cell with the band-masked version of the original mixture `xnt`.
ynt = apply_freq_mask(
    xnt,
    fs,
    freq_low=freq_low,
    freq_high=freq_high,
)

- 無音区間除去

In [None]:
# Detect non-silent sections on the band-masked signal `ynt`.
# With return_prob=True the call yields two values: the detected sections and a
# dict that the plotting cell reads via its 'probability' and 'threshold' keys.
# NOTE(review): the positional 1000 and 500 look like the analysis window and
# step in milliseconds (the plotting cell repeats each probability value 500
# times on a millisecond axis) -- confirm against silence_removal's signature.
nonsilent_sections, prob_dict = silence_pyaudioanalysis.silence_removal(
    ynt, fs, 1000, 500,
    freq_low=freq_low, freq_high=freq_high,
    min_nonsilence_ms=0,
    broaden_section_ms=0,
    smooth_window_ms=100,
    weight=.5,
    return_prob=True
)

In [None]:
# Detection outputs: per-frame probability and the decision threshold.
pr = prob_dict['probability']
pr_thr = prob_dict['threshold']

# Symmetric amplitude range [-M, M] taken from the mean over axis 0 of the
# mixture, so overlays can be scaled onto the waveform panel.
d_min = xnt.mean(0).min().item()
d_max = xnt.mean(0).max().item()
d_min, d_max = min(d_min, -abs(d_max)), max(d_max, abs(d_min))
d_span = d_max - d_min

# One entry per millisecond of signal: d_min everywhere, d_max inside detected
# sections (section bounds are used directly as indices into this ms axis).
n_ms = xnt.shape[-1] * 1000 // fs
nonsilence = d_min * torch.ones(n_ms)
for sec in nonsilent_sections:
    sec_start, sec_stop = sec[0], sec[1]
    nonsilence[sec_start:sec_stop] = d_max

fig, axes = plt.subplots(2, 1, figsize=(16, 4), tight_layout=True)
ax_wave, ax_spec = axes

show_wave(xnt, fs, ax=ax_wave, color='b')

# Time axis in seconds with one point per millisecond entry.
x_ms = torch.linspace(0, xnt.shape[-1] / fs, steps=nonsilence.shape[0])
n_axis = x_ms.size(0)

# Repeat every probability value 500 times to put it on the millisecond axis,
# then cut or NaN-pad the result so it is exactly as long as that axis.
pr_column = pr[:, None]
pr_ms = pr_column.tile((1, 500)).view(-1)
n_missing = n_axis - min(pr_ms.size(0), n_axis)
pr_ms = torch.nn.functional.pad(
    pr_ms[:n_axis],
    [0, n_missing],
    'constant', torch.nan
)

# Overlays on the waveform: detected sections (red band), probability curve
# and threshold line, both rescaled into the amplitude range.
ax_wave.fill_between(x_ms, nonsilence, d_min, facecolor='r', alpha=.5)
ax_wave.plot(x_ms, d_min + d_span * pr_ms, color='yellowgreen')
ax_wave.axhline(y=d_min + d_span * pr_thr, color='yellow')

# Hide the top panel's x ticks/label and clamp it to the signal's extent.
ax_wave.tick_params(axis='x', bottom=False, labelbottom=False)
ax_wave.set_xlim([0, xnt.shape[-1] / fs])
ax_wave.set_xlabel('')

show_spec(xnt, fs, stft_dict['n_fft'], ax=ax_spec)

plt.show()

- 区間抽出

In [None]:
# Cut every detected non-silent section out of the original (unmasked) mixture.
extracted = []
for section in nonsilent_sections:
    extracted.append(extract_from_section(xnt, fs, section))