In [1]:
from audio_utils import *
import os
from tqdm import tqdm
import numpy as np
import webrtcvad
import torch

In [2]:
# Labelling configuration. Every value below also becomes part of the name of
# the output directory, so each parameter combination gets its own label set.

# 1. Sample rate (Hz).
target_sample_rate = 8000
# 2. VAD frame length in milliseconds; the candidate values are 10, 20 and 30.
vad_window_ms = 30
# 3. Overlap between consecutive VAD frames, as a fraction (0.5 == 50%).
vad_overlap_ratio = 0.5
# 4. Length of one label region in seconds (0.1 s == 100 ms).
label_region_s = 0.1
# 5. Overlap between consecutive label regions, as a fraction (0.5 == 50%).
label_overlap_ratio = 0.5
# 6. Name of the function that reduces a region to one label; only "max" exists.
deciding_method = "max"

In [3]:
class WebrtcVadLabelMaker:
    """Turn WebRTC VAD frame decisions into per-sample and per-region speech labels.

    The waveform is cut into (optionally overlapping) VAD frames, every frame is
    classified by ``webrtcvad``, the frame decisions are spread back onto the
    samples they cover, and finally fixed-width label regions are each reduced
    to a single boolean by the configured deciding function.
    """

    # Frame durations (in milliseconds) accepted by the constructor.
    VAD_WIDTHS = [10, 20, 30]

    @staticmethod
    def max_count_deciding(items) -> bool:
        """Majority vote over a sequence of speech flags.

        Args:
            items: array-like of flags; every element is interpreted as a boolean.

        Returns:
            True when strictly more than half of the flags are set. Ties and
            empty input yield False.
        """
        flags = np.asarray(items, dtype=bool)
        return bool(2 * np.count_nonzero(flags) > flags.size)

    def __init__(self, mode=2, vad_window_ms=30, vad_overlap_ratio=0, deciding_method='max'):
        """Configure the VAD and the region-level deciding function.

        Args:
            mode: passed unchanged to ``webrtcvad.Vad``.
            vad_window_ms: VAD frame length in milliseconds; one of ``VAD_WIDTHS``.
            vad_overlap_ratio: fraction of a VAD frame shared with the next one,
                in ``[0, 1)``; 0 means back-to-back frames.
            deciding_method: key into ``DECIDING_FUNCTIONS_DICT``.

        Raises:
            ValueError: if ``vad_window_ms`` or ``vad_overlap_ratio`` is out of range.
            KeyError: if ``deciding_method`` is not a known deciding function.
        """
        if vad_window_ms not in self.VAD_WIDTHS:
            raise ValueError(f"vad_window_ms must be one of {self.VAD_WIDTHS}, got {vad_window_ms!r}")
        if not 0 <= vad_overlap_ratio < 1:
            raise ValueError(f"vad_overlap_ratio must be in [0, 1), got {vad_overlap_ratio!r}")

        self.vad_window_ms = vad_window_ms
        self.vad_overlap_ratio = vad_overlap_ratio
        self.vad = webrtcvad.Vad(mode)

        self.DECIDING_FUNCTIONS_DICT = {'max': WebrtcVadLabelMaker.max_count_deciding}
        self.decider = self.DECIDING_FUNCTIONS_DICT[deciding_method]

    def __call__(self, au: "AudioWorker", label_region_sec=0.1, label_overlap_ratio=0.5):
        """Label one audio clip.

        Args:
            au: object exposing ``wave`` and ``rate``.
                NOTE(review): assumes ``au.wave`` is a (1, n_samples) float
                tensor scaled to [-1, 1] and ``au.rate`` is an integer sample
                rate the VAD accepts - confirm against AudioWorker.
            label_region_sec: width of one label region in seconds.
            label_overlap_ratio: fraction of a label region shared with the
                next one, in ``[0, 1)``.

        Returns:
            ``(item_wise_mask, region_labels)``: a boolean numpy array with one
            entry per sample, and a list with one bool per label region.

        Raises:
            ValueError: if ``label_overlap_ratio`` is outside ``[0, 1)`` or the
                region is shorter than one sample.
        """
        if not 0 <= label_overlap_ratio < 1:
            raise ValueError(f"label_overlap_ratio must be in [0, 1), got {label_overlap_ratio!r}")

        simple_wave = au.wave.squeeze(0)
        n_samples = simple_wave.shape[0]

        vad_window = int(self.vad_window_ms * au.rate / 1000)
        # The hop is the part of a frame that is NOT shared with the next frame,
        # so an overlap of 0 means hop == window. Never let it reach 0.
        vad_hop = max(1, int(round(vad_window * (1 - self.vad_overlap_ratio))))

        # Samples that no complete VAD frame covers (the tail of the clip, or a
        # clip shorter than one frame) stay False.
        item_wise_mask = np.zeros(n_samples, dtype=bool)
        if 0 < vad_window <= n_samples:
            frames = simple_wave.unfold(0, vad_window, vad_hop)
            # Clamp before the cast so samples outside [-1, 1] saturate instead
            # of overflowing the 16-bit PCM the VAD is fed.
            pcm16 = frames.mul(32767).clamp(-32768, 32767).to(torch.int16).numpy()
            for i, frame in enumerate(pcm16):
                if self.vad.is_speech(frame.tobytes(), au.rate):
                    frame_start = i * vad_hop
                    # A sample is speech if any frame covering it is speech.
                    item_wise_mask[frame_start:frame_start + vad_window] = True

        reg_width = int(au.rate * label_region_sec)
        if reg_width < 1:
            raise ValueError(f"label_region_sec={label_region_sec!r} is shorter than one sample at rate {au.rate}")
        region_hop_width = max(1, int(round(reg_width * (1 - label_overlap_ratio))))

        # Only regions that fit completely inside the clip are labelled.
        count = (n_samples - reg_width) // region_hop_width + 1 if n_samples >= reg_width else 0
        region_labels = [
            # Each region spans the full reg_width, so neighbours overlap.
            self.decider(item_wise_mask[start:start + reg_width])
            for start in range(0, count * region_hop_width, region_hop_width)
        ]

        return item_wise_mask, region_labels

In [4]:
# Root folder of the audio corpus to label.
openSLR_data_directory = 'OpenSLR/train-clean-100'

# Label maker built from the configuration values; mode 2 is passed to the VAD.
vad = WebrtcVadLabelMaker(
    mode=2,
    vad_window_ms=vad_window_ms,
    vad_overlap_ratio=vad_overlap_ratio,
    deciding_method=deciding_method,
)

# Every .flac file found under the corpus folder.
audio_files_paths = OpenSLRDataset.get_files_by_extension(openSLR_data_directory, ext='flac')

# Output folder name encodes the whole configuration:
# rate _ VAD window (ms) _ VAD overlap (%) _ region (ms) _ region overlap (%) _ decider.
labels_dir = f'{target_sample_rate}_{vad_window_ms}_{int(vad_overlap_ratio * 100)}_{int(label_region_s * 1000)}_{int(label_overlap_ratio * 100)}_{deciding_method}'

print(labels_dir)

8000_30_50_100_50_max


In [20]:
# Write one label file per audio file, mirroring the corpus layout under labels_dir.
if len(audio_files_paths) > 0:
    os.makedirs(labels_dir, exist_ok=True)

    for audio_path in tqdm(audio_files_paths):
        # NOTE(review): target_sample_rate is only used in the directory name;
        # nothing visible here resamples, so confirm load() yields that rate.
        aw = AudioWorker(os.path.join(openSLR_data_directory, audio_path)).load()
        # Only the per-region labels are stored; the per-sample mask is dropped.
        _, labels = vad(aw)

        # Same relative path as the audio file, with a .txt extension.
        destination_file_path = OpenSLRDataset.change_file_extension(os.path.join(labels_dir, audio_path), ".txt")
        label_dir = os.path.dirname(destination_file_path)

        os.makedirs(label_dir, exist_ok=True)

        # One character per region: '1' for speech, '0' otherwise.
        with open(destination_file_path, 'w') as file:
            file.write(''.join(str(int(flag)) for flag in labels))

100%|██████████| 28539/28539 [18:40<00:00, 25.46it/s]


In [6]:
# Scratch check of the extension-swapping helper that the labelling loop uses
# to name its output files. The star import repeats the one in the first cell.
from audio_utils import *
OpenSLRDataset.change_file_extension("mynameis.file", ".txt")

'mynameis.txt'