In [1]:
import wave
from time import time

import os
from tqdm import tqdm
import numpy as np
import webrtcvad
import contextlib
import soundfile as sf

from other.audio_utils import get_files_by_extension

In [2]:
target_sample_rate = 8000
vad_window_ms = [10, 20, 30][0]
vad_overlap_ratio = 0.5

In [3]:
class WebrtcVadLabelMaker:
    @staticmethod
    def read_wave(path):
        ext = os.path.splitext(path)[1]
        if ext == '.wav':
            with contextlib.closing(wave.open(path, 'rb')) as wf:
                num_channels = wf.getnchannels()
                assert num_channels == 1
                sample_width = wf.getsampwidth()
                assert sample_width == 2
                sample_rate = wf.getframerate()
                assert sample_rate in (8000, 16000, 32000, 48000)
                pcm_data = wf.readframes(wf.getnframes())
                return pcm_data, sample_rate
        elif ext == '.flac':
            with sf.SoundFile(path, "r") as flac_file:
                pcm_data = flac_file.read(dtype="int16").tobytes()
                sample_rate = flac_file.samplerate
                assert sample_rate in (8000, 16000, 32000, 48000)
                num_channels = flac_file.channels
                assert num_channels == 1
                return pcm_data, sample_rate

    def __init__(self, mode=2, vad_window_ms=30, vad_overlap_ratio=0):
        self.vad_window_ms = vad_window_ms
        self.vad_overlap_ratio = vad_overlap_ratio
        self.vad = webrtcvad.Vad(mode)

    def __call__(self, file_path):
        wave, rate = WebrtcVadLabelMaker.read_wave(file_path)
        window = int(self.vad_window_ms * rate / 1000)
        step = int((1 - vad_overlap_ratio) * window)

        samples_count = len(wave) // 2
        ones_regions = []
        last_being = False
        n_frames = int((samples_count - window) / step)
        for i in range(n_frames):
            s = i * step
            e = s + window
            is_speech = self.vad.is_speech(wave[2 * s:2 * (s + window)], rate)

            if last_being and is_speech:  # 1 -> 1
                ones_regions[-1] = e
            elif not last_being and is_speech:  # 0 -> 1
                if not ones_regions or ones_regions[-1] < s:
                    ones_regions.extend([s, e])
                else:
                    ones_regions[-1] = e
                last_being = True
            else:
                last_being = False

        return ones_regions


In [4]:
openSLR_data_directory, ext = '../data/train-clean-100', 'flac'
# openSLR_data_directory, ext = "../data/MSDWild/raw_wav", 'wav'
where_to_save = '../buffer'

vad = WebrtcVadLabelMaker(2, vad_window_ms)

audio_files_paths = get_files_by_extension(openSLR_data_directory, ext=ext, rel=True)

labels_path = f'{target_sample_rate}_{vad_window_ms}_{int(vad_overlap_ratio * 100)}_webrtc_labels.csv'
labels_path = os.path.join(where_to_save, labels_path)
data_samples = len(audio_files_paths)
print(data_samples, "files like:", np.random.choice(audio_files_paths))
print(labels_path)

28539 files like: 831\130739\831-130739-0060.flac
../buffer\8000_10_50_webrtc_labels.csv


In [6]:
if data_samples > 0:
    with open(labels_path, 'w') as file:
        file.write("filename,labels" + '\n')

        t = tqdm(audio_files_paths[:], total=data_samples)
        webrtcvad_t, write_t = 0, 0
        ma = 0.8
        for i, audio_path in enumerate(t):
            s_vad = time()
            filepath = os.path.join(openSLR_data_directory, audio_path)
            one_stamps = vad(filepath)
            e_vad = time()
            path_parts = audio_path.split(os.sep)
            filename = path_parts[-1]


            file.write(filename + ',' + '-'.join(map(str, one_stamps)) + '\n')
            e_write = time()

            webrtcvad_t = ma * webrtcvad_t + (1 - ma) * (e_vad - s_vad)
            write_t = ma * webrtcvad_t + (1 - ma) * (e_write - e_vad)
            if i % 100 == 0:
                t.set_description_str(f"webrtcvad: {webrtcvad_t * 1000:.1f}ms | write: {write_t * 1000:.1f}ms")

else:
    print(len(audio_files_paths), "audio files not found")

webrtcvad: 3.6ms | write: 2.9ms:   4%|▍         | 1122/28539 [00:05<02:16, 200.39it/s]


KeyboardInterrupt: 

In [None]:
df = pd.read_csv(os.path.join(where_to_save, "8000_10_50_webrtc_labels.txt"))
with open("8000_10_50_webrtc_labels.csv", 'w') as file:
    file.write("filename,labels" + '\n')
    for _, row in tqdm(df.iterrows(), total=data_samples):
        k = 0
        counts = []
        for s in row['labels'].split("-"):
            s = int(s)
            counts.append(s - k)
            k = s
        file.write(filename + ',' + '-'.join(map(str, counts)) + '\n')

In [None]:


audio_files_paths = get_files_by_extension("../data/MSDWild/raw_wav", ext='wav', rel=False)

means = []
maxes = []
stds = []
mines = []
for audio_path in tqdm(audio_files_paths[:3000]):
    aw = AudioWorker(audio_path).load()

    w = aw.wave
    E = w ** 2
    means.append(E.mean())
    stds.append(E.std())

    mines.append(E.min())
    maxes.append(E.max())




In [None]:
plt.hist(means, bins=100);

In [None]:
plt.hist(stds, bins=100);


In [None]:
plt.hist(maxes, bins=100);


In [None]:
plt.hist(mines, bins=100);


In [None]:
target = "3699-47246-0026.flac"
reader, chapter, *_ = target.split("-")
aw = AudioWorker(os.path.join(openSLR_data_directory, reader, chapter, target)).load()
aw.resample(target_sample_rate)
_, labels = vad(aw)


In [None]:
print(aw.wave.shape)

items = aw.wave.size(1)
reg_width = int(aw.rate * label_region_s)
region_hop_width = int(reg_width * (1 - label_overlap_ratio))
count = int(np.floor((items - reg_width) / region_hop_width) + 1)

print((items - reg_width) / region_hop_width)

print(items, reg_width, region_hop_width, count)