In [38]:
import wave

from other.audio_utils import *
import os
from tqdm import tqdm
import numpy as np
import webrtcvad
import contextlib
import soundfile as sf

In [39]:
# Notebook-wide settings read by the cells below.
VAD_WINDOW_CHOICES_MS = (10, 20, 30)  # candidate VAD frame lengths, in milliseconds

target_sample_rate = 8000  # Hz
vad_window_ms = VAD_WINDOW_CHOICES_MS[0]
vad_overlap_ratio = 0.5  # fraction of a window shared by two consecutive windows

In [40]:
class WebrtcVadLabelMaker:
    """Produce per-window speech / non-speech labels for an audio file with ``webrtcvad``.

    An instance is callable: ``maker(file_path)`` reads the file, slides a window of
    ``vad_window_ms`` milliseconds over it and returns one 0/1 label per window.
    """

    # Sample rates that ``read_wave`` accepts.
    SUPPORTED_SAMPLE_RATES = (8000, 16000, 32000, 48000)

    @staticmethod
    def read_wave(path):
        """Read a mono 16-bit ``.wav`` or a mono ``.flac`` file as raw PCM bytes.

        Args:
            path: path of the audio file; the extension (case-insensitive) selects the reader.

        Returns:
            ``(pcm_data, sample_rate)`` where ``pcm_data`` is a ``bytes`` object holding
            16-bit samples and ``sample_rate`` is the file's sample rate in Hz.

        Raises:
            AssertionError: if the file is not mono, not 16-bit (wav only) or its sample
                rate is not one of ``SUPPORTED_SAMPLE_RATES``.
            ValueError: if the extension is neither ``.wav`` nor ``.flac``.
        """
        ext = os.path.splitext(path)[1].lower()
        if ext == '.wav':
            with contextlib.closing(wave.open(path, 'rb')) as wf:
                num_channels = wf.getnchannels()
                assert num_channels == 1, "only mono audio is supported"
                sample_width = wf.getsampwidth()
                assert sample_width == 2, "only 16-bit samples are supported"
                sample_rate = wf.getframerate()
                assert sample_rate in WebrtcVadLabelMaker.SUPPORTED_SAMPLE_RATES, \
                    "unsupported sample rate"
                pcm_data = wf.readframes(wf.getnframes())
                return pcm_data, sample_rate
        if ext == '.flac':
            with sf.SoundFile(path, "r") as flac_file:
                # Validate the header before decoding the whole file.
                sample_rate = flac_file.samplerate
                assert sample_rate in WebrtcVadLabelMaker.SUPPORTED_SAMPLE_RATES, \
                    "unsupported sample rate"
                num_channels = flac_file.channels
                assert num_channels == 1, "only mono audio is supported"
                pcm_data = flac_file.read(dtype="int16").tobytes()
                return pcm_data, sample_rate
        # Previously this fell through and returned None, which only surfaced later as an
        # opaque "cannot unpack NoneType" error in the caller.
        raise ValueError(f"Unsupported audio extension {ext!r} for file {path!r}")

    def __init__(self, mode=2, vad_window_ms=30, vad_overlap_ratio=0):
        """Create the labeller.

        Args:
            mode: value forwarded to ``webrtcvad.Vad``.
            vad_window_ms: length of one analysed window, in milliseconds.
            vad_overlap_ratio: fraction of a window shared with the next one (0 means
                back-to-back windows). This value is the one used by ``__call__``; a
                module-level ``vad_overlap_ratio`` variable is NOT consulted, so pass
                the ratio explicitly.
        """
        self.vad_window_ms = vad_window_ms
        self.vad_overlap_ratio = vad_overlap_ratio
        self.vad = webrtcvad.Vad(mode)

    def __call__(self, file_path):
        """Label every complete window of ``file_path``.

        Args:
            file_path: path of a file readable by ``read_wave``.

        Returns:
            A list of ints, 1 where the VAD reports speech and 0 otherwise, one entry
            per complete window. Empty when the file is shorter than one window.

        Raises:
            ValueError: if the configured overlap leaves a hop of less than one sample.
        """
        # Named ``pcm_data`` (not ``wave``) so the stdlib ``wave`` module is not shadowed.
        pcm_data, rate = WebrtcVadLabelMaker.read_wave(file_path)
        window = int(self.vad_window_ms * rate / 1000)
        # Use the instance setting; the original read a notebook global here, which
        # silently ignored the constructor argument.
        step = int((1 - self.vad_overlap_ratio) * window)
        if step < 1:
            raise ValueError(
                f"vad_overlap_ratio={self.vad_overlap_ratio!r} leaves no hop between windows"
            )

        n_samples = len(pcm_data) // 2  # two bytes per 16-bit sample
        if n_samples < window:
            return []
        # "+ 1" counts the window starting at sample 0, so the last complete window
        # is labelled too.
        n_frames = (n_samples - window) // step + 1

        speech_mask = []
        for i in range(n_frames):
            start = i * step
            frame = pcm_data[2 * start:2 * (start + window)]
            speech_mask.append(int(self.vad.is_speech(frame, rate)))

        return speech_mask

In [41]:
openSLR_data_directory, ext = '../data/train-clean-100', 'flac'
# openSLR_data_directory, ext = "../data/MSDWild/raw_wav", 'wav'
where_to_save = '../buffer'

# Pass the overlap ratio explicitly so the labeller is configured with the same
# value that is encoded in the labels file name below.
vad = WebrtcVadLabelMaker(2, vad_window_ms, vad_overlap_ratio)

audio_files_paths = get_files_by_extension(openSLR_data_directory, ext=ext, rel=True)

labels_filename = f'{target_sample_rate}_{vad_window_ms}_{int(vad_overlap_ratio * 100)}_webrtc_labels.csv'
# Create the output directory, not a directory named after the CSV file itself.
os.makedirs(where_to_save, exist_ok=True)
labels_path = os.path.join(where_to_save, labels_filename)
data_samples = len(audio_files_paths)
if data_samples > 0:
    print(data_samples, "files like:", np.random.choice(audio_files_paths))
else:
    # np.random.choice raises on an empty sequence; report the empty result instead.
    print(data_samples, "files found in", openSLR_data_directory)
print(labels_path)

28539 files like: 322\124147\322-124147-0033.flac
../buffer\8000_10_50_webrtc_labels.csv


In [42]:
# Write one CSV row per audio file: the file's base name followed by its labels joined with '-'.
if data_samples <= 0:
    print(len(audio_files_paths), "audio files not found")
else:
    with open(labels_path, 'w') as file:
        file.write("filename,labels\n")

        for audio_path in tqdm(audio_files_paths[:], total=data_samples):
            filepath = os.path.join(openSLR_data_directory, audio_path)
            stamps = vad(filepath)
            # Paths are relative to the data directory; keep only the last component.
            filename = audio_path.split(os.sep)[-1]
            joined_labels = '-'.join(str(label) for label in stamps)

            file.write(f"{filename},{joined_labels}\n")

  9%|▉         | 2610/28539 [00:11<01:56, 223.18it/s]


KeyboardInterrupt: 

In [None]:
# Rewrite a labels file so that each '-'-separated value becomes the difference to the
# previous value in the same row (the first value is kept as-is).
# NOTE(review): `pd` is not imported in any visible cell; presumably it is re-exported by
# `from other.audio_utils import *` - confirm, or import pandas explicitly.
# NOTE(review): the input is a ".txt" file, while the cell above writes ".csv" - confirm
# this is the intended source file.
# Read `labels` as text so single-value rows are not parsed as numbers.
df = pd.read_csv(os.path.join(where_to_save, "8000_10_50_webrtc_labels.txt"), dtype={"labels": str})
with open("8000_10_50_webrtc_labels.csv", 'w') as file:
    file.write("filename,labels" + '\n')
    # The progress total is the number of rows actually iterated.
    for _, row in tqdm(df.iterrows(), total=len(df)):
        raw_labels = row['labels']
        k = 0
        counts = []
        # An empty labels field is read back as NaN; emit an empty list for it.
        if not pd.isna(raw_labels):
            for s in raw_labels.split("-"):
                s = int(s)
                counts.append(s - k)
                k = s
        # Use the row's own file name; the original wrote the `filename` variable left
        # over from the previous cell, i.e. the same name on every row.
        file.write(str(row['filename']) + ',' + '-'.join(map(str, counts)) + '\n')

In [None]:
from numpy import histogram

# Collect per-file statistics of the squared samples for the first 3000 MSDWild files.
audio_files_paths = get_files_by_extension("../data/MSDWild/raw_wav", ext='wav', rel=False)

means, maxes, stds, mines = [], [], [], []
for audio_path in tqdm(audio_files_paths[:3000]):
    aw = AudioWorker(audio_path).load()

    squared = aw.wave ** 2  # element-wise square of the loaded samples
    means.append(squared.mean())
    stds.append(squared.std())
    mines.append(squared.min())
    maxes.append(squared.max())




In [None]:
# Distribution of the per-file mean of the squared samples collected in `means`.
plt.hist(means, bins=100)
plt.title("Per-file mean of squared samples")
plt.xlabel("mean(wave ** 2)")
plt.ylabel("number of files");

In [None]:
# Distribution of the per-file standard deviation of the squared samples collected in `stds`.
plt.hist(stds, bins=100)
plt.title("Per-file standard deviation of squared samples")
plt.xlabel("std(wave ** 2)")
plt.ylabel("number of files");


In [None]:
# Distribution of the per-file maximum of the squared samples collected in `maxes`.
plt.hist(maxes, bins=100)
plt.title("Per-file maximum of squared samples")
plt.xlabel("max(wave ** 2)")
plt.ylabel("number of files");


In [None]:
# Distribution of the per-file minimum of the squared samples collected in `mines`.
plt.hist(mines, bins=100)
plt.title("Per-file minimum of squared samples")
plt.xlabel("min(wave ** 2)")
plt.ylabel("number of files");


In [None]:
# Load one LibriSpeech-style file ("<reader>-<chapter>-<utterance>.flac") and label it.
target = "3699-47246-0026.flac"
reader, chapter, *_ = target.split("-")
target_path = os.path.join(openSLR_data_directory, reader, chapter, target)
aw = AudioWorker(target_path).load()
aw.resample(target_sample_rate)
# `vad` is a WebrtcVadLabelMaker: it takes a file path and returns one flat list of 0/1
# labels, so it is called with the path and its result is not unpacked.
# NOTE(review): the labels are therefore computed from the file as stored on disk, not
# from the resampled `aw` - confirm this is the intended input.
labels = vad(target_path)


In [None]:
# Count how many label regions of `label_region_s` seconds fit into the loaded wave.
# NOTE(review): `label_region_s` and `label_overlap_ratio` are not defined in any visible
# cell - presumably they come from the star import or an earlier session; confirm.
print(aw.wave.shape)

items = aw.wave.size(1)
reg_width = int(aw.rate * label_region_s)
region_hop_width = int(reg_width * (1 - label_overlap_ratio))

# Number of hops that fit after the first region (fractional).
hops_after_first = (items - reg_width) / region_hop_width
count = int(np.floor(hops_after_first) + 1)

print(hops_after_first)

print(items, reg_width, region_hop_width, count)