In [5]:
import wave
from time import time
import torch

import os
from tqdm import tqdm
import numpy as np
import torchaudio.functional as F
import pandas as pd
import torchaudio

from other.data.audio_utils import AudioWorker
from other.data.datasets import OpenSLRDataset as osd
from other.utils import get_files_by_extension

In [49]:
# Be sure that data is resampled
data_dir = r'..\..\datasets\openslr\8k_pitched'
# data_dir = r'..\..\datasets\openslr\test'
labels_path = r'..\..\datasets\openslr\8000_silero_labels.csv'

In [50]:
pitch_values = [-2, -1, 1, 2]

In [51]:
labels = pd.read_csv(labels_path).dropna()
final_dicts = []

_tqdm = tqdm(labels.iterrows(), total=len(labels))
for idx, row in _tqdm:
    name, ext = row.filename.split('.')

    reader, chapter, _ = name.split('-')
    audio_file_path = os.path.join(data_dir, reader, chapter, name + '.' + ext)

    if not os.path.exists(audio_file_path):
        continue

    aw = AudioWorker(audio_file_path, os.path.basename(audio_file_path))
    aw.load().leave_one_channel()
    final_dicts.append({'filename': name + '.' + ext, 'labels': row.labels})

    for pv in pitch_values:
        save_name = name + f'_{pv}' + '.' + ext
        save_path = os.path.join(data_dir, reader, chapter, save_name)

        if os.path.exists(save_path):
            final_dicts.append({'filename': save_name, 'labels': row.labels})
            continue

        wave = F.pitch_shift(aw.wave.to('cuda'), sample_rate=aw.rate, n_steps=pv)

        torchaudio.save(save_path, wave.detach().cpu(), aw.rate)
        # print(save_path)

        final_dicts.append({'filename': save_name, 'labels': row.labels})

    # row.filename = name + ''.join(['{_' + str(pv) + '}' for pv in pitch_values]) + '.' + ext

new_labels = pd.DataFrame(final_dicts)
new_labels.to_csv(os.path.basename(labels_path).split('.')[0] + "_pitched.csv", index=False)

100%|██████████| 28537/28537 [25:48<00:00, 18.43it/s]


In [54]:
os.path.basename(labels_path)

'8000_silero_labels.csv'

In [52]:
labels_path.split('.')[0] + "_pitched.csv"

'_pitched.csv'

In [19]:
if data_samples > 0:
    with open(labels_path, 'w') as file:
        file.write("filename,labels" + '\n')

    t = tqdm(audio_files_paths[:], total=data_samples)
    webrtcvad_t, write_t = 0, 0
    ma = 0.8
    for i, audio_path in enumerate(t):
        s_vad = time()
        filepath = os.path.join(openSLR_data_directory, audio_path)
        one_stamps = vad(filepath)
        if one_stamps is None:
            continue
        e_vad = time()
        path_parts = audio_path.split(os.sep)
        filename = path_parts[-1]

        with open(labels_path, 'a') as file:
            file.write(filename + ',' + '-'.join(map(str, one_stamps)) + '\n')
        e_write = time()

        webrtcvad_t = ma * webrtcvad_t + (1 - ma) * (e_vad - s_vad)
        write_t = ma * webrtcvad_t + (1 - ma) * (e_write - e_vad)
        if i % 100 == 0:
            t.set_description_str(f"webrtcvad: {webrtcvad_t * 1000:.1f}ms | write: {write_t * 1000:.1f}ms")

else:
    print(len(audio_files_paths), "audio files not found")

webrtcvad: 96.9ms | write: 77.7ms:   1%|          | 336/28539 [00:28<40:30, 11.61it/s]


KeyboardInterrupt: 

In [13]:
df = pd.read_csv(os.path.join(where_to_save, "8000_10_50_webrtc_labels.txt"))
with open("8000_10_50_webrtc_labels.csv", 'w') as file:
    file.write("filename,labels" + '\n')
    for _, row in tqdm(df.iterrows(), total=data_samples):
        k = 0
        counts = []
        for s in row['labels'].split("-"):
            s = int(s)
            counts.append(s - k)
            k = s
        file.write(filename + ',' + '-'.join(map(str, counts)) + '\n')

NameError: name 'pd' is not defined

In [None]:


audio_files_paths = get_files_by_extension("../data/MSDWild/raw_wav", ext='wav', rel=False)

means = []
maxes = []
stds = []
mines = []
for audio_path in tqdm(audio_files_paths[:3000]):
    aw = AudioWorker(audio_path).load()

    w = aw.wave
    E = w ** 2
    means.append(E.mean())
    stds.append(E.std())

    mines.append(E.min())
    maxes.append(E.max())




In [None]:
plt.hist(means, bins=100);

In [None]:
plt.hist(stds, bins=100);


In [None]:
plt.hist(maxes, bins=100);


In [None]:
plt.hist(mines, bins=100);


In [None]:
target = "3699-47246-0026.flac"
reader, chapter, *_ = target.split("-")
aw = AudioWorker(os.path.join(openSLR_data_directory, reader, chapter, target)).load()
aw.resample(target_sample_rate)
_, labels = vad(aw)


In [None]:
print(aw.wave.shape)

items = aw.wave.size(1)
reg_width = int(aw.rate * label_region_s)
region_hop_width = int(reg_width * (1 - label_overlap_ratio))
count = int(np.floor((items - reg_width) / region_hop_width) + 1)

print((items - reg_width) / region_hop_width)

print(items, reg_width, region_hop_width, count)