In [13]:
import os, re, json
from pprint import pprint
from concurrent.futures import ThreadPoolExecutor
from IPython.display import Audio, display


import torch
import whisperx
import my_utils

In [2]:
# Presumably the audio sample rate in Hz used throughout the notebook — TODO confirm
# it matches the rate the loader resamples to.
SR = 16_000

In [3]:
# Locate the Audacity project file inside the session directory, then parse it.
audacity_dir = '../data/db_02-03-2023/'
aup_filename = my_utils.find_aup_file(audacity_dir)
path = os.path.join(audacity_dir, aup_filename)

# `project` is the parsed project description; it is pretty-printed for inspection.
project = my_utils.parse_audacity_project(path)
pprint(project)

{'data_path': '../data/db_02-03-2023\\V8DcTChKFF_data',
 'files': [{'filename': '1-CS12_4510.ogg', 'offset': 0.0},
           {'filename': '2-Crux_4429.ogg', 'offset': 0.0},
           {'filename': '3-JadePixie_7138.ogg', 'offset': 0.0},
           {'filename': '4-IlMaximuslI_9218.ogg', 'offset': 0.0}],
 'project_name': 'V8DcTChKFF_data',
 'root_path': '../data/db_02-03-2023'}


In [5]:
# NOTE: sequential loader kept for reference only; the next cell loads the same files in parallel.
# waveforms = []
# for file in project.get('files'):
#     filename = file.get('filename')
#     filepath = os.path.join(project.get('data_path'), filename)
#     print(filepath)
#     wav = whisper.load_audio(filepath)
#     # vizualize_waveform(wav)
#     waveforms.append((filename, wav))

In [4]:
# Build the full path of every track file listed in the project.
data_path = project.get('data_path')
filepaths = [
    os.path.join(data_path, track.get('filename'))
    for track in project.get('files')
]

# Load all tracks concurrently; results come back in the same order as `filepaths`.
with ThreadPoolExecutor() as pool:
    waveforms = list(pool.map(whisperx.load_audio, filepaths))
waveforms

[array([0.00344849, 0.00344849, 0.003479  , ..., 0.        , 0.        ,
        0.        ], dtype=float32),
 array([0., 0., 0., ..., 0., 0., 0.], dtype=float32),
 array([0., 0., 0., ..., 0., 0., 0.], dtype=float32),
 array([0., 0., 0., ..., 0., 0., 0.], dtype=float32)]

In [5]:
# Track file names in project order (same order used to build `waveforms`).
filenames = [track.get('filename') for track in project['files']]

In [6]:
# Load the Silero VAD model from a directory on disk (source='local').
silero_dir = '../models/silero-vad'
silero_vad, utils = torch.hub.load(
    repo_or_dir=silero_dir,
    model='silero_vad',
    source='local',
    force_reload=True,
    onnx=False,
)

# `utils` is unpacked positionally into the helper callables shipped with the model.
get_speech_timestamps, save_audio, read_audio, VADIterator, collect_chunks = utils

In [7]:
def process_vad(audio):
    """Run voice-activity detection on ``audio``.

    Thin wrapper that calls ``get_speech_timestamps`` with the module-level
    ``silero_vad`` model and returns its result unchanged.
    """
    return get_speech_timestamps(audio, silero_vad)


In [9]:
# Run VAD once per loaded track, in the same order as `waveforms`, so the result
# stays aligned with `filenames` however many tracks the project contains
# (the previous per-track cells silently ignored any track past the fourth).
vad_segs = [process_vad(waveform) for waveform in waveforms]

# Numbered aliases kept for any cell that still refers to a track by index.
# Like the original per-track cells, this requires at least four tracks.
vad_segs_0, vad_segs_1, vad_segs_2, vad_segs_3 = vad_segs[:4]

In [30]:
# Write each track's VAD segments to ./cache/<audio filename>.json.
cache_dir = './cache/'
# open(..., 'w') raises FileNotFoundError if the directory is missing, so create it first.
os.makedirs(cache_dir, exist_ok=True)

for file, vad in zip(filenames, vad_segs):
    data = {'audio_file': file, 'segments': vad}
    with open(os.path.join(cache_dir, f"{file}.json"), 'w') as f:
        json.dump(data, f)


In [1]:
def load_segments_from_json(path):
    """Read the JSON file at ``path`` and return its decoded contents."""
    with open(path, 'r') as handle:
        return json.load(handle)

In [14]:
# Read back one cached result to check what was written for the first track.
path = "./cache/1-CS12_4510.ogg.json"
load_segments_from_json(path)

{'audio_file': '1-CS12_4510.ogg',
 'segments': [{'start': 2080, 'end': 17376},
  {'start': 363552, 'end': 382432},
  {'start': 384544, 'end': 416224},
  {'start': 425504, 'end': 444896},
  {'start': 538144, 'end': 566752},
  {'start': 622112, 'end': 651744},
  {'start': 760352, 'end': 791008},
  {'start': 1220640, 'end': 1233888},
  {'start': 1236512, 'end': 1245152},
  {'start': 3328032, 'end': 3381216},
  {'start': 3382816, 'end': 3391456},
  {'start': 4515360, 'end': 4598752},
  {'start': 4825120, 'end': 4868064},
  {'start': 4872224, 'end': 4885472},
  {'start': 5272096, 'end': 5298656},
  {'start': 5327904, 'end': 5380576},
  {'start': 5434400, 'end': 5486560},
  {'start': 5989920, 'end': 6035936},
  {'start': 6096928, 'end': 6110176},
  {'start': 6243360, 'end': 6324192},
  {'start': 6438944, 'end': 6590432},
  {'start': 6596128, 'end': 6666208},
  {'start': 6716448, 'end': 6748640},
  {'start': 6757408, 'end': 6816736},
  {'start': 6854688, 'end': 6866912},
  {'start': 6876192, 