In [1]:
import os, re, json
from IPython.display import Audio, display
import torch
import whisper
from typing import List

In [2]:
from utils import VAD, AVR

In [3]:
# Grab a short file for testing if it doesn't exist.
example_file = './data/en_example.wav'
if not os.path.exists(example_file):
    torch.hub.download_url_to_file('https://models.silero.ai/vad_models/en.wav', example_file)

In [5]:
import utils.prepare as prep

_ = prep.get_avr_data(example_file, refresh=True, model_name='medium.en')

Transcribing AVR data


100%|██████████| 4494/4494 [00:19<00:00, 229.30frames/s]

Done. Data saved: './data/en_example.wav_avr.json'





### Caching Data

In [4]:
def cache_json(data: object, path: str):
    with open(path, 'w') as f:
        json.dump(data, f)

In [5]:
def load_json(path):
    with open(path, 'r') as f:
        data = json.load(f)
    return data

## From Audio to VAD segments with cacheing

In [6]:
def get_waveform(filepath: str):
    """Returns a tensor from an audio file."""
    wav = whisper.load_audio(filepath)
    return torch.from_numpy(wav)

In [7]:
def new_vad_data(audio):
    vad = VAD()
    segments = vad.get_speech_timestamps(audio)
    return segments

In [8]:
def get_vad_data(path:str, waveform=None, refresh=False):
    json_path = f"{path}_vad.json"
    if os.path.exists(json_path) and not refresh:
        segments = load_json(json_path)
    else:
        if waveform == None:
            waveform = get_waveform(path)
        print('Calculating VAD segments.')
        segments = new_vad_data(waveform)
        cache_json(segments, json_path)
        print(f"Done. Data saved: '{json_path}'")
    return segments

In [10]:
segs = get_vad_data(example_file)

### Using VAD segments to extract speech only

In [11]:
def collect_chunks(wav: torch.Tensor, segments: List[dict]):
    """Edits a waveform to include only the segements in a list of segments."""
    chunks = []
    for i in segments:
        chunks.append(wav[i['start']: i['end']])
    return torch.cat(chunks)


In [13]:
def align_chunks(segments: List[dict]):
    """Returns a list of segments to align with the waveform generated by the collect chunks function."""
    chunks = []
    current_frame = 0
    for entry in segments:
        speech_length = entry['end'] - entry['start']
        end_frame = current_frame + speech_length
        chunks.append(
            {'start': current_frame,
             'end': end_frame}
            )
        current_frame = end_frame
    return chunks

In [14]:
def drop_chunks(tss: List[dict],
                wav: torch.Tensor):
    chunks = []
    cur_start = 0
    for i in tss:
        chunks.append((wav[cur_start: i['start']]))
        cur_start = i['end']
    return torch.cat(chunks)

In [15]:
def collect_and_align_chunks(wav: torch.Tensor, segments: List[dict]):
    new_wav = collect_chunks(wav, segments)
    new_segs = align_chunks(segments)
    return new_wav, new_segs

In [16]:
wav2, segs2 = collect_and_align_chunks(*get_vad_data(example_file))

TypeError: collect_and_align_chunks() takes 2 positional arguments but 29 were given

## From Audio to AVR data with cacheing

In [17]:
def new_avr_data(audio, verbose=False):
    avr = AVR()
    results = avr.transcribe(audio, verbose=verbose)
    return results['segments']


In [20]:
def get_avr_data(path, waveform=None, refresh=False):
    json_path = f"{path}_avr.json"
    if os.path.exists(json_path) and not refresh:
        avr_data = load_json(json_path)
    else:
        if waveform==None:
            waveform = get_waveform(path)
        wav = collect_chunks(waveform, get_vad_data(path, waveform=waveform))
        print('Transcribing AVR data')
        avr_data = new_avr_data(wav)
        cache_json(avr_data, json_path)
        print(f"Done. Data saved: '{json_path}'")
    return avr_data

In [21]:
avr_results = get_avr_data(example_file, refresh=True)

Transcribing AVR data


100%|██████████| 4494/4494 [00:09<00:00, 462.13frames/s]

Done. Data saved: './data/en_example.wav_avr.json'





In [None]:
def get_full_text(avr_data):
    return ''.join([segment['text'] for segment in avr_data])

get_full_text(avr_results)

" and says, how do I get to Dublin? And the answer that comes back is, well, I wouldn't start from here, Sonny. That is to say, much of political philosophy develops theories that take no account of where we actually are and how the theories that people argue about in the journals and in the literature actually could be implemented in the world, if at all. And this spills over into normative arguments made by other scholars. Thomas Piketty in his book, Capital in the 21st Century, argues for a 4% global wealth tax. Well, good luck with that. Who's gonna implement a 4% global wealth tax? So when I think about normative questions..."