In [4]:
import numpy as np
import torch
import ffmpeg

In [5]:
import speechbrain as sb
from speechbrain.dataio.dataio import read_audio
from IPython.display import Audio

In [6]:
# Sample rate (Hz) used as the default target rate by `load_audio`.
SR = 16000

# Pick the compute device: the GPU when CUDA is available, the CPU otherwise.
if torch.cuda.is_available():
    DEVICE = 'cuda'
else:
    DEVICE = 'cpu'

## Functions

In [7]:
def load_audio(file: str, sr: int = SR) -> torch.Tensor:
    """
    Open an audio file and read as mono waveform, resampling as necessary

    Parameters
    ----------
    file: str
        The audio file to open

    sr: int
        The sample rate to resample the audio if necessary

    Returns
    -------
    A 1-D torch.Tensor containing the audio waveform, in float32 dtype,
    with samples scaled into the range [-1, 1).

    Raises
    ------
    RuntimeError
        If ffmpeg reports an error while decoding; ffmpeg's stderr output is
        included in the exception message.
    """
    try:
        # This launches a subprocess to decode audio while down-mixing and resampling as necessary.
        # Requires the ffmpeg CLI and `ffmpeg-python` package to be installed.
        out, _ = (
            ffmpeg.input(file, threads=0)
            .output("-", format="s16le", acodec="pcm_s16le", ac=1, ar=sr)
            .run(cmd=["ffmpeg", "-nostdin"], capture_stdout=True, capture_stderr=True)
        )
    except ffmpeg.Error as e:
        # ffmpeg's stderr is raw bytes and is not guaranteed to be valid UTF-8
        # (e.g. it can echo file names); decode leniently so the real failure
        # is not masked by a UnicodeDecodeError.
        raise RuntimeError(f"Failed to load audio: {e.stderr.decode(errors='replace')}") from e
    # int16 samples span [-32768, 32767]; dividing by 32768 maps them into [-1, 1).
    out = np.frombuffer(out, np.int16).flatten().astype(np.float32) / 32768.0
    return torch.from_numpy(out)

## Speaker Recognition

In [14]:
from speechbrain.pretrained import SpeakerRecognition
# Load the pretrained `speechbrain/spkrec-ecapa-voxceleb` speaker-verification
# model, caching its files under `pretrained_models/`.
verification = SpeakerRecognition.from_hparams(
    source="speechbrain/spkrec-ecapa-voxceleb",
    savedir="pretrained_models/spkrec-ecapa-voxceleb",
    # Use the notebook-wide DEVICE setting instead of a hard-coded 'cpu', so the
    # model runs on the GPU when one is available.
    run_opts={'device': DEVICE},
)


In [9]:
# Compare the two example recordings with the loaded verification model.
# `score` and `prediction` are printed below, prediction first.
score, prediction = verification.verify_files(
    "speechbrain/spkrec-ecapa-voxceleb/example1.wav",
    "speechbrain/spkrec-ecapa-voxceleb/example2.flac",
)

print(prediction, score)

tensor([False], device='cuda:0') tensor([0.1635], device='cuda:0')


In [10]:
# Decode both example recordings into waveform tensors via `load_audio`
# (default sample rate).
wav1, wav2 = load_audio('./example1.wav'), load_audio('./example2.flac')

In [16]:
# Cosine-similarity module comparing inputs along dimension 1; `eps` guards
# against division by zero for near-zero-norm vectors.
cos = torch.nn.CosineSimilarity(eps=1e-6, dim=1)

## VAD

In [2]:
import transformers

# Exploratory: print the built-in documentation for the audio-classification
# pipeline class. The output is long; consider removing this cell (or clearing
# its output) once the API is understood.
help(transformers.AudioClassificationPipeline)

Help on class AudioClassificationPipeline in module transformers.pipelines.audio_classification:

class AudioClassificationPipeline(transformers.pipelines.base.Pipeline)
 |  AudioClassificationPipeline(*args, **kwargs)
 |  
 |  Audio classification pipeline using any `AutoModelForAudioClassification`. This pipeline predicts the class of a
 |  raw waveform or an audio file. In case of an audio file, ffmpeg should be installed to support multiple audio
 |  formats.
 |  
 |  Example:
 |  
 |  ```python
 |  >>> from transformers import pipeline
 |  
 |  >>> classifier = pipeline(model="superb/wav2vec2-base-superb-ks")
 |  >>> classifier("https://huggingface.co/datasets/Narsil/asr_dummy/resolve/main/1.flac")
 |  [{'score': 0.997, 'label': '_unknown_'}, {'score': 0.002, 'label': 'left'}, {'score': 0.0, 'label': 'yes'}, {'score': 0.0, 'label': 'down'}, {'score': 0.0, 'label': 'stop'}]
 |  ```
 |  
 |  Learn more about the basics of using a pipeline in the [pipeline tutorial](../pipeline_tutor