In [2]:
import torch
from transformers import Data2VecAudioForAudioFrameClassification, AutoProcessor, AutoFeatureExtractor
from datasets import load_dataset
from PIL import Image
import requests

In [3]:
device = torch.device("cuda:0" if torch.cuda.is_available() else "cpu")
device

device(type='cuda', index=0)

In [4]:
version = "facebook/data2vec-audio-base-960h"

# dataset

https://github.com/facebookresearch/ImageBind#usage

For windows users, you might need to install librosa and soundfile for reading/writing audio files. (Thanks @congyue1977)

`pip install soundfile librosa`

In [None]:
dataset = load_dataset("hf-internal-testing/librispeech_asr_demo", "clean", split="validation")
dataset = dataset.sort("id")
dataset

Dataset({
    features: ['file', 'audio', 'text', 'speaker_id', 'chapter_id', 'id'],
    num_rows: 73
})

In [None]:
sampling_rate = dataset.features["audio"].sampling_rate
sampling_rate

16000

In [None]:
dataset[0]

{'file': 'C:/Users/Administrator/.cache/huggingface/datasets/downloads/extracted/b49df5cb4e26d70a35c542fbe0eadc8bfee0f971809886d2131859668faeba1c/dev_clean/1272/128104\\1272-128104-0000.flac',
 'audio': {'path': 'C:/Users/Administrator/.cache/huggingface/datasets/downloads/extracted/b49df5cb4e26d70a35c542fbe0eadc8bfee0f971809886d2131859668faeba1c/dev_clean/1272/128104\\1272-128104-0000.flac',
  'array': array([0.00238037, 0.0020752 , 0.00198364, ..., 0.00042725, 0.00057983,
         0.0010376 ]),
  'sampling_rate': 16000},
 'text': 'MISTER QUILTER IS THE APOSTLE OF THE MIDDLE CLASSES AND WE ARE GLAD TO WELCOME HIS GOSPEL',
 'speaker_id': 1272,
 'chapter_id': 128104,
 'id': '1272-128104-0000'}

In [None]:
# get multi array
[d["array"] for d in dataset[:2]["audio"]]

[array([0.00238037, 0.0020752 , 0.00198364, ..., 0.00042725, 0.00057983,
        0.0010376 ]),
 array([-1.52587891e-04, -9.15527344e-05, -1.83105469e-04, ...,
         9.76562500e-04,  9.46044922e-04, -4.88281250e-04])]

In [None]:
# get multi text
[d for d in dataset[:2]["text"]]

['MISTER QUILTER IS THE APOSTLE OF THE MIDDLE CLASSES AND WE ARE GLAD TO WELCOME HIS GOSPEL',
 "NOR IS MISTER QUILTER'S MANNER LESS INTERESTING THAN HIS MATTER"]

# AutoProcessor(same as AutoFeatureExtractor)

In [7]:
processor: AutoProcessor = AutoProcessor.from_pretrained(version)
processor

Wav2Vec2Processor:
- feature_extractor: Wav2Vec2FeatureExtractor {
  "do_normalize": true,
  "feature_extractor_type": "Wav2Vec2FeatureExtractor",
  "feature_size": 1,
  "padding_side": "right",
  "padding_value": 0.0,
  "processor_class": "Wav2Vec2Processor",
  "return_attention_mask": true,
  "sampling_rate": 16000
}

- tokenizer: Wav2Vec2CTCTokenizer(name_or_path='facebook/data2vec-audio-base-960h', vocab_size=32, model_max_length=1000000000000000019884624838656, is_fast=False, padding_side='right', truncation_side='right', special_tokens={'bos_token': '<s>', 'eos_token': '</s>', 'unk_token': '<unk>', 'pad_token': '<pad>'}, clean_up_tokenization_spaces=True)

## processor

In [8]:
inputs = processor(dataset[0]["audio"]["array"], sampling_rate=sampling_rate, return_tensors="pt").to(device, torch.float16)
inputs

{'input_values': tensor([[0.0386, 0.0337, 0.0322,  ..., 0.0070, 0.0095, 0.0169]],
       device='cuda:0'), 'attention_mask': tensor([[1, 1, 1,  ..., 1, 1, 1]], device='cuda:0', dtype=torch.int32)}

In [9]:
inputs["input_values"].shape

torch.Size([1, 93680])

In [10]:
processor.decode(5)

'E'

In [11]:
processor.batch_decode(range(40))

['<pad>',
 '<s>',
 '</s>',
 '<unk>',
 '',
 'E',
 'T',
 'A',
 'O',
 'N',
 'I',
 'H',
 'S',
 'R',
 'D',
 'L',
 'U',
 'M',
 'W',
 'C',
 'F',
 'G',
 'Y',
 'P',
 'B',
 'V',
 'K',
 "'",
 'X',
 'J',
 'Q',
 'Z',
 '<unk>',
 '<unk>',
 '<unk>',
 '<unk>',
 '<unk>',
 '<unk>',
 '<unk>',
 '<unk>']

# Data2VecAudioForAudioFrameClassification

Data2VecAudio Model with a frame classification head on top for tasks like Speaker Diarization.

In [12]:
model: Data2VecAudioForAudioFrameClassification = Data2VecAudioForAudioFrameClassification.from_pretrained(version, torch_dtype=torch.float16).to(device)
model

Some weights of Data2VecAudioForAudioFrameClassification were not initialized from the model checkpoint at facebook/data2vec-audio-base-960h and are newly initialized: ['classifier.weight', 'classifier.bias']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


Data2VecAudioForAudioFrameClassification(
  (data2vec_audio): Data2VecAudioModel(
    (feature_extractor): Data2VecAudioFeatureEncoder(
      (conv_layers): ModuleList(
        (0): Data2VecAudioConvLayer(
          (conv): Conv1d(1, 512, kernel_size=(10,), stride=(5,), bias=False)
          (layer_norm): LayerNorm((512,), eps=1e-05, elementwise_affine=True)
          (activation): GELUActivation()
        )
        (1-4): 4 x Data2VecAudioConvLayer(
          (conv): Conv1d(512, 512, kernel_size=(3,), stride=(2,), bias=False)
          (layer_norm): LayerNorm((512,), eps=1e-05, elementwise_affine=True)
          (activation): GELUActivation()
        )
        (5-6): 2 x Data2VecAudioConvLayer(
          (conv): Conv1d(512, 512, kernel_size=(2,), stride=(2,), bias=False)
          (layer_norm): LayerNorm((512,), eps=1e-05, elementwise_affine=True)
          (activation): GELUActivation()
        )
      )
    )
    (feature_projection): Data2VecAudioFeatureProjection(
      (layer_nor

In [13]:
model.eval()
with torch.inference_mode():
    outputs = model(**inputs)
outputs

TokenClassifierOutput(loss=None, logits=tensor([[[ 7.5365e-02,  4.7647e-01],
         [ 7.5365e-02,  4.7647e-01],
         [ 7.5364e-02,  4.7647e-01],
         [ 7.5365e-02,  4.7647e-01],
         [ 7.5365e-02,  4.7647e-01],
         [-5.3869e-02,  4.9465e-01],
         [-2.8681e-02,  4.8524e-01],
         [-6.2587e-03,  4.8217e-01],
         [-1.9858e-02,  4.9937e-01],
         [ 2.0373e-03,  4.8342e-01],
         [-5.6541e-04,  4.7674e-01],
         [-9.6416e-03,  4.8140e-01],
         [-2.5676e-02,  4.9231e-01],
         [-2.8853e-02,  4.9478e-01],
         [-8.0962e-03,  4.8340e-01],
         [-7.0779e-03,  4.7318e-01],
         [ 2.0098e-02,  4.9051e-01],
         [ 3.9949e-02,  4.8465e-01],
         [ 7.1933e-02,  4.1313e-01],
         [ 5.3954e-02,  3.3029e-01],
         [ 4.6347e-02,  2.8744e-01],
         [ 7.3757e-02,  2.5687e-01],
         [ 9.0989e-02,  2.2813e-01],
         [ 9.8402e-02,  1.8492e-01],
         [ 1.2422e-01,  1.8679e-01],
         [ 1.8969e-01,  2.6615e-01]

In [14]:
model.config.id2label

{0: 'LABEL_0', 1: 'LABEL_1'}

In [15]:
print(outputs.logits.shape)

torch.Size([1, 292, 2])


In [16]:
# labels is a one-hot array of shape (num_frames, num_speakers)
probabilities = torch.sigmoid(outputs.logits[0])
probabilities

tensor([[0.5188, 0.6169],
        [0.5188, 0.6169],
        [0.5188, 0.6169],
        [0.5188, 0.6169],
        [0.5188, 0.6169],
        [0.4865, 0.6212],
        [0.4928, 0.6190],
        [0.4984, 0.6183],
        [0.4950, 0.6223],
        [0.5005, 0.6186],
        [0.4999, 0.6170],
        [0.4976, 0.6181],
        [0.4936, 0.6206],
        [0.4928, 0.6212],
        [0.4980, 0.6186],
        [0.4982, 0.6161],
        [0.5050, 0.6202],
        [0.5100, 0.6188],
        [0.5180, 0.6018],
        [0.5135, 0.5818],
        [0.5116, 0.5714],
        [0.5184, 0.5639],
        [0.5227, 0.5568],
        [0.5246, 0.5461],
        [0.5310, 0.5466],
        [0.5473, 0.5661],
        [0.5462, 0.5707],
        [0.5135, 0.5762],
        [0.4529, 0.4955],
        [0.4928, 0.6499],
        [0.5164, 0.5299],
        [0.5099, 0.5310],
        [0.5391, 0.4204],
        [0.6019, 0.5347],
        [0.4948, 0.4575],
        [0.4610, 0.6205],
        [0.4405, 0.5906],
        [0.4930, 0.4670],
        [0.4

In [17]:
labels = (probabilities > 0.5).long()
labels

tensor([[1, 1],
        [1, 1],
        [1, 1],
        [1, 1],
        [1, 1],
        [0, 1],
        [0, 1],
        [0, 1],
        [0, 1],
        [1, 1],
        [0, 1],
        [0, 1],
        [0, 1],
        [0, 1],
        [0, 1],
        [0, 1],
        [1, 1],
        [1, 1],
        [1, 1],
        [1, 1],
        [1, 1],
        [1, 1],
        [1, 1],
        [1, 1],
        [1, 1],
        [1, 1],
        [1, 1],
        [1, 1],
        [0, 0],
        [0, 1],
        [1, 1],
        [1, 1],
        [1, 0],
        [1, 1],
        [0, 0],
        [0, 1],
        [0, 1],
        [0, 0],
        [0, 0],
        [0, 1],
        [1, 1],
        [1, 1],
        [1, 1],
        [0, 1],
        [1, 1],
        [0, 1],
        [0, 1],
        [1, 1],
        [0, 1],
        [0, 1],
        [0, 1],
        [1, 1],
        [1, 1],
        [0, 0],
        [0, 1],
        [0, 1],
        [0, 0],
        [0, 1],
        [0, 0],
        [0, 0],
        [1, 1],
        [1, 1],
        