In [2]:
import torch
from transformers import Data2VecAudioForXVector, AutoProcessor, AutoFeatureExtractor
from datasets import load_dataset
from PIL import Image
import requests

In [3]:
# Run on the first CUDA GPU when one is visible, otherwise stay on the CPU.
if torch.cuda.is_available():
    device = torch.device("cuda:0")
else:
    device = torch.device("cpu")
device

device(type='cuda', index=0)

In [4]:
# Hugging Face Hub checkpoint id; reused below to load both the processor and the model.
version = "facebook/data2vec-audio-base-960h"

# dataset

Installation note adapted from the ImageBind README: https://github.com/facebookresearch/ImageBind#usage

For Windows users: you may need to install `librosa` and `soundfile` to read and write audio files. (Thanks @congyue1977)

`pip install soundfile librosa`

In [5]:
# Load the "clean" configuration's validation split of the LibriSpeech demo
# set and order the rows by their "id" column in a single expression.
dataset = load_dataset(
    "hf-internal-testing/librispeech_asr_demo",
    "clean",
    split="validation",
).sort("id")
dataset

Dataset({
    features: ['file', 'audio', 'text', 'speaker_id', 'chapter_id', 'id'],
    num_rows: 73
})

In [6]:
# Read the sampling rate declared on the dataset's "audio" feature; it is
# handed to the processor later so both agree on the rate.
audio_feature = dataset.features["audio"]
sampling_rate = audio_feature.sampling_rate
sampling_rate

16000

In [7]:
# Inspect a single row to see which fields each example carries.
dataset[0]

{'file': 'C:/Users/Administrator/.cache/huggingface/datasets/downloads/extracted/b49df5cb4e26d70a35c542fbe0eadc8bfee0f971809886d2131859668faeba1c/dev_clean/1272/128104\\1272-128104-0000.flac',
 'audio': {'path': 'C:/Users/Administrator/.cache/huggingface/datasets/downloads/extracted/b49df5cb4e26d70a35c542fbe0eadc8bfee0f971809886d2131859668faeba1c/dev_clean/1272/128104\\1272-128104-0000.flac',
  'array': array([0.00238037, 0.0020752 , 0.00198364, ..., 0.00042725, 0.00057983,
         0.0010376 ]),
  'sampling_rate': 16000},
 'text': 'MISTER QUILTER IS THE APOSTLE OF THE MIDDLE CLASSES AND WE ARE GLAD TO WELCOME HIS GOSPEL',
 'speaker_id': 1272,
 'chapter_id': 128104,
 'id': '1272-128104-0000'}

In [8]:
# Collect the raw waveform arrays of the first two examples.
first_two_audio = dataset[:2]["audio"]
[clip["array"] for clip in first_two_audio]

[array([0.00238037, 0.0020752 , 0.00198364, ..., 0.00042725, 0.00057983,
        0.0010376 ]),
 array([-1.52587891e-04, -9.15527344e-05, -1.83105469e-04, ...,
         9.76562500e-04,  9.46044922e-04, -4.88281250e-04])]

In [9]:
# Collect the transcripts of the first two examples.
list(dataset[:2]["text"])

['MISTER QUILTER IS THE APOSTLE OF THE MIDDLE CLASSES AND WE ARE GLAD TO WELCOME HIS GOSPEL',
 "NOR IS MISTER QUILTER'S MANNER LESS INTERESTING THAN HIS MATTER"]

# AutoProcessor

In [10]:
# AutoProcessor is only a factory: `from_pretrained` returns the concrete
# processor class registered for the checkpoint (the repr below shows a
# Wav2Vec2Processor). The variable is therefore left unannotated rather than
# annotated with a type it never has.
processor = AutoProcessor.from_pretrained(version)
processor

Wav2Vec2Processor:
- feature_extractor: Wav2Vec2FeatureExtractor {
  "do_normalize": true,
  "feature_extractor_type": "Wav2Vec2FeatureExtractor",
  "feature_size": 1,
  "padding_side": "right",
  "padding_value": 0.0,
  "processor_class": "Wav2Vec2Processor",
  "return_attention_mask": true,
  "sampling_rate": 16000
}

- tokenizer: Wav2Vec2CTCTokenizer(name_or_path='facebook/data2vec-audio-base-960h', vocab_size=32, model_max_length=1000000000000000019884624838656, is_fast=False, padding_side='right', truncation_side='right', special_tokens={'bos_token': '<s>', 'eos_token': '</s>', 'unk_token': '<unk>', 'pad_token': '<pad>'}, clean_up_tokenization_spaces=True)

## processor

In [11]:
# Raw waveforms of the first two examples.
waveforms = [clip["array"] for clip in dataset[:2]["audio"]]

# Turn the clips into one padded batch of PyTorch tensors.
batch = processor(
    waveforms,
    sampling_rate=sampling_rate,
    padding=True,
    return_tensors="pt",
)

# Move the batch to the target device and request float16, the same dtype the
# model is loaded with.
inputs = batch.to(device, torch.float16)
inputs

{'input_values': tensor([[ 0.0386,  0.0337,  0.0322,  ...,  0.0070,  0.0095,  0.0169],
        [-0.0015, -0.0008, -0.0019,  ...,  0.0000,  0.0000,  0.0000]],
       device='cuda:0'), 'attention_mask': tensor([[1, 1, 1,  ..., 1, 1, 1],
        [1, 1, 1,  ..., 0, 0, 0]], device='cuda:0', dtype=torch.int32)}

In [12]:
# Shape of the padded waveform batch produced by the processor.
inputs["input_values"].shape

torch.Size([2, 93680])

In [13]:
# Decode a single token id back to its vocabulary symbol.
processor.decode(5)

'E'

In [14]:
# Decode ids 0..39 one at a time. The tokenizer reports vocab_size=32, so ids
# 32 and above are outside the vocabulary and come back as '<unk>'.
processor.batch_decode(range(40))

['<pad>',
 '<s>',
 '</s>',
 '<unk>',
 '',
 'E',
 'T',
 'A',
 'O',
 'N',
 'I',
 'H',
 'S',
 'R',
 'D',
 'L',
 'U',
 'M',
 'W',
 'C',
 'F',
 'G',
 'Y',
 'P',
 'B',
 'V',
 'K',
 "'",
 'X',
 'J',
 'Q',
 'Z',
 '<unk>',
 '<unk>',
 '<unk>',
 '<unk>',
 '<unk>',
 '<unk>',
 '<unk>',
 '<unk>']

# Data2VecAudioForXVector

Data2VecAudio Model with an XVector feature extraction head on top for tasks like Speaker Verification.

In [15]:
# Load the checkpoint in half precision. The checkpoint does not ship the
# x-vector head weights, so those layers are newly initialized (see the
# loading warning) and would need training before their outputs are meaningful.
model: Data2VecAudioForXVector = Data2VecAudioForXVector.from_pretrained(
    version,
    torch_dtype=torch.float16,
)
# Place the weights on the selected device.
model = model.to(device)
model

Some weights of Data2VecAudioForXVector were not initialized from the model checkpoint at facebook/data2vec-audio-base-960h and are newly initialized: ['tdnn.1.kernel.bias', 'tdnn.2.kernel.weight', 'tdnn.0.kernel.bias', 'tdnn.3.kernel.bias', 'feature_extractor.weight', 'tdnn.0.kernel.weight', 'tdnn.4.kernel.bias', 'tdnn.1.kernel.weight', 'tdnn.2.kernel.bias', 'tdnn.4.kernel.weight', 'classifier.weight', 'objective.weight', 'projector.weight', 'feature_extractor.bias', 'projector.bias', 'classifier.bias', 'tdnn.3.kernel.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


Data2VecAudioForXVector(
  (data2vec_audio): Data2VecAudioModel(
    (feature_extractor): Data2VecAudioFeatureEncoder(
      (conv_layers): ModuleList(
        (0): Data2VecAudioConvLayer(
          (conv): Conv1d(1, 512, kernel_size=(10,), stride=(5,), bias=False)
          (layer_norm): LayerNorm((512,), eps=1e-05, elementwise_affine=True)
          (activation): GELUActivation()
        )
        (1-4): 4 x Data2VecAudioConvLayer(
          (conv): Conv1d(512, 512, kernel_size=(3,), stride=(2,), bias=False)
          (layer_norm): LayerNorm((512,), eps=1e-05, elementwise_affine=True)
          (activation): GELUActivation()
        )
        (5-6): 2 x Data2VecAudioConvLayer(
          (conv): Conv1d(512, 512, kernel_size=(2,), stride=(2,), bias=False)
          (layer_norm): LayerNorm((512,), eps=1e-05, elementwise_affine=True)
          (activation): GELUActivation()
        )
      )
    )
    (feature_projection): Data2VecAudioFeatureProjection(
      (layer_norm): LayerNorm((51

In [16]:
# Put the model in evaluation mode and run one forward pass without autograd
# tracking, passing the two tensors of the processed batch by name.
model.eval()
with torch.inference_mode():
    outputs = model(
        input_values=inputs["input_values"],
        attention_mask=inputs["attention_mask"],
    )
outputs

XVectorOutput(loss=None, logits=tensor([[-0.0009,  0.0011, -0.0018,  ..., -0.0008,  0.0022,  0.0040],
        [-0.0010,  0.0012, -0.0016,  ..., -0.0008,  0.0023,  0.0041]],
       device='cuda:0'), embeddings=tensor([[ 0.0009,  0.0027,  0.0124,  ...,  0.0003,  0.0084, -0.0014],
        [ 0.0010,  0.0031,  0.0123,  ...,  0.0005,  0.0079, -0.0017]],
       device='cuda:0'), hidden_states=None, attentions=None)

In [17]:
# Show the logits' shape followed by their values, one per line.
print(outputs.logits.shape, outputs.logits, sep="\n")

torch.Size([2, 512])
tensor([[-0.0009,  0.0011, -0.0018,  ..., -0.0008,  0.0022,  0.0040],
        [-0.0010,  0.0012, -0.0016,  ..., -0.0008,  0.0023,  0.0041]],
       device='cuda:0')


In [18]:
# Show the embeddings' shape followed by their values, one per line.
print(outputs.embeddings.shape, outputs.embeddings, sep="\n")

torch.Size([2, 512])
tensor([[ 0.0009,  0.0027,  0.0124,  ...,  0.0003,  0.0084, -0.0014],
        [ 0.0010,  0.0031,  0.0123,  ...,  0.0005,  0.0079, -0.0017]],
       device='cuda:0')


In [19]:
# Normalize: scale each embedding to unit L2 norm along the last dimension.
torch.nn.functional.normalize(outputs.embeddings, p=2.0, dim=-1)

tensor([[ 0.0094,  0.0286,  0.1313,  ...,  0.0027,  0.0884, -0.0152],
        [ 0.0108,  0.0332,  0.1313,  ...,  0.0048,  0.0845, -0.0180]],
       device='cuda:0')

In [21]:
# `outputs.hidden_states` is None for the forward pass above because hidden
# states are only returned when explicitly requested. Run a second pass with
# `output_hidden_states=True` and show a compact summary (number of returned
# tensors and the shape of the last one) instead of dumping every tensor.
model.eval()
with torch.inference_mode():
    outputs_with_hidden = model(**inputs, output_hidden_states=True)
hidden_states = outputs_with_hidden.hidden_states
len(hidden_states), hidden_states[-1].shape