In [17]:
import torch
from transformers import WhisperModel, WhisperProcessor
from datasets import load_dataset

In [18]:
# Use the first CUDA GPU when PyTorch can see one; otherwise fall back to the CPU.
if torch.cuda.is_available():
    device = torch.device("cuda:0")
else:
    device = torch.device("cpu")
device

device(type='cuda', index=0)

In [19]:
# Checkpoint identifier handed to `from_pretrained` for both the processor and the model.
version: str = "openai/whisper-tiny"

# dataset

The note below comes from the ImageBind usage guide: https://github.com/facebookresearch/ImageBind#usage

Windows users may need to install `soundfile` and `librosa` to read and write audio files. (Thanks @congyue1977)

`pip install soundfile librosa`

In [20]:
# Load the "clean" validation split of the dummy LibriSpeech set and order it by the
# "id" column in a single expression, so re-running the cell always starts from a fresh load.
dataset = load_dataset(
    "hf-internal-testing/librispeech_asr_dummy",
    "clean",
    split="validation",
).sort("id")
dataset

Dataset({
    features: ['file', 'audio', 'text', 'speaker_id', 'chapter_id', 'id'],
    num_rows: 73
})

In [21]:
# Read the sampling rate declared by the dataset's "audio" feature instead of hard-coding it.
audio_feature = dataset.features["audio"]
sampling_rate = audio_feature.sampling_rate
sampling_rate

16000

In [22]:
# Inspect the first example: a dict with the file path, the decoded audio
# (path, sample array, sampling rate), the transcript, and the speaker/chapter/utterance ids.
dataset[0]

{'file': 'C:/Users/Administrator/.cache/huggingface/datasets/downloads/extracted/b49df5cb4e26d70a35c542fbe0eadc8bfee0f971809886d2131859668faeba1c/dev_clean/1272/128104\\1272-128104-0000.flac',
 'audio': {'path': 'C:/Users/Administrator/.cache/huggingface/datasets/downloads/extracted/b49df5cb4e26d70a35c542fbe0eadc8bfee0f971809886d2131859668faeba1c/dev_clean/1272/128104\\1272-128104-0000.flac',
  'array': array([0.00238037, 0.0020752 , 0.00198364, ..., 0.00042725, 0.00057983,
         0.0010376 ]),
  'sampling_rate': 16000},
 'text': 'MISTER QUILTER IS THE APOSTLE OF THE MIDDLE CLASSES AND WE ARE GLAD TO WELCOME HIS GOSPEL',
 'speaker_id': 1272,
 'chapter_id': 128104,
 'id': '1272-128104-0000'}

In [23]:
# Slicing the dataset yields a batch keyed by column name; collect the waveform
# array from each of the first two "audio" entries.
first_two_audio = dataset[:2]["audio"]
[sample["array"] for sample in first_two_audio]

[array([0.00238037, 0.0020752 , 0.00198364, ..., 0.00042725, 0.00057983,
        0.0010376 ]),
 array([-1.52587891e-04, -9.15527344e-05, -1.83105469e-04, ...,
         9.76562500e-04,  9.46044922e-04, -4.88281250e-04])]

In [24]:
# Transcripts of the first two examples, taken from the "text" column of the sliced batch.
list(dataset[:2]["text"])

['MISTER QUILTER IS THE APOSTLE OF THE MIDDLE CLASSES AND WE ARE GLAD TO WELCOME HIS GOSPEL',
 "NOR IS MISTER QUILTER'S MANNER LESS INTERESTING THAN HIS MATTER"]

# WhisperProcessor

In [25]:
# Load the processor for the chosen checkpoint; per the repr printed below it pairs a
# WhisperFeatureExtractor (audio -> features) with a WhisperTokenizer (ids <-> text).
processor: WhisperProcessor = WhisperProcessor.from_pretrained(version)
processor

WhisperProcessor:
- feature_extractor: WhisperFeatureExtractor {
  "chunk_length": 30,
  "feature_extractor_type": "WhisperFeatureExtractor",
  "feature_size": 80,
  "hop_length": 160,
  "n_fft": 400,
  "n_samples": 480000,
  "nb_max_frames": 3000,
  "padding_side": "right",
  "padding_value": 0.0,
  "processor_class": "WhisperProcessor",
  "return_attention_mask": false,
  "sampling_rate": 16000
}

- tokenizer: WhisperTokenizer(name_or_path='openai/whisper-tiny', vocab_size=50258, model_max_length=1024, is_fast=False, padding_side='right', truncation_side='right', special_tokens={'bos_token': AddedToken("<|endoftext|>", rstrip=False, lstrip=False, single_word=False, normalized=True), 'eos_token': AddedToken("<|endoftext|>", rstrip=False, lstrip=False, single_word=False, normalized=True), 'unk_token': AddedToken("<|endoftext|>", rstrip=False, lstrip=False, single_word=False, normalized=True), 'pad_token': '<|endoftext|>', 'additional_special_tokens': ['<|endoftext|>', '<|startoftranscr

## processor

In [26]:
# Convert the first utterance's waveform into PyTorch feature tensors, then move the
# batch to the target device, requesting float16 to match the half-precision model.
first_waveform = dataset[0]["audio"]["array"]
features = processor(
    first_waveform,
    sampling_rate=sampling_rate,
    return_tensors="pt",
)
inputs = features.to(device, torch.float16)
inputs

{'input_features': tensor([[[ 1.1933e-01, -9.4576e-02, -1.0978e-01,  ..., -8.0603e-01,
          -8.0603e-01, -8.0603e-01],
         [ 4.9347e-04, -8.9271e-02, -6.7290e-02,  ..., -8.0603e-01,
          -8.0603e-01, -8.0603e-01],
         [-1.5326e-01, -2.0804e-01, -2.2227e-01,  ..., -8.0603e-01,
          -8.0603e-01, -8.0603e-01],
         ...,
         [-8.0603e-01, -8.0603e-01, -7.9997e-01,  ..., -8.0603e-01,
          -8.0603e-01, -8.0603e-01],
         [-8.0603e-01, -7.7211e-01, -8.0603e-01,  ..., -8.0603e-01,
          -8.0603e-01, -8.0603e-01],
         [-8.0603e-01, -8.0603e-01, -8.0603e-01,  ..., -8.0603e-01,
          -8.0603e-01, -8.0603e-01]]], device='cuda:0')}

In [27]:
# Check the shape of the extracted feature tensor that will be fed to the encoder.
inputs["input_features"].shape

torch.Size([1, 80, 3000])

In [28]:
# Map a single token id (5) back to its text form.
processor.decode(5)

'&'

In [29]:
# Decode ids 0-39, each treated as its own entry, to peek at the start of the vocabulary.
processor.batch_decode(range(40))

['!',
 '"',
 '#',
 '$',
 '%',
 '&',
 "'",
 '(',
 ')',
 '*',
 '+',
 ',',
 '-',
 '.',
 '/',
 '0',
 '1',
 '2',
 '3',
 '4',
 '5',
 '6',
 '7',
 '8',
 '9',
 ':',
 ';',
 '<',
 '=',
 '>',
 '?',
 '@',
 'A',
 'B',
 'C',
 'D',
 'E',
 'F',
 'G',
 'H']

# WhisperModel

`WhisperModel` is the bare Whisper model: it outputs raw hidden states without any task-specific head on top.

In [30]:
# Load the headless Whisper checkpoint with half-precision weights and place it on `device`.
# NOTE(review): float16 is requested even when `device` falls back to CPU — confirm that
# half precision is supported (and acceptably fast) there.
model: WhisperModel = WhisperModel.from_pretrained(
    version,
    torch_dtype=torch.float16,
).to(device)
model

WhisperModel(
  (encoder): WhisperEncoder(
    (conv1): Conv1d(80, 384, kernel_size=(3,), stride=(1,), padding=(1,))
    (conv2): Conv1d(384, 384, kernel_size=(3,), stride=(2,), padding=(1,))
    (embed_positions): Embedding(1500, 384)
    (layers): ModuleList(
      (0-3): 4 x WhisperEncoderLayer(
        (self_attn): WhisperAttention(
          (k_proj): Linear(in_features=384, out_features=384, bias=False)
          (v_proj): Linear(in_features=384, out_features=384, bias=True)
          (q_proj): Linear(in_features=384, out_features=384, bias=True)
          (out_proj): Linear(in_features=384, out_features=384, bias=True)
        )
        (self_attn_layer_norm): LayerNorm((384,), eps=1e-05, elementwise_affine=True)
        (activation_fn): GELUActivation()
        (fc1): Linear(in_features=384, out_features=1536, bias=True)
        (fc2): Linear(in_features=1536, out_features=384, bias=True)
        (final_layer_norm): LayerNorm((384,), eps=1e-05, elementwise_affine=True)
      )


In [31]:
# Seed the decoder with two copies of the configured start token (batch of 1, length 2),
# created directly on the target device as int64 ids.
decoder_input_ids = torch.full(
    (1, 2),
    model.config.decoder_start_token_id,
    dtype=torch.long,
    device=device,
)
decoder_input_ids

tensor([[50258, 50258]], device='cuda:0')

In [32]:
# Put the model in evaluation mode and run a single forward pass under inference mode
# (no autograd tracking), feeding the extracted features plus the decoder start tokens.
model.eval()
forward_kwargs = {**inputs, "decoder_input_ids": decoder_input_ids}
with torch.inference_mode():
    outputs = model(**forward_kwargs)
outputs

Seq2SeqModelOutput(last_hidden_state=tensor([[[ 2.9892e+00, -6.7606e+00,  5.7348e+00,  3.6095e+00,  2.1523e-01,
          -5.7321e+00,  4.8855e+00, -1.6407e+00,  2.8227e-01, -1.5718e+00,
           1.0427e+01,  3.4427e+00,  2.1927e-02, -8.0612e+00,  3.4784e+00,
           8.4246e+00,  4.0575e+00, -2.2865e+00,  1.1108e+01,  9.9623e-01,
           9.8840e-01, -8.5155e+00, -3.5469e+00, -9.3713e+00,  9.7864e-01,
           3.5435e+00,  7.4850e+00, -5.2579e+00, -1.4366e+00,  1.0484e+01,
           9.1240e-01, -1.0381e+00,  2.7401e+00,  3.8081e+00,  3.8284e+00,
           4.3935e+00,  3.4315e+00,  3.0419e+00,  5.3482e+00, -1.4216e+00,
           2.5859e+00,  1.8015e+00,  2.8575e+00, -4.4667e+00, -3.3560e+00,
          -6.1966e+00, -5.7166e+00, -5.4651e+00,  3.0641e+00,  3.5348e+00,
          -1.4854e+00, -7.9050e+00, -1.3091e+00,  6.9293e+00,  5.8069e-02,
           6.4804e-01,  3.0038e+00, -4.0280e+00,  1.1141e+00, -7.2606e-01,
           6.5901e+00,  2.1841e+00,  1.0687e+00,  5.2692e+00, -

In [33]:
# Shape of the final hidden states; in the output below the sequence axis has length 2,
# matching the two decoder input ids supplied above.
outputs.last_hidden_state.shape

torch.Size([1, 2, 384])

In [34]:
# Print the shape of every cached tensor, with a dashed rule after each
# entry of `past_key_values`.
separator = "-" * 25
for cache_group in outputs.past_key_values:
    for cached_tensor in cache_group:
        print(cached_tensor.shape)
    print(separator)

torch.Size([1, 6, 2, 64])
torch.Size([1, 6, 2, 64])
torch.Size([1, 6, 1500, 64])
torch.Size([1, 6, 1500, 64])
-------------------------
torch.Size([1, 6, 2, 64])
torch.Size([1, 6, 2, 64])
torch.Size([1, 6, 1500, 64])
torch.Size([1, 6, 1500, 64])
-------------------------
torch.Size([1, 6, 2, 64])
torch.Size([1, 6, 2, 64])
torch.Size([1, 6, 1500, 64])
torch.Size([1, 6, 1500, 64])
-------------------------
torch.Size([1, 6, 2, 64])
torch.Size([1, 6, 2, 64])
torch.Size([1, 6, 1500, 64])
torch.Size([1, 6, 1500, 64])
-------------------------


In [35]:
# Shape of the encoder's final hidden states for the single input utterance.
outputs.encoder_last_hidden_state.shape

torch.Size([1, 1500, 384])