In [1]:
import torch
from transformers import WhisperProcessor, WhisperFeatureExtractor
from datasets import load_dataset

In [2]:
# Use the first CUDA GPU when PyTorch can see one; otherwise fall back to the CPU.
if torch.cuda.is_available():
    device = torch.device("cuda:0")
else:
    device = torch.device("cpu")
device

device(type='cuda', index=0)

In [3]:
# Checkpoint name handed to `WhisperProcessor.from_pretrained` later in the notebook.
version = "openai/whisper-tiny"

# dataset

https://github.com/facebookresearch/ImageBind#usage

For Windows users: you might need to install `librosa` and `soundfile` to read and write audio files. (Thanks @congyue1977)

`pip install soundfile librosa`

In [4]:
# Load the validation split of the "clean" configuration of the dummy
# LibriSpeech dataset and sort its rows by the "id" column in one expression.
dataset = load_dataset(
    path="hf-internal-testing/librispeech_asr_dummy",
    name="clean",
    split="validation",
).sort("id")
dataset

Using the latest cached version of the module from C:\Users\Administrator\.cache\huggingface\modules\datasets_modules\datasets\hf-internal-testing--librispeech_asr_dummy\d3bc4c2bc2078fcde3ad0f0f635862e4c0fef78ba94c4a34c4c250a097af240b (last modified on Mon Aug  7 16:20:21 2023) since it couldn't be found locally at hf-internal-testing/librispeech_asr_dummy., or remotely on the Hugging Face Hub.


Dataset({
    features: ['file', 'audio', 'text', 'speaker_id', 'chapter_id', 'id'],
    num_rows: 73
})

In [5]:
# Read the sampling rate declared on the dataset's "audio" feature; it is
# passed to the processor call further down.
audio_feature = dataset.features["audio"]
sampling_rate = audio_feature.sampling_rate
sampling_rate

16000

In [6]:
# Show the first row to see which fields a single example carries.
first_example = dataset[0]
first_example

{'file': 'C:/Users/Administrator/.cache/huggingface/datasets/downloads/extracted/b49df5cb4e26d70a35c542fbe0eadc8bfee0f971809886d2131859668faeba1c/dev_clean/1272/128104\\1272-128104-0000.flac',
 'audio': {'path': 'C:/Users/Administrator/.cache/huggingface/datasets/downloads/extracted/b49df5cb4e26d70a35c542fbe0eadc8bfee0f971809886d2131859668faeba1c/dev_clean/1272/128104\\1272-128104-0000.flac',
  'array': array([0.00238037, 0.0020752 , 0.00198364, ..., 0.00042725, 0.00057983,
         0.0010376 ]),
  'sampling_rate': 16000},
 'text': 'MISTER QUILTER IS THE APOSTLE OF THE MIDDLE CLASSES AND WE ARE GLAD TO WELCOME HIS GOSPEL',
 'speaker_id': 1272,
 'chapter_id': 128104,
 'id': '1272-128104-0000'}

In [7]:
# Slice the first two rows and pull the "array" entry out of each audio dict.
first_two_audio = dataset[:2]["audio"]
[clip["array"] for clip in first_two_audio]

[array([0.00238037, 0.0020752 , 0.00198364, ..., 0.00042725, 0.00057983,
        0.0010376 ]),
 array([-1.52587891e-04, -9.15527344e-05, -1.83105469e-04, ...,
         9.76562500e-04,  9.46044922e-04, -4.88281250e-04])]

In [8]:
# Transcripts ("text" column) of the first two rows, as a list.
list(dataset[:2]["text"])

['MISTER QUILTER IS THE APOSTLE OF THE MIDDLE CLASSES AND WE ARE GLAD TO WELCOME HIS GOSPEL',
 "NOR IS MISTER QUILTER'S MANNER LESS INTERESTING THAN HIS MATTER"]

# WhisperProcessor

In [9]:
# Build the processor for the checkpoint named in `version`. The annotation
# only helps editors/type checkers; it has no runtime effect.
processor: WhisperProcessor = WhisperProcessor.from_pretrained(
    pretrained_model_name_or_path=version,
)
processor

WhisperProcessor:
- feature_extractor: WhisperFeatureExtractor {
  "chunk_length": 30,
  "feature_extractor_type": "WhisperFeatureExtractor",
  "feature_size": 80,
  "hop_length": 160,
  "n_fft": 400,
  "n_samples": 480000,
  "nb_max_frames": 3000,
  "padding_side": "right",
  "padding_value": 0.0,
  "processor_class": "WhisperProcessor",
  "return_attention_mask": false,
  "sampling_rate": 16000
}

- tokenizer: WhisperTokenizer(name_or_path='openai/whisper-tiny', vocab_size=50258, model_max_length=1024, is_fast=False, padding_side='right', truncation_side='right', special_tokens={'bos_token': AddedToken("<|endoftext|>", rstrip=False, lstrip=False, single_word=False, normalized=True), 'eos_token': AddedToken("<|endoftext|>", rstrip=False, lstrip=False, single_word=False, normalized=True), 'unk_token': AddedToken("<|endoftext|>", rstrip=False, lstrip=False, single_word=False, normalized=True), 'pad_token': '<|endoftext|>', 'additional_special_tokens': ['<|endoftext|>', '<|startoftranscr

## processor

In [10]:
# Raw waveforms of the first two rows, in dataset order.
batch_waveforms = [clip["array"] for clip in dataset[:2]["audio"]]

# Run the processor on the batch and request PyTorch tensors.
# NOTE(review): with padding=True the result shown below has 585 frames on the
# last axis, not the 3000 listed as "nb_max_frames" in the feature-extractor
# config above; confirm this is intended before feeding `inputs` to a model.
batch_features = processor(
    batch_waveforms,
    sampling_rate=sampling_rate,
    padding=True,
    return_tensors="pt",
)

# Move the resulting tensors to the selected device.
inputs = batch_features.to(device)
inputs

{'input_features': tensor([[[ 1.1933e-01, -9.4576e-02, -1.0978e-01,  ..., -3.2093e-02,
          -1.1783e-01, -9.6470e-02],
         [ 4.9347e-04, -8.9271e-02, -6.7290e-02,  ..., -1.6093e-02,
          -1.2162e-01, -5.6718e-02],
         [-1.5326e-01, -2.0804e-01, -2.2227e-01,  ..., -1.4056e-01,
          -2.2989e-01, -2.5005e-01],
         ...,
         [-8.0603e-01, -8.0603e-01, -7.9997e-01,  ..., -6.0504e-01,
          -6.0567e-01, -5.8229e-01],
         [-8.0603e-01, -7.7211e-01, -8.0603e-01,  ..., -6.6865e-01,
          -6.0604e-01, -6.8128e-01],
         [-8.0603e-01, -8.0603e-01, -8.0603e-01,  ..., -6.5374e-01,
          -6.5197e-01, -7.3024e-01]],

        [[-4.6956e-01, -7.5109e-02,  2.7610e-02,  ..., -7.0427e-01,
          -7.0427e-01, -7.0427e-01],
         [-1.2772e-01, -2.0680e-02, -3.2390e-02,  ..., -7.0427e-01,
          -7.0427e-01, -7.0427e-01],
         [-3.1414e-01, -9.7058e-02, -1.8364e-01,  ..., -7.0427e-01,
          -7.0427e-01, -7.0427e-01],
         ...,
      

In [11]:
# Dimensions of the extracted features; presumably (batch, mel bins, frames) - TODO confirm.
inputs["input_features"].size()

torch.Size([2, 80, 585])

In [12]:
# Decode a single token id by calling the processor's tokenizer directly.
processor.tokenizer.decode(5)

'&'

In [13]:
# Decode token ids 0..39 one at a time to peek at the start of the vocabulary.
[processor.decode(token_id) for token_id in range(40)]

['!',
 '"',
 '#',
 '$',
 '%',
 '&',
 "'",
 '(',
 ')',
 '*',
 '+',
 ',',
 '-',
 '.',
 '/',
 '0',
 '1',
 '2',
 '3',
 '4',
 '5',
 '6',
 '7',
 '8',
 '9',
 ':',
 ';',
 '<',
 '=',
 '>',
 '?',
 '@',
 'A',
 'B',
 'C',
 'D',
 'E',
 'F',
 'G',
 'H']