In [None]:
import torch
from transformers import AutoModelForSpeechSeq2Seq, AutoProcessor, pipeline
from pathlib import Path

In [None]:
# Base folder for this notebook's Whisper files.
whisperFolder = Path('D:/git/ai-playground/out/whisper')

# Run on the GPU when PyTorch reports CUDA as available, otherwise on the CPU.
isCuda = torch.cuda.is_available() #@param {type:"boolean"}
# Conditional expressions are used instead of the `x and a or b` idiom, which
# silently picks `b` whenever `a` happens to be falsy.
device = 'cuda' if isCuda else 'cpu'
# float16 on CUDA, float32 on the CPU.
torchDtype = torch.float16 if isCuda else torch.float32

# Checkpoint identifier passed to the `from_pretrained` loaders.
modelId = 'openai/whisper-large-v3'

# Keyword arguments unpacked into `AutoModelForSpeechSeq2Seq.from_pretrained`.
modelLoadingOptions = {
  'torch_dtype': torchDtype,
  'low_cpu_mem_usage': True,
  'use_safetensors': True,
  # NOTE(review): presumably 'eager' is chosen because the pipeline asks for
  # word-level timestamps -- confirm before switching attention backends.
  'attn_implementation': 'eager'
}

In [None]:

# Load the speech-to-text checkpoint named by `modelId`, applying every option
# collected in `modelLoadingOptions`.
model = AutoModelForSpeechSeq2Seq.from_pretrained(
  pretrained_model_name_or_path=modelId, **modelLoadingOptions
)

In [None]:
# Move the loaded model onto the selected device. As the last expression in the
# cell, the value returned by `.to()` is what the notebook displays.
model.to(device)

In [None]:
# Load the processor for the same checkpoint; its `tokenizer` and
# `feature_extractor` attributes are handed to the pipeline later on.
processor = AutoProcessor.from_pretrained(pretrained_model_name_or_path=modelId)

In [None]:
# Assemble the speech-recognition pipeline around the already-loaded model and
# the two components taken from the processor.
pipe = pipeline(
  task='automatic-speech-recognition',
  # Components
  model=model,
  feature_extractor=processor.feature_extractor,
  tokenizer=processor.tokenizer,
  # Placement and precision (same values the model was loaded with)
  device=device,
  torch_dtype=torchDtype,
  # Chunking, batching and generation settings
  chunk_length_s=30,
  batch_size=1,
  max_new_tokens=128,
  return_timestamps='word',
)

In [None]:
# Transcribe the input recording. The path is derived from `whisperFolder` so
# the folder location is defined in exactly one place; `as_posix()` keeps the
# forward-slash form the pipeline was previously given.
inputAudioPath = whisperFolder / 'input.flac'
result = pipe(
  inputAudioPath.as_posix(),
)