A.L. Lundervold, 19.01.23

# Introduction

A simple example of speech recognition using OpenAI's Whisper model.

# Setup

We use the 🤗 Transformers library: https://huggingface.co/docs/transformers/index and the Whisper model from OpenAI: https://openai.com/blog/whisper/

In [None]:
import torch

In [None]:
from transformers import AutoProcessor, AutoModelForSpeechSeq2Seq

In [None]:
from transformers import pipeline

In [None]:
# Use CUDA device index 0 when a GPU is available; otherwise run on the CPU.
if torch.cuda.is_available():
    device = 0
else:
    device = "cpu"
print(device)

# Setup model and pipeline

In [None]:
# Identifier of the Whisper checkpoint; passed as `model` when the pipeline is built.
model_name = "openai/whisper-large"

In [None]:
# Load the processor for the checkpoint named by `model_name`.
processor = AutoProcessor.from_pretrained(model_name)

# Load the speech-to-text model and move it to the device chosen earlier
# (`device` is 0 when CUDA is available, otherwise "cpu"), so this cell also
# runs on machines without a GPU.
# NOTE(review): the pipeline cell is given `model_name`, not these objects, so
# `processor` and `model` look unused by it -- confirm whether they are needed.
model = AutoModelForSpeechSeq2Seq.from_pretrained(model_name).to(device)

In [None]:
# Speech-recognition pipeline for the checkpoint in `model_name`, running on
# `device` with chunk_length_s set to 30.
pipe = pipeline(
    "automatic-speech-recognition",
    model=model_name,
    device=device,
    chunk_length_s=30,
)

In [None]:
# Run the pipeline on the sample recording and keep the "text" entry of its result.
result = pipe("audio.ogg")
text = result["text"]

In [None]:
# Show the transcription produced above.
text

# A simple application

## Setup

In [None]:
import gradio as gr

In [None]:
def transcribe(audio):
    """Transcribe an audio recording to text with the module-level ``pipe``.

    Args:
        audio: Input forwarded to ``pipe``; the Gradio interface in this file
            is configured to supply a file path. ``None`` means no audio was
            provided.

    Returns:
        The value stored under the ``"text"`` key of the pipeline result, or
        an empty string when ``audio`` is ``None``.
    """
    # Gradio hands over None when the user submits without recording anything;
    # return early instead of letting the pipeline fail on a missing input.
    if audio is None:
        return ""
    return pipe(audio)["text"]

In [None]:
# Example inputs offered in the UI: one row per example, one entry per input component.
examples = [["audio.ogg"]]

## Application

In [None]:
# Assemble the demo UI: microphone audio in (delivered to `transcribe` as a
# file path), transcription text out, with the sample clip offered as an example.
demo = gr.Interface(
    fn=transcribe,
    title="OpenAI Whisper",
    # Use the top-level `gr.Audio` component; the `gr.inputs` namespace is deprecated.
    inputs=[gr.Audio(source="microphone", type="filepath")],
    outputs=["textbox"],
    examples=examples,
)

# share=True asks Gradio for a temporary public link in addition to the local server.
demo.launch(share=True)