Implementing a pipeline that translates transcribed audio from Polish to English

In [1]:
# Load model directly
from transformers import AutoProcessor, AutoModelForSpeechSeq2Seq

# Single source of truth for the checkpoint so the processor and the model
# can never drift apart.
WHISPER_CHECKPOINT = "openai/whisper-large-v2"

# The processor and the seq2seq speech model are both loaded from that
# checkpoint.
processor = AutoProcessor.from_pretrained(WHISPER_CHECKPOINT)
model = AutoModelForSpeechSeq2Seq.from_pretrained(WHISPER_CHECKPOINT)

  from .autonotebook import tqdm as notebook_tqdm


In [2]:
import torch
from transformers import pipeline

# Prefer the first CUDA GPU when torch can see one; otherwise run on CPU.
device = "cuda:0" if torch.cuda.is_available() else "cpu"

# Build the ASR pipeline from the `model` / `processor` objects loaded in the
# first cell. Passing the checkpoint *name* here instead would make
# `pipeline` instantiate a second full copy of whisper-large-v2 in memory.
# When a model object is given, the tokenizer and feature extractor must be
# supplied explicitly; both live on the processor.
pipe = pipeline(
  "automatic-speech-recognition",
  model=model,
  tokenizer=processor.tokenizer,
  feature_extractor=processor.feature_extractor,
  chunk_length_s=30,  # long inputs are processed in 30-second chunks
  device=device,
)

In [3]:
import pyaudio
import numpy as np

# Audio capture settings.
# CHUNK is passed as `frames_per_buffer` and as the read size in `record_audio`.
CHUNK = 1024
# PyAudio sample-format constant; used to open the stream and to derive the
# WAV sample width in `record_audio`.
FORMAT = pyaudio.paInt16
# NOTE(review): CHANNELS and RATE are not referenced by the visible code;
# `record_audio` carries its own `channels=1` / `rate=16000` defaults.
CHANNELS = 1
RATE = 16000

In [4]:
import threading
import queue

# Queue that `transcription_thread` pulls audio items from.
# NOTE(review): no visible code puts anything on this queue — the producer is
# presumably defined elsewhere or not written yet.
audio_queue = queue.Queue()

# Loop flag read by `transcription_thread` on every iteration; setting it to
# False lets that loop exit.
continue_streaming = True

In [5]:
def transcription_thread():
    """Consume audio from ``audio_queue`` and stream the recognised text to stdout.

    Runs until the module-level ``continue_streaming`` flag becomes falsy.
    The queue is polled with a one-second timeout so the flag is re-checked
    at least once per second even when no audio arrives.

    Only the recognised text is printed (without a trailing newline), so
    consecutive results read as one continuous transcript rather than a
    sequence of dict reprs.
    """
    while continue_streaming:
        # Only the queue read can raise queue.Empty, so keep the try narrow
        # and let genuine pipeline errors surface.
        try:
            audio_data = audio_queue.get(timeout=1)
        except queue.Empty:
            continue

        transcription = pipe(audio_data, batch_size=8)

        # The pipeline returns one dict for a single input and a list of
        # dicts for a batch of inputs; normalise to a list.
        results = transcription if isinstance(transcription, list) else [transcription]
        for result in results:
            print(result["text"], end='', flush=True)

In [6]:
import wave

def record_audio(filename, record_seconds=5, channels=1, rate=16000,
                 input_device_index=2):
    """Record audio from an input device and save it as a WAV file.

    Args:
        filename: Path of the WAV file to write.
        record_seconds: Approximate duration to record, in seconds. The
            recording is made of whole ``CHUNK``-sized reads, so the actual
            length is rounded down to a multiple of ``CHUNK`` frames.
        channels: Number of input channels to capture.
        rate: Sampling rate in Hz.
        input_device_index: PortAudio index of the capture device. Defaults
            to 2 to keep the previous behaviour; pass ``None`` to use the
            system default input device, or pick an index from
            ``sd.query_devices()``.

    Raises:
        OSError: Propagated from PyAudio if the device cannot be opened or
            a read fails. The stream and PortAudio handle are released
            before the error propagates.
    """
    n_chunks = int(rate / CHUNK * record_seconds)

    p = pyaudio.PyAudio()
    try:
        stream = p.open(format=FORMAT,
                        channels=channels,
                        rate=rate,
                        input=True,
                        frames_per_buffer=CHUNK,
                        input_device_index=input_device_index)
        try:
            print("Recording...")
            frames = [stream.read(CHUNK) for _ in range(n_chunks)]
            print("Finished recording.")
        finally:
            # Always release the device, even if a read fails mid-recording.
            stream.stop_stream()
            stream.close()
        # Query the sample width while the PyAudio instance is still alive.
        sample_width = p.get_sample_size(FORMAT)
    finally:
        p.terminate()

    # The context manager closes the file even if writing fails.
    with wave.open(filename, 'wb') as wf:
        wf.setnchannels(channels)
        wf.setsampwidth(sample_width)
        wf.setframerate(rate)
        wf.writeframes(b''.join(frames))

In [7]:
def transcribe_audio(filename, generate_kwargs=None):
    """Run the speech-recognition pipeline on a 16-bit PCM WAV file and print the result.

    The file is decoded with the ``wave`` module so that the RIFF header is
    not mistaken for audio samples, and the samples are scaled to float32 in
    [-1, 1) before being handed to the pipeline together with the file's own
    sampling rate.

    Args:
        filename: Path to a 16-bit PCM WAV file (as written by
            ``record_audio``).
        generate_kwargs: Optional dict forwarded to the pipeline's
            generation step, e.g. ``{"task": "translate", "language":
            "polish"}`` to request Polish-to-English translation. When
            omitted the pipeline's defaults are used.

    Raises:
        ValueError: If the WAV file does not contain 16-bit samples.
    """
    import wave  # local import keeps this cell self-contained

    with wave.open(filename, 'rb') as wf:
        n_channels = wf.getnchannels()
        sample_width = wf.getsampwidth()
        sampling_rate = wf.getframerate()
        raw_bytes = wf.readframes(wf.getnframes())

    if sample_width != 2:
        raise ValueError(
            f"Expected 16-bit PCM audio, got {8 * sample_width}-bit samples in {filename!r}"
        )

    # WAV PCM data is little-endian regardless of the host byte order.
    samples = np.frombuffer(raw_bytes, dtype=np.dtype('<i2'))
    if n_channels > 1:
        # Frames are interleaved per channel; down-mix to mono by averaging.
        samples = samples.reshape(-1, n_channels).mean(axis=1)
    audio_data = samples.astype(np.float32) / 32768.0

    # Passing the sampling rate lets the pipeline resample if the file was
    # not recorded at the rate the model expects.
    inputs = {"raw": audio_data, "sampling_rate": sampling_rate}
    if generate_kwargs is None:
        transcription = pipe(inputs)
    else:
        transcription = pipe(inputs, generate_kwargs=generate_kwargs)
    print(transcription)

In [None]:
import sounddevice as sd

# List the audio devices known to sounddevice.
# NOTE(review): presumably used to find the index that `record_audio` passes
# as `input_device_index` — confirm both libraries number devices the same way.
print(sd.query_devices())

In [9]:
# Capture a 10-second sample from the microphone into test.wav.
record_audio('test.wav', record_seconds=10)

Recording...
Finished recording.


In [10]:
# Run the speech-recognition pipeline on the recording saved above and print the result.
transcribe_audio('test.wav')



{'text': " It's 6.38 pm I'm checking if the test works in Polish Hey, how are you? Thank you, goodbye"}
