Implementing a pipeline that translates transcribed audio from Polish to English

In [1]:
# Load model directly
from transformers import AutoProcessor, AutoModelForSpeechSeq2Seq

# Both objects are loaded from the same Hub checkpoint, so name it once.
# NOTE(review): `processor` and `model` are not referenced by the other cells
# visible in this notebook (the pipeline cell passes the checkpoint name itself)
# - confirm whether this cell is still needed.
whisper_checkpoint = "openai/whisper-large-v2"
processor = AutoProcessor.from_pretrained(whisper_checkpoint)
model = AutoModelForSpeechSeq2Seq.from_pretrained(whisper_checkpoint)

  from .autonotebook import tqdm as notebook_tqdm
Special tokens have been added in the vocabulary, make sure the associated word embeddings are fine-tuned or trained.


In [2]:
import torch
from transformers import pipeline

# Use the first CUDA GPU when torch reports one, otherwise fall back to the CPU.
if torch.cuda.is_available():
    device = "cuda:0"
else:
    device = "cpu"

# Speech-recognition pipeline shared by the transcription helpers below.
# NOTE(review): no task/language generation options are passed here, so nothing
# in this cell requests Polish -> English translation - confirm that is intended.
pipe = pipeline(
  task="automatic-speech-recognition",
  model="openai/whisper-large-v2",
  device=device,
  chunk_length_s=5,
)

Special tokens have been added in the vocabulary, make sure the associated word embeddings are fine-tuned or trained.


In [40]:
import pyaudio
import numpy as np

# Frames requested per stream.read() call in record_audio(); the same value is
# passed as frames_per_buffer when the input stream is opened.
CHUNK = 1024
# Sample format handed to PyAudio when opening the stream and when querying the
# sample width for the WAV header (paInt16: 16-bit integer samples).
FORMAT = pyaudio.paInt16
# NOTE(review): CHANNELS and RATE are not referenced by the cells visible here;
# record_audio() carries its own defaults (channels=1, rate=16000) with the
# same values - confirm whether these two constants are still needed.
CHANNELS = 1
RATE = 16000

In [41]:
import threading
import queue

# Queue that transcription_thread() pulls audio chunks from.
# NOTE(review): no producer that puts chunks on this queue is visible in this
# notebook - confirm where it is fed.
audio_queue = queue.Queue()

# Loop flag read by transcription_thread() on every iteration; presumably set
# to False to stop the worker - nothing visible here does so.
continue_streaming = True

In [42]:
def transcription_thread():
    """Worker loop: transcribe queued audio chunks until streaming is switched off.

    While the module-level ``continue_streaming`` flag is truthy, waits up to
    one second for an item on ``audio_queue``, runs it through ``pipe`` and
    prints the pipeline result without a trailing newline, flushing stdout.
    When nothing arrives within the timeout the loop simply re-checks the flag.
    """
    while continue_streaming:
        try:
            chunk = audio_queue.get(timeout=1)  # blocks for at most 1 s
            print(pipe(chunk), end='', flush=True)
        except queue.Empty:
            # Timed out with no audio available; go round again.
            pass

In [43]:
import wave

def record_audio(filename, record_seconds=5, channels=1, rate=16000, input_device_index=2):
    """Record audio from an input device and save it as a WAV file.

    Parameters
    ----------
    filename : str
        Path of the WAV file to write.
    record_seconds : int or float, default 5
        Requested duration. The stream is read in whole buffers of ``CHUNK``
        frames, ``int(rate / CHUNK * record_seconds)`` times, so the saved clip
        can be slightly shorter than requested.
    channels : int, default 1
        Number of input channels to capture.
    rate : int, default 16000
        Sampling rate in Hz.
    input_device_index : int or None, default 2
        Index of the input device passed to PyAudio. The default keeps the
        previously hard-coded device; pick another index from the device
        listing when running on a different machine.

    The stream and the PyAudio instance are released even if opening or reading
    fails, and the WAV file is closed by a context manager.
    """
    p = pyaudio.PyAudio()
    try:
        # Query the sample width while the PyAudio instance is still alive.
        sample_width = p.get_sample_size(FORMAT)
        stream = p.open(format=FORMAT,
                        channels=channels,
                        rate=rate,
                        input=True,
                        frames_per_buffer=CHUNK,
                        input_device_index=input_device_index)
        try:
            print("Recording...")
            n_reads = int(rate / CHUNK * record_seconds)
            frames = [stream.read(CHUNK) for _ in range(n_reads)]
            print("Finished recording.")
        finally:
            stream.stop_stream()
            stream.close()
    finally:
        p.terminate()

    with wave.open(filename, 'wb') as wf:
        wf.setnchannels(channels)
        wf.setsampwidth(sample_width)
        wf.setframerate(rate)
        wf.writeframes(b''.join(frames))

In [44]:
def transcribe_audio(filename):
    """Transcribe a 16-bit PCM WAV file with the module-level ``pipe`` and print the result.

    The file is decoded with the ``wave`` module so that only the audio frames
    (not the RIFF/WAV header bytes) reach the model. Samples are converted to
    float32 in [-1.0, 1.0), multi-channel audio is averaged down to mono, and
    the file's own sampling rate is passed along with the waveform.

    Parameters
    ----------
    filename : str
        Path to a WAV file with 16-bit samples (the format record_audio writes).

    Raises
    ------
    ValueError
        If the file does not use 16-bit samples.
    """
    import wave  # local import: keeps this cell runnable on its own

    with wave.open(filename, 'rb') as wav_file:
        n_channels = wav_file.getnchannels()
        sample_width = wav_file.getsampwidth()
        sample_rate = wav_file.getframerate()
        pcm_bytes = wav_file.readframes(wav_file.getnframes())

    if sample_width != 2:
        raise ValueError(
            f"Expected 16-bit PCM audio, got {8 * sample_width}-bit samples in {filename!r}"
        )

    # WAV PCM data is little-endian; scale int16 to the float range [-1.0, 1.0).
    audio_data = np.frombuffer(pcm_bytes, dtype='<i2').astype(np.float32) / 32768.0
    if n_channels > 1:
        # Frames are interleaved per channel; average them into a single channel.
        audio_data = audio_data.reshape(-1, n_channels).mean(axis=1)

    transcription = pipe({"raw": audio_data, "sampling_rate": sample_rate})
    print(transcription)

In [45]:
import sounddevice as sd

# List the audio devices with their indices; record_audio() opens its input
# stream by device index, so this is where a suitable index can be looked up.
print(sd.query_devices())

    0 Mapowanie dźwięku Microsoft - Input, MME (2 in, 0 out)
>   1 Mikrofon (Virtual Desktop Audio, MME (2 in, 0 out)
    2 Stream Mix (2 — Razer Seiren V2, MME (2 in, 0 out)
    3 Playback Mix (2 — Razer Seiren , MME (2 in, 0 out)
    4 Mikrofon (Voicemod Virtual Audi, MME (2 in, 0 out)
    5 Mikrofon (Steam Streaming Micro, MME (2 in, 0 out)
    6 Headset Microphone (Oculus Virt, MME (2 in, 0 out)
    7 Mikrofon (Razer Barracuda X), MME (2 in, 0 out)
    8 Microphone (2 — Razer Seiren V2, MME (2 in, 0 out)
    9 Mapowanie dźwięku Microsoft - Output, MME (0 in, 2 out)
<  10 Głośniki (Razer Barracuda X), MME (0 in, 2 out)
   11 Słuchawki (Oculus Virtual Audio, MME (0 in, 2 out)
   12 Sound Effects (2 — Razer Seiren, MME (0 in, 2 out)
   13 Game (2 — Razer Seiren V2 X), MME (0 in, 2 out)
   14 Głośniki (7.1 Surround Sound), MME (0 in, 2 out)
   15 Głośniki (Steam Streaming Speak, MME (0 in, 2 out)
   16 Realtek Digital Output (Realtek, MME (0 in, 2 out)
   17 Aux 2 (2 — Razer Seiren V2 

In [47]:
# Capture roughly 10 seconds from the input device into test.wav.
record_audio('test.wav', record_seconds=10)

Recording...
Finished recording.


In [48]:
# Run the saved recording through the speech-recognition pipeline and print the result.
transcribe_audio('test.wav')

{'text': ' Halo, halo, jeden test, test, jeden, dwa, trzy, test, halo, jak się czujesz? Czy mnie słychać? Dobrze?'}


In [None]:
import logging

# NOTE(review): bare attribute reference - it only evaluates to the class
# object; no handler is created or attached and logging is not configured.
# Looks like a leftover from exploration; confirm whether the cell can go.
logging.StreamHandler

In [38]:
from transformers import file_utils

# Show the default Hugging Face hub cache directory reported by transformers.
print(file_utils.default_cache_path)

C:\Users\Kuba\.cache\huggingface\hub
