Implementing pipeline that translate transcripted audio from Polish to English

In [2]:
import torch
from transformers import AutoModelForSpeechSeq2Seq, AutoProcessor, pipeline


device = "cuda:0" if torch.cuda.is_available() else "cpu"
torch_dtype = torch.float16 if torch.cuda.is_available() else torch.float32

  from .autonotebook import tqdm as notebook_tqdm


In [3]:
model_id = "openai/whisper-large-v3"

model = AutoModelForSpeechSeq2Seq.from_pretrained(
    model_id, torch_dtype=torch_dtype, use_safetensors=False
)

In [4]:
model.to(device)

WhisperForConditionalGeneration(
  (model): WhisperModel(
    (encoder): WhisperEncoder(
      (conv1): Conv1d(128, 1280, kernel_size=(3,), stride=(1,), padding=(1,))
      (conv2): Conv1d(1280, 1280, kernel_size=(3,), stride=(2,), padding=(1,))
      (embed_positions): Embedding(1500, 1280)
      (layers): ModuleList(
        (0-31): 32 x WhisperEncoderLayer(
          (self_attn): WhisperAttention(
            (k_proj): Linear(in_features=1280, out_features=1280, bias=False)
            (v_proj): Linear(in_features=1280, out_features=1280, bias=True)
            (q_proj): Linear(in_features=1280, out_features=1280, bias=True)
            (out_proj): Linear(in_features=1280, out_features=1280, bias=True)
          )
          (self_attn_layer_norm): LayerNorm((1280,), eps=1e-05, elementwise_affine=True)
          (activation_fn): GELUActivation()
          (fc1): Linear(in_features=1280, out_features=5120, bias=True)
          (fc2): Linear(in_features=5120, out_features=1280, bias=Tr

In [5]:
processor = AutoProcessor.from_pretrained(model_id)

Special tokens have been added in the vocabulary, make sure the associated word embeddings are fine-tuned or trained.


In [18]:
pipe = pipeline(
    "automatic-speech-recognition",
    model=model,
    tokenizer=processor.tokenizer,
    feature_extractor=processor.feature_extractor,
    max_new_tokens=128,
    chunk_length_s=5,
    batch_size=16,
    return_timestamps=False,
    torch_dtype=torch_dtype,
    device=device,
    use_fast=True,
    generate_kwargs={"language": "english", "task": "translate"}
)

In [7]:
import pyaudio
import numpy as np

CHUNK = 1024
FORMAT = pyaudio.paInt16
CHANNELS = 1
RATE = 16000

In [9]:
import threading
import queue

audio_queue = queue.Queue()


In [16]:
def transcribe_audio_saved(filename):
    audio_data = np.fromfile(filename, dtype=np.int16)  # Load audio data from file
    transcription = pipe(audio_data)
    print(transcription)

In [8]:
continue_streaming = True

def transcribe_audio(audio_data):
    transcription = pipe(audio_data)
    print(transcription)

def record_and_transcribe_audio(record_seconds=5, channels=1, rate=16000):
    p = pyaudio.PyAudio()
    stream = p.open(format=pyaudio.paInt16,
                    channels=channels,
                    rate=rate,
                    input=True,
                    frames_per_buffer=rate * record_seconds,  # Adjust buffer size for desired chunk size
                    input_device_index=None)  # Use default input device

    print("Recording and transcribing...")
    while continue_streaming:
        data = stream.read(rate * record_seconds)  # Read audio chunk from the microphone
        audio_data = np.frombuffer(data, dtype=np.int16)  # Convert audio chunk to numpy array
        transcribe_audio(audio_data)  # Transcribe audio chunk in real-time

    print("Finished recording and transcribing.")
    stream.stop_stream()
    stream.close()
    p.terminate()

In [20]:
import wave

def record_audio(filename, record_seconds=5, channels=1, rate=16000):
    p = pyaudio.PyAudio()
    stream = p.open(format=FORMAT,
                    channels=channels,
                    rate=rate,
                    input=True,
                    frames_per_buffer=CHUNK,
                    input_device_index=2)
    print("Recording...")
    frames = []
    for i in range(0, int(rate / CHUNK * record_seconds)):
        data = stream.read(CHUNK)
        frames.append(data)
    print("Finished recording.")
    stream.stop_stream()
    stream.close()
    p.terminate()
    wf = wave.open(filename, 'wb')
    wf.setnchannels(channels)
    wf.setsampwidth(p.get_sample_size(FORMAT))
    wf.setframerate(rate)
    wf.writeframes(b''.join(frames))
    wf.close()

In [None]:
import sounddevice as sd

print(sd.query_devices())

    0 Mapowanie dźwięku Microsoft - Input, MME (2 in, 0 out)
>   1 Mikrofon (Virtual Desktop Audio, MME (2 in, 0 out)
    2 Stream Mix (2 — Razer Seiren V2, MME (2 in, 0 out)
    3 Playback Mix (2 — Razer Seiren , MME (2 in, 0 out)
    4 Mikrofon (Voicemod Virtual Audi, MME (2 in, 0 out)
    5 Mikrofon (Steam Streaming Micro, MME (2 in, 0 out)
    6 Headset Microphone (Oculus Virt, MME (2 in, 0 out)
    7 Mikrofon (Razer Barracuda X), MME (2 in, 0 out)
    8 Microphone (2 — Razer Seiren V2, MME (2 in, 0 out)
    9 Mapowanie dźwięku Microsoft - Output, MME (0 in, 2 out)
<  10 Głośniki (Razer Barracuda X), MME (0 in, 2 out)
   11 Słuchawki (Oculus Virtual Audio, MME (0 in, 2 out)
   12 Sound Effects (2 — Razer Seiren, MME (0 in, 2 out)
   13 Game (2 — Razer Seiren V2 X), MME (0 in, 2 out)
   14 Głośniki (7.1 Surround Sound), MME (0 in, 2 out)
   15 Głośniki (Steam Streaming Speak, MME (0 in, 2 out)
   16 Realtek Digital Output (Realtek, MME (0 in, 2 out)
   17 Aux 2 (2 — Razer Seiren V2 

In [21]:
record_audio('test.wav', record_seconds=7)

Recording...
Finished recording.


In [22]:
transcribe_audio_saved('test.wav')

{'text': " Good morning, I'm Jakub Dzieka, my name is Jakub Dzieka, I'm 20 years old and I play Baldur's Gate in 20 seconds"}


In [14]:
record_and_transcribe_audio()

Recording and transcribing...
{'text': ' Vindelbrotご視聴ありがとうございました'}
{'text': ' Vindelstofご視聴ありがとうございました'}
{'text': ' Vindelbrotご視聴ありがとうございました'}
{'text': ' Vindelbrotご視聴ありがとうございました'}
{'text': ' Vindelbrotご視聴ありがとうございました'}
{'text': ' Vindelbrotご視聴ありがとうございました'}


KeyboardInterrupt: 