Implementing pipeline that translate transcripted audio from Polish to English

In [None]:
import torch
from transformers import AutoModelForSpeechSeq2Seq, AutoProcessor, pipeline


device = "cuda:0" if torch.cuda.is_available() else "cpu"
torch_dtype = torch.float16 if torch.cuda.is_available() else torch.float32

In [3]:
model_id = "openai/whisper-large-v3"

model = AutoModelForSpeechSeq2Seq.from_pretrained(
    model_id, torch_dtype=torch_dtype, use_safetensors=False
)

In [4]:
model.to(device)

WhisperForConditionalGeneration(
  (model): WhisperModel(
    (encoder): WhisperEncoder(
      (conv1): Conv1d(128, 1280, kernel_size=(3,), stride=(1,), padding=(1,))
      (conv2): Conv1d(1280, 1280, kernel_size=(3,), stride=(2,), padding=(1,))
      (embed_positions): Embedding(1500, 1280)
      (layers): ModuleList(
        (0-31): 32 x WhisperEncoderLayer(
          (self_attn): WhisperAttention(
            (k_proj): Linear(in_features=1280, out_features=1280, bias=False)
            (v_proj): Linear(in_features=1280, out_features=1280, bias=True)
            (q_proj): Linear(in_features=1280, out_features=1280, bias=True)
            (out_proj): Linear(in_features=1280, out_features=1280, bias=True)
          )
          (self_attn_layer_norm): LayerNorm((1280,), eps=1e-05, elementwise_affine=True)
          (activation_fn): GELUActivation()
          (fc1): Linear(in_features=1280, out_features=5120, bias=True)
          (fc2): Linear(in_features=5120, out_features=1280, bias=Tr

In [5]:
processor = AutoProcessor.from_pretrained(model_id)

Special tokens have been added in the vocabulary, make sure the associated word embeddings are fine-tuned or trained.


In [6]:
pipe = pipeline(
    "automatic-speech-recognition",
    model=model,
    tokenizer=processor.tokenizer,
    feature_extractor=processor.feature_extractor,
    max_new_tokens=128,
    chunk_length_s=5,
    batch_size=16,
    return_timestamps=False,
    torch_dtype=torch_dtype,
    device=device,
    use_fast=True,
    generate_kwargs={"language": "french"}
)

In [7]:
import pyaudio
import numpy as np

CHUNK = 1024
FORMAT = pyaudio.paInt16
CHANNELS = 1
RATE = 16000

In [8]:
import threading
import queue

audio_queue = queue.Queue()


In [9]:
def transcribe_audio_saved(filename):
    audio_data = np.fromfile(filename, dtype=np.int16)  # Load audio data from file
    transcription = pipe(audio_data)
    print(transcription)

In [10]:
continue_streaming = True

def transcribe_audio(audio_data):
    transcription = pipe(audio_data)
    print(transcription)

def record_and_transcribe_audio(record_seconds=5, channels=1, rate=16000):
    p = pyaudio.PyAudio()
    stream = p.open(format=pyaudio.paInt16,
                    channels=channels,
                    rate=rate,
                    input=True,
                    frames_per_buffer=rate * record_seconds,  # Adjust buffer size for desired chunk size
                    input_device_index=None)  # Use default input device

    print("Recording and transcribing...")
    while continue_streaming:
        data = stream.read(rate * record_seconds)  # Read audio chunk from the microphone
        audio_data = np.frombuffer(data, dtype=np.int16)  # Convert audio chunk to numpy array
        transcribe_audio(audio_data)  # Transcribe audio chunk in real-time

    print("Finished recording and transcribing.")
    stream.stop_stream()
    stream.close()
    p.terminate()

In [11]:
import wave

def record_audio(filename, record_seconds=5, channels=1, rate=16000):
    p = pyaudio.PyAudio()
    stream = p.open(format=FORMAT,
                    channels=channels,
                    rate=rate,
                    input=True,
                    frames_per_buffer=CHUNK,
                    input_device_index=2)
    print("Recording...")
    frames = []
    for i in range(0, int(rate / CHUNK * record_seconds)):
        data = stream.read(CHUNK)
        frames.append(data)
    print("Finished recording.")
    stream.stop_stream()
    stream.close()
    p.terminate()
    wf = wave.open(filename, 'wb')
    wf.setnchannels(channels)
    wf.setsampwidth(p.get_sample_size(FORMAT))
    wf.setframerate(rate)
    wf.writeframes(b''.join(frames))
    wf.close()

In [None]:
import sounddevice as sd

print(sd.query_devices())

In [13]:
# record_audio('test.wav', record_seconds=7)

In [17]:
audio_rec = transcribe_audio_saved('test.wav')

{'text': " Bonjour, je m'appelle Jakub Dzieka, j'ai 20 ans et je joue à Baldur's Gate dans 20 secondes"}


In [None]:
from transformers import SpeechT5Processor, SpeechT5ForTextToSpeech, SpeechT5HifiGan, set_seed

processor = SpeechT5Processor.from_pretrained("microsoft/speecht5_tts")
model = SpeechT5ForTextToSpeech.from_pretrained("microsoft/speecht5_tts")
vocoder = SpeechT5HifiGan.from_pretrained("microsoft/speecht5_hifigan")


In [21]:
from datasets import load_dataset

embeddings_dataset = load_dataset("Matthijs/cmu-arctic-xvectors", split="validation")

speaker_embeddings = embeddings_dataset[7306]["xvector"]
speaker_embeddings = torch.tensor(speaker_embeddings).unsqueeze(0)

Downloading data: 100%|██████████| 21.3M/21.3M [00:02<00:00, 9.21MB/s]
Generating validation split: 100%|██████████| 7931/7931 [00:00<00:00, 101495.43 examples/s]


In [22]:
set_seed(555) 
speech = model.generate(audio_rec, speaker_embeddings, vocoder=vocoder)
speech.shape
torch.Size([15872])

ValueError: `speaker_embeddings` must be specified. For example, you can use a speaker embeddings by following
                    the code snippet provided in this link:
                    https://huggingface.co/datasets/Matthijs/cmu-arctic-xvectors
                    