Implementing pipeline that translate transcripted audio from Polish to English

In [1]:
import torch
from transformers import AutoModelForSpeechSeq2Seq, AutoProcessor, pipeline, SpeechT5Processor, SpeechT5ForTextToSpeech, SpeechT5HifiGan, SpeechT5ForSpeechToSpeech


device = "cuda:0" if torch.cuda.is_available() else "cpu"
torch_dtype = torch.float16 if torch.cuda.is_available() else torch.float32

In [2]:
model_id = "openai/whisper-large-v3"

model = AutoModelForSpeechSeq2Seq.from_pretrained(
    model_id, torch_dtype=torch_dtype, use_safetensors=False
)

In [3]:
model.to(device)

WhisperForConditionalGeneration(
  (model): WhisperModel(
    (encoder): WhisperEncoder(
      (conv1): Conv1d(128, 1280, kernel_size=(3,), stride=(1,), padding=(1,))
      (conv2): Conv1d(1280, 1280, kernel_size=(3,), stride=(2,), padding=(1,))
      (embed_positions): Embedding(1500, 1280)
      (layers): ModuleList(
        (0-31): 32 x WhisperEncoderLayer(
          (self_attn): WhisperAttention(
            (k_proj): Linear(in_features=1280, out_features=1280, bias=False)
            (v_proj): Linear(in_features=1280, out_features=1280, bias=True)
            (q_proj): Linear(in_features=1280, out_features=1280, bias=True)
            (out_proj): Linear(in_features=1280, out_features=1280, bias=True)
          )
          (self_attn_layer_norm): LayerNorm((1280,), eps=1e-05, elementwise_affine=True)
          (activation_fn): GELUActivation()
          (fc1): Linear(in_features=1280, out_features=5120, bias=True)
          (fc2): Linear(in_features=5120, out_features=1280, bias=Tr

In [4]:
processor = AutoProcessor.from_pretrained(model_id, return_attention_mask=True)

Special tokens have been added in the vocabulary, make sure the associated word embeddings are fine-tuned or trained.


In [5]:
pipe = pipeline(
    "automatic-speech-recognition",
    model=model,
    tokenizer=processor.tokenizer,
    feature_extractor=processor.feature_extractor,
    max_new_tokens=128,
    chunk_length_s=5,
    batch_size=16,
    return_timestamps=False,
    torch_dtype=torch_dtype,
    device=device,
    use_fast=False,
    generate_kwargs={"language": "english"},
)

In [6]:
import pyaudio
import numpy as np

CHUNK = 1024
FORMAT = pyaudio.paInt16
CHANNELS = 1
RATE = 16000

In [7]:
import queue

audio_queue = queue.Queue()


In [8]:
def transcribe_audio_saved(filename):
    audio_data = np.fromfile(filename, dtype=np.int16)  # Load audio data from file
    transcription = pipe(audio_data)
    return transcription

In [9]:
continue_streaming = True

def transcribe_audio(audio_data):
    transcription = pipe(audio_data)
    print(transcription)

def record_and_transcribe_audio(record_seconds=5, channels=1, rate=16000):
    p = pyaudio.PyAudio()
    stream = p.open(format=pyaudio.paInt16,
                    channels=channels,
                    rate=rate,
                    input=True,
                    frames_per_buffer=rate * record_seconds,  # Adjust buffer size for desired chunk size
                    input_device_index=None)  # Use default input device

    print("Recording and transcribing...")
    while continue_streaming:
        data = stream.read(rate * record_seconds)  # Read audio chunk from the microphone
        audio_data = np.frombuffer(data, dtype=np.int16)  # Convert audio chunk to numpy array
        transcribe_audio(audio_data)  # Transcribe audio chunk in real-time

    print("Finished recording and transcribing.")
    stream.stop_stream()
    stream.close()
    p.terminate()

In [10]:
import wave

def record_audio(filename, record_seconds=5, channels=1, rate=16000):
    p = pyaudio.PyAudio()
    stream = p.open(format=FORMAT,
                    channels=channels,
                    rate=rate,
                    input=True,
                    frames_per_buffer=CHUNK,
                    input_device_index=2)
    print("Recording...")
    frames = []
    for i in range(0, int(rate / CHUNK * record_seconds)):
        data = stream.read(CHUNK)
        frames.append(data)
    print("Finished recording.")
    stream.stop_stream()
    stream.close()
    p.terminate()
    wf = wave.open(filename, 'wb')
    wf.setnchannels(channels)
    wf.setsampwidth(p.get_sample_size(FORMAT))
    wf.setframerate(rate)
    wf.writeframes(b''.join(frames))
    wf.close()

In [11]:
import sounddevice as sd

print(sd.query_devices())

    0 Mapowanie dźwięku Microsoft - Input, MME (2 in, 0 out)
>   1 Mikrofon (Virtual Desktop Audio, MME (2 in, 0 out)
    2 Stream Mix (2 — Razer Seiren V2, MME (2 in, 0 out)
    3 Playback Mix (2 — Razer Seiren , MME (2 in, 0 out)
    4 Mikrofon (Voicemod Virtual Audi, MME (2 in, 0 out)
    5 Mikrofon (Steam Streaming Micro, MME (2 in, 0 out)
    6 Headset Microphone (Oculus Virt, MME (2 in, 0 out)
    7 Mikrofon (Razer Barracuda X), MME (2 in, 0 out)
    8 Microphone (2 — Razer Seiren V2, MME (2 in, 0 out)
    9 Mapowanie dźwięku Microsoft - Output, MME (0 in, 2 out)
<  10 Głośniki (Razer Barracuda X), MME (0 in, 2 out)
   11 Słuchawki (Oculus Virtual Audio, MME (0 in, 2 out)
   12 Sound Effects (2 — Razer Seiren, MME (0 in, 2 out)
   13 Game (2 — Razer Seiren V2 X), MME (0 in, 2 out)
   14 Głośniki (7.1 Surround Sound), MME (0 in, 2 out)
   15 Głośniki (Steam Streaming Speak, MME (0 in, 2 out)
   16 Realtek Digital Output (Realtek, MME (0 in, 2 out)
   17 Aux 2 (2 — Razer Seiren V2 

In [222]:
record_audio('test.wav', record_seconds=10)

Recording...
Finished recording.


In [12]:
audio_file = 'test.wav'
audio_rec = transcribe_audio_saved(audio_file)

In [13]:
audio_text = audio_rec['text']

print(audio_text)

 In today's busy times it is worth to find a moment for a breath and reflection. daily rush of life often pushes us with its speed, but but we decide how to deal with him


In [14]:
import torchaudio

# Load your audio file first
waveform, sample_rate = torchaudio.load(audio_file)

In [15]:
from speechbrain.pretrained import EncoderClassifier

classifier = EncoderClassifier.from_hparams(source="speechbrain/spkrec-xvect-voxceleb")

The torchaudio backend is switched to 'soundfile'. Note that 'sox_io' is not supported on Windows.
The torchaudio backend is switched to 'soundfile'. Note that 'sox_io' is not supported on Windows.


In [89]:
with torch.no_grad():
    embeddings = classifier.encode_batch(waveform)
    embeddings = torch.nn.functional.normalize(embeddings, dim=2)
    embeddings = embeddings.squeeze().cpu().numpy()
    embeddings = torch.tensor(embeddings).unsqueeze(0)

In [90]:
embeddings.shape

torch.Size([1, 512])

In [92]:
tts_processor = SpeechT5Processor.from_pretrained("microsoft/speecht5_vc")
# tts_model = SpeechT5ForTextToSpeech.from_pretrained("microsoft/speecht5_tts")
vocoder = SpeechT5HifiGan.from_pretrained("microsoft/speecht5_hifigan")
sts_model = SpeechT5ForSpeechToSpeech.from_pretrained("microsoft/speecht5_vc")

Some weights of SpeechT5ForSpeechToSpeech were not initialized from the model checkpoint at microsoft/speecht5_vc and are newly initialized: ['speecht5.encoder.prenet.pos_sinusoidal_embed.weights']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


In [94]:
from transformers import AutoTokenizer, AutoModelForTextToWaveform

tokenizer = AutoTokenizer.from_pretrained("facebook/mms-tts-eng")
tts_model = AutoModelForTextToWaveform.from_pretrained("facebook/mms-tts-eng")

In [101]:
tts_model_id = "facebook/mms-1b-all"
target_lang = "fra"

tts_pipe = pipeline(model=tts_model_id, model_kwargs={"target_lang": "fra", "ignore_mismatched_sizes": True})

config.json:   0%|          | 0.00/2.04k [00:00<?, ?B/s]

To support symlinks on Windows, you either need to activate Developer Mode or to run Python as an administrator. In order to see activate developer mode, see this article: https://docs.microsoft.com/en-us/windows/apps/get-started/enable-your-device-for-development


model.safetensors:   0%|          | 0.00/3.86G [00:00<?, ?B/s]

adapter.fra.safetensors:   0%|          | 0.00/10.2M [00:00<?, ?B/s]

Some weights of Wav2Vec2ForCTC were not initialized from the model checkpoint at facebook/mms-1b-all and are newly initialized because the shapes did not match:
- lm_head.bias: found shape torch.Size([154]) in the checkpoint and torch.Size([314]) in the model instantiated
- lm_head.weight: found shape torch.Size([154, 1280]) in the checkpoint and torch.Size([314, 1280]) in the model instantiated
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


tokenizer_config.json:   0%|          | 0.00/397 [00:00<?, ?B/s]

vocab.json:   0%|          | 0.00/1.34M [00:00<?, ?B/s]

special_tokens_map.json:   0%|          | 0.00/96.0 [00:00<?, ?B/s]

preprocessor_config.json:   0%|          | 0.00/254 [00:00<?, ?B/s]

In [96]:
import soundfile as sf


def text_to_speech(text):
    inputs = tokenizer(text, return_tensors='pt')
    with torch.no_grad():
        speech = tts_model(**inputs).waveform    
    return speech

In [97]:
speech = text_to_speech(audio_text)

In [99]:
speech.shape

torch.Size([1, 171776])

In [100]:
speech_audio = Audio(speech.numpy(), rate=tts_model.config.sampling_rate)
speech_audio

In [231]:
synthesiser = pipeline(
    "text-to-speech", 
    model=tts_model,
    tokenizer=tts_processor.tokenizer,
    feature_extractor=tts_processor.feature_extractor)

In [142]:
tts_processor

SpeechT5Processor:
- feature_extractor: SpeechT5FeatureExtractor {
  "do_normalize": false,
  "feature_extractor_type": "SpeechT5FeatureExtractor",
  "feature_size": 1,
  "fmax": 7600,
  "fmin": 80,
  "frame_signal_scale": 1.0,
  "hop_length": 16,
  "mel_floor": 1e-10,
  "num_mel_bins": 80,
  "padding_side": "right",
  "padding_value": 0.0,
  "processor_class": "SpeechT5Processor",
  "reduction_factor": 2,
  "return_attention_mask": true,
  "sampling_rate": 16000,
  "win_function": "hann_window",
  "win_length": 64
}

- tokenizer: SpeechT5Tokenizer(name_or_path='microsoft/speecht5_tts', vocab_size=79, model_max_length=600, is_fast=False, padding_side='right', truncation_side='right', special_tokens={'bos_token': '<s>', 'eos_token': '</s>', 'unk_token': '<unk>', 'pad_token': '<pad>', 'mask_token': '<mask>'}, clean_up_tokenization_spaces=True),  added_tokens_decoder={
	0: AddedToken("<s>", rstrip=False, lstrip=False, single_word=False, normalized=False, special=True),
	1: AddedToken("<pa

In [236]:
def text_to_speech_from_pipeline(text):
    speech = synthesiser(text, forward_params={"speaker_embeddings": embeddings})

    # inputs = tts_processor(audio=speech['audio'], sampling_rate=speech['sampling_rate'], return_tensors="pt")
    
    # speech = sts_model.generate_speech(inputs['input_values'], embeddings, vocoder=vocoder)

    sf.write("speech_pipe.wav", speech, samplerate=16000)

In [None]:
text_to_speech_from_pipeline(audio_text)