In [70]:
import torch
from transformers import Speech2TextProcessor, Speech2TextForConditionalGeneration
from datasets import load_dataset
import soundfile as sf

# Pin everything to the CPU to sidestep the MPS issues seen with this model.
device = torch.device("cpu")
print("Device:", device)

# The model and its processor are loaded from the same checkpoint id.
checkpoint = "facebook/s2t-small-librispeech-asr"

model = Speech2TextForConditionalGeneration.from_pretrained(checkpoint)
model = model.to(device, dtype=torch.float32)
print(model)

processor = Speech2TextProcessor.from_pretrained(checkpoint)
print(processor)

def map_to_array(batch):
    """Read the audio file named by ``batch["file"]`` and attach its samples.

    The decoded samples returned by ``sf.read`` are stored under the
    ``"speech"`` key; the sample rate that ``sf.read`` also returns is
    discarded. The same ``batch`` mapping is returned after being updated.
    """
    waveform = sf.read(batch["file"])[0]
    batch["speech"] = waveform
    return batch

# Load the dummy LibriSpeech validation split and attach the decoded audio
# samples to every row via map_to_array.
ds = load_dataset("patrickvonplaten/librispeech_asr_dummy", "clean", split="validation")
ds = ds.map(map_to_array)

# Index the row first: ds["speech"][0] can materialise the entire "speech"
# column just to read a single example.
first_waveform = ds[0]["speech"]

inputs = processor(first_waveform, sampling_rate=16_000, return_tensors="pt").to(device, dtype=torch.float32)
generated_ids = model.generate(input_features=inputs["input_features"],
                               attention_mask=inputs["attention_mask"])

# Drop special tokens (e.g. the end-of-sequence marker) so only the plain
# text is printed, matching what transcribe_audio returns.
transcription = processor.batch_decode(generated_ids, skip_special_tokens=True)
print("Transcription:", transcription)

Device: cpu


Some weights of Speech2TextForConditionalGeneration were not initialized from the model checkpoint at facebook/s2t-small-librispeech-asr and are newly initialized: ['model.decoder.embed_positions.weights', 'model.encoder.embed_positions.weights']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


Speech2TextForConditionalGeneration(
  (model): Speech2TextModel(
    (encoder): Speech2TextEncoder(
      (conv): Conv1dSubsampler(
        (conv_layers): ModuleList(
          (0): Conv1d(80, 1024, kernel_size=(5,), stride=(2,), padding=(2,))
          (1): Conv1d(512, 512, kernel_size=(5,), stride=(2,), padding=(2,))
        )
      )
      (embed_positions): Speech2TextSinusoidalPositionalEmbedding()
      (layers): ModuleList(
        (0-11): 12 x Speech2TextEncoderLayer(
          (self_attn): Speech2TextAttention(
            (k_proj): Linear(in_features=256, out_features=256, bias=True)
            (v_proj): Linear(in_features=256, out_features=256, bias=True)
            (q_proj): Linear(in_features=256, out_features=256, bias=True)
            (out_proj): Linear(in_features=256, out_features=256, bias=True)
          )
          (self_attn_layer_norm): LayerNorm((256,), eps=1e-05, elementwise_affine=True)
          (activation_fn): ReLU()
          (fc1): Linear(in_features=2

In [71]:
# Function to load and preprocess the audio file
def _resample_linear(waveform, source_rate, target_rate):
    """Resample a 1-D waveform from source_rate to target_rate by linear interpolation."""
    # NOTE: plain linear interpolation applies no anti-aliasing filter. It is
    # sufficient for this demo; swap in a dedicated resampler if quality matters.
    new_length = max(1, round(len(waveform) * target_rate / source_rate))
    samples = torch.as_tensor(waveform, dtype=torch.float32).reshape(1, 1, -1)
    resampled = torch.nn.functional.interpolate(
        samples, size=new_length, mode="linear", align_corners=False
    )
    return resampled.reshape(-1).numpy()


def transcribe_audio(file_path):
    """Transcribe an audio file with the notebook's Speech2Text model.

    The file is read with ``sf.read``, mixed down to mono when it has more
    than one channel, resampled to 16 kHz when it was recorded at another
    rate, and then passed through the module-level ``processor`` and
    ``model`` on ``device``.

    Args:
        file_path: Path to an audio file that ``sf.read`` can open.

    Returns:
        The list of strings produced by ``processor.batch_decode`` with
        special tokens skipped.

    Raises:
        ValueError: If the file contains no audio samples.
    """
    # Sampling rate declared to the processor below; the audio must really
    # be at this rate, not merely labelled with it.
    target_rate = 16000

    # Load the audio file
    speech, sample_rate = sf.read(file_path)

    if speech.size == 0:
        raise ValueError(f"No audio samples found in {file_path!r}")

    # sf.read yields a (frames, channels) array for multi-channel files;
    # average the channels to obtain a single 1-D waveform.
    if speech.ndim > 1:
        speech = speech.mean(axis=1)

    # Previously 24 kHz input was accepted and handed to the processor as if
    # it were 16 kHz, while real 16 kHz input was rejected. Convert instead.
    if sample_rate != target_rate:
        speech = _resample_linear(speech, sample_rate, target_rate)

    # Process the audio file
    inputs = processor(speech, sampling_rate=target_rate, return_tensors="pt").to(device)

    # Generate transcription
    generated_ids = model.generate(
        input_features=inputs["input_features"],
        attention_mask=inputs["attention_mask"],
    )

    # Decode the generated IDs to get the transcription
    return processor.batch_decode(generated_ids, skip_special_tokens=True)

# Recording to run through the model
audio_file = "Audios/output.wav"

# Produce the transcription for that recording and display it
transcription = transcribe_audio(audio_file)
print("Transcription:", transcription)

Transcription: ["hello my name is hafiz hassan mustapha i am twenty four year old i'm a mechanical engineer and this audio is for testing purpose"]


In [None]:
from gtts import gTTS
import os

# Text to convert
text = "Hello, My name is Hafiz Hassan Mustafa. I'm 24 year old. I'm a Mechanical Engineer. And this audio is for testing purpose."

# Language
language = 'en'

# Output location. NOTE: gTTS writes MP3-encoded audio regardless of the
# file extension, so this is not a true WAV file. The name is kept because
# the transcription cell reads "Audios/output.wav".
output_path = "Audios/output.wav"

# Make sure the target folder exists; saving fails if it is missing.
os.makedirs(os.path.dirname(output_path), exist_ok=True)

# Create gTTS object
audio = gTTS(text=text, lang=language, slow=False, tld="com.au")

# Write the synthesized speech to disk
audio.save(output_path)