Using Whisper - OpenAI

In [None]:
import whisper
import numpy as np

def transcribe_long_audio(audio_path, model_name="base"):
    """Transcribe an audio file of any length with Whisper, one 30-second window at a time.

    The language is detected once, from the first 30 seconds of audio, and is
    reused for every window. Windows are cut at fixed 30-second offsets (not at
    pauses), decoded independently with ``whisper.decode``, and the non-empty
    texts are joined with single spaces.

    Args:
        audio_path: Path of the audio file, passed to ``whisper.load_audio``.
        model_name: Whisper checkpoint name, passed to ``whisper.load_model``.

    Returns:
        str: The joined transcription, or an empty string when no window
        produced any text (including when the audio has no samples).
    """
    sample_rate = 16000  # samples per second assumed for the loaded audio
    segment_length = 30  # seconds per decoding window
    samples_per_segment = segment_length * sample_rate

    model = whisper.load_model(model_name)
    audio = whisper.load_audio(audio_path)

    # Detect the language from the first window only; it is assumed to hold
    # for the rest of the recording.
    audio_sample = whisper.pad_or_trim(audio[:samples_per_segment])
    mel_sample = whisper.log_mel_spectrogram(audio_sample, n_mels=model.dims.n_mels).to(model.device)
    _, probs = model.detect_language(mel_sample)
    detected_language = max(probs, key=probs.get)
    print(f"Language Detected: {detected_language}")

    audio_duration = len(audio) / sample_rate
    print(f"Audio Duration: {audio_duration:.2f} seconds")

    # DecodingOptions defaults to fp16=True. whisper.transcribe switches to
    # FP32 on CPU, but whisper.decode does not, so choose the precision here.
    use_fp16 = model.device.type != "cpu"

    # The options do not change between windows, so build them once.
    options = whisper.DecodingOptions(
        language=detected_language,
        without_timestamps=False,
        fp16=use_fp16,
    )

    segment_starts = range(0, len(audio), samples_per_segment)
    num_segments = len(segment_starts)

    pieces = []
    for i, start in enumerate(segment_starts):
        end = min(start + samples_per_segment, len(audio))

        print(f"Transcription of segment {i+1}/{num_segments} ({start/sample_rate:.1f}s - {end/sample_rate:.1f}s)...")

        # The last window is usually shorter than 30 s; pad it with silence
        # so every window has the same length.
        audio_segment = whisper.pad_or_trim(audio[start:end], samples_per_segment)
        mel = whisper.log_mel_spectrogram(audio_segment, n_mels=model.dims.n_mels).to(model.device)

        result = whisper.decode(model, mel, options)

        segment_text = result.text.strip()
        if segment_text:
            pieces.append(segment_text)

    return " ".join(pieces)

# Example usage: transcribe a recording with Whisper and save the text to disk.
if __name__ == "__main__":
    # Input recording and the Whisper checkpoint to load for this run.
    audio_path = "record_out (2).wav"
    model_name = "medium"

    transcription = transcribe_long_audio(audio_path, model_name=model_name)

    print("\n==== Completed ====")
    print(transcription)

    # Persist the transcript as UTF-8 text next to the notebook.
    with open("transcription.txt", mode="w", encoding="utf-8") as f:
        f.write(transcription)
    print("\nSaved to 'transcription.txt'")

Using Google's Speech Recognition

In [None]:
import speech_recognition as sr

def transcribe_audio(file_path=None, language="es-ES"):
    """Transcribe an audio file with Google's Web Speech API.

    Args:
        file_path: Path of the audio file, passed to ``sr.AudioFile``.
            Required, even though it defaults to ``None``.
        language: Language tag forwarded to ``recognize_google``.
            Defaults to ``"es-ES"``, the value that was previously hard-coded.

    Returns:
        str: The recognized text, or an empty string when the speech could
        not be understood or the request to the service failed.

    Raises:
        ValueError: If ``file_path`` is ``None`` or empty.
    """
    # Without a path there is no audio to recognize. Fail early with a clear
    # message instead of reaching recognize_google with `audio` unassigned.
    if not file_path:
        raise ValueError("file_path is required: no audio source was provided")

    recognizer = sr.Recognizer()

    try:
        with sr.AudioFile(file_path) as source:
            print("Processing...")
            audio = recognizer.record(source)

        text = recognizer.recognize_google(audio, language=language)
        print(f"Recognized text: {text}")
        return text

    except sr.UnknownValueError:
        # Recognition ran but produced no usable result.
        print("Impossible to understand the audio")
        return ""
    except sr.RequestError as e:
        # The recognition request itself failed.
        print(f"Error with Google Speech Recognition; {e}")
        return ""

if __name__ == "__main__":
    # Run the Google recognizer on the sample recording and keep the result.
    text = transcribe_audio(file_path="record_out (2).wav")