In [24]:
import whisper
import os
import numpy as np
import torch
from pydub import AudioSegment
from transformers import WhisperProcessor, WhisperForConditionalGeneration

from datasets import load_dataset

# Notebook configuration.
# `model_name` is the Whisper checkpoint identifier. It must not start with "/":
# a leading slash turns it into an absolute filesystem path and no longer matches
# the "openai/whisper-base" identifier used by the export and load cells.
model_name = "openai/whisper-base"
# Audio file to transcribe, given relative to the notebook's working directory.
audio_path = "ytmp3free.cc_the-cure-friday-im-in-love-youtubemp3free.org.mp3"

In [25]:
def export_model(model_name: str):
    """
    Load a Whisper processor and model by name and save both to disk.

    Both artifacts are saved under a path equal to ``model_name`` itself, so
    the identifier used for loading doubles as the output directory.

    Args:
        model_name: str, name of the model to export; also used as the
            directory the model and processor are saved to.

    Returns:
        tuple: ``(model, processor)`` as loaded, after they have been saved.
    """
    # Load order is kept: processor first, then model.
    whisper_processor, whisper_model = [
        component_cls.from_pretrained(model_name)
        for component_cls in (WhisperProcessor, WhisperForConditionalGeneration)
    ]

    # Save order is kept: model first, then processor.
    whisper_model.save_pretrained(model_name)
    whisper_processor.save_pretrained(model_name)

    return whisper_model, whisper_processor

# Run the export only when no local "openai/whisper-base" path exists yet, so
# the export step is skipped on later runs. When the path already exists this
# cell leaves `model` and `processor` unassigned.
if not os.path.exists("openai/whisper-base"): 
    model, processor = export_model("openai/whisper-base")



In [28]:
def load_model(model_name: str):
    """
    Load a Whisper model and its processor from a directory.

    Args:
        model_name: str, directory that holds the saved model and processor.

    Returns:
        tuple: ``(model, processor)``.
    """
    # Dict literals evaluate in source order, so the processor is still
    # loaded before the model.
    components = {
        "processor": WhisperProcessor.from_pretrained(model_name),
        "model": WhisperForConditionalGeneration.from_pretrained(model_name),
    }
    return components["model"], components["processor"]


# Load the model and processor from the local "./openai/whisper-base" directory.
# These module-level names are the ones passed to transcribe_long_audio below.
model, processor = load_model("./openai/whisper-base")

In [None]:
def segment_audio(audio_path: str, segment_duration_ms: int = 30000):
    """
    Generator that splits an audio file into consecutive segments and exports
    each one to a .wav file.

    Segments are written as ``segment_<index>.wav`` (a relative path) and each
    path is yielded right after its file has been exported. The last segment
    is shorter when the audio length is not a multiple of
    ``segment_duration_ms``. The files are not deleted here; whoever consumes
    the generator is responsible for removing them.

    Args:
        audio_path: str, path of the audio file to segment.
        segment_duration_ms: int, segment length in milliseconds; must be
            positive.

    Yields:
        str: path of the .wav file that was just written.

    Raises:
        ValueError: if ``segment_duration_ms`` is not positive. Because this
            is a generator, the error surfaces on the first iteration.
    """
    # A negative step would make range() empty and silently produce no
    # segments (and therefore an empty transcription); fail loudly instead.
    if segment_duration_ms <= 0:
        raise ValueError(
            f"segment_duration_ms must be a positive integer, got {segment_duration_ms!r}"
        )

    audio = AudioSegment.from_file(audio_path)
    duration_ms = len(audio)

    segment_starts = range(0, duration_ms, segment_duration_ms)
    for index, start_ms in enumerate(segment_starts):
        # Clamp the end so the final segment stops at the end of the audio.
        end_ms = min(start_ms + segment_duration_ms, duration_ms)
        segment_path = f"segment_{index}.wav"
        audio[start_ms:end_ms].export(segment_path, format="wav")
        yield segment_path

In [None]:
import librosa

def transcribe_long_audio(model:WhisperForConditionalGeneration, processor:WhisperProcessor, audio_path: str):
    """
    Transcribe a long audio file by splitting it into 30-second segments.

    The audio is split with ``segment_audio`` (default segment length), each
    segment file is loaded at 16 kHz, transcribed on its own, and the
    temporary segment file is deleted afterwards, even when transcription of
    that segment fails.

    Args:
        model: WhisperForConditionalGeneration, Whisper model. It is moved to
            CUDA when available, otherwise to CPU (this affects the caller's
            model object).
        processor: WhisperProcessor, Whisper processor used for feature
            extraction and decoding.
        audio_path: str, path to the audio file.

    Returns:
        str: the segment transcriptions joined with newlines, with leading
        and trailing whitespace stripped.
    """
    sampling_rate = 16000  # used both for loading and for feature extraction

    device = "cuda" if torch.cuda.is_available() else "cpu"
    model.to(device)

    segment_transcriptions = []
    for segment_path in segment_audio(audio_path):
        try:
            audio_data, _ = librosa.load(segment_path, sr=sampling_rate)
            input_features = processor(
                audio_data, return_tensors="pt", sampling_rate=sampling_rate
            ).input_features
            input_features = input_features.to(device)
            generated_ids = model.generate(input_features)
            transcription = processor.batch_decode(generated_ids, skip_special_tokens=True)[0]
            segment_transcriptions.append(transcription)
        finally:
            # Always clean up the temporary segment file, including on error.
            # The existence check avoids masking the original exception with
            # a FileNotFoundError.
            if os.path.exists(segment_path):
                os.remove(segment_path)

    return "\n".join(segment_transcriptions).strip()


if __name__ == "__main__":
    # Transcribe the configured audio file, then print a header line followed
    # by the transcription on the next line.
    transcription = transcribe_long_audio(model, processor, audio_path)
    print("Transcripción completa:", transcription, sep="\n")

Transcripción completa:
Thank you everyone and cut!
 I don't care if Monday's blue Tuesday is grey and Wednesday too Thursday I don't care about you it's Friday I'm in love I'm there you can fall apart Tuesday Wednesday break my heart all Thursday doesn't even stop it's Friday I'm in love
 Saturday
 One day you can hold your hand Choose day when say stay in bed of Thursday Watch the walls instead it's Friday, I live love Saturday, wait Sunday always comes to
 I'm
 Friday I'm in love I don't care if my face blue Choose this prayer when stay tuned First day I don't care but choose this Friday I'm in love
 Oh, oh, oh, oh, oh, oh, oh, oh, oh, oh, oh, oh, oh, oh, oh, oh, oh, oh, oh, oh, oh, oh, oh, oh, oh, oh, oh, oh, oh, oh, oh, oh, oh, oh, oh, oh, oh, oh, oh, oh, oh, oh, oh, oh, oh, oh, oh, oh, oh, oh, oh, oh, oh, oh, oh, oh, oh, oh, oh, oh, oh, oh, oh, oh, oh, oh, oh, oh, oh, oh, oh, oh, oh, oh, oh, oh, oh, oh, oh, oh, oh, oh, oh, oh, oh, oh, oh, oh, oh, oh, oh, oh, oh, oh, oh, oh, oh, o