In [10]:
import os
import tempfile

import numpy as np
import torch
import whisper
from datasets import load_dataset
from pydub import AudioSegment
from transformers import WhisperProcessor, WhisperForConditionalGeneration

def export_model(model_name: str):
    """Load a Whisper checkpoint and write it to a local directory.

    The processor and the model are both obtained with ``from_pretrained``
    using ``model_name``, and both are then written with ``save_pretrained``
    to a path equal to ``model_name``.

    Args:
        model_name: Identifier handed to ``from_pretrained``; the same string
            is reused as the output path for ``save_pretrained``.

    Returns:
        A ``(model, processor)`` tuple holding the loaded objects.
    """
    whisper_processor = WhisperProcessor.from_pretrained(model_name)
    whisper_model = WhisperForConditionalGeneration.from_pretrained(model_name)

    # Persist the model first and the processor second, both under the same path.
    for artifact in (whisper_model, whisper_processor):
        artifact.save_pretrained(model_name)

    return whisper_model, whisper_processor
# Export the checkpoint only when the local directory does not exist yet, so
# repeated runs of this cell do not load and save it again.
if not os.path.exists("openai/whisper-base"):
    # Bind the conventional name `processor` (the original target was misspelled).
    model, processor = export_model("openai/whisper-base")



In [11]:
from transformers import WhisperProcessor, WhisperForConditionalGeneration

def load_model(model_name: str):
    """Load a Whisper processor and model from ``model_name``.

    Args:
        model_name: Identifier or path handed to ``from_pretrained`` for both
            the processor and the model.

    Returns:
        A ``(model, processor)`` tuple.
    """
    # The processor is loaded first, then the model; the generator keeps that order.
    loaded_processor, loaded_model = (
        loader.from_pretrained(model_name)
        for loader in (WhisperProcessor, WhisperForConditionalGeneration)
    )
    return loaded_model, loaded_processor

# Usage: load the checkpoint from the local directory. The path is defined once
# in `model_name` and reused so the variable and the call cannot drift apart.
model_name = "./openai/whisper-base"
model, processor = load_model(model_name)

In [20]:
import librosa

def transcribe_long_audio(model, processor, audio_path: str):
    """Transcribe an audio file by processing it in consecutive 30-second windows.

    The file is opened with ``AudioSegment.from_file`` and sliced into windows
    of at most 30 seconds. Each window is exported as a WAV file inside a
    private temporary directory, reloaded with ``librosa.load`` at 16 kHz,
    converted to input features by ``processor``, and decoded with
    ``model.generate``. The temporary directory is removed when the function
    exits, including when an exception is raised part-way through.

    Args:
        model: Object providing ``to(device)`` and ``generate(input_features)``;
            it is moved to CUDA when available, otherwise to CPU.
        processor: Callable that produces ``input_features`` from raw audio and
            provides ``batch_decode``.
        audio_path: Path of the audio file to transcribe.

    Returns:
        The per-window transcriptions joined by single spaces. Windows that
        decode to empty or whitespace-only text are skipped; an empty
        recording yields ``""``.
    """
    sampling_rate = 16000  # rate used for both librosa.load and the processor
    # 30 seconds in milliseconds; presumably chosen to match the input window
    # Whisper models expect -- confirm against the model documentation.
    segment_duration_ms = 30 * 1000

    device = "cuda" if torch.cuda.is_available() else "cpu"
    model.to(device)

    audio = AudioSegment.from_file(audio_path)
    duration_ms = len(audio)

    transcriptions = []
    # Scratch WAV files live in a private temporary directory so they can never
    # clobber a user's `segment_N.wav` in the working directory, and so they are
    # cleaned up even if loading or generation fails.
    with tempfile.TemporaryDirectory() as scratch_dir:
        for i, start_ms in enumerate(range(0, duration_ms, segment_duration_ms)):
            end_ms = min(start_ms + segment_duration_ms, duration_ms)
            segment_path = os.path.join(scratch_dir, f"segment_{i}.wav")
            audio[start_ms:end_ms].export(segment_path, format="wav")

            audio_data, _ = librosa.load(segment_path, sr=sampling_rate)
            input_features = processor(
                audio_data, return_tensors="pt", sampling_rate=sampling_rate
            ).input_features
            input_features = input_features.to(device)
            generated_ids = model.generate(input_features)
            transcription = processor.batch_decode(generated_ids, skip_special_tokens=True)[0]

            # Strip per-window whitespace so joining never yields double spaces.
            transcription = transcription.strip()
            if transcription:
                transcriptions.append(transcription)

    return " ".join(transcriptions)

# Input recording, given as a bare file name relative to the working directory.
audio_path = "ytmp3free.cc_the-cure-friday-im-in-love-youtubemp3free.org.mp3"

if __name__ == "__main__":
    # Run the windowed transcription, then print the heading and the text on
    # separate lines.
    transcription = transcribe_long_audio(model, processor, audio_path)
    print("Transcripción completa:", transcription, sep="\n")

Transcripción completa:
Thank you everyone and cut!  I don't care if Monday's blue Tuesday is grey and Wednesday too Thursday I don't care about you it's Friday I'm in love I'm there you can fall apart Tuesday Wednesday break my heart all Thursday doesn't even stop it's Friday I'm in love  Saturday  One day you can hold your hand Choose day when say stay in bed of Thursday Watch the walls instead it's Friday, I live love Saturday, wait Sunday always comes to  I'm  Friday I'm in love I don't care if my face blue Choose this prayer when stay tuned First day I don't care but choose this Friday I'm in love  Oh, oh, oh, oh, oh, oh, oh, oh, oh, oh, oh, oh, oh, oh, oh, oh, oh, oh, oh, oh, oh, oh, oh, oh, oh, oh, oh, oh, oh, oh, oh, oh, oh, oh, oh, oh, oh, oh, oh, oh, oh, oh, oh, oh, oh, oh, oh, oh, oh, oh, oh, oh, oh, oh, oh, oh, oh, oh, oh, oh, oh, oh, oh, oh, oh, oh, oh, oh, oh, oh, oh, oh, oh, oh, oh, oh, oh, oh, oh, oh, oh, oh, oh, oh, oh, oh, oh, oh, oh, oh, oh, oh, oh, oh, oh, oh, oh, o