In [2]:
import os
import pandas as pd
import yt_dlp
from youtube_transcript_api import YouTubeTranscriptApi
import whisperx
import torch
from whisperx.diarize import DiarizationPipeline
from dotenv import load_dotenv

  from .autonotebook import tqdm as notebook_tqdm
  torchaudio.list_audio_backends()


In [3]:
# Report which device torch can use: the GPU ("cuda") when available, else the CPU
print(f"Using device: {'cuda' if torch.cuda.is_available() else 'cpu'}")

Using device: cuda


In [4]:
# ---------- CONFIG ----------
# load_dotenv() comes from python-dotenv; it is called here so that a token
# kept in a .env file is visible to the environment lookup below.
load_dotenv()
# Hugging Face token used by the diarization step; None when the variable is unset.
HF_TOKEN = os.environ.get("HF_TOKEN")

# Prefer the GPU when torch can see one.
DEVICE = "cuda" if torch.cuda.is_available() else "cpu"
# Half precision on the GPU, full precision on the CPU.
compute_type = {"cuda": "float16", "cpu": "float32"}[DEVICE]
# ----------------------------

In [6]:
# Download helper: YouTube URL -> local .mp3 file
def download_audio(video_url, output_name="audio"):
    """Fetch the audio of a YouTube video with yt_dlp and return the .mp3 path.

    Args:
        video_url: URL of the video to download.
        output_name: Base name (without extension) for the output file.

    Returns:
        The path yt_dlp prepared for the download, with its extension
        replaced by ".mp3".
    """
    # Post-processing step that converts the downloaded stream to a 192 kbps mp3.
    mp3_extractor = {
        "key": "FFmpegExtractAudio",
        "preferredcodec": "mp3",
        "preferredquality": "192",
    }
    options = {
        "format": "bestaudio/best",
        "quiet": True,
        "outtmpl": output_name + ".%(ext)s",
        "postprocessors": [mp3_extractor],
    }

    with yt_dlp.YoutubeDL(options) as downloader:
        video_info = downloader.extract_info(video_url, download=True)
        downloaded_name = downloader.prepare_filename(video_info)

    # prepare_filename() reports the pre-conversion extension, so swap it
    # for the one the post-processor is configured to produce.
    stem, _original_ext = os.path.splitext(downloaded_name)
    return stem + ".mp3"

def transcribe_with_whisperx(audio_path, model="base", output_path="transcript.txt"):
    """Generate a speaker-labelled transcript of an audio file using WhisperX.

    Steps: transcription, alignment, speaker diarization, then assignment of
    speakers to the aligned result. The segments are returned as a DataFrame
    and also written to ``output_path`` as one ``speaker: text`` line each.

    Args:
        audio_path: Audio file handed to the transcription, alignment and
            diarization calls.
        model: Whisper model name passed to ``whisperx.load_model``.
        output_path: Where the plain-text transcript is written (UTF-8).
            Defaults to "transcript.txt".

    Returns:
        pandas.DataFrame with the columns ``speaker``, ``start``, ``end`` and
        ``text``, one row per segment. ``speaker`` is "unknown" for segments
        that carry no speaker label. The columns are present even when no
        segments were produced.

    Raises:
        RuntimeError: If ``HF_TOKEN`` is not set.
    """
    if HF_TOKEN is None:
        raise RuntimeError("Missing HF_TOKEN in environment variables.")

    # SECURITY NOTE: forcing weights_only=False lets torch.load unpickle
    # arbitrary objects, which can execute code. It is kept because the model
    # loading below relies on it; only use checkpoints from sources you trust.
    original_torch_load = torch.load

    def _trusted_load(*args, **kwargs):
        kwargs["weights_only"] = False
        return original_torch_load(*args, **kwargs)

    torch.load = _trusted_load
    try:
        # Transcribe (a separate local name keeps the `model` argument intact)
        asr_model = whisperx.load_model(model, DEVICE, compute_type=compute_type)
        result = asr_model.transcribe(audio_path)

        # Alignment
        align_model, align_metadata = whisperx.load_align_model(
            language_code=result["language"], device=DEVICE
        )
        result_aligned = whisperx.align(
            result["segments"], align_model, align_metadata, audio_path, DEVICE
        )

        # Diarization
        diarize_model = DiarizationPipeline(
            use_auth_token=HF_TOKEN, device=DEVICE
        )
        diarization = diarize_model(audio_path)

        # Assign speakers to alignment
        result_aligned = whisperx.assign_word_speakers(diarization, result_aligned)
    finally:
        # Always undo the global monkey-patch so it neither leaks into the
        # rest of the session nor stacks a new wrapper on every call.
        torch.load = original_torch_load

    # Flatten the segments into plain records
    records = [
        {
            "speaker": seg.get("speaker", "unknown"),
            "start": seg["start"],
            "end": seg["end"],
            "text": seg["text"],
        }
        for seg in result_aligned["segments"]
    ]

    # Pin the columns so an empty result still yields a well-formed frame
    df = pd.DataFrame(records, columns=["speaker", "start", "end", "text"])

    # Save output in a text file, one "speaker: text" line per segment
    with open(output_path, "w", encoding="utf-8") as f:
        for rec in records:
            f.write(f"{rec['speaker']}: {rec['text']}\n")

    return df

In [7]:
# Download the audio of the source video; audio_path is the resulting .mp3 file
print("Downloading audio...")
VIDEO_URL = "https://www.youtube.com/watch?v=oya_pgbik7g"
audio_path = download_audio(VIDEO_URL)

Downloading audio...




                                                           

In [8]:
# Transcribe the downloaded audio with the large-v3 model
print("Transcribing with WhisperX...")
df = transcribe_with_whisperx(audio_path, "large-v3")

# transcribe_with_whisperx also writes the transcript to transcript.txt
print("Transcript successfully saved to transcript.txt")
# Preview only the first rows of the result
print(df.head())

Transcribing with WhisperX...


  import pkg_resources
  available_backends = torchaudio.list_audio_backends()


2025-12-02 17:18:15 - whisperx.asr - INFO - No language specified, language will be detected for each audio file (increases inference time)
2025-12-02 17:18:15 - whisperx.vads.pyannote - INFO - Performing voice activity detection using Pyannote...


  if ismodule(module) and hasattr(module, '__file__'):
Lightning automatically upgraded your loaded checkpoint from v1.5.4 to v2.6.0. To apply the upgrade to your files permanently, run `python -m pytorch_lightning.utilities.upgrade_checkpoint c:\Users\Gery\anaconda3\envs\ai_transcribe\lib\site-packages\whisperx\assets\pytorch_model.bin`
  torchaudio.list_audio_backends()


Model was trained with pyannote.audio 0.0.1, yours is 3.4.0. Bad things might happen unless you revert pyannote.audio to 0.x.
Model was trained with torch 1.10.0+cu102, yours is 2.8.0+cu128. Bad things might happen unless you revert torch to 1.x.


It can be re-enabled by calling
   >>> import torch
   >>> torch.backends.cuda.matmul.allow_tf32 = True
   >>> torch.backends.cudnn.allow_tf32 = True
See https://github.com/pyannote/pyannote-audio/issues/1370 for more details.



2025-12-02 17:18:22 - whisperx.asr - INFO - Detected language: fr (1.00) in first 30s of audio
2025-12-02 17:22:49 - whisperx.diarize - INFO - Loading diarization model: pyannote/speaker-diarization-3.1


  torchaudio.list_audio_backends()
  std = sequences.std(dim=-1, correction=1)


Transcript sucessfully saved to transcript.txt
      speaker   start     end  \
0  SPEAKER_00   0.031   3.178   
1  SPEAKER_00   3.239  10.315   
2  SPEAKER_00  10.335  15.407   
3  SPEAKER_00  15.387  22.262   
4  SPEAKER_00  22.282  25.850   

                                                text  
0   Aujourd'hui, on a la chance de recevoir une d...  
1  Elle fait des filatures, mène des enquêtes sou...  
2  Mais surtout, elle a une spécialité très origi...  
3   Elle va nous raconter comment elle a résolu l...  
4  Mais surtout, elle va lever le voile sur le ma...  
