In [1]:
from pytube import Channel, YouTube
from pydub import AudioSegment
import whisperx
from dotenv import load_dotenv
import os
import yt_dlp
import re
import torch
# Allow TF32 kernels for CUDA matmul and cuDNN ops. The captured output of the last cell
# shows a library message saying these flags "can be re-enabled" exactly this way
# (pyannote-audio issue #1370), which suggests something may switch them off again later.
# NOTE(review): confirm they are still in effect when the models actually run.
torch.backends.cuda.matmul.allow_tf32 = True
torch.backends.cudnn.allow_tf32 = True

# Load variables from a .env file (python-dotenv) so os.getenv("HUGGINGFACE_TOKEN") can see them.
load_dotenv()

def get_hf_token():
    """Return the Hugging Face access token from the environment.

    Reads the ``HUGGINGFACE_TOKEN`` environment variable.

    Returns:
        str: The token with surrounding whitespace removed.

    Raises:
        ValueError: If the variable is unset, empty, or contains only whitespace.
    """
    hf_token = os.getenv("HUGGINGFACE_TOKEN")
    # A blank value (e.g. `HUGGINGFACE_TOKEN=` in .env) is as unusable as a missing one;
    # fail here with a clear message instead of deep inside the model download.
    if hf_token is None or not hf_token.strip():
        raise ValueError("Please set your Hugging Face token in HUGGINGFACE_TOKEN environment variable.")
    return hf_token.strip()

In [15]:
def sanitize_filename(name):
    """Turn an arbitrary title into a string that is safe to use as a file name.

    Removes the characters Windows forbids in file names (backslash, slash, ``*``,
    ``?``, ``:``, double quote, angle brackets, ``|``) together with ASCII control
    characters, then trims surrounding whitespace.

    Args:
        name (str): Raw title.

    Returns:
        str: The cleaned name, or ``"untitled"`` when nothing usable is left, so
        callers never build a path such as ``".wav"``.
    """
    # \x00-\x1f: control characters (newlines, tabs, ...) are also invalid on Windows
    cleaned = re.sub(r'[\\/*?:"<>|\x00-\x1f]', "", name).strip()
    return cleaned or "untitled"

def get_recent_videos(channel_url, k=3):
    """Return watch URLs for the first k videos yt-dlp lists for a channel page.

    The listing is fetched in flat mode (metadata only, nothing is downloaded).
    Ordering is whatever yt-dlp reports for the page; for a channel's ``/videos``
    tab this is presumably newest-first -- verify for other URL shapes.

    Args:
        channel_url (str): Channel / handle URL understood by yt-dlp.
        k (int): Maximum number of videos to return.

    Returns:
        list[str]: ``https://www.youtube.com/watch?v=<id>`` URLs; empty when the
        page yields no usable entries.
    """
    ydl_opts = {'quiet': True, 'extract_flat': True}
    if isinstance(k, int) and k > 0:
        # Only k items are needed, so stop enumerating the channel after k entries
        # instead of paging through its entire upload history.
        ydl_opts['playlistend'] = k

    with yt_dlp.YoutubeDL(ydl_opts) as ydl:
        info = ydl.extract_info(channel_url, download=False)

    # `info` or its "entries" value can be None; entries can also be None or lack "id".
    entries = (info or {}).get("entries") or []
    with_id = [entry for entry in entries if entry and entry.get("id")]
    return ["https://www.youtube.com/watch?v=" + entry["id"] for entry in with_id[:k]]

def download_youtube_audio(url, output_dir="audio"):
    """Download one video's audio track as WAV and name the file after the video title.

    yt-dlp writes to a fixed ``temp_audio.<ext>`` name inside ``output_dir`` and its
    FFmpegExtractAudio post-processor is asked for WAV; the resulting
    ``temp_audio.wav`` is then moved to ``<sanitized title>.wav``.

    Args:
        url (str): Video URL.
        output_dir (str): Directory for the audio file (created if missing).

    Returns:
        tuple: ``(output_path, safe_title)`` on success, ``(None, None)`` when
        anything goes wrong (the error is printed and the video is skipped).
    """
    os.makedirs(output_dir, exist_ok=True)
    ydl_opts = {
        'format': 'bestaudio/best',
        'quiet': True,
        'outtmpl': os.path.join(output_dir, 'temp_audio.%(ext)s'),
        'postprocessors': [{
            'key': 'FFmpegExtractAudio',
            'preferredcodec': 'wav',
            'preferredquality': '192',
        }],
    }
    temp_path = os.path.join(output_dir, "temp_audio.wav")
    try:
        # A leftover temp file from an earlier failed run must not be mistaken
        # for this video's audio.
        if os.path.exists(temp_path):
            os.remove(temp_path)

        with yt_dlp.YoutubeDL(ydl_opts) as ydl:
            info = ydl.extract_info(url, download=True)

        video_title = (info or {}).get("title") or "video"
        # Fall back to "video" if sanitizing leaves nothing, to avoid a bare ".wav" path
        safe_title = sanitize_filename(video_title) or "video"
        output_path = os.path.join(output_dir, f"{safe_title}.wav")

        if not os.path.exists(temp_path):
            # Without this check a non-existent (or stale) output_path would be returned
            raise FileNotFoundError(f"expected converted audio at {temp_path}")

        # os.replace overwrites an existing target; os.rename raises on Windows
        # when the same video is processed a second time.
        os.replace(temp_path, output_path)
        return output_path, safe_title
    except Exception as e:
        # Deliberate best-effort: report and let the caller move on to the next video
        print(f"Skipping video {url} due to error: {e}")
        return None, None

def transcribe_with_diarization(audio_path, hf_token, model_size="large-v2"):
    """Transcribe an audio file with WhisperX, align it, and attach speaker labels.

    Steps: load audio -> Whisper transcription -> alignment with the model for the
    detected language -> diarization -> assign speakers to the aligned result.

    Args:
        audio_path (str): Path to the audio file.
        hf_token (str): Hugging Face token passed to the diarization pipeline.
        model_size (str): Whisper model name given to ``whisperx.load_model``.

    Returns:
        The value returned by ``whisperx.assign_word_speakers``; ``save_transcript``
        reads its ``"segments"`` list.
    """
    device = "cuda" if torch.cuda.is_available() else "cpu"
    # load_model defaults to float16, which the backend rejects on CPU; use int8
    # there so the CPU fallback actually works.
    compute_type = "float16" if device == "cuda" else "int8"

    # Load audio
    audio = whisperx.load_audio(audio_path)

    # Load Whisper model
    model = whisperx.load_model(model_size, device=device, compute_type=compute_type)

    # Transcribe
    result = model.transcribe(audio)

    # Alignment
    alignment_model, metadata = whisperx.load_align_model(
        language_code=result["language"], device=device
    )
    result = whisperx.align(result["segments"], alignment_model, metadata, audio, device=device)

    # Diarization -- pass the device explicitly; the pipeline otherwise defaults to CPU
    # even when the rest of the function runs on the GPU.
    diarization_model = whisperx.DiarizationPipeline(use_auth_token=hf_token, device=device)
    diarization_result = diarization_model(audio_path)

    # Assign speakers
    diarized_result = whisperx.assign_word_speakers(diarization_result, result)
    return diarized_result

def save_transcript(diarized_result, video_title, output_dir="transcripts"):
    """Write a diarized transcript to ``<output_dir>/<title>.txt``.

    One line is written per segment in the form
    ``SPEAKER [start-end]: text`` with times formatted to two decimals.

    Args:
        diarized_result (dict): Must contain a ``"segments"`` list whose items have
            ``"start"``, ``"end"`` and ``"text"``; ``"speaker"`` is optional.
        video_title (str): Used for the file name; characters that are illegal in
            Windows file names are replaced with ``_``.
        output_dir (str): Target directory (created if missing).

    Returns:
        str: Path of the file that was written.
    """
    os.makedirs(output_dir, exist_ok=True)
    # "/" was always mapped to "_"; apply the same to the other characters that
    # sanitize_filename treats as illegal so the path is valid on Windows too.
    safe_title = re.sub(r'[\\/*?:"<>|]', "_", video_title)
    output_file = os.path.join(output_dir, f"{safe_title}.txt")

    with open(output_file, "w", encoding="utf-8") as f:
        for segment in diarized_result["segments"]:
            # Segments without an assigned speaker have no "speaker" key;
            # label them instead of aborting the whole transcript.
            speaker = segment.get("speaker", "UNKNOWN")
            start = segment["start"]
            end = segment["end"]
            text = segment["text"]
            f.write(f"{speaker} [{start:.2f}-{end:.2f}]: {text}\n")
    print(f"Saved transcript: {output_file}")
    return output_file

# -------------------------
# Main workflow
# -------------------------

def transcribe_recent_videos(channel_url, k=3):
    """Download, transcribe and save transcripts for up to k videos of a channel.

    Args:
        channel_url (str): Channel URL passed to ``get_recent_videos``.
        k (int): Maximum number of videos to process.

    Returns:
        list: Paths of the transcript files written; videos whose download
        failed are skipped and therefore absent.

    Raises:
        ValueError: From ``get_hf_token`` when the Hugging Face token is not set.
    """
    hf_token = get_hf_token()
    recent_video_urls = get_recent_videos(channel_url, k)

    transcripts = []
    for url in recent_video_urls:
        print(f"Processing video: {url}")
        audio_path, title = download_youtube_audio(url)
        if audio_path is None:
            # download_youtube_audio returns (None, None) after printing the error;
            # passing None on would crash transcription and abort the whole batch.
            continue
        diarized_result = transcribe_with_diarization(audio_path, hf_token)
        transcript_file = save_transcript(diarized_result, title)
        transcripts.append(transcript_file)

    return transcripts

In [None]:
# Channel "videos" tab to process; up to k of the entries it lists are downloaded and transcribed.
channel_url = "https://www.youtube.com/@pizzaandproperty1246/videos"
# `transcripts` receives the list of transcript file paths returned by the workflow.
transcripts = transcribe_recent_videos(channel_url, k=3)

Deprecated Feature: Support for Python version 3.9 has been deprecated. Please update to Python 3.10 or above
Deprecated Feature: Support for Python version 3.9 has been deprecated. Please update to Python 3.10 or above


Processing video: https://www.youtube.com/watch?v=KVF6A6snzFk
                                                         

  from .autonotebook import tqdm as notebook_tqdm
Xet Storage is enabled for this repo, but the 'hf_xet' package is not installed. Falling back to regular HTTP download. For better performance, install the package with: `pip install huggingface_hub[hf_xet]` or `pip install hf_xet`
To support symlinks on Windows, you either need to activate Developer Mode or to run Python as an administrator. In order to activate developer mode, see this article: https://docs.microsoft.com/en-us/windows/apps/get-started/enable-your-device-for-development


No language specified, language will be first be detected for each audio file (increases inference time).
>>Performing voice activity detection using Pyannote...


  if ismodule(module) and hasattr(module, '__file__'):
Lightning automatically upgraded your loaded checkpoint from v1.5.4 to v2.5.3. To apply the upgrade to your files permanently, run `python -m pytorch_lightning.utilities.upgrade_checkpoint C:\HENRY\GitHub\property_researcher_LLM\env\lib\site-packages\whisperx\assets\pytorch_model.bin`
It can be re-enabled by calling
   >>> import torch
   >>> torch.backends.cuda.matmul.allow_tf32 = True
   >>> torch.backends.cudnn.allow_tf32 = True
See https://github.com/pyannote/pyannote-audio/issues/1370 for more details.



Model was trained with pyannote.audio 0.0.1, yours is 3.3.2. Bad things might happen unless you revert pyannote.audio to 0.x.
Model was trained with torch 1.10.0+cu102, yours is 2.6.0+cu118. Bad things might happen unless you revert torch to 1.x.
