In [2]:
import os
from pathlib import Path
import yt_dlp
import whisper
import re

# Working folder (relative to the current working directory) that holds the
# downloaded .mp3 audio and the .txt transcripts written next to it.
DOWNLOAD_DIR = Path("transcripts")
# exist_ok=True makes re-running this cell safe when the folder already exists.
DOWNLOAD_DIR.mkdir(exist_ok=True)

In [7]:
def sanitize_filename(title):
    """
    Turn a video title into a string that is safe to use as a file stem.

    Removes the characters ``< > : " / \\ | ? * $`` as well as ASCII control
    characters (0x00-0x1F, e.g. newlines and tabs). All other characters,
    including spaces and non-ASCII punctuation, are kept unchanged.

    Args:
        title: Raw title string.

    Returns:
        str: The cleaned title, or ``"untitled"`` when nothing but whitespace
        is left after cleaning (so callers never build a bare ``.mp3`` name).
    """
    # Remove characters that are invalid for Windows/ffmpeg, plus control chars.
    cleaned = re.sub(r'[<>:"/\\|?*$\x00-\x1f]', '', title)
    # A title made only of stripped characters would otherwise yield "".
    return cleaned if cleaned.strip() else "untitled"

def download_recent_videos(channel_name, k=3):
    """
    Download the audio of the first ``k`` videos listed on a channel's /videos tab.

    Each video's audio is extracted to MP3 and stored as
    ``DOWNLOAD_DIR / "<sanitized title>.mp3"``. A video is skipped when either
    that MP3 or the matching ``.txt`` transcript already exists, so re-running
    the notebook does not fetch the same audio again.

    Args:
        channel_name: YouTube handle without the leading ``@``.
        k: Maximum number of videos to process. ``None`` applies no limit;
            zero or a negative number returns an empty list without any
            network access.

    Returns:
        list[Path]: The expected MP3 path of every processed video, skipped
        ones included, in listing order. A path is returned even when only the
        transcript exists, so the transcription step can pick up the cached text.

    Raises:
        Errors raised by yt-dlp while listing or downloading are not caught here.
    """
    if k is not None and k <= 0:
        return []

    channel_url = f"https://www.youtube.com/@{channel_name}/videos"
    base_opts = {
        "format": "bestaudio/best",
        "quiet": True,
        "noplaylist": True,
        "postprocessors": [{
            "key": "FFmpegExtractAudio",
            "preferredcodec": "mp3",
            "preferredquality": "192",
        }],
        "playlistend": k,
    }

    # Listing pass only: collect metadata without downloading anything.
    with yt_dlp.YoutubeDL(base_opts) as ydl:
        info = ydl.extract_info(channel_url, download=False)

    if not info:
        return []

    # A single-video result has no "entries" key; treat it as a one-item list.
    entries = info["entries"] if "entries" in info else [info]
    # Defensive: drop empty entries (presumably possible for unavailable
    # videos -- TODO confirm) before indexing into them.
    videos = [entry for entry in entries if entry][:k]

    downloaded_files = []
    for video in videos:
        safe_title = sanitize_filename(video["title"])
        filename = DOWNLOAD_DIR / f"{safe_title}.mp3"
        text_filename = DOWNLOAD_DIR / f"{safe_title}.txt"

        if text_filename.exists() or filename.exists():
            print(f"Skipping already downloaded: {safe_title}")
        else:
            print(f"Downloading: {safe_title}")
            # "%" starts a field in yt-dlp output templates, so a literal
            # percent sign in the title has to be doubled.
            template_title = safe_title.replace("%", "%%")
            # Per-video copy so the shared options dict is never mutated.
            video_opts = {
                **base_opts,
                "outtmpl": str(DOWNLOAD_DIR / f"{template_title}.%(ext)s"),
            }
            with yt_dlp.YoutubeDL(video_opts) as inner_ydl:
                inner_ydl.download([video["webpage_url"]])
        downloaded_files.append(filename)

    return downloaded_files

def transcribe_audio_files(audio_files, model_name="small"):
    """
    Transcribes a list of audio files using Whisper.
    Skips files that have already been transcribed.

    A file counts as transcribed when a ``.txt`` file with the same stem exists
    next to it; that text is read back instead of running the model. The
    Whisper model is loaded only when the first file actually needs
    transcription, so a fully cached run never pays the model-loading cost.

    Args:
        audio_files: Iterable of audio file paths (``Path`` or ``str``).
        model_name: Name passed to ``whisper.load_model``.

    Returns:
        dict[str, str]: Maps each audio file name (without directory) to its
        transcript text.
    """
    model = None  # loaded lazily, see below
    transcripts = {}

    for audio_file in audio_files:
        audio_file = Path(audio_file)
        transcript_file = audio_file.with_suffix(".txt")
        if transcript_file.exists():
            print(f"Skipping already transcribed: {audio_file.name}")
            transcripts[audio_file.name] = transcript_file.read_text(encoding="utf-8")
        else:
            print(f"Transcribing: {audio_file.name}")
            if model is None:
                model = whisper.load_model(model_name)
            result = model.transcribe(str(audio_file))
            text = result["text"]
            transcripts[audio_file.name] = text
            # Persist next to the audio so later runs can skip this file.
            transcript_file.write_text(text, encoding="utf-8")

    return transcripts

def remove_mp3s(folder=DOWNLOAD_DIR):
    """
    Delete every ``*.mp3`` file located directly inside ``folder``.

    Each deletion (or failure) is reported on stdout, followed by a summary
    line with the number of files that were removed. Failures do not stop the
    loop. Nothing is returned.
    """
    deleted = 0
    for mp3_path in folder.glob("*.mp3"):
        try:
            mp3_path.unlink()
            print(f"Deleted: {mp3_path.name}")
            deleted = deleted + 1
        except Exception as err:
            # Best effort: report the problem and continue with the next file.
            print(f"Error deleting {mp3_path.name}: {err}")
    print(f"\nDeleted {deleted} mp3 files.")

In [8]:
# NOTE: despite its name, CHANNEL_URL holds the channel handle (the part after
# "@"); download_recent_videos() builds the full channel URL from it.
CHANNEL_URL = "pizzaandproperty1246"
# Number of videos from the channel listing to process.
K = 3

# Fetch the audio, then transcribe it; files that already have a .txt
# transcript are read back from disk instead of being transcribed again.
audio_files = download_recent_videos(CHANNEL_URL, k=K)
transcripts = transcribe_audio_files(audio_files)

Deprecated Feature: Support for Python version 3.9 has been deprecated. Please update to Python 3.10 or above


Skipping already downloaded: 800k - 1M Budget for a Home Listeners Choice Victoria - with Junge Ma & Todd Sloan
Skipping already downloaded: Melbourne will Kill Your Portfolio Growth if you don’t know this!!! - With Simon Loo & Todd Sloan
Skipping already downloaded: August 2025 Interest Rate Cut! - with Morgan Bushell & Todd Sloan
Skipping already transcribed: 800k - 1M Budget for a Home Listeners Choice Victoria - with Junge Ma & Todd Sloan.mp3
Skipping already transcribed: Melbourne will Kill Your Portfolio Growth if you don’t know this!!! - With Simon Loo & Todd Sloan.mp3
Skipping already transcribed: August 2025 Interest Rate Cut! - with Morgan Bushell & Todd Sloan.mp3


In [4]:
# Clean-up: delete the .mp3 files in DOWNLOAD_DIR; .txt transcripts are not touched.
remove_mp3s(DOWNLOAD_DIR)

Deleted: 800k - 1M Budget for a Home Listeners Choice Victoria - with Junge Ma & Todd Sloan.mp3
Deleted: August 2025 Interest Rate Cut! - with Morgan Bushell & Todd Sloan.mp3
Deleted: Melbourne will Kill Your Portfolio Growth if you don’t know this!!! - With Simon Loo & Todd Sloan.mp3

Deleted 3 mp3 files.
