In [1]:
import pandas as pd
import numpy as np
import os
import yt_dlp
import whisper

# Limit pandas to 20 rows when displaying a DataFrame or Series.
pd.set_option('display.max_rows', 20)

In [2]:
import torch
# Sanity check of the GPU setup. The device name is only queried when CUDA is
# available, because torch.cuda.get_device_name(0) raises on CPU-only machines.
print(torch.cuda.is_available())  # Should return True on a working GPU setup
if torch.cuda.is_available():
    print(torch.cuda.get_device_name(0))  # Should return your GPU name
else:
    print("CUDA is not available; computation will run on the CPU.")

True
NVIDIA GeForce RTX 2060


In [3]:
# Folder that receives the audio extracted from each video
AUDIO_DIR = "audio_files"
os.makedirs(AUDIO_DIR, exist_ok=True)  # no error when the folder already exists

# Folder that receives the generated transcript text files
TRANSCRIPT_DIR = "transcripts"
os.makedirs(TRANSCRIPT_DIR, exist_ok=True)

In [12]:
def download_audio(video_id, output_format="mp3", force=False):
    """Download the audio track of a YouTube video into ``AUDIO_DIR``.

    yt-dlp is asked for the best available audio stream and its
    ``FFmpegExtractAudio`` post-processor converts it to ``output_format``.

    Args:
        video_id: YouTube video ID (the ``v=`` value of a watch URL).
        output_format: Target audio codec, also used as the file extension.
        force: When False (default), a non-empty audio file that already
            exists for this video is reused instead of being downloaded
            again, so an interrupted batch can be resumed without repeating
            finished downloads. Pass True to always download.

    Returns:
        The path ``AUDIO_DIR/<video_id>.<output_format>``, or None when the
        download or conversion raised an error (the error is printed). After
        a download the path is not checked for existence; callers should
        verify it.
    """
    try:
        expected_filename = os.path.join(AUDIO_DIR, f"{video_id}.{output_format}")

        # Reuse an earlier download unless the caller explicitly wants a fresh one.
        if (
            not force
            and os.path.isfile(expected_filename)
            and os.path.getsize(expected_filename) > 0
        ):
            print(f"Audio already downloaded: {expected_filename}")
            return expected_filename

        url = f"https://www.youtube.com/watch?v={video_id}"
        # %(ext)s is filled in by yt-dlp, so the file lands in AUDIO_DIR.
        output_filename = os.path.join(AUDIO_DIR, f"{video_id}.%(ext)s")

        ydl_opts = {
            'format': 'bestaudio/best',
            'outtmpl': output_filename,
            'postprocessors': [{
                'key': 'FFmpegExtractAudio',
                'preferredcodec': output_format,
                'preferredquality': '192',
            }],
        }

        with yt_dlp.YoutubeDL(ydl_opts) as ydl:
            ydl.download([url])

        # Check for incorrect double extension (e.g. "<id>.mp3.mp3").
        double_extension = os.path.join(AUDIO_DIR, f"{video_id}.{output_format}.{output_format}")

        if os.path.exists(double_extension):
            # os.replace overwrites an existing destination on every platform;
            # os.rename raises on Windows when the target already exists.
            os.replace(double_extension, expected_filename)

        return expected_filename
    except Exception as e:
        # Best effort: report the problem and let the caller decide what to do.
        print(f"Error downloading audio for video {video_id}: {e}")
        return None


# Whisper models already loaded in this session, keyed by (model_name, device).
_WHISPER_MODELS = {}


def transcribe_audio(audio_file, model_name="base"):
    """Transcribe an audio file with Whisper and return the recognised text.

    The model is loaded once per (model_name, device) pair and reused by later
    calls, so transcribing many files does not reload it every time.

    Args:
        audio_file: Path of the audio file to transcribe.
        model_name: Whisper model to load; can be changed to "small",
            "medium", or "large" depending on your needs.

    Returns:
        The ``"text"`` entry of the Whisper transcription result.
    """
    device = "cuda" if torch.cuda.is_available() else "cpu"  # Use GPU if available
    cache_key = (model_name, device)
    if cache_key not in _WHISPER_MODELS:
        _WHISPER_MODELS[cache_key] = whisper.load_model(model_name, device=device)
    result = _WHISPER_MODELS[cache_key].transcribe(audio_file)
    return result["text"]


def save_transcript(video_id, transcript_text):
    """Write a transcript to ``TRANSCRIPT_DIR/<video_id>.txt`` as UTF-8 text.

    An existing file for the same video is overwritten. The destination path
    is printed once the text has been written.
    """
    file_name = f"{video_id}.txt"
    destination = os.path.join(TRANSCRIPT_DIR, file_name)

    with open(destination, mode="w", encoding="utf-8") as transcript_file:
        transcript_file.write(transcript_text)

    print(f"Transcript saved at: {destination}")

In [6]:
# Single-video smoke test of the download -> transcribe -> save pipeline.
video_id = "UI0Hgxan_LE"

# Step 1: Download Audio
audio_file = download_audio(video_id)

# download_audio returns None on failure; os.path.exists(None) would raise a
# TypeError, so that case is handled before the file check.
if audio_file is None:
    print(f"Skipping {video_id}: Download failed.")
elif not os.path.exists(audio_file):
    print(f"File not found: {audio_file}")
else:
    print(f"File exists: {audio_file}")

    # Step 2: Transcribe Audio (only when there is a file to read)
    transcript = transcribe_audio(audio_file)

    # Step 3: Save Transcript
    save_transcript(video_id, transcript)

[youtube] Extracting URL: https://www.youtube.com/watch?v=UI0Hgxan_LE
[youtube] UI0Hgxan_LE: Downloading webpage
[youtube] UI0Hgxan_LE: Downloading tv client config
[youtube] UI0Hgxan_LE: Downloading player af7f576f
[youtube] UI0Hgxan_LE: Downloading tv player API JSON
[youtube] UI0Hgxan_LE: Downloading ios player API JSON
[youtube] UI0Hgxan_LE: Downloading m3u8 information
[info] UI0Hgxan_LE: Downloading 1 format(s): 251
[download] Destination: audio_files\UI0Hgxan_LE.webm
[download] 100% of    7.20MiB in 00:00:00 at 8.14MiB/s   
[ExtractAudio] Destination: audio_files\UI0Hgxan_LE.mp3
Deleting original file audio_files\UI0Hgxan_LE.webm (pass -k to keep)
File exists: audio_files\UI0Hgxan_LE.mp3
Transcript saved at: transcripts\UI0Hgxan_LE.txt


In [None]:
# Batch run: download and transcribe every video listed in the CSV.
df = pd.read_csv("transcriptless_videos.csv")

# IDs of every video that did not end up with a transcript, whatever the cause.
failed_videoes = []

# Iterate over the ID column directly; rows without a video ID are skipped.
for video_id in df["Video Id"].dropna():
    print(f"\nProcessing Video ID: {video_id}")

    audio_file = download_audio(video_id)

    if audio_file is None:
        print(f"Skipping {video_id}: Download failed.")
        # Record the failure so the final summary does not claim full success.
        failed_videoes.append(video_id)
        continue  # Skip to the next video

    if not os.path.exists(audio_file):
        print(f"Error: Audio file {audio_file} not found! Skipping.")
        failed_videoes.append(video_id)
        continue

    print(f"File exists: {audio_file}")
    try:
        transcript = transcribe_audio(audio_file)
        save_transcript(video_id, transcript)
    except Exception as e:
        # Keep the batch going; one bad file should not stop the rest.
        print(f"Error during transcription for {video_id}: {e}")
        failed_videoes.append(video_id)


if failed_videoes:
    print("\n⚠️ The following videos failed to generate transcripts:")
    for vid in failed_videoes:
        print(f"- {vid}")
else:
    print("\n✅ All videos were successfully processed!")


Processing Video ID: 3NLhX-X1QkI
[youtube] Extracting URL: https://www.youtube.com/watch?v=3NLhX-X1QkI
[youtube] 3NLhX-X1QkI: Downloading webpage
[youtube] 3NLhX-X1QkI: Downloading tv client config
[youtube] 3NLhX-X1QkI: Downloading player af7f576f
[youtube] 3NLhX-X1QkI: Downloading tv player API JSON
[youtube] 3NLhX-X1QkI: Downloading ios player API JSON
[youtube] 3NLhX-X1QkI: Downloading m3u8 information
[info] 3NLhX-X1QkI: Downloading 1 format(s): 140
[download] Destination: audio_files\3NLhX-X1QkI.m4a
[download] 100% of   58.06KiB in 00:00:00 at 305.64KiB/s   
[FixupM4a] Correcting container of "audio_files\3NLhX-X1QkI.m4a"
[ExtractAudio] Destination: audio_files\3NLhX-X1QkI.mp3
Deleting original file audio_files\3NLhX-X1QkI.m4a (pass -k to keep)
File exists: audio_files\3NLhX-X1QkI.mp3
Transcript saved at: transcripts\3NLhX-X1QkI.txt

Processing Video ID: 9sMmTC9x6wc
[youtube] Extracting URL: https://www.youtube.com/watch?v=9sMmTC9x6wc
[youtube] 9sMmTC9x6wc: Downloading webpage
[

ERROR: [youtube] 1zkeD98NIw0: Video unavailable


Error downloading audio for video 1zkeD98NIw0: ERROR: [youtube] 1zkeD98NIw0: Video unavailable


TypeError: stat: path should be string, bytes, os.PathLike or integer, not NoneType