In [1]:
import os
import json

def attempt_download(yt_obj, existing_videos, location = "dataset/"):
    """Download the audio track and English captions of a single video.

    Output goes to ``<location>/<video_id>/``: the audio stream with itag 139
    is saved as ``audio.mp4`` and every caption track whose code contains
    ``"en"`` is saved as ``caption-<code>.json``.

    Args:
        yt_obj: Video object exposing ``video_id``, ``streams.get_by_itag()``
            and ``captions`` (in this notebook: the items of a pytube
            ``Search.results`` list).
        existing_videos: Mutable set of video ids that are already present.
            The id of ``yt_obj`` is added once its audio has been downloaded.
        location: Root directory of the dataset. A trailing separator is
            optional.

    Returns:
        True if the audio was downloaded, False if the video was skipped
        (already present) or the download failed.
    """
    vid = yt_obj.video_id
    print(vid, "attempt_download called")
    if vid in existing_videos:
        print(vid, "Skipping, found existing folder.")
        return False

    folder = os.path.join(location, vid)
    try:
        # Creating the folder doubles as the claim on this video: when several
        # worker threads reach the same id (or the folder exists on disk but
        # is missing from `existing_videos`), only the creator downloads it.
        os.makedirs(folder)
    except FileExistsError:
        print(vid, "Skipping, found existing folder.")
        return False

    # NOTE: on failure the (empty) folder is left in place, so the video is
    # not retried on the next run.
    try:
        stream = yt_obj.streams.get_by_itag(139)
        if stream is None:
            # get_by_itag returns None instead of raising when the itag is absent.
            print(vid, "Stopping download because no stream with itag 139 is available")
            return False
        stream.download(output_path = folder, filename = "audio.mp4")
    except Exception as e:
        print(vid, "Stopping download because of Exception", e)
        return False
    print(vid, "Audio downloaded")

    print(vid, "Found captions for", list(yt_obj.captions.keys()))
    for caption in yt_obj.captions:
        # Substring match on purpose: accepts "en", "a.en", "en-US", "en.<id>", ...
        if "en" not in caption.code:
            continue
        try:
            # Fetch before opening the file so a failed fetch leaves no empty JSON behind.
            caption_data = caption.json_captions
        except Exception as e:
            # Best effort: the audio is already on disk, keep going with the other tracks.
            print(vid, "Skipping", caption.code, "captions because of Exception", e)
            continue
        caption_path = os.path.join(folder, "caption-" + caption.code + ".json")
        with open(caption_path, 'w') as caption_file:
            json.dump(caption_data, caption_file)
        print(vid, "Downloaded", caption.code, "captions")

    existing_videos.add(vid)
    return True

In [2]:
from pytube import Search

def deep_search(search_term, existing_videos, depth = 3):
    """Run a YouTube search and try to download every video it returns.

    Args:
        search_term: Query string handed to pytube's ``Search``.
        existing_videos: Mutable set of already-downloaded video ids, passed
            through to ``attempt_download``.
        depth: Number of result pages to load: the initial page plus
            ``depth - 1`` calls to ``get_next_results()``.
    """
    print("Search", search_term, "Performing search at depth", depth)
    search = Search(search_term)
    # Not a no-op: the first access loads the initial result page, which
    # get_next_results() then extends in place.
    search.results
    for _ in range(depth - 1):
        try:
            search.get_next_results()
        except IndexError:
            # pytube raises IndexError when there is no further page; keep
            # what has been collected instead of aborting the whole search.
            break

    print("Search", search_term, "Found", len(search.results), "videos")

    for yt in search.results:
        try:
            attempt_download(yt, existing_videos)
        except Exception as e:
            # One broken video must not cost the remaining results of this term.
            print(yt.video_id, "Stopping download because of Exception", e)

In [3]:
from concurrent.futures import ThreadPoolExecutor

# Make sure the dataset root exists so a fresh checkout does not fail on listdir.
os.makedirs("dataset/", exist_ok=True)

# Every entry under dataset/ is named after a video id and counts as already handled.
existing_videos = set(os.listdir("dataset/"))
print("Starting with " + str(len(existing_videos)) + " videos")

search_terms = ["Joe Biden", "Kamala Harris", "Donald Trump", "Mike Pence", "Ron DeSantis"]

def perform_search(search_term):
    """Crawl one search term into the shared ``existing_videos`` set."""
    deep_search(search_term, existing_videos, depth=3)


# One worker per search term; leaving the `with` block waits for all of them.
with ThreadPoolExecutor(max_workers = 5) as executor:
    futures = {term: executor.submit(perform_search, term) for term in search_terms}

# Exceptions raised inside a worker are stored on its future and would be lost
# unless inspected, so report them explicitly.
for term, future in futures.items():
    error = future.exception()
    if error is not None:
        print("Search", term, "Failed with", repr(error))

print("Ending with " + str(len(existing_videos)) + " videos")

Starting with 222 videos
Search Joe Biden Performing search at depth 3
Search Kamala Harris Performing search at depth 3
Search Donald Trump Performing search at depth 3
Search Mike Pence Performing search at depth 3
Search Ron DeSantis Performing search at depth 3


Unexpected renderer encountered.
Renderer name: dict_keys(['reelShelfRenderer'])
Search term: Joe Biden
Please open an issue at https://github.com/pytube/pytube/issues and provide this log output.
Unexpected renderer encountered.
Renderer name: dict_keys(['reelShelfRenderer'])
Search term: Kamala Harris
Please open an issue at https://github.com/pytube/pytube/issues and provide this log output.


Search Joe Biden Found 55 videos
h9FzYgiES84 attempt_download called
h9FzYgiES84 Skipping, found existing folder.
Lf6FTNSuZfM attempt_download called
Lf6FTNSuZfM Skipping, found existing folder.
5NW2EEyyYxs attempt_download called
5NW2EEyyYxs Skipping, found existing folder.
H_zelA-F9Ss attempt_download called
Search Mike Pence Found 58 videos
Ti8ipMVBf5Y attempt_download called
Ti8ipMVBf5Y Skipping, found existing folder.
ZrA9acvG6ck attempt_download called
ZrA9acvG6ck Skipping, found existing folder.
B9ykN0RKDQY attempt_download called
B9ykN0RKDQY Skipping, found existing folder.
_lD_biUbmaQ attempt_download called
_lD_biUbmaQ Skipping, found existing folder.
PHTCH1nA-eQ attempt_download called
PHTCH1nA-eQ Skipping, found existing folder.
-Fs0XjuFq5A attempt_download called
-Fs0XjuFq5A Skipping, found existing folder.
Gb-mzaOvexE attempt_download called
Gb-mzaOvexE Skipping, found existing folder.
Eum23dPZ-Yg attempt_download called
Eum23dPZ-Yg Skipping, found existing folder.
0R0GUt

Unexpected renderer encountered.
Renderer name: dict_keys(['reelShelfRenderer'])
Search term: Kamala Harris
Please open an issue at https://github.com/pytube/pytube/issues and provide this log output.
Unexpected renderer encountered.
Renderer name: dict_keys(['reelShelfRenderer'])
Search term: Kamala Harris
Please open an issue at https://github.com/pytube/pytube/issues and provide this log output.


Search Donald Trump Found 53 videos
ZOzguIxm1Ps attempt_download called
ZOzguIxm1Ps Skipping, found existing folder.
1bXi-GKzdfk attempt_download called
Search Kamala Harris Found 55 videos
XudEcLXg72I attempt_download called
XudEcLXg72I Skipping, found existing folder.
AaDrcsp9vN4 attempt_download called
AaDrcsp9vN4 Skipping, found existing folder.
Vya_cl3kWRI attempt_download called
Vya_cl3kWRI Skipping, found existing folder.
fVbMWFGy4HM attempt_download called
fVbMWFGy4HM Skipping, found existing folder.
6pfH2YS9Hd8 attempt_download called
6pfH2YS9Hd8 Skipping, found existing folder.
Wh9laE-XO4U attempt_download called
Wh9laE-XO4U Skipping, found existing folder.
6Ti2gDCTlFc attempt_download called
6Ti2gDCTlFc Skipping, found existing folder.
Ak3S33zdxIw attempt_download called
Ak3S33zdxIw Skipping, found existing folder.
B5cu3zJSIV4 attempt_download called
B5cu3zJSIV4 Skipping, found existing folder.
xz7rNOAFkgE attempt_download called
xz7rNOAFkgE Skipping, found existing folder.


ChjibtX0UzU Downloaded en-US captions
s2PHzMCJb0M attempt_download called
s2PHzMCJb0M Skipping, found existing folder.
L1YOKDlP254 attempt_download called
L1YOKDlP254 Audio downloaded
L1YOKDlP254 Found captions for [<Caption lang="English (auto-generated)" code="a.en">]
L1YOKDlP254 Downloaded a.en captions
LKhfhMMEHU4 attempt_download called
LKhfhMMEHU4 Skipping, found existing folder.
qmX7y8B9kTU attempt_download called
qmX7y8B9kTU Skipping, found existing folder.
FXWfHq9TjAk attempt_download called
FXWfHq9TjAk Skipping, found existing folder.
oc7bn-QLv6E attempt_download called
oc7bn-QLv6E Skipping, found existing folder.
Cn9Zo0VE1UY attempt_download called
Cn9Zo0VE1UY Skipping, found existing folder.
9rCevF2fCC8 attempt_download called
9rCevF2fCC8 Skipping, found existing folder.
nBLc8cOxpIQ attempt_download called
nBLc8cOxpIQ Skipping, found existing folder.
1UPyZHMpsKU attempt_download called
FnaRbUeXsaI Audio downloaded
FnaRbUeXsaI Found captions for [<Caption lang="English (auto

COzAPK5XQG4 Downloaded a.en captions
SMkk7dk4cuQ attempt_download called
SMkk7dk4cuQ Skipping, found existing folder.
WixdQXQzKaA attempt_download called
WixdQXQzKaA Skipping, found existing folder.
zPwsZl9f5UM attempt_download called
zPwsZl9f5UM Skipping, found existing folder.
HezhVpAJIhU attempt_download called
HezhVpAJIhU Skipping, found existing folder.
9kvSTyC9Hnk attempt_download called
9kvSTyC9Hnk Skipping, found existing folder.
rXJIgW1niZA attempt_download called
rXJIgW1niZA Skipping, found existing folder.
saZfcclFdIA attempt_download called
saZfcclFdIA Skipping, found existing folder.
pLZgBYPiJ24 attempt_download called
CH335fMo0Yw Audio downloaded
CH335fMo0Yw Found captions for [<Caption lang="English (auto-generated)" code="a.en">]
CH335fMo0Yw Downloaded a.en captions
SuQunX2Oyl8 attempt_download called
SuQunX2Oyl8 Skipping, found existing folder.
R6nwZdOGpXk attempt_download called
jbQmmTm7iPU Audio downloaded
jbQmmTm7iPU Found captions for [<Caption lang="English (auto-

-Mb8OzR4JjQ Audio downloaded
-Mb8OzR4JjQ Found captions for [<Caption lang="English - CC1" code="en.uYU-mmqFLq8">, <Caption lang="English - DTVCC1" code="en.JkeT_87f4cc">]
-Mb8OzR4JjQ Downloaded en.uYU-mmqFLq8 captions
-Mb8OzR4JjQ Downloaded en.JkeT_87f4cc captions
SIyo2TH1IsQ attempt_download called
OSus3lMyEQA Audio downloaded
OSus3lMyEQA Found captions for [<Caption lang="English (auto-generated)" code="a.en">]
OSus3lMyEQA Downloaded a.en captions
MI7x9WepEQU attempt_download called
MI7x9WepEQU Audio downloaded
MI7x9WepEQU Found captions for []
th8W7c86KFQ attempt_download called
SIyo2TH1IsQ Audio downloaded
SIyo2TH1IsQ Found captions for [<Caption lang="English (auto-generated)" code="a.en">, <Caption lang="English - CC1" code="en.uYU-mmqFLq8">, <Caption lang="English - en" code="en.nP7-2PuUl7o">]
SIyo2TH1IsQ Downloaded a.en captions
SIyo2TH1IsQ Downloaded en.uYU-mmqFLq8 captions
SIyo2TH1IsQ Downloaded en.nP7-2PuUl7o captions
SP6nP4lk5Uw attempt_download called
SP6nP4lk5Uw Skipping

9hE6NT7y2ZY Audio downloaded
9hE6NT7y2ZY Found captions for []
26mZboENjd4 attempt_download called
26mZboENjd4 Audio downloaded
26mZboENjd4 Found captions for [<Caption lang="English (auto-generated)" code="a.en">]
26mZboENjd4 Downloaded a.en captions
QuYLHok8ZyY attempt_download called
QuYLHok8ZyY Skipping, found existing folder.
Ending with 303 videos


In [4]:
import glob
from moviepy.editor import AudioFileClip

def static_analysis(location):
    """Print summary statistics for the dataset stored under `location`.

    `location` is used as a string prefix, so it must end with a path
    separator (e.g. ``"dataset/"``). Reported are: the number of entries in
    `location`, how many of them contain ``audio.mp4``, the summed duration
    and file size of those audio files, how many of them have at least one
    ``.json`` transcript, and a per-language transcript count sorted from
    most to least frequent. Entries without ``audio.mp4`` are ignored for
    everything except the first number.
    """
    entries = set(os.listdir(location))
    print("Videos crawled:", len(entries))
    print()

    audio_count = 0
    transcribed_count = 0
    seconds_total = 0
    byte_total = 0
    per_language = {}

    for entry in entries:
        audio_path = location + entry + "/audio.mp4"
        if os.path.exists(audio_path):
            audio_count += 1
            with AudioFileClip(audio_path) as clip:
                seconds_total += clip.duration
            byte_total += os.path.getsize(audio_path)

            # The language code is whatever remains of the file name once the
            # "caption-" prefix and ".json" suffix are stripped.
            codes = [
                name.replace("caption-", "").replace(".json", "")
                for name in os.listdir(location + entry)
                if ".json" in name
            ]
            for code in codes:
                per_language[code] = per_language.get(code, 0) + 1
            if codes:
                transcribed_count += 1

    print("Downloaded mp4 Files:", audio_count)
    print("Total mp4 Audio Duration (seconds):", int(seconds_total))
    print("Total mp4 Filesize (bytes):", byte_total)
    print()

    print("Has at least one transcript:", transcribed_count)
    ranked = sorted(per_language.items(), key=lambda pair: pair[1], reverse=True)
    for code, occurrences in ranked:
        print("Transcripts of language", code + ":", occurrences)

# Report crawl statistics for the dataset folder; the trailing slash is
# required because static_analysis builds paths by string concatenation.
static_analysis("dataset/")

Videos crawled: 309

Downloaded mp4 Files: 293
Total mp4 Audio Duration (seconds): 127077
Total mp4 Filesize (bytes): 775100442

Has at least one transcript: 261
Transcripts of language a.en: 242
Transcripts of language en.uYU-mmqFLq8: 65
Transcripts of language en.JkeT_87f4cc: 55
Transcripts of language en-US: 29
Transcripts of language en.nP7-2PuUl7o: 25
Transcripts of language en: 15
Transcripts of language en.dQs7zDoAYDs: 4
Transcripts of language en.IbbGYl6aSTg: 1
Transcripts of language en.YhZ6GAzbjEg: 1
Transcripts of language en.RTbB2cpHawQ: 1
Transcripts of language en-GB: 1
Transcripts of language en.LoGGid9nkGE: 1
Transcripts of language en.I1eY0-195hw: 1
Transcripts of language en.PkKg-NMw-yM: 1
Transcripts of language en.0irEcW9boG4: 1
