In [1]:
import os
import json

def attempt_download(yt_obj, existing_videos, location = "dataset-5/"):
    vid = yt_obj.video_id
    print(vid, "attempt_download called")
    if vid in existing_videos:
        print(vid, "Skipping, found existing folder.")
        return False
    
    os.makedirs(location + vid)
    try:
        stream = yt_obj.streams.get_by_itag(139)
    except Exception as e:
        print(vid, "Stopping download because of Exception", e)
        return False
    stream.download(output_path = location + vid + "/", filename = "audio.mp4")
    print(vid, "Audio downloaded")
    
    print(vid, "Found captions for", list(yt_obj.captions.keys()))
    for caption in yt_obj.captions:
        if "en" in caption.code:
            a_en_caption = caption.json_captions
            json.dump(a_en_caption, open(location + vid + "/caption-" + caption.code + ".json", 'w'))
            print(vid, "Downloaded", caption.code, "captions")
    
    existing_videos.add(yt_obj.video_id)

In [2]:
from pytube import Search

def deep_search(search_term, existing_videos, depth = 10):
    print("Search", search_term, "Performing search at depth", depth)
    search = Search(search_term)
    search.results
    for _ in range(depth - 1):
        search.get_next_results()
        
    print("Search", search_term, "Found", len(search.results), "videos")
        
    for yt in search.results:
        attempt_download(yt, existing_videos)

In [4]:
from concurrent.futures import ThreadPoolExecutor
import time

directories = ["dataset/", "dataset-2/", "dataset-3/", "dataset-4/"]
existing_videos = {file for directory in directories for file in os.listdir(directory)}


start_time = time.time()
starting_videos = str(len(existing_videos))

print("Starting with " + starting_videos + " videos")
print("Starting at", start_time)

"""
search_terms = [
    "Joe Biden", "Kamala Harris", "Donald Trump", "Mike Pence", "Ron DeSantis",
    "Barack Obama", "Angela Merkel", "Vladimir Putin", "Xi Jinping", "Emmanuel Macron",
    "Narendra Modi", "Jair Bolsonaro", "Justin Trudeau", "Boris Johnson", "Jacinda Ardern",
    "Leonardo DiCaprio", "Meryl Streep", "Denzel Washington", "Tom Hanks", "Jennifer Lawrence",
    "Scarlett Johansson", "Brad Pitt", "Angelina Jolie", "Johnny Depp", "Nicole Kidman",
    "Beyoncé", "Taylor Swift", "Kanye West", "Rihanna", "Jay-Z",
    "Drake", "Adele", "Ed Sheeran", "Justin Bieber", "Lady Gaga",
    "Cristiano Ronaldo", "Lionel Messi", "LeBron James", "Serena Williams", "Roger Federer",
    "Usain Bolt", "Michael Phelps", "Simone Biles", "Novak Djokovic", "Tiger Woods",
    "Elon Musk", "Jeff Bezos", "Bill Gates", "Mark Zuckerberg", "Warren Buffett",
    "Tim Cook", "Larry Page", "Sergey Brin", "Jack Ma", "Mukesh Ambani",
    "Oprah Winfrey", "Ellen DeGeneres", "Kim Kardashian", "Kylie Jenner", "Greta Thunberg",
    "Malala Yousafzai", "Dalai Lama", "Pope Francis", "Donald Glover", "Dave Chappelle",
    "Stephen King", "J.K. Rowling", "George R.R. Martin", "Neil Gaiman", "Margaret Atwood",
    "Steven Spielberg", "Christopher Nolan", "Quentin Tarantino", "Martin Scorsese", "Wes Anderson",
    "Ariana Grande", "Snoop Dogg", "Eminem", "Shakira", "Katy Perry",
    "Dwayne Johnson", "Chris Hemsworth", "Robert Downey Jr.", "Hugh Jackman", "Ryan Reynolds"
]
"""


"""
search_terms = [
    "Apple", "Microsoft", "Google", "Amazon", "Facebook",
    "Berkshire Hathaway", "Visa", "Johnson & Johnson", "Walmart", "Procter & Gamble",
    "Intel", "Verizon Communications", "AT&T", "Coca-Cola", "Pfizer",
    "Chevron", "Home Depot", "Merck & Co.", "Disney", "Cisco Systems",
    "IBM", "Boeing", "Goldman Sachs", "McDonald's", "3M",
    "Nike", "American Express", "UnitedHealth Group", "ExxonMobil", "General Electric",
    "JP Morgan Chase", "Morgan Stanley", "Tesla", "Netflix", "Adobe Systems",
    "Salesforce", "Oracle", "PayPal", "Qualcomm", "AMD",
    "eBay", "General Motors", "Ford Motor Company", "Delta Air Lines", "United Airlines",
    "Lockheed Martin", "Raytheon Technologies", "Northrop Grumman", "General Dynamics", "Honeywell",
    "Caterpillar", "DuPont", "FedEx", "UPS", "Booz Allen Hamilton",
    "Charles Schwab", "Citigroup", "Bank of America", "Wells Fargo", "Goldman Sachs Group",
    "American Airlines Group", "Southwest Airlines", "Gilead Sciences", "Bristol-Myers Squibb", "Amgen",
    "Biogen", "Celgene", "AbbVie", "Eli Lilly and Company", "Merck",
    "Snap Inc.", "Twitter", "LinkedIn", "Spotify", "Square",
    "Lyft", "Uber Technologies", "Airbnb", "SpaceX", "Palantir Technologies",
    "Stripe", "Robinhood", "Coinbase", "Zoom Video Communications", "Slack Technologies",
    "Dell Technologies", "HP Inc.", "IBM Corporation", "Intel Corporation", "Micron Technology",
    "NVIDIA", "Broadcom Inc.", "Texas Instruments", "Qualtrics", "Snowflake",
    "DoorDash", "Instacart", "Etsy", "Roku", "Pinterest"
]
"""

"""
search_terms = [
    "True Crime Stories", "Startup Founders", "Health and Wellness Tips", "Investing for Beginners", "Science Discoveries",
    "History Mysteries", "Technology Innovations", "Mindfulness and Meditation", "Global News Analysis", "Book Review Series",
    "Comedy Shows", "Film Criticism", "Music Industry Insights", "Sports Commentary", "Fashion Trends",
    "Culinary Arts", "Language Learning", "Travel Adventures", "Parenting Advice", "Personal Finance Strategies",
    "Political Analysis", "Art and Design", "Entrepreneurship Lessons", "Educational Strategies", "Mental Health Support",
    "Fitness Coaching", "Environmental Conservation", "Digital Marketing", "Philosophy and Ethics", "Astronomy and Space",
    "Mythology and Folklore", "Cybersecurity Updates", "Blockchain and Cryptocurrency", "Artificial Intelligence Trends", "Real Estate Investing",
    "Sustainability Practices", "Creative Writing", "Public Speaking Tips", "Career Development", "Classical Music Appreciation",
    "Video Game Reviews", "Comics and Graphic Novels", "Quantum Computing", "Social Media Strategies", "DIY Home Projects",
    "Yoga and Pilates", "Pet Care and Training", "Indie Film Making", "Jazz Music Exploration", "Virtual Reality Experiences"
]"""

"""
search_terms = [
    "TED Talk Artificial Intelligence", "TED Talk Blockchain Technology", "TED Talk Climate Change", "TED Talk Cybersecurity", "TED Talk Data Privacy",
    "TED Talk Digital Health", "TED Talk E-commerce Trends", "TED Talk Future of Work", "TED Talk Genetic Engineering", "TED Talk Green Energy",
    "TED Talk Human Rights", "TED Talk Innovation in Education", "TED Talk Journalism and Media", "TED Talk Leadership and Management", "TED Talk Mental Health",
    "TED Talk Neurotechnology", "TED Talk Online Learning", "TED Talk Pandemic Response", "TED Talk Quantum Computing", "TED Talk Racial Equality",
    "TED Talk Renewable Resources", "TED Talk Space Exploration", "TED Talk Sustainable Living", "TED Talk Technology and Society", "TED Talk Universal Basic Income",
    "TED Talk Virtual Reality", "TED Talk Women in STEM", "TED Talk Xenotransplantation", "TED Talk Youth Activism", "TED Talk Zero Waste Lifestyle",
    "TED Talk Creative Problem Solving", "TED Talk Digital Nomads", "TED Talk Emotional Intelligence", "TED Talk Financial Literacy", "TED Talk Globalization and Trade",
    "TED Talk Health and Nutrition", "TED Talk Immigrant Experiences", "TED Talk Job Automation", "TED Talk Knowledge Sharing", "TED Talk Language Preservation",
    "TED Talk Mindfulness and Meditation", "TED Talk Nature Conservation", "TED Talk Open Source Software", "TED Talk Political Polarization", "TED Talk Quantum Teleportation",
    "TED Talk Refugee Crisis", "TED Talk Social Entrepreneurship", "TED Talk Telemedicine", "TED Talk Urban Planning", "TED Talk Vaccine Development"
]
"""
"""
search_terms = [
    "Khan Academy", "CrashCourse", "TED-Ed", "National Geographic", "SciShow",
    "MinutePhysics", "Vsauce", "CGP Grey", "Veritasium", "SmarterEveryDay",
    "AsapSCIENCE", "Physics Girl", "Numberphile", "Kurzgesagt – In a Nutshell", "BBC Earth",
    "The School of Life", "Big Think", "PBS Space Time", "3Blue1Brown", "BrainCraft",
    "Extra Credits", "Deep Look", "It's Okay To Be Smart", "The Royal Institution", "Practical Engineering",
    "Historia Civilis", "The Art Assignment", "TEDx Talks", "Computerphile", "Seeker",
    "Vsauce2", "Vsauce3", "Wendover Productions", "Real Engineering", "Biographics",
    "SciShow Space", "SciShow Psych", "MinuteEarth", "Tom Scott", "Cody'sLab",
    "Nerdwriter1", "PolyMatter", "Half as Interesting", "TierZoo", "PBS Eons",
    "The Infographics Show", "Overly Sarcastic Productions", "PatrickJMT", "Sixty Symbols", "Geography Now"
]
"""

"""
search_terms = [
    "Unbox Therapy", "Marques Brownlee", "Linus Tech Tips", "Dave Lee", "Jonathan Morrison",
    "Austin Evans", "TechSource", "JerryRigEverything", "CNET", "The Verge",
    "MKBHD", "TechRadar", "Engadget", "TLD Today", "UrAvgConsumer",
    "Flossy Carter", "PhoneBuff", "iJustine", "What's Inside?", "TechSmartt",
    "Mrwhosetheboss", "Pocketnow", "SuperSaf TV", "TechnoBuffalo", "Android Authority",
    "Jenna Ezarik", "Lou Later", "Tech Chap", "MobileTechReview", "EverythingApplePro",
    "TechMeOut", "GadgetMatch", "Erica Griffin", "TheUnlockr", "Zollotech",
    "Booredatwork.com", "TechCrunch", "GSMArena Official", "Geekyranjit", "Tech Spurt",
    "Safwan Ahmedmia (SuperSaf)", "Chris Stuckmann", "Canoopsy", "DetroitBORG", "iGyaan",
    "Krystal Lora", "Matthew Moniz", "Michael Fisher", "Mr Mobile", "Dave2D"
]
"""

"""search_terms = [
    "Tasty", "Binging with Babish", "Rosanna Pansino", "Gordon Ramsay", "Jamie Oliver",
    "King of Random", "5-Minute Crafts", "Bright Side", "ThreadBanger", "Laura in the Kitchen",
    "Yoga With Adriene", "FitnessBlender", "Bob Ross", "Adam Savage’s Tested", "Anne Reardon - How To Cook That",
    "Clean My Space", "Garden Answer", "Home Repair Tutor", "This Old House", "Man Sewing",
    "The Sorry Girls", "Makeup Geek", "Wayne Goss", "FreeCodeCamp.org", "Traversy Media",
    "Justin Guitar", "Drumeo", "Chess.com", "Peter McKinnon", "PHLEARN",
    "Sewing Parts Online", "Crazy Russian Hacker", "Primitive Technology", "Grant Thompson", "The Art Sherpa",
    "Blender Guru", "Langfocus", "Easy Languages", "Khan Academy", "CG Cookie",
    "JunsKitchen", "Maangchi", "ChefSteps", "Joshua Weissman", "Food Wishes",
    "Gentle Whispering ASMR", "The Coding Train", "Corey Schafer", "Academy of Ideas", "School of Life"
]"""

"""search_terms = [
    "Casey Neistat", "Zoella", "Logan Paul", "Shaytards", "Jenna Marbles",
    "David Dobrik", "PewDiePie", "Roman Atwood Vlogs", "Tyler Oakley", "Connor Franta",
    "FunForLouis", "Tanya Burr", "PointlessBlogVlogs", "Safiya Nygaard", "Dude Perfect",
    "Liza Koshy", "The Dolan Twins", "Emma Chamberlain", "James Charles", "Jeffree Star",
    "NikkieTutorials", "Shane Dawson", "Trisha Paytas", "Gigi Gorgeous", "Joey Graceffa",
    "The Ace Family", "The Michalaks", "Sailing La Vagabonde", "Jon Olsson", "Erik Conover",
    "Louis Cole", "Wil Dasovich", "Raya Was Here", "Lost LeBlanc", "Kara and Nate",
    "Sam Kolder", "Peter McKinnon", "Exploring Alternatives", "Vanessa Lau", "GaryVee",
    "Ben Brown", "Lavendaire", "Mimi Ikonn", "Alex Ikonn", "Rachel Aust",
    "The Bucket List Family", "Flying The Nest", "Kombi Life", "Eileen Aldis", "Bald and Bankrupt"
]"""


"""
search_terms = [
    "Intelligence Squared Debates", "Oxford Union", "Cambridge Union", "TED Conferences", "The Doha Debates",
    "Big Think", "The Aspen Institute", "World Economic Forum", "National Geographic", "PBS NewsHour",
    "Democracy Now!", "The Rubin Report", "C-SPAN", "Hoover Institution", "Council on Foreign Relations",
    "Carnegie Council for Ethics in International Affairs", "The Heritage Foundation", "Brookings Institution", "Chatham House", "American Enterprise Institute",
    "Pew Research Center", "The Economist", "Financial Times", "Vox Media", "Vice News",
    "Real Time with Bill Maher", "The Agenda with Steve Paikin", "The Royal Society", "London School of Economics and Political Science", "Harvard University",
    "Stanford University", "Yale University", "Princeton University", "MIT Media Lab", "University of California Television",
    "The New School", "New York Public Library", "Chicago Ideas", "TEDx Talks", "92nd Street Y",
    "Sydney Opera House Talks & Ideas", "RSA Events", "Skepticon", "How To Academy", "Battle of Ideas"
]
"""


def perform_search(search_term):
    deep_search(search_term, existing_videos, depth=5)

    
with ThreadPoolExecutor(max_workers = 32) as executor:
    executor.map(perform_search, search_terms)

end_time = time.time()
ending_videos = str(len(existing_videos))

print("Started with " + start_time + " videos")
print("Started at", start_time)
print("Ending with " + ending_videos + " videos")
print("Ending at", end_time)

Starting with 42304 videos
Starting at 1708391673.844219
Search Intelligence Squared Debates Performing search at depth 5
Search Oxford Union Performing search at depth 5
Search Cambridge Union Performing search at depth 5
Search TED Conferences Performing search at depth 5
Search The Doha Debates Performing search at depth 5
Search Big Think Performing search at depth 5
Search The Aspen Institute Performing search at depth 5
Search World Economic Forum Performing search at depth 5
Search National Geographic Performing search at depth 5
Search PBS NewsHour Performing search at depth 5
Search Democracy Now! Performing search at depth 5
Search The Rubin Report Performing search at depth 5
Search C-SPAN Performing search at depth 5
Search Hoover Institution Performing search at depth 5
Search Council on Foreign Relations Performing search at depth 5
Search Carnegie Council for Ethics in International Affairs Performing search at depth 5
Search The Heritage Foundation Performing search at 

Unexpected renderer encountered.
Renderer name: dict_keys(['reelShelfRenderer'])
Search term: Oxford Union
Please open an issue at https://github.com/pytube/pytube/issues and provide this log output.
Unexpected renderer encountered.
Renderer name: dict_keys(['reelShelfRenderer'])
Search term: Oxford Union
Please open an issue at https://github.com/pytube/pytube/issues and provide this log output.
Unexpected renderer encountered.
Renderer name: dict_keys(['reelShelfRenderer'])
Search term: The Rubin Report
Unexpected renderer encountered.
Please open an issue at https://github.com/pytube/pytube/issues and provide this log output.
Renderer name: dict_keys(['reelShelfRenderer'])
Unexpected renderer encountered.
Search term: Cambridge Union
Renderer name: dict_keys(['reelShelfRenderer'])
Unexpected renderer encountered.
Please open an issue at https://github.com/pytube/pytube/issues and provide this log output.
Search term: World Economic Forum
Renderer name: dict_keys(['reelShelfRenderer'

KeyboardInterrupt: 

Unexpected renderer encountered.
Unexpected renderer encountered.
Search term: The Doha Debates
Unexpected renderer encountered.
Search term: Yale University
Unexpected renderer encountered.
Renderer name: dict_keys(['reelShelfRenderer'])
Renderer name: dict_keys(['reelShelfRenderer'])
Renderer name: dict_keys(['adSlotRenderer'])
Please open an issue at https://github.com/pytube/pytube/issues and provide this log output.
Renderer name: dict_keys(['reelShelfRenderer'])
Please open an issue at https://github.com/pytube/pytube/issues and provide this log output.
Renderer name: dict_keys(['reelShelfRenderer'])
Search term: Big Think
Unexpected renderer encountered.
Search term: Harvard University
Search term: Stanford University
Search term: London School of Economics and Political Science
Unexpected renderer encountered.
Unexpected renderer encountered.
Search term: Pew Research Center
Please open an issue at https://github.com/pytube/pytube/issues and provide this log output.
Renderer na

Search The Rubin Report Found 97 videos
KyKkia8Mw_o attempt_download called
KyKkia8Mw_o Skipping, found existing folder.
ia50QTinbT8 attempt_download called
ia50QTinbT8 Skipping, found existing folder.
ZRYlcrGKE-k attempt_download called
ZRYlcrGKE-k Skipping, found existing folder.
Lkvpw86iV7s attempt_download called
Lkvpw86iV7s Skipping, found existing folder.
-_40_JrUv_Y attempt_download called
-_40_JrUv_Y Skipping, found existing folder.
0SVt0MDIYTE attempt_download called
0SVt0MDIYTE Skipping, found existing folder.
kd7hAUoO028 attempt_download called
kd7hAUoO028 Skipping, found existing folder.
dbePI_2YDBM attempt_download called
dbePI_2YDBM Skipping, found existing folder.
sWieMROb0ig attempt_download called
sWieMROb0ig Skipping, found existing folder.
8-5auGkAu7Y attempt_download called
8-5auGkAu7Y Skipping, found existing folder.
NEA4M6VMya4 attempt_download called
NEA4M6VMya4 Skipping, found existing folder.
V-VjJohZzTI attempt_download called
V-VjJohZzTI Skipping, found exist

Unexpected renderer encountered.
Renderer name: dict_keys(['reelShelfRenderer'])
Search term: London School of Economics and Political Science
Unexpected renderer encountered.
Please open an issue at https://github.com/pytube/pytube/issues and provide this log output.
Renderer name: dict_keys(['reelShelfRenderer'])
Unexpected renderer encountered.
Search term: National Geographic
Renderer name: dict_keys(['reelShelfRenderer'])
Please open an issue at https://github.com/pytube/pytube/issues and provide this log output.
Unexpected renderer encountered.
Search term: London School of Economics and Political Science
Unexpected renderer encountered.
Renderer name: dict_keys(['reelShelfRenderer'])
Please open an issue at https://github.com/pytube/pytube/issues and provide this log output.
Renderer name: dict_keys(['reelShelfRenderer'])
Unexpected renderer encountered.
Search term: Yale University
Search term: National Geographic
Renderer name: dict_keys(['reelShelfRenderer'])
Please open an i

Search C-SPAN Found 97 videos
6VpJKUscNaM attempt_download called
6VpJKUscNaM Skipping, found existing folder.
gh_C0gEkRXs attempt_download called
gh_C0gEkRXs Skipping, found existing folder.
PThfYee8aE4 attempt_download called
PThfYee8aE4 Skipping, found existing folder.
YX4wZRlrurQ attempt_download called
YX4wZRlrurQ Skipping, found existing folder.
2OiFR-bHvy4 attempt_download called
2OiFR-bHvy4 Skipping, found existing folder.
reN7UYQDQBo attempt_download called
reN7UYQDQBo Skipping, found existing folder.
4rdRZVzhGSs attempt_download called
4rdRZVzhGSs Skipping, found existing folder.
GkKUDT2VTLQ attempt_download called
GkKUDT2VTLQ Skipping, found existing folder.
SIZeFoVH8H0 attempt_download called
SIZeFoVH8H0 Skipping, found existing folder.
e1lcK3zgrpE attempt_download called
e1lcK3zgrpE Skipping, found existing folder.
0WnQEWKavZ8 attempt_download called
0WnQEWKavZ8 Skipping, found existing folder.
vsOakz6pyNc attempt_download called
vsOakz6pyNc Skipping, found existing folder

Unexpected renderer encountered.
Unexpected renderer encountered.
Renderer name: dict_keys(['reelShelfRenderer'])
Renderer name: dict_keys(['reelShelfRenderer'])
Search term: Harvard University
Search term: Stanford University
Please open an issue at https://github.com/pytube/pytube/issues and provide this log output.
Please open an issue at https://github.com/pytube/pytube/issues and provide this log output.
Unexpected renderer encountered.
Unexpected renderer encountered.
Renderer name: dict_keys(['reelShelfRenderer'])
Renderer name: dict_keys(['reelShelfRenderer'])
Search term: Harvard University
Search term: Stanford University
Please open an issue at https://github.com/pytube/pytube/issues and provide this log output.
Please open an issue at https://github.com/pytube/pytube/issues and provide this log output.
Unexpected renderer encountered.
Renderer name: dict_keys(['reelShelfRenderer'])
Search term: Yale University


Search Oxford Union Found 97 videos
gWnMgMEVABM attempt_download called
gWnMgMEVABM Skipping, found existing folder.
sP08sIKXdLE attempt_download called
sP08sIKXdLE Skipping, found existing folder.
-1NFirxhXWE attempt_download called
-1NFirxhXWE Skipping, found existing folder.
F4GGxcHijNk attempt_download called
F4GGxcHijNk Skipping, found existing folder.
DmmSDfc2Qo8 attempt_download called
Search Pew Research Center Found 99 videos
BmGVQouqz1s attempt_download called
BmGVQouqz1s Skipping, found existing folder.
eFzGdQrr2K8 attempt_download called
eFzGdQrr2K8 Skipping, found existing folder.
NMUIoxXU8m8 attempt_download called
NMUIoxXU8m8 Skipping, found existing folder.
tdxhDHcF5Fc attempt_download called
tdxhDHcF5Fc Skipping, found existing folder.
ykr4WeHaJNE attempt_download called
ykr4WeHaJNE Skipping, found existing folder.
l2WaNmhvEzo attempt_download called
l2WaNmhvEzo Skipping, found existing folder.
hPCoDz_CPCc attempt_download called
hPCoDz_CPCc Skipping, found existing fo

Please open an issue at https://github.com/pytube/pytube/issues and provide this log output.
Unexpected renderer encountered.
Renderer name: dict_keys(['reelShelfRenderer'])
Search term: Yale University
Please open an issue at https://github.com/pytube/pytube/issues and provide this log output.


Search Chatham House Found 98 videos
IsgTt5tldww attempt_download called
IsgTt5tldww Skipping, found existing folder.
gW6nirXos5A attempt_download called
gW6nirXos5A Skipping, found existing folder.
gU11PM_QIzU attempt_download called
gU11PM_QIzU Skipping, found existing folder.
nill8M_Qi-Y attempt_download called
nill8M_Qi-Y Skipping, found existing folder.
2wTUPSJDQzo attempt_download called
2wTUPSJDQzo Skipping, found existing folder.
lHuXUeq6XH8 attempt_download called
lHuXUeq6XH8 Skipping, found existing folder.
mZDNn17YBQI attempt_download called
mZDNn17YBQI Skipping, found existing folder.
HNhL2Yf7Ao4 attempt_download called
HNhL2Yf7Ao4 Skipping, found existing folder.
GK7OeRUzKIw attempt_download called
GK7OeRUzKIw Skipping, found existing folder.
WWb2js6FgEc attempt_download called
WWb2js6FgEc Skipping, found existing folder.
2nT4lngp600 attempt_download called
2nT4lngp600 Skipping, found existing folder.
halMzmvRfC0 attempt_download called
Search Brookings Institution Found 9

Unexpected renderer encountered.


Search Vox Media Found 96 videos
sJs_64OUpEs attempt_download called
sJs_64OUpEs Skipping, found existing folder.
yvdtWfHpCR4 attempt_download called
yvdtWfHpCR4 Skipping, found existing folder.
f5acbuOuCR0 attempt_download called
f5acbuOuCR0 Skipping, found existing folder.
JjAwOSNEupM attempt_download called


Renderer name: dict_keys(['reelShelfRenderer'])
Search term: Intelligence Squared Debates
Please open an issue at https://github.com/pytube/pytube/issues and provide this log output.
Unexpected renderer encountered.
Renderer name: dict_keys(['reelShelfRenderer'])
Search term: Intelligence Squared Debates
Please open an issue at https://github.com/pytube/pytube/issues and provide this log output.


Search TED Conferences Found 88 videos
8jPQjjsBbIc attempt_download called
8jPQjjsBbIc Skipping, found existing folder.
v9f6twy70iM attempt_download called
v9f6twy70iM Skipping, found existing folder.
l4Lq7gaKW4A attempt_download called
Search Hoover Institution Found 98 videos
DAUJSAcCldc attempt_download called
DAUJSAcCldc Skipping, found existing folder.
LpMQSVp7NTg attempt_download called
LpMQSVp7NTg Skipping, found existing folder.
pn2gda_phAA attempt_download called
pn2gda_phAA Skipping, found existing folder.
0XqZ6L0MgQo attempt_download called
0XqZ6L0MgQo Skipping, found existing folder.
QPc46gpHYw4 attempt_download called
QPc46gpHYw4 Skipping, found existing folder.
IL4Jxa6v4JA attempt_download called
IL4Jxa6v4JA Skipping, found existing folder.
LMRYvl2Jefg attempt_download called
LMRYvl2Jefg Skipping, found existing folder.
wJY-1alhpJM attempt_download called
wJY-1alhpJM Skipping, found existing folder.
AQTmVrlolSY attempt_download called
AQTmVrlolSY Skipping, found existing 

Unexpected renderer encountered.
Renderer name: dict_keys(['reelShelfRenderer'])
Search term: PBS NewsHour
Please open an issue at https://github.com/pytube/pytube/issues and provide this log output.
Unexpected renderer encountered.
Renderer name: dict_keys(['reelShelfRenderer'])
Search term: PBS NewsHour
Please open an issue at https://github.com/pytube/pytube/issues and provide this log output.
Unexpected renderer encountered.
Renderer name: dict_keys(['reelShelfRenderer'])
Search term: Real Time with Bill Maher
Please open an issue at https://github.com/pytube/pytube/issues and provide this log output.
Unexpected renderer encountered.
Renderer name: dict_keys(['reelShelfRenderer'])
Search term: Real Time with Bill Maher


Search The Agenda with Steve Paikin Found 99 videos
rFVHUOT4Vyk attempt_download called
rFVHUOT4Vyk Skipping, found existing folder.
H7GSRt8dYe8 attempt_download called
H7GSRt8dYe8 Skipping, found existing folder.
zf276pdUC8Y attempt_download called
zf276pdUC8Y Skipping, found existing folder.
5XyU--hEKAk attempt_download called
5XyU--hEKAk Skipping, found existing folder.
P9k0bA5CQGE attempt_download called
P9k0bA5CQGE Skipping, found existing folder.
RLPKMK8Q-sA attempt_download called
RLPKMK8Q-sA Skipping, found existing folder.
GiGTWhSFz2k attempt_download called
GiGTWhSFz2k Skipping, found existing folder.
gomI_Yh-0c4 attempt_download called
gomI_Yh-0c4 Skipping, found existing folder.
_H0uUj5peTM attempt_download called
_H0uUj5peTM Skipping, found existing folder.
kbfla1q60TA attempt_download called
kbfla1q60TA Skipping, found existing folder.
W_-uhO1KTGc attempt_download called
W_-uhO1KTGc Skipping, found existing folder.
3Ybg16RNc5g attempt_download called
3Ybg16RNc5g Skipping,

Please open an issue at https://github.com/pytube/pytube/issues and provide this log output.


Search Big Think Found 97 videos
Azky23nqQds attempt_download called
Azky23nqQds Skipping, found existing folder.
7yLeDg_KuOA attempt_download called
7yLeDg_KuOA Skipping, found existing folder.
4mPZC8QQXCo attempt_download called
4mPZC8QQXCo Skipping, found existing folder.
LTaPtQMHBxQ attempt_download called
LTaPtQMHBxQ Skipping, found existing folder.
HPsz4ipF4i4 attempt_download called
HPsz4ipF4i4 Skipping, found existing folder.
e-or_D-qNqM attempt_download called
e-or_D-qNqM Skipping, found existing folder.
H5Q_-gIZIps attempt_download called
H5Q_-gIZIps Skipping, found existing folder.
tpPFdFdfxxM attempt_download called
tpPFdFdfxxM Skipping, found existing folder.
BjmPvovl-V4 attempt_download called
BjmPvovl-V4 Skipping, found existing folder.
Ywec1MbeQDk attempt_download called
Ywec1MbeQDk Skipping, found existing folder.
PuAwied4x2Q attempt_download called
PuAwied4x2Q Skipping, found existing folder.
GF-xvBXgqa4 attempt_download called
GF-xvBXgqa4 Skipping, found existing fol

Unexpected renderer encountered.
Renderer name: dict_keys(['reelShelfRenderer'])
Search term: Council on Foreign Relations
Please open an issue at https://github.com/pytube/pytube/issues and provide this log output.


In [10]:
import glob
import os
from moviepy.editor import AudioFileClip

def static_analysis(location):
    folders = {x for x in os.listdir(location)}
    print("Videos crawled:", len(folders))
    print()
    
    has_audio = 0 
    has_at_least_one_transcript = 0
    
    total_bytes = 0
    total_length = 0
    transcripts = {}   

    for folder in folders[:500]:
        if not os.path.exists(location + folder + "/audio.mp4"):
            continue
        total_bytes += os.path.getsize(location + folder + "/audio.mp4")
        if total_bytes == 0:
            continue
        has_audio += 1
        with AudioFileClip(location + folder + "/audio.mp4") as audio:
            total_length += audio.duration
        has_transcript = False
        for file in os.listdir(location + folder):
            if ".json" in file:
                has_transcript = True
                language = file.replace("caption-", "").replace(".json", "")
                if language not in transcripts:
                    transcripts[language] = 0
                transcripts[language] += 1
        if has_transcript:
            has_at_least_one_transcript += 1
        
    print("Downloaded mp4 Files:", has_audio)
    print("Total mp4 Audio Duration (seconds):", int(total_length))
    print("Total mp4 Filesize (bytes):", total_bytes)
    print()
    
    transcripts = sorted(transcripts.items(), key=lambda item: item[1], reverse=True)
    print("Has at least one transcript:", has_at_least_one_transcript)
    for language, count in transcripts:
        print("Transcripts of language", language + ":", count)

static_analysis("dataset-4/")

Videos crawled: 5367



TypeError: 'set' object is not subscriptable