In [1]:
import os
import json
import time
from concurrent.futures import ThreadPoolExecutor
import threading
import time
from pytube import Search
import logging
import random
logging.getLogger().setLevel(logging.ERROR)

In [2]:
_QUEUED = 0
_WORKING = 0
_DOWNLOADING = 0

_SKIPPED = 0
_EXCEPTIONED = 0
_DOWNLOADED = 0

_SEARCH_QUEUE = 0
_SEARCH_WORKING = 0

def attempt_download(yt_obj, existing_videos, location):
    global _SKIPPED, _EXCEPTIONED, _DOWNLOADING, _DOWNLOADED, _WORKING
    vid = yt_obj.video_id
    
    # print(vid, "attempt_download called")
    if vid in existing_videos:
        # print(vid, "Skipping, found existing folder.")
        with lock:
            _SKIPPED += 1
            _WORKING -= 1
        return 0
        
    os.makedirs(location + vid)
    try:
        stream = yt_obj.streams.get_by_itag(139)
    except Exception as e:
        # print(vid, "Stopping download because of Exception", e)
        with lock:
            _EXCEPTIONED += 1
            _WORKING -= 1
        return 0

    with lock:
        _DOWNLOADING += 1
    stream.download(output_path = location + vid + "/", filename = "audio.mp4")
    # print(vid, "Audio downloaded")
    
    # print(vid, "Found captions for", list(yt_obj.captions.keys()))
    for caption in yt_obj.captions:
        if "en" in caption.code:
            a_en_caption = caption.json_captions
            json.dump(a_en_caption, open(location + vid + "/caption-" + caption.code + ".json", 'w'))
            # print(vid, "Downloaded", caption.code, "captions")
    
    existing_videos.add(yt_obj.video_id)
    with lock:
        _DOWNLOADED += 1
        _DOWNLOADING -= 1
        _WORKING -= 1
    return 1

def deep_search(search_term, existing_videos, location, depth):
    global _SEARCH_QUEUE, _SEARCH_WORKING, _QUEUED, _WORKING
    
    print("STARTING SEARCH TERM", search_term)
    
    with lock:
        _SEARCH_QUEUE -= 1
        _SEARCH_WORKING += 1
        
    # print("Search", search_term, "Performing search at depth", depth)
    search = Search(search_term)
    search.results
    for _ in range(depth - 1):
        time.sleep(random.uniform(0, 2))
        search.get_next_results()
        
    # print("Search", search_term, "Found", len(search.results), "videos")
    # print(search.results)

    start_time = time.time()
    search_results = len(search.results)
    hits = 0

    with lock:
        _QUEUED += search_results

    # print(search.results)
    for yt in search.results:
        # print(yt)
        with lock:
            _QUEUED -= 1
            _WORKING += 1
        hits += attempt_download(yt, existing_videos, location)
    
    duration = time.time() - start_time
    with lock:
        _SEARCH_WORKING -= 1
    print("COMPLETED SEARCH TERM", search_term, "|", str(int(duration)), "seconds |", str(int(search_results)), "results |", str(int(hits)), "downloaded")
    return attempts, success


existing_dirs = ["dataset/", "dataset-2/", "dataset-3/", "dataset-4/", "dataset-5/", "dataset-6/", "dataset-dev/"]
existing_videos = {file for directory in existing_dirs for file in os.listdir(directory)}
def perform_search(search_term):
    depth = 5
    location = "dataset-6/"
    deep_search(search_term, existing_videos, location, depth)

In [3]:
def print_summary_periodically(interval, diff_interval):
    cnt = 0
    while True:
        cnt += 1
        time.sleep(interval)
        with lock:
            print(cnt, int(time.time()), "| Queued:", _QUEUED, "| Working:", _WORKING, "| Downloading:", _DOWNLOADING, "| Good:", _DOWNLOADED, "| Skip:", _SKIPPED, "| Error:", _EXCEPTIONED, "| Term Queue:", _SEARCH_QUEUE, "| Term Working:", _SEARCH_WORKING)
        if searches_done:
            break

In [4]:
# Dataset 1, Depth 3
"""
search_terms = [
    "Joe Biden", "Kamala Harris", "Donald Trump", "Mike Pence", "Ron DeSantis",
    "Barack Obama", "Angela Merkel", "Vladimir Putin", "Xi Jinping", "Emmanuel Macron",
    "Narendra Modi", "Jair Bolsonaro", "Justin Trudeau", "Boris Johnson", "Jacinda Ardern",
    "Leonardo DiCaprio", "Meryl Streep", "Denzel Washington", "Tom Hanks", "Jennifer Lawrence",
    "Scarlett Johansson", "Brad Pitt", "Angelina Jolie", "Johnny Depp", "Nicole Kidman",
    "Beyoncé", "Taylor Swift", "Kanye West", "Rihanna", "Jay-Z",
    "Drake", "Adele", "Ed Sheeran", "Justin Bieber", "Lady Gaga",
    "Cristiano Ronaldo", "Lionel Messi", "LeBron James", "Serena Williams", "Roger Federer",
    "Usain Bolt", "Michael Phelps", "Simone Biles", "Novak Djokovic", "Tiger Woods",
    "Elon Musk", "Jeff Bezos", "Bill Gates", "Mark Zuckerberg", "Warren Buffett",
    "Tim Cook", "Larry Page", "Sergey Brin", "Jack Ma", "Mukesh Ambani",
    "Oprah Winfrey", "Ellen DeGeneres", "Kim Kardashian", "Kylie Jenner", "Greta Thunberg",
    "Malala Yousafzai", "Dalai Lama", "Pope Francis", "Donald Glover", "Dave Chappelle",
    "Stephen King", "J.K. Rowling", "George R.R. Martin", "Neil Gaiman", "Margaret Atwood",
    "Steven Spielberg", "Christopher Nolan", "Quentin Tarantino", "Martin Scorsese", "Wes Anderson",
    "Ariana Grande", "Snoop Dogg", "Eminem", "Shakira", "Katy Perry",
    "Dwayne Johnson", "Chris Hemsworth", "Robert Downey Jr.", "Hugh Jackman", "Ryan Reynolds"
]
"""

# Dataset 2, Depth 3
"""
search_terms = [
    "Apple", "Microsoft", "Google", "Amazon", "Facebook",
    "Berkshire Hathaway", "Visa", "Johnson & Johnson", "Walmart", "Procter & Gamble",
    "Intel", "Verizon Communications", "AT&T", "Coca-Cola", "Pfizer",
    "Chevron", "Home Depot", "Merck & Co.", "Disney", "Cisco Systems",
    "IBM", "Boeing", "Goldman Sachs", "McDonald's", "3M",
    "Nike", "American Express", "UnitedHealth Group", "ExxonMobil", "General Electric",
    "JP Morgan Chase", "Morgan Stanley", "Tesla", "Netflix", "Adobe Systems",
    "Salesforce", "Oracle", "PayPal", "Qualcomm", "AMD",
    "eBay", "General Motors", "Ford Motor Company", "Delta Air Lines", "United Airlines",
    "Lockheed Martin", "Raytheon Technologies", "Northrop Grumman", "General Dynamics", "Honeywell",
    "Caterpillar", "DuPont", "FedEx", "UPS", "Booz Allen Hamilton",
    "Charles Schwab", "Citigroup", "Bank of America", "Wells Fargo", "Goldman Sachs Group",
    "American Airlines Group", "Southwest Airlines", "Gilead Sciences", "Bristol-Myers Squibb", "Amgen",
    "Biogen", "Celgene", "AbbVie", "Eli Lilly and Company", "Merck",
    "Snap Inc.", "Twitter", "LinkedIn", "Spotify", "Square",
    "Lyft", "Uber Technologies", "Airbnb", "SpaceX", "Palantir Technologies",
    "Stripe", "Robinhood", "Coinbase", "Zoom Video Communications", "Slack Technologies",
    "Dell Technologies", "HP Inc.", "IBM Corporation", "Intel Corporation", "Micron Technology",
    "NVIDIA", "Broadcom Inc.", "Texas Instruments", "Qualtrics", "Snowflake",
    "DoorDash", "Instacart", "Etsy", "Roku", "Pinterest"
]
"""

# Dataset 3
"""
search_terms = [
    "True Crime Stories", "Startup Founders", "Health and Wellness Tips", "Investing for Beginners", "Science Discoveries",
    "History Mysteries", "Technology Innovations", "Mindfulness and Meditation", "Global News Analysis", "Book Review Series",
    "Comedy Shows", "Film Criticism", "Music Industry Insights", "Sports Commentary", "Fashion Trends",
    "Culinary Arts", "Language Learning", "Travel Adventures", "Parenting Advice", "Personal Finance Strategies",
    "Political Analysis", "Art and Design", "Entrepreneurship Lessons", "Educational Strategies", "Mental Health Support",
    "Fitness Coaching", "Environmental Conservation", "Digital Marketing", "Philosophy and Ethics", "Astronomy and Space",
    "Mythology and Folklore", "Cybersecurity Updates", "Blockchain and Cryptocurrency", "Artificial Intelligence Trends", "Real Estate Investing",
    "Sustainability Practices", "Creative Writing", "Public Speaking Tips", "Career Development", "Classical Music Appreciation",
    "Video Game Reviews", "Comics and Graphic Novels", "Quantum Computing", "Social Media Strategies", "DIY Home Projects",
    "Yoga and Pilates", "Pet Care and Training", "Indie Film Making", "Jazz Music Exploration", "Virtual Reality Experiences"
]
"""

# Dataset 3
"""
search_terms = [
    "TED Talk Artificial Intelligence", "TED Talk Blockchain Technology", "TED Talk Climate Change", "TED Talk Cybersecurity", "TED Talk Data Privacy",
    "TED Talk Digital Health", "TED Talk E-commerce Trends", "TED Talk Future of Work", "TED Talk Genetic Engineering", "TED Talk Green Energy",
    "TED Talk Human Rights", "TED Talk Innovation in Education", "TED Talk Journalism and Media", "TED Talk Leadership and Management", "TED Talk Mental Health",
    "TED Talk Neurotechnology", "TED Talk Online Learning", "TED Talk Pandemic Response", "TED Talk Quantum Computing", "TED Talk Racial Equality",
    "TED Talk Renewable Resources", "TED Talk Space Exploration", "TED Talk Sustainable Living", "TED Talk Technology and Society", "TED Talk Universal Basic Income",
    "TED Talk Virtual Reality", "TED Talk Women in STEM", "TED Talk Xenotransplantation", "TED Talk Youth Activism", "TED Talk Zero Waste Lifestyle",
    "TED Talk Creative Problem Solving", "TED Talk Digital Nomads", "TED Talk Emotional Intelligence", "TED Talk Financial Literacy", "TED Talk Globalization and Trade",
    "TED Talk Health and Nutrition", "TED Talk Immigrant Experiences", "TED Talk Job Automation", "TED Talk Knowledge Sharing", "TED Talk Language Preservation",
    "TED Talk Mindfulness and Meditation", "TED Talk Nature Conservation", "TED Talk Open Source Software", "TED Talk Political Polarization", "TED Talk Quantum Teleportation",
    "TED Talk Refugee Crisis", "TED Talk Social Entrepreneurship", "TED Talk Telemedicine", "TED Talk Urban Planning", "TED Talk Vaccine Development"
]
"""

# Dataset 3
"""
search_terms = [
    "Khan Academy", "CrashCourse", "TED-Ed", "National Geographic", "SciShow",
    "MinutePhysics", "Vsauce", "CGP Grey", "Veritasium", "SmarterEveryDay",
    "AsapSCIENCE", "Physics Girl", "Numberphile", "Kurzgesagt – In a Nutshell", "BBC Earth",
    "The School of Life", "Big Think", "PBS Space Time", "3Blue1Brown", "BrainCraft",
    "Extra Credits", "Deep Look", "It's Okay To Be Smart", "The Royal Institution", "Practical Engineering",
    "Historia Civilis", "The Art Assignment", "TEDx Talks", "Computerphile", "Seeker",
    "Vsauce2", "Vsauce3", "Wendover Productions", "Real Engineering", "Biographics",
    "SciShow Space", "SciShow Psych", "MinuteEarth", "Tom Scott", "Cody'sLab",
    "Nerdwriter1", "PolyMatter", "Half as Interesting", "TierZoo", "PBS Eons",
    "The Infographics Show", "Overly Sarcastic Productions", "PatrickJMT", "Sixty Symbols", "Geography Now"
]
"""

# Dataset 4, Depth 3
"""
search_terms = [
    "Unbox Therapy", "Marques Brownlee", "Linus Tech Tips", "Dave Lee", "Jonathan Morrison",
    "Austin Evans", "TechSource", "JerryRigEverything", "CNET", "The Verge",
    "MKBHD", "TechRadar", "Engadget", "TLD Today", "UrAvgConsumer",
    "Flossy Carter", "PhoneBuff", "iJustine", "What's Inside?", "TechSmartt",
    "Mrwhosetheboss", "Pocketnow", "SuperSaf TV", "TechnoBuffalo", "Android Authority",
    "Jenna Ezarik", "Lou Later", "Tech Chap", "MobileTechReview", "EverythingApplePro",
    "TechMeOut", "GadgetMatch", "Erica Griffin", "TheUnlockr", "Zollotech",
    "Booredatwork.com", "TechCrunch", "GSMArena Official", "Geekyranjit", "Tech Spurt",
    "Safwan Ahmedmia (SuperSaf)", "Chris Stuckmann", "Canoopsy", "DetroitBORG", "iGyaan",
    "Krystal Lora", "Matthew Moniz", "Michael Fisher", "Mr Mobile", "Dave2D"
]
"""

# Dataset 4, Depth 3
"""search_terms = [
    "Tasty", "Binging with Babish", "Rosanna Pansino", "Gordon Ramsay", "Jamie Oliver",
    "King of Random", "5-Minute Crafts", "Bright Side", "ThreadBanger", "Laura in the Kitchen",
    "Yoga With Adriene", "FitnessBlender", "Bob Ross", "Adam Savage’s Tested", "Anne Reardon - How To Cook That",
    "Clean My Space", "Garden Answer", "Home Repair Tutor", "This Old House", "Man Sewing",
    "The Sorry Girls", "Makeup Geek", "Wayne Goss", "FreeCodeCamp.org", "Traversy Media",
    "Justin Guitar", "Drumeo", "Chess.com", "Peter McKinnon", "PHLEARN",
    "Sewing Parts Online", "Crazy Russian Hacker", "Primitive Technology", "Grant Thompson", "The Art Sherpa",
    "Blender Guru", "Langfocus", "Easy Languages", "Khan Academy", "CG Cookie",
    "JunsKitchen", "Maangchi", "ChefSteps", "Joshua Weissman", "Food Wishes",
    "Gentle Whispering ASMR", "The Coding Train", "Corey Schafer", "Academy of Ideas", "School of Life"
]"""

# Dataset 4, Depth 3
"""search_terms = [
    "Casey Neistat", "Zoella", "Logan Paul", "Shaytards", "Jenna Marbles",
    "David Dobrik", "PewDiePie", "Roman Atwood Vlogs", "Tyler Oakley", "Connor Franta",
    "FunForLouis", "Tanya Burr", "PointlessBlogVlogs", "Safiya Nygaard", "Dude Perfect",
    "Liza Koshy", "The Dolan Twins", "Emma Chamberlain", "James Charles", "Jeffree Star",
    "NikkieTutorials", "Shane Dawson", "Trisha Paytas", "Gigi Gorgeous", "Joey Graceffa",
    "The Ace Family", "The Michalaks", "Sailing La Vagabonde", "Jon Olsson", "Erik Conover",
    "Louis Cole", "Wil Dasovich", "Raya Was Here", "Lost LeBlanc", "Kara and Nate",
    "Sam Kolder", "Peter McKinnon", "Exploring Alternatives", "Vanessa Lau", "GaryVee",
    "Ben Brown", "Lavendaire", "Mimi Ikonn", "Alex Ikonn", "Rachel Aust",
    "The Bucket List Family", "Flying The Nest", "Kombi Life", "Eileen Aldis", "Bald and Bankrupt"
]"""


# Dataset 4, Depth 3
"""
search_terms = [
    "Intelligence Squared Debates", "Oxford Union", "Cambridge Union", "TED Conferences", "The Doha Debates",
    "Big Think", "The Aspen Institute", "World Economic Forum", "National Geographic", "PBS NewsHour",
    "Democracy Now!", "The Rubin Report", "C-SPAN", "Hoover Institution", "Council on Foreign Relations",
    "Carnegie Council for Ethics in International Affairs", "The Heritage Foundation", "Brookings Institution", "Chatham House", "American Enterprise Institute",
    "Pew Research Center", "The Economist", "Financial Times", "Vox Media", "Vice News",
    "Real Time with Bill Maher", "The Agenda with Steve Paikin", "The Royal Society", "London School of Economics and Political Science", "Harvard University",
    "Stanford University", "Yale University", "Princeton University", "MIT Media Lab", "University of California Television",
    "The New School", "New York Public Library", "Chicago Ideas", "TEDx Talks", "92nd Street Y",
    "Sydney Opera House Talks & Ideas", "RSA Events", "Skepticon", "How To Academy", "Battle of Ideas"
]
"""

# Dataset 5, Depth 5
"""
search_terms = [
    "Classical Mechanics", "Quantum Mechanics", "Thermodynamics", "Electromagnetism", "Relativity",
    "Statistical Mechanics", "Condensed Matter Physics", "Particle Physics", "Nuclear Physics", "Astrophysics",
    "Cosmology", "Fluid Dynamics", "Plasma Physics", "Field Theory", "Chaos Theory",
    "Quantum Field Theory", "String Theory", "Quantum Gravity", "Loop Quantum Gravity", "Quantum Electrodynamics",
    "Quantum Chromodynamics", "Electroweak Interaction", "Strong Nuclear Force", "Weak Nuclear Force", "General Relativity",
    "Special Relativity", "Photonics", "Optics", "Acoustics", "Solid State Physics",
    "Superconductivity", "Superfluidity", "Magnetism", "Electronics", "Quantum Computing",
    "Photovoltaics", "Nanotechnology", "Materials Science", "Energy Physics", "Biophysics",
    "Geophysics", "Meteorology", "Environmental Physics", "Medical Physics", "Neurophysics",
    "Quantum Optics", "Atomic Physics", "Molecular Physics", "Femtoscience", "Nanophysics",
    "Cryogenics", "Photonics", "Nuclear Fusion", "Nuclear Fission", "Particle Accelerators",
    "Higgs Boson", "Quantum Tunneling", "Quantum Entanglement", "Quantum Superposition", "Quantum Decoherence",
    "Bell's Theorem", "Schrodinger's Cat", "Heisenberg Uncertainty Principle", "Pauli Exclusion Principle", "Fermi-Dirac Statistics",
    "Bose-Einstein Condensate", "Wave-Particle Duality", "Michelson-Morley Experiment", "LIGO", "Gravitational Waves",
    "Dark Matter", "Dark Energy", "Black Holes", "White Dwarfs", "Neutron Stars",
    "Supernovae", "Galaxies", "The Big Bang Theory", "Cosmic Microwave Background", "Inflation Theory",
    "Multiverse Theory", "Quantum Fluctuations", "Spacetime Singularities", "Wormholes", "Time Dilation",
    "Lorentz Transformation", "Gauge Theory", "Topological Defects", "Quantum Anomalies", "Entropy",
    "Second Law of Thermodynamics", "Carnot Cycle", "Maxwell's Equations", "Gauss's Law", "Faraday's Law",
    "Atomic Structure", "Periodic Table", "Chemical Bonds", "Molecular Geometry", "Stoichiometry",
    "Thermochemistry", "Chemical Kinetics", "Chemical Equilibrium", "Acids and Bases", "Redox Reactions",
    "Electrochemistry", "Quantum Chemistry", "Spectroscopy", "Chromatography", "Mass Spectrometry",
    "Nuclear Chemistry", "Organic Chemistry", "Inorganic Chemistry", "Physical Chemistry", "Analytical Chemistry",
    "Biochemistry", "Environmental Chemistry", "Industrial Chemistry", "Polymer Chemistry", "Pharmaceutical Chemistry",
    "Green Chemistry", "Supramolecular Chemistry", "Materials Science", "Nanochemistry", "Chemical Thermodynamics",
    "Chemical Engineering", "Catalysis", "Reaction Mechanisms", "Solubility", "Phase Equilibria",
    "Coordination Chemistry", "Crystallography", "Solid State Chemistry", "Computational Chemistry", "Chemical Nomenclature",
    "Molecular Orbitals", "Valence Bond Theory", "Electron Configuration", "Lewis Structures", "VSEPR Theory",
    "Molecular Polarity", "Hybridization", "Sigma and Pi Bonds", "Ionic Bonds", "Covalent Bonds",
    "Metallic Bonds", "Van der Waals Forces", "Hydrogen Bonding", "Dipole-Dipole Interactions", "London Dispersion Forces",
    "Colligative Properties", "Osmosis and Diffusion", "Electrophilic Addition", "Nucleophilic Substitution", "Elimination Reactions",
    "Aromaticity", "Alkanes, Alkenes, Alkynes", "Functional Groups", "Stereochemistry", "Carbohydrates",
    "Proteins", "Nucleic Acids", "Lipids", "Enzyme Kinetics", "Metabolism",
    "Vitamins and Minerals", "Toxicology", "Pharmacology", "Medicinal Chemistry", "Drug Design",
    "Chemoinformatics", "Petrochemistry", "Agrochemistry", "Food Chemistry", "Cosmetic Chemistry",
    "Atmospheric Chemistry", "Ocean Chemistry", "Soil Chemistry", "Water Chemistry", "Fire Chemistry",
    "Color Chemistry", "Photochemistry", "Electrolytes", "Non-covalent Interactions", "Chemical Safety",
    "Radiochemistry", "Actinides", "Lanthanides", "Transition Metals", "Rare Earth Elements",
    "Cell Theory", "Evolution", "Genetics", "Homeostasis", "Ecology",
    "Photosynthesis", "Cellular Respiration", "Molecular Biology", "Biotechnology", "Microbiology",
    "Immunology", "Neurobiology", "Developmental Biology", "Plant Biology", "Animal Physiology",
    "Population Biology", "Conservation Biology", "Behavioral Biology", "Bioinformatics", "Evolutionary Biology",
    "Genomics", "Proteomics", "Metabolomics", "Structural Biology", "Pharmacology",
    "Environmental Biology", "Marine Biology", "Zoology", "Botany", "Mycology",
    "Parasitology", "Virology", "Bacteriology", "Endocrinology", "Reproductive Biology",
    "Taxonomy", "Biophysics", "Biochemistry", "Cellular Anatomy", "Systematics",
    "Paleobiology", "Biogeography", "Epidemiology", "Pathology", "Entomology",
    "Herpetology", "Ornithology", "Ichthyology", "Mammalogy", "Primatology",
    "Ecotoxicology", "Aquatic Biology", "Soil Biology", "Agronomy", "Horticulture",
    "Forestry", "Bioclimatology", "Paleontology", "Chronobiology", "Gerontology",
    "Cryobiology", "Biomechanics", "Biomimetics", "Synthetic Biology", "Systems Biology",
    "Astrobiology", "Nanobiology", "Computational Biology", "Functional Genomics", "Transcriptomics",
    "Phylogenetics", "Molecular Genetics", "Population Genetics", "Conservation Genetics", "Quantitative Genetics",
    "Molecular Pathogenesis", "Cancer Biology", "Neuroscience", "Cardiovascular Biology", "Pulmonary Biology",
    "Gastrointestinal Biology", "Renal Biology", "Musculoskeletal Biology", "Dermatological Biology", "Ophthalmology Biology",
    "Otology Biology", "Immunopathology", "Microbial Genetics", "Plant Pathology", "Wildlife Biology",
    "Human Biology", "Medical Biology", "Veterinary Biology", "Ethnobiology", "Paleoanthropology",
    "Cognitive Biology", "Psychobiology", "Sociobiology", "Bioethics", "Biostatistics"
]
search_terms.reverse()
"""

# Dataset 6
search_terms = [
    "New York", "Los Angeles", "Chicago", "Houston", "Phoenix",
    "Philadelphia", "San Antonio", "San Diego", "Dallas", "San Jose",
    "Austin", "Jacksonville", "Fort Worth", "Columbus", "Charlotte",
    "San Francisco", "Indianapolis", "Seattle", "Denver", "Washington",
    "Boston", "El Paso", "Detroit", "Nashville", "Portland",
    "Memphis", "Oklahoma City", "Las Vegas", "Louisville", "Baltimore",
    "Milwaukee", "Albuquerque", "Tucson", "Fresno", "Mesa",
    "Sacramento", "Atlanta", "Kansas City", "Colorado Springs", "Miami",
    "Raleigh", "Omaha", "Long Beach", "Virginia Beach", "Oakland",
    "Minneapolis", "Tulsa", "Arlington", "Tampa", "New Orleans",
    "Wichita", "Cleveland", "Bakersfield", "Aurora", "Anaheim",
    "Honolulu", "Santa Ana", "Riverside", "Corpus Christi", "Lexington",
    "Stockton", "Henderson", "Saint Paul", "St. Louis", "Cincinnati",
    "Pittsburgh", "Greensboro", "Anchorage", "Plano", "Lincoln",
    "Orlando", "Irvine", "Newark", "Toledo", "Durham",
    "Chula Vista", "Fort Wayne", "Jersey City", "St. Petersburg", "Laredo",
    "Madison", "Chandler", "Buffalo", "Lubbock", "Scottsdale",
    "Reno", "Glendale", "Gilbert", "Winston–Salem", "North Las Vegas",
    "Norfolk", "Chesapeake", "Garland", "Irving", "Hialeah",
    "London", "Edinburgh", "Manchester", "Birmingham", "Glasgow",
    "Liverpool", "Bristol", "Oxford", "Cambridge", "Cardiff",
    "Brighton", "Newcastle", "Leeds", "Belfast", "Nottingham",
    "Sheffield", "Leicester", "Coventry", "Bradford", "Southampton",
    "Stoke-on-Trent", "Derby", "Sunderland", "Bath", "Lincoln",
    "Exeter", "Norwich", "Plymouth", "Lancaster", "Newport",
    "Preston", "Swansea", "Salford", "Aberdeen", "Westminster",
    "Portsmouth", "York", "Peterborough", "Dundee", "Lancaster",
    "Oxford", "Wolverhampton", "St Albans", "Chester", "Swindon",
    "Reading", "Blackburn", "Worcester", "Newcastle under Lyme", "Bolton",
    "Bournemouth", "Norwich", "Maidstone", "Chelmsford", "Cambridge",
    "Doncaster", "Dundee", "Wolverhampton", "Southampton", "Swansea",
    "Plymouth", "Luton", "Farnborough", "Medway", "Worthing",
    "Ipswich", "Middlesbrough", "Sunderland", "Colchester", "Crawley",
    "Crewe", "Darlington", "Eastbourne", "Exeter", "Gloucester",
    "Guildford", "Hamilton", "Hastings", "Hemel Hempstead", "Hereford",
    "High Wycombe", "Huddersfield", "Hull", "Inverness", "Ipswich",
    "Kilmarnock", "Kingston upon Thames", "Lancaster", "Leeds", "Leicester",
    "Lichfield", "Lincoln", "Lisburn", "Liverpool", "Livingston",
    "Llandudno", "Loughborough", "Lowestoft", "Luton", "Macclesfield",
    "Sydney", "Melbourne", "Brisbane", "Perth", "Adelaide",
    "Gold Coast", "Canberra", "Newcastle", "Wollongong", "Logan City",
    "Geelong", "Hobart", "Townsville", "Cairns", "Darwin",
    "Toowoomba", "Ballarat", "Bendigo", "Albury", "Launceston",
    "Mackay", "Rockhampton", "Bunbury", "Bundaberg", "Coffs Harbour",
    "Wagga Wagga", "Hervey Bay", "Mildura", "Port Macquarie", "Tamworth",
    "Orange", "Dubbo", "Geraldton", "Nowra", "Bathurst",
    "Warrnambool", "Lismore", "Gladstone", "Alice Springs", "Mount Gambier",
    "Kalgoorlie", "Taree", "Traralgon", "Orange", "Bowral",
    "Busselton", "Kalgoorlie", "Albany", "Warragul", "Devonport",
    "Lismore", "Maryborough", "Broken Hill", "Taree", "Ballina",
    "Goulburn", "Armidale", "Gympie", "Mount Isa", "Launceston",
    "Burnie", "Murray Bridge", "Mount Barker", "Victor Harbor", "Whyalla",
    "Cessnock", "Hobart", "Geelong", "Townsville", "Cairns",
    "Toowoomba", "Mandurah", "Port Hedland", "Emerald", "Broome",
    "Karratha", "Griffith", "Kingston", "Rockingham", "Bunbury",
    "Horsham", "Port Lincoln", "Warwick", "Kempsey", "Portland",
    "Grafton", "Katherine", "Bairnsdale", "Sale", "Muswellbrook",
    "Parkes", "Lithgow", "Singleton", "Port Pirie", "Campbelltown",
    "Toronto", "Montreal", "Vancouver", "Calgary", "Edmonton",
    "Ottawa", "Winnipeg", "Quebec City", "Hamilton", "Kitchener",
    "London", "Victoria", "Halifax", "Oshawa", "Windsor",
    "Saskatoon", "St. Catharines", "Regina", "St. John’s", "Kelowna",
    "Barrie", "Sherbrooke", "Guelph", "Abbotsford", "Kingston",
    "Kanata", "Trois-Rivières", "Moncton", "Chicoutimi", "Milton",
    "Red Deer", "Brantford", "Thunder Bay", "White Rock", "Nanaimo",
    "Sudbury", "Lethbridge", "Saint-Jean-sur-Richelieu", "Peterborough", "Kamloops",
    "Saint-Jérôme", "Chilliwack", "Sarnia", "Châteauguay", "Drummondville",
    "Belleville", "Fort McMurray", "Sault Ste. Marie", "Prince George", "Medicine Hat",
    "Welland", "Grande Prairie", "Airdrie", "Granby", "Fredericton",
    "Saint John", "Beloeil", "North Bay", "Saint-Hyacinthe", "Brandon",
    "Vernon", "New Westminster", "Woodstock", "Georgetown", "St. Thomas",
    "Rocky Mountain House", "Lloydminster", "Orillia", "Stratford", "Orangeville",
    "Cape Breton-Sydney", "Courtenay", "Leduc", "Timmins", "Quesnel",
    "Brooks", "Fort St. John", "Cranbrook", "Squamish", "Saint-Georges",
    "Clarington", "Rimouski", "Laval", "Marieville", "Penticton",
    "St. Albert", "Sherwood Park", "Lévis", "Saguenay", "Trois-Rivières",
    "Lakeshore", "Kelowna", "Cambridge", "Whitby", "Gatineau",
    "Langley", "Ajax", "Prince Albert", "Pickering", "Oakville"
]

In [None]:
lock = threading.Lock()
start_time = time.time()
starting_videos = str(len(existing_videos))

print("Starting with " + starting_videos + " videos")
print("Starting at", start_time)
print("\n")

global searches_done
searches_done = False

global _SEARCH_QUEUE
_SEARCH_QUEUE += len(search_terms)

# Starting the summary thread
summary_thread = threading.Thread(target=print_summary_periodically, args=(5, 12))
summary_thread.start()
    
with ThreadPoolExecutor(max_workers = 128) as executor:
    for search_term in search_terms:
        time.sleep(random.uniform(0, 20))
        executor.submit(perform_search, search_term)
        
with lock:
    searches_done = True

summary_thread.join()

end_time = time.time()
ending_videos = str(len(existing_videos))
print("DONE", int(time.time()), "| Queued:", _QUEUED, "| Working:", _WORKING, "| Downloading:", _DOWNLOADING, "| Downloaded:", _DOWNLOADED, "| Skipped:", _SKIPPED, "| Exceptioned:", _EXCEPTIONED)
print("\n")
print("Started with " + str(starting_videos) + " videos")
print("Started at", start_time)
print("Ending with " + str(ending_videos) + " videos")
print("Ending at", end_time)

Starting with 55784 videos
Starting at 1708564427.6764553


1 1708564432 | Queued: 0 | Working: 0 | Downloading: 0 | Good: 0 | Skip: 0 | Error: 0 | Term Queue: 390 | Term Working: 0
STARTING SEARCH TERM New York
STARTING SEARCH TERM Los Angeles
2 1708564437 | Queued: 0 | Working: 0 | Downloading: 0 | Good: 0 | Skip: 0 | Error: 0 | Term Queue: 388 | Term Working: 2
3 1708564442 | Queued: 0 | Working: 0 | Downloading: 0 | Good: 0 | Skip: 0 | Error: 0 | Term Queue: 388 | Term Working: 2
4 1708564447 | Queued: 127 | Working: 2 | Downloading: 1 | Good: 1 | Skip: 63 | Error: 0 | Term Queue: 388 | Term Working: 2
STARTING SEARCH TERM Chicago
5 1708564452 | Queued: 112 | Working: 2 | Downloading: 0 | Good: 5 | Skip: 74 | Error: 0 | Term Queue: 387 | Term Working: 3
6 1708564458 | Queued: 99 | Working: 2 | Downloading: 1 | Good: 8 | Skip: 84 | Error: 0 | Term Queue: 387 | Term Working: 3
STARTING SEARCH TERM Houston
7 1708564463 | Queued: 114 | Working: 3 | Downloading: 2 | Good: 11 | Skip: 162

In [10]:
# import glob
# import os
# from moviepy.editor import AudioFileClip

# def static_analysis(location):
#     folders = {x for x in os.listdir(location)}
#     print("Videos crawled:", len(folders))
#     print()
    
#     has_audio = 0 
#     has_at_least_one_transcript = 0
    
#     total_bytes = 0
#     total_length = 0
#     transcripts = {}   

#     for folder in folders[:500]:
#         if not os.path.exists(location + folder + "/audio.mp4"):
#             continue
#         total_bytes += os.path.getsize(location + folder + "/audio.mp4")
#         if total_bytes == 0:
#             continue
#         has_audio += 1
#         with AudioFileClip(location + folder + "/audio.mp4") as audio:
#             total_length += audio.duration
#         has_transcript = False
#         for file in os.listdir(location + folder):
#             if ".json" in file:
#                 has_transcript = True
#                 language = file.replace("caption-", "").replace(".json", "")
#                 if language not in transcripts:
#                     transcripts[language] = 0
#                 transcripts[language] += 1
#         if has_transcript:
#             has_at_least_one_transcript += 1
        
#     print("Downloaded mp4 Files:", has_audio)
#     print("Total mp4 Audio Duration (seconds):", int(total_length))
#     print("Total mp4 Filesize (bytes):", total_bytes)
#     print()
    
#     transcripts = sorted(transcripts.items(), key=lambda item: item[1], reverse=True)
#     print("Has at least one transcript:", has_at_least_one_transcript)
#     for language, count in transcripts:
#         print("Transcripts of language", language + ":", count)

# static_analysis("dataset-4/")

Videos crawled: 5367



TypeError: 'set' object is not subscriptable