In [1]:
import json
import logging
import os
import random
import threading
import time
from concurrent.futures import ThreadPoolExecutor, as_completed

from pytube import Search
# Raise the root logger's threshold so only ERROR-level (and higher) records are emitted.
logging.getLogger().setLevel(logging.ERROR)

In [2]:
# Shared progress counters. Worker threads update them while holding `lock`,
# and the summary thread prints them periodically.

# Per-video pipeline: waiting to be attempted / being attempted / actively downloading.
_QUEUED = _WORKING = _DOWNLOADING = 0

# Per-video outcomes: already present / failed with an exception / fetched successfully.
_SKIPPED = _EXCEPTIONED = _DOWNLOADED = 0

# Per-search-term pipeline: terms not started yet / terms currently being processed.
_SEARCH_QUEUE = _SEARCH_WORKING = 0

# Set to True once any worker sees an "HTTP Error 429" so later searches are not started.
_HIT_429 = False

In [3]:
def attempt_download(yt_obj, existing_videos, location):
    """Download the itag-139 audio stream and English captions for one video.

    Files are written to ``location + <video_id> + "/"`` as ``audio.mp4`` and
    ``caption-<code>.json`` (one per caption track whose code contains "en").

    Args:
        yt_obj: pytube video object; ``video_id``, ``streams`` and ``captions``
            are read from it.
        existing_videos: shared set of video ids that are already handled. The
            id is claimed here for the duration of the attempt and released
            again if the attempt fails.
        location: output directory prefix; it is concatenated directly with the
            video id, so it is expected to end with a path separator.

    Returns:
        1 if the video was downloaded, 0 if it was skipped or failed, and
        -1 if the failure message contained "HTTP Error 429" (the caller
        should stop issuing requests).

    The module-level counters are updated under ``lock``; ``_WORKING`` is
    decremented exactly once on every return path.
    """
    global _SKIPPED, _EXCEPTIONED, _DOWNLOADING, _DOWNLOADED, _WORKING, _HIT_429
    import shutil

    vid = yt_obj.video_id

    # Check and claim atomically so two workers that find the same video in
    # different searches cannot both start downloading it.
    with lock:
        if vid in existing_videos:
            _SKIPPED += 1
            _WORKING -= 1
            return 0
        existing_videos.add(vid)

    target_dir = location + vid
    created_dir = False
    downloading = False
    try:
        stream = yt_obj.streams.get_by_itag(139)
        if stream is None:
            raise LookupError("no stream with itag 139")

        # Create the folder only once there is something to put in it: folders
        # on disk are treated as "already downloaded" by later runs.
        created_dir = not os.path.isdir(target_dir)
        os.makedirs(target_dir, exist_ok=True)

        with lock:
            _DOWNLOADING += 1
        downloading = True
        stream.download(output_path = target_dir + "/", filename = "audio.mp4")

        for caption in yt_obj.captions:
            if "en" in caption.code:
                caption_path = target_dir + "/caption-" + caption.code + ".json"
                with open(caption_path, 'w') as caption_file:
                    json.dump(caption.json_captions, caption_file)
    except Exception as e:
        print(vid, "Stopping download because of Exception", e)
        if created_dir:
            # Remove the partial folder so a later run does not mistake it for
            # a finished download.
            shutil.rmtree(target_dir, ignore_errors=True)
        # str(e) is used instead of e.args[0], which may be absent or not a string.
        hit_429 = "HTTP Error 429" in str(e)
        with lock:
            existing_videos.discard(vid)
            _EXCEPTIONED += 1
            _WORKING -= 1
            if downloading:
                _DOWNLOADING -= 1
            if hit_429:
                _HIT_429 = True
        if hit_429:
            print("KILLING THREAD DUE to 429 ERROR")
            return -1
        return 0

    with lock:
        _DOWNLOADED += 1
        _DOWNLOADING -= 1
        _WORKING -= 1
    return 1

In [4]:
def deep_search(search_term, existing_videos, location, depth):
    """Run one search, page through its results, and try to download each hit.

    Args:
        search_term: query string passed to ``Search``.
        existing_videos: shared set of known video ids, forwarded to
            ``attempt_download``.
        location: output directory prefix, forwarded to ``attempt_download``.
        depth: number of result pages to load; the first page plus
            ``depth - 1`` calls to ``get_next_results()``.

    Returns:
        ``(search_results, hits)`` - the number of results found and the number
        downloaded - or ``-1`` if ``attempt_download`` reported a 429 and the
        search was abandoned.

    ``_SEARCH_WORKING`` is always decremented on exit, and any results still
    counted in ``_QUEUED`` when the function leaves early are removed from it.
    """
    global _SEARCH_QUEUE, _SEARCH_WORKING, _QUEUED, _WORKING

    print("STARTING SEARCH TERM", search_term)

    with lock:
        _SEARCH_QUEUE -= 1
        _SEARCH_WORKING += 1

    pending = 0  # results this call added to _QUEUED and has not yet taken off again
    try:
        search = Search(search_term)
        # Bare attribute access kept from the original; presumably it triggers
        # the initial fetch before paging - TODO confirm against pytube.
        search.results
        for _ in range(depth - 1):
            # Random pause between page requests.
            time.sleep(random.uniform(0, 2))
            search.get_next_results()

        start_time = time.time()
        results = list(search.results)
        search_results = len(results)
        hits = 0

        with lock:
            _QUEUED += search_results
            pending = search_results

        for yt in results:
            with lock:
                _QUEUED -= 1
                _WORKING += 1
                pending -= 1
            result = attempt_download(yt, existing_videos, location)
            if result == -1:
                return -1
            hits += result

        duration = time.time() - start_time
        print("COMPLETED SEARCH TERM", search_term, "|", str(int(duration)), "seconds |", str(int(search_results)), "results |", str(int(hits)), "downloaded")
        return search_results, hits
    finally:
        with lock:
            _SEARCH_WORKING -= 1
            _QUEUED -= pending

In [5]:
def perform_search(search_term, depth=10, location="dataset-second-b1/"):
    """Thread-pool entry point: run ``deep_search`` for one term unless a 429 was seen.

    Args:
        search_term: query string to search for.
        depth: number of result pages to load (defaults to 10).
        location: output directory prefix for downloads
            (defaults to "dataset-second-b1/").

    Returns:
        None. If ``_HIT_429`` is set, the term is only removed from
        ``_SEARCH_QUEUE`` and no request is made.
    """
    # Without this declaration the `-=` below makes the name function-local
    # and raises UnboundLocalError.
    global _SEARCH_QUEUE

    with lock:
        rate_limited = _HIT_429
        if rate_limited:
            _SEARCH_QUEUE -= 1
    if rate_limited:
        print("429 HIT, NOT PERFORMING SEARCH", search_term)
        return
    deep_search(search_term, existing_videos, location, depth)

In [6]:
def print_summary_periodically(interval, diff_interval):
    """Print a one-line snapshot of the progress counters every ``interval`` seconds.

    Each pass sleeps first, then prints the pass number, the current Unix time
    and all counters while holding ``lock``. The loop ends after the first
    print that finds ``searches_done`` truthy. ``diff_interval`` is accepted
    but not used.
    """
    tick = 0
    keep_running = True
    while keep_running:
        tick += 1
        time.sleep(interval)
        with lock:
            print(
                tick, int(time.time()),
                "| Queued:", _QUEUED,
                "| Working:", _WORKING,
                "| Downloading:", _DOWNLOADING,
                "| Good:", _DOWNLOADED,
                "| Skip:", _SKIPPED,
                "| Error:", _EXCEPTIONED,
                "| Term Queue:", _SEARCH_QUEUE,
                "| Term Working:", _SEARCH_WORKING,
            )
        # The flag is read after the print, so at least one line is always emitted.
        keep_running = not searches_done

In [7]:
# Build the set of video ids that should not be downloaded again.

# Source 1: a text listing of folders; the second "/"-separated component of
# each line is taken as the video id.
existing_dirs_file = "combined_folders_1130am_feb25.txt"
with open(existing_dirs_file, 'r') as listing:
    split_lines = (raw_line.strip().split('/') for raw_line in listing)
    existing_videos_from_file = {pieces[1] for pieces in split_lines if len(pieces) > 1}
print(len(existing_videos_from_file))

# Source 2: every entry name found directly inside the local dataset folders.
existing_dirs_on_machine = ["dataset-b-1/", "dataset-b-2/", "dataset-b-3/", "dataset-second-b1/"]
existing_videos_on_machine = set()
for directory in existing_dirs_on_machine:
    existing_videos_on_machine.update(os.listdir(directory))
print(len(existing_videos_on_machine))

# Union of both sources; this is the set the worker threads consult.
existing_videos = existing_videos_on_machine.union(existing_videos_from_file)
print(len(existing_videos))

211718
57741
221214


In [8]:
# Search queries handed to the thread pool, one task per entry.
# NOTE(review): the entries appear to be names of Australian political figures - confirm.
search_terms = [
    "Scott Morrison", "Anthony Albanese", "Malcolm Turnbull", "Julia Gillard", "Kevin Rudd",
    "John Howard", "Paul Keating", "Bob Hawke", "Tony Abbott", "Bill Shorten",
    "Peter Dutton", "Tanya Plibersek", "Josh Frydenberg", "Penny Wong", "Richard Marles",
    "Barnaby Joyce", "Michael McCormack", "Christine Milne", "Adam Bandt", "Larissa Waters",
    "Richard Di Natale", "Bob Brown", "Simon Birmingham", "Marise Payne", "Mathias Cormann",
    "Christopher Pyne", "Joe Hockey", "Peter Costello", "Kim Beazley", "Mark Latham",
    "Julie Bishop", "Greg Hunt", "Christian Porter", "Linda Burney", "Catherine King",
    "Jim Chalmers", "Jason Clare", "Chris Bowen", "Albo", "Anne Aly",
    "Tim Wilson", "Andrew Leigh", "Sarah Hanson-Young", "Nick Xenophon", "Pauline Hanson",
    "Clive Palmer", "Jacqui Lambie", "Cory Bernardi", "Malcolm Roberts", "David Leyonhjelm",
    "Mehreen Faruqi", "Jordon Steele-John", "Amanda Stoker", "Pat Dodson", "Malarndirri McCarthy",
    "Ken Wyatt", "Lidia Thorpe", "Jenny McAllister", "Katy Gallagher", "Simon Crean",
    "Stephen Conroy", "Fiona Nash", "Mitch Fifield", "Michaelia Cash", "Bridget McKenzie",
    "Matt Canavan", "David Littleproud", "Angus Taylor", "Alan Tudge", "Stuart Robert",
    "Ed Husic", "Michelle Rowland", "Mark Butler", "Terri Butler", "Tony Burke",
    "Joel Fitzgibbon", "Deborah O'Neill", "Kim Carr", "Kristina Keneally", "Don Farrell",
    "Murray Watt", "Sue Lines", "Scott Ryan", "Tony Smith", "Bronwyn Bishop",
    "Peter Slipper", "Harry Jenkins", "Anna Burke", "Rob Mitchell", "Andrew Wallace",
    "David Smith", "Alicia Payne", "Andrew Giles", "Anne Stanley", "Anne Webster",
    "Anthony Byrne", "Antony Green", "Amanda Rishworth", "Alex Hawke", "Angie Bell"
]
# Reversed in place, so the last entry above is the first one submitted.
search_terms.reverse()

In [None]:
# Driver cell: submit every search term to a thread pool while a background
# thread prints the progress counters, then print a final report.
lock = threading.Lock()
start_time = time.time()
starting_videos = str(len(existing_videos))

print("Starting with " + starting_videos + " videos")
print("Starting at", start_time)
print("\n")

# Reset all shared state so re-running this cell starts from zero instead of
# accumulating onto the previous run. Clearing _HIT_429 means a re-run after a
# 429 retries rather than skipping every term.
_QUEUED = _WORKING = _DOWNLOADING = 0
_SKIPPED = _EXCEPTIONED = _DOWNLOADED = 0
_SEARCH_WORKING = 0
_HIT_429 = False
_SEARCH_QUEUE = len(search_terms)
searches_done = False

# Starting the summary thread
summary_thread = threading.Thread(target=print_summary_periodically, args=(5, 12))
summary_thread.start()

futures = []
try:
    with ThreadPoolExecutor(max_workers = 128) as executor:
        for search_term in search_terms:
            # Stagger submissions by a random delay of up to 20 seconds.
            time.sleep(random.uniform(0, 20))
            futures.append(executor.submit(perform_search, search_term))

    # The executor has shut down here, so every future is already finished;
    # this loop only surfaces exceptions raised inside the workers.
    for future in as_completed(futures):
        try:
            result = future.result()
        except Exception as exc:
            print(f'Top-level thread generated an exception: {exc}')
finally:
    # Always release the summary thread, even if this cell fails or is
    # interrupted; otherwise it would keep printing forever.
    with lock:
        searches_done = True
    summary_thread.join()

end_time = time.time()
ending_videos = str(len(existing_videos))
print("DONE", int(time.time()), "| Queued:", _QUEUED, "| Working:", _WORKING, "| Downloading:", _DOWNLOADING, "| Downloaded:", _DOWNLOADED, "| Skipped:", _SKIPPED, "| Exceptioned:", _EXCEPTIONED)
print("\n")
print("Started with " + str(starting_videos) + " videos")
print("Started at", start_time)
print("Ending with " + str(ending_videos) + " videos")
print("Ending at", end_time)

Starting with 221214 videos
Starting at 1709081062.3539186


STARTING SEARCH TERM Angie Bell
1 1709081067 | Queued: 0 | Working: 0 | Downloading: 0 | Good: 0 | Skip: 0 | Error: 0 | Term Queue: 99 | Term Working: 1
2 1709081072 | Queued: 0 | Working: 0 | Downloading: 0 | Good: 0 | Skip: 0 | Error: 0 | Term Queue: 99 | Term Working: 1
STARTING SEARCH TERM Alex Hawke
3 1709081077 | Queued: 0 | Working: 0 | Downloading: 0 | Good: 0 | Skip: 0 | Error: 0 | Term Queue: 98 | Term Working: 2
STARTING SEARCH TERM Amanda Rishworth
8tsR4FSkV2A Stopping download because of Exception 8tsR4FSkV2A is age restricted, and can't be accessed without logging in.
4 1709081082 | Queued: 59 | Working: 1 | Downloading: 1 | Good: 0 | Skip: 112 | Error: 1 | Term Queue: 97 | Term Working: 3
5 1709081087 | Queued: 58 | Working: 1 | Downloading: 1 | Good: 1 | Skip: 112 | Error: 1 | Term Queue: 97 | Term Working: 3
STARTING SEARCH TERM Antony Green
AHXV_XgDBmU Stopping download because of Exception AHXV_XgDBmU is ag