This notebook contains all scraping-related functionality.

The parts of the code which call the ``yt-dlp`` module are all written out in this notebook to be able to conveniently run ``yt-dlp`` via iPython shell commands. Other parts are mostly outsourced to ``scraping_utils.py`` and ``transcript_utils.py``.


### Data Gathering and Filtering Process Overview

1. **Search**
    - Get search results (i.e. videos) for list of search queries
2. **Channel List**
    - Apply filters to search results 
    - Compile list of channels from filtered search results
3. **Channel Video List**
    - Get (full) list of videos (and shorts) for each channel
    - Note: Not full video metadata - only ID, title, view count, approximate upload date, duration (if available...)
4. **Filtered Video List**
    - Apply filters to video list
      - most importantly: upload date (approximately) within timeframe of interest (except for shorts - not available...)
      - view count threshold, language, etc.
    - *Potentially: Classify video titles into relevant and irrelevant (some channels might have majority non-finance content)*
5. **Video Transcripts and Metadata**
    - Get full metadata and transcripts for each video
    - Note: This is the most time-consuming part of the process -> filtering should be done as much as possible before this step
6. **Filtered Transcripts and Metadata**
    - Apply filters which couldn't be applied before (such as exact video upload date & upload date for shorts)
    - Save final selection of data to 'data' folder 
    - Create an index file for the videos included in the dataset


### Step 1: Get search query results

In [5]:
# scraping setup
from yt_search_lists import queries_stocks, queries_funds, queries_crypto, queries_commodities

active_queries = [(queries_stocks, "stocks"), 
              (queries_funds, "funds"), 
              (queries_crypto, "crypto"), 
              (queries_commodities, "commodities")
              ]

ytsearch_n = 30 # number of search results to return for each query (before any filtering!)

In [6]:
### search query execution
import time
import os

output_dir = "1_search"
infojson_dir = f"{output_dir}/info_jsons"
if not os.path.exists(output_dir):
    os.makedirs(output_dir)
if not os.path.exists(infojson_dir):
    os.makedirs(infojson_dir)


start_time = time.time()

# implementation in loop
for query_list, query_type in active_queries:
    # create files with headers if necessary
    if not os.path.exists(f"{output_dir}/searchresults_{query_type}.csv"):
        with open(f"{output_dir}/searchresults_{query_type}.csv", "w") as f:
            f.write("upload_date;language;duration;uploader_id;channel_name;video_id;title;tags;query\n")
            
    for idx, query in enumerate(query_list):

        query_start_time = time.time()

        query = query.replace(" ", "+") # youtube search uses + instead of spaces
        output_file = f"{output_dir}/searchresults_{query_type}.csv"

        # download query results using yt-dlp
        !yt-dlp \
        --skip-download \
        --parse-metadata "video::(?P<formats>)" \
        --parse-metadata "video::(?P<automatic_captions>)" \
        --parse-metadata "video::(?P<thumbnails>)" \
        --no-abort-on-error \
        --sleep-requests 1 \
        --write-info-json \
        --quiet \
        --compat-options no-playlist-metafiles \
        --output "infojson:{infojsons_dir}/%(uploader_id)s_%(id)s" \
        --print-to-file "%(upload_date)s;%(language)s;%(duration)s;%(uploader_id)s;%(channel)s;%(id)s;%(title)s;%(tags)s;{query}" {output_file} \
        "ytsearch{ytsearch_n}:{query}"

        # print progress
        t = time.time()
        minutes, seconds = divmod(t - query_start_time, 60) # running time for current query
        print("-"*60)
        print(f"-- Retrieved {ytsearch_n} search results in {minutes} min {round(seconds)} sec for query: '{query}'")
        print(f"-- {idx+1}/{len(query_list)} queries done for query type: {query_type}")
        minutes, seconds = divmod(t - start_time, 60) # total running time
        print(f"-- Total runtime: {minutes} min {round(seconds)} sec.")

------------------------------------------------------------
-- Retrieved 30 search results in 163.7499852180481 seconds for query: 'best+stocks+to+buy'
-- 1/43 queries done for query type:stocks!
-- Total runtime: 2.0 min 43.749985218048096 sec.
------------------------------------------------------------
------------------------------------------------------------
-- Retrieved 30 search results in 157.37215304374695 seconds for query: 'top+shares+to+buytop+stocks+to+invest+in'
-- 2/43 queries done for query type:stocks!
-- Total runtime: 5.0 min 21.122138261795044 sec.
------------------------------------------------------------
------------------------------------------------------------
-- Retrieved 30 search results in 156.1470592021942 seconds for query: 'undervalued+stocks'
-- 3/43 queries done for query type:stocks!
-- Total runtime: 7.0 min 57.26919746398926 sec.
------------------------------------------------------------
------------------------------------------------------



------------------------------------------------------------
-- Retrieved 30 search results in 162.0878562927246 seconds for query: 'best+stocks+august+2018'
-- 27/43 queries done for query type:stocks!
-- Total runtime: 71.0 min 0.95076584815979 sec.
------------------------------------------------------------
------------------------------------------------------------
-- Retrieved 30 search results in 156.58734965324402 seconds for query: 'best+stocks+september+2019'
-- 28/43 queries done for query type:stocks!
-- Total runtime: 73.0 min 37.53811550140381 sec.
------------------------------------------------------------
------------------------------------------------------------
-- Retrieved 30 search results in 152.1039080619812 seconds for query: 'best+stocks+october+2020'
-- 29/43 queries done for query type:stocks!
-- Total runtime: 76.0 min 9.64202356338501 sec.
------------------------------------------------------------
-------------------------------------------------------

### Step 2: Filter search results and compile unique channel list

In [1]:
from scraping_utils import load_channel_search_results, filter_search_results
import pandas as pd

### pre-filtering channels

## read in channel search csvs
channel_search_dir = "1_search"
query_types_to_include = ["stocks", "funds", "crypto", "commodities"]

df = load_channel_search_results(channel_search_dir, query_types_to_include)

# apply filters to initial video search results and obtain df of unique channels for which at least one video passes filters

channels = filter_search_results(df, 
                                 query_types_to_include=query_types_to_include, # already defined above
                                 langs_to_include= ["en"], 
                                 min_view_count = 1000,
                                 min_channel_follower_count = 10000, 
                                 max_duration = 45*60 # (sec), to filter out super long-form podcasts etc.
)

fixed line (semicolons in tags): 20220207;en;101;@MonarchNetworth;Monarch Networth;hqHu39Sgzqk;#GemsofMNCL | Stock Picks | 2021;['MNCL', 'Monarch', 'Networth', 'Capital', 'Ltd', 'vaibhav', 'shah', 'mnclgroup.com'];stock+picks+2021
Could not find file: 1_search/info_jsons/@j.m0ney133_DwQKPtKtW7c.info.json
channel_follower_count missing in: 1_search/info_jsons/@moneywise1379_hwnnKPPIr0I.info.json
Could not find file: 1_search/info_jsons/@marketpost.inbystockmarket_82exJU-IMYk.info.json
Could not find file: 1_search/info_jsons/@LewisHarding.Finance_xxVYLFd3JO8.info.json
Could not find file: 1_search/info_jsons/@capturn.official_DcVr1LsZqRY.info.json
------------------------------------------------------------
Read in 2008 videos (query categories: ['stocks' 'funds' 'crypto' 'commodities'])
Missing value counts:
upload_date                 0
language                  164
duration                    1
uploader_id                 0
channel_name                0
video_id                    0


In [None]:
# rename all files in channel_search/info_jsons 
# previous naming scheme: {uploader_id}_{video_id}.{ext}_info.info.json
# new naming scheme: {uploader_id}_{video_id}.info.json

# note: fixed earlier version of yt-dlp call so this is no longer necessary
from scraping_utils import rename_infojsons

#infojsons_dir = "channel_search/info_jsons"
#rename_infojsons(infojsons_dir)



In [2]:
# remove news (and other irrelevant) channels 
from yt_search_lists import news_channels, non_relevant_channels # (formerly: other_channels_to_ignore)

print(f"number of channels before dropping any: {len(channels)}")
channels = channels[~channels.uploader_id.isin(news_channels)]
print(f"number of channels after dropping news_channels: {len(channels)}")
channels = channels[~channels.uploader_id.isin(non_relevant_channels)].reset_index(drop=True)
print(f"number of channels after dropping non_relevant_channels: {len(channels)}")

# note: entries have been added to non_relevant_channels after this cell was last run (and filtered out in a later step)

number of channels before dropping any: 324
number of channels after dropping news_channels: 311
number of channels after dropping other_channels_to_ignore: 307


In [37]:
## step 4: save to csv, with index
channel_search_dir = "1_search"
channels.to_csv(f"{channel_search_dir}/filtered_channels.csv", sep=";", header=True, index=True)

### Step 3: Get full video lists for channels

In [3]:
# load from csv, with index col
channel_list_dir = "2_channel_list"
channels = pd.read_csv(f"{channel_list_dir}/filtered_channels.csv", sep=";", header=0, index_col=0)
print(f"number of channels after loading from csv: {len(channels)}")

number of channels after loading from csv: 309


In [97]:
import time
import os
### get playlist info for channels (video ids, titles, approx. upload dates, etc. of all videos and shorts)

output_dir = "3_channel_video_lists/channel_playlists"
channel_list_dir = "2_channel_list"
start_at_df_idx = 0 # adjust in case a previous run was interrupted

# load channels from csv
channels = pd.read_csv(f"{channel_list_dir}/filtered_channels.csv", sep=";", header=0, index_col=0)

if not os.path.exists(output_dir):
    os.makedirs(output_dir)
skipped_channels = []
start_time = time.time()

channels_to_iterate = channels.iloc[start_at_df_idx:]
#channels_to_iterate = channels[channels['uploader_id'].isin(['@investingforall5118', '@JoelKlattShow'])].reset_index(drop=True)
for idx, row in channels_to_iterate.iterrows():

    channel_start_time = time.time()
    channel_id = row['uploader_id']
    output_file = f"{output_dir}/video_list_{channel_id}.csv"

    # only proceed if file doesn't exist yet
    if not os.path.isfile(output_file):
        with open(output_file, "w") as f:
            f.write("channel_id;video_id;approx_upload_date;duration;yt_video_type;view_count;title\n")

        # get --flat-playlist info for all normal videos and shorts (-> no live streams!)
        for yt_video_type in ['video', 'short']:
            !yt-dlp \
            --skip-download \
            --no-abort-on-error \
            --sleep-requests 1 \
            --flat-playlist \
            --compat-options no-playlist-metafiles \
            --extractor-args "youtubetab:approximate_date=True" \
            --quiet \
            --print-to-file "{channel_id};%(id)s;%(upload_date)s;%(duration)s;{yt_video_type};%(view_count)s;%(title)s" {output_file} \
            "https://www.youtube.com/{channel_id}/{yt_video_type}s"

        # print progress
        t = time.time()
        print("-"*60)
        minutes, seconds = divmod(t - channel_start_time, 60)
        print(f"-- Retrieved video list for channel {channel_id} in {minutes} min {round(seconds)} sec.")
        print(f"-- {idx+1}/{len(channels_to_iterate)} channels done.")
        minutes, seconds = divmod(t - start_time, 60) # total running time
        print(f"-- Total runtime: {minutes} min {round(seconds)} sec.")

    else:
        print(f"skipping channel {channel_id} (file already exists)")
        skipped_channels.append(channel_id)

print(f"skipped channels: {skipped_channels}")


skipping channel @JoelKlattShow (file already exists)
skipping channel @investingforall5118 (file already exists)
skipped channels: ['@JoelKlattShow', '@investingforall5118']


In [1]:
### cleaning the csvs (see channel_search_utils.fix_channel_playlist_csvs for details)

# we proceed as follows:
    # 1. create new folder for fixed csvs
    # 2. iterate over all csvs in backup folder (read-only!) and write lines (with fixes if necessary) to new csvs in new folder
    # 3. verify new csvs for correctness
    # 4. rename old folder to backup_pre_title_fix and new folder to old folder name

# this cell performs steps 1-3

from scraping_utils import fix_channel_playlist_csvs, load_channel_playlist_csvs, check_channel_playlist_df

old_dir = "3_channel_video_lists/channel_playlists"
new_dir = "3_channel_video_lists/channel_playlists_fixed"

new_dir = old_dir # COMMENT OUT TO ACTUALLY PERFORM FIXES!!!

#faulty_lines = fix_channel_playlist_csvs(old_dir=, new_dir, return_faulty_lines=True)
df = load_channel_playlist_csvs(new_dir)
check_channel_playlist_df(df)

empty file: video_list_@investingforall5118.csv
empty file: video_list_@JoelKlattShow.csv
------------------------------------------------------------
Read in 414016 videos from 306 channels.
------------------------------------------------------------
data types:
channel_id                    object
video_id                      object
approx_upload_date    datetime64[ns]
duration                     float64
yt_video_type                 object
view_count                   float64
title                         object
dtype: object
------------------------------------------------------------
n videos with na values: 2/354703
n shorts with na values: 59313/59313 (duration and upload date aren't available for shorts)


In [3]:
# check videos with na values
# these should only be due to issues from youtube's side (e.g. video was deleted)
# (note: apparently there are videos without a viewcount (e.g. m5PM6kDaNrs) or with other weird issues (e.g. 52EWV2rQuZI)
df[df.yt_video_type == 'video'][df[df.yt_video_type == 'video'].isna().any(axis=1)]


Unnamed: 0,channel_id,video_id,approx_upload_date,duration,yt_video_type,view_count,title
256141,@RevZilla,m5PM6kDaNrs,2010-01-27,263.0,video,,Dainese Hot Weather Mesh Jacket Options - Por...
275536,@SchoolOfTrade,52EWV2rQuZI,2010-01-27,,video,0.0,08 27 09SwingHigh


In [2]:
# check na counts for every column
df.isna().sum()

channel_id                0
video_id                  0
approx_upload_date    59313
duration              59314
yt_video_type             0
view_count                1
title                     0
dtype: int64

In [18]:
# step 4: if check looks good, make old folder into backup dir and rename new folder to old folder name

import os

old_dir = "3_channel_video_lists/channel_playlists"
new_dir = "3_channel_video_lists/channel_playlists_fixed"

backup_dir = f"{old_dir}_backup_pre_fix"

if not os.path.exists(new_dir):
    print(f"error: new folder {new_dir} does not exist, please create it first")
elif os.path.exists(backup_dir):
    print(f"error: backup folder {backup_dir} already exists, please delete it first.")
else:
    # rename old folder to backup folder name
    os.rename(old_dir, backup_dir)
    # rename new folder to old folder name
    os.rename(new_dir, old_dir)
    print(f"Successfully renamed folders.")

error: new folder 3_channel_video_lists/channel_playlists_fixed does not exist, please create it first


### Step 4: Filter channel video lists for final round of metadata & transcript scraping

In [2]:
import pandas as pd
from scraping_utils import load_channel_playlist_csvs, filter_channel_playlist_df
# filters (note: we can't filter shorts by upload date!)
df = load_channel_playlist_csvs("3_channel_video_lists/channel_playlists")

df = filter_channel_playlist_df(df, 
                                max_duration = 45*60, # (sec), to filter out super long-form podcasts etc.
                                min_view_count = 10000,
                                min_upload_date = pd.to_datetime("2015-01-01"), # consider approximate dates!
                                max_upload_date = pd.to_datetime("2022-12-31") # consider approximate dates!
)




empty file: video_list_@investingforall5118.csv
empty file: video_list_@JoelKlattShow.csv
------------------------------------------------------------
Read in 414016 videos from 306 channels.
------------------------------------------------------------
number of videos before any filters: 414016
number of videos after duration filter: 402866
number of videos after view count filter: 130924
number of videos after upload date filter: 86477
shorts/normal videos: 21828/64649


In [4]:
# save filtered video list to csv 
df = df.sort_values(by=["channel_id", "approx_upload_date"], ascending=[True, False]).reset_index(drop=True)
df.to_csv("4_filtered_videos/filtered_videos.csv", sep=";", header=True, index=True)

### Step 5: Get full metadata and transcripts for each video

In [1]:
# download transcripts and metadata for remaining videos 
# note: this cell can be stopped and resumed 

import time
import pandas as pd

download_dir = "5_transcripts_and_metadata"
archive_file = f"{download_dir}/downloaded_index.csv"

# read in filtered video list
df = pd.read_csv("4_filtered_videos/filtered_videos.csv", sep=";", header=0, index_col=0)

# get IDs to to download (by checking against already downloaded IDs)
downloaded = pd.read_csv(archive_file, sep=";", header=0, index_col=0)
download_list = df['video_id'].to_list()
archive_list = downloaded['video_id'].to_list()
ids_to_download = [x for x in download_list if x not in archive_list]

print(f"Number of videos already downloaded: {len(archive_list)}")
print(f"Number of videos left to download: {len(ids_to_download)}")
print(f"-"*60)


progress_print_bs = 50 # print progress every x videos
start_time = time.time()
for idx, video_id in enumerate(ids_to_download):
    
    !yt-dlp \
    --no-abort-on-error \
    --write-auto-sub \
    --write-thumbnail \
    --skip-download \
    --sub-lang en \
    --sub-format ttml \
    --parse-metadata "video::(?P<formats>)" \
    --parse-metadata "video::(?P<automatic_captions>)" \
    --sponsorblock-mark all \
    --write-info-json \
    --output "infojson:{download_dir}/infojsons/%(uploader_id)s_%(id)s" \
    --output "thumbnail:{download_dir}/thumbnails/%(uploader_id)s_%(id)s_thumbnail.%(ext)s" \
    --output "subtitle:{download_dir}/transcripts/%(uploader_id)s_%(id)s_subs.%(ext)s" \
    --print-to-file "%(uploader_id)s;%(id)s" {archive_file} \
    --sleep-requests 0.2 \
    --quiet \
    "https://www.youtube.com/watch?v={video_id}"


    # print progress
    if (idx+1) % progress_print_bs == 0:
        t = time.time()
        minutes, seconds = divmod(t - start_time, 60)
        hours, minutes = divmod(minutes, 60)
        print(f"-- Downloaded {idx+1}/{len(ids_to_download)} IDs in {hours:.0f} h {minutes:.0f} min {seconds:.1f} sec since start.")

Number of videos already downloaded: 60188
Number of videos left to download: 26289
------------------------------------------------------------


ERROR: [youtube] 7zuW1TBVSF0: Private video. Sign in if you've been granted access to this video
ERROR: [youtube] 2yB183XtOQg: Private video. Sign in if you've been granted access to this video
ERROR: [youtube] bs4sKe_WMck: Private video. Sign in if you've been granted access to this video


-- Downloaded 50/26289 IDs in 0 h 2 min 4.3 sec since start.




-- Downloaded 100/26289 IDs in 0 h 4 min 15.4 sec since start.




-- Downloaded 150/26289 IDs in 0 h 6 min 34.2 sec since start.
-- Downloaded 200/26289 IDs in 0 h 9 min 0.0 sec since start.




-- Downloaded 250/26289 IDs in 0 h 11 min 24.5 sec since start.
-- Downloaded 300/26289 IDs in 0 h 13 min 48.5 sec since start.
-- Downloaded 350/26289 IDs in 0 h 16 min 14.1 sec since start.
-- Downloaded 400/26289 IDs in 0 h 18 min 34.9 sec since start.
-- Downloaded 450/26289 IDs in 0 h 21 min 46.6 sec since start.
-- Downloaded 500/26289 IDs in 0 h 24 min 28.4 sec since start.
-- Downloaded 550/26289 IDs in 0 h 27 min 4.1 sec since start.
-- Downloaded 600/26289 IDs in 0 h 29 min 31.6 sec since start.
-- Downloaded 650/26289 IDs in 0 h 31 min 58.7 sec since start.
-- Downloaded 700/26289 IDs in 0 h 34 min 23.0 sec since start.




-- Downloaded 750/26289 IDs in 0 h 36 min 39.8 sec since start.




-- Downloaded 800/26289 IDs in 0 h 39 min 3.0 sec since start.




-- Downloaded 850/26289 IDs in 0 h 41 min 22.3 sec since start.




-- Downloaded 900/26289 IDs in 0 h 43 min 36.1 sec since start.




-- Downloaded 950/26289 IDs in 0 h 45 min 53.5 sec since start.




-- Downloaded 1000/26289 IDs in 0 h 48 min 10.4 sec since start.




-- Downloaded 1050/26289 IDs in 0 h 50 min 33.7 sec since start.
-- Downloaded 1100/26289 IDs in 0 h 53 min 10.6 sec since start.
-- Downloaded 1150/26289 IDs in 0 h 55 min 29.7 sec since start.
-- Downloaded 1200/26289 IDs in 0 h 57 min 57.0 sec since start.
-- Downloaded 1250/26289 IDs in 1 h 0 min 14.8 sec since start.
-- Downloaded 1300/26289 IDs in 1 h 2 min 54.2 sec since start.
-- Downloaded 1350/26289 IDs in 1 h 5 min 55.0 sec since start.




-- Downloaded 1400/26289 IDs in 1 h 9 min 34.1 sec since start.




-- Downloaded 1450/26289 IDs in 1 h 11 min 43.2 sec since start.
-- Downloaded 1500/26289 IDs in 1 h 13 min 50.7 sec since start.
-- Downloaded 1550/26289 IDs in 1 h 15 min 53.0 sec since start.
-- Downloaded 1600/26289 IDs in 1 h 17 min 55.7 sec since start.
-- Downloaded 1650/26289 IDs in 1 h 20 min 0.8 sec since start.
-- Downloaded 1700/26289 IDs in 1 h 22 min 6.1 sec since start.
-- Downloaded 1750/26289 IDs in 1 h 24 min 11.2 sec since start.




-- Downloaded 1800/26289 IDs in 1 h 26 min 16.6 sec since start.
-- Downloaded 1850/26289 IDs in 1 h 28 min 24.9 sec since start.




-- Downloaded 1900/26289 IDs in 1 h 30 min 30.0 sec since start.
-- Downloaded 1950/26289 IDs in 1 h 32 min 34.2 sec since start.
-- Downloaded 2000/26289 IDs in 1 h 34 min 38.6 sec since start.




-- Downloaded 2050/26289 IDs in 1 h 36 min 58.7 sec since start.




-- Downloaded 2100/26289 IDs in 1 h 39 min 25.6 sec since start.
-- Downloaded 2150/26289 IDs in 1 h 41 min 30.7 sec since start.
-- Downloaded 2200/26289 IDs in 1 h 43 min 47.9 sec since start.
-- Downloaded 2250/26289 IDs in 1 h 45 min 59.5 sec since start.
-- Downloaded 2300/26289 IDs in 1 h 48 min 5.3 sec since start.
-- Downloaded 2350/26289 IDs in 1 h 50 min 10.8 sec since start.
-- Downloaded 2400/26289 IDs in 1 h 52 min 18.0 sec since start.
-- Downloaded 2450/26289 IDs in 1 h 54 min 24.8 sec since start.
-- Downloaded 2500/26289 IDs in 1 h 56 min 31.3 sec since start.
-- Downloaded 2550/26289 IDs in 1 h 58 min 39.9 sec since start.
-- Downloaded 2600/26289 IDs in 2 h 0 min 52.8 sec since start.
-- Downloaded 2650/26289 IDs in 2 h 2 min 58.7 sec since start.
-- Downloaded 2700/26289 IDs in 2 h 5 min 6.5 sec since start.
-- Downloaded 2750/26289 IDs in 2 h 7 min 12.1 sec since start.
-- Downloaded 2800/26289 IDs in 2 h 9 min 20.1 sec since start.




-- Downloaded 2850/26289 IDs in 2 h 11 min 52.8 sec since start.
-- Downloaded 2900/26289 IDs in 2 h 14 min 3.1 sec since start.
-- Downloaded 2950/26289 IDs in 2 h 16 min 13.4 sec since start.




-- Downloaded 3000/26289 IDs in 2 h 18 min 28.8 sec since start.
-- Downloaded 3050/26289 IDs in 2 h 20 min 44.4 sec since start.




-- Downloaded 3100/26289 IDs in 2 h 23 min 9.3 sec since start.
-- Downloaded 3150/26289 IDs in 2 h 25 min 25.1 sec since start.
-- Downloaded 3200/26289 IDs in 2 h 27 min 43.4 sec since start.




-- Downloaded 3250/26289 IDs in 2 h 30 min 2.6 sec since start.
-- Downloaded 3300/26289 IDs in 2 h 32 min 56.4 sec since start.
-- Downloaded 3350/26289 IDs in 2 h 36 min 4.5 sec since start.
-- Downloaded 3400/26289 IDs in 2 h 38 min 13.6 sec since start.
-- Downloaded 3450/26289 IDs in 2 h 40 min 17.8 sec since start.




-- Downloaded 3500/26289 IDs in 2 h 42 min 22.3 sec since start.




-- Downloaded 3550/26289 IDs in 2 h 44 min 26.8 sec since start.




-- Downloaded 3600/26289 IDs in 2 h 46 min 36.8 sec since start.




-- Downloaded 3650/26289 IDs in 2 h 48 min 40.8 sec since start.




-- Downloaded 3700/26289 IDs in 2 h 50 min 48.2 sec since start.




-- Downloaded 3750/26289 IDs in 2 h 52 min 54.3 sec since start.




-- Downloaded 3800/26289 IDs in 2 h 54 min 59.6 sec since start.




-- Downloaded 3850/26289 IDs in 2 h 57 min 28.8 sec since start.
^C
-- Downloaded 3900/26289 IDs in 2 h 59 min 56.7 sec since start.
-- Downloaded 3950/26289 IDs in 3 h 2 min 29.6 sec since start.




-- Downloaded 4000/26289 IDs in 3 h 4 min 57.9 sec since start.
-- Downloaded 4050/26289 IDs in 3 h 7 min 9.7 sec since start.
-- Downloaded 4100/26289 IDs in 3 h 9 min 18.9 sec since start.
-- Downloaded 4150/26289 IDs in 3 h 11 min 30.3 sec since start.




-- Downloaded 4200/26289 IDs in 3 h 13 min 47.4 sec since start.
-- Downloaded 4250/26289 IDs in 3 h 15 min 54.5 sec since start.
-- Downloaded 4300/26289 IDs in 3 h 18 min 22.8 sec since start.
-- Downloaded 4350/26289 IDs in 3 h 20 min 46.7 sec since start.
-- Downloaded 4400/26289 IDs in 3 h 23 min 17.6 sec since start.
-- Downloaded 4450/26289 IDs in 4 h 2 min 28.8 sec since start.


ERROR: Preprocessing: Unable to communicate with SponsorBlock API: HTTPSConnectionPool(host='sponsor.ajay.app', port=443): Read timed out. (read timeout=20.0)
ERROR: Preprocessing: Unable to communicate with SponsorBlock API: HTTPSConnectionPool(host='sponsor.ajay.app', port=443): Read timed out. (read timeout=20.0)
ERROR: Preprocessing: Unable to communicate with SponsorBlock API: HTTPSConnectionPool(host='sponsor.ajay.app', port=443): Read timed out. (read timeout=20.0)
ERROR: Preprocessing: Unable to communicate with SponsorBlock API: HTTPSConnectionPool(host='sponsor.ajay.app', port=443): Read timed out. (read timeout=20.0)
ERROR: Preprocessing: Unable to communicate with SponsorBlock API: HTTPSConnectionPool(host='sponsor.ajay.app', port=443): Read timed out. (read timeout=20.0)
ERROR: Preprocessing: Unable to communicate with SponsorBlock API: HTTPSConnectionPool(host='sponsor.ajay.app', port=443): Read timed out. (read timeout=20.0)
ERROR: Preprocessing: Unable to communicate wi

-- Downloaded 4500/26289 IDs in 4 h 10 min 12.9 sec since start.


ERROR: Preprocessing: Unable to communicate with SponsorBlock API: HTTP Error 502: Bad Gateway
ERROR: Preprocessing: Unable to communicate with SponsorBlock API: HTTP Error 502: Bad Gateway


-- Downloaded 4550/26289 IDs in 4 h 14 min 37.4 sec since start.
-- Downloaded 4600/26289 IDs in 4 h 17 min 47.3 sec since start.
-- Downloaded 4650/26289 IDs in 4 h 21 min 2.1 sec since start.
-- Downloaded 4700/26289 IDs in 4 h 24 min 21.7 sec since start.




-- Downloaded 4750/26289 IDs in 4 h 27 min 45.4 sec since start.




-- Downloaded 4800/26289 IDs in 4 h 31 min 7.2 sec since start.
-- Downloaded 4850/26289 IDs in 4 h 34 min 25.0 sec since start.
-- Downloaded 4900/26289 IDs in 4 h 37 min 40.0 sec since start.
-- Downloaded 4950/26289 IDs in 4 h 41 min 10.5 sec since start.
-- Downloaded 5000/26289 IDs in 4 h 44 min 27.9 sec since start.
-- Downloaded 5050/26289 IDs in 10 h 31 min 46.8 sec since start.
-- Downloaded 5100/26289 IDs in 10 h 35 min 34.4 sec since start.
-- Downloaded 5150/26289 IDs in 10 h 39 min 18.7 sec since start.




-- Downloaded 5200/26289 IDs in 10 h 43 min 2.3 sec since start.
-- Downloaded 5250/26289 IDs in 10 h 46 min 56.5 sec since start.
-- Downloaded 5300/26289 IDs in 10 h 50 min 54.7 sec since start.




-- Downloaded 5350/26289 IDs in 10 h 54 min 41.8 sec since start.
-- Downloaded 5400/26289 IDs in 10 h 58 min 34.9 sec since start.
-- Downloaded 5450/26289 IDs in 11 h 2 min 39.9 sec since start.
-- Downloaded 5500/26289 IDs in 11 h 7 min 14.9 sec since start.
-- Downloaded 5550/26289 IDs in 11 h 11 min 51.4 sec since start.
-- Downloaded 5600/26289 IDs in 11 h 15 min 35.5 sec since start.




-- Downloaded 5650/26289 IDs in 11 h 19 min 21.6 sec since start.




-- Downloaded 5700/26289 IDs in 11 h 23 min 21.6 sec since start.




-- Downloaded 5750/26289 IDs in 11 h 27 min 27.6 sec since start.




-- Downloaded 5800/26289 IDs in 11 h 31 min 20.4 sec since start.




-- Downloaded 5850/26289 IDs in 11 h 35 min 12.5 sec since start.




-- Downloaded 5900/26289 IDs in 11 h 39 min 13.9 sec since start.




-- Downloaded 5950/26289 IDs in 11 h 43 min 12.7 sec since start.




-- Downloaded 6000/26289 IDs in 11 h 47 min 2.8 sec since start.




-- Downloaded 6050/26289 IDs in 11 h 50 min 51.2 sec since start.




-- Downloaded 6100/26289 IDs in 11 h 54 min 42.3 sec since start.




-- Downloaded 6150/26289 IDs in 11 h 58 min 34.6 sec since start.




-- Downloaded 6200/26289 IDs in 12 h 2 min 22.4 sec since start.




-- Downloaded 6250/26289 IDs in 12 h 6 min 15.7 sec since start.




-- Downloaded 6300/26289 IDs in 12 h 10 min 13.9 sec since start.




-- Downloaded 6350/26289 IDs in 12 h 14 min 12.6 sec since start.
-- Downloaded 6400/26289 IDs in 12 h 18 min 53.5 sec since start.
-- Downloaded 6450/26289 IDs in 12 h 23 min 8.7 sec since start.
-- Downloaded 6500/26289 IDs in 12 h 27 min 7.5 sec since start.
-- Downloaded 6550/26289 IDs in 12 h 31 min 50.0 sec since start.




-- Downloaded 6600/26289 IDs in 12 h 35 min 36.5 sec since start.




-- Downloaded 6650/26289 IDs in 12 h 39 min 25.8 sec since start.
-- Downloaded 6700/26289 IDs in 12 h 43 min 15.3 sec since start.




-- Downloaded 6750/26289 IDs in 12 h 47 min 1.4 sec since start.




-- Downloaded 6800/26289 IDs in 12 h 50 min 48.1 sec since start.
-- Downloaded 6850/26289 IDs in 12 h 55 min 12.0 sec since start.




-- Downloaded 6900/26289 IDs in 12 h 59 min 18.9 sec since start.




-- Downloaded 6950/26289 IDs in 13 h 3 min 3.3 sec since start.




-- Downloaded 7000/26289 IDs in 13 h 6 min 46.6 sec since start.
-- Downloaded 7050/26289 IDs in 13 h 10 min 28.9 sec since start.
^C
^C
-- Downloaded 7100/26289 IDs in 13 h 14 min 26.4 sec since start.




-- Downloaded 7150/26289 IDs in 13 h 18 min 12.3 sec since start.
-- Downloaded 7200/26289 IDs in 13 h 21 min 55.3 sec since start.
-- Downloaded 7250/26289 IDs in 13 h 25 min 39.1 sec since start.
-- Downloaded 7300/26289 IDs in 13 h 29 min 28.2 sec since start.
-- Downloaded 7350/26289 IDs in 13 h 33 min 10.5 sec since start.
-- Downloaded 7400/26289 IDs in 13 h 37 min 15.0 sec since start.
-- Downloaded 7450/26289 IDs in 13 h 41 min 19.9 sec since start.
-- Downloaded 7500/26289 IDs in 13 h 45 min 0.9 sec since start.




-- Downloaded 7550/26289 IDs in 13 h 48 min 57.6 sec since start.
-- Downloaded 7600/26289 IDs in 13 h 52 min 37.9 sec since start.
-- Downloaded 7650/26289 IDs in 13 h 56 min 16.5 sec since start.
-- Downloaded 7700/26289 IDs in 13 h 59 min 58.2 sec since start.




-- Downloaded 7750/26289 IDs in 14 h 3 min 48.8 sec since start.
-- Downloaded 7800/26289 IDs in 14 h 7 min 57.3 sec since start.
-- Downloaded 7850/26289 IDs in 14 h 11 min 56.8 sec since start.
-- Downloaded 7900/26289 IDs in 14 h 15 min 36.3 sec since start.




-- Downloaded 7950/26289 IDs in 14 h 19 min 18.6 sec since start.
-- Downloaded 8000/26289 IDs in 14 h 23 min 4.2 sec since start.




-- Downloaded 8050/26289 IDs in 14 h 26 min 54.2 sec since start.




-- Downloaded 8100/26289 IDs in 14 h 30 min 45.2 sec since start.




-- Downloaded 8150/26289 IDs in 14 h 34 min 39.5 sec since start.




-- Downloaded 8200/26289 IDs in 14 h 38 min 27.7 sec since start.




-- Downloaded 8250/26289 IDs in 14 h 42 min 15.7 sec since start.
-- Downloaded 8300/26289 IDs in 14 h 45 min 57.6 sec since start.
-- Downloaded 8350/26289 IDs in 14 h 49 min 42.6 sec since start.




-- Downloaded 8400/26289 IDs in 14 h 53 min 25.5 sec since start.




-- Downloaded 8450/26289 IDs in 14 h 56 min 43.3 sec since start.
-- Downloaded 8500/26289 IDs in 14 h 59 min 44.9 sec since start.
-- Downloaded 8550/26289 IDs in 15 h 2 min 44.1 sec since start.




-- Downloaded 8600/26289 IDs in 15 h 5 min 44.5 sec since start.
-- Downloaded 8650/26289 IDs in 15 h 8 min 47.4 sec since start.
-- Downloaded 8700/26289 IDs in 15 h 12 min 6.5 sec since start.
-- Downloaded 8750/26289 IDs in 15 h 15 min 8.6 sec since start.
-- Downloaded 8800/26289 IDs in 15 h 18 min 13.3 sec since start.




-- Downloaded 8850/26289 IDs in 15 h 21 min 30.6 sec since start.
-- Downloaded 8900/26289 IDs in 15 h 26 min 1.7 sec since start.
-- Downloaded 8950/26289 IDs in 15 h 30 min 27.9 sec since start.
-- Downloaded 9000/26289 IDs in 15 h 33 min 42.5 sec since start.
-- Downloaded 9050/26289 IDs in 15 h 36 min 46.1 sec since start.




-- Downloaded 9100/26289 IDs in 23 h 29 min 42.6 sec since start.
-- Downloaded 9150/26289 IDs in 23 h 32 min 44.7 sec since start.
-- Downloaded 9200/26289 IDs in 23 h 35 min 57.8 sec since start.
-- Downloaded 9250/26289 IDs in 23 h 40 min 1.5 sec since start.
-- Downloaded 9300/26289 IDs in 23 h 44 min 6.2 sec since start.




-- Downloaded 9350/26289 IDs in 23 h 47 min 55.2 sec since start.
-- Downloaded 9400/26289 IDs in 23 h 51 min 58.2 sec since start.




-- Downloaded 9450/26289 IDs in 23 h 55 min 51.9 sec since start.




-- Downloaded 9500/26289 IDs in 23 h 59 min 47.7 sec since start.
-- Downloaded 9550/26289 IDs in 24 h 4 min 4.5 sec since start.
-- Downloaded 9600/26289 IDs in 24 h 7 min 48.9 sec since start.
-- Downloaded 9650/26289 IDs in 24 h 11 min 37.8 sec since start.
-- Downloaded 9700/26289 IDs in 24 h 15 min 31.5 sec since start.




-- Downloaded 9750/26289 IDs in 24 h 19 min 19.4 sec since start.
-- Downloaded 9800/26289 IDs in 24 h 23 min 10.1 sec since start.
-- Downloaded 9850/26289 IDs in 24 h 27 min 10.2 sec since start.
-- Downloaded 9900/26289 IDs in 24 h 31 min 4.0 sec since start.
-- Downloaded 9950/26289 IDs in 24 h 34 min 53.8 sec since start.
-- Downloaded 10000/26289 IDs in 24 h 38 min 51.9 sec since start.
-- Downloaded 10050/26289 IDs in 24 h 43 min 4.9 sec since start.
-- Downloaded 10100/26289 IDs in 24 h 47 min 5.5 sec since start.
-- Downloaded 10150/26289 IDs in 24 h 51 min 4.2 sec since start.
-- Downloaded 10200/26289 IDs in 24 h 55 min 0.3 sec since start.
-- Downloaded 10250/26289 IDs in 24 h 59 min 4.3 sec since start.
-- Downloaded 10300/26289 IDs in 25 h 3 min 11.4 sec since start.
-- Downloaded 10350/26289 IDs in 25 h 6 min 28.6 sec since start.
-- Downloaded 10400/26289 IDs in 25 h 12 min 8.8 sec since start.
-- Downloaded 10450/26289 IDs in 25 h 18 min 58.6 sec since start.
-- Downlo



-- Downloaded 10900/26289 IDs in 26 h 11 min 56.3 sec since start.




-- Downloaded 10950/26289 IDs in 26 h 14 min 49.9 sec since start.




-- Downloaded 11000/26289 IDs in 26 h 18 min 22.9 sec since start.




-- Downloaded 11050/26289 IDs in 26 h 21 min 18.9 sec since start.
-- Downloaded 11100/26289 IDs in 26 h 24 min 23.3 sec since start.
-- Downloaded 11150/26289 IDs in 26 h 27 min 27.7 sec since start.
-- Downloaded 11200/26289 IDs in 26 h 30 min 32.0 sec since start.
-- Downloaded 11250/26289 IDs in 26 h 33 min 47.7 sec since start.


ERROR: The downloaded file is empty


-- Downloaded 11300/26289 IDs in 26 h 36 min 48.5 sec since start.
-- Downloaded 11350/26289 IDs in 26 h 39 min 60.0 sec since start.
-- Downloaded 11400/26289 IDs in 26 h 43 min 10.7 sec since start.
-- Downloaded 11450/26289 IDs in 26 h 46 min 13.4 sec since start.
-- Downloaded 11500/26289 IDs in 26 h 49 min 21.9 sec since start.
-- Downloaded 11550/26289 IDs in 26 h 52 min 29.8 sec since start.
-- Downloaded 11600/26289 IDs in 26 h 55 min 36.1 sec since start.
-- Downloaded 11650/26289 IDs in 26 h 58 min 32.9 sec since start.
-- Downloaded 11700/26289 IDs in 27 h 1 min 28.3 sec since start.
-- Downloaded 11750/26289 IDs in 27 h 4 min 28.4 sec since start.
-- Downloaded 11800/26289 IDs in 27 h 7 min 24.1 sec since start.
-- Downloaded 11850/26289 IDs in 27 h 10 min 26.0 sec since start.
-- Downloaded 11900/26289 IDs in 27 h 13 min 29.7 sec since start.
-- Downloaded 11950/26289 IDs in 27 h 16 min 36.1 sec since start.
-- Downloaded 12000/26289 IDs in 27 h 19 min 41.4 sec since start



-- Downloaded 12050/26289 IDs in 27 h 22 min 46.2 sec since start.
-- Downloaded 12100/26289 IDs in 27 h 25 min 52.4 sec since start.
-- Downloaded 12150/26289 IDs in 27 h 29 min 9.0 sec since start.




-- Downloaded 12200/26289 IDs in 27 h 32 min 24.6 sec since start.
-- Downloaded 12250/26289 IDs in 27 h 35 min 16.0 sec since start.




-- Downloaded 12300/26289 IDs in 27 h 38 min 9.7 sec since start.




-- Downloaded 12350/26289 IDs in 27 h 41 min 6.1 sec since start.




-- Downloaded 12400/26289 IDs in 27 h 44 min 10.1 sec since start.




-- Downloaded 12450/26289 IDs in 27 h 47 min 13.8 sec since start.




-- Downloaded 12500/26289 IDs in 27 h 50 min 10.9 sec since start.




-- Downloaded 12550/26289 IDs in 27 h 53 min 41.9 sec since start.
-- Downloaded 12600/26289 IDs in 27 h 56 min 38.4 sec since start.




-- Downloaded 12650/26289 IDs in 27 h 59 min 31.2 sec since start.
-- Downloaded 12700/26289 IDs in 28 h 2 min 38.2 sec since start.




-- Downloaded 12750/26289 IDs in 28 h 5 min 48.8 sec since start.




-- Downloaded 12800/26289 IDs in 28 h 8 min 51.2 sec since start.




-- Downloaded 12850/26289 IDs in 28 h 11 min 52.1 sec since start.




-- Downloaded 12900/26289 IDs in 28 h 14 min 54.0 sec since start.




-- Downloaded 12950/26289 IDs in 28 h 18 min 3.9 sec since start.




-- Downloaded 13000/26289 IDs in 28 h 21 min 10.3 sec since start.




-- Downloaded 13050/26289 IDs in 28 h 24 min 19.1 sec since start.




-- Downloaded 13100/26289 IDs in 28 h 27 min 23.0 sec since start.
-- Downloaded 13150/26289 IDs in 28 h 30 min 29.6 sec since start.




-- Downloaded 13200/26289 IDs in 28 h 33 min 38.0 sec since start.




-- Downloaded 13250/26289 IDs in 28 h 36 min 44.5 sec since start.




-- Downloaded 13300/26289 IDs in 28 h 39 min 57.7 sec since start.
-- Downloaded 13350/26289 IDs in 28 h 43 min 8.3 sec since start.




-- Downloaded 13400/26289 IDs in 28 h 46 min 11.8 sec since start.




-- Downloaded 13450/26289 IDs in 28 h 49 min 18.0 sec since start.




-- Downloaded 13500/26289 IDs in 28 h 52 min 24.5 sec since start.




-- Downloaded 13550/26289 IDs in 28 h 55 min 39.7 sec since start.




-- Downloaded 13600/26289 IDs in 28 h 58 min 53.9 sec since start.




-- Downloaded 13650/26289 IDs in 29 h 2 min 6.8 sec since start.




-- Downloaded 13700/26289 IDs in 29 h 5 min 16.3 sec since start.




-- Downloaded 13750/26289 IDs in 29 h 8 min 28.6 sec since start.




-- Downloaded 13800/26289 IDs in 29 h 11 min 47.4 sec since start.
-- Downloaded 13850/26289 IDs in 29 h 15 min 11.6 sec since start.




-- Downloaded 13900/26289 IDs in 29 h 18 min 29.1 sec since start.




-- Downloaded 13950/26289 IDs in 29 h 21 min 44.0 sec since start.
-- Downloaded 14000/26289 IDs in 29 h 24 min 55.5 sec since start.
-- Downloaded 14050/26289 IDs in 29 h 28 min 17.9 sec since start.




-- Downloaded 14100/26289 IDs in 29 h 31 min 37.7 sec since start.
-- Downloaded 14150/26289 IDs in 29 h 34 min 59.2 sec since start.
-- Downloaded 14200/26289 IDs in 29 h 38 min 15.4 sec since start.




-- Downloaded 14250/26289 IDs in 29 h 41 min 45.6 sec since start.




-- Downloaded 14300/26289 IDs in 29 h 45 min 59.2 sec since start.
-- Downloaded 14350/26289 IDs in 29 h 49 min 32.1 sec since start.
-- Downloaded 14400/26289 IDs in 29 h 52 min 33.4 sec since start.
-- Downloaded 14450/26289 IDs in 29 h 55 min 37.0 sec since start.
-- Downloaded 14500/26289 IDs in 29 h 58 min 54.1 sec since start.
-- Downloaded 14550/26289 IDs in 30 h 1 min 58.0 sec since start.
-- Downloaded 14600/26289 IDs in 30 h 5 min 27.4 sec since start.
-- Downloaded 14650/26289 IDs in 30 h 8 min 28.6 sec since start.
-- Downloaded 14700/26289 IDs in 30 h 11 min 33.0 sec since start.
-- Downloaded 14750/26289 IDs in 30 h 14 min 35.8 sec since start.
-- Downloaded 14800/26289 IDs in 30 h 17 min 55.4 sec since start.
-- Downloaded 14850/26289 IDs in 30 h 21 min 14.7 sec since start.
-- Downloaded 14900/26289 IDs in 30 h 24 min 27.5 sec since start.




-- Downloaded 14950/26289 IDs in 30 h 27 min 33.0 sec since start.
-- Downloaded 15000/26289 IDs in 30 h 30 min 53.5 sec since start.
-- Downloaded 15050/26289 IDs in 30 h 33 min 53.2 sec since start.
-- Downloaded 15100/26289 IDs in 30 h 36 min 55.5 sec since start.
-- Downloaded 15150/26289 IDs in 30 h 40 min 20.8 sec since start.
-- Downloaded 15200/26289 IDs in 30 h 43 min 46.3 sec since start.
-- Downloaded 15250/26289 IDs in 30 h 47 min 4.4 sec since start.




-- Downloaded 15300/26289 IDs in 30 h 50 min 6.7 sec since start.




-- Downloaded 15350/26289 IDs in 30 h 53 min 22.3 sec since start.
-- Downloaded 15400/26289 IDs in 30 h 56 min 22.1 sec since start.
-- Downloaded 15450/26289 IDs in 30 h 59 min 22.2 sec since start.




-- Downloaded 15500/26289 IDs in 31 h 4 min 34.0 sec since start.




-- Downloaded 15550/26289 IDs in 31 h 10 min 10.3 sec since start.




-- Downloaded 15600/26289 IDs in 31 h 15 min 19.9 sec since start.
-- Downloaded 15650/26289 IDs in 31 h 20 min 23.4 sec since start.




-- Downloaded 15700/26289 IDs in 31 h 23 min 36.8 sec since start.
-- Downloaded 15750/26289 IDs in 31 h 26 min 28.5 sec since start.
-- Downloaded 15800/26289 IDs in 31 h 29 min 28.5 sec since start.
-- Downloaded 15850/26289 IDs in 31 h 32 min 20.7 sec since start.
-- Downloaded 15900/26289 IDs in 31 h 38 min 25.1 sec since start.
-- Downloaded 15950/26289 IDs in 31 h 43 min 24.7 sec since start.
-- Downloaded 16000/26289 IDs in 31 h 46 min 28.3 sec since start.




-- Downloaded 16050/26289 IDs in 31 h 49 min 21.0 sec since start.




-- Downloaded 16100/26289 IDs in 31 h 52 min 28.2 sec since start.
-- Downloaded 16150/26289 IDs in 31 h 55 min 25.4 sec since start.
-- Downloaded 16200/26289 IDs in 31 h 58 min 26.9 sec since start.




-- Downloaded 16250/26289 IDs in 32 h 1 min 26.7 sec since start.
-- Downloaded 16300/26289 IDs in 32 h 4 min 24.3 sec since start.
-- Downloaded 16350/26289 IDs in 32 h 7 min 16.6 sec since start.
-- Downloaded 16400/26289 IDs in 32 h 11 min 24.2 sec since start.
-- Downloaded 16450/26289 IDs in 32 h 15 min 44.8 sec since start.
-- Downloaded 16500/26289 IDs in 32 h 19 min 40.0 sec since start.




-- Downloaded 16550/26289 IDs in 32 h 22 min 44.8 sec since start.
-- Downloaded 16600/26289 IDs in 32 h 25 min 58.8 sec since start.




-- Downloaded 16650/26289 IDs in 32 h 28 min 58.0 sec since start.
-- Downloaded 16700/26289 IDs in 32 h 31 min 54.3 sec since start.
-- Downloaded 16750/26289 IDs in 32 h 34 min 52.8 sec since start.
-- Downloaded 16800/26289 IDs in 32 h 38 min 5.7 sec since start.




-- Downloaded 16850/26289 IDs in 32 h 41 min 5.3 sec since start.
-- Downloaded 16900/26289 IDs in 32 h 44 min 8.2 sec since start.




-- Downloaded 16950/26289 IDs in 32 h 47 min 21.0 sec since start.
-- Downloaded 17000/26289 IDs in 32 h 50 min 21.8 sec since start.




-- Downloaded 17050/26289 IDs in 32 h 53 min 27.1 sec since start.




-- Downloaded 17100/26289 IDs in 32 h 56 min 34.0 sec since start.




-- Downloaded 17150/26289 IDs in 32 h 59 min 37.3 sec since start.
-- Downloaded 17200/26289 IDs in 33 h 2 min 35.7 sec since start.
-- Downloaded 17250/26289 IDs in 33 h 5 min 34.8 sec since start.
-- Downloaded 17300/26289 IDs in 33 h 9 min 4.9 sec since start.
-- Downloaded 17350/26289 IDs in 33 h 11 min 59.9 sec since start.
-- Downloaded 17400/26289 IDs in 33 h 15 min 2.4 sec since start.
-- Downloaded 17450/26289 IDs in 33 h 18 min 16.6 sec since start.
-- Downloaded 17500/26289 IDs in 33 h 21 min 31.9 sec since start.
-- Downloaded 17550/26289 IDs in 33 h 24 min 42.7 sec since start.
-- Downloaded 17600/26289 IDs in 33 h 28 min 15.9 sec since start.
-- Downloaded 17650/26289 IDs in 33 h 31 min 23.5 sec since start.
-- Downloaded 17700/26289 IDs in 33 h 34 min 32.1 sec since start.




-- Downloaded 17750/26289 IDs in 33 h 37 min 40.8 sec since start.
-- Downloaded 17800/26289 IDs in 33 h 40 min 45.2 sec since start.




-- Downloaded 17850/26289 IDs in 33 h 43 min 47.7 sec since start.
-- Downloaded 17900/26289 IDs in 33 h 46 min 53.4 sec since start.
-- Downloaded 17950/26289 IDs in 33 h 49 min 54.4 sec since start.




-- Downloaded 18000/26289 IDs in 33 h 53 min 2.1 sec since start.


ERROR: Preprocessing: Unable to communicate with SponsorBlock API: HTTP Error 500: Internal Server Error
ERROR: Preprocessing: Unable to communicate with SponsorBlock API: HTTP Error 500: Internal Server Error
ERROR: Preprocessing: Unable to communicate with SponsorBlock API: HTTP Error 500: Internal Server Error
ERROR: Preprocessing: Unable to communicate with SponsorBlock API: HTTP Error 500: Internal Server Error
ERROR: Preprocessing: Unable to communicate with SponsorBlock API: HTTP Error 500: Internal Server Error
ERROR: Preprocessing: Unable to communicate with SponsorBlock API: HTTP Error 500: Internal Server Error
ERROR: Preprocessing: Unable to communicate with SponsorBlock API: HTTP Error 500: Internal Server Error
ERROR: Preprocessing: Unable to communicate with SponsorBlock API: HTTP Error 500: Internal Server Error
ERROR: Preprocessing: Unable to communicate with SponsorBlock API: HTTP Error 500: Internal Server Error
ERROR: Preprocessing: Unable to communicate with Sponso

-- Downloaded 18050/26289 IDs in 34 h 4 min 22.2 sec since start.


ERROR: Preprocessing: Unable to communicate with SponsorBlock API: HTTP Error 500: Internal Server Error
ERROR: Preprocessing: Unable to communicate with SponsorBlock API: HTTP Error 500: Internal Server Error
ERROR: Preprocessing: Unable to communicate with SponsorBlock API: HTTP Error 500: Internal Server Error
ERROR: Preprocessing: Unable to communicate with SponsorBlock API: HTTP Error 500: Internal Server Error
ERROR: Preprocessing: Unable to communicate with SponsorBlock API: HTTP Error 500: Internal Server Error
ERROR: Preprocessing: Unable to communicate with SponsorBlock API: HTTP Error 500: Internal Server Error
ERROR: Preprocessing: Unable to communicate with SponsorBlock API: HTTP Error 500: Internal Server Error
ERROR: Preprocessing: Unable to communicate with SponsorBlock API: HTTP Error 500: Internal Server Error
ERROR: Preprocessing: Unable to communicate with SponsorBlock API: HTTP Error 500: Internal Server Error
ERROR: Preprocessing: Unable to communicate with Sponso

-- Downloaded 18100/26289 IDs in 34 h 12 min 5.5 sec since start.


ERROR: Preprocessing: Unable to communicate with SponsorBlock API: HTTP Error 502: Bad Gateway
ERROR: Preprocessing: Unable to communicate with SponsorBlock API: HTTP Error 502: Bad Gateway
ERROR: Preprocessing: Unable to communicate with SponsorBlock API: HTTP Error 500: Internal Server Error
ERROR: Preprocessing: Unable to communicate with SponsorBlock API: HTTP Error 500: Internal Server Error
ERROR: Preprocessing: Unable to communicate with SponsorBlock API: HTTP Error 500: Internal Server Error
ERROR: Preprocessing: Unable to communicate with SponsorBlock API: HTTP Error 500: Internal Server Error
ERROR: Preprocessing: Unable to communicate with SponsorBlock API: HTTP Error 503: Service Unavailable
ERROR: Preprocessing: Unable to communicate with SponsorBlock API: HTTP Error 503: Service Unavailable
ERROR: Preprocessing: Unable to communicate with SponsorBlock API: HTTP Error 503: Service Unavailable
ERROR: Preprocessing: Unable to communicate with SponsorBlock API: HTTP Error 503

-- Downloaded 18150/26289 IDs in 34 h 17 min 50.6 sec since start.




-- Downloaded 18200/26289 IDs in 34 h 20 min 57.1 sec since start.




-- Downloaded 18250/26289 IDs in 34 h 24 min 3.4 sec since start.


ERROR: Preprocessing: Unable to communicate with SponsorBlock API: HTTP Error 500: Internal Server Error
ERROR: Preprocessing: Unable to communicate with SponsorBlock API: HTTP Error 500: Internal Server Error
ERROR: Preprocessing: Unable to communicate with SponsorBlock API: HTTP Error 500: Internal Server Error
ERROR: Preprocessing: Unable to communicate with SponsorBlock API: HTTP Error 500: Internal Server Error
ERROR: Preprocessing: Unable to communicate with SponsorBlock API: HTTP Error 500: Internal Server Error
ERROR: Preprocessing: Unable to communicate with SponsorBlock API: HTTP Error 500: Internal Server Error
ERROR: Preprocessing: Unable to communicate with SponsorBlock API: HTTP Error 500: Internal Server Error
ERROR: Preprocessing: Unable to communicate with SponsorBlock API: HTTP Error 500: Internal Server Error


-- Downloaded 18300/26289 IDs in 34 h 29 min 22.8 sec since start.


ERROR: Preprocessing: Unable to communicate with SponsorBlock API: HTTP Error 500: Internal Server Error
ERROR: Preprocessing: Unable to communicate with SponsorBlock API: HTTP Error 500: Internal Server Error
ERROR: Preprocessing: Unable to communicate with SponsorBlock API: HTTP Error 500: Internal Server Error
ERROR: Preprocessing: Unable to communicate with SponsorBlock API: HTTP Error 500: Internal Server Error
ERROR: Preprocessing: Unable to communicate with SponsorBlock API: HTTP Error 500: Internal Server Error
ERROR: Preprocessing: Unable to communicate with SponsorBlock API: HTTP Error 500: Internal Server Error
ERROR: Preprocessing: Unable to communicate with SponsorBlock API: HTTP Error 500: Internal Server Error
ERROR: Preprocessing: Unable to communicate with SponsorBlock API: HTTP Error 500: Internal Server Error
ERROR: Preprocessing: Unable to communicate with SponsorBlock API: HTTP Error 500: Internal Server Error
ERROR: Preprocessing: Unable to communicate with Sponso

-- Downloaded 18350/26289 IDs in 34 h 33 min 54.2 sec since start.


ERROR: Preprocessing: Unable to communicate with SponsorBlock API: HTTP Error 500: Internal Server Error
ERROR: Preprocessing: Unable to communicate with SponsorBlock API: HTTP Error 500: Internal Server Error
ERROR: Preprocessing: Unable to communicate with SponsorBlock API: HTTP Error 500: Internal Server Error
ERROR: Preprocessing: Unable to communicate with SponsorBlock API: HTTP Error 500: Internal Server Error
ERROR: Preprocessing: Unable to communicate with SponsorBlock API: HTTP Error 500: Internal Server Error
ERROR: Preprocessing: Unable to communicate with SponsorBlock API: HTTP Error 500: Internal Server Error
ERROR: Preprocessing: Unable to communicate with SponsorBlock API: HTTP Error 500: Internal Server Error
ERROR: Preprocessing: Unable to communicate with SponsorBlock API: HTTP Error 500: Internal Server Error
ERROR: Preprocessing: Unable to communicate with SponsorBlock API: HTTP Error 500: Internal Server Error
ERROR: Preprocessing: Unable to communicate with Sponso

-- Downloaded 18400/26289 IDs in 34 h 40 min 51.6 sec since start.


ERROR: Preprocessing: Unable to communicate with SponsorBlock API: HTTP Error 500: Internal Server Error
ERROR: Preprocessing: Unable to communicate with SponsorBlock API: HTTP Error 500: Internal Server Error
ERROR: Preprocessing: Unable to communicate with SponsorBlock API: HTTP Error 500: Internal Server Error
ERROR: Preprocessing: Unable to communicate with SponsorBlock API: HTTP Error 500: Internal Server Error
ERROR: Preprocessing: Unable to communicate with SponsorBlock API: HTTP Error 500: Internal Server Error
ERROR: Preprocessing: Unable to communicate with SponsorBlock API: HTTP Error 500: Internal Server Error
ERROR: Preprocessing: Unable to communicate with SponsorBlock API: HTTP Error 500: Internal Server Error
ERROR: Preprocessing: Unable to communicate with SponsorBlock API: HTTP Error 500: Internal Server Error
ERROR: Preprocessing: Unable to communicate with SponsorBlock API: HTTP Error 500: Internal Server Error
ERROR: Preprocessing: Unable to communicate with Sponso

-- Downloaded 18450/26289 IDs in 34 h 47 min 58.8 sec since start.


ERROR: Preprocessing: Unable to communicate with SponsorBlock API: HTTP Error 500: Internal Server Error
ERROR: Preprocessing: Unable to communicate with SponsorBlock API: HTTP Error 500: Internal Server Error
ERROR: Preprocessing: Unable to communicate with SponsorBlock API: HTTP Error 500: Internal Server Error
ERROR: Preprocessing: Unable to communicate with SponsorBlock API: HTTP Error 500: Internal Server Error
ERROR: Preprocessing: Unable to communicate with SponsorBlock API: HTTP Error 500: Internal Server Error
ERROR: Preprocessing: Unable to communicate with SponsorBlock API: HTTP Error 500: Internal Server Error
ERROR: Preprocessing: Unable to communicate with SponsorBlock API: HTTP Error 500: Internal Server Error
ERROR: Preprocessing: Unable to communicate with SponsorBlock API: HTTP Error 500: Internal Server Error
ERROR: Preprocessing: Unable to communicate with SponsorBlock API: HTTP Error 500: Internal Server Error
ERROR: Preprocessing: Unable to communicate with Sponso

-- Downloaded 18500/26289 IDs in 34 h 51 min 57.5 sec since start.


ERROR: Preprocessing: Unable to communicate with SponsorBlock API: HTTP Error 500: Internal Server Error
ERROR: Preprocessing: Unable to communicate with SponsorBlock API: HTTP Error 500: Internal Server Error
ERROR: Preprocessing: Unable to communicate with SponsorBlock API: HTTP Error 500: Internal Server Error
ERROR: Preprocessing: Unable to communicate with SponsorBlock API: HTTP Error 500: Internal Server Error
ERROR: Preprocessing: Unable to communicate with SponsorBlock API: HTTP Error 500: Internal Server Error
ERROR: Preprocessing: Unable to communicate with SponsorBlock API: HTTP Error 500: Internal Server Error
ERROR: Preprocessing: Unable to communicate with SponsorBlock API: HTTP Error 500: Internal Server Error
ERROR: Preprocessing: Unable to communicate with SponsorBlock API: HTTP Error 500: Internal Server Error
ERROR: Preprocessing: Unable to communicate with SponsorBlock API: HTTP Error 500: Internal Server Error
ERROR: Preprocessing: Unable to communicate with Sponso

-- Downloaded 18550/26289 IDs in 34 h 55 min 57.7 sec since start.


ERROR: Preprocessing: Unable to communicate with SponsorBlock API: HTTP Error 500: Internal Server Error
ERROR: Preprocessing: Unable to communicate with SponsorBlock API: HTTP Error 500: Internal Server Error
ERROR: Preprocessing: Unable to communicate with SponsorBlock API: HTTP Error 500: Internal Server Error
ERROR: Preprocessing: Unable to communicate with SponsorBlock API: HTTP Error 500: Internal Server Error
ERROR: Preprocessing: Unable to communicate with SponsorBlock API: HTTP Error 500: Internal Server Error
ERROR: Preprocessing: Unable to communicate with SponsorBlock API: HTTP Error 500: Internal Server Error
ERROR: Preprocessing: Unable to communicate with SponsorBlock API: HTTP Error 500: Internal Server Error
ERROR: Preprocessing: Unable to communicate with SponsorBlock API: HTTP Error 500: Internal Server Error
ERROR: Preprocessing: Unable to communicate with SponsorBlock API: HTTP Error 500: Internal Server Error
ERROR: Preprocessing: Unable to communicate with Sponso

-- Downloaded 18600/26289 IDs in 35 h 9 min 33.5 sec since start.


ERROR: Preprocessing: Unable to communicate with SponsorBlock API: HTTP Error 500: Internal Server Error
ERROR: Preprocessing: Unable to communicate with SponsorBlock API: HTTP Error 500: Internal Server Error
ERROR: Preprocessing: Unable to communicate with SponsorBlock API: HTTP Error 500: Internal Server Error
ERROR: Preprocessing: Unable to communicate with SponsorBlock API: HTTP Error 500: Internal Server Error
ERROR: Preprocessing: Unable to communicate with SponsorBlock API: HTTP Error 500: Internal Server Error
ERROR: Preprocessing: Unable to communicate with SponsorBlock API: HTTP Error 500: Internal Server Error
ERROR: Preprocessing: Unable to communicate with SponsorBlock API: HTTP Error 500: Internal Server Error
ERROR: Preprocessing: Unable to communicate with SponsorBlock API: HTTP Error 500: Internal Server Error
ERROR: Preprocessing: Unable to communicate with SponsorBlock API: HTTP Error 500: Internal Server Error
ERROR: Preprocessing: Unable to communicate with Sponso

-- Downloaded 18650/26289 IDs in 35 h 24 min 24.3 sec since start.


ERROR: Preprocessing: Unable to communicate with SponsorBlock API: HTTP Error 503: Service Unavailable
ERROR: Preprocessing: Unable to communicate with SponsorBlock API: HTTP Error 503: Service Unavailable
ERROR: Preprocessing: Unable to communicate with SponsorBlock API: HTTP Error 503: Service Unavailable
ERROR: Preprocessing: Unable to communicate with SponsorBlock API: HTTP Error 503: Service Unavailable
ERROR: Preprocessing: Unable to communicate with SponsorBlock API: HTTP Error 503: Service Unavailable
ERROR: Preprocessing: Unable to communicate with SponsorBlock API: HTTP Error 503: Service Unavailable
ERROR: Preprocessing: Unable to communicate with SponsorBlock API: HTTP Error 503: Service Unavailable
ERROR: Preprocessing: Unable to communicate with SponsorBlock API: HTTP Error 503: Service Unavailable
ERROR: Preprocessing: Unable to communicate with SponsorBlock API: HTTP Error 503: Service Unavailable
ERROR: Preprocessing: Unable to communicate with SponsorBlock API: HTTP E

-- Downloaded 18700/26289 IDs in 35 h 35 min 21.8 sec since start.


ERROR: Preprocessing: Unable to communicate with SponsorBlock API: HTTPSConnectionPool(host='sponsor.ajay.app', port=443): Read timed out. (read timeout=20.0)
ERROR: Preprocessing: Unable to communicate with SponsorBlock API: HTTPSConnectionPool(host='sponsor.ajay.app', port=443): Read timed out. (read timeout=20.0)
ERROR: Preprocessing: Unable to communicate with SponsorBlock API: HTTPSConnectionPool(host='sponsor.ajay.app', port=443): Read timed out. (read timeout=20.0)
ERROR: Preprocessing: Unable to communicate with SponsorBlock API: HTTPSConnectionPool(host='sponsor.ajay.app', port=443): Read timed out. (read timeout=20.0)
ERROR: Preprocessing: Unable to communicate with SponsorBlock API: HTTP Error 503: Service Unavailable
ERROR: Preprocessing: Unable to communicate with SponsorBlock API: HTTP Error 503: Service Unavailable
ERROR: Preprocessing: Unable to communicate with SponsorBlock API: HTTP Error 503: Service Unavailable
ERROR: Preprocessing: Unable to communicate with Sponso

-- Downloaded 18750/26289 IDs in 35 h 51 min 13.8 sec since start.


ERROR: Preprocessing: Unable to communicate with SponsorBlock API: HTTP Error 502: Bad Gateway
ERROR: Preprocessing: Unable to communicate with SponsorBlock API: HTTP Error 502: Bad Gateway
ERROR: Preprocessing: Unable to communicate with SponsorBlock API: HTTPSConnectionPool(host='sponsor.ajay.app', port=443): Read timed out. (read timeout=20.0)
ERROR: Preprocessing: Unable to communicate with SponsorBlock API: HTTPSConnectionPool(host='sponsor.ajay.app', port=443): Read timed out. (read timeout=20.0)
ERROR: Preprocessing: Unable to communicate with SponsorBlock API: HTTP Error 502: Bad Gateway
ERROR: Preprocessing: Unable to communicate with SponsorBlock API: HTTP Error 502: Bad Gateway
ERROR: Preprocessing: Unable to communicate with SponsorBlock API: HTTP Error 502: Bad Gateway
ERROR: Preprocessing: Unable to communicate with SponsorBlock API: HTTP Error 502: Bad Gateway
ERROR: Preprocessing: Unable to communicate with SponsorBlock API: HTTPSConnectionPool(host='sponsor.ajay.app', 

-- Downloaded 18800/26289 IDs in 36 h 11 min 19.4 sec since start.


ERROR: Preprocessing: Unable to communicate with SponsorBlock API: HTTP Error 502: Bad Gateway
ERROR: Preprocessing: Unable to communicate with SponsorBlock API: HTTP Error 502: Bad Gateway
ERROR: Preprocessing: Unable to communicate with SponsorBlock API: HTTP Error 502: Bad Gateway
ERROR: Preprocessing: Unable to communicate with SponsorBlock API: HTTP Error 502: Bad Gateway
ERROR: Preprocessing: Unable to communicate with SponsorBlock API: HTTP Error 502: Bad Gateway
ERROR: Preprocessing: Unable to communicate with SponsorBlock API: HTTP Error 502: Bad Gateway
ERROR: Preprocessing: Unable to communicate with SponsorBlock API: HTTPSConnectionPool(host='sponsor.ajay.app', port=443): Read timed out. (read timeout=20.0)
ERROR: Preprocessing: Unable to communicate with SponsorBlock API: HTTPSConnectionPool(host='sponsor.ajay.app', port=443): Read timed out. (read timeout=20.0)
ERROR: Preprocessing: Unable to communicate with SponsorBlock API: HTTP Error 502: Bad Gateway
ERROR: Preprocess

-- Downloaded 18850/26289 IDs in 36 h 37 min 58.7 sec since start.


ERROR: Preprocessing: Unable to communicate with SponsorBlock API: HTTP Error 502: Bad Gateway
ERROR: Preprocessing: Unable to communicate with SponsorBlock API: HTTP Error 502: Bad Gateway
ERROR: Preprocessing: Unable to communicate with SponsorBlock API: HTTPSConnectionPool(host='sponsor.ajay.app', port=443): Read timed out. (read timeout=20.0)
ERROR: Preprocessing: Unable to communicate with SponsorBlock API: HTTPSConnectionPool(host='sponsor.ajay.app', port=443): Read timed out. (read timeout=20.0)
ERROR: Preprocessing: Unable to communicate with SponsorBlock API: HTTP Error 502: Bad Gateway
ERROR: Preprocessing: Unable to communicate with SponsorBlock API: HTTP Error 502: Bad Gateway
ERROR: Preprocessing: Unable to communicate with SponsorBlock API: HTTP Error 502: Bad Gateway
ERROR: Preprocessing: Unable to communicate with SponsorBlock API: HTTP Error 502: Bad Gateway
ERROR: Preprocessing: Unable to communicate with SponsorBlock API: HTTP Error 502: Bad Gateway
ERROR: Preprocess

-- Downloaded 18900/26289 IDs in 36 h 53 min 51.2 sec since start.


ERROR: Preprocessing: Unable to communicate with SponsorBlock API: HTTP Error 503: Service Unavailable
ERROR: Preprocessing: Unable to communicate with SponsorBlock API: HTTP Error 503: Service Unavailable
ERROR: Preprocessing: Unable to communicate with SponsorBlock API: HTTP Error 503: Service Unavailable
ERROR: Preprocessing: Unable to communicate with SponsorBlock API: HTTP Error 503: Service Unavailable
ERROR: Preprocessing: Unable to communicate with SponsorBlock API: HTTP Error 503: Service Unavailable
ERROR: Preprocessing: Unable to communicate with SponsorBlock API: HTTP Error 503: Service Unavailable
ERROR: Preprocessing: Unable to communicate with SponsorBlock API: HTTP Error 503: Service Unavailable
ERROR: Preprocessing: Unable to communicate with SponsorBlock API: HTTP Error 503: Service Unavailable
ERROR: Preprocessing: Unable to communicate with SponsorBlock API: HTTP Error 503: Service Unavailable
ERROR: Preprocessing: Unable to communicate with SponsorBlock API: HTTP E

-- Downloaded 18950/26289 IDs in 36 h 58 min 41.2 sec since start.


ERROR: Preprocessing: Unable to communicate with SponsorBlock API: HTTP Error 503: Service Unavailable
ERROR: Preprocessing: Unable to communicate with SponsorBlock API: HTTP Error 503: Service Unavailable
ERROR: Preprocessing: Unable to communicate with SponsorBlock API: HTTP Error 503: Service Unavailable
ERROR: Preprocessing: Unable to communicate with SponsorBlock API: HTTP Error 503: Service Unavailable
ERROR: Preprocessing: Unable to communicate with SponsorBlock API: HTTP Error 503: Service Unavailable
ERROR: Preprocessing: Unable to communicate with SponsorBlock API: HTTP Error 503: Service Unavailable
ERROR: Preprocessing: Unable to communicate with SponsorBlock API: HTTP Error 503: Service Unavailable
ERROR: Preprocessing: Unable to communicate with SponsorBlock API: HTTP Error 503: Service Unavailable
ERROR: Preprocessing: Unable to communicate with SponsorBlock API: HTTP Error 503: Service Unavailable
ERROR: Preprocessing: Unable to communicate with SponsorBlock API: HTTP E

-- Downloaded 19000/26289 IDs in 37 h 4 min 17.6 sec since start.


ERROR: Preprocessing: Unable to communicate with SponsorBlock API: HTTP Error 502: Bad Gateway
ERROR: Preprocessing: Unable to communicate with SponsorBlock API: HTTP Error 502: Bad Gateway
ERROR: Preprocessing: Unable to communicate with SponsorBlock API: HTTP Error 500: Internal Server Error
ERROR: Preprocessing: Unable to communicate with SponsorBlock API: HTTP Error 500: Internal Server Error
ERROR: Preprocessing: Unable to communicate with SponsorBlock API: HTTPSConnectionPool(host='sponsor.ajay.app', port=443): Read timed out. (read timeout=20.0)
ERROR: Preprocessing: Unable to communicate with SponsorBlock API: HTTPSConnectionPool(host='sponsor.ajay.app', port=443): Read timed out. (read timeout=20.0)
ERROR: Preprocessing: Unable to communicate with SponsorBlock API: HTTPSConnectionPool(host='sponsor.ajay.app', port=443): Read timed out. (read timeout=20.0)
ERROR: Preprocessing: Unable to communicate with SponsorBlock API: HTTPSConnectionPool(host='sponsor.ajay.app', port=443): 

-- Downloaded 19050/26289 IDs in 37 h 15 min 22.3 sec since start.


ERROR: Preprocessing: Unable to communicate with SponsorBlock API: HTTP Error 502: Bad Gateway
ERROR: Preprocessing: Unable to communicate with SponsorBlock API: HTTP Error 502: Bad Gateway
ERROR: Preprocessing: Unable to communicate with SponsorBlock API: HTTP Error 502: Bad Gateway
ERROR: Preprocessing: Unable to communicate with SponsorBlock API: HTTP Error 502: Bad Gateway
ERROR: Preprocessing: Unable to communicate with SponsorBlock API: HTTPSConnectionPool(host='sponsor.ajay.app', port=443): Read timed out. (read timeout=20.0)
ERROR: Preprocessing: Unable to communicate with SponsorBlock API: HTTPSConnectionPool(host='sponsor.ajay.app', port=443): Read timed out. (read timeout=20.0)
ERROR: Preprocessing: Unable to communicate with SponsorBlock API: HTTP Error 502: Bad Gateway
ERROR: Preprocessing: Unable to communicate with SponsorBlock API: HTTP Error 502: Bad Gateway
ERROR: Preprocessing: Unable to communicate with SponsorBlock API: HTTPSConnectionPool(host='sponsor.ajay.app', 

-- Downloaded 19100/26289 IDs in 37 h 38 min 55.7 sec since start.


ERROR: Preprocessing: Unable to communicate with SponsorBlock API: HTTP Error 503: Service Unavailable
ERROR: Preprocessing: Unable to communicate with SponsorBlock API: HTTP Error 503: Service Unavailable
ERROR: Preprocessing: Unable to communicate with SponsorBlock API: HTTP Error 503: Service Unavailable
ERROR: Preprocessing: Unable to communicate with SponsorBlock API: HTTP Error 503: Service Unavailable
ERROR: Preprocessing: Unable to communicate with SponsorBlock API: HTTP Error 503: Service Unavailable
ERROR: Preprocessing: Unable to communicate with SponsorBlock API: HTTP Error 503: Service Unavailable
ERROR: Preprocessing: Unable to communicate with SponsorBlock API: HTTP Error 503: Service Unavailable
ERROR: Preprocessing: Unable to communicate with SponsorBlock API: HTTP Error 503: Service Unavailable
ERROR: Preprocessing: Unable to communicate with SponsorBlock API: HTTP Error 503: Service Unavailable
ERROR: Preprocessing: Unable to communicate with SponsorBlock API: HTTP E

-- Downloaded 19150/26289 IDs in 37 h 43 min 0.4 sec since start.


ERROR: Preprocessing: Unable to communicate with SponsorBlock API: HTTP Error 500: Internal Server Error
ERROR: Preprocessing: Unable to communicate with SponsorBlock API: HTTP Error 500: Internal Server Error
ERROR: Preprocessing: Unable to communicate with SponsorBlock API: HTTP Error 500: Internal Server Error
ERROR: Preprocessing: Unable to communicate with SponsorBlock API: HTTP Error 500: Internal Server Error
ERROR: Preprocessing: Unable to communicate with SponsorBlock API: HTTP Error 500: Internal Server Error
ERROR: Preprocessing: Unable to communicate with SponsorBlock API: HTTP Error 500: Internal Server Error
ERROR: Preprocessing: Unable to communicate with SponsorBlock API: HTTP Error 429: Too Many Requests
ERROR: Preprocessing: Unable to communicate with SponsorBlock API: HTTP Error 429: Too Many Requests


-- Downloaded 19200/26289 IDs in 37 h 46 min 52.8 sec since start.


ERROR: Preprocessing: Unable to communicate with SponsorBlock API: HTTP Error 500: Internal Server Error
ERROR: Preprocessing: Unable to communicate with SponsorBlock API: HTTP Error 500: Internal Server Error
ERROR: Preprocessing: Unable to communicate with SponsorBlock API: HTTP Error 429: Too Many Requests
ERROR: Preprocessing: Unable to communicate with SponsorBlock API: HTTP Error 429: Too Many Requests
ERROR: Preprocessing: Unable to communicate with SponsorBlock API: HTTP Error 500: Internal Server Error
ERROR: Preprocessing: Unable to communicate with SponsorBlock API: HTTP Error 500: Internal Server Error


-- Downloaded 19250/26289 IDs in 37 h 50 min 47.1 sec since start.


ERROR: Preprocessing: Unable to communicate with SponsorBlock API: HTTP Error 429: Too Many Requests
ERROR: Preprocessing: Unable to communicate with SponsorBlock API: HTTP Error 429: Too Many Requests


-- Downloaded 19300/26289 IDs in 37 h 54 min 45.5 sec since start.


ERROR: Preprocessing: Unable to communicate with SponsorBlock API: HTTP Error 429: Too Many Requests
ERROR: Preprocessing: Unable to communicate with SponsorBlock API: HTTP Error 429: Too Many Requests
ERROR: Preprocessing: Unable to communicate with SponsorBlock API: HTTP Error 429: Too Many Requests
ERROR: Preprocessing: Unable to communicate with SponsorBlock API: HTTP Error 429: Too Many Requests
ERROR: Preprocessing: Unable to communicate with SponsorBlock API: HTTP Error 500: Internal Server Error
ERROR: Preprocessing: Unable to communicate with SponsorBlock API: HTTP Error 500: Internal Server Error
ERROR: Preprocessing: Unable to communicate with SponsorBlock API: HTTP Error 500: Internal Server Error
ERROR: Preprocessing: Unable to communicate with SponsorBlock API: HTTP Error 500: Internal Server Error
ERROR: Preprocessing: Unable to communicate with SponsorBlock API: HTTP Error 429: Too Many Requests
ERROR: Preprocessing: Unable to communicate with SponsorBlock API: HTTP Err

-- Downloaded 19350/26289 IDs in 37 h 58 min 45.6 sec since start.


ERROR: Preprocessing: Unable to communicate with SponsorBlock API: HTTP Error 429: Too Many Requests
ERROR: Preprocessing: Unable to communicate with SponsorBlock API: HTTP Error 429: Too Many Requests
ERROR: Preprocessing: Unable to communicate with SponsorBlock API: HTTP Error 500: Internal Server Error
ERROR: Preprocessing: Unable to communicate with SponsorBlock API: HTTP Error 500: Internal Server Error
ERROR: Preprocessing: Unable to communicate with SponsorBlock API: HTTP Error 500: Internal Server Error
ERROR: Preprocessing: Unable to communicate with SponsorBlock API: HTTP Error 500: Internal Server Error
ERROR: Preprocessing: Unable to communicate with SponsorBlock API: HTTP Error 500: Internal Server Error
ERROR: Preprocessing: Unable to communicate with SponsorBlock API: HTTP Error 500: Internal Server Error
ERROR: Preprocessing: Unable to communicate with SponsorBlock API: HTTP Error 429: Too Many Requests
ERROR: Preprocessing: Unable to communicate with SponsorBlock API: 

-- Downloaded 19400/26289 IDs in 38 h 2 min 48.0 sec since start.


ERROR: Preprocessing: Unable to communicate with SponsorBlock API: HTTP Error 500: Internal Server Error
ERROR: Preprocessing: Unable to communicate with SponsorBlock API: HTTP Error 500: Internal Server Error
ERROR: Preprocessing: Unable to communicate with SponsorBlock API: HTTP Error 429: Too Many Requests
ERROR: Preprocessing: Unable to communicate with SponsorBlock API: HTTP Error 429: Too Many Requests
ERROR: Preprocessing: Unable to communicate with SponsorBlock API: HTTP Error 500: Internal Server Error
ERROR: Preprocessing: Unable to communicate with SponsorBlock API: HTTP Error 500: Internal Server Error


-- Downloaded 19450/26289 IDs in 38 h 6 min 43.7 sec since start.


ERROR: Preprocessing: Unable to communicate with SponsorBlock API: HTTP Error 429: Too Many Requests
ERROR: Preprocessing: Unable to communicate with SponsorBlock API: HTTP Error 429: Too Many Requests
ERROR: Preprocessing: Unable to communicate with SponsorBlock API: HTTP Error 429: Too Many Requests
ERROR: Preprocessing: Unable to communicate with SponsorBlock API: HTTP Error 429: Too Many Requests
ERROR: Preprocessing: Unable to communicate with SponsorBlock API: HTTP Error 500: Internal Server Error
ERROR: Preprocessing: Unable to communicate with SponsorBlock API: HTTP Error 500: Internal Server Error


-- Downloaded 19500/26289 IDs in 38 h 10 min 38.3 sec since start.


ERROR: Preprocessing: Unable to communicate with SponsorBlock API: HTTP Error 500: Internal Server Error
ERROR: Preprocessing: Unable to communicate with SponsorBlock API: HTTP Error 500: Internal Server Error
ERROR: Preprocessing: Unable to communicate with SponsorBlock API: HTTP Error 429: Too Many Requests
ERROR: Preprocessing: Unable to communicate with SponsorBlock API: HTTP Error 429: Too Many Requests


-- Downloaded 19550/26289 IDs in 38 h 14 min 24.3 sec since start.


ERROR: Preprocessing: Unable to communicate with SponsorBlock API: HTTP Error 429: Too Many Requests
ERROR: Preprocessing: Unable to communicate with SponsorBlock API: HTTP Error 429: Too Many Requests
ERROR: Preprocessing: Unable to communicate with SponsorBlock API: HTTP Error 429: Too Many Requests
ERROR: Preprocessing: Unable to communicate with SponsorBlock API: HTTP Error 429: Too Many Requests
ERROR: Preprocessing: Unable to communicate with SponsorBlock API: HTTP Error 429: Too Many Requests
ERROR: Preprocessing: Unable to communicate with SponsorBlock API: HTTP Error 429: Too Many Requests
ERROR: Preprocessing: Unable to communicate with SponsorBlock API: HTTP Error 429: Too Many Requests
ERROR: Preprocessing: Unable to communicate with SponsorBlock API: HTTP Error 429: Too Many Requests


-- Downloaded 19600/26289 IDs in 38 h 18 min 14.1 sec since start.


ERROR: Preprocessing: Unable to communicate with SponsorBlock API: HTTP Error 429: Too Many Requests
ERROR: Preprocessing: Unable to communicate with SponsorBlock API: HTTP Error 429: Too Many Requests
ERROR: Preprocessing: Unable to communicate with SponsorBlock API: HTTP Error 503: Service Unavailable
ERROR: Preprocessing: Unable to communicate with SponsorBlock API: HTTP Error 503: Service Unavailable
ERROR: Preprocessing: Unable to communicate with SponsorBlock API: HTTP Error 503: Service Unavailable
ERROR: Preprocessing: Unable to communicate with SponsorBlock API: HTTP Error 503: Service Unavailable
ERROR: Preprocessing: Unable to communicate with SponsorBlock API: HTTP Error 503: Service Unavailable
ERROR: Preprocessing: Unable to communicate with SponsorBlock API: HTTP Error 503: Service Unavailable
ERROR: Preprocessing: Unable to communicate with SponsorBlock API: HTTP Error 503: Service Unavailable
ERROR: Preprocessing: Unable to communicate with SponsorBlock API: HTTP Error

-- Downloaded 19650/26289 IDs in 38 h 25 min 51.2 sec since start.


ERROR: Preprocessing: Unable to communicate with SponsorBlock API: HTTP Error 502: Bad Gateway
ERROR: Preprocessing: Unable to communicate with SponsorBlock API: HTTP Error 502: Bad Gateway
ERROR: Preprocessing: Unable to communicate with SponsorBlock API: HTTP Error 502: Bad Gateway
ERROR: Preprocessing: Unable to communicate with SponsorBlock API: HTTP Error 502: Bad Gateway
ERROR: Preprocessing: Unable to communicate with SponsorBlock API: HTTP Error 502: Bad Gateway
ERROR: Preprocessing: Unable to communicate with SponsorBlock API: HTTP Error 502: Bad Gateway
ERROR: Preprocessing: Unable to communicate with SponsorBlock API: HTTP Error 502: Bad Gateway
ERROR: Preprocessing: Unable to communicate with SponsorBlock API: HTTP Error 502: Bad Gateway
ERROR: Preprocessing: Unable to communicate with SponsorBlock API: HTTP Error 502: Bad Gateway
ERROR: Preprocessing: Unable to communicate with SponsorBlock API: HTTP Error 502: Bad Gateway
ERROR: Preprocessing: Unable to communicate with S

-- Downloaded 19700/26289 IDs in 38 h 33 min 33.0 sec since start.


ERROR: Preprocessing: Unable to communicate with SponsorBlock API: HTTPSConnectionPool(host='sponsor.ajay.app', port=443): Read timed out. (read timeout=20.0)
ERROR: Preprocessing: Unable to communicate with SponsorBlock API: HTTPSConnectionPool(host='sponsor.ajay.app', port=443): Read timed out. (read timeout=20.0)
ERROR: Preprocessing: Unable to communicate with SponsorBlock API: HTTPSConnectionPool(host='sponsor.ajay.app', port=443): Read timed out. (read timeout=20.0)
ERROR: Preprocessing: Unable to communicate with SponsorBlock API: HTTPSConnectionPool(host='sponsor.ajay.app', port=443): Read timed out. (read timeout=20.0)
ERROR: Preprocessing: Unable to communicate with SponsorBlock API: HTTP Error 502: Bad Gateway
ERROR: Preprocessing: Unable to communicate with SponsorBlock API: HTTP Error 502: Bad Gateway
ERROR: Preprocessing: Unable to communicate with SponsorBlock API: HTTP Error 503: Service Unavailable
ERROR: Preprocessing: Unable to communicate with SponsorBlock API: HTTP

-- Downloaded 19750/26289 IDs in 38 h 45 min 14.1 sec since start.


ERROR: Preprocessing: Unable to communicate with SponsorBlock API: HTTP Error 503: Service Unavailable
ERROR: Preprocessing: Unable to communicate with SponsorBlock API: HTTP Error 503: Service Unavailable
ERROR: Preprocessing: Unable to communicate with SponsorBlock API: HTTP Error 503: Service Unavailable
ERROR: Preprocessing: Unable to communicate with SponsorBlock API: HTTP Error 503: Service Unavailable
ERROR: Preprocessing: Unable to communicate with SponsorBlock API: HTTP Error 503: Service Unavailable
ERROR: Preprocessing: Unable to communicate with SponsorBlock API: HTTP Error 503: Service Unavailable
ERROR: Preprocessing: Unable to communicate with SponsorBlock API: HTTP Error 503: Service Unavailable
ERROR: Preprocessing: Unable to communicate with SponsorBlock API: HTTP Error 503: Service Unavailable
ERROR: Preprocessing: Unable to communicate with SponsorBlock API: HTTP Error 503: Service Unavailable
ERROR: Preprocessing: Unable to communicate with SponsorBlock API: HTTP E

-- Downloaded 19800/26289 IDs in 38 h 48 min 39.6 sec since start.




-- Downloaded 19850/26289 IDs in 38 h 51 min 35.1 sec since start.


ERROR: Preprocessing: Unable to communicate with SponsorBlock API: HTTP Error 500: Internal Server Error
ERROR: Preprocessing: Unable to communicate with SponsorBlock API: HTTP Error 500: Internal Server Error
ERROR: Preprocessing: Unable to communicate with SponsorBlock API: HTTP Error 500: Internal Server Error
ERROR: Preprocessing: Unable to communicate with SponsorBlock API: HTTP Error 500: Internal Server Error


-- Downloaded 19900/26289 IDs in 38 h 54 min 29.5 sec since start.


ERROR: Preprocessing: Unable to communicate with SponsorBlock API: HTTP Error 500: Internal Server Error
ERROR: Preprocessing: Unable to communicate with SponsorBlock API: HTTP Error 500: Internal Server Error


-- Downloaded 19950/26289 IDs in 38 h 57 min 25.8 sec since start.
-- Downloaded 20000/26289 IDs in 39 h 0 min 24.4 sec since start.


ERROR: Preprocessing: Unable to communicate with SponsorBlock API: HTTPSConnectionPool(host='sponsor.ajay.app', port=443): Read timed out. (read timeout=20.0)
ERROR: Preprocessing: Unable to communicate with SponsorBlock API: HTTPSConnectionPool(host='sponsor.ajay.app', port=443): Read timed out. (read timeout=20.0)


-- Downloaded 20050/26289 IDs in 39 h 5 min 11.3 sec since start.


ERROR: Preprocessing: Unable to communicate with SponsorBlock API: HTTP Error 502: Bad Gateway
ERROR: Preprocessing: Unable to communicate with SponsorBlock API: HTTP Error 502: Bad Gateway
ERROR: Preprocessing: Unable to communicate with SponsorBlock API: HTTP Error 500: Internal Server Error
ERROR: Preprocessing: Unable to communicate with SponsorBlock API: HTTP Error 500: Internal Server Error
ERROR: Preprocessing: Unable to communicate with SponsorBlock API: HTTP Error 502: Bad Gateway
ERROR: Preprocessing: Unable to communicate with SponsorBlock API: HTTP Error 502: Bad Gateway


-- Downloaded 20100/26289 IDs in 39 h 9 min 52.7 sec since start.


ERROR: Preprocessing: Unable to communicate with SponsorBlock API: HTTP Error 500: Internal Server Error
ERROR: Preprocessing: Unable to communicate with SponsorBlock API: HTTP Error 500: Internal Server Error
ERROR: Preprocessing: Unable to communicate with SponsorBlock API: HTTP Error 500: Internal Server Error
ERROR: Preprocessing: Unable to communicate with SponsorBlock API: HTTP Error 500: Internal Server Error
ERROR: Preprocessing: Unable to communicate with SponsorBlock API: HTTPSConnectionPool(host='sponsor.ajay.app', port=443): Read timed out. (read timeout=20.0)
ERROR: Preprocessing: Unable to communicate with SponsorBlock API: HTTPSConnectionPool(host='sponsor.ajay.app', port=443): Read timed out. (read timeout=20.0)
ERROR: Preprocessing: Unable to communicate with SponsorBlock API: HTTP Error 500: Internal Server Error
ERROR: Preprocessing: Unable to communicate with SponsorBlock API: HTTP Error 500: Internal Server Error


-- Downloaded 20150/26289 IDs in 39 h 14 min 41.1 sec since start.




-- Downloaded 20200/26289 IDs in 39 h 17 min 47.8 sec since start.


ERROR: Preprocessing: Unable to communicate with SponsorBlock API: HTTP Error 500: Internal Server Error
ERROR: Preprocessing: Unable to communicate with SponsorBlock API: HTTP Error 500: Internal Server Error
ERROR: Preprocessing: Unable to communicate with SponsorBlock API: HTTP Error 500: Internal Server Error
ERROR: Preprocessing: Unable to communicate with SponsorBlock API: HTTP Error 500: Internal Server Error
ERROR: Preprocessing: Unable to communicate with SponsorBlock API: HTTP Error 500: Internal Server Error
ERROR: Preprocessing: Unable to communicate with SponsorBlock API: HTTP Error 500: Internal Server Error


-- Downloaded 20250/26289 IDs in 39 h 21 min 1.9 sec since start.


ERROR: Preprocessing: Unable to communicate with SponsorBlock API: HTTP Error 500: Internal Server Error
ERROR: Preprocessing: Unable to communicate with SponsorBlock API: HTTP Error 500: Internal Server Error
ERROR: Preprocessing: Unable to communicate with SponsorBlock API: HTTP Error 500: Internal Server Error
ERROR: Preprocessing: Unable to communicate with SponsorBlock API: HTTP Error 500: Internal Server Error


-- Downloaded 20300/26289 IDs in 39 h 24 min 1.7 sec since start.


ERROR: Preprocessing: Unable to communicate with SponsorBlock API: HTTP Error 500: Internal Server Error
ERROR: Preprocessing: Unable to communicate with SponsorBlock API: HTTP Error 500: Internal Server Error


-- Downloaded 20350/26289 IDs in 39 h 27 min 9.6 sec since start.


ERROR: Preprocessing: Unable to communicate with SponsorBlock API: HTTP Error 500: Internal Server Error
ERROR: Preprocessing: Unable to communicate with SponsorBlock API: HTTP Error 500: Internal Server Error


-- Downloaded 20400/26289 IDs in 39 h 30 min 50.0 sec since start.
-- Downloaded 20450/26289 IDs in 39 h 34 min 11.2 sec since start.


ERROR: Preprocessing: Unable to communicate with SponsorBlock API: HTTP Error 500: Internal Server Error
ERROR: Preprocessing: Unable to communicate with SponsorBlock API: HTTP Error 500: Internal Server Error
ERROR: Preprocessing: Unable to communicate with SponsorBlock API: HTTP Error 500: Internal Server Error
ERROR: Preprocessing: Unable to communicate with SponsorBlock API: HTTP Error 500: Internal Server Error
ERROR: Preprocessing: Unable to communicate with SponsorBlock API: HTTP Error 500: Internal Server Error
ERROR: Preprocessing: Unable to communicate with SponsorBlock API: HTTP Error 500: Internal Server Error


-- Downloaded 20500/26289 IDs in 39 h 37 min 33.9 sec since start.
-- Downloaded 20550/26289 IDs in 39 h 40 min 51.5 sec since start.


ERROR: Preprocessing: Unable to communicate with SponsorBlock API: HTTP Error 500: Internal Server Error
ERROR: Preprocessing: Unable to communicate with SponsorBlock API: HTTP Error 500: Internal Server Error


-- Downloaded 20600/26289 IDs in 39 h 44 min 0.9 sec since start.
-- Downloaded 20650/26289 IDs in 39 h 47 min 2.3 sec since start.


ERROR: Preprocessing: Unable to communicate with SponsorBlock API: HTTP Error 500: Internal Server Error
ERROR: Preprocessing: Unable to communicate with SponsorBlock API: HTTP Error 500: Internal Server Error


-- Downloaded 20700/26289 IDs in 39 h 49 min 53.5 sec since start.




-- Downloaded 20750/26289 IDs in 39 h 53 min 12.7 sec since start.


ERROR: Preprocessing: Unable to communicate with SponsorBlock API: HTTP Error 500: Internal Server Error
ERROR: Preprocessing: Unable to communicate with SponsorBlock API: HTTP Error 500: Internal Server Error


-- Downloaded 20800/26289 IDs in 39 h 56 min 24.5 sec since start.


ERROR: Preprocessing: Unable to communicate with SponsorBlock API: HTTP Error 500: Internal Server Error
ERROR: Preprocessing: Unable to communicate with SponsorBlock API: HTTP Error 500: Internal Server Error
ERROR: Preprocessing: Unable to communicate with SponsorBlock API: HTTP Error 500: Internal Server Error
ERROR: Preprocessing: Unable to communicate with SponsorBlock API: HTTP Error 500: Internal Server Error


-- Downloaded 20850/26289 IDs in 39 h 59 min 37.0 sec since start.


ERROR: Preprocessing: Unable to communicate with SponsorBlock API: HTTP Error 500: Internal Server Error
ERROR: Preprocessing: Unable to communicate with SponsorBlock API: HTTP Error 500: Internal Server Error
ERROR: Preprocessing: Unable to communicate with SponsorBlock API: HTTP Error 500: Internal Server Error
ERROR: Preprocessing: Unable to communicate with SponsorBlock API: HTTP Error 500: Internal Server Error


-- Downloaded 20900/26289 IDs in 40 h 2 min 52.7 sec since start.
^C




-- Downloaded 20950/26289 IDs in 40 h 5 min 48.3 sec since start.


ERROR: Preprocessing: Unable to communicate with SponsorBlock API: HTTP Error 500: Internal Server Error
ERROR: Preprocessing: Unable to communicate with SponsorBlock API: HTTP Error 500: Internal Server Error


-- Downloaded 21000/26289 IDs in 40 h 8 min 44.8 sec since start.




-- Downloaded 21050/26289 IDs in 40 h 12 min 12.4 sec since start.




-- Downloaded 21100/26289 IDs in 40 h 15 min 3.8 sec since start.




-- Downloaded 21150/26289 IDs in 40 h 17 min 55.0 sec since start.
-- Downloaded 21200/26289 IDs in 40 h 20 min 58.0 sec since start.


ERROR: Preprocessing: Unable to communicate with SponsorBlock API: HTTP Error 500: Internal Server Error
ERROR: Preprocessing: Unable to communicate with SponsorBlock API: HTTP Error 500: Internal Server Error
ERROR: Preprocessing: Unable to communicate with SponsorBlock API: HTTP Error 500: Internal Server Error
ERROR: Preprocessing: Unable to communicate with SponsorBlock API: HTTP Error 500: Internal Server Error


-- Downloaded 21250/26289 IDs in 40 h 24 min 3.1 sec since start.




-- Downloaded 21300/26289 IDs in 40 h 27 min 6.6 sec since start.




-- Downloaded 21350/26289 IDs in 40 h 30 min 8.9 sec since start.




-- Downloaded 21400/26289 IDs in 40 h 33 min 15.6 sec since start.




-- Downloaded 21450/26289 IDs in 40 h 36 min 18.9 sec since start.




-- Downloaded 21500/26289 IDs in 40 h 39 min 19.6 sec since start.


ERROR: Unable to download video subtitles for 'en': HTTP Error 404: Not Found


-- Downloaded 21550/26289 IDs in 49 h 23 min 1.4 sec since start.
-- Downloaded 21600/26289 IDs in 49 h 26 min 0.9 sec since start.
-- Downloaded 21650/26289 IDs in 49 h 29 min 0.4 sec since start.
-- Downloaded 21700/26289 IDs in 49 h 32 min 9.3 sec since start.
-- Downloaded 21750/26289 IDs in 49 h 35 min 17.4 sec since start.
-- Downloaded 21800/26289 IDs in 49 h 38 min 18.6 sec since start.
-- Downloaded 21850/26289 IDs in 49 h 41 min 28.4 sec since start.
-- Downloaded 21900/26289 IDs in 49 h 44 min 41.9 sec since start.
-- Downloaded 21950/26289 IDs in 49 h 47 min 59.9 sec since start.
-- Downloaded 22000/26289 IDs in 49 h 51 min 12.9 sec since start.
-- Downloaded 22050/26289 IDs in 49 h 54 min 23.5 sec since start.
-- Downloaded 22100/26289 IDs in 49 h 57 min 32.7 sec since start.
-- Downloaded 22150/26289 IDs in 50 h 0 min 46.9 sec since start.
-- Downloaded 22200/26289 IDs in 50 h 3 min 57.6 sec since start.
-- Downloaded 22250/26289 IDs in 50 h 7 min 15.6 sec since start.
--



-- Downloaded 22600/26289 IDs in 50 h 30 min 32.9 sec since start.
-- Downloaded 22650/26289 IDs in 50 h 33 min 40.5 sec since start.
-- Downloaded 22700/26289 IDs in 50 h 36 min 59.9 sec since start.
-- Downloaded 22750/26289 IDs in 50 h 40 min 13.5 sec since start.
-- Downloaded 22800/26289 IDs in 50 h 43 min 29.2 sec since start.




-- Downloaded 22850/26289 IDs in 50 h 46 min 39.4 sec since start.
-- Downloaded 22900/26289 IDs in 50 h 49 min 49.3 sec since start.
-- Downloaded 22950/26289 IDs in 50 h 53 min 8.8 sec since start.




-- Downloaded 23000/26289 IDs in 50 h 56 min 21.0 sec since start.
-- Downloaded 23050/26289 IDs in 50 h 59 min 33.9 sec since start.
-- Downloaded 23100/26289 IDs in 51 h 2 min 52.0 sec since start.
-- Downloaded 23150/26289 IDs in 51 h 6 min 20.0 sec since start.
-- Downloaded 23200/26289 IDs in 51 h 9 min 40.8 sec since start.
-- Downloaded 23250/26289 IDs in 51 h 12 min 58.3 sec since start.
-- Downloaded 23300/26289 IDs in 51 h 16 min 32.1 sec since start.
-- Downloaded 23350/26289 IDs in 51 h 20 min 3.2 sec since start.
-- Downloaded 23400/26289 IDs in 51 h 23 min 35.0 sec since start.
-- Downloaded 23450/26289 IDs in 51 h 26 min 59.6 sec since start.
-- Downloaded 23500/26289 IDs in 51 h 30 min 31.4 sec since start.
-- Downloaded 23550/26289 IDs in 51 h 34 min 3.2 sec since start.
-- Downloaded 23600/26289 IDs in 51 h 37 min 33.4 sec since start.
-- Downloaded 23650/26289 IDs in 51 h 40 min 59.9 sec since start.
-- Downloaded 23700/26289 IDs in 51 h 44 min 34.0 sec since start.




-- Downloaded 24850/26289 IDs in 53 h 23 min 56.2 sec since start.




-- Downloaded 24900/26289 IDs in 53 h 29 min 4.5 sec since start.




-- Downloaded 24950/26289 IDs in 53 h 34 min 18.3 sec since start.
-- Downloaded 25000/26289 IDs in 53 h 39 min 34.7 sec since start.
-- Downloaded 25050/26289 IDs in 53 h 45 min 5.3 sec since start.




-- Downloaded 25100/26289 IDs in 53 h 50 min 48.2 sec since start.




-- Downloaded 25150/26289 IDs in 53 h 56 min 17.9 sec since start.




-- Downloaded 25200/26289 IDs in 54 h 1 min 43.8 sec since start.




-- Downloaded 25250/26289 IDs in 54 h 7 min 5.2 sec since start.




-- Downloaded 25300/26289 IDs in 54 h 12 min 41.0 sec since start.




-- Downloaded 25350/26289 IDs in 54 h 18 min 16.0 sec since start.
-- Downloaded 25400/26289 IDs in 54 h 23 min 45.5 sec since start.
-- Downloaded 25450/26289 IDs in 54 h 29 min 28.4 sec since start.
-- Downloaded 25500/26289 IDs in 54 h 35 min 7.7 sec since start.
-- Downloaded 25550/26289 IDs in 54 h 40 min 8.9 sec since start.
-- Downloaded 25600/26289 IDs in 54 h 43 min 58.4 sec since start.
-- Downloaded 25650/26289 IDs in 54 h 47 min 20.6 sec since start.




-- Downloaded 25700/26289 IDs in 54 h 51 min 12.0 sec since start.
-- Downloaded 25750/26289 IDs in 54 h 54 min 57.0 sec since start.




-- Downloaded 25800/26289 IDs in 54 h 58 min 51.9 sec since start.




-- Downloaded 25850/26289 IDs in 55 h 4 min 12.7 sec since start.




-- Downloaded 25900/26289 IDs in 55 h 11 min 28.7 sec since start.
-- Downloaded 25950/26289 IDs in 55 h 19 min 17.1 sec since start.




-- Downloaded 26000/26289 IDs in 55 h 23 min 24.5 sec since start.




-- Downloaded 26050/26289 IDs in 55 h 26 min 20.4 sec since start.




-- Downloaded 26100/26289 IDs in 55 h 29 min 29.4 sec since start.
-- Downloaded 26150/26289 IDs in 55 h 32 min 48.2 sec since start.
-- Downloaded 26200/26289 IDs in 55 h 35 min 51.3 sec since start.
-- Downloaded 26250/26289 IDs in 55 h 38 min 53.0 sec since start.


In [45]:
import pandas as pd
downloaded = pd.read_csv("5_transcripts_and_metadata/downloaded_index.csv", sep=";", header=0, index_col=0)
download_list = df['video_id'].to_list()
print(f"Number of videos downloaded: {len(downloaded)}")

Number of videos downloaded: 86474


In [None]:
### extract, clean and save transcript files into new dir as csvs
import transcript_utils as tu
transcript_dir = "5_transcripts_and_metadata/transcripts"
new_transcript_dir = "5_transcripts_and_metadata/transcripts_csvs"

tu.extract_entire_dir(transcript_dir, new_transcript_dir, clean_filenames=True)

Beginning extraction for 81294 transcript files from 5_transcripts_and_metadata/transcripts to 5_transcripts_and_metadata/transcripts_csvs...
8129/81294 files processed.
16258/81294 files processed.
24387/81294 files processed.
32516/81294 files processed.
40645/81294 files processed.
48774/81294 files processed.
56903/81294 files processed.
65032/81294 files processed.
73161/81294 files processed.
81290/81294 files processed.
------------------------------------------------------------
Extraction complete. 81294 files saved to 5_transcripts_and_metadata/transcripts_csvs.


### Step 6: Final Filtering Steps

We apply these additional filters:

- Remove videos from channels which are not primarily finance-related.
- Upload date must lie within our chosen timeframe (now we have the exact upload dates in the infojsons).
- Remove videos where the automatically recognized language is not English. This avoids two issues:
  - When Youtube recognizes the wrong language, the auto-translated english transcript will still be available but it will be complete gibberish.
  - When the "language" field is missing in the info.json, usually no transcript is available. (The reason why it is missing sometimes is not entirely clear. There are cases where a video contains no spoken words, but also cases where the language should be clearly recognizable.)
- Remove all videos of channels with less than 10 videos in our sample.

In [1]:
# read in necessary data and drop duplicates (which somehow made their way in the yt dlp archive file)

import pandas as pd
import json

df = pd.read_csv("5_transcripts_and_metadata/downloaded_index.csv", sep=";", header=0)
df = df.drop_duplicates(subset="video_id").reset_index(drop=True)

In [2]:
# add fields from infojsons (note: can run for a few min, full df with 86k rows and descriptions/tags is around 0.5 GB)
from scraping_utils import add_infojson_fields
fields_from_json = ['title', 'upload_date', 'duration', 'language', 'view_count', 'tags', 'description', 'categories']

df = df.rename(columns={"channel_id": "uploader_id"})

df = add_infojson_fields(df, fields_from_json, "5_transcripts_and_metadata/infojsons")
df.head()

------------------------------------------------------------
Adding fields from info jsons to df (nrows: 86133, fields: ['title', 'upload_date', 'duration', 'language', 'view_count', 'tags', 'description', 'categories'])...
field 'language' missing in: 5_transcripts_and_metadata/infojsons/@AdamKhoo_ZUbOBj71oFQ.info.json
field 'language' missing in: 5_transcripts_and_metadata/infojsons/@AdamKhoo_apNMkE50kcw.info.json
field 'language' missing in: 5_transcripts_and_metadata/infojsons/@AdamKhoo_I9kfljujR20.info.json
field 'language' missing in: 5_transcripts_and_metadata/infojsons/@AdamKhoo_jtU00txmkPU.info.json
field 'language' missing in: 5_transcripts_and_metadata/infojsons/@AdamKhoo_ldo9EuvJSJc.info.json
field 'language' missing in: 5_transcripts_and_metadata/infojsons/@AkshatZayn_KooB6MqY-jw.info.json
field 'language' missing in: 5_transcripts_and_metadata/infojsons/@AkshatZayn_TW3W0W8PTrc.info.json
field 'language' missing in: 5_transcripts_and_metadata/infojsons/@AkshatZayn_8jf8HYi3

Unnamed: 0,uploader_id,video_id,title,upload_date,duration,language,view_count,tags,description,categories
0,@2PPD,5MzzziZaNYw,Automate your Trades on Coinbase Pro using thi...,2021-10-12,153.0,en,26015.0,"[coinbase pro trading bot, coinbase pro tradin...",Let me show you how this BOT created $211 prof...,[People & Blogs]
1,@2PPD,iW-tdkuMOyQ,Ye's Bank Account Frozen (Another Reason we ne...,2022-11-23,23.0,en,32710.0,,1️⃣ Learn how I (Live Off Crypto)\n2️⃣ Learn h...,[Education]
2,@2PPD,ol2o9VuEbkE,Whats wrong sbf? 💊 Sam Bankman-Fried shaking u...,2022-11-16,7.0,en,49569.0,,This is SBF on Meet the Press a few months ago...,[Education]
3,@2PPD,CmXeVHaS4LE,Don't be SAM! Sam was a Scam! #ftxhack,2022-11-12,10.0,en,10863.0,,#SamBankman-Fried #cryptonews #ftx\n\n1️⃣ Lear...,[Education]
4,@2PPD,IyUwdMCfcCk,Kevin O'Leary brags about #FTX,2022-11-12,5.0,en,23578.0,,1️⃣ Learn how I (Live Off Crypto)\n2️⃣ Learn h...,[Education]


In [3]:
# load 4_filtered_videos/filtered_videos.csv and merge with df to get yt_video_type column
df_2 = pd.read_csv("4_filtered_videos/filtered_videos.csv", sep=";", header=0, index_col=0)
df_2 = df_2[['video_id', 'yt_video_type']]
df = pd.merge(df, df_2, on="video_id", how="left")
del df_2
# drop duplicates which could have been introduced by merging
df = df.drop_duplicates(subset="video_id").reset_index(drop=True)
df.head()

Unnamed: 0,uploader_id,video_id,title,upload_date,duration,language,view_count,tags,description,categories,yt_video_type
0,@2PPD,5MzzziZaNYw,Automate your Trades on Coinbase Pro using thi...,2021-10-12,153.0,en,26015.0,"[coinbase pro trading bot, coinbase pro tradin...",Let me show you how this BOT created $211 prof...,[People & Blogs],video
1,@2PPD,iW-tdkuMOyQ,Ye's Bank Account Frozen (Another Reason we ne...,2022-11-23,23.0,en,32710.0,,1️⃣ Learn how I (Live Off Crypto)\n2️⃣ Learn h...,[Education],short
2,@2PPD,ol2o9VuEbkE,Whats wrong sbf? 💊 Sam Bankman-Fried shaking u...,2022-11-16,7.0,en,49569.0,,This is SBF on Meet the Press a few months ago...,[Education],short
3,@2PPD,CmXeVHaS4LE,Don't be SAM! Sam was a Scam! #ftxhack,2022-11-12,10.0,en,10863.0,,#SamBankman-Fried #cryptonews #ftx\n\n1️⃣ Lear...,[Education],short
4,@2PPD,IyUwdMCfcCk,Kevin O'Leary brags about #FTX,2022-11-12,5.0,en,23578.0,,1️⃣ Learn how I (Live Off Crypto)\n2️⃣ Learn h...,[Education],short


In [4]:
# check null values
df.isna().sum()

uploader_id          0
video_id             0
title                1
upload_date          1
duration             1
language          5555
view_count           1
tags             10295
description       5024
categories           3
yt_video_type        0
dtype: int64

In [5]:
# there is a single faulty video with missing data in all fields -> remove it
df = df.dropna(subset=["title"]).reset_index(drop=True)
# check null values again
df.isna().sum()

uploader_id          0
video_id             0
title                0
upload_date          0
duration             0
language          5554
view_count           0
tags             10294
description       5023
categories           2
yt_video_type        0
dtype: int64

In [6]:
# filter upload dates for our timeframe of interest (again, but this time with exact dates available)

start_cutoff = pd.to_datetime("2016-01-01")
end_cutoff = pd.to_datetime("2022-12-31")
print(f"Number of videos before filtering for upload dates: {len(df)}")
df = df[(df['upload_date'] >= start_cutoff) & (df['upload_date'] <= end_cutoff)].reset_index(drop=True)
print(f"Number of videos after filtering for upload dates: {len(df)}")


Number of videos before filtering for upload dates: 86132
Number of videos after filtering for upload dates: 68242


In [19]:
# filter out channels determined to be not relevant (i.e. not finance-related) 
# since there are only ~300 channels, manual review is possible (guided by analyzing most common tags and video categories)
from yt_search_lists import non_relevant_channels

print(f"Number of videos before filtering out non-relevant channels: {len(df)}")
df = df[~df.uploader_id.isin(non_relevant_channels)].reset_index(drop=True)
print(f"Number of videos after filtering out non-relevant channels: {len(df)}")

Number of videos before filtering out non-relevant channels: 68242
Number of videos after filtering out non-relevant channels: 52832


In [20]:
# count number of videos below/above duration threshold
for mins in [5, 20, 30, 45]:
    print(f"At or below/above {mins} min duration: {len(df[df.duration <= mins*60])}/{len(df[df.duration > mins*60])}")

# no stricter duration filter applied for now, 30+ min videos are not that common

At or below/above 5 min duration: 14401/38431
At or below/above 20 min duration: 46659/6173
At or below/above 30 min duration: 51040/1792
At or below/above 45 min duration: 52832/0


In [21]:
# language filters

#print(f"Number of videos with no language info: {len(df[df.language.isna()])}")
#print(f"Number of videos with non-english language: {len(df[df.language != 'en'])}")

print(f"Number of videos before filtering for language: {len(df)}")
df = df[df.language == 'en'].reset_index(drop=True)
print(f"Number of videos after filtering for language: {len(df)}")


Number of videos before filtering for language: 52832
Number of videos after filtering for language: 46108


In [22]:
# check for any missing transcript files and filter out those videos 
# note: after the language filters there shouldn't really be many missing transcripts
import os

print(f"Number of videos before filtering for missing transcript files: {len(df)}")
missing_transcripts = []
for idx, row in df.iterrows():
    video_id = row['video_id']
    uploader_id = row['uploader_id']
    if not os.path.exists(f"5_transcripts_and_metadata/transcripts_csvs/{uploader_id}_{video_id}.csv"):
        missing_transcripts.append(video_id)
        print(f"missing transcript for video {video_id} from channel {uploader_id}")

df = df[~df.video_id.isin(missing_transcripts)].reset_index(drop=True)
print(f"Number of videos after filtering for missing transcript files: {len(df)}")

Number of videos before filtering for missing transcripts: 46108
missing transcript for video T_NIXo3q09c from channel @CARachanaRanade
missing transcript for video R8ZWwkbS6Ww from channel @CARachanaRanade
Number of videos after filtering for missing transcripts: 46106


In [23]:
# filter out channels with less than 10 videos in the observation timeframe (these are likely to be very low viewership (only a few videos made it above the threshold) or inactive)
min_vids = 10
print(f"Number of videos before filtering out channels with less than {min_vids} videos: {len(df)}")
df = df.groupby("uploader_id").filter(lambda x: len(x) >= min_vids).reset_index(drop=True)
print(f"Number of videos after filtering out channels with less than {min_vids} videos: {len(df)}")
#"""

Number of videos before filtering out channels with less than 10 videos: 46106
Number of videos after filtering out channels with less than 10 videos: 45968


Now that filtering is finished, we save the final_filtered_index to csv, which just contains ``uploader_id`` and ``video_id``.

We also save a csv with selected metadata for each video, which might be useful for a LLM-based classification task later. (A single csv file will be much easier to upload to e.g. Google Colab than all the individual json files.)
The following will be included:

- video_id
- uploader_id
- title 
- description 
- first three tags (comma-separated string)
- 

In [24]:
# save filtered index
filtered_index_df = df[['uploader_id', 'video_id']]
filtered_index_df.to_csv("6_filtered_videos_final/filtered_index.csv", sep=";", header=True, index=False)

In [46]:
# create filtered metadata df 
filtered_metadata_df = df.loc[:, ['uploader_id', 'video_id', 'upload_date', 'yt_video_type', 'view_count', 'duration', 'language', 'title', 'description', 'categories', 'tags']]
# rename categories column
filtered_metadata_df = filtered_metadata_df.rename(columns={"categories": "yt_auto_categories"})
# first three (or fewer) tags column - stored as string for later use in prompts
def first_three_tags(tags):
    if not tags:
        return ""
    else:
        # tags should already be cleaned of separators/commas
        return ", ".join([tag for tag in tags[:3]])
filtered_metadata_df['first_three_tags'] = filtered_metadata_df['tags'].apply(first_three_tags)

# save to csv
filtered_metadata_df.to_csv("6_filtered_videos_final/filtered_metadata.csv", sep=";", header=True, index=False)


In [47]:
# test whether the loaded metadata df is the same as previously saved df (do weird description characters mess up csv file?)

# handle loading of lists from csv with ast.literal_eval()
import ast
def load_list(x):
    return ast.literal_eval(x) if x else None
loaded_df = pd.read_csv("6_filtered_videos_final/filtered_metadata.csv", sep=";", header=0, converters={"tags": load_list, 
                                                                                                        "yt_auto_categories": load_list})

if not loaded_df.equals(filtered_metadata_df):
    # show differences (only empty strings vs NaNs in the first_three_tags column should show up)
    diff = loaded_df.compare(filtered_metadata_df)
    print(f"Discrepancies found in the following columns: {diff.columns.levels[0][0]}")
    print(diff)

Discrepancies found in the following columns: first_three_tags
      first_three_tags      
                  self other
1                  NaN      
2                  NaN      
3                  NaN      
4                  NaN      
5                  NaN      
...                ...   ...
45601              NaN      
45602              NaN      
45603              NaN      
45604              NaN      
45967              NaN      

[4752 rows x 2 columns]


In [48]:
# finally, also save a version of the index which is sorted by average channel video view count, and videos of a channel are sorted by view count descending.

# load filtered index

filtered_index_sorted = df[['uploader_id', 'video_id', 'view_count', 'yt_video_type']]

# create a new variable 'channel_avg_view_count'
filtered_index_sorted['channel_avg_view_count'] = filtered_index_sorted.groupby('uploader_id')['view_count'].transform('mean')
# sort
filtered_index_sorted = filtered_index_sorted.sort_values(by=['channel_avg_view_count', 'view_count'], ascending=[False, False]).reset_index(drop=True)
# save
filtered_index_sorted.to_csv("6_filtered_videos_final/filtered_index_sorted_avg_channel_views.csv", sep=";", header=True, index=False)

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  filtered_index_sorted['channel_avg_view_count'] = filtered_index_sorted.groupby('uploader_id')['view_count'].transform('mean')


Below: Code for exploration of categories, tags, etc. which helped to define the irrelevant channel list. Reload the dataframe from the start of this section before continuing.

In [104]:
# get value counts for different categories
n_vids_by_cat = df['categories'].apply(lambda x: None if not x else x[0]).value_counts()

# add number of channels as second column
cat_df = pd.DataFrame(n_vids_by_cat).reset_index()
cat_df.columns = ['category', 'n_videos']
cat_df['n_channels'] = cat_df['category'].apply(lambda x: len(df[df['categories'].apply(lambda y: None if not y else y[0]) == x]['uploader_id'].unique()))
cat_df = cat_df.sort_values(by="n_videos", ascending=False).reset_index(drop=True)
cat_df

Unnamed: 0,category,n_videos,n_channels
0,Education,30223,190
1,People & Blogs,13460,105
2,Science & Technology,3775,20
3,Entertainment,3102,64
4,News & Politics,2308,29
5,Howto & Style,2150,38
6,Gaming,460,8
7,Comedy,59,9
8,Autos & Vehicles,51,10
9,Sports,38,4


In [105]:
# for each category, get the top 10 channels by percentage of videos in that category

# get categories
categories = df['categories'].apply(lambda x: None if not x else x[0]).value_counts().index.to_list()

# get top 10 channels by percentage of videos in each category
top_channels = {}
for category in categories:
    top_channels[category] = df[df['categories'].apply(lambda x: None if not x else x[0]) == category]['uploader_id'].value_counts(normalize=True).head(10)

print(top_channels)

{'Education': uploader_id
@MeetKevin                 0.057969
@GrantCardone              0.052940
@FinancialEducation        0.038778
@VALUETAINMENT             0.037885
@GaryVeeVideoExperience    0.037488
@RickyGutierrezz           0.035900
@StockMoe                  0.029448
@DiscoverCrypto_           0.026867
@MoneyPurse                0.026139
@GrahamStephan             0.024352
Name: proportion, dtype: float64, 'People & Blogs': uploader_id
@TheRamseyShow            0.345394
@garyvee                  0.105201
@Incomeparent             0.062481
@Thetradingfraternity     0.044799
@MoneyGuyShow             0.040416
@TimothySykesTrader       0.029569
@josephhogue              0.029495
@investingbasicsyt        0.023923
@BulldogMindsetArchive    0.023031
@AkshatZayn               0.022214
Name: proportion, dtype: float64, 'Science & Technology': uploader_id
@AltcoinDaily               0.303311
@TheCryptoLark              0.284768
@DiscoverCrypto_            0.217483
@CryptoBusy        

In [26]:
# get top 10 uploader ids for each category
top_10_per_category = df['categories'].apply(lambda x: None if not x else x[0]).value_counts().head(10)
for cat in top_10_per_category.index:
    print(f"Top 10 uploaders for category {cat}:")
    print(df[df['categories'].apply(lambda x: None if not x else x[0]) == cat]['uploader_id'].value_counts().head(10))
    print("\n")

Top 10 uploaders for category Education:
uploader_id
@GrantCardone              1918
@MeetKevin                 1752
@TheMathSorcerer           1475
@VALUETAINMENT             1469
@FinancialEducation        1183
@GaryVeeVideoExperience    1174
@MoneyPurse                1116
@RickyGutierrezz           1115
@StockMoe                   890
@DiscoverCrypto_            812
Name: count, dtype: int64


Top 10 uploaders for category People & Blogs:
uploader_id
@TheRamseyShow           5393
@garyvee                 1735
@Incomeparent             929
@MoneyGuyShow             813
@ChrisWillx               794
@DWGALGO                  735
@BriansFarmingVideos      642
@Thetradingfraternity     617
@MarenAltman              539
@TimothySykesTrader       491
Name: count, dtype: int64


Top 10 uploaders for category Entertainment:
uploader_id
@discovery            2745
@ChrisSain1            718
@ffreedomapptelugu     404
@jackneel              400
@CryptoZach            385
@garyvee             

In [25]:
# category value counts for certain channels
uploader_id = "@thedicetower"
df[df.uploader_id == uploader_id]['categories'].apply(lambda x: None if not x else x[0]).value_counts()

categories
Gaming           4761
Entertainment       1
Name: count, dtype: int64

In [5]:
# check median and average number of tags
df['num_tags'] = df['tags'].apply(lambda x: 0 if not x else len(x))
print(f"median number of tags: {df['num_tags'].median()}")
print(f"average number of tags: {df['num_tags'].mean()}")
print(f"max number of tags: {df['num_tags'].max()}")

median number of tags: 17.0
average number of tags: 16.51677057573752
max number of tags: 80


In [28]:
keywords = ["stocks", "investing"]
print(f"given tag list: {keywords}")
print(f"n vids including at least one of the tags: {df['tags'].apply(lambda x: False if not x else any([kw in x for kw in keywords])).sum()}")
print(f"n vids including all of the tags: {df['tags'].apply(lambda x: False if not x else all([kw in x for kw in keywords])).sum()}")


given tag list: ['stocks', 'investing']
n vids including at least one of the tags: 10187
n vids including all of the tags: 3541


In [10]:
# check for most common tags
taglists = df['tags'].apply(lambda x: [] if not x else x).to_list()
tags = [tag for taglist in taglists for tag in taglist]
from collections import Counter
tag_counts = Counter(tags)
tag_counts.most_common(20)


[('stock market', 8476),
 ('crypto', 7491),
 ('review', 7412),
 ('investing', 7174),
 ('money', 6642),
 ('stocks', 6554),
 ('real estate', 5811),
 ('cryptocurrency', 5730),
 ('how to make money', 5431),
 ('bitcoin', 5390),
 ('dave ramsey', 4853),
 ('dice tower', 4752),
 ('game', 4751),
 ('board game', 4745),
 ('dice', 4736),
 ('vasel', 4731),
 ('tom vasel', 4719),
 ('credit card', 4714),
 ('catan', 4703),
 ('the dave ramsey show', 4486)]

In [64]:
from collections import Counter

# get uploader ids ordered by number of videos in dataset
counts_df = pd.DataFrame(df['uploader_id'].value_counts()).reset_index()
# add most common category and tags for each uploader
counts_df['most_common_category'] = None
counts_df['most_common_tag#1'] = None
counts_df['most_common_tag#2'] = None
counts_df['most_common_tag#3'] = None

for idx, row in counts_df.iterrows():
    uid = row['uploader_id']
    # get category list for uploader
    catlists = df[df['uploader_id'] == uid]['categories'].apply(lambda x: [] if not x else x)
    cats = [cat for catlist in catlists for cat in catlist]
    # get most common category
    counts_df.at[idx, 'most_common_category'] = max(set(cats), key=cats.count)

    # get tag list for uploader
    taglists = df[df['uploader_id'] == uid]['tags'].apply(lambda x: [] if not x else x)
    tags = [tag for taglist in taglists for tag in taglist]
    # get top 3 tags
    tags = [tag for tag, count in Counter(tags).most_common(3)]
    # get most common tags
    counts_df.at[idx, 'most_common_tag#1'] = tags[0] if tags else None
    counts_df.at[idx, 'most_common_tag#2'] = tags[1] if tags and len(tags) > 1 else None
    counts_df.at[idx, 'most_common_tag#3'] = tags[2] if tags and len(tags) > 2 else None


In [None]:
counts_df

In [None]:
# view results
#counts_df[:50]
#counts_df[50:100]
#counts_df[100:150]
#counts_df[150:200]
#counts_df[200:250]
counts_df[250:]
