pytube is susceptible to occasional outages. This program attempts to download each video up to three times, so if all three attempts happen to occur during an outage, the download will not be attempted again.

For videos that have not yet been downloaded and have three or more recorded exceptions, all but the earliest two exception records will be deleted so that the script will attempt the download again. This is intended as an ad-hoc exercise.

In [74]:
!pip install boto3



You should consider upgrading via the 'C:\Users\mhann\PycharmProjects\tube-video-archiver\.venv\Scripts\python.exe -m pip install --upgrade pip' command.


In [75]:
import boto3
from botocore.exceptions import ClientError
import json

In [76]:
# Secrets Manager lookup settings.
secret_name = 'prod/hanni'
region_name = 'us-east-1'
# These two values are used as keys into the fetched secret, where the
# actual bucket and folder names are looked up; they are not S3 names.
bucket_name = 'yta-bucket-name'
folder_name = 'yta-folder-name'

In [77]:
# Fetch the secret from AWS Secrets Manager and read the S3 bucket and
# folder names out of it.
session = boto3.session.Session()
client = session.client(service_name='secretsmanager', region_name=region_name)

# A failed lookup raises botocore's ClientError. It is left to propagate:
# the bucket and folder names below cannot be resolved without the secret.
get_secret_value_response = client.get_secret_value(SecretId=secret_name)

# SecretString is parsed as JSON and indexed by the configured key names.
secrets = json.loads(get_secret_value_response['SecretString'])
bucket = secrets[bucket_name]
# NOTE: folder_name is rebound from the secret's key to the looked-up value,
# so re-running this cell on its own would look up the wrong key. Re-run the
# settings cell first.
folder_name = secrets[folder_name]

In [78]:
# S3 client shared by the cells below. Unlike the Secrets Manager client it
# is created without an explicit region_name.
s3_client = boto3.client(service_name='s3')

In [79]:
def get_s3_bucket_objects(_bucket, _prefix):
    """Return every object entry listed under a prefix in an S3 bucket.

    Calls ``s3_client.list_objects_v2`` repeatedly, passing each response's
    ``NextContinuationToken`` to the next call, until a response no longer
    carries one.

    Args:
        _bucket: Bucket name passed as ``Bucket``.
        _prefix: Key prefix passed as ``Prefix``.

    Returns:
        A list with the ``Contents`` entries of all responses, in the order
        they were received. Empty when no response has ``Contents``.
    """
    found = []
    token = None
    more_pages = True
    while more_pages:
        request = {'Bucket': _bucket, 'Prefix': _prefix}
        if token is not None:
            request['ContinuationToken'] = token
        page = s3_client.list_objects_v2(**request)

        # 'Contents' is absent from a response that lists no objects.
        found.extend(page.get('Contents', []))

        more_pages = 'NextContinuationToken' in page
        if more_pages:
            token = page['NextContinuationToken']
    return found

In [80]:
# Objects under json/ are treated as successful downloads by the later cells;
# objects under exceptions/ are the recorded failures.
success_prefix = f'{folder_name}/json/'
exceptions_prefix = f'{folder_name}/exceptions/'
success_list = get_s3_bucket_objects(_bucket=bucket, _prefix=success_prefix)
exceptions_list = get_s3_bucket_objects(_bucket=bucket, _prefix=exceptions_prefix)

In [81]:
# Video ids with a success record: the file name of each listed .json key
# with the '.json' extension cut off.
success_video_id_list = [
    key.rpartition('/')[2][:-len('.json')]
    for key in (obj['Key'] for obj in success_list)
    if key.endswith('.json')
]

In [82]:
# Group the exception records by video id, skipping non-.json keys and any
# video that already has a success record.
exceptions_prefix = f'{folder_name}/exceptions/'
# Built once so each membership test below is O(1) instead of a list scan.
succeeded_ids = set(success_video_id_list)

exception_by_video_id = {}
for exception in exceptions_list:
    exception_key = exception['Key']
    if not exception_key.endswith('.json'):
        continue
    # The video id is the first path segment after the listing prefix.
    # Taking it relative to the prefix (rather than a fixed index into the
    # full key) stays correct if the folder name itself contains '/'.
    video_id = exception_key[len(exceptions_prefix):].split('/')[0]
    if video_id in succeeded_ids:
        continue
    exception_by_video_id.setdefault(video_id, []).append(exception)

In [83]:
# Trim every not-yet-downloaded video down to two exception records by
# deleting the most recent ones from S3. DESTRUCTIVE: objects are deleted.
for video_id, records in exception_by_video_id.items():
    # Newest first, so index 0 is always the next record to delete and the
    # two records left at the end are the earliest ones.
    records.sort(key=lambda record: record['LastModified'], reverse=True)
    for _ in range(len(records) - 2):
        newest = records[0]
        # The last column is the record count before this deletion.
        print(f'{video_id}\t{newest["LastModified"]}\t{newest["Key"]}\t{len(records)}')
        s3_client.delete_object(Bucket=bucket, Key=newest["Key"])
        del records[0]