In [1]:
import argparse
import os
import re
import requests
import urllib.request
import subprocess
import json
import pandas as pd
from IPython.display import clear_output


In [2]:
# Command-line interface kept from the script version of this notebook:
# a single optional -i/--url-list argument.
parser = argparse.ArgumentParser()
parser.add_argument(
    "-i",
    "--url-list",
    help="List of URL/filename pairs, delimited by tabs",
)

# Parse an explicit empty argument list instead of the process's sys.argv,
# so every option simply takes its default (url_list is None).
args = parser.parse_args([])


In [3]:
# Ordered (old, new) substitutions applied to every lower-cased title.
# Order matters: " - " must collapse to a single "_" before the remaining
# spaces are converted, and ".mp4" is stripped last.
_TITLE_REPLACEMENTS = (
    (" - ", "_"),
    (" ", "_"),
    ("?", ""),
    ("/", "_"),
    ("\\", "_"),
    ("%", ""),
    (")", ""),
    ("(", ""),
    ("$", ""),
    ("\r", "_"),
    ("\n", "_"),
    (":", ""),
    (".mp4", ""),
)


def _sanitize_title(title):
    """Return `title` lower-cased with the substitutions in _TITLE_REPLACEMENTS applied in order.

    The result is used later as the output file name (without extension).
    """
    cleaned = title.lower()
    for old, new in _TITLE_REPLACEMENTS:
        cleaned = cleaned.replace(old, new)
    return cleaned


# Read the clip list. The context manager closes the file even if the JSON
# is malformed; the encoding is pinned so non-ASCII titles decode the same
# way on every platform.
with open("videos.json", encoding="utf-8") as f:
    file_contents = json.load(f)
data = file_contents["clips"]

# Keep only the player-config URL and the title of each clip.
df = pd.DataFrame(data, columns=["config_no_autoplay", "title"])
df["title"] = [_sanitize_title(title) for title in df["title"]]

df.head(3)

Unnamed: 0,config_no_autoplay,title
0,https://player.vimeo.com/video/830060850/confi...,clase_60_clase_en_vivo_cierre_ii_1022cdfsncn51...
1,https://player.vimeo.com/video/830060843/confi...,clase_60_clase_en_vivo_cierre_ii_1022cdfsncn51...
2,https://player.vimeo.com/video/830059182/confi...,clase_60_clase_en_vivo_cierre_ii_1022cdfsncn51...


In [4]:
# Reverse the row order and renumber the index from 0.
# NOTE: this cell is not idempotent - re-running it without re-running the
# load cell reverses the rows again and stacks a second numeric prefix.
df = df[::-1].reset_index(drop=True)

# Prefix every title with its zero-padded position ("00_", "01_", ... "10_").
# The whole column is assigned in one step: the previous element-wise
# `df['title'][i] = ...` is chained assignment, which pandas flags with
# SettingWithCopyWarning and which silently does nothing under Copy-on-Write.
df["title"] = [
    "%02d_%s" % (position, title) for position, title in enumerate(df["title"])
]


In [5]:
# Collect every title that repeats one already encountered earlier in the
# column; an empty list means all output file names are unique.
seen = set()
dupes = []
for title in df["title"]:
    if title in seen:
        dupes.append(title)
    else:
        seen.add(title)
dupes

[]

In [6]:
# Remove videos that are already downloaded: any file in ./videos/ whose
# name (without extension) matches a title is dropped from the work list.
# The folder is created first so a first run does not fail in os.listdir.
os.makedirs("./videos/", exist_ok=True)
files_name = [os.path.splitext(filename)[0] for filename in os.listdir("./videos/")]
df = df[~df["title"].isin(files_name)].reset_index(drop=True)

In [7]:
%%time

# get the master url
def nested_get(input_dict, nested_key):
    """Walk `input_dict` along the sequence of keys in `nested_key`.

    Args:
        input_dict: the outermost mapping (anything exposing `.get`).
        nested_key: iterable of keys, outermost first.

    Returns:
        The value found at the end of the path, or None when a key is
        missing, a value along the path is None, or an intermediate value
        is not a mapping (e.g. a string or list) and so cannot be descended
        into. An empty `nested_key` returns `input_dict` unchanged.
    """
    current = input_dict
    for key in nested_key:
        # A non-mapping in the middle of the path means the path does not
        # exist; report that as None instead of raising AttributeError.
        if not hasattr(current, "get"):
            return None
        current = current.get(key, None)
        if current is None:
            return None
    return current

def getURLmaster(row, url_col, cdn='akfire_interconnect_quic', timeout=30):
    """Fetch a clip's player config and return its DASH `avc_url`.

    Args:
        row: a mapping/Series holding the config URL under `url_col`.
        url_col: name of the field in `row` that contains the config URL.
        cdn: key looked up under request -> files -> dash -> cdns.
        timeout: seconds to wait for the HTTP request before giving up.

    Returns:
        The value at request/files/dash/cdns/<cdn>/avc_url in the JSON
        response, or None if any part of that path is missing.

    Raises:
        requests.HTTPError: if the server answers with a 4xx/5xx status.
        requests.Timeout: if no response arrives within `timeout` seconds.
    """
    # Without a timeout a stalled connection would hang the cell forever.
    resp = requests.get(row[url_col], timeout=timeout)
    # Fail with the real HTTP error rather than a JSON decoding error on an
    # error page.
    resp.raise_for_status()
    content = resp.json()
    result = nested_get(content, ['request', 'files', 'dash', 'cdns', cdn, 'avc_url'])
    return result


# Resolve the master URL of every remaining clip (one HTTP request per row).
# Built row by row rather than with df.apply(axis=1): when the frame is empty
# (nothing left to download) apply does not produce a Series that can be
# assigned to a single column, whereas an empty list simply yields an empty
# column.
df['master_url'] = [
    getURLmaster(row, url_col="config_no_autoplay") for _, row in df.iterrows()
]

CPU times: user 114 ms, sys: 16.2 ms, total: 131 ms
Wall time: 4.49 s


In [8]:
# Re-scan ./videos/ and collect each file's name with its extension removed.
files_name = [stem for stem, _extension in map(os.path.splitext, os.listdir("./videos/"))]

# Drop the clips whose title already has a matching file on disk, then
# renumber the remaining rows from 0.
already_downloaded = df["title"].isin(files_name)
df = df[~already_downloaded].reset_index(drop=True)

In [9]:
def _stream_url(base_url, content, stream):
    """Build the download URL for one entry of content["video"] / content["audio"].

    The stream's `index_segment` has its leading "../<dir>/" hops and any
    trailing "&range..." part removed, and is then appended to `base_url`,
    the second-to-last path component of the master `base_url`, and the first
    "<word>/" component of the stream's own `base_url`.
    """
    init_segment = re.sub(
        r"(\.\.\/)+(\w+\/)+|[&]range.+",
        "",
        stream["index_segment"],
        # Passed by keyword: positional count/flags are easy to mix up.
        count=0,
        flags=re.MULTILINE,
    )
    return (
        base_url
        + content["base_url"].rsplit("/", 2)[1] + "/"
        + re.findall(r"\w+\/", stream["base_url"])[0]
        + init_segment
    )


# Titles whose download or mux step did not complete; reported after the loop
# because clear_output() wipes per-clip messages.
failed = []

for master_json_url, output_file in zip(df['master_url'], df['title']):
    print("Processing %s" % output_file)

    # The config lookup yields None when the expected CDN entry is missing.
    if not isinstance(master_json_url, str) or not master_json_url:
        print("No master url for %s, skipping" % output_file)
        failed.append(output_file)
        continue

    # Extract some stuff
    base_url = master_json_url.rsplit("/", 6)[0] + "/"
    resp = requests.get(master_json_url, timeout=30)

    # Check the status BEFORE parsing: an error page is usually not JSON.
    if resp.status_code != 200:
        print('not 200!')
        print(resp)
        print(master_json_url)
        failed.append(output_file)
        break
    content = resp.json()

    # Highest-resolution video stream (first one wins on ties).
    video = max(content["video"], key=lambda d: d["height"])
    video_url = _stream_url(base_url, content, video)
    filenameVideo = "video_%s.mp4" % video["id"]

    # Highest-bitrate audio stream (first one wins on ties).
    bitrate = [(i, d["bitrate"]) for (i, d) in enumerate(content["audio"])]
    audio = content["audio"][max(bitrate, key=lambda x: x[1])[0]]
    audio_url = _stream_url(base_url, content, audio)
    filenameAudio = "audio_%s.mp4" % audio["id"]

    output_path = './videos/' + output_file + '.mp4'
    returncode = None

    try:
        # Video download here
        print("Video url:", video_url)
        print("Saving VIDEO to %s" % filenameVideo)
        urllib.request.urlretrieve(video_url, filenameVideo)

        # Audio download here
        print("Bitrate", bitrate)
        print("Audio url:", audio_url)
        print("Saving AUDIO to %s" % filenameAudio)
        urllib.request.urlretrieve(audio_url, filenameAudio)

        # Combine audio and video here. Arguments are passed as a list with
        # no shell, so characters in the title-derived file name cannot be
        # interpreted as shell syntax.
        print('Combining video and audio...')
        returncode = subprocess.call([
            'ffmpeg', '-hide_banner', '-loglevel', 'error', '-y',
            '-i', filenameAudio,
            '-i', filenameVideo,
            output_path,
        ])
        if returncode == 0:
            print('Mixing Done!')
    finally:
        # Delete the remaining single audio and video files, even when a
        # download or ffmpeg raised.
        for temp_file in (filenameAudio, filenameVideo):
            if os.path.exists(temp_file):
                os.remove(temp_file)
        print("Temporary files removed!")

    if returncode != 0:
        print("ffmpeg failed with exit code %s for %s" % (returncode, output_file))
        # A partial output would make the "already downloaded" filter skip
        # this clip on the next run, so remove it.
        if os.path.exists(output_path):
            os.remove(output_path)
        failed.append(output_file)
        continue

    # Log the conclusion of the operations
    print("*** VIDEO DOWNLOADED SUCCESSFULLY ***")
    clear_output(wait=True)

if failed:
    print("Clips that were NOT downloaded:", failed)

Processing 60_clase_51_trabajando_con_componentes_1022cdfsncn51laed
Video url: https://173vod-adaptive.akamaized.net/exp=1690363439~acl=%2F600e4d0b-9f4f-4269-94e2-b037cdb24b34%2F%2A~hmac=842cc39661ef6db2bb43826b21131de72d9d7d25160385c9557fda214419c293/600e4d0b-9f4f-4269-94e2-b037cdb24b34/parcel/video/985e1035.mp4?r=dXMtd2VzdDE%3D
Saving VIDEO to video_985e1035.mp4
Bitrate [(0, 194000), (1, 102000), (2, 69000)]
Audio url: https://173vod-adaptive.akamaized.net/exp=1690363439~acl=%2F600e4d0b-9f4f-4269-94e2-b037cdb24b34%2F%2A~hmac=842cc39661ef6db2bb43826b21131de72d9d7d25160385c9557fda214419c293/600e4d0b-9f4f-4269-94e2-b037cdb24b34/parcel/audio/bef10ee6.mp4?r=dXM%3D
Saving AUDIO to audio_bef10ee6.mp4
Combining video and audio...
