In [8]:
import os
import json
import pandas as pd
from random import randrange
from yt_dlp import YoutubeDL
from moviepy.editor import VideoFileClip
from tqdm import tqdm

In [9]:
def save_json(content, save_path):
    with open(save_path, 'w') as f:
        f.write(json.dumps(content))
def load_jsonl(filename):
    with open(filename, "r") as f:
        return [json.loads(l.strip("\n")) for l in f.readlines()]

In [10]:
#input json
#set medical video download folder
medvid_folder = "/home/hlpark/shared/MedVidQA/video"
if not os.path.exists(medvid_folder):
    os.makedirs(medvid_folder)

eval_folder = "/home/hlpark/REDUCE/REDUCE_benchmarks/HiREST/data/evaluation/medivdqa"
if not os.path.exists(eval_folder):
    os.makedirs(eval_folder)

vid_json_folder = "/home/hlpark/REDUCE/REDUCE_benchmarks/HiREST/data/splits/medvidqa"
if not os.path.exists(vid_json_folder):
    os.makedirs(vid_json_folder)

In [11]:
redownload_list = ['21s9tRvo_x4.mp4', '3C01Cqd1mHs.mp4', '1vnjCOq4rGQ.mp4', 'Qaez8OKf-eE.mp4', 'd8-GG4KD_Tw.mp4', 'LxwjHzTpXV4.mp4', 'wEfiJL-JS_4.mp4', 'U4u6jyJjDT4.mp4']

In [44]:
list_of_bad_videos = ['SztsZNp-jDM', 'OKXoHwkx55c', 'V9j5JkWGwI8', 'ehl2MPczYoQ', 'mMNloo140pU', 'QwhD5UTUW60', 'Ehan_VI7p4c']

In [36]:
train_path = '/home/hlpark/shared/MedVidQA/train.json'
val_path = '/home/hlpark/shared/MedVidQA/val.json'
test_path = '/home/hlpark/shared/MedVidQA/test.json'
txt_file_root = "/home/hlpark/shared/MedVidQA"

In [37]:
train = json.load(open(train_path))
val = json.load(open(val_path))
test = json.load(open(test_path))

In [38]:
new_train = {}
new_val = {}
new_test = {}
video_names = {}
vid_dict_train, vid_dict_val, vid_dict_test = {}, {}, {}
unav_train_query_cnt, unav_val_query_cnt, unav_test_query_cnt, total_test_cnt, total_val_cnt = 0, 0, 0, 0, 0

with open(os.path.join(txt_file_root, "medvidqa_train_queries.txt"), "w") as f:
    for i, qa in tqdm(enumerate(train)):
        if "aNct84-0WZk" in qa["video_id"]:
            #this video doesn't have audio and causes minilm text embedding extraction error due to empty string
            #will not be used for hirest training set 
            continue
        if qa['video_id'] in list_of_bad_videos:
            continue
        qa_dict = {}
        video_id = qa['video_id'] + ".mp4"
        qa_dict[video_id] = {}
        qa_dict[video_id]['relevant'] = True
        qa_dict[video_id]['clip'] = True
        qa_dict[video_id]['v_duration'] = qa['video_length']
        qa_dict[video_id]['bounds'] = [qa['answer_start_second'], qa['answer_end_second']]
        qa_dict[video_id]['steps'] = []
        qa_dict['qid'] = qa['sample_id']
        f.writelines(qa['question'] + "\n")
        total_test_cnt += 1
        if os.path.exists(os.path.join(medvid_folder, qa['video_id'] + ".mp4")):
            new_train[qa['question']] = qa_dict
            #only append when video is available for downloading
            if qa['video_id'] not in vid_dict_train:
                vid_dict_train[video_id] = qa_dict[video_id]['v_duration']
            if qa['video_id'] not in video_names:
                video_names[qa['video_id']] = []
            continue
        vid = qa['video_id']
        dl_ops = {
        'outtmpl': f'{medvid_folder}/{vid}.mp4'
        }
        try:
            with YoutubeDL(dl_ops) as ydl:
                ydl.download(qa['video_url'])
            new_train.append(qa_dict)
            #only append when video is available for downloading
            if qa['video_id'] not in vid_dict_train:
                vid_dict_train[video_id] = qa_dict[video_id]['v_duration']
            if qa['video_id'] not in video_names:
                video_names[qa['video_id']] = []
            
        except:
            print("Unvailable ", qa['video_id'])
            unav_train_query_cnt += 1

with open(os.path.join(txt_file_root, "medvidqa_test_queries.txt"), "w") as f:
    for i, qa in enumerate(test):
        qa_dict = {}
        video_id = qa['video_id'] + ".mp4"
        qa_dict[video_id] = {}
        qa_dict[video_id]['relevant'] = True
        qa_dict[video_id]['clip'] = True
        qa_dict[video_id]['v_duration'] = qa['video_length']
        qa_dict[video_id]['bounds'] = [qa['answer_start_second'], qa['answer_end_second']]
        qa_dict[video_id]['steps'] = []
        qa_dict['qid'] = qa['sample_id']
        f.writelines(qa['question'] + "\n")
        total_test_cnt += 1
        if os.path.exists(os.path.join(medvid_folder, qa['video_id'] + ".mp4")):
            new_test[qa['question']] = qa_dict
            #only append when video is available for downloading
            if qa['video_id'] not in vid_dict_test:
                vid_dict_test[video_id] = qa_dict[video_id]['v_duration']
            if qa['video_id'] not in video_names:
                video_names[qa['video_id']] = []
            continue
        vid = qa['video_id']
        dl_ops = {
        'outtmpl': f'{medvid_folder}/{vid}.mp4'
        }
        try:
            with YoutubeDL(dl_ops) as ydl:
                ydl.download(qa['video_url'])
            new_test.append(qa_dict)
            #only append when video is available for downloading
            if qa['video_id'] not in vid_dict_test:
                vid_dict_test[video_id] = qa_dict[video_id]['v_duration']
            if qa['video_id'] not in video_names:
                video_names[qa['video_id']] = []
            
        except:
            print("Unvailable ", qa['video_id'])
            unav_test_query_cnt += 1

with open(os.path.join(txt_file_root, "medvidqa_val_queries.txt"), "w") as f:
    for i, qa in enumerate(val):
        qa_dict = {}
        video_id = qa['video_id'] + ".mp4"
        qa_dict[video_id] = {}
        qa_dict[video_id]['relevant'] = True
        qa_dict[video_id]['clip'] = True
        qa_dict[video_id]['v_duration'] = qa['video_length']
        qa_dict[video_id]['bounds'] = [qa['answer_start_second'], qa['answer_end_second']]
        qa_dict[video_id]['steps'] = []
        qa_dict['qid'] = qa['sample_id']
        f.writelines(qa['question'] + "\n")
        total_val_cnt += 1
        if os.path.exists(os.path.join(medvid_folder, qa['video_id'] + ".mp4")):
            new_val[qa['question']] = qa_dict
            if qa['video_id'] not in vid_dict_val:
                vid_dict_val[video_id] = qa_dict[video_id]['v_duration']
            if qa['video_id'] not in video_names:
                video_names[qa['video_id']] = []
            continue
        vid = qa['video_id']
        dl_ops = {
        'outtmpl': f'{medvid_folder}/{vid}.mp4'
        }
        try:
            with YoutubeDL(dl_ops) as ydl:
                ydl.download(qa['video_url'])
            new_val.append(qa_dict)
            #only append when video is available for downloading
            if qa['video_id'] not in vid_dict_val:
                vid_dict_val[video_id] = qa_dict[video_id]['v_duration']
            if qa['video_id'] not in video_names:
                video_names[qa['video_id']] = []
        except:
            print("Unvailable ", qa['video_id'])
            unav_val_query_cnt += 1
print(f"{unav_val_query_cnt}/ {total_val_cnt}, {unav_test_query_cnt} / {total_test_cnt} ")

0it [00:00, ?it/s]

[youtube] Extracting URL: https://www.youtube.com/watch?v=zWG4LWPraYw
[youtube] zWG4LWPraYw: Downloading webpage
[youtube] zWG4LWPraYw: Downloading ios player API JSON
[youtube] zWG4LWPraYw: Downloading android player API JSON


ERROR: [youtube] zWG4LWPraYw: Video unavailable
635it [00:05, 112.60it/s]

Unvailable  zWG4LWPraYw
[youtube] Extracting URL: https://www.youtube.com/watch?v=WU9-MmRe4_g
[youtube] WU9-MmRe4_g: Downloading webpage
[youtube] WU9-MmRe4_g: Downloading ios player API JSON
[youtube] WU9-MmRe4_g: Downloading android player API JSON


ERROR: [youtube] WU9-MmRe4_g: Private video. Sign in if you've been granted access to this video
902it [00:06, 154.27it/s]

Unvailable  WU9-MmRe4_g
[youtube] Extracting URL: https://www.youtube.com/watch?v=cmUXo4Crrm0
[youtube] cmUXo4Crrm0: Downloading webpage
[youtube] cmUXo4Crrm0: Downloading ios player API JSON
[youtube] cmUXo4Crrm0: Downloading android player API JSON


ERROR: [youtube] cmUXo4Crrm0: Private video. Sign in if you've been granted access to this video


Unvailable  cmUXo4Crrm0
[youtube] Extracting URL: https://www.youtube.com/watch?v=cmUXo4Crrm0
[youtube] cmUXo4Crrm0: Downloading webpage
[youtube] cmUXo4Crrm0: Downloading ios player API JSON
[youtube] cmUXo4Crrm0: Downloading android player API JSON


ERROR: [youtube] cmUXo4Crrm0: Private video. Sign in if you've been granted access to this video


Unvailable  cmUXo4Crrm0
[youtube] Extracting URL: https://www.youtube.com/watch?v=cmUXo4Crrm0
[youtube] cmUXo4Crrm0: Downloading webpage
[youtube] cmUXo4Crrm0: Downloading ios player API JSON
[youtube] cmUXo4Crrm0: Downloading android player API JSON


ERROR: [youtube] cmUXo4Crrm0: Private video. Sign in if you've been granted access to this video


Unvailable  cmUXo4Crrm0
[youtube] Extracting URL: https://www.youtube.com/watch?v=cmUXo4Crrm0
[youtube] cmUXo4Crrm0: Downloading webpage
[youtube] cmUXo4Crrm0: Downloading ios player API JSON
[youtube] cmUXo4Crrm0: Downloading android player API JSON


ERROR: [youtube] cmUXo4Crrm0: Private video. Sign in if you've been granted access to this video


Unvailable  cmUXo4Crrm0
[youtube] Extracting URL: https://www.youtube.com/watch?v=fdel7Zvq5s4
[youtube] fdel7Zvq5s4: Downloading webpage
[youtube] fdel7Zvq5s4: Downloading ios player API JSON
[youtube] fdel7Zvq5s4: Downloading android player API JSON


ERROR: [youtube] fdel7Zvq5s4: Private video. Sign in if you've been granted access to this video


Unvailable  fdel7Zvq5s4
[youtube] Extracting URL: https://www.youtube.com/watch?v=JifKGgGGlMg
[youtube] JifKGgGGlMg: Downloading webpage
[youtube] JifKGgGGlMg: Downloading ios player API JSON
[youtube] JifKGgGGlMg: Downloading android player API JSON


ERROR: [youtube] JifKGgGGlMg: Private video. Sign in if you've been granted access to this video
931it [00:09, 84.16it/s] 

Unvailable  JifKGgGGlMg
[youtube] Extracting URL: https://www.youtube.com/watch?v=JifKGgGGlMg
[youtube] JifKGgGGlMg: Downloading webpage
[youtube] JifKGgGGlMg: Downloading ios player API JSON
[youtube] JifKGgGGlMg: Downloading android player API JSON


ERROR: [youtube] JifKGgGGlMg: Private video. Sign in if you've been granted access to this video


Unvailable  JifKGgGGlMg
[youtube] Extracting URL: https://www.youtube.com/watch?v=JifKGgGGlMg
[youtube] JifKGgGGlMg: Downloading webpage
[youtube] JifKGgGGlMg: Downloading ios player API JSON
[youtube] JifKGgGGlMg: Downloading android player API JSON


ERROR: [youtube] JifKGgGGlMg: Private video. Sign in if you've been granted access to this video


Unvailable  JifKGgGGlMg
[youtube] Extracting URL: https://www.youtube.com/watch?v=JifKGgGGlMg
[youtube] JifKGgGGlMg: Downloading webpage
[youtube] JifKGgGGlMg: Downloading ios player API JSON
[youtube] JifKGgGGlMg: Downloading android player API JSON


ERROR: [youtube] JifKGgGGlMg: Private video. Sign in if you've been granted access to this video


Unvailable  JifKGgGGlMg
[youtube] Extracting URL: https://www.youtube.com/watch?v=o5EJGeKIGrk
[youtube] o5EJGeKIGrk: Downloading webpage
[youtube] o5EJGeKIGrk: Downloading ios player API JSON
[youtube] o5EJGeKIGrk: Downloading android player API JSON


ERROR: [youtube] o5EJGeKIGrk: Video unavailable


Unvailable  o5EJGeKIGrk
[youtube] Extracting URL: https://www.youtube.com/watch?v=o5EJGeKIGrk
[youtube] o5EJGeKIGrk: Downloading webpage
[youtube] o5EJGeKIGrk: Downloading ios player API JSON
[youtube] o5EJGeKIGrk: Downloading android player API JSON


ERROR: [youtube] o5EJGeKIGrk: Video unavailable


Unvailable  o5EJGeKIGrk
[youtube] Extracting URL: https://www.youtube.com/watch?v=o5EJGeKIGrk
[youtube] o5EJGeKIGrk: Downloading webpage
[youtube] o5EJGeKIGrk: Downloading ios player API JSON
[youtube] o5EJGeKIGrk: Downloading android player API JSON


ERROR: [youtube] o5EJGeKIGrk: Video unavailable
947it [00:12, 52.39it/s]

Unvailable  o5EJGeKIGrk
[youtube] Extracting URL: https://www.youtube.com/watch?v=ID4nnTGrkc8
[youtube] ID4nnTGrkc8: Downloading webpage
[youtube] ID4nnTGrkc8: Downloading ios player API JSON
[youtube] ID4nnTGrkc8: Downloading android player API JSON


ERROR: [youtube] ID4nnTGrkc8: Private video. Sign in if you've been granted access to this video
1046it [00:12, 65.07it/s]

Unvailable  ID4nnTGrkc8
[youtube] Extracting URL: https://www.youtube.com/watch?v=ID4nnTGrkc8
[youtube] ID4nnTGrkc8: Downloading webpage
[youtube] ID4nnTGrkc8: Downloading ios player API JSON
[youtube] ID4nnTGrkc8: Downloading android player API JSON


ERROR: [youtube] ID4nnTGrkc8: Private video. Sign in if you've been granted access to this video


Unvailable  ID4nnTGrkc8
[youtube] Extracting URL: https://www.youtube.com/watch?v=ID4nnTGrkc8
[youtube] ID4nnTGrkc8: Downloading webpage
[youtube] ID4nnTGrkc8: Downloading ios player API JSON
[youtube] ID4nnTGrkc8: Downloading android player API JSON


ERROR: [youtube] ID4nnTGrkc8: Private video. Sign in if you've been granted access to this video
1060it [00:13, 55.03it/s]

Unvailable  ID4nnTGrkc8
[youtube] Extracting URL: https://www.youtube.com/watch?v=B8WEfHMeWwg
[youtube] B8WEfHMeWwg: Downloading webpage
[youtube] B8WEfHMeWwg: Downloading ios player API JSON
[youtube] B8WEfHMeWwg: Downloading android player API JSON


ERROR: [youtube] B8WEfHMeWwg: Private video. Sign in if you've been granted access to this video


Unvailable  B8WEfHMeWwg
[youtube] Extracting URL: https://www.youtube.com/watch?v=B8WEfHMeWwg
[youtube] B8WEfHMeWwg: Downloading webpage
[youtube] B8WEfHMeWwg: Downloading ios player API JSON
[youtube] B8WEfHMeWwg: Downloading android player API JSON


ERROR: [youtube] B8WEfHMeWwg: Private video. Sign in if you've been granted access to this video


Unvailable  B8WEfHMeWwg
[youtube] Extracting URL: https://www.youtube.com/watch?v=B8WEfHMeWwg
[youtube] B8WEfHMeWwg: Downloading webpage
[youtube] B8WEfHMeWwg: Downloading ios player API JSON
[youtube] B8WEfHMeWwg: Downloading android player API JSON


ERROR: [youtube] B8WEfHMeWwg: Private video. Sign in if you've been granted access to this video


Unvailable  B8WEfHMeWwg
[youtube] Extracting URL: https://www.youtube.com/watch?v=B8WEfHMeWwg
[youtube] B8WEfHMeWwg: Downloading webpage
[youtube] B8WEfHMeWwg: Downloading ios player API JSON
[youtube] B8WEfHMeWwg: Downloading android player API JSON


ERROR: [youtube] B8WEfHMeWwg: Private video. Sign in if you've been granted access to this video


Unvailable  B8WEfHMeWwg
[youtube] Extracting URL: https://www.youtube.com/watch?v=B8WEfHMeWwg
[youtube] B8WEfHMeWwg: Downloading webpage
[youtube] B8WEfHMeWwg: Downloading ios player API JSON
[youtube] B8WEfHMeWwg: Downloading android player API JSON


ERROR: [youtube] B8WEfHMeWwg: Private video. Sign in if you've been granted access to this video
1070it [00:16, 34.18it/s]

Unvailable  B8WEfHMeWwg
[youtube] Extracting URL: https://www.youtube.com/watch?v=9fTAW_HAN2Y
[youtube] 9fTAW_HAN2Y: Downloading webpage
[youtube] 9fTAW_HAN2Y: Downloading ios player API JSON
[youtube] 9fTAW_HAN2Y: Downloading android player API JSON


ERROR: [youtube] 9fTAW_HAN2Y: Video unavailable
1097it [00:16, 36.80it/s]

Unvailable  9fTAW_HAN2Y
[youtube] Extracting URL: https://www.youtube.com/watch?v=9fTAW_HAN2Y
[youtube] 9fTAW_HAN2Y: Downloading webpage
[youtube] 9fTAW_HAN2Y: Downloading ios player API JSON
[youtube] 9fTAW_HAN2Y: Downloading android player API JSON


ERROR: [youtube] 9fTAW_HAN2Y: Video unavailable


Unvailable  9fTAW_HAN2Y
[youtube] Extracting URL: https://www.youtube.com/watch?v=9fTAW_HAN2Y
[youtube] 9fTAW_HAN2Y: Downloading webpage
[youtube] 9fTAW_HAN2Y: Downloading ios player API JSON
[youtube] 9fTAW_HAN2Y: Downloading android player API JSON


ERROR: [youtube] 9fTAW_HAN2Y: Video unavailable


Unvailable  9fTAW_HAN2Y
[youtube] Extracting URL: https://www.youtube.com/watch?v=9fTAW_HAN2Y
[youtube] 9fTAW_HAN2Y: Downloading webpage
[youtube] 9fTAW_HAN2Y: Downloading ios player API JSON
[youtube] 9fTAW_HAN2Y: Downloading android player API JSON


ERROR: [youtube] 9fTAW_HAN2Y: Video unavailable
1104it [00:18, 27.29it/s]

Unvailable  9fTAW_HAN2Y
[youtube] Extracting URL: https://www.youtube.com/watch?v=tVZLqXaE7-8
[youtube] tVZLqXaE7-8: Downloading webpage
[youtube] tVZLqXaE7-8: Downloading ios player API JSON
[youtube] tVZLqXaE7-8: Downloading android player API JSON


ERROR: [youtube] tVZLqXaE7-8: Private video. Sign in if you've been granted access to this video
1117it [00:18, 27.20it/s]

Unvailable  tVZLqXaE7-8
[youtube] Extracting URL: https://www.youtube.com/watch?v=tVZLqXaE7-8
[youtube] tVZLqXaE7-8: Downloading webpage
[youtube] tVZLqXaE7-8: Downloading ios player API JSON
[youtube] tVZLqXaE7-8: Downloading android player API JSON


ERROR: [youtube] tVZLqXaE7-8: Private video. Sign in if you've been granted access to this video
1121it [00:18, 24.26it/s]

Unvailable  tVZLqXaE7-8
[youtube] Extracting URL: https://www.youtube.com/watch?v=AfTih6YgmvU
[youtube] AfTih6YgmvU: Downloading webpage
[youtube] AfTih6YgmvU: Downloading ios player API JSON
[youtube] AfTih6YgmvU: Downloading android player API JSON


ERROR: [youtube] AfTih6YgmvU: Video unavailable. This video is no longer available because the YouTube account associated with this video has been terminated.
1210it [00:19, 54.36it/s]

Unvailable  AfTih6YgmvU
[youtube] Extracting URL: https://www.youtube.com/watch?v=PwoymvaC258
[youtube] PwoymvaC258: Downloading webpage
[youtube] PwoymvaC258: Downloading ios player API JSON
[youtube] PwoymvaC258: Downloading android player API JSON


ERROR: [youtube] PwoymvaC258: Video unavailable
1305it [00:19, 85.48it/s]

Unvailable  PwoymvaC258
[youtube] Extracting URL: https://www.youtube.com/watch?v=mbho7AJOEyA
[youtube] mbho7AJOEyA: Downloading webpage
[youtube] mbho7AJOEyA: Downloading ios player API JSON
[youtube] mbho7AJOEyA: Downloading android player API JSON


ERROR: [youtube] mbho7AJOEyA: Video unavailable
1329it [00:20, 78.07it/s]

Unvailable  mbho7AJOEyA
[youtube] Extracting URL: https://www.youtube.com/watch?v=mbho7AJOEyA
[youtube] mbho7AJOEyA: Downloading webpage
[youtube] mbho7AJOEyA: Downloading ios player API JSON
[youtube] mbho7AJOEyA: Downloading android player API JSON


ERROR: [youtube] mbho7AJOEyA: Video unavailable


Unvailable  mbho7AJOEyA
[youtube] Extracting URL: https://www.youtube.com/watch?v=mbho7AJOEyA
[youtube] mbho7AJOEyA: Downloading webpage
[youtube] mbho7AJOEyA: Downloading ios player API JSON
[youtube] mbho7AJOEyA: Downloading android player API JSON


ERROR: [youtube] mbho7AJOEyA: Video unavailable
1339it [00:21, 53.18it/s]

Unvailable  mbho7AJOEyA
[youtube] Extracting URL: https://www.youtube.com/watch?v=IiJI-_VGNnk
[youtube] IiJI-_VGNnk: Downloading webpage
[youtube] IiJI-_VGNnk: Downloading ios player API JSON
[youtube] IiJI-_VGNnk: Downloading android player API JSON


ERROR: [youtube] IiJI-_VGNnk: Video unavailable
1508it [00:21, 121.72it/s]

Unvailable  IiJI-_VGNnk
[youtube] Extracting URL: https://www.youtube.com/watch?v=OTj3KxprO40
[youtube] OTj3KxprO40: Downloading webpage
[youtube] OTj3KxprO40: Downloading ios player API JSON
[youtube] OTj3KxprO40: Downloading android player API JSON


ERROR: [youtube] OTj3KxprO40: Private video. Sign in if you've been granted access to this video
1565it [00:22, 116.41it/s]

Unvailable  OTj3KxprO40
[youtube] Extracting URL: https://www.youtube.com/watch?v=tfDwIeBHo-M
[youtube] tfDwIeBHo-M: Downloading webpage
[youtube] tfDwIeBHo-M: Downloading ios player API JSON
[youtube] tfDwIeBHo-M: Downloading android player API JSON


ERROR: [youtube] tfDwIeBHo-M: Video unavailable
1836it [00:22, 230.01it/s]

Unvailable  tfDwIeBHo-M
[youtube] Extracting URL: https://www.youtube.com/watch?v=tfDwIeBHo-M
[youtube] tfDwIeBHo-M: Downloading webpage
[youtube] tfDwIeBHo-M: Downloading ios player API JSON
[youtube] tfDwIeBHo-M: Downloading android player API JSON


ERROR: [youtube] tfDwIeBHo-M: Video unavailable
1863it [00:23, 187.04it/s]

Unvailable  tfDwIeBHo-M
[youtube] Extracting URL: https://www.youtube.com/watch?v=T2YKnBmoq2E
[youtube] T2YKnBmoq2E: Downloading webpage
[youtube] T2YKnBmoq2E: Downloading ios player API JSON
[youtube] T2YKnBmoq2E: Downloading android player API JSON


ERROR: [youtube] T2YKnBmoq2E: Private video. Sign in if you've been granted access to this video
2093it [00:23, 260.70it/s]

Unvailable  T2YKnBmoq2E
[youtube] Extracting URL: https://www.youtube.com/watch?v=T2YKnBmoq2E
[youtube] T2YKnBmoq2E: Downloading webpage
[youtube] T2YKnBmoq2E: Downloading ios player API JSON
[youtube] T2YKnBmoq2E: Downloading android player API JSON


ERROR: [youtube] T2YKnBmoq2E: Private video. Sign in if you've been granted access to this video


Unvailable  T2YKnBmoq2E
[youtube] Extracting URL: https://www.youtube.com/watch?v=T2YKnBmoq2E
[youtube] T2YKnBmoq2E: Downloading webpage
[youtube] T2YKnBmoq2E: Downloading ios player API JSON
[youtube] T2YKnBmoq2E: Downloading android player API JSON


ERROR: [youtube] T2YKnBmoq2E: Private video. Sign in if you've been granted access to this video


Unvailable  T2YKnBmoq2E
[youtube] Extracting URL: https://www.youtube.com/watch?v=T2YKnBmoq2E
[youtube] T2YKnBmoq2E: Downloading webpage
[youtube] T2YKnBmoq2E: Downloading ios player API JSON
[youtube] T2YKnBmoq2E: Downloading android player API JSON


ERROR: [youtube] T2YKnBmoq2E: Private video. Sign in if you've been granted access to this video


Unvailable  T2YKnBmoq2E
[youtube] Extracting URL: https://www.youtube.com/watch?v=T2YKnBmoq2E
[youtube] T2YKnBmoq2E: Downloading webpage
[youtube] T2YKnBmoq2E: Downloading ios player API JSON
[youtube] T2YKnBmoq2E: Downloading android player API JSON


ERROR: [youtube] T2YKnBmoq2E: Private video. Sign in if you've been granted access to this video
2121it [00:25, 111.65it/s]

Unvailable  T2YKnBmoq2E
[youtube] Extracting URL: https://www.youtube.com/watch?v=uyqOCjCXzfY
[youtube] uyqOCjCXzfY: Downloading webpage
[youtube] uyqOCjCXzfY: Downloading ios player API JSON
[youtube] uyqOCjCXzfY: Downloading android player API JSON


ERROR: [youtube] uyqOCjCXzfY: Private video. Sign in if you've been granted access to this video
2198it [00:26, 113.61it/s]

Unvailable  uyqOCjCXzfY
[youtube] Extracting URL: https://www.youtube.com/watch?v=ADcotG388SI
[youtube] ADcotG388SI: Downloading webpage
[youtube] ADcotG388SI: Downloading ios player API JSON
[youtube] ADcotG388SI: Downloading android player API JSON


ERROR: [youtube] ADcotG388SI: Video unavailable


Unvailable  ADcotG388SI
[youtube] Extracting URL: https://www.youtube.com/watch?v=ADcotG388SI
[youtube] ADcotG388SI: Downloading webpage
[youtube] ADcotG388SI: Downloading ios player API JSON
[youtube] ADcotG388SI: Downloading android player API JSON


ERROR: [youtube] ADcotG388SI: Video unavailable


Unvailable  ADcotG388SI
[youtube] Extracting URL: https://www.youtube.com/watch?v=ADcotG388SI
[youtube] ADcotG388SI: Downloading webpage
[youtube] ADcotG388SI: Downloading ios player API JSON
[youtube] ADcotG388SI: Downloading android player API JSON


ERROR: [youtube] ADcotG388SI: Video unavailable
2216it [00:27, 72.88it/s] 

Unvailable  ADcotG388SI
[youtube] Extracting URL: https://www.youtube.com/watch?v=Jq2vsUBanR4
[youtube] Jq2vsUBanR4: Downloading webpage
[youtube] Jq2vsUBanR4: Downloading ios player API JSON
[youtube] Jq2vsUBanR4: Downloading android player API JSON


ERROR: [youtube] Jq2vsUBanR4: Video unavailable
2441it [00:28, 140.88it/s]

Unvailable  Jq2vsUBanR4
[youtube] Extracting URL: https://www.youtube.com/watch?v=Jq2vsUBanR4
[youtube] Jq2vsUBanR4: Downloading webpage
[youtube] Jq2vsUBanR4: Downloading ios player API JSON
[youtube] Jq2vsUBanR4: Downloading android player API JSON


ERROR: [youtube] Jq2vsUBanR4: Video unavailable
2465it [00:28, 125.44it/s]

Unvailable  Jq2vsUBanR4
[youtube] Extracting URL: https://www.youtube.com/watch?v=o0I9DWl7ARA
[youtube] o0I9DWl7ARA: Downloading webpage
[youtube] o0I9DWl7ARA: Downloading ios player API JSON
[youtube] o0I9DWl7ARA: Downloading android player API JSON


ERROR: [youtube] o0I9DWl7ARA: Video unavailable


Unvailable  o0I9DWl7ARA
[youtube] Extracting URL: https://www.youtube.com/watch?v=o0I9DWl7ARA
[youtube] o0I9DWl7ARA: Downloading webpage
[youtube] o0I9DWl7ARA: Downloading ios player API JSON
[youtube] o0I9DWl7ARA: Downloading android player API JSON


ERROR: [youtube] o0I9DWl7ARA: Video unavailable


Unvailable  o0I9DWl7ARA
[youtube] Extracting URL: https://www.youtube.com/watch?v=NuqoG0PALHs
[youtube] NuqoG0PALHs: Downloading webpage
[youtube] NuqoG0PALHs: Downloading ios player API JSON
[youtube] NuqoG0PALHs: Downloading android player API JSON


ERROR: [youtube] NuqoG0PALHs: Private video. Sign in if you've been granted access to this video


Unvailable  NuqoG0PALHs
[youtube] Extracting URL: https://www.youtube.com/watch?v=NuqoG0PALHs
[youtube] NuqoG0PALHs: Downloading webpage
[youtube] NuqoG0PALHs: Downloading ios player API JSON
[youtube] NuqoG0PALHs: Downloading android player API JSON


ERROR: [youtube] NuqoG0PALHs: Private video. Sign in if you've been granted access to this video


Unvailable  NuqoG0PALHs
[youtube] Extracting URL: https://www.youtube.com/watch?v=NuqoG0PALHs
[youtube] NuqoG0PALHs: Downloading webpage
[youtube] NuqoG0PALHs: Downloading ios player API JSON
[youtube] NuqoG0PALHs: Downloading android player API JSON


ERROR: [youtube] NuqoG0PALHs: Private video. Sign in if you've been granted access to this video


Unvailable  NuqoG0PALHs
[youtube] Extracting URL: https://www.youtube.com/watch?v=NuqoG0PALHs
[youtube] NuqoG0PALHs: Downloading webpage
[youtube] NuqoG0PALHs: Downloading ios player API JSON
[youtube] NuqoG0PALHs: Downloading android player API JSON


ERROR: [youtube] NuqoG0PALHs: Private video. Sign in if you've been granted access to this video


Unvailable  NuqoG0PALHs
[youtube] Extracting URL: https://www.youtube.com/watch?v=NuqoG0PALHs
[youtube] NuqoG0PALHs: Downloading webpage
[youtube] NuqoG0PALHs: Downloading ios player API JSON
[youtube] NuqoG0PALHs: Downloading android player API JSON


ERROR: [youtube] NuqoG0PALHs: Private video. Sign in if you've been granted access to this video


Unvailable  NuqoG0PALHs
[youtube] Extracting URL: https://www.youtube.com/watch?v=NuqoG0PALHs
[youtube] NuqoG0PALHs: Downloading webpage
[youtube] NuqoG0PALHs: Downloading ios player API JSON
[youtube] NuqoG0PALHs: Downloading android player API JSON


ERROR: [youtube] NuqoG0PALHs: Private video. Sign in if you've been granted access to this video
2710it [00:32, 83.14it/s] 

Unvailable  NuqoG0PALHs





[youtube] Extracting URL: https://www.youtube.com/watch?v=5gPyNNXP-wA
[youtube] 5gPyNNXP-wA: Downloading webpage
[youtube] 5gPyNNXP-wA: Downloading ios player API JSON
[youtube] 5gPyNNXP-wA: Downloading android player API JSON


ERROR: [youtube] 5gPyNNXP-wA: Private video. Sign in if you've been granted access to this video


Unvailable  5gPyNNXP-wA
[youtube] Extracting URL: https://www.youtube.com/watch?v=5gPyNNXP-wA
[youtube] 5gPyNNXP-wA: Downloading webpage
[youtube] 5gPyNNXP-wA: Downloading ios player API JSON
[youtube] 5gPyNNXP-wA: Downloading android player API JSON


ERROR: [youtube] 5gPyNNXP-wA: Private video. Sign in if you've been granted access to this video


Unvailable  5gPyNNXP-wA
[youtube] Extracting URL: https://www.youtube.com/watch?v=5gPyNNXP-wA
[youtube] 5gPyNNXP-wA: Downloading webpage
[youtube] 5gPyNNXP-wA: Downloading ios player API JSON
[youtube] 5gPyNNXP-wA: Downloading android player API JSON


ERROR: [youtube] 5gPyNNXP-wA: Private video. Sign in if you've been granted access to this video


Unvailable  5gPyNNXP-wA
[youtube] Extracting URL: https://www.youtube.com/watch?v=5gPyNNXP-wA
[youtube] 5gPyNNXP-wA: Downloading webpage
[youtube] 5gPyNNXP-wA: Downloading ios player API JSON
[youtube] 5gPyNNXP-wA: Downloading android player API JSON


ERROR: [youtube] 5gPyNNXP-wA: Private video. Sign in if you've been granted access to this video


Unvailable  5gPyNNXP-wA
[youtube] Extracting URL: https://www.youtube.com/watch?v=5gPyNNXP-wA
[youtube] 5gPyNNXP-wA: Downloading webpage
[youtube] 5gPyNNXP-wA: Downloading ios player API JSON
[youtube] 5gPyNNXP-wA: Downloading android player API JSON


ERROR: [youtube] 5gPyNNXP-wA: Private video. Sign in if you've been granted access to this video


Unvailable  5gPyNNXP-wA
[youtube] Extracting URL: https://www.youtube.com/watch?v=5gPyNNXP-wA
[youtube] 5gPyNNXP-wA: Downloading webpage
[youtube] 5gPyNNXP-wA: Downloading ios player API JSON
[youtube] 5gPyNNXP-wA: Downloading android player API JSON


ERROR: [youtube] 5gPyNNXP-wA: Private video. Sign in if you've been granted access to this video


Unvailable  5gPyNNXP-wA
[youtube] Extracting URL: https://www.youtube.com/watch?v=5gPyNNXP-wA
[youtube] 5gPyNNXP-wA: Downloading webpage
[youtube] 5gPyNNXP-wA: Downloading ios player API JSON
[youtube] 5gPyNNXP-wA: Downloading android player API JSON


ERROR: [youtube] 5gPyNNXP-wA: Private video. Sign in if you've been granted access to this video


Unvailable  5gPyNNXP-wA
[youtube] Extracting URL: https://www.youtube.com/watch?v=0YsX8qv0FaI
[youtube] 0YsX8qv0FaI: Downloading webpage
[youtube] 0YsX8qv0FaI: Downloading ios player API JSON
[youtube] 0YsX8qv0FaI: Downloading android player API JSON


ERROR: [youtube] 0YsX8qv0FaI: Private video. Sign in if you've been granted access to this video


Unvailable  0YsX8qv0FaI
[youtube] Extracting URL: https://www.youtube.com/watch?v=0YsX8qv0FaI
[youtube] 0YsX8qv0FaI: Downloading webpage
[youtube] 0YsX8qv0FaI: Downloading ios player API JSON
[youtube] 0YsX8qv0FaI: Downloading android player API JSON


ERROR: [youtube] 0YsX8qv0FaI: Private video. Sign in if you've been granted access to this video


Unvailable  0YsX8qv0FaI
[youtube] Extracting URL: https://www.youtube.com/watch?v=ZO_G0-jBVH8
[youtube] ZO_G0-jBVH8: Downloading webpage
[youtube] ZO_G0-jBVH8: Downloading ios player API JSON
[youtube] ZO_G0-jBVH8: Downloading android player API JSON


ERROR: [youtube] ZO_G0-jBVH8: Video unavailable


Unvailable  ZO_G0-jBVH8
[youtube] Extracting URL: https://www.youtube.com/watch?v=ZO_G0-jBVH8
[youtube] ZO_G0-jBVH8: Downloading webpage
[youtube] ZO_G0-jBVH8: Downloading ios player API JSON
[youtube] ZO_G0-jBVH8: Downloading android player API JSON


ERROR: [youtube] ZO_G0-jBVH8: Video unavailable


Unvailable  ZO_G0-jBVH8
4/ 145, 7 / 2844 


In [39]:
save_json(new_train, f'{vid_json_folder}/all_data_train.json')
save_json(new_val, f'{vid_json_folder}/all_data_val.json')
save_json(new_test, f'{vid_json_folder}/all_data_test.json')

In [50]:
#dont need to run it, this is for verifying vid length and frame extraction
raw_frames_path = "/home/hlpark/REDUCE/REDUCE_benchmarks/HiREST/data/medvidqa/raw_frames"
for r, d, f in os.walk(raw_frames_path):
    for dir in d:
        for _, _, files in os.walk(os.path.join(r, dir)):
            if dir in vid_dict_val and vid_dict_val[dir]!= len(files):
                print(dir, len(files), vid_dict_val[dir])
            elif dir in vid_dict_test and vid_dict_test[dir] != len(files):
                print(dir, len(files), vid_dict_test[dir])
            else:
                if dir not in vid_dict_val and dir not in vid_dict_test:
                    print("not in any")

O1kiguGUt3o.mp4 98 97
5GO18A5-ZtQ.mp4 573 572
CvW-Zq3NlkU.mp4 84 83
wkF8OwsyNNQ.mp4 118 117
nopgqBoUmYg.mp4 103 102
71LlfdW548U.mp4 312 311
gJOMV2mZ1B0.mp4 72 71
E92qqAftUak.mp4 78 77
s4wUKmr2xtA.mp4 827 826
pYzUQb79_Rw.mp4 326 325
tr7JKUxwqwA.mp4 303 302
2oXoiQfija4.mp4 408 407
AtKU8zw2Jxg.mp4 596 595
N59oOxyaE1A.mp4 618 617
RRLY6anXXQo.mp4 71 70
lwhYbWFtNuE.mp4 137 136
hJ3XICFLxvU.mp4 361 360
lfhDPB8LaTk.mp4 392 391
bUgV89cdfTM.mp4 105 104
6kQEDRQdJZ8.mp4 583 582
kSpggqOLgaU.mp4 451 450
0dr5yuoBOF4.mp4 377 376
M6WV95X0fRk.mp4 91 90
fsdchGejKmU.mp4 618 617
g-gNQPyxU4c.mp4 686 685
UZqktEPlxTU.mp4 119 118
LL854--GBy4.mp4 586 585
VykLxY9mzug.mp4 742 741
GUg4zMpchhk.mp4 530 529
1Nr6wPFz09A.mp4 227 226
B9yiDTONlOs.mp4 157 156
hekjiCqb9-g.mp4 105 104
-Zx3pWhMBjw.mp4 234 233
4h3V6F4Rl_k.mp4 406 405
vgW6ZD_QwwQ.mp4 411 410
CWRWeoaqi8k.mp4 423 422
R9oWYkK_l3M.mp4 165 164
xYFeFFzFftw.mp4 566 565
not in any
not in any
not in any
not in any
not in any
not in any
not in any
not in any
not in any
n

In [92]:
weak_label_path = "/home/hlpark/REDUCE/REDUCE_benchmarks/HiREST/data/splits/medvidqa/five_labeled_pred_med_from_gt_vid_dict.json"
weak_label = json.load(open(weak_label_path))

In [93]:
list_of_bad_videos = ['SztsZNp-jDM', 'OKXoHwkx55c', 'V9j5JkWGwI8', 'ehl2MPczYoQ', 'mMNloo140pU', 'QwhD5UTUW60', 'Ehan_VI7p4c']

In [106]:
error_videos = ['G2tP1mJHbAg', 'lUBQ7-uOWKw', 'GSbjwzyuUoo', 'f_w82yx87KI', 'ILoZc6rpdU8', 'Q8m25MkEr0o', 'EnAyXA3_Mo4', 'Ur7UCjQTV9E', '6_hrMBnA1VA', 'Mx0PPNwfdvE', 'Q1ifxZ3lR2o']

In [107]:
new_train_med = {}
new_train_nonmed = {}
new_train_both = {}
for i, qa in tqdm(enumerate(train)):
    if "aNct84-0WZk" in qa["video_id"]:
        #this video doesn't have audio and causes minilm text embedding extraction error due to empty string
        #will not be used for hirest training set 
        continue
    if qa['video_id'] in list_of_bad_videos + error_videos:
        continue
    qa_dict = {}
    video_id = qa['video_id'] + ".mp4"
    qa_dict[video_id] = {}
    qa_dict[video_id]['relevant'] = True
    qa_dict[video_id]['clip'] = True
    qa_dict[video_id]['v_duration'] = qa['video_length']
    qa_dict[video_id]['bounds'] = [qa['answer_start_second'], qa['answer_end_second']]
    qa_dict[video_id]['steps'] = []
    qa_dict['qid'] = qa['sample_id']
    
    if qa['video_id'] in weak_label:
        for idx, q in enumerate(weak_label[qa['video_id']]):
            if qa['question'] not in q:
                continue
            if q[qa['question']] == "med":
                new_train_med[qa['question']] = qa_dict
            elif q[qa['question']] == "nonmed":
                new_train_nonmed[qa['question']] = qa_dict
            new_train_both[qa['question']] = qa_dict
        


2710it [00:00, 74896.81it/s]


In [108]:
save_json(new_train_med, f'{vid_json_folder}/visual_medical_train.json')
save_json(new_train_nonmed, f'{vid_json_folder}/visual_nonmedical_train.json')
save_json(new_train_both, f'{vid_json_folder}/postprocessed_train.json')