In [1]:
import os
import json
import pandas as pd
from moviepy.editor import VideoFileClip
from tqdm import tqdm

In [2]:
def save_json(content, save_path):
    """Write `content` to `save_path` as a single JSON document.

    The target file is opened in text write mode, so any existing file at
    `save_path` is overwritten.
    """
    with open(save_path, 'w') as out_file:
        payload = json.dumps(content)
        out_file.write(payload)
def load_jsonl(filename):
    """Read a JSON Lines file and return its records as a list.

    Each non-blank line of `filename` is parsed with `json.loads`. Blank or
    whitespace-only lines (for example a trailing empty line) are skipped
    instead of raising a decode error.

    Args:
        filename: path of the file to read.

    Returns:
        A list with one parsed JSON value per non-blank line, in file order.

    Raises:
        json.JSONDecodeError: if a non-blank line is not valid JSON.
    """
    records = []
    with open(filename, "r") as f:
        # Iterate the handle directly so the file is streamed line by line
        # rather than being read into memory in full first.
        for line in f:
            if not line.strip():
                continue
            records.append(json.loads(line))
    return records

In [3]:
# Directory configuration for this notebook.
# tvqa_folder is the directory that is walked further down to read the
# duration of every video file it contains.
tvqa_folder = "/home/hlpark/shared/TVQA/video/video_files/"
eval_folder = "/home/hlpark/REDUCE/REDUCE_benchmarks/HiREST/data/evaluation/tvqa"
# vid_json_folder receives the JSON files written by the save_json calls below.
vid_json_folder = "/home/hlpark/REDUCE/REDUCE_benchmarks/HiREST/data/splits/tvqa"

# exist_ok=True makes the creation idempotent and avoids the race between a
# separate "does it exist?" check and the makedirs call.
for _folder in (tvqa_folder, eval_folder, vid_json_folder):
    os.makedirs(_folder, exist_ok=True)

In [4]:
# Path of the JSON file that holds the per-video durations.
vid_duration_json = (
    "/home/hlpark/REDUCE/REDUCE_benchmarks/HiREST/data/splits/tvqa"
    "/video_duration.json"
)

In [5]:
# Build (or reload) the {file name: duration} mapping for every video file.
# The mapping is cached in vid_duration_json, so the expensive pass over the
# video files only happens when that cache file does not exist yet.
if not os.path.exists(vid_duration_json):
    video_duration_dict = {}
    # os.walk yields file names relative to `root`, so paths are joined
    # against `root` (not tvqa_folder); otherwise files located in
    # sub-directories would resolve to paths that do not exist.
    for root, _subdirs, files in os.walk(tvqa_folder):
        for fname in tqdm(files):
            video_path = os.path.join(root, fname)
            # NOTE(review): for a file without the ".mp4" suffix this opens
            # "<name>.mp4" while the key stays the bare file name; later
            # lookups use vid_name + ".mp4" as key - confirm this is intended.
            if not video_path.endswith(".mp4"):
                video_path = video_path + ".mp4"
            video = VideoFileClip(video_path)
            try:
                video_duration_dict[fname] = video.duration
            finally:
                # Release the reader held by the clip; without this every
                # processed video keeps its resources open until kernel exit.
                video.close()
    save_json(video_duration_dict, vid_duration_json)
else:
    # The cache is one plain JSON document, so read it with json.load.
    with open(vid_duration_json, "r") as _cache_file:
        video_duration_dict = json.load(_cache_file)

In [6]:
# Locations of the TVQA question/answer JSONL files, one per split.
train_path, val_path, test_path = (
    '/home/hlpark/shared/TVQA/tvqa_qa_release/tvqa_train.jsonl',
    '/home/hlpark/shared/TVQA/tvqa_qa_release/tvqa_val.jsonl',
    '/home/hlpark/shared/TVQA/tvqa_qa_release/tvqa_test_public.jsonl',
)

In [7]:
# Parse each split file into a list holding one record per line,
# in the order train, val, test.
train, val, test = (
    load_jsonl(split_path) for split_path in (train_path, val_path, test_path)
)

In [8]:
# Number of records loaded from the public test split file.
print(len(test))

7623


In [9]:
# Directory in which the tvqa_*_queries.txt files are written further down.
txt_file_root_path = "/home/hlpark/shared/TVQA"

In [14]:
# Convert every TVQA split into a {question text: entry} dict and write the
# question texts, one per line, to tvqa_<split>_queries.txt.
new_train = {}
new_val = {}
new_test = {}
# NOTE(review): the names below are never read in this cell; they are kept
# defined in case another cell of the notebook still refers to them.
video_names = {}
vid_dict_train, vid_dict_val, vid_dict_test = {}, {}, {}
unav_train_query_cnt, unav_val_query_cnt, unav_test_query_cnt = 0, 0, 0


def _convert_split(records, out_dict, query_file_name, with_answer):
    """Fill `out_dict` from one split and write its questions to a text file.

    Args:
        records: iterable of question records; each one is read through the
            keys 'vid_name', 'ts', 'qid', 'q' and, when `with_answer` is
            true, 'answer_idx' plus the matching 'a<idx>' key.
        out_dict: dict that receives one entry per question text. Records
            sharing the same question text overwrite each other here, while
            the query file still gets one line per record.
        query_file_name: file name created inside txt_file_root_path.
        with_answer: whether to store the text of the correct answer.

    Raises:
        KeyError: if a record lacks a required key or its video is missing
            from video_duration_dict.
        IndexError: if 'ts' contains no '-' separator.
    """
    with open(os.path.join(txt_file_root_path, query_file_name), "w") as f:
        for qa in records:
            video_id = qa['vid_name'] + ".mp4"
            # 'ts' has the form "<start>-<end>"; both bounds stay strings.
            ts_parts = qa['ts'].split('-')
            qa_dict = {
                video_id: {
                    'relevant': True,
                    'clip': True,
                    'bounds': [ts_parts[0], ts_parts[1]],
                    'steps': [],
                    'v_duration': video_duration_dict[video_id],
                },
                'qid': qa['qid'],
            }
            if with_answer:
                qa_dict['answer'] = qa['a' + str(qa['answer_idx'])]
            out_dict[qa['q']] = qa_dict
            # write() emits the line in one call; writelines() on a str
            # would iterate it character by character.
            f.write(qa['q'] + "\n")


_convert_split(train, new_train, "tvqa_train_queries.txt", with_answer=True)
# The test split is converted without the 'answer' field.
_convert_split(test, new_test, "tvqa_test_queries.txt", with_answer=False)
_convert_split(val, new_val, "tvqa_val_queries.txt", with_answer=True)

In [15]:
# Write the three converted split dicts to vid_json_folder
# (order: val, test, train).
for _split_data, _split_file in (
    (new_val, f'{vid_json_folder}/all_data_val.json'),
    (new_test, f'{vid_json_folder}/all_data_test.json'),
    (new_train, f'{vid_json_folder}/all_data_train.json'),
):
    save_json(_split_data, _split_file)

In [16]:
# JSON file with the weak labels that are looked up by vid_name further down.
weak_label_path = "/home/hlpark/REDUCE/REDUCE_benchmarks/HiREST/data/splits/tvqa/five_labeled_pred_med_train_from_gt_vid_dict.json"
# A context manager closes the file handle as soon as parsing is done;
# json.load(open(...)) would leave it open until garbage collection.
with open(weak_label_path) as _weak_label_file:
    weak_label = json.load(_weak_label_file)

In [17]:
# Compare the number of top-level entries in weak_label with the number of
# training records.
print(len(weak_label))
print(len(train))

17435
122039


In [18]:
# Partition the training questions by their weak label.
#   new_train_med    - questions labelled "med"
#   new_train_nonmed - questions labelled "nonmed"
#   new_train_both   - every question that has a weak-label entry at all
new_train_med = {}
new_train_nonmed = {}
new_train_both = {}
# Iterating `train` directly (instead of enumerate(train)) lets tqdm read
# len(train) and show a real progress bar; the index was never used.
for qa in tqdm(train):
    # weak_label is keyed by the bare vid_name (without ".mp4"). Videos with
    # no weak labels contribute nothing, so skip them before building an entry.
    if qa['vid_name'] not in weak_label:
        continue

    video_id = qa['vid_name'] + ".mp4"
    # 'ts' has the form "<start>-<end>"; both bounds stay strings.
    ts_parts = qa['ts'].split('-')
    qa_dict = {
        video_id: {
            'relevant': True,
            'clip': True,
            'v_duration': video_duration_dict[video_id],
            'bounds': [ts_parts[0], ts_parts[1]],
            'steps': [],
        },
        'qid': qa['qid'],
    }

    # Presumably each item maps question text -> label; the code only relies
    # on `in` and item lookup by question text - TODO confirm the schema.
    for q in weak_label[qa['vid_name']]:
        if qa['q'] not in q:
            continue
        label = q[qa['q']]
        if label == "med":
            new_train_med[qa['q']] = qa_dict
        elif label == "nonmed":
            new_train_nonmed[qa['q']] = qa_dict
        # Recorded for any label value, including ones that are neither
        # "med" nor "nonmed".
        new_train_both[qa['q']] = qa_dict
        


122039it [00:01, 100198.33it/s]


In [19]:
# Write the three weak-label partitions of the training set to
# vid_json_folder (order: med, nonmed, both).
for _partition, _partition_file in (
    (new_train_med, f'{vid_json_folder}/visual_medical_train.json'),
    (new_train_nonmed, f'{vid_json_folder}/visual_nonmedical_train.json'),
    (new_train_both, f'{vid_json_folder}/postprocessed_full_train.json'),
):
    save_json(_partition, _partition_file)