In [9]:
import os
import json
import pandas as pd
from moviepy.editor import *
from decord import VideoReader
import numpy as np
import math

In [10]:
downsampling_rate= 8
def verify_frame_len(video_path, frame_idx):
    if video_path.endswith(".mp4"):
        video = VideoFileClip(video_path)
    else:
        video = VideoFileClip(video_path + ".mp4")
    n_frames = video.reader.nframes
    if n_frames // (downsampling_rate * video.fps) != max(frame_idx) + 1:
        print("ERROR")

In [11]:
downsampling_rate= 8

def smooth_interval(timespan, video_path):
    if video_path.endswith(".mp4"):
        video = VideoFileClip(video_path)
    else:
        video = VideoFileClip(video_path + ".mp4")
    duration = video.duration
    converted_timespan = []
    for start, end in timespan:
        new_start = start * downsampling_rate + downsampling_rate/2
        new_end = end * 8 + downsampling_rate/2
        if new_end > duration:
            print("ERROR")
        converted_timespan.append([new_start, new_end])
    return converted_timespan

In [12]:
def find_consecutive_timestamps(timestamps):
    """
    Function to find consecutive timestamps in a list and record the start and end time.
    """
    if not timestamps:
        return []

    # Initialize the first start time and the result list
    start = timestamps[0]
    result = []
    
    for i in range(1, len(timestamps)):
        # Check if the current timestamp is not consecutive
        if timestamps[i] != timestamps[i-1] + 1:
            # Record the previous consecutive sequence
            result.append([start, timestamps[i-1]])
            # Update the start for the new sequence
            start = timestamps[i]

    # Add the last sequence
    result.append([start, timestamps[-1]])

    return result

# Example usage
# timestamps = [0, 1, 2, 3, 4, 8, 10, 11, 12]
# find_consecutive_timestamps(timestamps)

In [13]:
def calculate_iou(ground_truth, predictions):
    """
    Calculate the Intersection over Union (IoU) for video moment retrieval.
    
    :param ground_truth: A tuple representing the ground truth interval (start, end).
    :param predictions: A list of tuples representing predicted intervals [(start1, end1), (start2, end2), ...].
    :return: IoU score.
    """
    GT_start, GT_end = ground_truth
    total_intersection = 0
    total_union = 0

    for (P_start, P_end) in predictions:
        # Calculate intersection
        intersection = max(0, min(GT_end, P_end) - max(GT_start, P_start))
        total_intersection += intersection

        # Calculate union for this predicted interval
        union = (P_end - P_start)  - intersection
        total_union += union
    total_union += (GT_end - GT_start)
    # Avoid division by zero
    if total_union == 0:
        return 0

    # Calculate IoU
    iou = total_intersection / total_union
    return iou

In [14]:
def save_json(content, save_path):
    with open(save_path, 'w') as f:
        f.write(json.dumps(content))
def load_jsonl(filename):
    with open(filename, "r") as f:
        return [json.loads(l.strip("\n")) for l in f.readlines()]
        # return json.loads(f)

In [15]:
#set folder path
root_path = "/home/hlpark/REDUCE/REDUCE_benchmarks/SeViLA/sevila_data/medvidqa"
video_root = "/home/hlpark/shared/MedVidQA/video"
tvqa_result_root_val = "/home/hlpark/REDUCE/REDUCE_benchmarks/SeViLA/sevila_medvidqa_result"
tvqa_result_root_test = "/home/hlpark/REDUCE/REDUCE_benchmarks/SeViLA/sevila_medvidqa_result_test"

eval_path = "/home/hlpark/REDUCE/REDUCE_benchmarks/SeViLA/sevila_data/medvidqa_evaluation_json"

In [16]:
# MedVidQA
medvidqa_val_json  = []
medvidqa_val_list = []
video_val_list = []

for _, dir, _ in os.walk(tvqa_result_root_val):
    for f in dir:
        medvidqa_val_json.append(os.path.join(root_path, f + "_val_gt.json"))
        video_name = f
        video_val_list.append(os.path.join(video_root, video_name))
        if os.path.exists(os.path.join(tvqa_result_root_val, video_name, "result", "test_epochbest.json")):
            medvidqa_val_list.append(os.path.join(tvqa_result_root_val, video_name, "result", "test_epochbest.json"))
        elif os.path.exists(os.path.join(tvqa_result_root_val, video_name, "result", "val_epochbest.json")):
            medvidqa_val_list.append(os.path.join(tvqa_result_root_val, video_name, "result", "val_epochbest.json"))
        elif os.path.exists(os.path.join(tvqa_result_root_val, video_name, "result", "train_epochbest.json")):
            medvidqa_val_list.append(os.path.join(tvqa_result_root_val, video_name, "result", "train_epochbest.json"))
            print("train file")
        else: 
            medvidqa_val_list.append("")
assert len(medvidqa_val_list) == len(medvidqa_val_json)

al_json_val = {}
fileerr = 0
for idx, val_json in enumerate(medvidqa_val_json):
    if medvidqa_val_list[idx] == '':
        continue
    try:
        tvqa = load_jsonl(medvidqa_val_list[idx])
        val = load_jsonl(val_json)
        tvqa_video = video_val_list[idx]
        
        
        for i, qa in enumerate(val[0]):
            dic = {}
            dic['ground_truth'] = [float(qa['start']), float(qa['end'])]
            dic['time_span_len'] = float(qa['end']) - float(qa['start'])
            al_json_val[qa['qid']] = dic

        max_frame_num = 0

        for i, qa in enumerate(tvqa[0]):
            max_frame_num = max(qa['frame_idx'])
            if qa['qid'] not in al_json_val:
                print("QID doesnt exist", qa['qid'])
            else:
                dic = al_json_val[qa['qid']] 
                pred = qa['frame_idx'][:int(np.ceil(dic['time_span_len'] // downsampling_rate))]
                pred = sorted(pred)
                # print(pred)
                pred_time_span = find_consecutive_timestamps(pred)
                smooth_pred_time_span = smooth_interval(pred_time_span, tvqa_video)
                #print(dic['ground_truth'],pred_time_span)
                dic['pred'] = smooth_pred_time_span
                iou = calculate_iou(dic['ground_truth'], smooth_pred_time_span)
                dic['iou'] = iou
                verify_frame_len(tvqa_video, qa['frame_idx'])
                #print(iou)
                
                
    except FileNotFoundError as e: 
        print("file not found ", e)
        fileerr +=1


In [17]:

medvidqa_test_json  = []
medvidqa_test_list = []
video_test_list = []
for _, dir, _ in os.walk(tvqa_result_root_test):
    for f in dir:
        medvidqa_test_json.append(os.path.join(root_path, f + "_test_gt.json"))
        video_name = f
        video_test_list.append(os.path.join(video_root, video_name))
        if os.path.exists(os.path.join(tvqa_result_root_test, video_name, "result", "test_epochbest.json")):
            medvidqa_test_list.append(os.path.join(tvqa_result_root_test, video_name, "result", "test_epochbest.json"))
        elif os.path.exists(os.path.join(tvqa_result_root_test, video_name, "result", "val_epochbest.json")):
            medvidqa_test_list.append(os.path.join(tvqa_result_root_test, video_name, "result", "val_epochbest.json"))
        elif os.path.exists(os.path.join(tvqa_result_root_test, video_name, "result", "train_epochbest.json")):
            medvidqa_test_list.append(os.path.join(tvqa_result_root_test, video_name, "result", "train_epochbest.json"))
            print("train file")
        else: 
            medvidqa_test_list.append("")
print(len(medvidqa_test_list) , len(medvidqa_test_json))
assert len(medvidqa_test_list) == len(medvidqa_test_json)
al_json_test = {}
fileerr = 0
for idx, test_json in enumerate(medvidqa_test_json):
    if medvidqa_test_list[idx] == '':
        continue
    try:
        tvqa = load_jsonl(medvidqa_test_list[idx])
        test = load_jsonl(test_json)
        tvqa_video = video_test_list[idx]
        
        
        for i, qa in enumerate(test[0]):
            dic = {}
            dic['ground_truth'] = [float(qa['start']), float(qa['end'])]
            dic['time_span_len'] = float(qa['end']) - float(qa['start'])
            al_json_test[qa['qid']] = dic

        max_frame_num = 0

        for i, qa in enumerate(tvqa[0]):
            max_frame_num = max(qa['frame_idx'])
            if qa['qid'] not in al_json_test:
                print("QID doesnt exist", qa['qid'])
            else:
                dic = al_json_test[qa['qid']] 
                pred = qa['frame_idx'][:int(np.ceil(dic['time_span_len'] // downsampling_rate))]
                pred = sorted(pred)
                # print(pred)
                pred_time_span = find_consecutive_timestamps(pred)
                smooth_pred_time_span = smooth_interval(pred_time_span, tvqa_video)
                #print(dic['ground_truth'],pred_time_span)
                dic['pred'] = smooth_pred_time_span
                iou = calculate_iou(dic['ground_truth'], smooth_pred_time_span)
                dic['iou'] = iou
                verify_frame_len(tvqa_video, qa['frame_idx'])
                #print(iou)
                
                
    except FileNotFoundError as e: 
        print("file not found ", e)
        fileerr +=1


98 98


In [18]:
train_path = f'{eval_path}/train.json'
val_path = f'{eval_path}/val.json'
test_path = f'{eval_path}/test.json'

In [19]:
# train = load_jsonl(train_path)
val = load_jsonl(val_path)
test = load_jsonl(test_path)

In [20]:
val_total_cnt = 0
val_dict = {}
for i, qa in enumerate(val[0]):
    val_total_cnt +=1
    val_dict[str(qa['qid'])] = 1
print(idx)
# train_dict = {}
# for i, qa in enumerate(train[0]):
#     train_dict[str(qa['qid'])] = 1
test_total_cnt = 0
test_dict = {}
print(al_json_test)
for i, qa in enumerate(test[0]):
    test_total_cnt+=1
    test_dict[str(qa['qid'])] = 1

97
{'MedVidQA_0': {'ground_truth': [7.0, 73.0], 'time_span_len': 66.0, 'pred': [[4.0, 4.0], [28.0, 36.0], [52.0, 52.0], [68.0, 76.0], [92.0, 92.0], [140.0, 140.0]], 'iou': 0.18840579710144928}, 'MedVidQA_1': {'ground_truth': [74.0, 132.0], 'time_span_len': 58.0, 'pred': [[20.0, 36.0], [52.0, 68.0], [100.0, 100.0]], 'iou': 0.0}, 'MedVidQA_2': {'ground_truth': [132.0, 181.0], 'time_span_len': 49.0, 'pred': [[68.0, 68.0], [100.0, 100.0], [140.0, 156.0], [172.0, 172.0]], 'iou': 0.32653061224489793}, 'MedVidQA_3': {'ground_truth': [56.0, 72.0], 'time_span_len': 16.0, 'pred': [[60.0, 68.0]], 'iou': 0.5}, 'MedVidQA_4': {'ground_truth': [75.0, 93.0], 'time_span_len': 18.0, 'pred': [[76.0, 84.0]], 'iou': 0.4444444444444444}, 'MedVidQA_5': {'ground_truth': [96.0, 112.0], 'time_span_len': 16.0, 'pred': [[68.0, 68.0], [108.0, 108.0]], 'iou': 0.0}, 'MedVidQA_6': {'ground_truth': [38.0, 105.0], 'time_span_len': 67.0, 'pred': [[28.0, 28.0], [44.0, 92.0]], 'iou': 0.7164179104477612}, 'MedVidQA_7': {'g

In [21]:
val_cnt, train_cnt, test_cnt = 0, 0, 0
cnt_3_thresh, cnt_5_thresh, cnt_7_thresh = 0, 0, 0
for key, v in al_json_val.items():
    if 'iou' not in v:
        continue
    #print(key)
    if key in val_dict:
        val_cnt += 1
        if v['iou'] > 0.3:
            cnt_3_thresh += 1
        if v['iou'] > 0.5:
            cnt_5_thresh += 1
        if v['iou'] > 0.7:
            cnt_7_thresh += 1
    
        
if val_cnt > 0:
    # assert len(val_dict) == val_cnt
    print(f"Val IoU=0.3: {cnt_3_thresh/(val_cnt) * 100}\nIoU=0.5: {cnt_5_thresh/(val_cnt) * 100}\nIoU=0.7: {cnt_7_thresh/(val_cnt) * 100}\ntotal queries:{(val_cnt)}/{val_total_cnt}")

val_cnt, train_cnt, test_cnt = 0, 0, 0
cnt_3_thresh, cnt_5_thresh, cnt_7_thresh = 0, 0, 0
for key, v in al_json_test.items():
    if 'iou' not in v:
        continue
    #print(key)
    if key in test_dict:
        test_cnt += 1
        if v['iou'] > 0.3:
            cnt_3_thresh += 1
        if v['iou'] > 0.5:
            cnt_5_thresh += 1
        if v['iou'] > 0.7:
            cnt_7_thresh += 1
if test_cnt > 0:
    # assert len(val_dict) == val_cnt
    print(f"Test IoU=0.3: {cnt_3_thresh/(test_cnt) * 100}\nIoU=0.5: {cnt_5_thresh/(test_cnt) * 100}\nIoU=0.7: {cnt_7_thresh/(test_cnt) * 100}\ntotal queries:{(test_cnt)}/{test_total_cnt}")


Val IoU=0.3: 24.822695035460992
IoU=0.5: 14.184397163120568
IoU=0.7: 7.092198581560284
total queries:141/141
Test IoU=0.3: 24.324324324324326
IoU=0.5: 10.81081081081081
IoU=0.7: 4.054054054054054
total queries:148/148


In [72]:
# clip_proposal = None

# vr = VideoReader(uri=tvqa_video+".mp4", height=224, width=224)
# vlen = len(vr)
# n_frms = min(max_frame_num, vlen)
# fps = vr.get_avg_fps() 
# if clip_proposal is None:
#     start, end = 0, vlen
# else:
#     start, end = int(clip_proposal[0]*fps), int(clip_proposal[1]*fps)
#     if start < 0:
#         start = 0
#     if end > vlen:
#         end = vlen

# intervals = np.linspace(start=start, stop=end, num=n_frms + 1).astype(int)
# ranges = []
# for idx, interv in enumerate(intervals[:-1]):
#     ranges.append((interv, intervals[idx + 1]))

# indices = [(x[0] + x[1]) // 2 for x in ranges]

# if len(indices) < n_frms:
#     rest = [indices[-1] for i in range(n_frms - len(indices))]
#     indices = indices + rest 
# # # get_batch -> T, H, W, C
# # frms = vr.get_batch(indices).permute(3, 0, 1, 2).float()  # (C, T, H, W)

# print(indices)
# print(len(indices))