In [1]:
import os
import json
import pandas as pd
from moviepy.editor import *
from decord import VideoReader
import numpy as np
import math
from sklearn.metrics import f1_score

In [2]:
downsampling_rate= 1
def verify_frame_len(video_path, frame_idx):
    if video_path.endswith(".mp4"):
        video = VideoFileClip(video_path)
    else:
        video = VideoFileClip(video_path + ".mp4")
    n_frames = video.reader.nframes
    if n_frames // (downsampling_rate * video.fps) != max(frame_idx) + 1:
        print("ERROR", video.fps, n_frames, max(frame_idx))

In [3]:
def find_consecutive_timestamps(timestamps):
    """
    Function to find consecutive timestamps in a list and record the start and end time.
    """
    if not timestamps:
        return []

    # Initialize the first start time and the result list
    start = timestamps[0]
    result = []
    
    for i in range(1, len(timestamps)):
        # Check if the current timestamp is not consecutive
        if timestamps[i] != timestamps[i-1] + 1:
            # Record the previous consecutive sequence
            result.append([start, timestamps[i-1]])
            # Update the start for the new sequence
            start = timestamps[i]

    # Add the last sequence
    result.append([start, timestamps[-1]])

    return result

# Example usage
# timestamps = [0, 1, 2, 3, 4, 8, 10, 11, 12]
# find_consecutive_timestamps(timestamps)

In [4]:
def calculate_iou(ground_truth, predictions):
    """
    Calculate the Intersection over Union (IoU) for video moment retrieval.
    
    :param ground_truth: A tuple representing the ground truth interval (start, end).
    :param predictions: A list of tuples representing predicted intervals [(start1, end1), (start2, end2), ...].
    :return: IoU score.
    """
    GT_start, GT_end = ground_truth
    total_intersection = 0
    total_union = 0

    for (P_start, P_end) in predictions:
        # Calculate intersection
        intersection = max(0, min(GT_end, P_end) - max(GT_start, P_start))
        total_intersection += intersection

        # Calculate union for this predicted interval
        union = (P_end - P_start)  - intersection
        total_union += union
    total_union += (GT_end - GT_start)
    # Avoid division by zero
    if total_union == 0:
        return 0

    # Calculate IoU
    iou = total_intersection / total_union
    return iou

In [5]:
def save_json(content, save_path):
    with open(save_path, 'w') as f:
        f.write(json.dumps(content))
def load_jsonl(filename):
    with open(filename, "r") as f:
        return [json.loads(l.strip("\n")) for l in f.readlines()]
        # return json.loads(f)

In [9]:
#set folder path
root_path = "/home/hlpark/REDUCE/REDUCE_benchmarks/SeViLA/sevila_data/tvqa"
video_root = "/home/hlpark/shared/TVQA/video/video_files"

val_path = "/home/hlpark/REDUCE/REDUCE_benchmarks/SeViLA/sevila_data/tvqa_evaluation_json/val_gt.json"
original = "/home/hlpark/REDUCE/REDUCE_benchmarks/SeViLA/sevila_tvqa_hirest_prediction_gt/original"
finetuned_on_visual_nonmed = "/home/hlpark/REDUCE/REDUCE_benchmarks/SeViLA/sevila_tvqa_hirest_prediction_gt/finetuned_on_visual_nonmed"
finetuned_on_visual_med = "/home/hlpark/REDUCE/REDUCE_benchmarks/SeViLA/sevila_tvqa_hirest_prediction_gt/finetuned_on_visual_med"
finetuned_on_full_without_audio = "/home/hlpark/REDUCE/REDUCE_benchmarks/SeViLA/sevila_tvqa_hirest_prediction_gt/finetuned_on_full_without_audio"

original = "/home/hlpark/REDUCE/REDUCE_benchmarks/SeViLA/sevila_tvqa_hirest_prediction_gt_fulldata/original"
finetuned_on_visual_nonmed = "/home/hlpark/REDUCE/REDUCE_benchmarks/SeViLA/sevila_tvqa_hirest_prediction_gt_fulldata/finetuned_on_visual_nonmed"
finetuned_on_visual_med = "/home/hlpark/REDUCE/REDUCE_benchmarks/SeViLA/sevila_tvqa_hirest_prediction_gt_fulldata/finetuned_on_visual_med"
finetuned_on_full_without_audio = "/home/hlpark/REDUCE/REDUCE_benchmarks/SeViLA/sevila_tvqa_hirest_prediction_gt_fulldata/finetuned_on_full_without_audio"


vid_json_folder = "/home/hlpark/REDUCE/REDUCE_benchmarks/HiREST/data/splits/tvqa"
clip_pred_med = load_jsonl(f'{vid_json_folder}/five_labeled_pred_med_from_gt_vid_dict.json')
gt_validation = load_jsonl(val_path)

In [10]:
# TVQA 32 frames from ground truth
tvqa_val_json = [original, finetuned_on_visual_nonmed, finetuned_on_visual_med, finetuned_on_full_without_audio]
tvqa_pred_type = ["original", "finetuned_on_visual_nonmed", "finetuned_on_visual_med", "finetuned_on_full_without_audio"]
# tvqa_pred_type = [x + "_pt3" for x in tvqa_pred_type]
# tvqa_val_json = [x + "_pt3" for x in tvqa_val_json]
tvqa_list = []

split = "val"


fileerr = 0
for idx, val_json in enumerate(tvqa_val_json):
    target_list, pred_list = [], []
    med_target, med_pred = [], []
    nonmed_target, nonmed_pred = [], []
    wrong_queries_32, correct_queries_32 = [], []
    video_list = []
    
    try:
        tvqa = load_jsonl(os.path.join(tvqa_val_json[idx], "result", "val_epochbest.json"))

        max_frame_num = 0

        #remove duplicate queries in tvqa[0]
        seen_qid = []
        for qa in tvqa[0][:]:
            if qa['qid'] not in seen_qid:
                seen_qid.append(qa['qid'])
            else:
                tvqa[0].remove(qa)
                #print("remove duplicate ", qa['qid'])

        for i, qa in enumerate(tvqa[0]):
            max_frame_num = max(qa['frame_idx'])
            
            try:
                # for idx, value in enumerate(gt_validation[0]):
                #     print(value)
                qa['video'] = [value['video'] for idx, value in enumerate(gt_validation[0]) if value['qid'] == qa['qid']][0]
                qa['question'] = [value['question'] for idx, value in enumerate(gt_validation[0])  if value['qid'] == qa['qid']][0]
                #print(qa['video'], qa['question'])
            
                
                pred_list.append(qa['prediction'])
                target_list.append(qa['target'])
                ismed = False
                if qa['prediction'] == qa['target']:
                    correct_queries_32.append(qa)
                else:
                    wrong_queries_32.append(qa)
                for qc in clip_pred_med[0][qa['video']]:
                    #print(qc)
                    if qa['question'] in qc and qc[qa['question']] == "med":
                        ismed = True
                if ismed:
                #if clip_pred_med[0][qa['vid_name'] + ".mp4"] == "med":
                #if "house" in dic['vid_name'] or "grey" in dic['vid_name']:
                    med_pred.append(qa['prediction'])
                    med_target.append(qa['target'])
                else:
                    nonmed_pred.append(qa['prediction'])
                    nonmed_target.append(qa['target'])
            except ValueError as e:
                #print(dic['time_span_len'])
                print("ValueError ", qa['qid'], qa['frame_idx'], e)
                qa['pred'] = [0, 0]
                qa['iou'] = 0
                #uncomment this only when you want to check frame length for verification purpose
                #verify_frame_len(tvqa_video,  qa['frame_idx'])
                
                
                
    except FileNotFoundError as e: 
        print("file not found ", e)
        fileerr +=1
    print(f"\n\nNumber of processed queries for {tvqa_pred_type[idx]}: ", len(target_list))
    print("\naccuracy")
    print(f1_score(target_list, pred_list, average="micro"))
    print(f1_score(med_target, med_pred, average="micro"))
    print(f1_score(nonmed_target, nonmed_pred, average="micro"))
    print("\nF1")
    print(f1_score(target_list, pred_list, average="weighted"))
    print(f1_score(med_target, med_pred, average="weighted"))
    print(f1_score(nonmed_target, nonmed_pred, average="weighted"))




Number of processed queries for original:  15238

accuracy
0.23408583803648772
0.2376897445390596
0.23330940416367552

F1
0.1603892998991908
0.15488820728101885
0.16158369840786085
file not found  [Errno 2] No such file or directory: '/home/hlpark/REDUCE/REDUCE_benchmarks/SeViLA/sevila_tvqa_hirest_prediction_gt_fulldata/finetuned_on_visual_nonmed/result/val_epochbest.json'


Number of processed queries for finetuned_on_visual_nonmed:  0

accuracy
0.0
0.0
0.0

F1
nan
nan
nan


Number of processed queries for finetuned_on_visual_med:  15238

accuracy
0.21603885024281402
0.22769344687152906
0.21352795724655024

F1
0.10961212854005087
0.11712718091139554
0.1080535911266263
file not found  [Errno 2] No such file or directory: '/home/hlpark/REDUCE/REDUCE_benchmarks/SeViLA/sevila_tvqa_hirest_prediction_gt_fulldata/finetuned_on_full_without_audio/result/val_epochbest.json'


Number of processed queries for finetuned_on_full_without_audio:  0

accuracy
0.0
0.0
0.0

F1
nan
nan
nan
