In [1]:
import os
import json
import pandas as pd
import pysrt
import re
import shutil
from openai import OpenAI, RateLimitError
import yaml
import math

In [2]:
# Name of this experiment; later cells join it onto json_save_folder to build
# the per-experiment output directories.
experiment_name = "tvqa_gpt_query_type_textsim_experiment"

In [3]:
#select queries  

In [4]:
def find_match(phrase, paragraph):
    """Report whether `phrase` occurs verbatim inside `paragraph`.

    The phrase is treated as literal text, not as a regular expression, so
    characters such as ``?``, ``(`` or ``.`` match only themselves.

    Args:
        phrase: text to look for.
        paragraph: text to search in.

    Returns:
        True when the phrase is found (the match is also printed), False otherwise.
    """
    # Escape the phrase: free-form answers routinely contain regex
    # metacharacters that would otherwise mis-match or raise re.error.
    exact_match = re.search(re.escape(phrase), paragraph)
    if exact_match:
        print("Exact match found:", exact_match.group())
        return True
    return False

In [11]:
from sentence_transformers import SentenceTransformer, util
import torch
def semantic_similarity(sentences, query, verbose=True):
    """Return the best cosine similarity between `query` and any of `sentences`.

    Both inputs are embedded with the "all-mpnet-base-v2" SentenceTransformer
    model and compared with cosine similarity.

    Args:
        sentences: non-empty list of strings to compare against.
        query: string to score against every entry of `sentences`.
        verbose: when True (the default), print the query and the up-to-five
            highest scoring sentences with their scores.

    Returns:
        A 0-dim torch tensor holding the highest cosine similarity; callers
        use ``.item()`` to obtain a Python float.

    Raises:
        ValueError: if `sentences` is empty.
    """
    if len(sentences) == 0:
        raise ValueError("semantic_similarity() needs at least one sentence")

    # Building the model is expensive; create it on first use and keep it on
    # the function object so repeated calls reuse the same instance.
    embedder = getattr(semantic_similarity, "_embedder", None)
    if embedder is None:
        embedder = SentenceTransformer("all-mpnet-base-v2")
        #embedder = SentenceTransformer("all-MiniLM-L6-v2")
        semantic_similarity._embedder = embedder

    sentences_embeddings = embedder.encode(sentences, convert_to_tensor=True)
    query_embedding = embedder.encode(query, convert_to_tensor=True)

    # Row 0 of the similarity matrix holds the query-vs-sentence scores.
    cos_scores = util.cos_sim(query_embedding, sentences_embeddings)[0]

    if verbose:
        # We use cosine-similarity and torch.topk to find the highest 5 scores
        top_k = min(5, len(sentences))
        top_results = torch.topk(cos_scores, k=top_k)

        print("\n\n======================\n\n")
        print("Query:", query)
        print("\nTop 5 most similar sentences in corpus:")
        for score, idx in zip(top_results[0], top_results[1]):
            print(sentences[idx], "(Score: {:.4f})".format(score))

    # Return the maximum itself; the previous code returned the last value
    # visited by the loop, which is the smallest of the top-k scores.
    return cos_scores.max()


In [14]:
# Sanity-check the scorer on a few hand-written (sentence, query) pairs.
# Each call prints its ranking; the notebook displays the value returned by the last call.
semantic_similarity(["The person electrocute himself"], "He suicide")
semantic_similarity(["Nope. Not only that, my sight has shifted from clear to milky green.'"], "he was sick")
semantic_similarity(["and I am producing sputum at an alarming rate."], "sick")
semantic_similarity(["You don't believe in me"], "upset")






Query: He suicide

Top 5 most similar sentences in corpus:
The person electrocute himself (Score: 0.4032)




Query: he was sick

Top 5 most similar sentences in corpus:
Nope. Not only that, my sight has shifted from clear to milky green.' (Score: 0.0320)




Query: sick

Top 5 most similar sentences in corpus:
and I am producing sputum at an alarming rate. (Score: 0.1575)




Query: upset

Top 5 most similar sentences in corpus:
You don't believe in me (Score: 0.2089)


tensor(0.2089, device='cuda:0')

In [37]:
def save_json(content, save_path):
    """Serialise `content` as JSON and write it to `save_path`, replacing any existing file."""
    with open(save_path, 'w') as out_file:
        serialised = json.dumps(content)
        out_file.write(serialised)
def load_jsonl(filename):
    """Read a JSON-lines file and return its records as a list.

    Every non-blank line is parsed as one JSON document. Blank lines (for
    example a trailing empty line at the end of the file) are skipped instead
    of raising a decode error.

    Args:
        filename: path of the file to read.

    Returns:
        List with one parsed object per non-blank line, in file order.
    """
    with open(filename, "r") as f:
        # Iterate the handle directly so the file is streamed line by line.
        return [json.loads(line) for line in f if line.strip()]
def load_result_json(filename):
    """Parse the JSON document stored in `filename`, print it, and return it."""
    with open(filename, "r") as handle:
        parsed = json.loads(handle.read())
    print(parsed)
    return parsed

In [38]:
# Folder containing the annotation file of each split.
vid_json_folder = "/home/hlpark/REDUCE/REDUCE_benchmarks/HiREST/data/splits/tvqa"
# Each file is read line by line with load_jsonl; later cells iterate over
# element [0] of the returned list as a dict.
val = load_jsonl(f'{vid_json_folder}/all_data_val.json')
test = load_jsonl(f'{vid_json_folder}/all_data_test.json')
train = load_jsonl(f'{vid_json_folder}/all_data_train.json')

In [39]:
# Path of the video-duration file; only its first JSON line is kept.
# NOTE(review): video_duration_dict is not referenced by the other cells visible here - confirm it is still needed.
vid_duration_json = "/home/hlpark/REDUCE/REDUCE_benchmarks/HiREST/data/splits/tvqa/video_duration.json"
video_duration_dict = load_jsonl(vid_duration_json)[0]

In [40]:
# Directory the .srt subtitle files are opened from (one file per video, named after the video).
root = "/home/hlpark/REDUCE/REDUCE_benchmarks/HiREST/data/tvqa/ASR"

In [41]:
# Scoring mode used by the scoring cells and embedded in the output file names.
# Values checked by the scoring cells: "paragraph_semantic_search", "exact_match",
# "sentence_semantic_search_max".
option = "paragraph_semantic_search"

In [42]:
#load srt files and extract ground truth scripts
import pysrt
import numpy as np
json_save_folder = "/home/hlpark/REDUCE/REDUCE_benchmarks/HiREST/data/splits"
json_folder = os.path.join(json_save_folder, experiment_name, 'baseline')
# Reuse the previously saved entries for this option when the file exists;
# otherwise rebuild the entries from the annotations and subtitle files.
if os.path.exists(f'{json_folder}/val_audio_relevance_score_{option}.json'):
    val_json = load_jsonl(f'{json_folder}/val_audio_relevance_score_{option}.json')
    val_srt = val_json[0]
else:
    val_srt = []
    for key, value in val[0].items():
        qa_dict = {}
        # The first key of each annotation is the video file name; its value
        # holds the ground-truth 'bounds' and 'v_duration'.
        video_name = list(value.keys())[0]
        gt_timestamp = value[video_name]['bounds']
        gt_timestamp_start, gt_timestamp_end = float(gt_timestamp[0]), float(gt_timestamp[1])
        if math.isnan(gt_timestamp_start) or math.isnan(gt_timestamp_end):
            # No usable bounds: fall back to the whole video.
            gt_timestamp_start = 0
            gt_timestamp_end = float(value[video_name]['v_duration'])

        # Whole-second window, computed once per question rather than per subtitle.
        window_start = int(gt_timestamp_start)
        window_end = int(np.ceil(gt_timestamp_end))

        sub_list = []
        subs = pysrt.open(os.path.join(root, video_name.replace(".mp4", "") + ".srt"))
        for sub in subs:
            # Include the hours field so subtitles past the first hour are
            # compared against the window correctly.
            start_seconds = sub.start.hours * 3600 + sub.start.minutes * 60 + sub.start.seconds
            if window_start <= start_seconds <= window_end:
                sub_list.append({str(sub.start): sub.text})
        qa_dict['question'] = key
        qa_dict['answer'] = value['answer']
        qa_dict['video'] = video_name
        qa_dict['sub'] = sub_list
        val_srt.append(qa_dict)


# Score every entry that has subtitles and has not been scored yet.
for value in val_srt:
    if len(value['sub']) == 0:
        continue
    if 'score' in value:
        continue

    # Each subtitle is a single-item {start_time: text} dict.
    sentences = [list(sentence.values())[0] for sentence in value['sub']]
    prompt = "".join(f"\n{text}" for text in sentences)

    if option == "exact_match":
        if find_match(value['answer'], prompt):
            print("Q: ", value['question'], '\n', value['answer'], '\n', prompt)
            value['score'] = 3
        else:
            value['score'] = 1
    elif option == "sentence_semantic_search_max":
        # Score the individual subtitle lines; previously this mode collected
        # the sentences but then scored an empty paragraph.
        ret = semantic_similarity(sentences, value['question'])
        ret = ret.item()
        value['score'] = ret * 10
    else:
        ret = semantic_similarity([prompt], value['question'])
        ret = ret.item()
        value['score'] = ret * 10

In [43]:
#load srt files and extract ground truth scripts
import pysrt
import numpy as np
json_save_folder = "/home/hlpark/REDUCE/REDUCE_benchmarks/HiREST/data/splits"
json_folder = os.path.join(json_save_folder, experiment_name, 'baseline')
# Reuse the previously saved entries for this option when the file exists;
# otherwise rebuild the entries from the annotations and subtitle files.
if os.path.exists(f'{json_folder}/test_audio_relevance_score_{option}.json'):
    test_json = load_jsonl(f'{json_folder}/test_audio_relevance_score_{option}.json')
    test_srt = test_json[0]
else:
    test_srt = []
    for key, value in test[0].items():
        qa_dict = {}
        # The first key of each annotation is the video file name; its value
        # holds the ground-truth 'bounds' and 'v_duration'.
        video_name = list(value.keys())[0]
        gt_timestamp = value[video_name]['bounds']
        gt_timestamp_start, gt_timestamp_end = float(gt_timestamp[0]), float(gt_timestamp[1])
        if math.isnan(gt_timestamp_start) or math.isnan(gt_timestamp_end):
            # No usable bounds: fall back to the whole video.
            gt_timestamp_start = 0
            gt_timestamp_end = float(value[video_name]['v_duration'])

        # Whole-second window, computed once per question rather than per subtitle.
        window_start = int(gt_timestamp_start)
        window_end = int(np.ceil(gt_timestamp_end))

        sub_list = []
        subs = pysrt.open(os.path.join(root, video_name.replace(".mp4", "") + ".srt"))
        for sub in subs:
            # Include the hours field so subtitles past the first hour are
            # compared against the window correctly.
            start_seconds = sub.start.hours * 3600 + sub.start.minutes * 60 + sub.start.seconds
            if window_start <= start_seconds <= window_end:
                sub_list.append({str(sub.start): sub.text})
        qa_dict['question'] = key
        qa_dict['answer'] = value['answer']
        qa_dict['video'] = video_name
        qa_dict['sub'] = sub_list
        test_srt.append(qa_dict)


# Score every entry that has subtitles and has not been scored yet.
for value in test_srt:
    if len(value['sub']) == 0:
        continue
    if 'score' in value:
        continue

    # Each subtitle is a single-item {start_time: text} dict.
    sentences = [list(sentence.values())[0] for sentence in value['sub']]
    prompt = "".join(f"\n{text}" for text in sentences)

    if option == "exact_match":
        if find_match(value['answer'], prompt):
            print("Q: ", value['question'], '\n', value['answer'], '\n', prompt)
            value['score'] = 3
        else:
            value['score'] = 1
    elif option == "sentence_semantic_search_max":
        # Score the individual subtitle lines; previously this mode collected
        # the sentences but then scored an empty paragraph.
        ret = semantic_similarity(sentences, value['question'])
        ret = ret.item()
        value['score'] = ret * 10
    else:
        ret = semantic_similarity([prompt], value['question'])
        ret = ret.item()
        value['score'] = ret * 10

In [44]:
#load srt files and extract ground truth scripts
import pysrt
import numpy as np

json_save_folder = "/home/hlpark/REDUCE/REDUCE_benchmarks/HiREST/data/splits"
json_folder = os.path.join(json_save_folder, experiment_name, 'baseline')
# Reuse the previously saved entries for this option when the file exists;
# otherwise rebuild the entries from the annotations and subtitle files.
if os.path.exists(f'{json_folder}/train_audio_relevance_score_{option}.json'):
    train_json = load_jsonl(f'{json_folder}/train_audio_relevance_score_{option}.json')
    train_srt = train_json[0]
else:
    train_srt = []
    for key, value in train[0].items():
        qa_dict = {}
        # The first key of each annotation is the video file name; its value
        # holds the ground-truth 'bounds' and 'v_duration'.
        video_name = list(value.keys())[0]
        gt_timestamp = value[video_name]['bounds']
        gt_timestamp_start, gt_timestamp_end = float(gt_timestamp[0]), float(gt_timestamp[1])
        if math.isnan(gt_timestamp_start) or math.isnan(gt_timestamp_end):
            # No usable bounds: fall back to the whole video.
            gt_timestamp_start = 0
            gt_timestamp_end = float(value[video_name]['v_duration'])

        # Whole-second window, computed once per question rather than per subtitle.
        window_start = int(gt_timestamp_start)
        window_end = int(np.ceil(gt_timestamp_end))

        sub_list = []
        subs = pysrt.open(os.path.join(root, video_name.replace(".mp4", "") + ".srt"))
        for sub in subs:
            # Include the hours field so subtitles past the first hour are
            # compared against the window correctly.
            start_seconds = sub.start.hours * 3600 + sub.start.minutes * 60 + sub.start.seconds
            if window_start <= start_seconds <= window_end:
                sub_list.append({str(sub.start): sub.text})
        qa_dict['question'] = key
        qa_dict['answer'] = value['answer']
        qa_dict['video'] = video_name
        qa_dict['sub'] = sub_list
        train_srt.append(qa_dict)

# Score every entry that has subtitles and has not been scored yet.
for value in train_srt:
    if len(value['sub']) == 0:
        continue

    if 'score' in value:
        continue

    # Each subtitle is a single-item {start_time: text} dict.
    sentences = [list(sentence.values())[0] for sentence in value['sub']]
    prompt = "".join(f"\n{text}" for text in sentences)

    if option == "exact_match":
        if find_match(value['answer'], prompt):
            print("Q: ", value['question'], '\n', value['answer'], '\n', prompt)
            value['score'] = 3
        else:
            value['score'] = 1
    elif option == "sentence_semantic_search_max":
        # Score the individual subtitle lines; previously this mode collected
        # the sentences but then scored an empty paragraph.
        ret = semantic_similarity(sentences, value['question'])
        ret = ret.item()
        value['score'] = ret * 10
    else:
        ret = semantic_similarity([prompt], value['question'])
        ret = ret.item()
        value['score'] = ret * 10

In [30]:
# Create one folder per experiment variant and write the split annotations and
# the audio relevance scores into each of them (ASR features need their own
# folder as well).
json_save_folder = "/home/hlpark/REDUCE/REDUCE_benchmarks/HiREST/data/splits"
# Re-read the split files as whole JSON documents so they can be copied as-is.
with open(f'{vid_json_folder}/all_data_val.json') as f:
    val_json = json.load(f)
with open(f'{vid_json_folder}/all_data_test.json') as f:
    test_json = json.load(f)
with open(f'{vid_json_folder}/all_data_train.json') as f:
    train_json = json.load(f)

exp_list = ['baseline', 'visual_med', 'visual_med_with_audio', 'visual_nonmed']

for exp in exp_list:
    json_folder = os.path.join(json_save_folder, experiment_name, exp)

    # exist_ok avoids the check-then-create race and makes the cell re-runnable.
    os.makedirs(json_folder, exist_ok=True)

    save_json(test_json, f'{json_folder}/all_data_test.json')
    save_json(val_json, f'{json_folder}/all_data_val.json')
    save_json(train_json, f'{json_folder}/all_data_train.json')
    save_json(val_srt, f'{json_folder}/val_audio_relevance_score_{option}.json')
    save_json(train_srt, f'{json_folder}/train_audio_relevance_score_{option}.json')
    save_json(test_srt, f'{json_folder}/test_audio_relevance_score_{option}.json')


In [45]:
# Also write the train scores next to the original split files.
# This rebinds json_folder, which earlier cells point at an experiment folder.
# NOTE(review): only the train split is written here - confirm val/test are intentionally omitted.
json_folder = "/home/hlpark/REDUCE/REDUCE_benchmarks/HiREST/data/splits/tvqa"
save_json(train_srt, f'{json_folder}/train_audio_relevance_score_{option}.json')