### Video Download

In [None]:
from pytube import YouTube
import jsonlines
import json
import os

# Folder that receives the downloaded videos. It is created up front so the
# download call below does not depend on it already existing.
download_dir = "download_videos"
os.makedirs(download_dir, exist_ok=True)

# The metadata file maps a video key to a record that contains its "url".
with open('metafiles/mlda_data.json', 'r') as f:
    metas = json.load(f)

list_of_keys = list(metas.keys())

# Only the first 10 entries of the metadata file are downloaded.
for key in list_of_keys[:10]:
    url = metas[key]["url"]
    # Same target path as before: download_videos/<key>.mp4
    YouTube(url).streams.first().download(filename=download_dir + "/" + key + ".mp4")

* Cut the videos using `cut_videos_mlda.py` (the provided Python script)<br>
* Move the videos into the folder `data/original`<br>
* The following code splits each clip into frames and saves them in `data/image split`

### Frames Splitting

In [None]:
import cv2
import os
import numpy as np
from tqdm import tqdm

root_dir = 'data/original/'


def uniform_frame_indices(total_frames, fps, max_frames=10):
    """Return the sorted, de-duplicated frame indices to save for one clip.

    The number of sample points is ``min(whole seconds in the clip, max_frames) + 1``
    and the points are spread evenly from the first frame (0) to the last
    frame (``total_frames - 1``).

    Parameters
    ----------
    total_frames : int
        Frame count reported for the clip.
    fps : float
        Frames per second reported for the clip; a non-positive value is
        treated as a clip of zero seconds.
    max_frames : int, optional
        Upper bound on the number of sampling intervals (default 10).

    Returns
    -------
    list of int
        Ascending frame indices; always contains at least index 0.
    """
    seconds = int(total_frames / fps) if fps > 0 else 0
    # Clamp at 0 so a negative reported frame count cannot produce a negative
    # number of sample points.
    frames_to_extract = max(0, min(seconds, max_frames))
    last_frame = max(total_frames - 1, 0)
    indices = np.linspace(0, last_frame, num=frames_to_extract + 1, dtype=int)
    # Integer truncation can repeat an index on very short clips.
    return sorted(set(indices.tolist()))


for subdir, dirs, files in os.walk(root_dir):
    # Frames are stored under a folder named after the last path component
    # of the directory currently being walked.
    save_folder = 'data/image split/' + (str(subdir).split("/")[-1])
    os.makedirs(save_folder, exist_ok=True)

    for file in tqdm(files):
        filepath = str(subdir) + '/' + file
        vid = cv2.VideoCapture(filepath)
        try:
            total_frames = int(vid.get(cv2.CAP_PROP_FRAME_COUNT))
            fps = vid.get(cv2.CAP_PROP_FPS)

            wanted = uniform_frame_indices(total_frames, fps)
            wanted_lookup = set(wanted)
            last_wanted = wanted[-1]

            frame_counter = 0
            # Stop decoding as soon as the last requested frame has been handled.
            while frame_counter <= last_wanted:
                ret, frame = vid.read()
                if not ret:
                    break
                if frame_counter in wanted_lookup:
                    save_path = f'{save_folder}/{file}_{frame_counter}.png'
                    cv2.imwrite(save_path, frame)

                frame_counter += 1
        finally:
            # Always free the capture handle, even if reading or writing fails.
            vid.release()

### Video Captioning for each Frame using ViT-GPT2

In [None]:
import json
from glob2 import glob
from transformers import pipeline
import os
from tqdm import tqdm
import torch

# Use the first GPU when CUDA is available; otherwise run the pipeline on CPU
# (device=-1) instead of failing on the CUDA device queries.
if torch.cuda.is_available():
    device = 0
    print(torch.cuda.current_device())
    print(torch.cuda.get_device_name(0))
else:
    device = -1
    print("CUDA is not available; running the captioning model on CPU")

# Build the captioning pipeline once; it does not depend on the folder being
# processed, so re-creating it per folder only repeats the model load.
image_to_text = pipeline("image-to-text", model="nlpconnect/vit-gpt2-image-captioning", device=device)

# Maps a clip id (image path with the trailing "_<frame>.png" part removed)
# to the list of captions generated for that clip's frames.
caption_dict = {}

root_dir = 'data/image split'
for subdir, dirs, files in os.walk(root_dir):
    for folder in dirs:
        imgs_path = subdir + '/' + folder + '/*'
        imgs = glob(imgs_path)

        for img in tqdm(imgs, desc=folder):
            caption = image_to_text(img)[0]['generated_text']
            clip_id = img.rsplit("_", 1)[0]
            caption_dict.setdefault(clip_id, []).append(caption)

with open("VIT_GPT2_captions.json", "w") as outfile:
    json.dump(caption_dict, outfile)

### Evaluate Similarity Scores

In [2]:
from nltk.corpus import stopwords
from nltk.tokenize import word_tokenize

class SimilarityScores:
    """Pairwise Jaccard similarity between a list of captions.

    On construction every caption is lower-cased, tokenized and stripped of
    English stop words; the Jaccard similarity of every pair of resulting
    token sets is stored in ``jaccard_matrix`` (``jaccard_matrix[i][j]`` is
    the similarity between caption ``i`` and caption ``j``).
    """

    def __init__(self, captions_list) -> None:
        """Preprocess ``captions_list`` and build the pairwise Jaccard matrix."""
        self.captions = captions_list
        self.stop_words = set(stopwords.words('english'))
        self.processed_captions = [self.preprocess_caption(caption) for caption in self.captions]
        # Build each token set once instead of once per pair.
        token_sets = [set(tokens) for tokens in self.processed_captions]
        self.jaccard_matrix = [
            [self._jaccard_of_sets(first, second) for second in token_sets]
            for first in token_sets
        ]

    def preprocess_caption(self, caption):
        """Return the lower-cased tokens of ``caption`` with stop words removed."""
        lowercase = caption.lower()
        tokens = word_tokenize(lowercase)
        filtered_tokens = [word for word in tokens if word not in self.stop_words]
        return filtered_tokens

    @staticmethod
    def _jaccard_of_sets(first, second):
        """Jaccard similarity of two sets; 0 when both sets are empty."""
        union = len(first | second)
        return len(first & second) / union if union else 0

    def jaccard_similarity(self, caption1, caption2):
        """Return |A ∩ B| / |A ∪ B| for the two token collections.

        Returns 0 when both collections are empty.
        """
        return self._jaccard_of_sets(set(caption1), set(caption2))

    def find_most_similar(self, matrix):
        """Return the caption whose row in ``matrix`` has the highest mean.

        Ties are resolved in favour of the earliest caption.

        Raises
        ------
        ValueError
            If ``matrix`` is empty, i.e. there are no captions to choose from.
        """
        if not matrix:
            raise ValueError("cannot select the most similar caption: no captions were provided")
        average_similarities = [sum(row) / len(row) for row in matrix]
        best_index = average_similarities.index(max(average_similarities))
        return self.captions[best_index]

    def get_Jaccard_similarity_scores_between_2_captions(self):
        """Return the Jaccard similarity between the first and second caption.

        Raises ``IndexError`` when fewer than two captions were supplied.
        """
        val = self.jaccard_matrix[0][1]
        return val

    def get_highest_eval_by_Jaccard(self):
        """Return the caption with the highest average Jaccard similarity."""
        most_similar_jaccard = self.find_most_similar(self.jaccard_matrix)
        return most_similar_jaccard

### Best Caption for each video and METEOR Scores comparing original caption and best caption

In [None]:
import json
from scores import SimilarityScores
from nltk.translate import meteor
from nltk import word_tokenize

# Scenes whose best generated caption scores below this METEOR value are
# dropped from the final metadata.
MIN_METEOR_SCORE = 0.15

original_captions = {}

with open('VIT_GPT2_captions.json') as json_data:
    raw_captions = json.load(json_data)

# The stored keys are file paths. Reduce each to the file name without its
# last extension, accepting both "\" and "/" as the path separator.
captions_dict = {
    path.replace("\\", "/").rsplit("/", 1)[-1].rsplit(".", 1)[0]: captions
    for path, captions in raw_captions.items()
}

with open('mlda_data_abridged.json') as json_data:
    final_json = json.load(json_data)

# Collect the reference caption of every scene, keyed by its clip id.
for video in final_json.values():
    for clip in video['clip'].values():
        for scene in clip['scene_split']:
            original_captions[scene['clip_id']] = scene['caption']

for key, captions_list in captions_dict.items():
    best_caption_generator = SimilarityScores(captions_list=captions_list)
    best_caption = best_caption_generator.get_highest_eval_by_Jaccard()

    original_caption = original_captions[key]
    # NOTE(review): nltk's meteor takes (references, hypothesis); here the
    # generated caption is passed as the reference and the original caption
    # as the hypothesis. Argument order kept as-is; confirm this is intended.
    score = round(meteor([word_tokenize(best_caption)], word_tokenize(original_caption)), 4)

    video_id = key.rsplit(".", 1)[0]
    video_name = str(key.rsplit("_", 1)[0]) + ".mp4"
    clip_entry = final_json[video_id]['clip'][video_name]

    # Rebuild the scene list rather than removing items from the list that
    # is being iterated, which would skip the element after each removal.
    kept_scenes = []
    for scene in clip_entry['scene_split']:
        if scene['clip_id'] == key:
            if score < MIN_METEOR_SCORE:
                continue
            scene['score'] = score
        kept_scenes.append(scene)
    clip_entry['scene_split'] = kept_scenes

with open("final_mlda_data.json", "w") as outfile:
    json.dump(final_json, outfile)