### **1. Import libraries**

In [1]:
# all necessary imports
import numpy as np 
import pandas as pd 
import re
from collections import Counter
import os
import json
import string
import spacy
import nltk
from nltk.corpus import stopwords
from nltk.stem.porter import PorterStemmer
from nltk.tokenize import word_tokenize
import transformers
import torch
from torch.utils.data import Dataset
from datasets import Dataset
from dataclasses import dataclass
from transformers.tokenization_utils_base import PreTrainedTokenizerBase, PaddingStrategy
from typing import Optional, Union
import torch as T
import numpy as np
from transformers import AutoModelForMultipleChoice, TrainingArguments, Trainer
import torch.nn as nn
from transformers import AutoTokenizer
from torch.utils.data import DataLoader
from argparse import Namespace
from transformers import BertModel
from transformers import AdamW
from PIL import Image
import requests
import cv2 
from transformers import CLIPProcessor, CLIPModel
import shutil 
import math
import torch.optim as optim
from PIL import Image
import cv2 
from transformers import CLIPProcessor, CLIPModel
import pickle 
from sklearn.model_selection import train_test_split

In [2]:
# Export TOKENIZERS_PARALLELISM="false" to this process (and any child processes).
# NOTE(review): presumably meant to silence the Hugging Face tokenizers
# fork/parallelism warning when DataLoader workers are used — confirm.
os.environ["TOKENIZERS_PARALLELISM"] = "false"

In [3]:
# Use the %pip magic (not !pip) so the package is installed into the environment
# of the running kernel rather than whatever `pip` the subshell resolves to.
# NOTE: `datasets` is already imported by the first cell, so the upgraded
# version only takes effect after a kernel restart.
%pip install datasets --upgrade

  pid, fd = os.forkpty()




In [4]:
# Use the %pip magic (not !pip) so ipywidgets is installed into the kernel's
# own environment; the nbextension command stays a plain shell call.
%pip install ipywidgets --upgrade
!jupyter nbextension enable --py widgetsnbextension

Enabling notebook extension jupyter-js-widgets/extension...
Paths used for configuration of notebook: 
    	/root/.jupyter/nbconfig/notebook.json
Paths used for configuration of notebook: 
    	
      - Validating: [32mOK[0m
Paths used for configuration of notebook: 
    	/root/.jupyter/nbconfig/notebook.json


### **2. MSRVTT-RETRIEVAL PREPROCESSING**

In [5]:
# Input annotation files (JSON Lines) of the MSRVTT retrieval dataset.
MSRVTT_MC = "/kaggle/input/msrvtt-retrieval/mc_test.jsonl"
MSRVTT_TRAIN = "/kaggle/input/msrvtt-retrieval/train.jsonl"
MSRVTT_VAL = "/kaggle/input/msrvtt-retrieval/val.jsonl"
MSRVTT_TEST = "/kaggle/input/msrvtt-retrieval/test.jsonl"
# Video locations (the train/val video folder is currently disabled).
# MSRVTT_TRAIN_VAL_VIDEO = "/kaggle/input/msrvttqa/archive/TrainValVideo"
MSRVTT_TEST_VIDEO = "/kaggle/input/msrvtt-test-video/MSRVTT/videos/all"
# Working directories: MSRVTT_OUTPUT receives the converted .json files.
MSRVTT_OUTPUT = "/kaggle/working/msrvtt"
# NOTE(review): presumably the target folder for preprocessed videos — not used in the cells shown here.
MSRVTT_VIDEO_PREPROCESSED = "/kaggle/working/msrvtt_video"

In [6]:
# Create the output directory and report whether it was newly created
# or was already present.
try:
    os.makedirs(MSRVTT_OUTPUT)
except FileExistsError:
    print(f"Directory '{MSRVTT_OUTPUT}' already exists.")
else:
    print(f"Directory '{MSRVTT_OUTPUT}' created successfully.")

Directory '/kaggle/working/msrvtt' created successfully.


In [7]:
def convert_jsonl_to_json(input_jsonl_file, output_json_folder):
    """Convert a JSON Lines file into a single JSON file holding a list.

    Each non-empty line of ``input_jsonl_file`` is parsed as one JSON value and
    appended to a list; the list is written (indented) to
    ``<output_json_folder>/<input base name>.json``. Lines that are not valid
    JSON are reported on stdout and skipped.

    Args:
        input_jsonl_file: Path of the ``.jsonl`` file to read.
        output_json_folder: Folder that receives the ``.json`` file; it is
            created if it does not exist.

    Returns:
        The path of the JSON file that was written.
    """
    # Ensure the output folder exists
    os.makedirs(output_json_folder, exist_ok=True)

    # Determine the output JSON filename
    base_name = os.path.splitext(os.path.basename(input_jsonl_file))[0]
    output_json_file = os.path.join(output_json_folder, base_name + '.json')

    # Read the JSONL file and aggregate the data.
    # JSON text is UTF-8; state it explicitly instead of relying on the
    # platform's locale-dependent default encoding.
    data = []
    with open(input_jsonl_file, 'r', encoding='utf-8') as jsonl_file:
        for line_number, line in enumerate(jsonl_file, start=1):
            line = line.strip()
            if not line:  # Skip empty lines
                continue
            try:
                data.append(json.loads(line))
            except json.JSONDecodeError as e:
                # Best effort: report the malformed line and keep going
                print(f"Error decoding JSON on line {line_number}: {e}")

    # Write to the JSON file
    with open(output_json_file, 'w', encoding='utf-8') as json_file:
        json.dump(data, json_file, indent=4)

    print(f"Converted {input_jsonl_file} to {output_json_file}")
    return output_json_file


In [8]:
# Convert every annotation split (in the order mc, train, val, test) and keep
# the path of each generated .json file.
MSRVTT_MC_JSON, MSRVTT_TRAIN_JSON, MSRVTT_VAL_JSON, MSRVTT_TEST_JSON = [
    convert_jsonl_to_json(jsonl_path, MSRVTT_OUTPUT)
    for jsonl_path in (MSRVTT_MC, MSRVTT_TRAIN, MSRVTT_VAL, MSRVTT_TEST)
]

Converted /kaggle/input/msrvtt-retrieval/mc_test.jsonl to /kaggle/working/msrvtt/mc_test.json
Converted /kaggle/input/msrvtt-retrieval/train.jsonl to /kaggle/working/msrvtt/train.json
Converted /kaggle/input/msrvtt-retrieval/val.jsonl to /kaggle/working/msrvtt/val.json
Converted /kaggle/input/msrvtt-retrieval/test.jsonl to /kaggle/working/msrvtt/test.json


In [9]:
# Loading JSON Files and converting them to DataFrames
def _read_json_as_frame(json_path):
    """Parse the JSON file at `json_path` and wrap the result in a DataFrame."""
    with open(json_path, 'r') as json_handle:
        records = json.load(json_handle)
    return pd.DataFrame(records)


msrvtt_mc = _read_json_as_frame(MSRVTT_MC_JSON)
msrvtt_train = _read_json_as_frame(MSRVTT_TRAIN_JSON)
msrvtt_val = _read_json_as_frame(MSRVTT_VAL_JSON)
msrvtt_test = _read_json_as_frame(MSRVTT_TEST_JSON)

msrvtt_mc

Unnamed: 0,qid,clip_name,title,answer,options
0,mc0,video9770,msr143897,0,"[the boy is trying to fix the problem, a movie..."
1,mc1,video9771,msr169115,2,[a man dismisses a group of soldiers and a sce...
2,mc2,video7020,msr160593,2,[a wining team celebrates their victory at the...
3,mc3,video9773,msr145898,3,"[a man yells at a faucet then punches it, peop..."
4,mc4,video7026,msr162877,2,"[basically humans have helping intensity, a ro..."
...,...,...,...,...,...
2985,mc2985,video7937,msr195060,2,[a male singer performs before a live audience...
2986,mc2986,video7930,msr183709,4,"[a journalist talking to a guest, a woman head..."
2987,mc2987,video7931,msr174474,2,[animated characters from the beauty and the b...
2988,mc2988,video7932,msr178667,1,[a man is laying in bed next to a woman and sh...


In [10]:
# Converting msrvtt_mc to OUR STANDARD FORMAT

option_columns = ['a1', 'a2', 'a3', 'a4', 'a5']

# Work on a renamed copy ('qid' becomes 'id'); msrvtt_mc itself is left untouched
msrvtt = msrvtt_mc.rename(columns={'qid': 'id'})

# Expand each list stored in 'options' into the five answer columns
msrvtt[option_columns] = pd.DataFrame(msrvtt['options'].tolist(), index=msrvtt.index)

# Keep only the columns of the standard layout, in the standard order
msrvtt = msrvtt[['title', 'id', 'clip_name'] + option_columns + ['answer']]

# Display the reordered DataFrame
msrvtt

Unnamed: 0,title,id,clip_name,a1,a2,a3,a4,a5,answer
0,msr143897,mc0,video9770,the boy is trying to fix the problem,a movie trailer shows various scenes from a movie,asian man discusses technology in the younger ...,two men on wave runner in ocean rescuing a surfer,a group is dancing,0
1,msr169115,mc1,video9771,a man dismisses a group of soldiers and a scen...,a woman pushing a stroller,a young girl is in the gym,a woman is putting items into a miniature toy ...,a game show host hosting a game,2
2,msr160593,mc2,video7020,a wining team celebrates their victory at the ...,halo warriors music video,a woman wraps a baby doll in some fake leaves,people are playingg match,a person solving the rubik s cube,2
3,msr145898,mc3,video9773,a man yells at a faucet then punches it,people on a video laughing,a group of people on stage on the voice,an advertisement for a driving video game,a person is playing a video game,3
4,msr162877,mc4,video7026,basically humans have helping intensity,a rock band preforming a song,the announcer talks about the interior feature...,many women are walking on a runway in brown an...,a man carries a green block,2
...,...,...,...,...,...,...,...,...,...
2985,msr195060,mc2985,video7937,a male singer performs before a live audience,a male narrating a video game in the image,a curious gerbil peeks out of a white tube loo...,a group of women preform on stage,several people taste testing something,2
2986,msr183709,mc2986,video7930,a journalist talking to a guest,a woman headbutts a man,two girls going in a taxi while both speaking ...,a cartoon character is carrying a gun,a young woman in a red and white striped shirt...,4
2987,msr174474,mc2987,video7931,animated characters from the beauty and the be...,the man talks about how 7 million people are l...,the person talks about the woman,a boy is describing the back of a computer,someone is making food,2
2988,msr178667,mc2988,video7932,a man is laying in bed next to a woman and she...,a promo for a tv show or movie with spoken wor...,a young man in an ill fitting suit gives a pre...,there is a brown hair woman talking from the k...,a person zooms into the welcome board of a pla...,1


In [11]:
# Stack the val and test tables into one caption lookup table
msrvtt_total = pd.concat([msrvtt_val, msrvtt_test])

# Left join on clip_name: every multiple-choice row is kept and receives the
# caption of its clip; rows without a match get NaN in 'caption'
caption_lookup = msrvtt_total[['clip_name', 'caption']]
final_msrvtt = msrvtt.merge(caption_lookup, how='left', on='clip_name')
print(final_msrvtt.isnull().sum())
final_msrvtt

title          0
id             0
clip_name      0
a1             0
a2             0
a3             0
a4             0
a5             0
answer         0
caption      990
dtype: int64


Unnamed: 0,title,id,clip_name,a1,a2,a3,a4,a5,answer,caption
0,msr143897,mc0,video9770,the boy is trying to fix the problem,a movie trailer shows various scenes from a movie,asian man discusses technology in the younger ...,two men on wave runner in ocean rescuing a surfer,a group is dancing,0,a person is connecting something to system
1,msr169115,mc1,video9771,a man dismisses a group of soldiers and a scen...,a woman pushing a stroller,a young girl is in the gym,a woman is putting items into a miniature toy ...,a game show host hosting a game,2,a little girl does gymnastics
2,msr160593,mc2,video7020,a wining team celebrates their victory at the ...,halo warriors music video,a woman wraps a baby doll in some fake leaves,people are playingg match,a person solving the rubik s cube,2,a woman creating a fondant baby and flower
3,msr145898,mc3,video9773,a man yells at a faucet then punches it,people on a video laughing,a group of people on stage on the voice,an advertisement for a driving video game,a person is playing a video game,3,a boy plays grand theft auto 5
4,msr162877,mc4,video7026,basically humans have helping intensity,a rock band preforming a song,the announcer talks about the interior feature...,many women are walking on a runway in brown an...,a man carries a green block,2,a man is giving a review on a vehicle
...,...,...,...,...,...,...,...,...,...,...
2985,msr195060,mc2985,video7937,a male singer performs before a live audience,a male narrating a video game in the image,a curious gerbil peeks out of a white tube loo...,a group of women preform on stage,several people taste testing something,2,
2986,msr183709,mc2986,video7930,a journalist talking to a guest,a woman headbutts a man,two girls going in a taxi while both speaking ...,a cartoon character is carrying a gun,a young woman in a red and white striped shirt...,4,
2987,msr174474,mc2987,video7931,animated characters from the beauty and the be...,the man talks about how 7 million people are l...,the person talks about the woman,a boy is describing the back of a computer,someone is making food,2,
2988,msr178667,mc2988,video7932,a man is laying in bed next to a woman and she...,a promo for a tv show or movie with spoken wor...,a young man in an ill fitting suit gives a pre...,there is a brown hair woman talking from the k...,a person zooms into the welcome board of a pla...,1,


In [12]:
# Final DataFrame: the caption becomes the 'question' column and rows whose
# question is NaN are discarded
standard_columns = ['title', 'id', 'clip_name', 'question', 'a1', 'a2', 'a3', 'a4', 'a5', 'answer']

rows_with_question = (
    final_msrvtt
    .rename(columns={'caption': 'question'})
    .dropna(subset=["question"])
)

# Put the columns in the standard order
final_msrvtt = rows_with_question[standard_columns]

print(final_msrvtt.isnull().sum())
final_msrvtt

title        0
id           0
clip_name    0
question     0
a1           0
a2           0
a3           0
a4           0
a5           0
answer       0
dtype: int64


Unnamed: 0,title,id,clip_name,question,a1,a2,a3,a4,a5,answer
0,msr143897,mc0,video9770,a person is connecting something to system,the boy is trying to fix the problem,a movie trailer shows various scenes from a movie,asian man discusses technology in the younger ...,two men on wave runner in ocean rescuing a surfer,a group is dancing,0
1,msr169115,mc1,video9771,a little girl does gymnastics,a man dismisses a group of soldiers and a scen...,a woman pushing a stroller,a young girl is in the gym,a woman is putting items into a miniature toy ...,a game show host hosting a game,2
2,msr160593,mc2,video7020,a woman creating a fondant baby and flower,a wining team celebrates their victory at the ...,halo warriors music video,a woman wraps a baby doll in some fake leaves,people are playingg match,a person solving the rubik s cube,2
3,msr145898,mc3,video9773,a boy plays grand theft auto 5,a man yells at a faucet then punches it,people on a video laughing,a group of people on stage on the voice,an advertisement for a driving video game,a person is playing a video game,3
4,msr162877,mc4,video7026,a man is giving a review on a vehicle,basically humans have helping intensity,a rock band preforming a song,the announcer talks about the interior feature...,many women are walking on a runway in brown an...,a man carries a green block,2
...,...,...,...,...,...,...,...,...,...,...
2972,msr141250,mc2972,video9737,a news clip from rt of a terror attack,a bloomberg analyst talking about growth in ma...,three girls at a public place and young man sh...,the girl is inside the gaming side and the jol...,a news video showing the aftermath of a terror...,an education analyst speaks about the data sci...,3
2977,msr152848,mc2977,video8002,a man is folding a piece of paper,a person is folding origami,a man explains about a disease called kuru dis...,shrimp is being shown and it seems that they a...,slide show of couples in love,nba star tracy mcgrady is being interviewed in...,0
2981,msr159368,mc2981,video8006,a person flying in a helicopter and is going t...,a woman talking about the movie brooklyn nine ...,a person is preparing a food on a pan,a reel of sports highlights,a video game plane is flying over water,a person is folding paper,3
2982,msr158401,mc2982,video7934,some women are dancing on a stage,a video game about car race and shooting,red headed woman is travelling in europe by ai...,a girl group performance on stage,chefs and waiters move about in a very large k...,game scene is shown here,2


In [13]:
# Checking for empty strings
# DataFrame.equals("") compares the whole frame with a str object and is
# therefore always False; compare element-wise instead and report whether
# any cell is exactly the empty string.
print(final_msrvtt.eq("").any().any())

False


In [14]:
# Converting to Lowercase
text_columns = ['title','id', 'clip_name','question', 'a1', 'a2', 'a3', 'a4', 'a5']

# Lower-case one text column at a time
for column_name in text_columns:
    final_msrvtt[column_name] = final_msrvtt[column_name].str.lower()

final_msrvtt

Unnamed: 0,title,id,clip_name,question,a1,a2,a3,a4,a5,answer
0,msr143897,mc0,video9770,a person is connecting something to system,the boy is trying to fix the problem,a movie trailer shows various scenes from a movie,asian man discusses technology in the younger ...,two men on wave runner in ocean rescuing a surfer,a group is dancing,0
1,msr169115,mc1,video9771,a little girl does gymnastics,a man dismisses a group of soldiers and a scen...,a woman pushing a stroller,a young girl is in the gym,a woman is putting items into a miniature toy ...,a game show host hosting a game,2
2,msr160593,mc2,video7020,a woman creating a fondant baby and flower,a wining team celebrates their victory at the ...,halo warriors music video,a woman wraps a baby doll in some fake leaves,people are playingg match,a person solving the rubik s cube,2
3,msr145898,mc3,video9773,a boy plays grand theft auto 5,a man yells at a faucet then punches it,people on a video laughing,a group of people on stage on the voice,an advertisement for a driving video game,a person is playing a video game,3
4,msr162877,mc4,video7026,a man is giving a review on a vehicle,basically humans have helping intensity,a rock band preforming a song,the announcer talks about the interior feature...,many women are walking on a runway in brown an...,a man carries a green block,2
...,...,...,...,...,...,...,...,...,...,...
2972,msr141250,mc2972,video9737,a news clip from rt of a terror attack,a bloomberg analyst talking about growth in ma...,three girls at a public place and young man sh...,the girl is inside the gaming side and the jol...,a news video showing the aftermath of a terror...,an education analyst speaks about the data sci...,3
2977,msr152848,mc2977,video8002,a man is folding a piece of paper,a person is folding origami,a man explains about a disease called kuru dis...,shrimp is being shown and it seems that they a...,slide show of couples in love,nba star tracy mcgrady is being interviewed in...,0
2981,msr159368,mc2981,video8006,a person flying in a helicopter and is going t...,a woman talking about the movie brooklyn nine ...,a person is preparing a food on a pan,a reel of sports highlights,a video game plane is flying over water,a person is folding paper,3
2982,msr158401,mc2982,video7934,some women are dancing on a stage,a video game about car race and shooting,red headed woman is travelling in europe by ai...,a girl group performance on stage,chefs and waiters move about in a very large k...,game scene is shown here,2


In [15]:
# Removing leading/trailing white space, one text column at a time
for column_name in text_columns:
    final_msrvtt[column_name] = final_msrvtt[column_name].str.strip()

# Removing Punctuations
PUNCT_TO_REMOVE = string.punctuation
# Deletion table built once; rebuilding it inside the function would repeat
# the same work for every cell of the DataFrame.
_PUNCT_TRANSLATION_TABLE = str.maketrans('', '', PUNCT_TO_REMOVE)

def remove_punctuation(text):
    """Return `text` with every character of PUNCT_TO_REMOVE deleted.

    Args:
        text: The string to clean.

    Returns:
        A new string without the punctuation characters; all other
        characters are kept in their original order.
    """
    return text.translate(_PUNCT_TRANSLATION_TABLE)

# Strip punctuation from every cell of the text columns, column by column
for column_name in text_columns:
    final_msrvtt[column_name] = final_msrvtt[column_name].map(remove_punctuation)
final_msrvtt.head()

Unnamed: 0,title,id,clip_name,question,a1,a2,a3,a4,a5,answer
0,msr143897,mc0,video9770,a person is connecting something to system,the boy is trying to fix the problem,a movie trailer shows various scenes from a movie,asian man discusses technology in the younger ...,two men on wave runner in ocean rescuing a surfer,a group is dancing,0
1,msr169115,mc1,video9771,a little girl does gymnastics,a man dismisses a group of soldiers and a scen...,a woman pushing a stroller,a young girl is in the gym,a woman is putting items into a miniature toy ...,a game show host hosting a game,2
2,msr160593,mc2,video7020,a woman creating a fondant baby and flower,a wining team celebrates their victory at the ...,halo warriors music video,a woman wraps a baby doll in some fake leaves,people are playingg match,a person solving the rubik s cube,2
3,msr145898,mc3,video9773,a boy plays grand theft auto 5,a man yells at a faucet then punches it,people on a video laughing,a group of people on stage on the voice,an advertisement for a driving video game,a person is playing a video game,3
4,msr162877,mc4,video7026,a man is giving a review on a vehicle,basically humans have helping intensity,a rock band preforming a song,the announcer talks about the interior feature...,many women are walking on a runway in brown an...,a man carries a green block,2


In [16]:
# Text Cleaning

# A "special character" is anything other than a letter, a digit, an
# underscore, a hyphen or a space. The pattern is a raw string: in a normal
# string literal '\-' is an invalid escape sequence and triggers a warning.
SPECIAL_CHAR_PATTERN = r'[^a-zA-Z0-9_\- ]'

# Boolean mask: True for every cell that contains a special character.
# NOTE(review): `.str` is applied to every column, including `answer`; this
# only works if that column holds strings — confirm.
special_char_mask = final_msrvtt.apply(lambda x: x.str.contains(SPECIAL_CHAR_PATTERN, na=False))

# Rows with at least one special character (kept in a variable for inspection)
rows_with_special_chars = final_msrvtt[special_char_mask.any(axis=1)]

# Remove special characters from all columns
final_msrvtt = final_msrvtt.apply(lambda x: x.str.replace(SPECIAL_CHAR_PATTERN, '', regex=True))

# To check that the special characters were removed, extract the offending rows
# again -> if none is found, the replacement worked
# final_msrvtt[final_msrvtt.apply(lambda x: x.str.contains(SPECIAL_CHAR_PATTERN, na=False)).any(axis=1)]

In [17]:
# Removing Stopwords
stop = set(stopwords.words('english'))

def remove_stopwords(text):
    """Return `text` with every whitespace-separated word found in `stop` dropped.

    The remaining words are joined back together with single spaces.
    """
    kept_words = (word for word in text.split() if word not in stop)
    return ' '.join(kept_words)

stopword_columns = ['title','id', 'clip_name','question', 'a1', 'a2', 'a3', 'a4', 'a5']
final_msrvtt[stopword_columns] = final_msrvtt[stopword_columns].map(remove_stopwords)

In [18]:
# New frame with a fresh 0..n-1 index; final_msrvtt keeps its original index
last_msrvtt = final_msrvtt.reset_index(drop=True)
last_msrvtt

Unnamed: 0,title,id,clip_name,question,a1,a2,a3,a4,a5,answer
0,msr143897,mc0,video9770,person connecting something system,boy trying fix problem,movie trailer shows various scenes movie,asian man discusses technology younger generat...,two men wave runner ocean rescuing surfer,group dancing,0
1,msr169115,mc1,video9771,little girl gymnastics,man dismisses group soldiers scene soldiers wa...,woman pushing stroller,young girl gym,woman putting items miniature toy oven,game show host hosting game,2
2,msr160593,mc2,video7020,woman creating fondant baby flower,wining team celebrates victory end game,halo warriors music video,woman wraps baby doll fake leaves,people playingg match,person solving rubik cube,2
3,msr145898,mc3,video9773,boy plays grand theft auto 5,man yells faucet punches,people video laughing,group people stage voice,advertisement driving video game,person playing video game,3
4,msr162877,mc4,video7026,man giving review vehicle,basically humans helping intensity,rock band preforming song,announcer talks interior features car,many women walking runway brown red dresses,man carries green block,2
...,...,...,...,...,...,...,...,...,...,...
1995,msr141250,mc2972,video9737,news clip rt terror attack,bloomberg analyst talking growth major cities,three girls public place young man shouts medi...,girl inside gaming side jolly mood,news video showing aftermath terrorist attack ...,education analyst speaks data science,3
1996,msr152848,mc2977,video8002,man folding piece paper,person folding origami,man explains disease called kuru disease,shrimp shown seems cooked,slide show couples love,nba star tracy mcgrady interviewed beyond glor...,0
1997,msr159368,mc2981,video8006,person flying helicopter going shoot people,woman talking movie brooklyn nine nine,person preparing food pan,reel sports highlights,video game plane flying water,person folding paper,3
1998,msr158401,mc2982,video7934,women dancing stage,video game car race shooting,red headed woman travelling europe airplane,girl group performance stage,chefs waiters move large kitchen,game scene shown,2


In [19]:
# Word frequencies over the question and the five answer columns
caption_columns = ['question', 'a1', 'a2', 'a3', 'a4', 'a5']

# One space-joined string per row
combined_text = last_msrvtt[caption_columns].apply(' '.join, axis=1)

# Count the whitespace-separated words row by row
word_counts = Counter()
for row_text in combined_text:
    word_counts.update(row_text.split())
word_count_dict = dict(word_counts)

# Same counts, ordered from the most to the least frequent word
sorted_word_count_dict = dict(word_counts.most_common())

In [20]:
# Work on a copy so that last_msrvtt keeps its plain-string columns
tokenized_text = last_msrvtt.copy() 
def tokenize_text(text):
    """Split `text` into tokens with `word_tokenize`.

    Returns an empty list when `text` is not a string.
    """
    # Guard clause: anything that is not a str yields no tokens
    if not isinstance(text, str):
        return []
    return word_tokenize(text)


# Replace every cell of the text columns by its list of tokens, column by column
for column_name in text_columns:
    tokenized_text[column_name] = tokenized_text[column_name].map(tokenize_text)

tokenized_text


Unnamed: 0,title,id,clip_name,question,a1,a2,a3,a4,a5,answer
0,[msr143897],[mc0],[video9770],"[person, connecting, something, system]","[boy, trying, fix, problem]","[movie, trailer, shows, various, scenes, movie]","[asian, man, discusses, technology, younger, g...","[two, men, wave, runner, ocean, rescuing, surfer]","[group, dancing]",0
1,[msr169115],[mc1],[video9771],"[little, girl, gymnastics]","[man, dismisses, group, soldiers, scene, soldi...","[woman, pushing, stroller]","[young, girl, gym]","[woman, putting, items, miniature, toy, oven]","[game, show, host, hosting, game]",2
2,[msr160593],[mc2],[video7020],"[woman, creating, fondant, baby, flower]","[wining, team, celebrates, victory, end, game]","[halo, warriors, music, video]","[woman, wraps, baby, doll, fake, leaves]","[people, playingg, match]","[person, solving, rubik, cube]",2
3,[msr145898],[mc3],[video9773],"[boy, plays, grand, theft, auto, 5]","[man, yells, faucet, punches]","[people, video, laughing]","[group, people, stage, voice]","[advertisement, driving, video, game]","[person, playing, video, game]",3
4,[msr162877],[mc4],[video7026],"[man, giving, review, vehicle]","[basically, humans, helping, intensity]","[rock, band, preforming, song]","[announcer, talks, interior, features, car]","[many, women, walking, runway, brown, red, dre...","[man, carries, green, block]",2
...,...,...,...,...,...,...,...,...,...,...
1995,[msr141250],[mc2972],[video9737],"[news, clip, rt, terror, attack]","[bloomberg, analyst, talking, growth, major, c...","[three, girls, public, place, young, man, shou...","[girl, inside, gaming, side, jolly, mood]","[news, video, showing, aftermath, terrorist, a...","[education, analyst, speaks, data, science]",3
1996,[msr152848],[mc2977],[video8002],"[man, folding, piece, paper]","[person, folding, origami]","[man, explains, disease, called, kuru, disease]","[shrimp, shown, seems, cooked]","[slide, show, couples, love]","[nba, star, tracy, mcgrady, interviewed, beyon...",0
1997,[msr159368],[mc2981],[video8006],"[person, flying, helicopter, going, shoot, peo...","[woman, talking, movie, brooklyn, nine, nine]","[person, preparing, food, pan]","[reel, sports, highlights]","[video, game, plane, flying, water]","[person, folding, paper]",3
1998,[msr158401],[mc2982],[video7934],"[women, dancing, stage]","[video, game, car, race, shooting]","[red, headed, woman, travelling, europe, airpl...","[girl, group, performance, stage]","[chefs, waiters, move, large, kitchen]","[game, scene, shown]",2


In [21]:
# Display the row at position 1999 of the cleaned frame as a quick sanity check
last_msrvtt.iloc[1999]

title                                         msr166104
id                                               mc2984
clip_name                                     video7936
question        soccer players hugging celebrating goal
a1           old footage wwf wrestling match hulk hogan
a2                                person preparing food
a3                                 soccer game progress
a4                               old clip baseball game
a5                              man garage showing cars
answer                                                2
Name: 1999, dtype: object

### **3. CLIP Model & Frame Embedding** 

In [22]:
# Load CLIP model and processor from Hugging Face
# Use the GPU when one is available, otherwise fall back to the CPU
device = "cuda" if torch.cuda.is_available() else "cpu"
# The model is moved to `device`; the processor prepares inputs and stays where it is
model = CLIPModel.from_pretrained("openai/clip-vit-base-patch16").to(device)
processor = CLIPProcessor.from_pretrained("openai/clip-vit-base-patch16")

config.json:   0%|          | 0.00/4.10k [00:00<?, ?B/s]

pytorch_model.bin:   0%|          | 0.00/599M [00:00<?, ?B/s]

preprocessor_config.json:   0%|          | 0.00/316 [00:00<?, ?B/s]

tokenizer_config.json:   0%|          | 0.00/905 [00:00<?, ?B/s]

vocab.json:   0%|          | 0.00/961k [00:00<?, ?B/s]

merges.txt:   0%|          | 0.00/525k [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/2.22M [00:00<?, ?B/s]

special_tokens_map.json:   0%|          | 0.00/389 [00:00<?, ?B/s]



In [23]:
def get_single_image_embedding(my_image):
    """Embed one image with the CLIP image encoder.

    The image is preprocessed with the notebook-level `processor`, moved to
    `device` and passed through `model.get_image_features`.

    Args:
        my_image: The image handed to `processor(images=...)`.
            NOTE(review): presumably a PIL image or an RGB array — confirm
            against the processor's documentation.

    Returns:
        The embedding as a NumPy array on the CPU.
    """
    pixel_values = processor(
        text=None,
        images=my_image,
        return_tensors="pt"
    )["pixel_values"].to(device)

    # Inference only: without no_grad() every call records an autograd graph
    # that is never used, which wastes memory for each embedded frame.
    with torch.no_grad():
        embedding = model.get_image_features(pixel_values)

    embedding_as_np = embedding.cpu().detach().numpy()
    return embedding_as_np


def frames_embeddings(video_path):
    """Return the CLIP embeddings of frames sampled from a video.

    Roughly three frames per second are sampled: every
    ``int(fps / 3)``-th frame (at least every frame) is embedded with
    `get_single_image_embedding`.

    Args:
        video_path: Path of the video file to read.

    Returns:
        A NumPy array with one row per sampled frame.

    Raises:
        ValueError: If the video cannot be opened or yields no frame.
    """
    video_obj = cv2.VideoCapture(video_path)
    if not video_obj.isOpened():
        video_obj.release()
        raise ValueError(f"Could not open video: {video_path}")

    interval_factor = 3
    fps = video_obj.get(cv2.CAP_PROP_FPS)
    # int(fps / 3) is 0 when fps < 3 (or when the fps metadata is missing),
    # which would make the modulo below divide by zero; sample every frame then.
    frame_interval = max(1, int(fps / interval_factor))

    all_embeddings = []
    frame_count = 0
    try:
        while True:
            success, frame = video_obj.read()
            if not success:
                break

            if frame_count % frame_interval == 0:
                # OpenCV decodes frames in BGR channel order; convert to RGB
                # before the frame is handed to the CLIP processor.
                rgb_frame = cv2.cvtColor(frame, cv2.COLOR_BGR2RGB)
                all_embeddings.append(get_single_image_embedding(rgb_frame))

            frame_count += 1
    finally:
        # Always free the capture handle, even if embedding a frame fails
        video_obj.release()
        # Release cached GPU memory once per video instead of once per frame
        torch.cuda.empty_cache()

    if not all_embeddings:
        raise ValueError(f"No frames could be read from video: {video_path}")

    # Each embedding has a leading axis of length 1; stack them along it
    return np.concatenate(all_embeddings, axis=0)
    
        
# Smoke test: embed the sampled frames of one clip and print the array shape.
# NOTE(review): the path is hard-coded and points at a dataset other than
# MSRVTT_TEST_VIDEO — confirm that this dataset is attached.
video_path = "/kaggle/input/msrvttqa/archive/TrainValVideo/video1.mp4"
frames = frames_embeddings(video_path)
print("frames", frames.shape)   
# The result is 2-D: (num_frames, embedding_size); the length-1 axis of each
# per-frame embedding is squeezed out inside frames_embeddings.

frames (69, 512)


### **4. MSRVTT Text Encoder**

### **4.1 Defining Tokenizer**

In [24]:
# Load the pre-trained tokenizer (and its vocabulary) of the BERT base uncased checkpoint
# NOTE: AutoTokenizer is already imported in the import cell; this import is redundant.
from transformers import AutoTokenizer

tokenizer = AutoTokenizer.from_pretrained("google-bert/bert-base-uncased")

# Example of what the tokenizer does -> it outputs 'input_ids', 'token_type_ids', 'attention_mask'
# Use the tokenizer to transform a text (pair) into its token representation:
#tokenizer("Hello, this is a sentence!", "And this sentence goes with it.")

tokenizer_config.json:   0%|          | 0.00/48.0 [00:00<?, ?B/s]

config.json:   0%|          | 0.00/570 [00:00<?, ?B/s]

vocab.txt:   0%|          | 0.00/232k [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/466k [00:00<?, ?B/s]

### **4.2 Defining MultipleChoiceDataset for Text Input** 

In [25]:
class MultipleChoiceDataset(Dataset):
    """Pre-tokenised multiple-choice examples.

    Every example of `dataset` is turned into one (question, option) sentence
    pair per entry of `options`; all pairs are tokenised once, in the
    constructor, and kept in `self.encoded_data`.

    NOTE(review): the base class `Dataset` is whichever of the two `Dataset`
    imports at the top of the notebook ran last — confirm that this is the
    intended one.

    Args:
        dataset: Iterable of mapping-like examples providing the keys
            "question", "label" and one key per entry of `options`.
        tokenizer: Callable used to tokenise the sentence pairs.
        options: Names of the example keys that hold the answer options.
        max_length: Forwarded to the tokenizer as `max_length`.
        padding: Stored on the instance only; the tokenizer is always called
            with padding="max_length".
    """

    def __init__(self, dataset, tokenizer, options, max_length=None, padding=True):
        self.dataset = dataset
        self.tokenizer = tokenizer
        self.options = options
        self.max_length = max_length
        self.padding = padding
        # Tokenise everything up front so item access is a plain lookup
        self.encoded_data = self.preprocess_function()

    def preprocess_function(self):
        """Tokenise every example and return the list of encoded records."""
        num_options = len(self.options)
        encoded_records = []
        for example in self.dataset:
            # The question is paired with each option in turn
            questions = [example["question"]] * num_options
            candidates = [example[name] for name in self.options]

            encoding = self.tokenizer(
                questions,
                candidates,
                padding="max_length",
                truncation=True,
                max_length=self.max_length,
                return_tensors="pt",
            )

            record = {
                "input_ids": encoding["input_ids"],
                "attention_mask": encoding["attention_mask"],
                "label": int(example["label"]),  # label stored as a plain int
            }
            encoded_records.append(record)
        return encoded_records

    def __len__(self):
        """Number of encoded examples."""
        return len(self.encoded_data)

    def __getitem__(self, idx):
        """Return one encoded example, or a stacked batch when `idx` is a list."""
        if not isinstance(idx, list):
            # Single index: hand out the tensors of that one example.
            # squeeze(0) only drops the first axis when it has length 1.
            return {
                "input_ids": self.encoded_data[idx]["input_ids"].squeeze(0),
                "attention_mask": self.encoded_data[idx]["attention_mask"].squeeze(0),
                "label": torch.tensor(self.encoded_data[idx]["label"], dtype=torch.long),
            }

        # List of indices: gather the records and stack them along a new first axis
        records = [self.encoded_data[position] for position in idx]
        stacked_ids = torch.stack([record["input_ids"].squeeze(0) for record in records])
        stacked_mask = torch.stack([record["attention_mask"].squeeze(0) for record in records])
        label_tensor = torch.tensor([record["label"] for record in records], dtype=torch.long)
        return {
            "input_ids": stacked_ids,
            "attention_mask": stacked_mask,
            "label": label_tensor,
        }

#max_length = 128 
#options = ["a1", "a2", "a3", "a4", "a5"]  # Specify your options here

# Assuming 'datasets' is your input dataset, and 'tokenizer' is the initialized tokenizer

#datasets = Dataset.from_pandas(last_msrvtt)
#datasets = datasets.rename_column("answer", "label")
#print("datasets",datasets)
#print("datasets[0]",datasets[0])
            
#print(type(datasets))
#dataset = MultipleChoiceDataset(datasets, tokenizer, options, max_length=max_length)
#print(len(dataset))   # 2000 -> we have 2000 questions 
#sample = dataset[1]
#print(sample)
#sample = datasets[0]
#print("sample", sample)
#dataset = MultipleChoiceDataset([sample], tokenizer, options, max_length = max_length)
#print("dataset", dataset[0])

In [26]:
# Disabled smoke test for MultipleChoiceDataset + DataLoader. The code below is
# wrapped in a bare triple-quoted string, so none of it executes; the cell only
# evaluates (and echoes) the string itself.
''' 
datasets = Dataset.from_pandas(last_msrvtt)
# Testing the class with your dataset
max_length = 128 
options = ["a1", "a2", "a3", "a4", "a5"]  # Specify your options here

# Assuming 'datasets' is your input dataset, and 'tokenizer' is the initialized tokenizer
print(type(datasets))
dataset = MultipleChoiceDataset(datasets, tokenizer, options, max_length=max_length)

# Access a sample to see the output
sample = dataset[1]
print(sample)
print(len(dataset))


# Create DataLoader without using collate_fn
batch_size = 8 
train_dataloader = DataLoader(dataset, batch_size=batch_size, shuffle=True)
print(len(train_dataloader))
#val_dataloader = DataLoader(val_dataset, batch_size=batch_size, shuffle=False)



for batch_idx, batch in enumerate(train_dataloader):
    print("Input IDs:", batch["input_ids"].shape, batch["input_ids"])
    print("Attention Mask:", batch["attention_mask"].shape, batch["attention_mask"])
    print("Labels:", batch["label"].shape, batch["label"])
    break  # To get only the first batch
''' 

' \ndatasets = Dataset.from_pandas(last_msrvtt)\n# Testing the class with your dataset\nmax_length = 128 \noptions = ["a1", "a2", "a3", "a4", "a5"]  # Specify your options here\n\n# Assuming \'datasets\' is your input dataset, and \'tokenizer\' is the initialized tokenizer\nprint(type(datasets))\ndataset = MultipleChoiceDataset(datasets, tokenizer, options, max_length=max_length)\n\n# Access a sample to see the output\nsample = dataset[1]\nprint(sample)\nprint(len(dataset))\n\n\n# Create DataLoader without using collate_fn\nbatch_size = 8 \ntrain_dataloader = DataLoader(dataset, batch_size=batch_size, shuffle=True)\nprint(len(train_dataloader))\n#val_dataloader = DataLoader(val_dataset, batch_size=batch_size, shuffle=False)\n\n\n\nfor batch_idx, batch in enumerate(train_dataloader):\n    print("Input IDs:", batch["input_ids"].shape, batch["input_ids"])\n    print("Attention Mask:", batch["attention_mask"].shape, batch["attention_mask"])\n    print("Labels:", batch["label"].shape, batch

In [27]:
#for batch_idx, batch in enumerate(train_dataloader):
#    print(f"Batch {batch_idx}:")
#    print("Input IDs:", batch["input_ids"])
#    print("Attention Mask:", batch["attention_mask"])
#    print("Labels:", batch["label"])

In [28]:
# Decode the input IDs to see the original text   -> 
#decoded_input = tokenizer.decode(sample['input_ids'][0], skip_special_tokens=True)
#print("Decoded Input:", decoded_input)

#### **4.2 EncTxtNew and CrossAttention, out.shape [batch_size*num_choice, seq_len, embed_bert]**

In [29]:
class EncTxtNew(torch.nn.Module):
    """Text encoder built from the embedding layer and encoder stack of a pretrained checkpoint.

    The checkpoint is loaded with ``transformers.AutoModel.from_pretrained`` and
    only its ``embeddings`` and ``encoder`` sub-modules are registered on this
    module. Questions and answers are encoded separately by the same layers.

    Args:
        bert_checkpoint: Name or path handed to ``AutoModel.from_pretrained``.
        hidden_size: Stored on the instance; not otherwise used by this class.
        num_attention_heads: Stored on the instance; not otherwise used by this class.
    """

    def __init__(self, bert_checkpoint, hidden_size, num_attention_heads):
        super().__init__()

        self.bert_checkpoint = bert_checkpoint
        self.hidden_size = hidden_size
        self.num_attention_heads = num_attention_heads
        bert = transformers.AutoModel.from_pretrained(bert_checkpoint)
        self.emb_txt = bert.embeddings
        self.txt_trsfr = bert.encoder
        # Bound method of the loaded model: it keeps a reference to `bert`,
        # so `del bert` below only removes the local name.
        self.mask_ext = bert.get_extended_attention_mask
        self.size_vocab = bert.config.vocab_size
        del bert

        self.initialize_weights()

    def _pretrained_modules(self):
        """Collect every sub-module that belongs to the checkpoint-loaded layers."""
        kept = set()
        for root in (getattr(self, "emb_txt", None), getattr(self, "txt_trsfr", None)):
            if isinstance(root, nn.Module):
                kept.update(root.modules())
        return kept

    def initialize_weights(self):
        """Initialise Linear / Embedding / LayerNorm layers that were NOT loaded from the checkpoint.

        Sub-modules of ``emb_txt`` and ``txt_trsfr`` are skipped on purpose:
        re-initialising them would throw away the weights that
        ``from_pretrained`` has just loaded. Any additional layer registered on
        this module (for example by a subclass) is still initialised.
        """
        pretrained = self._pretrained_modules()
        for module in self.modules():
            if module in pretrained:
                continue
            if isinstance(module, nn.Linear):
                nn.init.xavier_uniform_(module.weight)
                if module.bias is not None:
                    nn.init.zeros_(module.bias)
            elif isinstance(module, nn.Embedding):
                nn.init.normal_(module.weight, mean=0.0, std=0.02)
            elif isinstance(module, nn.LayerNorm):
                nn.init.ones_(module.weight)
                nn.init.zeros_(module.bias)

    def get_attn_mask(self, mask_txt, attn_mask_type="full"):
        """Return ``mask_txt`` unchanged; ``attn_mask_type`` is accepted but ignored (full attention only)."""
        return mask_txt

    def _encode(self, token_ids, mask, attn_mask_type, **embedding_kwargs):
        """Embed ``token_ids`` and, when an encoder stack is present, run it under ``mask``."""
        hidden = self.emb_txt(token_ids, **embedding_kwargs)
        if self.txt_trsfr is None:
            return hidden

        # Without an explicit mask every position is attended to.
        if mask is None:
            mask = torch.ones_like(token_ids)
        ext_mask = self.get_attn_mask(mask, attn_mask_type)
        ext_mask = self.mask_ext(ext_mask, ext_mask.shape, ext_mask.device)
        ext_mask = ext_mask.to(dtype=hidden.dtype, device=hidden.device)
        return self.txt_trsfr(hidden, ext_mask, output_attentions=False)['last_hidden_state']

    def forward(self, questions, answers, mask_questions=None, mask_answers=None, token_type_ids=None, position_ids=None, attn_mask_type="full"):
        """Encode one question and all answer candidates.

        Args:
            questions: Token ids; only the first entry along dim 0 is encoded,
                then flattened to ``(-1, questions.size(-1))``.
            answers: Token ids, flattened to ``(-1, answers.size(-1))``.
            mask_questions: Optional attention mask for ``questions``; sliced
                and flattened the same way. Defaults to all ones.
            mask_answers: Optional attention mask for ``answers``. Defaults to all ones.
            token_type_ids: Optional; flattened and passed to the embedding
                layer for the question only.
            position_ids: Optional; flattened and passed to the embedding layer
                for the question only.
            attn_mask_type: Forwarded to ``get_attn_mask`` (currently ignored there).

        Returns:
            Tuple ``(f_questions, f_answers)`` holding the encoder's
            ``last_hidden_state`` for the question and for the answers.
        """
        # Keep a single question row; every answer row is encoded.
        questions = questions[:1].reshape(-1, questions.size(-1))
        answers = answers.reshape(-1, answers.size(-1))

        if mask_questions is not None:
            mask_questions = mask_questions[:1].reshape(-1, mask_questions.size(-1))
        if mask_answers is not None:
            mask_answers = mask_answers.reshape(-1, mask_answers.size(-1))
        if token_type_ids is not None:
            token_type_ids = token_type_ids.reshape(-1, token_type_ids.size(-1))
        if position_ids is not None:
            position_ids = position_ids.reshape(-1, position_ids.size(-1))

        f_questions = self._encode(
            questions, mask_questions, attn_mask_type,
            token_type_ids=token_type_ids, position_ids=position_ids,
        )
        f_answers = self._encode(answers, mask_answers, attn_mask_type)

        return f_questions, f_answers
    

In [30]:
# Wrap the MSRVTT frame in a Dataset and expose the ground-truth column as "label".
datasets = Dataset.from_pandas(last_msrvtt).rename_column("answer", "label")

print("datasets", datasets)
print("datasets[0]", datasets[0])

# Back to pandas for row-wise slicing.
pd_datasets = datasets.to_pandas()

# Small preview slice: the first 20 rows.
first_20_rows = pd_datasets.head(20)

datasets Dataset({
    features: ['title', 'id', 'clip_name', 'question', 'a1', 'a2', 'a3', 'a4', 'a5', 'label'],
    num_rows: 2000
})
datasets[0] {'title': 'msr143897', 'id': 'mc0', 'clip_name': 'video9770', 'question': 'person connecting something system', 'a1': 'boy trying fix problem', 'a2': 'movie trailer shows various scenes movie', 'a3': 'asian man discusses technology younger generations', 'a4': 'two men wave runner ocean rescuing surfer', 'a5': 'group dancing', 'label': '0'}


In [31]:
# Rebuild the labelled dataset so this cell can be run on its own.
datasets = Dataset.from_pandas(last_msrvtt).rename_column("answer", "label")

print("datasets", datasets)
print("datasets[0]", datasets[0])
pd_datasets = datasets.to_pandas()

# 60/40 train/test split with a fixed seed. No validation split is carved out;
# a second train_test_split on the held-out part would be needed for one.
train_df, test_df = train_test_split(pd_datasets, random_state=42, test_size=0.4)

print(type(train_df))
print(len(train_df), len(test_df), sep="\n")

datasets Dataset({
    features: ['title', 'id', 'clip_name', 'question', 'a1', 'a2', 'a3', 'a4', 'a5', 'label'],
    num_rows: 2000
})
datasets[0] {'title': 'msr143897', 'id': 'mc0', 'clip_name': 'video9770', 'question': 'person connecting something system', 'a1': 'boy trying fix problem', 'a2': 'movie trailer shows various scenes movie', 'a3': 'asian man discusses technology younger generations', 'a4': 'two men wave runner ocean rescuing surfer', 'a5': 'group dancing', 'label': '0'}
<class 'pandas.core.frame.DataFrame'>
1200
800


### **5. Matchy Dataset**

In [32]:
# Passing questions as questions and answers as values 
# -> text_embedding.shape = [batch_size * num_choices, variable_query_tokens_len, 768]
def MatchyDataset(dict_dataset, model_txt, model_clip, processor, tokenizer, options, max_length, device, output_file):
    model_txt.eval()
    model_clip.eval()

    matchy_dataset = []
    for i in range(len(dict_dataset)):
        print("Processing sample", i)
        sample = dict_dataset.iloc[i].to_dict()  # Convert row to dictionary

        video_path = os.path.join("/kaggle/input/video-nlp/msrvtt_video/", sample['clip_name'] + ".mp4")
        
        sample_dataset = MultipleChoiceDataset([sample], tokenizer, options, max_length=max_length)

        input_ids = sample_dataset[0]["input_ids"].to(device)  # [batch_size, num_choices, max_length]
        #print("input_ids multiple choice dataset", input_ids.shape)
        attention_mask = sample_dataset[0]["attention_mask"].to(device)
        #print("attention_mask multiple choice dataset", attention_mask.shape)
        labels = sample_dataset[0]["label"].to(device)
        #print("labels multiple choice dataset", labels.shape)

        # Reshape for multiple choices
        input_ids = input_ids.view(-1, max_length)  # [batch_size * num_choices, max_length]
        #print("input_ids reshaping", input_ids.shape)
        attention_mask = attention_mask.view(-1, max_length)  # Same shape adjustment
        #print("attention mask reshaping", attention_mask.shape)
        
        # Split into questions (question tokens) and answers (answer tokens)
        sep_token_id = tokenizer.sep_token_id  # Get the ID for the [SEP] token
        #print("sep_token_id", sep_token_id)
        questions, answers = [], []
        questions_mask, answers_mask = [], []

        for idx in range(input_ids.size(0)):
            input_row = input_ids[idx]
            mask_row = attention_mask[idx]

            # Find the position of the first [SEP]
            sep_index = (input_row == sep_token_id).nonzero(as_tuple=True)[0][0].item()

            # Extract question (query) and answer (keys/values) tokens
            question_tokens = input_row[:sep_index]  # Tokens before the first [SEP]
            answer_tokens = input_row[sep_index + 1:]  # Tokens after the first [SEP]

            # Create attention masks
            question_mask = mask_row[:sep_index]
            answer_mask = mask_row[sep_index + 1:]

            # Append to lists (pad to maintain max_length)
            questions.append(question_tokens)
            answers.append(answer_tokens)
            questions_mask.append(question_mask)
            answers_mask.append(answer_mask)

        # Pad sequences to max_length
        questions = torch.nn.utils.rnn.pad_sequence(questions, batch_first=True, padding_value=0).to(device)
        answers = torch.nn.utils.rnn.pad_sequence(answers, batch_first=True, padding_value=0).to(device)
        questions_mask = torch.nn.utils.rnn.pad_sequence(questions_mask, batch_first=True, padding_value=0).to(device)
        answers_mask = torch.nn.utils.rnn.pad_sequence(answers_mask, batch_first=True, padding_value=0).to(device)
        #print("questions", questions)
        #print("answers", answers)
        #print("questions_mask", questions_mask)
        #print("answers_mask", answers_mask)
        
        # Process with the text model
        with torch.no_grad():
            questions_embeddings, answers_embeddings = model_txt(
                questions=questions,
                answers=answers,
                mask_questions=questions_mask,
                mask_answers=answers_mask
            )
            video_embeddings = frames_embeddings(video_path)  # Process video embeddings
        #print("text_embeddings", text_embeddings.shape)
        #print("video_embeddings", video_embeddings.shape)
        #print("label", labels)
        # Add to dataset
        matchy_dataset.append({
            "questions_embeddings" : questions_embeddings.detach().cpu(), 
            "answers_embeddings" : answers_embeddings.detach().cpu(),
            "video_embeddings": torch.tensor(video_embeddings, dtype=torch.float).detach().cpu(),
            "label": labels.detach().cpu()
        })

        # Save matchy_dataset to a pickle file
        output = os.path.join("/kaggle/working/", output_file)
        with open(output, 'wb') as f:
            pickle.dump(matchy_dataset, f)

    return matchy_dataset


In [33]:
# Build the encoders and materialise the Matchy train / test sets.
device = "cpu"
if torch.cuda.is_available():
    device = "cuda"

# Text-side settings.
bert_checkpoint = "bert-base-uncased"
hidden_size, num_attention_heads = 768, 12
max_length = 128
options = ["a1", "a2", "a3", "a4", "a5"]
num_choices = len(options)

# CLIP model and processor from Hugging Face, then the BERT-based text encoder.
model_clip = CLIPModel.from_pretrained("openai/clip-vit-base-patch16").to(device)
processor = CLIPProcessor.from_pretrained("openai/clip-vit-base-patch16")
model_txt = EncTxtNew(bert_checkpoint, hidden_size, num_attention_heads).to(device)

# Each call also pickles its result under the given file name.
matchy_train = MatchyDataset(
    train_df, model_txt, model_clip, processor, tokenizer, options, max_length, device,
    output_file="train_df.pkl",
)
matchy_test = MatchyDataset(
    test_df, model_txt, model_clip, processor, tokenizer, options, max_length, device,
    output_file="test_df.pkl",
)
# A validation split, if added, needs its own file name (e.g. "val_df.pkl") so it
# does not overwrite the test pickle:
#matchy_val = MatchyDataset(val_df, model_txt, model_clip, processor, tokenizer, options, max_length, device, output_file = "val_df.pkl")

config.json:   0%|          | 0.00/570 [00:00<?, ?B/s]

model.safetensors:   0%|          | 0.00/440M [00:00<?, ?B/s]

Processing sample 0




Processing sample 1
Processing sample 2
Processing sample 3
Processing sample 4
Processing sample 5
Processing sample 6
Processing sample 7
Processing sample 8
Processing sample 9
Processing sample 10
Processing sample 11
Processing sample 12
Processing sample 13
Processing sample 14
Processing sample 15
Processing sample 16
Processing sample 17
Processing sample 18
Processing sample 19
Processing sample 20
Processing sample 21
Processing sample 22
Processing sample 23
Processing sample 24
Processing sample 25
Processing sample 26
Processing sample 27
Processing sample 28
Processing sample 29
Processing sample 30
Processing sample 31
Processing sample 32
Processing sample 33
Processing sample 34
Processing sample 35
Processing sample 36
Processing sample 37
Processing sample 38
Processing sample 39
Processing sample 40
Processing sample 41
Processing sample 42
Processing sample 43
Processing sample 44
Processing sample 45
Processing sample 46
Processing sample 47
Processing sample 48
P