# Prepare pkl cache file for the deep caption (dc) textual input

## import the deep captions .csv file and initialize the data structure

In [1]:
import re
import os
import _pickle as cPickle
import numpy as np
import pandas as pd
import torch
from pytorch_transformers.tokenization_bert import BertTokenizer

In [2]:
# Location of the deep-captions CSV that is loaded with pandas below.
deep_coptions_path = "/MediaEval/alto_titles_danny.csv"

In [3]:
# Load the caption table; later cells read its `caption` and `video` columns.
deep_coptions_df = pd.read_csv(deep_coptions_path)
deep_coptions_df.head()  # preview the first rows

Unnamed: 0.1,Unnamed: 0,caption,video
0,0,man playing guitar singing snow-and-string-lig...,1
1,1,bird sitting cage cage pan-to-cat-in-animal-sh...,2
2,2,man sitting bed smiling blonde-woman-is-massag...,3
3,3,man playing piano room roulette-table-spinning...,4
4,4,group people walking along beach snow-shoe-hik...,5


In [4]:
# Flatten the caption table into a list of plain dicts, one per row.
# Trailing whitespace is stripped from the caption and every hyphen is
# replaced by a space.
dc_entries = []
for row in deep_coptions_df.itertuples():
    dc_entries.append({
        'video_id': int(row.video),
        'caption': row.caption.rstrip().replace('-', ' '),
    })

In [5]:
# Ground-truth tables for the dev set (used as training data) and the test set.
train_df = pd.read_csv('/MediaEval/dev-set/ground-truth/ground-truth_dev-set.csv')
test_df = pd.read_csv('/MediaEval/test-set/ground-truth/ground-truth_test-set.csv')
test_df.head()  # preview the test-set table

Unnamed: 0,video,short-term_memorability,nb_short-term_annotations,long-term_memorability,nb_long-term_annotations
0,video1.webm,0.954,34,0.727,11
1,video1000.webm,0.918,33,0.889,18
2,video1002.webm,0.837,61,0.714,14
3,video1008.webm,0.98,67,0.789,19
4,video1010.webm,0.923,33,0.786,14


In [6]:
# Map every dev-set video id (the first run of digits in the file name) to
# the two values held in the row's `_2` and `_4` fields.
# NOTE(review): `_2`/`_4` are the positional names itertuples() assigns to
# columns whose headers are not valid identifiers; presumably the short-term
# and long-term memorability scores -- confirm against the dev-set CSV header.
score_dict = {}
for row in train_df.itertuples():
    video_number = int(re.findall(r'\d+', row.video)[0])
    score_dict[video_number] = [row._2, row._4]

In [7]:
# Map every test-set video id (the first run of digits in the file name) to
# the two values held in the row's `_2` and `_4` fields.
# NOTE(review): `_2`/`_4` are the positional names itertuples() assigns to
# columns whose headers are not valid identifiers; presumably the short-term
# and long-term memorability scores -- confirm against the test-set CSV header.
test_score_dict = {}
for row in test_df.itertuples():
    video_number = int(re.findall(r'\d+', row.video)[0])
    test_score_dict[video_number] = [row._2, row._4]

In [8]:
# Sanity check: show the first caption entry built from the CSV.
dc_entries[0]

{'video_id': 1,
 'caption': 'man playing guitar singing snow and string lights person playing video game'}

In [10]:
# Attach scores to every caption entry and sort it into the train or test
# list. An id that is absent from score_dict is looked up in
# test_score_dict; an id missing from both raises KeyError.
train_score_list, test_score_list = [], []

for entry in dc_entries:
    video_id = entry['video_id']
    if video_id not in score_dict:
        entry['scores'] = test_score_dict[video_id]
        test_score_list.append(entry)
        continue
    entry['scores'] = score_dict[video_id]
    train_score_list.append(entry)

In [11]:
# Number of entries that ended up in each split.
len(train_score_list), len(test_score_list)

(8000, 2000)

In [12]:
# Show the first entry of each split, now carrying a 'scores' field.
train_score_list[0], test_score_list[0]

({'video_id': 3,
  'caption': 'man sitting bed smiling blonde woman is massaged tilt down person bed',
  'scores': [0.924, 0.846]},
 {'video_id': 1,
  'caption': 'man playing guitar singing snow and string lights person playing video game',
  'scores': [0.9540000000000001, 0.727]})

## Add Tokens

In [13]:
def assert_eq(real, expected):
    """Assert that ``real`` equals ``expected``.

    Raises:
        AssertionError: when the values differ; the message shows both.
    """
    values_match = real == expected
    assert values_match, "%s (true) vs %s (expected)" % (real, expected)

In [14]:
# the same tokenize function from BERT

def tokenize(entries, tokenizer, max_length=16, padding_index=0):
    """Tokenize the caption of every entry and pad it to a fixed length.

    Each entry is updated in place with three lists of length ``max_length``:

    * ``c_token``: the encoded caption, cut to ``max_length - 2`` ids,
      wrapped with the tokenizer's single-sentence special tokens and
      right-padded with ``padding_index``.
    * ``c_input_mask``: 1 for every real token, 0 for every padding slot.
    * ``c_segment_ids``: all zeros (single-sentence input).

    Args:
        entries: iterable of dicts that each hold a ``"caption"`` string.
        tokenizer: object providing ``encode(text)`` and
            ``add_special_tokens_single_sentence(ids)``.
        max_length: length of every produced list.
        padding_index: id used to pad ``c_token`` only.

    Raises:
        AssertionError: if a token list is not ``max_length`` long after
            padding.
    """
    for entry in entries:
        token_ids = tokenizer.encode(entry["caption"])
        # Leave room for the special tokens added on the next line.
        token_ids = token_ids[: max_length - 2]
        token_ids = tokenizer.add_special_tokens_single_sentence(token_ids)

        segment_ids = [0] * len(token_ids)
        input_mask = [1] * len(token_ids)

        pad_len = max_length - len(token_ids)
        if pad_len > 0:
            # Padding goes after the sentence. Only the token ids use
            # padding_index; the mask and the segment ids are always padded
            # with 0 so a non-zero padding_index cannot corrupt them.
            token_ids = token_ids + [padding_index] * pad_len
            input_mask += [0] * pad_len
            segment_ids += [0] * pad_len

        assert len(token_ids) == max_length, "%s (true) vs %s (expected)" % (
            len(token_ids),
            max_length,
        )
        entry["c_token"] = token_ids
        entry["c_input_mask"] = input_mask
        entry["c_segment_ids"] = segment_ids

In [15]:
# Name of the pretrained checkpoint whose tokenizer is loaded, and the
# lower-casing flag handed to it.
bert_model = 'bert-base-uncased'
do_lower_case = True

# Tokenizer used by tokenize() to encode the captions.
tokenizer = BertTokenizer.from_pretrained(bert_model, do_lower_case=do_lower_case)

In [16]:
# Fixed sequence length shared by both splits (also used in the cache file names).
max_length = 23

# tokenize() writes its results into the entries, so nothing is captured here.
for split_entries in (train_score_list, test_score_list):
    tokenize(split_entries, tokenizer, max_length=max_length)

In [17]:
# Confirm that tokenization added the c_token / c_input_mask / c_segment_ids fields.
train_score_list[0].keys()

dict_keys(['video_id', 'caption', 'scores', 'c_token', 'c_input_mask', 'c_segment_ids'])

## Tensorize

In [18]:
# the same tensorize function from BERT

def tensorize(entries, split='trainval'):
    """Convert the list-valued fields of every entry to torch tensors in place.

    ``c_token``, ``c_input_mask`` and ``c_segment_ids`` are converted through
    a NumPy array with NumPy's default dtype. ``scores``, when present, is
    converted to a float32 tensor.

    Args:
        entries: iterable of dicts holding the three token fields.
        split: accepted for interface compatibility; not used here.
    """
    for entry in entries:
        for key in ("c_token", "c_input_mask", "c_segment_ids"):
            entry[key] = torch.from_numpy(np.array(entry[key]))

        if "scores" not in entry:
            continue
        score_array = np.array(entry["scores"], dtype=np.float32)
        entry["scores"] = torch.from_numpy(score_array)


In [19]:
# Split names and the root directory of the ME dataset.
train_split = 'trainval'
val_split = 'minval'
test_split = 'test'
dataroot = 'datasets/ME'

# One cache file per split:
#   <dataroot>/cache/ME_<split>_<max_length>_cleaned.pkl
train_cache_path, val_cache_path, test_cache_path = (
    os.path.join(dataroot, 'cache', 'ME_{}_{}_cleaned.pkl'.format(name, max_length))
    for name in (train_split, val_split, test_split)
)

In [20]:
# Convert the token and score fields of both splits to tensors (in place).
tensorize(train_score_list, split='trainval')
tensorize(test_score_list, split='test')

In [21]:
# Inspect one fully prepared test entry.
test_score_list[0]

{'video_id': 1,
 'caption': 'man playing guitar singing snow and string lights person playing video game',
 'scores': tensor([0.9540, 0.7270]),
 'c_token': tensor([ 101, 2158, 2652, 2858, 4823, 4586, 1998, 5164, 4597, 2711, 2652, 2678,
         2208,  102,    0,    0,    0,    0,    0,    0,    0,    0,    0]),
 'c_input_mask': tensor([1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0]),
 'c_segment_ids': tensor([0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0])}

## save on disk

In [22]:
# Show where the test-split cache will be written.
test_cache_path

'datasets/ME/cache/ME_test_23_cleaned.pkl'

In [23]:
# This only needs to run once: it writes the prepared entries to disk.
# Each file is opened in a `with` block so it is flushed and closed as soon
# as the dump finishes, instead of leaving an anonymous file object open.
with open(train_cache_path, 'wb') as train_cache_file:
    cPickle.dump(train_score_list, train_cache_file)
# The validation split (val_cache_path) is not written here.
with open(test_cache_path, 'wb') as test_cache_file:
    cPickle.dump(test_score_list, test_cache_file)

In [24]:
# Read the test cache back to verify the round trip. The `with` block closes
# the file handle once loading is done.
# NOTE: only unpickle files you produced yourself; pickle can execute
# arbitrary code when loading untrusted data.
with open(test_cache_path, 'rb') as test_cache_file:
    reloaded_test_entries = cPickle.load(test_cache_file)
reloaded_test_entries

[{'video_id': 1,
  'caption': 'man playing guitar singing snow and string lights person playing video game',
  'scores': tensor([0.9540, 0.7270]),
  'c_token': tensor([ 101, 2158, 2652, 2858, 4823, 4586, 1998, 5164, 4597, 2711, 2652, 2678,
          2208,  102,    0,    0,    0,    0,    0,    0,    0,    0,    0]),
  'c_input_mask': tensor([1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0]),
  'c_segment_ids': tensor([0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0])},
 {'video_id': 2,
  'caption': 'bird sitting cage cage pan to cat in animal shelter cage man talking camera',
  'scores': tensor([0.8930, 0.5560]),
  'c_token': tensor([ 101, 4743, 3564, 7980, 7980, 6090, 2000, 4937, 1999, 4111, 7713, 7980,
          2158, 3331, 4950,  102,    0,    0,    0,    0,    0,    0,    0]),
  'c_input_mask': tensor([1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 0, 0, 0, 0, 0, 0, 0]),
  'c_segment_ids': tensor([0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,

# Extract 30 random frames from each video

In [1]:
import cv2
import numpy as np
import matplotlib.pyplot as plt
import os
from tqdm import tqdm
import random
import re
import numpy as np

In [2]:
# Directories that are scanned for the source .webm videos.
train_video_dir = '/aloui/MediaEval/dev-set/sources/'
test_video_dir = '/aloui/MediaEval/test-set/sources/'

# Directories the sampled frames are written to as .jpg files.
train_image_dir = 'datasets/ME/images/dc/train/'
test_image_dir = 'datasets/ME/images/dc/test/'

In [4]:
np.random.seed(42)

train_im_dict = dict()

# Make sure the output directory exists; cv2.imwrite() does not create it.
os.makedirs(train_image_dir, exist_ok=True)

# Sort the listing so that, together with the fixed seed, every video gets
# the same random frame positions no matter what order os.listdir() returns.
for filename in tqdm(sorted(os.listdir(train_video_dir))):
    if not filename.endswith(".webm"):
        continue
    # The video id is the first run of digits in the file name.
    vid_id = int(re.findall(r'\d+', filename)[0])
    video_path = os.path.join(train_video_dir, filename)
    cap = cv2.VideoCapture(video_path)
    try:
        # 30 random (fractional) positions in [0, frame_count).
        frameIds = cap.get(cv2.CAP_PROP_FRAME_COUNT) * np.random.uniform(size=30)
        for i, fid in enumerate(frameIds):
            cap.set(cv2.CAP_PROP_POS_FRAMES, fid)
            ret, frame = cap.read()
            if not ret:
                # A failed read yields no image; report it rather than
                # handing an empty frame to cv2.imwrite().
                tqdm.write("could not read frame %s of %s" % (fid, video_path))
                continue
            image_name = str(vid_id) + '_' + str(i) + '.jpg'
            cv2.imwrite(os.path.join(train_image_dir, image_name), frame)
    finally:
        # Free the decoder and file handle before moving to the next video.
        cap.release()

100%|██████████| 8000/8000 [7:50:25<00:00,  3.94s/it]  


In [5]:
np.random.seed(42)

test_im_dict = dict()

# Make sure the output directory exists; cv2.imwrite() does not create it.
os.makedirs(test_image_dir, exist_ok=True)

# Sort the listing so that, together with the fixed seed, every video gets
# the same random frame positions no matter what order os.listdir() returns.
for filename in tqdm(sorted(os.listdir(test_video_dir))):
    if not filename.endswith(".webm"):
        continue
    # The video id is the first run of digits in the file name.
    vid_id = int(re.findall(r'\d+', filename)[0])
    video_path = os.path.join(test_video_dir, filename)
    cap = cv2.VideoCapture(video_path)
    try:
        # 30 random (fractional) positions in [0, frame_count).
        frameIds = cap.get(cv2.CAP_PROP_FRAME_COUNT) * np.random.uniform(size=30)
        for i, fid in enumerate(frameIds):
            cap.set(cv2.CAP_PROP_POS_FRAMES, fid)
            ret, frame = cap.read()
            if not ret:
                # A failed read yields no image; report it rather than
                # handing an empty frame to cv2.imwrite().
                tqdm.write("could not read frame %s of %s" % (fid, video_path))
                continue
            image_name = str(vid_id) + '_' + str(i) + '.jpg'
            cv2.imwrite(os.path.join(test_image_dir, image_name), frame)
    finally:
        # Free the decoder and file handle before moving to the next video.
        cap.release()

100%|██████████| 2000/2000 [1:56:47<00:00,  4.01s/it]  
