In [2]:
import os
import json
import pandas as pd
import pickle
from transformers import BertTokenizer, BertModel
import numpy as np
from tqdm import trange, tqdm
import random

In [3]:
path = "kmeans/k30_c30_seed_7.pkl"
with open(path, 'rb') as f:
    kmeans_dict = pickle.load(f)
kmeans_dict

{'video946': [14, 12, 25],
 'video6295': [25, 14, 3],
 'video8565': [6, 13, 0],
 'video7555': [15, 23, 0],
 'video3809': [8, 1, 1, 0],
 'video181': [28, 25, 0],
 'video1080': [21, 7, 0],
 'video2414': [21, 14, 0],
 'video4159': [14, 29, 0],
 'video354': [8, 25, 0],
 'video8764': [8, 8, 0],
 'video1572': [28, 10, 0],
 'video1574': [27, 14, 0],
 'video2706': [18, 19, 12, 0],
 'video2626': [13, 8, 0],
 'video4865': [1, 13, 0],
 'video4745': [28, 15, 0],
 'video7719': [6, 13, 1],
 'video5771': [2, 23, 0],
 'video2219': [23, 14, 19],
 'video6153': [4, 14, 1, 0],
 'video970': [9, 21, 0],
 'video8070': [9, 7, 0],
 'video6118': [24, 2, 0],
 'video2578': [4, 13, 26],
 'video9633': [1, 26, 0],
 'video5614': [28, 17, 0],
 'video5374': [18, 24, 0],
 'video5681': [25, 4, 0],
 'video2063': [22, 16, 0],
 'video2288': [29, 18, 0],
 'video5728': [10, 29, 25],
 'video3423': [23, 12, 0],
 'video5716': [17, 24, 0],
 'video334': [8, 4, 25],
 'video7582': [17, 27, 0],
 'video4395': [18, 3, 0],
 'video1345':

In [4]:
new_kmeans_dict = {}
for old_docid in kmeans_dict.keys():
    new_kmeans_dict[str(old_docid)] = '-'.join(str(elem) for elem in kmeans_dict[old_docid])
new_kmeans_dict

{'video946': '14-12-25',
 'video6295': '25-14-3',
 'video8565': '6-13-0',
 'video7555': '15-23-0',
 'video3809': '8-1-1-0',
 'video181': '28-25-0',
 'video1080': '21-7-0',
 'video2414': '21-14-0',
 'video4159': '14-29-0',
 'video354': '8-25-0',
 'video8764': '8-8-0',
 'video1572': '28-10-0',
 'video1574': '27-14-0',
 'video2706': '18-19-12-0',
 'video2626': '13-8-0',
 'video4865': '1-13-0',
 'video4745': '28-15-0',
 'video7719': '6-13-1',
 'video5771': '2-23-0',
 'video2219': '23-14-19',
 'video6153': '4-14-1-0',
 'video970': '9-21-0',
 'video8070': '9-7-0',
 'video6118': '24-2-0',
 'video2578': '4-13-26',
 'video9633': '1-26-0',
 'video5614': '28-17-0',
 'video5374': '18-24-0',
 'video5681': '25-4-0',
 'video2063': '22-16-0',
 'video2288': '29-18-0',
 'video5728': '10-29-25',
 'video3423': '23-12-0',
 'video5716': '17-24-0',
 'video334': '8-4-25',
 'video7582': '17-27-0',
 'video4395': '18-3-0',
 'video1345': '13-3-0',
 'video200': '1-12-0',
 'video8647': '12-19-0',
 'video165': '15-1

In [5]:
len(kmeans_dict), len(new_kmeans_dict)

(10000, 10000)

## Generate Training Data

In [7]:
with open("../data/MSRVTT/train_list_jsfusion.txt", "r") as f:
    train_list_id = f.readlines()
for i, item in enumerate(train_list_id):
    if '\n' in item:
        item = item.replace('\n', '')
    train_list_id[i] = item
train_list_id[0:5]

['video0', 'video1', 'video2', 'video3', 'video4']

In [8]:
with open("../data/MSRVTT/annotation/MSR_VTT.json", "r") as f:
    msr_vtt = json.load(f)
print(msr_vtt.keys())
print(len(msr_vtt['images'])) # 1000个视频的id
print(len(msr_vtt['annotations'])) # 199994 每个视频对应的20分描述
print(len(msr_vtt['annotations'])) # list 199994
print(msr_vtt['annotations'][0].keys()) # ['caption', 'id', 'image_id']
print(msr_vtt['annotations'][0])
annotations = msr_vtt['annotations']

dict_keys(['info', 'images', 'licenses', 'type', 'annotations'])
10000
199994
199994
dict_keys(['caption', 'id', 'image_id'])
{'caption': 'a cartoon animals runs through an ice cave in a video game', 'id': 0, 'image_id': 'video2960'}


In [11]:
file_train = open("train.tsv", 'w')

for videoid in tqdm(train_list_id):
    kmeans_id = new_kmeans_dict[videoid]
    for item in annotations:
        if item['image_id'] == videoid:
            caption = item['caption']
            file_train.write('\t'.join([caption, videoid, kmeans_id]) + '\n')
            file_train.flush()

100%|██████████| 9000/9000 [04:09<00:00, 36.02it/s]


## Generate Val Dataset

In [12]:
with open("/root/autodl-nas/generateSearch/data/MSRVTT/val_list_jsfusion.txt", "r") as f:
    val_list_id = f.readlines()
for i, item in enumerate(val_list_id):
    if '\n' in item:
        item = item.replace('\n', '')
    val_list_id[i] = item
len(val_list_id)

1000

In [21]:
with open("/root/autodl-nas/generateSearch/data/MSRVTT/raw-captions.pkl", "rb") as f:
    raw_captions = pickle.load(f)
with open("/root/autodl-nas/generateSearch/data/MSRVTT/structured-symlinks/jsfusion_val_caption_idx.pkl", "rb") as f:
    jsfusion_val_caption_idx = pickle.load(f)

In [34]:
len(jsfusion_val_caption_idx.keys())  # 包含1000个测试集数据
jsfusion_val_caption_idx['video7020'] # 每个视频显示对应哪个caption id
" ".join(raw_captions['video7020'][int(jsfusion_val_caption_idx['video7020'])]) # 对应的这个caption取出
new_kmeans_dict['video7020']

'17-6-9'

In [32]:
file_train = open("val.tsv", 'w')

for videoid in tqdm(val_list_id):
    kmeans_id = new_kmeans_dict[videoid]
    caption = " ".join(raw_captions[videoid][int(jsfusion_val_caption_idx[videoid])])
    file_train.write('\t'.join([caption, videoid, kmeans_id]) + '\n')
    file_train.flush()

100%|██████████| 1000/1000 [00:00<00:00, 49391.24it/s]


---

In [4]:
jsfusion_val_caption_idx

{'video9770': 11,
 'video9771': 7,
 'video7020': 2,
 'video9773': 0,
 'video7026': 6,
 'video9775': 4,
 'video9776': 16,
 'video7025': 3,
 'video9778': 4,
 'video9779': 4,
 'video7028': 6,
 'video7029': 4,
 'video9772': 19,
 'video7021': 10,
 'video9774': 11,
 'video7027': 19,
 'video9731': 1,
 'video7024': 13,
 'video9777': 10,
 'video8913': 3,
 'video8912': 1,
 'video8911': 6,
 'video8910': 12,
 'video8917': 13,
 'video8916': 6,
 'video8915': 15,
 'video8914': 12,
 'video8919': 4,
 'video8918': 0,
 'video9545': 3,
 'video7704': 8,
 'video7118': 15,
 'video7119': 15,
 'video7116': 1,
 'video7117': 14,
 'video7114': 0,
 'video7115': 11,
 'video7112': 12,
 'video7113': 16,
 'video7110': 1,
 'video9542': 11,
 'video9679': 18,
 'video8978': 17,
 'video8464': 2,
 'video7701': 14,
 'video7438': 19,
 'video8899': 14,
 'video8895': 6,
 'video7431': 13,
 'video9549': 11,
 'video8829': 14,
 'video8828': 7,
 'video8827': 13,
 'video8826': 16,
 'video8825': 5,
 'video9548': 9,
 'video8823': 3,
 '

# 生成数据集

In [2]:
import os
import json
import pandas as pd
import pickle
from transformers import BertTokenizer, BertModel
import numpy as np
from tqdm import trange, tqdm
import random

In [2]:
import pickle
path = "kmeans/k30_c30_seed_7_CLIPembedding.pkl"
with open(path, 'rb') as f:
    kmeans_dict = pickle.load(f)
kmeans_dict

{'video8674': [1, 16, 0],
 'video5883': [11, 1, 0, 0],
 'video3053': [26, 13, 0],
 'video8152': [11, 3, 2, 0],
 'video3740': [20, 10, 0],
 'video1529': [27, 22, 0],
 'video1077': [6, 17, 0],
 'video9599': [6, 21, 0],
 'video3537': [7, 21, 0],
 'video3738': [19, 4, 8],
 'video4255': [24, 12, 0],
 'video8145': [11, 4, 0],
 'video8106': [4, 8, 0],
 'video5169': [27, 27, 0],
 'video2154': [10, 23, 0],
 'video5717': [15, 5, 0],
 'video5927': [4, 4, 0],
 'video2009': [21, 10, 4, 0],
 'video5312': [0, 26, 0],
 'video4130': [13, 14, 0],
 'video7593': [7, 22, 9, 0],
 'video4075': [2, 23, 0],
 'video3474': [11, 25, 0],
 'video5731': [11, 21, 24],
 'video6902': [23, 21, 0],
 'video6438': [8, 8, 0],
 'video1805': [23, 5, 0],
 'video8783': [0, 16, 0],
 'video5161': [21, 22, 1, 0],
 'video3669': [22, 1, 0],
 'video4789': [27, 16, 25],
 'video8026': [28, 0, 0],
 'video4308': [26, 9, 0],
 'video554': [21, 12, 0],
 'video7485': [5, 4, 0],
 'video3818': [29, 14, 0],
 'video144': [23, 14, 1, 0],
 'video6

In [3]:
new_kmeans_dict = {}
for old_docid in kmeans_dict.keys():
    new_kmeans_dict[str(old_docid)] = '-'.join(str(elem) for elem in kmeans_dict[old_docid])
new_kmeans_dict

{'video8674': '1-16-0',
 'video5883': '11-1-0-0',
 'video3053': '26-13-0',
 'video8152': '11-3-2-0',
 'video3740': '20-10-0',
 'video1529': '27-22-0',
 'video1077': '6-17-0',
 'video9599': '6-21-0',
 'video3537': '7-21-0',
 'video3738': '19-4-8',
 'video4255': '24-12-0',
 'video8145': '11-4-0',
 'video8106': '4-8-0',
 'video5169': '27-27-0',
 'video2154': '10-23-0',
 'video5717': '15-5-0',
 'video5927': '4-4-0',
 'video2009': '21-10-4-0',
 'video5312': '0-26-0',
 'video4130': '13-14-0',
 'video7593': '7-22-9-0',
 'video4075': '2-23-0',
 'video3474': '11-25-0',
 'video5731': '11-21-24',
 'video6902': '23-21-0',
 'video6438': '8-8-0',
 'video1805': '23-5-0',
 'video8783': '0-16-0',
 'video5161': '21-22-1-0',
 'video3669': '22-1-0',
 'video4789': '27-16-25',
 'video8026': '28-0-0',
 'video4308': '26-9-0',
 'video554': '21-12-0',
 'video7485': '5-4-0',
 'video3818': '29-14-0',
 'video144': '23-14-1-0',
 'video6391': '9-11-0',
 'video3049': '6-3-0',
 'video6173': '6-11-0',
 'video1358': '16

In [4]:
with open("../data/MSRVTT/train_list_jsfusion.txt", "r") as f:
    train_list_id = f.readlines()
for i, item in enumerate(train_list_id):
    if '\n' in item:
        item = item.replace('\n', '')
    train_list_id[i] = item
train_list_id

['video0',
 'video1',
 'video2',
 'video3',
 'video4',
 'video5',
 'video6',
 'video7',
 'video8',
 'video9',
 'video10',
 'video11',
 'video12',
 'video13',
 'video14',
 'video15',
 'video16',
 'video17',
 'video18',
 'video19',
 'video20',
 'video21',
 'video22',
 'video23',
 'video24',
 'video25',
 'video26',
 'video27',
 'video28',
 'video29',
 'video30',
 'video31',
 'video32',
 'video33',
 'video34',
 'video35',
 'video36',
 'video37',
 'video38',
 'video39',
 'video40',
 'video41',
 'video42',
 'video43',
 'video44',
 'video45',
 'video46',
 'video47',
 'video48',
 'video49',
 'video50',
 'video51',
 'video52',
 'video53',
 'video54',
 'video55',
 'video56',
 'video57',
 'video58',
 'video59',
 'video60',
 'video61',
 'video62',
 'video63',
 'video64',
 'video65',
 'video66',
 'video67',
 'video68',
 'video69',
 'video70',
 'video71',
 'video72',
 'video73',
 'video74',
 'video75',
 'video76',
 'video77',
 'video78',
 'video79',
 'video80',
 'video81',
 'video82',
 'video83',
 '

In [5]:
with open("../data/MSRVTT/annotation/MSR_VTT.json", "r") as f:
    msr_vtt = json.load(f)
print(msr_vtt.keys())
print(len(msr_vtt['images'])) # 1000个视频的id
print(len(msr_vtt['annotations'])) # 199994 每个视频对应的20分描述
print(len(msr_vtt['annotations'])) # list 199994
print(msr_vtt['annotations'][0].keys()) # ['caption', 'id', 'image_id']
print(msr_vtt['annotations'][0])
annotations = msr_vtt['annotations']

dict_keys(['info', 'images', 'licenses', 'type', 'annotations'])
10000
199994
199994
dict_keys(['caption', 'id', 'image_id'])
{'caption': 'a cartoon animals runs through an ice cave in a video game', 'id': 0, 'image_id': 'video2960'}


In [6]:
with open("../data/MSRVTT/val_list_jsfusion.txt", "r") as f:
    val_list_id = f.readlines()
for i, item in enumerate(val_list_id):
    if '\n' in item:
        item = item.replace('\n', '')
    val_list_id[i] = item
len(val_list_id)

1000

In [3]:
with open("../data/MSRVTT/raw-captions.pkl", "rb") as f:
    raw_captions = pickle.load(f)
with open("../data/MSRVTT/structured-symlinks/jsfusion_val_caption_idx.pkl", "rb") as f:
    jsfusion_val_caption_idx = pickle.load(f)

In [15]:
with open("/root/autodl-nas/git2/GenerativeImage2Text/videocaptioning/results_20.json", 'r') as f:
    qg = json.load(f)
len(qg)
lll = list(qg.keys())[0:5]
print(lll)
lll[1][0: lll[1].rfind('.')]

['video9674.mp4', 'video8216.mp4', 'video9867.mp4', 'video9808.mp4', 'video2921.mp4']


'video8216'

In [19]:
"""
训练集
"""

count_train_anno = 0
count_val_anno = 0

file_train = open("generateQuery/train.tsv", 'w')
for videoid in tqdm(new_kmeans_dict.keys()):
    if videoid in train_list_id:
        flg = True
    else:
        flg = False
    kmeans_id = new_kmeans_dict[videoid]
    for item in annotations:
        if flg:
            if item['image_id'] == videoid:
                caption = item['caption']
                file_train.write('\t'.join([caption, videoid, kmeans_id]) + '\n')
                file_train.flush()
                count_train_anno += 1
        else: # 在测试集中
            if item['image_id'] == videoid:
                caption = item['caption']
                # 当前caption与测试caption不符 则加入到测试数据
                if caption != " ".join(raw_captions[videoid][int(jsfusion_val_caption_idx[videoid])]):
                    file_train.write('\t'.join([caption, videoid, kmeans_id]) + '\n')
                    file_train.flush()
                    count_val_anno += 1
print(count_train_anno)
print(count_val_anno)
count_qg = 0
for videoid in tqdm(qg.keys()):
    videoid_k = videoid[0: videoid.rfind('.')]
    kmeans_id = new_kmeans_dict[videoid_k]
    for caption in qg[videoid]:
        file_train.write('\t'.join([caption, videoid_k, kmeans_id]) + '\n')
        file_train.flush()
        count_qg += 1
print(count_qg)

100%|██████████| 10000/10000 [04:47<00:00, 34.83it/s]


179996
18897


100%|██████████| 10000/10000 [00:01<00:00, 6488.67it/s]

200000





In [18]:
new_kmeans_dict['video9674']

'3-8-11-2'

In [20]:
file_train = open("generateQuery/test.tsv", 'w')

for videoid in tqdm(val_list_id):
    kmeans_id = new_kmeans_dict[videoid]
    caption = " ".join(raw_captions[videoid][int(jsfusion_val_caption_idx[videoid])])
    file_train.write('\t'.join([caption, videoid, kmeans_id]) + '\n')
    file_train.flush()

100%|██████████| 1000/1000 [00:00<00:00, 54124.24it/s]


In [2]:
from transformers import (
    AdamW,
    T5ForConditionalGeneration,
    T5Tokenizer,
    T5Config,
    get_linear_schedule_with_warmup
)
model = T5ForConditionalGeneration.from_pretrained("t5-base")

Downloading:   0%|          | 0.00/892M [00:00<?, ?B/s]

In [1]:
import pickle

with open("/root/autodl-tmp/generateSearch/MSRVTTdataset/k30_c30_C4C/test.tsv", 'r') as f:
    test = f.readlines()

In [4]:
dic = {}
for item in test:
    idseq = item.split('\t')[2]
    if '\n' in idseq:
        idseq.replace('\n','')
    if idseq not in dic.keys():
        dic[item.split('\t')[2]] = [item.split('\t')[1]]
    else:
        dic[item.split('\t')[2]].append(item.split('\t')[1])

In [9]:
count = 0
for k in dic:
    if len(dic[k]) >= 2:
        count+=1

In [10]:
count

107

In [1]:
from transformers import (
    AdamW,
    T5ForConditionalGeneration,
    T5Tokenizer,
    T5Config,
    get_linear_schedule_with_warmup
)
model = T5ForConditionalGeneration.from_pretrained("t5-large")

Downloading (…)lve/main/config.json:   0%|          | 0.00/1.21k [00:00<?, ?B/s]

Downloading model.safetensors:   0%|          | 0.00/2.95G [00:00<?, ?B/s]

Downloading (…)neration_config.json:   0%|          | 0.00/147 [00:00<?, ?B/s]

In [2]:
with open("/root/autodl-tmp/generateSearch/MSRVTTdataset/k30_c30_C4C/test.tsv", 'r') as f:
    test = f.readlines()
with open("/root/autodl-tmp/generateSearch/MSRVTTdataset/k30_c30_C4C/train.tsv", 'r') as f:
    train = f.readlines()

In [3]:
print(test[0])
print(train[0])

a woman creating a fondant baby and flower	video7020	24-24-13

a car is shown	video0	26-28-28



In [6]:
from tqdm import tqdm
res = []
for i in tqdm(range(len(test))):
    test_vid = test[i].split('\t')[1]
    seq = test[i].split('\t')[2]
    all_captions = []
    all_vids = []
    tt_flag = []
    for item in train:
        if seq == item.split('\t')[2]:
            # all_captions.append(f"{item.split('\t')[0]}-{item.split('\t')[1]}-train")
            caption = item.split('\t')[0]
            vid = item.split('\t')[1]
            all_captions.append(f"{caption} \t {vid} \t train")
            #all_vids.append(item.split('\t')[1])
            #tt_flag.append("train_set")
    for item in test:
        if seq == item.split('\t')[2] and test_vid != item.split('\t')[1]:
            caption = item.split('\t')[0]
            vid = item.split('\t')[1]
            all_captions.append(f"{caption} \t {vid} \t test")
            # all_captions.append(item.split('\t')[0])
            # all_vids.append(item.split('\t')[1])
            # tt_flag.append("test_set")
    if len(all_captions) > 0:
        res.append([test[i].split('\t')[1], test[i].split('\t')[0], all_captions])

import json
with open("t2t.json", 'w') as f:
    json.dump(res, f)

100%|██████████| 1000/1000 [00:50<00:00, 19.79it/s]


In [5]:
len(res)

1000

In [6]:
res[0]

['video7020',
 'a woman creating a fondant baby and flower',
 ['there is a woman is making a dish',
  'a woman recreates a cartoon character with fondant',
  'a woman kneeds dough in an arts and crafts project',
  'a doll is being made using a flexible material',
  'a clay is on the screen and cream added',
  'a cake chef demonstrates a technique to decorate a cake',
  'spongebob square pants making tutorial showing how to make the fondant quick drying for spongebob s body',
  'showing a making of clay',
  'a little bit of powder is sprinkled onto a yellow dough and then worked in',
  'a woman is crafting a figure from yellow cake fondant',
  'a lady creating spongebob cake  out of yellow fondant',
  'a woman making some model with the clay',
  'a  girl and a bowl  spoon mixing dish inside kitchen to prepare to serve to eat displaying on screen',
  'a woman talks about her delicious looking dessert',
  'synthetic modelling clay can be molded to any shape or form as required',
  'human 

In [4]:
import json
import pickle

In [5]:


with open("../data/MSRVTT/train_list_jsfusion.txt", "r") as f:
    train_list_id = f.readlines()
for i, item in enumerate(train_list_id):
    if '\n' in item:
        item = item.replace('\n', '')
    train_list_id[i] = item
    
with open("../data/MSRVTT/annotation/MSR_VTT.json", "r") as f:
    msr_vtt = json.load(f)
annotations = msr_vtt['annotations']

with open("../data/MSRVTT/val_list_jsfusion.txt", "r") as f:
    val_list_id = f.readlines()
for i, item in enumerate(val_list_id):
    if '\n' in item:
        item = item.replace('\n', '')
    val_list_id[i] = item
    
with open("../data/MSRVTT/raw-captions.pkl", "rb") as f:
    raw_captions = pickle.load(f)
with open("../data/MSRVTT/structured-symlinks/jsfusion_val_caption_idx.pkl", "rb") as f:
    jsfusion_val_caption_idx = pickle.load(f)

with open("/root/autodl-tmp/mPLUG-Owl/new_v2c.json", 'r') as f:
    v2c = json.load(f)

In [7]:
videoids = [f"video{i}" for i in range(0, 10000)]
len(videoids), videoids[-1]

(10000, 'video9999')

In [8]:
from tqdm import tqdm

In [9]:
video2caption_msrvtt = {}
for videoid in tqdm(videoids):
    video2caption_msrvtt[videoid] = []
    if videoid in train_list_id:
        flg = True
    else:
        flg = False
    for item in annotations:
        if flg:
            if item['image_id'] == videoid:
                caption = item['caption']
                video2caption_msrvtt[videoid].append([caption, True])
        else: # 在测试集中
            if item['image_id'] == videoid:
                caption = item['caption']
                # 当前caption与测试caption不符 则加入到测试数据
                if caption != " ".join(raw_captions[videoid][int(jsfusion_val_caption_idx[videoid])]):
                    video2caption_msrvtt[videoid].append([caption, False])

100%|██████████| 10000/10000 [04:22<00:00, 38.04it/s]


In [10]:
with open("msrvtt_video2caption.json", 'w') as f:
    json.dump(video2caption_msrvtt, f)

# MSVD DG

In [22]:
import pickle
import os
import glob

video_path = "/root/autodl-tmp/generateSearch/data/MSVD/YouTubeClips"
video_files = glob.glob(os.path.join(video_path, "*.avi"))
with open("/root/autodl-tmp/generateSearch/data/MSVD/msvd_data/raw-captions.pkl", 'rb') as f:
    raw_captions = pickle.load(f) 

In [23]:
raw_captions.keys()

dict_keys(['-4wsuPCjDBc_5_15', '-7KMZQEsJW4_205_208', '-8y1Q0rA3n8_108_115', '-8y1Q0rA3n8_95_102', '-9CUm-2cui8_39_44', '-AwoiGR6c8M_10_14', '-Cv5LsqKUXc_17_25', '-Cv5LsqKUXc_71_76', '-DKuLXYoY3g_14_20', '-DRy7rBg0IQ_31_37', '-FugkxLmGO4_5_16', '-Ms9tsWmhyU_80_95', '-YI0cxuNcq8_262_272', '-_aaMGK6GGw_57_61', '-_hbPLsZvvo_172_179', '-_hbPLsZvvo_18_25', '-_hbPLsZvvo_19_25', '-_hbPLsZvvo_19_26', '-_hbPLsZvvo_211_219', '-_hbPLsZvvo_269_275', '-_hbPLsZvvo_288_305', '-_hbPLsZvvo_323_328', '-_hbPLsZvvo_43_55', '-_hbPLsZvvo_49_55', '-_hbPLsZvvo_5_8', '-bjOB4zS0uE_100_105', '-dm-ds5rRaM_44_52', '-joBOHlg5J0_72_80', '-mAoVOhKy0c_4_9', '-pUwIypksfE_13_23', '-rkErLY0rRc_26_35', '-rkErLY0rRc_63_67', '-rvjK0lE3z4_33_43', '-s4-6QTT7HE_235_241', '-t-ZWaJeH-o_0_15', '-uT_1VDvXok_8_15', '-vKO3uSG6Do_3_14', '-vg3vR86fu0_1_6', '-wa0umYJVGg_100_115', '-wa0umYJVGg_117_123', '-wa0umYJVGg_139_157', '-wa0umYJVGg_168_176', '-wa0umYJVGg_23_41', '-wa0umYJVGg_271_276', '-wa0umYJVGg_286_290', '-zOrV-5vh1A_69_76', '

In [24]:
raw_captions['-4wsuPCjDBc_5_15']

[['a', 'squirrel', 'is', 'eating', 'a', 'peanut', 'in', 'it', 's', 'shell'],
 ['a', 'chipmunk', 'is', 'eating'],
 ['a', 'chipmunk', 'is', 'eating', 'a', 'peanut'],
 ['a', 'chipmunk', 'is', 'eating', 'a', 'nut'],
 ['a', 'squirrel', 'is', 'eating', 'a', 'nut'],
 ['a', 'squirrel', 'is', 'eating', 'a', 'whole', 'peanut'],
 ['a', 'squirrel', 'is', 'eating', 'a', 'peanut'],
 ['a', 'squirrel', 'l', 'is', 'eating', 'nuts'],
 ['a', 'small', 'squirrel', 'is', 'eating', 'a', 'peanut'],
 ['a', 'small', 'animal', 'is', 'chewing', 'on', 'a', 'nut'],
 ['an', 'ardilla', 'is', 'eating'],
 ['a', 'squirrel', 'is', 'eating', 'a', 'nut'],
 ['a', 'chipmunk', 'is', 'eating', 'some', 'food'],
 ['the', 'squirrel', 'is', 'eating'],
 ['a', 'ferret', 'eats', 'a', 'leaf'],
 ['a', 'hamster', 'is', 'eating', 'a', 'peanut'],
 ['a', 'ardilla', 'coreana', 'eating'],
 ['the', 'rabbit', 'is', 'eating'],
 ['a', 'chipmunk', 'is', 'eating', 'a', 'peanut'],
 ['a', 'rabbit', 'is', 'eating'],
 ['a', 'rabbit', 'is', 'eating'],


In [4]:
for item in video_files:
    print(item)
    break

/root/autodl-tmp/generateSearch/data/MSVD/YouTubeClips/-4wsuPCjDBc_5_15.avi


In [2]:
type(raw_captions)

dict

In [3]:
raw_captions.keys()

dict_keys(['-4wsuPCjDBc_5_15', '-7KMZQEsJW4_205_208', '-8y1Q0rA3n8_108_115', '-8y1Q0rA3n8_95_102', '-9CUm-2cui8_39_44', '-AwoiGR6c8M_10_14', '-Cv5LsqKUXc_17_25', '-Cv5LsqKUXc_71_76', '-DKuLXYoY3g_14_20', '-DRy7rBg0IQ_31_37', '-FugkxLmGO4_5_16', '-Ms9tsWmhyU_80_95', '-YI0cxuNcq8_262_272', '-_aaMGK6GGw_57_61', '-_hbPLsZvvo_172_179', '-_hbPLsZvvo_18_25', '-_hbPLsZvvo_19_25', '-_hbPLsZvvo_19_26', '-_hbPLsZvvo_211_219', '-_hbPLsZvvo_269_275', '-_hbPLsZvvo_288_305', '-_hbPLsZvvo_323_328', '-_hbPLsZvvo_43_55', '-_hbPLsZvvo_49_55', '-_hbPLsZvvo_5_8', '-bjOB4zS0uE_100_105', '-dm-ds5rRaM_44_52', '-joBOHlg5J0_72_80', '-mAoVOhKy0c_4_9', '-pUwIypksfE_13_23', '-rkErLY0rRc_26_35', '-rkErLY0rRc_63_67', '-rvjK0lE3z4_33_43', '-s4-6QTT7HE_235_241', '-t-ZWaJeH-o_0_15', '-uT_1VDvXok_8_15', '-vKO3uSG6Do_3_14', '-vg3vR86fu0_1_6', '-wa0umYJVGg_100_115', '-wa0umYJVGg_117_123', '-wa0umYJVGg_139_157', '-wa0umYJVGg_168_176', '-wa0umYJVGg_23_41', '-wa0umYJVGg_271_276', '-wa0umYJVGg_286_290', '-zOrV-5vh1A_69_76', '

In [5]:
key_error = []
video_ids = [item.split('/')[-1].split('.')[0] for item in video_files]

In [6]:
video_ids[0:5]

['-4wsuPCjDBc_5_15',
 '-7KMZQEsJW4_205_208',
 '-8y1Q0rA3n8_108_115',
 '-8y1Q0rA3n8_95_102',
 '-9CUm-2cui8_39_44']

In [7]:
for k in raw_captions:
    if k not in video_ids:
        key_error.append(k)

In [8]:
len(key_error)

0

In [9]:
len(video_ids), len(raw_captions)

(1970, 1970)

In [10]:
with open("/root/autodl-tmp/CLIP4Clip/res/MSVD/embeddings_all_videos.pkl", 'rb') as f:
    embeddings = pickle.load(f)

In [12]:
len(embeddings)

1970

In [14]:
len(raw_captions['-9CUm-2cui8_39_44'])

28

In [15]:
raw_captions['-9CUm-2cui8_39_44'][0]

['a',
 'woman',
 'puts',
 'four',
 'okra',
 'in',
 'a',
 'pan',
 'of',
 'boiling',
 'water']

[kmeans_id], [captions]

In [16]:
import numpy as np
def cosine_similarity(vec1, vec2):
    vec1 = vec1.squeeze()
    vec2 = vec2.squeeze()

    # 计算两个向量的余弦相似度
    dot_product = np.dot(vec1, vec2)
    norm1 = np.linalg.norm(vec1)
    norm2 = np.linalg.norm(vec2)
    similarity = dot_product / (norm1 * norm2)
    return similarity

In [17]:
with open("/root/autodl-tmp/CLIP4Clip/res/embeddings_all_videos.pkl",'rb') as f:
    embeddings_msrvtt = pickle.load(f) # 10000 个视频的特征 (1,512)

In [18]:
with open("/root/autodl-tmp/generateSearch/MSRVTTdataset/kmeans/k10_c10_seed_34_C4C.pkl", 'rb') as f:
    mapping = pickle.load(f)

In [29]:
mapping['video0'], type(mapping['video0']), type(mapping['video0'][0])

([9, 0, 4, 0, 0], list, numpy.int32)

In [20]:
embedding_msvd = embeddings["-4wsuPCjDBc_5_15"]
embedding_msvd.shape

(1, 512)

In [23]:
embeddings_msrvtt['1'].shape

(1, 512)

In [31]:
from tqdm import tqdm
import json
def find_most_similar_id(target_vector, id_to_vector):
    # target_vector = id_to_vector[target_id]
    most_similar_id = None
    highest_similarity = -1  # 初始化为负数，表示相似度的最小值

    for id, vector in id_to_vector.items():
        similarity = cosine_similarity(target_vector, vector)
        if similarity > highest_similarity:
            highest_similarity = similarity
            most_similar_id = id

    return most_similar_id

class NpEncoder(json.JSONEncoder):
    def default(self, obj):
        if isinstance(obj, np.integer):
            return int(obj)
        if isinstance(obj, np.floating):
            return float(obj)
        if isinstance(obj, np.ndarray):
            return obj.tolist()
        return json.JSONEncoder.default(self, obj)
    
msvd_id2cap = []
for msvd_id in tqdm(embeddings):
    embedding_msvd = embeddings[msvd_id]
    captions = raw_captions[msvd_id]
    most_similar_id = find_most_similar_id(embedding_msvd, embeddings_msrvtt)
    tvid = f"video{most_similar_id}"
    tokenseq = mapping[tvid]
    for item in captions:
        cap = " ".join(item)
        msvd_id2cap.append([cap, tvid, tokenseq])

with open("msvd_id2cap.json", 'w') as f:
    json.dump(msvd_id2cap, f, cls=NpEncoder)

100%|██████████| 1970/1970 [02:57<00:00, 11.10it/s]


# DiDeMo DG

In [12]:
import pickle
import os
import glob
import json

# video_path = "/root/autodl-tmp/generateSearch/data/MSVD/YouTubeClips"
# video_files = glob.glob(os.path.join(video_path, "*.mp4"))
with open("/root/autodl-tmp/LocalizingMoments/vid2caps.json", 'r') as f:
    raw_captions = json.load(f)

In [14]:
raw_captions['10015567@N08_3655084291_d8b58466fa.mov'], len(raw_captions)

(['the lighter chick runs quickly left off frame.',
  'camera zooms out',
  'the white bird runs off camera.',
  'chick in the back races out of the frame',
  'white chicken leaves screen'],
 10642)

In [13]:
raw_captions['10015567@N08_3655084291_d8b58466fa.mov']

['the lighter chick runs quickly left off frame.',
 'camera zooms out',
 'the white bird runs off camera.',
 'chick in the back races out of the frame',
 'white chicken leaves screen']

In [3]:
import numpy as np
def cosine_similarity(vec1, vec2):
    vec1 = vec1.squeeze()
    vec2 = vec2.squeeze()

    # 计算两个向量的余弦相似度
    dot_product = np.dot(vec1, vec2)
    norm1 = np.linalg.norm(vec1)
    norm2 = np.linalg.norm(vec2)
    similarity = dot_product / (norm1 * norm2)
    return similarity

In [4]:
with open("/root/autodl-tmp/CLIP4Clip/res/embeddings_all_videos.pkl",'rb') as f:
    embeddings_msrvtt = pickle.load(f) # 10000 个视频的特征 (1,512)

In [5]:
num = 10
with open(f"/root/autodl-tmp/generateSearch/MSRVTTdataset/kmeans/k{10}_c{10}_seed_34_C4C.pkl", 'rb') as f:
    mapping = pickle.load(f)

In [6]:
mapping['video0'], type(mapping['video0']), type(mapping['video0'][0])

([9, 0, 4, 0, 0], list, numpy.int32)

In [17]:
with open("/root/autodl-tmp/CLIP4Clip/res/DiDeMo/embeddings_all_videos.pkl", 'rb') as f:
    embeddings = pickle.load(f)

In [13]:
list(embeddings.keys())[0:5], len(embeddings)

(['84609891@N00_3674552257_4550e56f45',
  '26390358@N02_4693487113_b32ef0d63c',
  '58659672@N03_6868709812_46df9c11b2',
  '44925192@N00_2599858575_ee7fb5901f',
  '24473807@N07_2898331297_fac5992dcf'],
 10450)

In [15]:
embeddings['84609891@N00_3674552257_4550e56f45'].shape

(1, 512)

In [19]:
vid2key = {}
for k in raw_captions:
    vid2key[k.split('.')[0]] = k
print(len(vid2key))


10464


In [22]:
from tqdm import tqdm
import json
def find_most_similar_id(target_vector, id_to_vector):
    # target_vector = id_to_vector[target_id]
    most_similar_id = None
    highest_similarity = -1  # 初始化为负数，表示相似度的最小值

    for id, vector in id_to_vector.items():
        similarity = cosine_similarity(target_vector, vector)
        if similarity > highest_similarity:
            highest_similarity = similarity
            most_similar_id = id

    return most_similar_id

class NpEncoder(json.JSONEncoder):
    def default(self, obj):
        if isinstance(obj, np.integer):
            return int(obj)
        if isinstance(obj, np.floating):
            return float(obj)
        if isinstance(obj, np.ndarray):
            return obj.tolist()
        return json.JSONEncoder.default(self, obj)
    
msvd_id2cap = []
for msvd_id in tqdm(embeddings):
    embedding_msvd = embeddings[msvd_id]
    captions = raw_captions[vid2key[msvd_id]]
    most_similar_id = find_most_similar_id(embedding_msvd, embeddings_msrvtt)
    tvid = f"video{most_similar_id}"
    tokenseq = mapping[tvid]
    for item in captions:
        # cap = " ".join(item)
        msvd_id2cap.append([item, tvid, tokenseq])

with open(f"didemo_id2cap_{num}.json", 'w') as f:
    json.dump(msvd_id2cap, f, cls=NpEncoder)

100%|██████████| 10450/10450 [15:40<00:00, 11.11it/s]


In [1]:
import json
with open("msrvtt_video2caption.json", 'r') as f:
    msrvtt_video2caption = json.load(f)

In [2]:
msrvtt_video2caption.keys()

dict_keys(['video0', 'video1', 'video2', 'video3', 'video4', 'video5', 'video6', 'video7', 'video8', 'video9', 'video10', 'video11', 'video12', 'video13', 'video14', 'video15', 'video16', 'video17', 'video18', 'video19', 'video20', 'video21', 'video22', 'video23', 'video24', 'video25', 'video26', 'video27', 'video28', 'video29', 'video30', 'video31', 'video32', 'video33', 'video34', 'video35', 'video36', 'video37', 'video38', 'video39', 'video40', 'video41', 'video42', 'video43', 'video44', 'video45', 'video46', 'video47', 'video48', 'video49', 'video50', 'video51', 'video52', 'video53', 'video54', 'video55', 'video56', 'video57', 'video58', 'video59', 'video60', 'video61', 'video62', 'video63', 'video64', 'video65', 'video66', 'video67', 'video68', 'video69', 'video70', 'video71', 'video72', 'video73', 'video74', 'video75', 'video76', 'video77', 'video78', 'video79', 'video80', 'video81', 'video82', 'video83', 'video84', 'video85', 'video86', 'video87', 'video88', 'video89', 'video90'

In [3]:
msrvtt_video2caption['video0']

[['a car is shown', True],
 ['a group is dancing', True],
 ['a man drives a vehicle through the countryside', True],
 ['a man drives down the road in an audi', True],
 ['a man driving a car', True],
 ['a man is driving a car', True],
 ['a man is driving down a road', True],
 ['a man is driving in a car as part of a commercial', True],
 ['a man is driving', True],
 ['a man riding the car speedly in a narrow road', True],
 ['a man showing the various features of a car', True],
 ['a man silently narrates his experience driving an audi', True],
 ['a person is driving his car around curves in the road', True],
 ['a person telling about a car', True],
 ['guy driving a car down the road', True],
 ['man talking about a car while driving', True],
 ['the man drives the car', True],
 ['the man driving the audi as smooth as possible', True],
 ['a man is driving', True],
 ['guy driving a car down the road', True]]

In [5]:
with open(f"didemo_id2cap_{10}.json", 'r') as f:
    msvd_id2cap = json.load(f)

In [7]:
msvd_id2cap[0]

['frame that shows only water and mountains in the background.',
 'video3225',
 [2, 7, 2, 9]]

In [9]:
import pickle
path = f"kmeans/k{30}_c{30}_seed_34_allvideo.pkl"
with open(path, 'rb') as f:
    kmeans_dict = pickle.load(f)
new_kmeans_dict = {}
for old_docid in kmeans_dict.keys():
    new_kmeans_dict[str(old_docid)] = '-'.join(str(elem) for elem in kmeans_dict[old_docid])


In [10]:
len(new_kmeans_dict)

22420

In [11]:
new_kmeans_dict['11640161@N03_3980522295_68d3dba2f4']

'8-29-17'

In [14]:
with open("/root/autodl-tmp/LocalizingMoments/vid2caps.json", 'r') as f:
    didemo = json.load(f)
didemo_video2caption = {}
for k in didemo:
    didemo_video2caption[k.split('.')[0]] = didemo[k]


In [15]:
didemo_video2caption.keys()

dict_keys(['54322086@N00_2408598493_274c77d26a', '99051133@N00_2502628368_d14bd317de', '67801451@N00_5358663022_243bd90fbc', '64379474@N00_4479342537_7b5a3d3f1d', '63122283@N06_9978694646_e72011157f', '38928586@N07_12110094074_85bfd8e4bb', '32005048@N06_5252757341_6749a9a2bd', '38438821@N05_3967168792_18cfde0fc5', '51371187@N00_9093287754_675a283d41', '55016393@N00_3335074996_39dd2e0278', '12289718@N00_4955434740_80ae47ef41', '34418903@N00_5271280795_b561e95d46', '76236359@N00_4283045069_40f7510c03', '20148960@N00_3951736137_8aee69f6f7', '39346508@N00_3097148199_ee152ea273', '64194819@N00_2481079905_c0df9212a9', '87719130@N00_4156236069_0c530f3d65', '10955425@N00_5507632479_b5b8f6cd0b', '8071066@N04_4240441600_2f984e817e', '92431035@N00_6990277885_8b3cdfaf50', '34418903@N00_6095169007_d2ed0735b8', '95239135@N00_3600500186_0e4cbe517f', '37007021@N00_2699696028_a88257e84d', '22837634@N02_3853299361_1570049e56', '19251296@N00_4478203456_d0f06bf1ba', '25958034@N03_4542467095_30fde9b376', '

In [16]:
didemo_video2caption['10015567@N08_3655084291_d8b58466fa']

['the lighter chick runs quickly left off frame.',
 'camera zooms out',
 'the white bird runs off camera.',
 'chick in the back races out of the frame',
 'white chicken leaves screen']

In [17]:
with open("msrvtt_video2caption.json", 'r') as f:
    msrvtt_video2caption = json.load(f)

In [20]:
msrvtt_video2caption['video90']

[['a little dog runs across the floor', True],
 ['a little white puppy runs around on a floor in a room and occasionally interacts with the cameraman s finger',
  True],
 ['a person is filming is puppy playing', True],
 ['a person is playing with a small dog', True],
 ['a puppy is playing', True],
 ['a puppy is running to and away from the camera that is laid on the floor',
  True],
 ['a puppy is running', True],
 ['a puppy runs plays and explores', True],
 ['a really cute little puppy running around the floor', True],
 ['a small puppy running toward a camera then running away', True],
 ['a small white dog is scampeing across the floor', True],
 ['a small white puppy is running around on the floor', True],
 ['a woman talking on talkshow', True],
 ['cheer leaders are entertaining audience', True],
 ['clip of dog on floor', True],
 ['dogs running around room', True],
 ['puppies are in a room running around and looking at the camera', True],
 ['puppy runs around excitedly', True],
 ['smal

In [21]:
for caption in didemo_video2caption["10015567@N08_3655084291_d8b58466fa"]:
    print(caption)
    break

the lighter chick runs quickly left off frame.


# WebVid QG

In [1]:
# 获取raw_captions:
import glob

files = glob.glob("/root/autodl-tmp/WebVid/embeddings/emb/*.txt")

In [3]:
import pickle
print(len(files))
with open("/root/autodl-tmp/WebVid/res/vid2emb_webvid.pkl", 'rb') as f:
    embeddings_webvid = pickle.load(f)
print(len(embeddings_webvid))

10719575
10719575


In [12]:
with open("/root/autodl-tmp/WebVid/res/vid2emb_msrvtt.pkl", 'rb') as f:
    embeddings_msrvtt = pickle.load(f)
print(len(embeddings_msrvtt))

10000


In [4]:
files[0]

'/root/autodl-tmp/WebVid/embeddings/emb/1012591205.txt'

In [9]:
from tqdm import tqdm

raw_captions = {}
for path in tqdm(files):
    vid = path.split("/")[-1].split(".")[0]
    with open(path, "r") as f:
        caption = f.readline()
        #assert len(caption) == 1
    raw_captions[vid] = caption

100%|██████████| 10719575/10719575 [17:01<00:00, 10496.93it/s]


In [13]:
print(len(raw_captions))
print(raw_captions['1066907437'])
print(embeddings_webvid['1066907437'].shape)
print(embeddings_msrvtt['video0'].shape)

10719575
Winter in the forest with fir trees on the background of sunset
(512,)
(512,)


In [10]:
import numpy as np
def cosine_similarity(vec1, vec2):
    # vec1 = vec1.squeeze()
    # vec2 = vec2.squeeze()

    # 计算两个向量的余弦相似度
    dot_product = np.dot(vec1, vec2)
    norm1 = np.linalg.norm(vec1)
    norm2 = np.linalg.norm(vec2)
    similarity = dot_product / (norm1 * norm2)
    return similarity


In [15]:
num = 30
with open(f"/root/autodl-tmp/generateSearch/MSRVTTdataset/kmeans/k{30}_c{30}_seed_34_webvid.pkl", 'rb') as f:
    mapping = pickle.load(f)

In [16]:
mapping['video0'], type(mapping['video0']), type(mapping['video0'][0])

([21, 10, 6], list, numpy.int32)

In [17]:
len(mapping)

10000

In [18]:
import json
with open("/root/autodl-tmp/WebVid/res/webvid_rawcaptions.json", "w") as f:
    json.dump(raw_captions, f)

In [20]:
from tqdm import tqdm
import json
def find_most_similar_id(target_vector, id_to_vector):
    # target_vector = id_to_vector[target_id]
    most_similar_id = None
    highest_similarity = -1  # 初始化为负数，表示相似度的最小值

    for id, vector in id_to_vector.items():
        similarity = cosine_similarity(target_vector, vector)
        if similarity > highest_similarity:
            highest_similarity = similarity
            most_similar_id = id

    return most_similar_id

class NpEncoder(json.JSONEncoder):
    def default(self, obj):
        if isinstance(obj, np.integer):
            return int(obj)
        if isinstance(obj, np.floating):
            return float(obj)
        if isinstance(obj, np.ndarray):
            return obj.tolist()
        return json.JSONEncoder.default(self, obj)
    
webvid_id2cap = []
for vid in tqdm(embeddings_webvid):
    embedding = embeddings_webvid[vid]
    caption = raw_captions[vid]
    most_similar_id = find_most_similar_id(embedding, embeddings_msrvtt)
    tokenseq = mapping[most_similar_id]
    webvid_id2cap.append([caption, most_similar_id, tokenseq])

with open(f"webvid_id2cap_{30}.json", 'w') as f:
    json.dump(webvid_id2cap, f, cls=NpEncoder)

  0%|          | 471/10719575 [01:36<609:49:34,  4.88it/s]


KeyboardInterrupt: 

In [4]:
class NpEncoder(json.JSONEncoder):
    def default(self, obj):
        if isinstance(obj, np.integer):
            return int(obj)
        if isinstance(obj, np.floating):
            return float(obj)
        if isinstance(obj, np.ndarray):
            return obj.tolist()
        return json.JSONEncoder.default(self, obj)

In [22]:
from sklearn.metrics.pairwise import cosine_similarity
webvid_vid = []
webvid_emb = []
msrvtt_vid = []
msrvtt_emb = []
for k in tqdm(embeddings_webvid):
    webvid_vid.append(k)
    webvid_emb.append(embeddings_webvid[k])
for k in tqdm(embeddings_msrvtt):
    msrvtt_vid.append(k)
    msrvtt_emb.append(embeddings_msrvtt[k])

#similarities = cosine_similarity(embeddings_webvid, embeddings_msrvtt)

100%|██████████| 10719575/10719575 [00:07<00:00, 1380403.91it/s]
100%|██████████| 10000/10000 [00:00<00:00, 1587909.44it/s]


In [23]:
webvid_emb = np.array(webvid_emb)
msrvtt_emb = np.array(msrvtt_emb)
similarities = cosine_similarity(webvid_emb, msrvtt_emb)

: 

In [3]:
from tqdm import tqdm
import json
import numpy as np
import pickle
from scipy.spatial import KDTree

with open("/root/autodl-tmp/WebVid/res/vid2emb_webvid.pkl", 'rb') as f:
    embeddings_webvid = pickle.load(f)

with open("/root/autodl-tmp/WebVid/res/vid2emb_msrvtt.pkl", 'rb') as f:
    embeddings_msrvtt = pickle.load(f)

with open("/root/autodl-tmp/WebVid/res/webvid_rawcaptions.json", 'r') as f:
    raw_captions = json.load(f)

num = 30
with open(f"/root/autodl-tmp/generateSearch/MSRVTTdataset/kmeans/k{30}_c{30}_seed_34_webvid.pkl", 'rb') as f:
    mapping = pickle.load(f)

webvid_vid = []
webvid_emb = []
msrvtt_vid = []
msrvtt_emb = []
for k in tqdm(embeddings_webvid):
    webvid_vid.append(k)
    webvid_emb.append(embeddings_webvid[k])
for k in tqdm(embeddings_msrvtt):
    msrvtt_vid.append(k)
    msrvtt_emb.append(embeddings_msrvtt[k])

# 构建 KD 树
kdtree = KDTree(msrvtt_emb)



100%|██████████| 10719575/10719575 [00:07<00:00, 1452833.63it/s]
100%|██████████| 10000/10000 [00:00<00:00, 1680949.02it/s]


In [8]:
webvid_id2cap = []
for vid in tqdm(embeddings_webvid):
    caption = raw_captions[vid]
    
    # 查找最近邻的向量索引
    nearest_neighbor_idx = kdtree.query(embeddings_webvid[vid], k=1)[1]
    msrvid = msrvtt_vid[nearest_neighbor_idx]
    tokenseq = mapping[msrvid]
    webvid_id2cap.append([caption, msrvid, tokenseq])
    if (len(webvid_id2cap) == 10):
        break

with open(f"webvid_id2cap_{30}.json", 'w') as f:
    json.dump(webvid_id2cap, f, cls=NpEncoder)

  0%|          | 9/10719575 [00:00<17:01:01, 174.98it/s]


In [11]:
from tqdm import tqdm
import json
def find_most_similar_id(target_vector, id_to_vector):
    # target_vector = id_to_vector[target_id]
    most_similar_id = None
    highest_similarity = -1  # 初始化为负数，表示相似度的最小值

    for id, vector in id_to_vector.items():
        similarity = cosine_similarity(target_vector, vector)
        if similarity > highest_similarity:
            highest_similarity = similarity
            most_similar_id = id

    return most_similar_id

class NpEncoder(json.JSONEncoder):
    def default(self, obj):
        if isinstance(obj, np.integer):
            return int(obj)
        if isinstance(obj, np.floating):
            return float(obj)
        if isinstance(obj, np.ndarray):
            return obj.tolist()
        return json.JSONEncoder.default(self, obj)
    
webvid_id2cap = []
for vid in tqdm(embeddings_webvid):
    embedding = embeddings_webvid[vid]
    caption = raw_captions[vid]
    most_similar_id = find_most_similar_id(embedding, embeddings_msrvtt)
    tokenseq = mapping[most_similar_id]
    webvid_id2cap.append([caption, most_similar_id, tokenseq])

    if (len(webvid_id2cap) == 10):
        break

with open(f"webvid_id2cap_{30}.json", 'w') as f:
    json.dump(webvid_id2cap, f, cls=NpEncoder)

  0%|          | 9/10719575 [00:02<752:37:07,  3.96it/s]


In [1]:
from tqdm import tqdm
import json
import numpy as np
import pickle
from scipy.spatial import KDTree

# with open("/root/autodl-tmp/WebVid/res/vid2emb_webvid.pkl", 'rb') as f:
#     embeddings_webvid = pickle.load(f)

with open("/root/autodl-tmp/WebVid/res/vid2emb_msrvtt.pkl", 'rb') as f:
    embeddings_msrvtt = pickle.load(f)

with open("/root/autodl-tmp/WebVid/res/vid2emb_didemo.pkl", 'rb') as f:
    embeddings_didemo = pickle.load(f)

with open("/root/autodl-tmp/LocalizingMoments/vid2caps.json", 'r') as f:
    raw_captions = json.load(f)

num = 30
with open(f"/root/autodl-tmp/generateSearch/MSRVTTdataset/kmeans/k{30}_c{30}_seed_34_webvid.pkl", 'rb') as f:
    mapping = pickle.load(f)

In [3]:
rc = {}
for k in raw_captions:
    rc[k.split('.')[0]] = raw_captions[k]

In [5]:
rc['54322086@N00_2408598493_274c77d26a']

["a brown rat goes into someone's hand then onto a cage.",
 'a brown rat climbs on top of its cage.',
 'A person helps the light colored mouse move off the chair and onto the cage.',
 'grey rat eats from dish',
 'a rat jumps into a mans hand.']

In [8]:
from tqdm import tqdm
import json
def find_most_similar_id(target_vector, id_to_vector):
    # target_vector = id_to_vector[target_id]
    most_similar_id = None
    highest_similarity = -1  # 初始化为负数，表示相似度的最小值

    for id, vector in id_to_vector.items():
        similarity = cosine_similarity(target_vector, vector)
        if similarity > highest_similarity:
            highest_similarity = similarity
            most_similar_id = id

    return most_similar_id

class NpEncoder(json.JSONEncoder):
    def default(self, obj):
        if isinstance(obj, np.integer):
            return int(obj)
        if isinstance(obj, np.floating):
            return float(obj)
        if isinstance(obj, np.ndarray):
            return obj.tolist()
        return json.JSONEncoder.default(self, obj)

import numpy as np
def cosine_similarity(vec1, vec2):
    # vec1 = vec1.squeeze()
    # vec2 = vec2.squeeze()

    # 计算两个向量的余弦相似度
    dot_product = np.dot(vec1, vec2)
    norm1 = np.linalg.norm(vec1)
    norm2 = np.linalg.norm(vec2)
    similarity = dot_product / (norm1 * norm2)
    return similarity


didemo_id2cap = []
for vid in tqdm(embeddings_didemo):
    embedding = embeddings_didemo[vid]
    caption = rc[vid]
    most_similar_id = find_most_similar_id(embedding, embeddings_msrvtt)
    tokenseq = mapping[most_similar_id]
    didemo_id2cap.append([caption, most_similar_id, tokenseq])

    if (len(didemo_id2cap) == 10):
        break

with open(f"webvid/didemo_id2cap_{30}.json", 'w') as f:
    json.dump(didemo_id2cap, f, cls=NpEncoder)

  0%|          | 9/10423 [00:02<40:02,  4.33it/s]


In [37]:
import pickle
with open("/root/autodl-tmp/generateSearch/MSRVTTdataset/kmeans/k30_c30_seed_34_webvid2.pkl", 'rb') as f:
    mapping = pickle.load(f)

In [38]:
mapping['video0']

[7, 11, 19]

In [39]:
res = {}
for k in mapping:
    if "-".join([str(int(item)) for item in mapping[k]]) in res:
        res["-".join([str(int(item)) for item in mapping[k]])].append(k)
    else:
        res["-".join([str(int(item)) for item in mapping[k]])] = [k]

In [29]:
[k for k in res if len(res[k]) >= 2]

[]

In [4]:
with open("../data/MSRVTT/val_list_jsfusion.txt", "r") as f:
    val_list_id = f.readlines()
for i, item in enumerate(val_list_id):
    if '\n' in item:
        item = item.replace('\n', '')
    val_list_id[i] = item

In [42]:
res2 = {}
for vid in val_list_id:
    if "-".join([str(int(item)) for item in mapping[vid]]) in res2:
        res2["-".join([str(int(item)) for item in mapping[vid]])] += 1
    else:
        res2["-".join([str(int(item)) for item in mapping[vid]])] = 1

In [44]:
len([k for k in res2 if res2[k] >= 2])

126

---

In [2]:
import json

In [3]:
zero_shot_caption_test_path = 'MSRVTT_JSFUSION_test_titles.json'
zero_shot_caption_train_path = 'msrvtt_train_with_vitb32_max1_title_titles.json'

with open(zero_shot_caption_test_path, 'r') as f:
    zero_shot_caption_test = json.load(f)
with open(zero_shot_caption_train_path, 'r') as f:
    zero_shot_caption_train = json.load(f)

In [5]:
type(zero_shot_caption_test), list(zero_shot_caption_test.keys())[0:5]

(dict, ['video9770', 'video9771', 'video7020', 'video9773', 'video7026'])

In [6]:
type(zero_shot_caption_train), list(zero_shot_caption_train.keys())

(dict, ['info', 'videos', 'sentences', 'title', 'titles'])

In [52]:
import pickle
with open("/root/autodl-tmp/generateSearch/MSRVTTdataset/kmeans/k10_c10_seed_34_webvid2.pkl", 'rb') as f:
    mapping = pickle.load(f)

In [57]:
msrvtt_cap4video_id2cap = []

In [54]:
title = zero_shot_caption_train['title']
titles = zero_shot_caption_train['titles']

In [55]:
list(title.keys())[0:5], list(titles.keys())[0:5]

(['video4122', 'video212', 'video6570', 'video3994', 'video4583'],
 ['video10', 'video7', 'video0', 'video3', 'video23'])

In [56]:
title['video4122'], titles['video4122']

('Picture of interview with a female interviewer, who was asked if she had ever seen the video footage of an interview',
 ['Image shows a video interview with actress and singer Amy interviews her ex, actor.',
  'Image showing the video of a live interview with an actor, which was uploaded to social networking site.',
  'Image shows video interview of the interviewer in a previous season.',
  'Image shows a scene of the interview, where she said that her husband had been in contact with a number of',
  "Video showing interview with a woman who claimed she was kidnapped by the singer's producer.",
  "Picture shows interviewee's face being interviewed in a studio, with interviewer saying she was not allowed to speak.",
  'Image shows the footage that was taken by a member of staff interviewing an interviewee.',
  "Video shows footage from a woman who was interviewed by the show's producers, but not given an opportunity to explain",
  'Image shows a video interview with the victim, who wa

In [58]:
for k in titles:
    caps = titles[k]
    for cap in caps:
        msrvtt_cap4video_id2cap.append([cap, k, mapping[k]])

In [59]:
len(msrvtt_cap4video_id2cap) # 3 * 9000

270000

In [61]:
zero_shot_caption_test['video9771']

{'gt': 'a little girl does gymnastics',
 'titles': ['Photo shows girl jumping off gym floor to compete for girls gymnastics title.',
  'Video shows gymnasts compete in a jump competition.',
  'Video showing gymnast jumping from a platform to the ground.',
  "Image showing the jump from a gymnast's vault into an open gym.",
  'Picture of gymnast flips off a fence in the gym.',
  'Photo of the jump from schoolgirl to toddler by a gymnast.',
  "Video of gymnast's flip flips over hurdles to win gold medal in competition.",
  'Picture of jumps training for girls in gymnast outfit, which is part way up.',
  'Image shows a jumping girl with her mother, who is wearing the jumpsuit in this video clip from a gymn',
  'Picture shows the gymnast leaping from platform to jump, landing on her feet with a flip.',
  'Video showing gymnastics coach flips off competitor after being caught jumping from platform to podium in competition video posted by the',
  'Image showing jumps in gymnastics and diving

In [62]:
for k in zero_shot_caption_test:
    caps = zero_shot_caption_test[k]['titles']
    for cap in caps:
        msrvtt_cap4video_id2cap.append([cap, k, mapping[k]])

In [64]:
len(msrvtt_cap4video_id2cap) # 3*9000 + 3*1000

300000

In [68]:
import numpy as np
class NpEncoder(json.JSONEncoder):
    def default(self, obj):
        if isinstance(obj, np.integer):
            return int(obj)
        if isinstance(obj, np.floating):
            return float(obj)
        if isinstance(obj, np.ndarray):
            return obj.tolist()
        return json.JSONEncoder.default(self, obj)
with open("msrvtt_cap4video_id2cap.json", 'w') as f:
    json.dump(msrvtt_cap4video_id2cap, f, cls=NpEncoder)