In [None]:
# Mount Google Drive so the rest of the notebook can read and write under /content/drive.
from google.colab import drive
drive.mount('/content/drive')

Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).


In [None]:
!cd /content/drive/MyDrive/unilm/beit3 && pip install -r requirements.txt

Collecting timm==0.4.12 (from -r requirements.txt (line 3))
  Downloading timm-0.4.12-py3-none-any.whl.metadata (30 kB)
Collecting blobfile (from -r requirements.txt (line 5))
  Downloading blobfile-3.0.0-py3-none-any.whl.metadata (15 kB)
Collecting mypy (from -r requirements.txt (line 6))
  Downloading mypy-1.11.2-cp310-cp310-manylinux_2_17_x86_64.manylinux2014_x86_64.manylinux_2_28_x86_64.whl.metadata (1.9 kB)
Collecting tensorboardX (from -r requirements.txt (line 11))
  Downloading tensorboardX-2.6.2.2-py2.py3-none-any.whl.metadata (5.8 kB)
Collecting ftfy (from -r requirements.txt (line 13))
  Downloading ftfy-6.2.3-py3-none-any.whl.metadata (7.8 kB)
Collecting torchmetrics==0.7.3 (from -r requirements.txt (line 17))
  Downloading torchmetrics-0.7.3-py3-none-any.whl.metadata (20 kB)
Collecting deepspeed==0.4.0 (from -r requirements.txt (line 19))
  Downloading deepspeed-0.4.0.tar.gz (444 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m444.6/444.6 kB[0m [31m15

In [None]:
import json
import sys
import os
import torch
import numpy as np
from tqdm import tqdm
from PIL import Image
from torchvision import transforms
from transformers import XLMRobertaTokenizer
from torchvision.transforms.functional import InterpolationMode

In [None]:
# Make the BEiT-3 source directory importable.
# NOTE(review): the import below uses the dotted path `unilm.beit3...`, which is resolved
# from a directory that contains `unilm/`, not from the directory appended here —
# confirm which sys.path entry actually satisfies it on a fresh runtime.
sys.path.append('/content/drive/MyDrive/unilm/beit3')
from unilm.beit3.modeling_finetune import beit3_base_patch16_384_retrieval

In [None]:
# Build the BEiT-3 retrieval model, load the fine-tuned checkpoint and switch to eval mode.
model_weight_path = '/content/drive/MyDrive/beit3-retrieval-pth/beit3_base_patch16_384_f30k_retrieval.pth'
device = 'cuda' if torch.cuda.is_available() else 'cpu'
beit3_model = beit3_base_patch16_384_retrieval(pretrained=True)
# map_location='cpu' lets the checkpoint load on CPU-only runtimes even if its tensors
# were saved from a GPU; the model is moved to `device` right after the weights are copied.
checkpoint = torch.load(model_weight_path, map_location='cpu')
beit3_model.load_state_dict(checkpoint['model'])
beit3_model.to(device)
beit3_model.eval()

  checkpoint = torch.load(model_weight_path)


BEiT3ForRetrieval(
  (beit3): BEiT3(
    (text_embed): TextEmbedding(64010, 768)
    (vision_embed): VisionEmbedding(
      (proj): Conv2d(3, 768, kernel_size=(16, 16), stride=(16, 16))
    )
    (encoder): Encoder(
      (dropout_module): Dropout(p=0.0, inplace=False)
      (embed_positions): MutliwayEmbedding(
        (A): PositionalEmbedding(579, 768)
        (B): PositionalEmbedding(1024, 768)
      )
      (layers): ModuleList(
        (0-11): 12 x EncoderLayer(
          (self_attn): MultiheadAttention(
            (k_proj): MultiwayNetwork(
              (A): Linear(in_features=768, out_features=768, bias=True)
              (B): Linear(in_features=768, out_features=768, bias=True)
            )
            (v_proj): MultiwayNetwork(
              (A): Linear(in_features=768, out_features=768, bias=True)
              (B): Linear(in_features=768, out_features=768, bias=True)
            )
            (q_proj): MultiwayNetwork(
              (A): Linear(in_features=768, out_featu

In [None]:


def get_sentencepiece_model_for_beit3(model_path):
    """Create the tokenizer used for BEiT-3 text inputs.

    Args:
        model_path: Value passed as the first argument to ``XLMRobertaTokenizer``
            (presumably the path of a SentencePiece model file — confirm with caller).

    Returns:
        XLMRobertaTokenizer: The tokenizer instance built from ``model_path``.
    """
    from transformers import XLMRobertaTokenizer as _TokenizerCls

    tokenizer = _TokenizerCls(model_path)
    return tokenizer

def to_text_tokens(text, tokenizer, max_len = 64):
    """Tokenize ``text`` into a fixed-length id tensor plus its padding mask.

    The text is tokenized, truncated so that the BOS and EOS markers still fit
    within ``max_len``, wrapped in BOS/EOS and right-padded with the pad token.

    Args:
        text (str): Text to tokenize.
        tokenizer: Object providing ``tokenize``, ``convert_tokens_to_ids`` and the
            ``bos_token_id`` / ``eos_token_id`` / ``pad_token_id`` attributes.
        max_len (int): Target sequence length, BOS and EOS included.

    Returns:
        tuple: ``(token_ids, padding_mask)``, each a tensor with a leading dimension
        of 1, moved to the global ``device``. The mask holds 0 for real tokens and
        1 for padding positions.
    """
    # Two positions are reserved for the BOS/EOS markers added below.
    body_ids = tokenizer.convert_tokens_to_ids(tokenizer.tokenize(text))[:max_len - 2]
    sequence = [tokenizer.bos_token_id, *body_ids, tokenizer.eos_token_id]

    pad_count = max_len - len(sequence)
    mask = [0] * len(sequence) + [1] * pad_count
    padded_ids = sequence + [tokenizer.pad_token_id] * pad_count

    padding_mask_tensor = torch.tensor(mask).unsqueeze(0).to(device)
    token_ids_tensor = torch.tensor(padded_ids).unsqueeze(0).to(device)
    return token_ids_tensor, padding_mask_tensor

def calc_text_embedding(text, tokenizer):
    """Encode ``text`` with the global ``beit3_model`` and return its text embedding.

    Args:
        text (str): Text to embed.
        tokenizer: Tokenizer forwarded to ``to_text_tokens``.

    Returns:
        The second element of the model's output for a text-only forward pass
        (presumably the language feature of the retrieval model — confirm against
        the model's ``forward``). The tensor carries no autograd history.
    """
    text_tokens, padding_mask = to_text_tokens(text, tokenizer)
    # Inference only: without no_grad every call would record an autograd graph and
    # keep activations alive for as long as the returned tensor is referenced.
    with torch.no_grad():
        text_embedding = beit3_model(text_description=text_tokens, padding_mask=padding_mask, only_infer=True)
    return text_embedding[1]



In [None]:
import torch
from torchvision import transforms
from PIL import Image
import numpy as np
from torch.nn.functional import normalize

def encode_images(image_paths, batch_size, image_size=384):
    """
    Encode a list of images into L2-normalized feature vectors with the global ``beit3_model``.

    Images are opened, preprocessed and moved to ``device`` one batch at a time, so
    peak memory scales with ``batch_size`` instead of with ``len(image_paths)``.

    Args:
        image_paths (list): List of image paths to be encoded.
        batch_size (int): Number of images to process in a single batch.
        image_size (int): Size to which each image will be resized (default is 384).

    Returns:
        video_features (list): One flat float32 numpy array per input image, in the
        same order as ``image_paths``. An empty list when ``image_paths`` is empty.
    """
    image_paths = list(image_paths)
    if not image_paths:
        # Nothing to encode; avoids stacking an empty list of tensors.
        return []

    # The preprocessing pipeline does not depend on the image, so build it once.
    # NOTE(review): no transforms.Normalize step is applied here — confirm this matches
    # the preprocessing the checkpoint was fine-tuned with. Left unchanged so that new
    # embeddings stay comparable with ones already written to disk.
    transform = transforms.Compose([
        transforms.Resize((image_size, image_size), interpolation=transforms.InterpolationMode.BICUBIC),
        transforms.ToTensor(),
    ])

    video_features = []

    with torch.no_grad():
        for start_index in range(0, len(image_paths), batch_size):
            batch_paths = image_paths[start_index:start_index + batch_size]

            # Load only the current batch; the context manager closes each file handle.
            batch_tensors = []
            for image_path in batch_paths:
                with Image.open(image_path) as raw_image:
                    batch_tensors.append(transform(raw_image.convert('RGB')))
            images = torch.stack(batch_tensors, dim=0).to(device)

            # Image-only forward pass; the second output is unused here.
            image_features, _ = beit3_model(image=images, only_infer=True)

            # Normalize the feature vectors to unit L2 norm.
            image_features = normalize(image_features, p=2, dim=-1)

            # Single device-to-host copy per batch, then one flat vector per image.
            batch_features = image_features.cpu().numpy().astype(np.float32)
            video_features.extend(row.flatten() for row in batch_features)

    return video_features

In [None]:
import os
import re

def sorted_by_id(keyframe_paths):
    """Return keyframe paths ordered by the first integer found in each file name.

    Paths whose base name contains no digits are reported with a printed warning
    and left out of the result. Paths sharing the same ID keep their input order.

    Args:
        keyframe_paths: Iterable of keyframe file paths.

    Returns:
        list: The retained paths, sorted by ascending numeric ID.
    """
    first_number = re.compile(r'\d+')
    keyed_paths = []

    for path in keyframe_paths:
        name = os.path.basename(path)
        found = first_number.search(name)
        if found is None:
            print(f"Warning: {name} does not contain a valid ID")
            continue
        # The first run of digits in the file name is used as the ID.
        keyed_paths.append((int(found.group()), path))

    # Sort on the ID only, so ties are not broken by comparing the paths.
    keyed_paths.sort(key=lambda pair: pair[0])
    return [path for _, path in keyed_paths]

In [None]:
# Root output directory for the frame-embedding files.
feature_save_dir="/content/drive/MyDrive/Frame Embedding/"
# exist_ok=True makes the cell safe to re-run and removes the check-then-create race.
os.makedirs(feature_save_dir, exist_ok=True)

In [None]:
def convert_frame_path_to_id(frame_path):
    """Build a frame ID of the form ``<video id>_<frame number>`` from a frame path.

    The video ID is made of the first two underscore-separated pieces of the parent
    folder name. The frame number is the file name with ``.jpg`` and ``frame_``
    removed, left-padded with zeros to at least five characters.

    Args:
        frame_path (str): '/'-separated path ending in ``<folder>/<file>``.

    Returns:
        str: The assembled frame ID.
    """
    components = frame_path.split('/')
    # The parent folder (not the file) carries the video identifier.
    folder_tokens = components[-2].split('_')
    video_id = f"{folder_tokens[0]}_{folder_tokens[1]}"
    frame_number = components[-1].replace('.jpg', '').replace('frame_', '')
    return f"{video_id}_{frame_number.zfill(5)}"

In [None]:

# Function to save keyframe embeddings and paths to a JSON Lines file
def save_to_json_lines(file_path, data):
    """Append each item of ``data`` to ``file_path`` as one JSON document per line.

    Args:
        file_path (str): Destination file; created if missing, otherwise appended to.
        data (iterable): JSON-serializable items, written in iteration order.

    Raises:
        TypeError: If an item is not JSON-serializable. Lines for earlier items are
            kept; no partial line is written for the failing item.
    """
    with open(file_path, "a") as json_file:  # Append mode keeps existing records
        for item in data:
            # Serialize the whole item before touching the file: json.dump streams
            # chunks and would leave a truncated line behind if encoding failed midway.
            json_file.write(json.dumps(item) + '\n')

# Function to process keyframes, extract embeddings, and save to JSON Lines
def process_video_embeddings(video_folder_path, batch_size, feature_save_path):
    """
    Processes all keyframes from videos in the specified folder, extracts their embeddings, and appends them to a JSON Lines file.

    Each sub-directory of ``video_folder_path`` is treated as one video. Its files are
    ordered with ``sorted_by_id``, encoded with ``encode_images`` and written through
    ``save_to_json_lines`` as records of the form
    ``{"frame_id": ..., "frame_embedding": [[...]]}``.

    The output file is opened in append mode, so running this twice with the same
    ``feature_save_path`` appends the records a second time.

    Args:
        video_folder_path (str): Path to the folder containing one sub-folder per video.
        batch_size (int): The batch size for encoding images.
        feature_save_path (str): Path where the JSON Lines file will be saved.

    Returns:
        None
    """
    # Make sure the destination directory exists before the file is opened.
    save_dir = os.path.dirname(feature_save_path)
    if save_dir:
        os.makedirs(save_dir, exist_ok=True)

    # Ensure the file is created even if no video yields any frame.
    with open(feature_save_path, 'a'):
        pass

    # Loop through each subfolder in the video folder
    for video_id in tqdm(sorted(os.listdir(video_folder_path)), desc='Processing Videos'):
        video_id_path = os.path.join(video_folder_path, video_id)

        # Only directories are treated as videos; stray files are ignored.
        if not os.path.isdir(video_id_path):
            continue

        # Get and sort the keyframe paths
        keyframe_image_paths = [
            os.path.join(video_id_path, keyframe_name)
            for keyframe_name in os.listdir(video_id_path)
        ]
        sorted_keyframe_image_paths = sorted_by_id(keyframe_image_paths)

        # A folder without usable keyframes has nothing to encode; skip it rather
        # than handing an empty batch to the encoder.
        if not sorted_keyframe_image_paths:
            continue

        # Encode the sorted keyframe images
        video_features = encode_images(sorted_keyframe_image_paths, batch_size)

        # Pair every embedding with the path it was computed from.
        all_frame_embeddings = [
            {
                "frame_id": convert_frame_path_to_id(absolute_frame_path),
                "frame_embedding": [feature.tolist()]
            }
            for absolute_frame_path, feature in zip(sorted_keyframe_image_paths, video_features)
        ]

        save_to_json_lines(feature_save_path, all_frame_embeddings)

    print(f"Frame embeddings and paths saved to {feature_save_path}")


In [None]:
# Release cached, currently unused GPU memory before the long encoding runs below.
torch.cuda.empty_cache()

In [None]:
# Embed the keyframes of collections L17 and L18 (one output file per collection).
batch_size = 32
for i in range(17, 19):
    collection = f'L{str(i).zfill(2)}'
    print(f'Processing {collection}')
    all_video_paths = f'/content/drive/MyDrive/Qualifying Round Full Frame (DONT TOUCH)/{collection}_rest'
    features_save_path = f'/content/drive/MyDrive/Frame Embedding/Frame Embeddings Of Ls Beit3/frame_embedding_{collection}_Beit3_rest.json'
    process_video_embeddings(all_video_paths, batch_size, features_save_path)

Processing L17


Processing Videos: 100%|██████████| 28/28 [23:29<00:00, 50.32s/it]


Frame embeddings and paths saved to /content/drive/MyDrive/Frame Embedding/Frame Embeddings Of Ls Beit3/frame_embedding_L17_Beit3_rest.json
Processing L18


Processing Videos: 100%|██████████| 29/29 [29:55<00:00, 61.91s/it]

Frame embeddings and paths saved to /content/drive/MyDrive/Frame Embedding/Frame Embeddings Of Ls Beit3/frame_embedding_L18_Beit3_rest.json





In [None]:
# Embed the keyframes of collection L20.
batch_size = 32
for i in range(20, 21):
    collection = f'L{str(i).zfill(2)}'
    print(f'Processing {collection}')
    all_video_paths = f'/content/drive/MyDrive/Qualifying Round Full Frame (DONT TOUCH)/{collection}_rest'
    features_save_path = f'/content/drive/MyDrive/Frame Embedding/Frame Embeddings Of Ls Beit3/frame_embedding_{collection}_Beit3_rest.json'
    process_video_embeddings(all_video_paths, batch_size, features_save_path)

Processing L20


Processing Videos: 100%|██████████| 58/58 [30:22<00:00, 31.43s/it]

Frame embeddings and paths saved to /content/drive/MyDrive/Frame Embedding/Frame Embeddings Of Ls Beit3/frame_embedding_L20_Beit3_rest.json





In [None]:
import os

def count_total_frames(video_folder_path):
    """
    Sum the number of directory entries across all video sub-folders.

    Only sub-folders whose name starts with ``'L'`` are considered; every entry
    inside such a folder is counted as one frame.

    Args:
        video_folder_path (str): Path to the folder containing video subfolders.

    Returns:
        int: Total number of frames across all videos.
    """
    # Candidate video folders: entries named 'L...' (files are filtered out below).
    candidate_dirs = (
        os.path.join(video_folder_path, entry)
        for entry in os.listdir(video_folder_path)
        if entry.startswith('L')
    )
    return sum(
        len(os.listdir(video_dir))
        for video_dir in candidate_dirs
        if os.path.isdir(video_dir)
    )


In [None]:
# Sanity check: total number of frame files under the L18_rest folder.
count_total_frames('/content/drive/MyDrive/Qualifying Round Full Frame (DONT TOUCH)/L18_rest')

18083