In [3]:
import torch
import torchvision
import clip
import os
from tqdm import tqdm
import pandas as pd
import numpy as np
import pickle

# Pick the compute device: prefer CUDA, then Apple MPS, and fall back to CPU.
if torch.cuda.is_available():
    device = "cuda"
elif torch.backends.mps.is_available():
    device = "mps"
else:
    device = "cpu"
print(f"Using {device} device")



Using cuda device


In [4]:
# --- Configuration -----------------------------------------------------------
# CLIP variant handed to clip.load below.
clip_model_type = "ViT-B/32"

# CSV files: csv_path supplies the video ids for feature extraction,
# csv_path2 supplies the ids and descriptions used to build the caption pickle.
csv_path = "./Charades/Charades_v1_train.csv"
csv_path2 = "./Charades/Charades_gpt_train.csv"

# Directory holding the source <id>.mp4 files, and the directory where the
# per-video <id>.pt feature tensors are written / read back.
video_path = "./video"
pt_path = "./video_tensors_120"

# Sampling density used by preprocess_frames (frames kept per 60 s of video)
# and the number of videos encoded together in one CLIP forward pass.
frames_no = 60
num_video_batch = 50

# Load the CLIP model together with its matching image preprocessing transform.
model, preprocess = clip.load(clip_model_type, jit=False, device=device)

In [3]:
# extract frames
def preprocess_frames(video_file):
    """Sample frames from a video and run them through the CLIP preprocess transform.

    Frames are taken at a fixed stride of ``int(fps * 60 / frames_no)`` decoded
    frames, i.e. roughly ``frames_no`` frames per 60 seconds of video. The
    stride is clamped to at least 1 so that a large ``frames_no`` (or a very
    low frame rate) cannot produce a zero step for ``range``.

    Args:
        video_file: Path to the video file to decode.

    Returns:
        A tensor on ``device`` holding the preprocessed sampled frames stacked
        along dimension 0 (one entry per sampled frame).

    Raises:
        ValueError: If no frames could be decoded from ``video_file``.
    """
    video, _audio, info = torchvision.io.read_video(video_file, output_format='TCHW')

    num_frames = len(video)
    if num_frames == 0:
        # Fail with a clear message instead of an opaque error from stacking
        # an empty list further down.
        raise ValueError(f"no frames decoded from {video_file}")

    frame_rate = info["video_fps"]
    # Clamp to 1: int() truncates to 0 whenever frames_no > frame_rate * 60,
    # and range() rejects a step of 0.
    interval = max(1, int(frame_rate * 60 / frames_no))

    to_pil = torchvision.transforms.ToPILImage()
    frames = [preprocess(to_pil(video[i])) for i in range(0, num_frames, interval)]

    # Stack on the host first and transfer once, rather than once per frame.
    return torch.stack(frames, dim=0).to(device)

In [4]:
# Encode a batch of per-video frame stacks with CLIP in a single forward pass
def CLIPtokenise(tensors):
    """Encode several videos' frame stacks with CLIP and split the result per video.

    All stacks are concatenated along dimension 0, encoded with
    ``model.encode_image`` under ``torch.no_grad()``, and the resulting feature
    rows are split back so that entry ``k`` of the result has as many rows as
    ``tensors[k]`` had frames.

    Args:
        tensors: Sequence of tensors, one per video, whose dimension 0 indexes
            frames (as produced by ``preprocess_frames``).

    Returns:
        A tuple of feature tensors, one per input stack and in the same order.
        Each entry is an independent copy rather than a view into the batch
        result, so saving one entry with ``torch.save`` does not serialise the
        whole batch's storage. An empty input yields an empty tuple.
    """
    # Nothing to encode (e.g. every video in the batch failed to load).
    if len(tensors) == 0:
        return ()

    # Number of frames contributed by each video, used to undo the concat.
    frame_counts = [t.shape[0] for t in tensors]

    joined = torch.cat(tensors, dim=0)

    with torch.no_grad():
        image_features = model.encode_image(joined)

    print(image_features.shape)

    # torch.split returns views that share one storage; clone each chunk so
    # the per-video tensors are independent of the full batch tensor.
    chunks = torch.split(image_features, frame_counts, dim=0)
    return tuple(chunk.clone() for chunk in chunks)




In [5]:
# load captions and video names
df = pd.read_csv(csv_path)

# Position in the id column at which to start processing; rows before it are
# skipped (presumably already processed in an earlier run -- TODO confirm).
start_index = 3750
ids = df["id"].iloc[start_index:].copy()
errors = []

# Make sure the output directory exists before the first torch.save.
os.makedirs(pt_path, exist_ok=True)

# loop over the ids with a batch size step
for i in tqdm(range(0, len(ids), num_video_batch)):
    # get the current batch of ids (positional slice)
    batch_ids = ids.iloc[i:i + num_video_batch]

    batch = []       # preprocessed frame stacks of the videos that loaded
    loaded_ids = []  # ids of those videos, in the same order as `batch`

    for video_id in batch_ids:
        try:
            # get the video file name with the corresponding id
            video_file = os.path.join(video_path, video_id + ".mp4")
            batch.append(preprocess_frames(video_file))
            loaded_ids.append(video_id)
        except Exception as e:
            # Best effort: record the failure and carry on with the next video.
            errors.append(f"video {video_id} error {e}")

    # If every video in this batch failed there is nothing to encode.
    if not batch:
        continue

    tokenised_batch = CLIPtokenise(batch)

    # CLIPtokenise returns one feature tensor per input stack, in order.
    for video_id, features in zip(loaded_ids, tokenised_batch):
        torch.save(features, f"{pt_path}/{video_id}.pt")

        




torch.Size([1457, 512])


  1%|          | 1/85 [01:40<2:21:13, 100.87s/it]

torch.Size([1500, 512])


  2%|▏         | 2/85 [02:39<1:44:58, 75.88s/it] 

torch.Size([1536, 512])


  4%|▎         | 3/85 [03:36<1:32:16, 67.51s/it]

torch.Size([1674, 512])


  5%|▍         | 4/85 [04:41<1:29:24, 66.23s/it]

torch.Size([1635, 512])


  6%|▌         | 5/85 [05:44<1:26:54, 65.18s/it]

torch.Size([1606, 512])


  7%|▋         | 6/85 [06:46<1:24:35, 64.24s/it]

torch.Size([1545, 512])


  8%|▊         | 7/85 [07:45<1:21:13, 62.48s/it]

torch.Size([1584, 512])


  9%|▉         | 8/85 [08:48<1:20:07, 62.44s/it]

torch.Size([1593, 512])


 11%|█         | 9/85 [10:04<1:25:08, 67.21s/it]


KeyboardInterrupt: 

In [6]:
df = pd.read_csv(csv_path2)
# Initialize empty lists to store the embeddings and captions
all_embeddings = []
all_captions = []
# Ids whose feature file is absent (e.g. videos that failed during extraction).
missing_ids = []

# The last `holdout_rows` rows of the CSV are left out of the pickle
# (presumably reserved for evaluation -- TODO confirm).
holdout_rows = 200

for i in tqdm(range(len(df["id"]) - holdout_rows)):
    video_id = df.loc[i, "id"]
    path = os.path.join(pt_path, video_id + ".pt")
    # Skip rows without a saved tensor so one missing file does not abort the
    # whole run; embeddings and captions stay aligned because both are skipped.
    if not os.path.exists(path):
        missing_ids.append(video_id)
        continue
    # load the .pt file: it holds the CLIP embedding for this video
    # NOTE(review): torch.load unpickles the file; only use on trusted files.
    clip_embedding = torch.load(path)
    caption = df.loc[i, 'descriptions']
    # Append them to the lists
    all_embeddings.append(clip_embedding)
    all_captions.append(caption)

if missing_ids:
    print(f"skipped {len(missing_ids)} ids with no saved tensor in {pt_path}")

# Save the final dictionary as a pickle file
output_path = "./clip_caption/clip_caption.pkl"
os.makedirs(os.path.dirname(output_path), exist_ok=True)
with open(output_path, 'wb') as f:
    pickle.dump({"clip_embedding": all_embeddings, "captions": all_captions}, f)


100%|██████████| 7783/7783 [00:30<00:00, 259.14it/s]


In [None]:
# Show the messages collected in `errors` for videos that could not be processed.
print(errors)


In [None]:
# Sanity check: encode one frame on its own and compare it against the first
# row of the tensor that was saved for the same video by the batched pipeline.
video, audio, info = torchvision.io.read_video("./video/S6MPZ.mp4", output_format='TCHW')
to_pil = torchvision.transforms.ToPILImage()

# First decoded frame -> PIL image -> CLIP preprocessing -> batch of one.
first_frame = to_pil(video[0].squeeze(0))
image = preprocess(first_frame).unsqueeze(0).to(device)

with torch.no_grad():
    image_features = model.encode_image(image)

# NOTE(review): this reads from "./video_tensors", not from pt_path
# ("./video_tensors_120") -- confirm which directory is intended.
tensor = torch.load("./video_tensors/S6MPZ.pt")
print(tensor)

# Keep the leading dimension so both operands are 2-D (1 x features).
stored_first = tensor[0].unsqueeze(0)
cos_sim = torch.nn.functional.cosine_similarity(image_features, stored_first)
euc_dist = torch.nn.functional.pairwise_distance(image_features, stored_first)

print(cos_sim)
print(euc_dist)
