In [1]:
# IMPORT LIBRARIES
from glob import glob
import pandas as pd
import numpy as np
import chromadb
import matplotlib.pyplot as plt
import torch
import torch.nn as nn
import torch.optim as optim
from torch.utils.data import Dataset, DataLoader
from sklearn.preprocessing import StandardScaler
from sklearn.metrics.pairwise import normalize, cosine_similarity
import torchvision.models as models
import torchvision.transforms as transforms
from PIL import Image
import os
import csv
from tqdm import tqdm
from transformers import AutoTokenizer, AutoModel
import torchtext
from torchtext.data.utils import get_tokenizer

## Initialize TextEncoder, VisionEncoder and DualEncoder

In [2]:
class ProjectionHead(nn.Module):
    def __init__(self, input_dim, projection_dim, num_layers=2, dropout_rate=0.1):
        super(ProjectionHead, self).__init__()
        layers = []
        layers.append(nn.Linear(input_dim, projection_dim))
        for _ in range(num_layers):
            layers.extend([nn.ReLU(), nn.Linear(projection_dim, projection_dim), nn.Dropout(dropout_rate)])
        self.projection = nn.Sequential(*layers)

    def forward(self, x):
        return self.projection(x)

class VisionEncoder(nn.Module):
    def __init__(self, input_channels, projection_dim):
        super(VisionEncoder, self).__init__()
        self.fc_layers = nn.Sequential(
            nn.Linear(input_channels, 256),
            nn.ReLU(),
            nn.Linear(256, 128),
            nn.ReLU(),
            nn.Linear(128, projection_dim),
        )

    def forward(self, x):
        x = self.fc_layers(x)
        return x
 
class TextEncoder(nn.Module):
    def __init__(self, input_dim, projection_dim):
        super(TextEncoder, self).__init__()
        self.fc_layers = nn.Sequential(
            nn.Linear(input_dim, 256),
            nn.ReLU(),
            nn.Linear(256, 128),
            nn.ReLU(),
            nn.Linear(128, projection_dim),
        )

    def forward(self, x):
        x = self.fc_layers(x)
        return x

In [3]:
class EmbeddingsDataset(Dataset):
    def __init__(self, video_embeddings, text_embeddings):
        self.video_embeddings = video_embeddings
        self.text_embeddings = text_embeddings

    def __len__(self):
        return len(self.video_embeddings)

    def __getitem__(self, idx):
        video_embedding = self.video_embeddings[idx]
        text_embedding = self.text_embeddings[idx]

        return {'video_embedding': video_embedding, 'text_embedding': text_embedding}


In [4]:
class DualEncoder(nn.Module):
    def __init__(self, text_encoder, vision_encoder, temperature=1.0):
        super(DualEncoder, self).__init__()
        self.text_encoder = text_encoder
        self.vision_encoder = vision_encoder
        self.temperature = temperature
        self.loss_criterion = nn.CrossEntropyLoss()

    def forward(self, captions, images):
        # Get the embeddings
        caption_embeddings = self.text_encoder(captions)
        image_embeddings = self.vision_encoder(images)
        return caption_embeddings, image_embeddings

    def compute_loss(self, caption_embeddings, image_embeddings):
        # Calculate dot-product similarity for captions and images.
        logits = torch.matmul(caption_embeddings, image_embeddings.t()) / self.temperature
        captions_similarity = torch.matmul(caption_embeddings, caption_embeddings.t())
        images_similarity = torch.matmul(image_embeddings, image_embeddings.t())
        # Targets: average of similarities between captions and images.
        targets = torch.nn.functional.softmax(
            (captions_similarity + images_similarity) / (2 * self.temperature), dim=1
        )
        # Compute the loss using crossentropy.
        captions_loss = self.loss_criterion(logits, torch.argmax(targets, dim=1))
        images_loss = self.loss_criterion(logits.t(), torch.argmax(targets, dim=0))
        # Return the mean of the loss over the batch.
        return (captions_loss + images_loss) / 2

    def training_step(self, captions, images):
        # Forward pass
        caption_embeddings, image_embeddings = self(captions, images)
        loss = self.compute_loss(caption_embeddings, image_embeddings)
        return loss

    def validation_step(self, captions, images):
        caption_embeddings, image_embeddings = self(captions, images)
        loss = self.compute_loss(caption_embeddings, image_embeddings)
        return loss


In [5]:
# Instantiate VisionEncoder, TextEncoder
vision_encoder = VisionEncoder(input_channels=300, projection_dim=512)
text_encoder = TextEncoder(input_dim=300, projection_dim=512)
dual_encoder = DualEncoder(text_encoder, vision_encoder, temperature=1.0)

# Load the trained model state dictionary
dual_encoder.load_state_dict(torch.load('final_trained_model.pth'))

# Set the model to evaluation mode
dual_encoder.eval()

DualEncoder(
  (text_encoder): TextEncoder(
    (fc_layers): Sequential(
      (0): Linear(in_features=300, out_features=256, bias=True)
      (1): ReLU()
      (2): Linear(in_features=256, out_features=128, bias=True)
      (3): ReLU()
      (4): Linear(in_features=128, out_features=512, bias=True)
    )
  )
  (vision_encoder): VisionEncoder(
    (fc_layers): Sequential(
      (0): Linear(in_features=300, out_features=256, bias=True)
      (1): ReLU()
      (2): Linear(in_features=256, out_features=128, bias=True)
      (3): ReLU()
      (4): Linear(in_features=128, out_features=512, bias=True)
    )
  )
  (loss_criterion): CrossEntropyLoss()
)

In [6]:
def StringEncoder(model_name, text):
    # Load pre-trained model and tokenizer
    tokenizer = AutoTokenizer.from_pretrained(model_name)
    model = AutoModel.from_pretrained(model_name)

    # Tokenize the input text and get the embeddings
    input_ids = tokenizer(text, return_tensors="pt")["input_ids"]
    with torch.no_grad():
        embeddings = model(input_ids)["last_hidden_state"]

    # Return the embeddings
    return embeddings

## Convert Text to Embedding

Output: text embedding tensor

In [7]:
query_text = "Swifts Appearance Drives Up Ticket Prices For Sundays Chiefs Jets Game"
# Tokenize the text and convert it to a tensor
tokenizer = get_tokenizer('basic_english')
tokens = tokenizer(query_text)
vocab = set(tokens)
numerical_representation = torch.tensor([int(hash(token)) % (2**32 - 1) for token in tokens])

# Create an instance of TextEncoder
input_dim = len(vocab)  
projection_dim =  512
text_encoder = TextEncoder(input_dim, projection_dim)

print("Input tensor shape:", numerical_representation.shape)
print("Model input dimension:", input_dim)

# Convert numerical_representation to the same dtype as the model's parameters
numerical_representation = numerical_representation.to(dtype=text_encoder.fc_layers[0].weight.dtype)

# Forward pass through the TextEncoder
query_embedding = text_encoder(numerical_representation)
print("Query Embedding:", query_embedding)

Input tensor shape: torch.Size([11])
Model input dimension: 11
Query Embedding: tensor([-1.3131e+08, -1.7661e+08,  6.6014e+07,  2.8985e+07, -8.5384e+07,
         2.3054e+08, -7.5165e+07, -1.2140e+08, -3.3044e+07,  3.4474e+08,
         2.4152e+08,  2.2820e+07,  5.0775e+08, -1.4650e+08,  5.9302e+07,
        -9.3238e+07, -1.5730e+08, -3.7881e+07, -2.7724e+08, -2.3220e+08,
        -7.5880e+07,  6.2738e+07, -3.6293e+08,  1.3033e+08, -2.3097e+07,
         9.3789e+07, -8.0836e+07, -6.4580e+07,  1.7577e+07,  2.8726e+07,
        -7.0629e+07, -1.4580e+08, -2.4964e+06,  5.8642e+07,  1.5697e+08,
        -3.0691e+07,  1.2644e+08, -8.5521e+07, -2.2248e+07, -2.6229e+08,
        -1.7647e+08, -1.3815e+08, -9.9745e+07, -2.8437e+08, -8.3837e+07,
        -1.7935e+07, -1.3109e+07, -1.2078e+08,  2.2371e+08, -9.7195e+07,
        -2.2799e+08,  2.5087e+08, -1.0022e+08,  2.6720e+07, -1.8680e+08,
         1.0164e+08,  7.0585e+07,  2.6267e+07,  2.1471e+08, -7.6310e+07,
        -2.1280e+08, -8.7102e+07, -1.9998e+0

## Create image embedding dataframe from precomputed image embeddings

Output: dataframe contain all frames of all videos and their corresponding image embeddings that were previously computed using a pre-trained ResNet-18 model as an image encoder (see ***Demo-Video-Similarity-Search/create_video_embeddings.ipynb*** for demonstration).

In [8]:
folder_path = 'DATASET-VideoEmbeddings'
csv_files = [f for f in os.listdir(folder_path) if f.endswith('.csv')]


# Load each CSV file and concatenate the DataFrames
df = pd.DataFrame()
for csv_file in csv_files:
    file_path = os.path.join(folder_path, csv_file)
    df_vid = pd.read_csv(file_path)

    # Add a 'Video_Name' column to identify each video
    video_name = os.path.splitext(csv_file)[0] 
    df_vid['Video_Name'] = video_name
    df = pd.concat([df, df_vid], ignore_index=True)

# Normalize embeddings
# Separate 'Frame_Name' column for later use
frame_names = df['Frame_Name']
df = df.drop(columns=['Frame_Name', 'Video_Name'])
df = pd.DataFrame(normalize(df, axis=1), columns=df.columns)

# Concatenate 'Frame_Name' column back to the DataFrame
df['Frame_Name'] = frame_names

  df = pd.concat([df, df_vid], ignore_index=True)


In [9]:
df.head(-10) # Frame_Name is the last column

Unnamed: 0,Feature_0,Feature_1,Feature_2,Feature_3,Feature_4,Feature_5,Feature_6,Feature_7,Feature_8,Feature_9,...,Feature_503,Feature_504,Feature_505,Feature_506,Feature_507,Feature_508,Feature_509,Feature_510,Feature_511,Frame_Name
0,0.045443,0.044564,0.043701,0.050924,0.039351,0.042087,0.044200,0.056195,0.044337,0.043860,...,0.039878,0.043183,0.042164,0.050661,0.047833,0.046347,0.041723,0.048221,0.046147,2000_Bodies_Recovered_After_Dam_Bursts_In_Dern...
1,0.045598,0.045650,0.045880,0.050546,0.040334,0.042043,0.045416,0.056799,0.044544,0.042159,...,0.040946,0.044024,0.042006,0.055342,0.048176,0.046328,0.043464,0.049472,0.044225,2000_Bodies_Recovered_After_Dam_Bursts_In_Dern...
2,0.045179,0.045751,0.045950,0.050591,0.040414,0.040394,0.043415,0.056406,0.045872,0.044331,...,0.042489,0.043959,0.043274,0.054938,0.049206,0.046792,0.044024,0.051492,0.042095,2000_Bodies_Recovered_After_Dam_Bursts_In_Dern...
3,0.044533,0.045341,0.044516,0.049777,0.042718,0.039888,0.043203,0.057817,0.046113,0.043583,...,0.042967,0.043936,0.043244,0.053722,0.048281,0.046948,0.043002,0.050004,0.043348,2000_Bodies_Recovered_After_Dam_Bursts_In_Dern...
4,0.044543,0.044307,0.044359,0.050740,0.038902,0.042239,0.044177,0.056450,0.045668,0.043098,...,0.040940,0.044535,0.041731,0.053873,0.049166,0.048552,0.044719,0.051249,0.043723,2000_Bodies_Recovered_After_Dam_Bursts_In_Dern...
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
10985,0.042669,0.051468,0.046789,0.052202,0.046959,0.041746,0.040148,0.054444,0.045893,0.044535,...,0.041764,0.041338,0.040663,0.052616,0.048509,0.045287,0.043127,0.050034,0.047328,Zelenskyy_Addresses_The_UN_Security_Council_To...
10986,0.043562,0.044194,0.047336,0.046317,0.044575,0.039825,0.045231,0.057896,0.045042,0.045179,...,0.041124,0.042022,0.041736,0.054149,0.046368,0.041673,0.042671,0.047120,0.047027,Zelenskyy_Addresses_The_UN_Security_Council_To...
10987,0.042242,0.048944,0.044493,0.048755,0.046315,0.038265,0.043463,0.058381,0.045202,0.046411,...,0.042628,0.043353,0.041210,0.055984,0.050291,0.044614,0.044134,0.052522,0.045944,Zelenskyy_Addresses_The_UN_Security_Council_To...
10988,0.043596,0.043367,0.046413,0.050501,0.046416,0.037183,0.040676,0.058908,0.047449,0.046202,...,0.042659,0.036741,0.039801,0.054503,0.049948,0.045490,0.044431,0.051799,0.047692,Zelenskyy_Addresses_The_UN_Security_Council_To...


## Find similar images from user query embedding

In [10]:
def semantic_search(query_embedding, img_dataframe, top_n=10):
    """
    Perform semantic search using cosine similarity.
    
    Parameters:
    - query_embedding: The vector of the query.
    - img_dataframe: The img_dataframe containing all vectors.
    - top_n: Number of similar vectors to retrieve.
    
    Returns:
    - top_n_indices: Indices of the top N similar vectors.
    """
    # Calculate cosine similarity
    # cosine_similarities = cosine_similarity([query_embedding], img_dataframe.iloc[:, :-1].values)[0]
    cosine_similarities = cosine_similarity([query_embedding.detach().numpy()], img_dataframe.iloc[:, :-1].values)[0]


    # Get the indices of top N similar vectors
    top_n_indices = cosine_similarities.argsort()[-top_n:][::-1]

    return top_n_indices

def generate_query_vector(csv_file, target_frame_name):
    with open(csv_file, newline='') as csvfile:
        reader = csv.reader(csvfile)
        header = next(reader)

        # Find the index of the frame_name in the header
        frame_name_index = header.index('Frame_Name')

        for row in reader:
            frame_name = row[0]
            if frame_name == target_frame_name:
                frame_data = list(map(float, row[1:]))  # Convert frame_data to float
                return frame_data

    return None


In [11]:
top_n_indices = semantic_search(query_embedding, df, top_n=10)
print(f"Indices of the top similar vectors: {top_n_indices}")

Indices of the top similar vectors: [1532 6279 6193 1534 3423 6530 5569 1724 6542 5573]


In [12]:
def get_images_from_indices(indices, img_dataframe):
    return img_dataframe.iloc[indices]['Frame_Name'].tolist()

# Retrieve the top N images
top_n_images = get_images_from_indices(top_n_indices, df)

print(f"Top 10 images: {top_n_images}")

Top 10 images: ['Biden_Zelenskyy_Address_UNs_General_Assembly_frame_4300', 'Johannesburg_Building_Fire_Kills_At_Least_73_People_frame_6375', 'Johannesburg_Building_Fire_Kills_At_Least_73_People_frame_4225', 'Biden_Zelenskyy_Address_UNs_General_Assembly_frame_4350', 'Former_Proud_Boys_Leader_Enrique_Tarrio_To_Be_Sentenced_Today_frame_3925', 'Manhunt_Continues_For_Suspect_In_Maine_Shootings_frame_0025', 'Israel_Gives_1M_Civilians_In_Northern_Gaza_24_Hours_To_Evacuate_frame_0200', 'Blinken_Announces_1_Billion_In_New_Aid_To_Ukraine_frame_3375', 'Manhunt_Continues_For_Suspect_In_Maine_Shootings_frame_0325', 'Israel_Gives_1M_Civilians_In_Northern_Gaza_24_Hours_To_Evacuate_frame_0300']
