In [1]:
from glob import glob
import pandas as pd
import numpy as np
import torch
import torch.nn as nn
import torch.optim as optim
from torch.utils.data import DataLoader
from sklearn.preprocessing import StandardScaler

In [2]:
def process_csv_files(embedding_folder, has_header=True, skip_first_column=False):
    csv_files = glob(embedding_folder + '/*.csv')
    all_embeddings = []

    for csv_file in csv_files:
                                                                    # Read the CSV file
        embeddings_df = pd.read_csv(csv_file, header=0 if has_header else None)
        
        if skip_first_column:                                       # Skip the first column for video csv files
            embeddings_df = embeddings_df.iloc[:, 1:]               # because it contains the name of video frame

    
        embeddings_array = embeddings_df.astype(np.float64).values  # Convert all elements to numpy.float64
                                                                    # Remove rows with NaN values
        embeddings_array = embeddings_array[~np.isnan(embeddings_array).any(axis=1)]
        all_embeddings.append(embeddings_array)                     # Append the NumPy array to the list
        

    # Concatenate all embeddings into a single NumPy array
    embeddings_array = np.concatenate(all_embeddings, axis=0)

    return embeddings_array

text_embedding_folder = 'DATASET-TextEmbedding'
vid_embedding_folder = 'DATASET-VideoEmbeddings'

text_embeddings = process_csv_files(text_embedding_folder, has_header=False)
video_embeddings = process_csv_files(vid_embedding_folder, has_header=True, skip_first_column=True)


ValueError: need at least one array to concatenate

In [52]:
print(text_embeddings)
print(video_embeddings)

[[-0.33789062  0.19824219 -0.296875   ... -0.15917969  0.03417969
   0.09179688]
 [-0.28710938  0.29296875 -0.04467773 ... -0.01049805 -0.25976562
   0.11083984]
 [-0.06933594  0.15332031 -0.02490234 ...  0.06054688 -0.19238281
   0.27148438]
 ...
 [-0.01116943  0.06738281  0.13867188 ...  0.18066406 -0.23730469
  -0.17578125]
 [-0.08251953  0.12988281  0.18945312 ...  0.18164062  0.03271484
  -0.09472656]
 [-0.0279541   0.03369141 -0.03027344 ...  0.13574219 -0.0004921
   0.26171875]]
[[0.96737421 0.94864553 0.93027443 ... 0.88816762 1.02650452 0.98235989]
 [0.97024786 0.97135627 0.97625911 ... 0.92484605 1.05267894 0.94103754]
 [0.95929432 0.97145921 0.97567111 ... 0.93477076 1.09334481 0.89381284]
 ...
 [0.87987137 0.9994846  1.00720835 ... 0.91209149 1.10154176 0.98179394]
 [0.86699724 1.04539263 0.90224159 ... 0.91217023 1.14227533 0.93422842]
 [0.8895027  0.96568727 0.93459833 ... 0.89999688 1.05757153 0.93650216]]


In [53]:
# Check data types in the video_embeddings array
print("Data Types in video_embeddings:", set(type(item) for row in video_embeddings for item in row))

Data Types in video_embeddings: {<class 'numpy.float64'>}


In [56]:
# Convert NumPy arrays to PyTorch tensors
text_embeddings_tensor = torch.tensor(text_embeddings, dtype=torch.float32)
video_embeddings_tensor = torch.tensor(video_embeddings, dtype=torch.float32)

# Normalize the embeddings using StandardScaler from scikit-learn
scaler_text = StandardScaler()
scaler_video = StandardScaler()

# Fit and transform each set of embeddings separately
text_embeddings_normalized = scaler_text.fit_transform(text_embeddings)
video_embeddings_normalized = scaler_video.fit_transform(video_embeddings)
# Find the minimum dimensionality among all sets of embeddings
min_dim = min(text_embeddings_normalized.shape[1], video_embeddings_normalized.shape[1])

# Trim embeddings to the minimum dimensionality
text_embeddings_normalized = text_embeddings_normalized[:, :min_dim]
video_embeddings_normalized = video_embeddings_normalized[:, :min_dim]

# Concatenate normalized embeddings
all_embeddings_normalized = np.concatenate([text_embeddings_normalized, video_embeddings_normalized], axis=0)

# Normalize the concatenated embeddings using a single StandardScaler from scikit-learn
scaler_all = StandardScaler()
all_embeddings_normalized = scaler_all.fit_transform(all_embeddings_normalized)

# Split normalized embeddings back into text, video, and subtitle embeddings
text_embeddings_normalized_tensor = torch.tensor(all_embeddings_normalized[:len(text_embeddings)], dtype=torch.float32)
video_embeddings_normalized_tensor = torch.tensor(all_embeddings_normalized[len(text_embeddings):len(text_embeddings)+len(video_embeddings)], dtype=torch.float32)

Check if dimensions of subtitle embeddings match the dimensions of video embeddings

In [59]:
text_embeddings_normalized_tensor.shape[1] == video_embeddings_normalized_tensor.shape[1] 

True

### Text encoder

In [None]:
class ProjectionHead(nn.Module):
    def __init__(self, input_dim, projection_dim, num_layers=2, dropout_rate=0.1):
        super(ProjectionHead, self).__init__()
        layers = []
        layers.append(nn.Linear(input_dim, projection_dim))
        for _ in range(num_layers):
            layers.extend([nn.ReLU(), nn.Linear(projection_dim, projection_dim), nn.Dropout(dropout_rate)])
        self.projection = nn.Sequential(*layers)

    def forward(self, x):
        return self.projection(x)

# Implement the vision encoder with CNN
class VisionEncoder(nn.Module):
    def __init__(self, input_channels, projection_dim):
        super(VisionEncoder, self).__init__()
        self.cnn = nn.Sequential(
            nn.Conv2d(input_channels, 64, kernel_size=3, stride=1, padding=1),
            nn.ReLU(),
            nn.MaxPool2d(kernel_size=2, stride=2),
            nn.Conv2d(64, 128, kernel_size=3, stride=1, padding=1),
            nn.ReLU(),
            nn.MaxPool2d(kernel_size=2, stride=2),
            nn.Conv2d(128, 256, kernel_size=3, stride=1, padding=1),
            nn.ReLU(),
            nn.MaxPool2d(kernel_size=2, stride=2)
        )
        self.flatten = nn.Flatten()
        self.projection_head = ProjectionHead(256 * 8 * 8, projection_dim)

    def forward(self, x):
        x = self.cnn(x)
        x = self.flatten(x)
        x = self.projection_head(x)
        return x

# Implement the text encoder
class TextEncoder(nn.Module):
    def __init__(self, input_dim, projection_dim):
        super(TextEncoder, self).__init__()
        self.projection_head = ProjectionHead(input_dim, projection_dim)

    def forward(self, x):
        x = self.projection_head(x)
        return x

In [None]:
# Instantiate VisionEncoder, TextEncoder, and ProjectionHead
vision_encoder = VisionEncoder(input_channels=3, projection_dim=64)
text_encoder = TextEncoder(input_dim=min_dim, projection_dim=64)

# Example usage of the encoders
video_embedding = vision_encoder(video_embeddings_normalized_tensor)
text_embedding = text_encoder(text_embeddings_normalized_tensor)