In [1]:
from imagebind_dataloader import ImageText_DataLoader, VideoText_DataLoader
import torch
from torch import nn
from config import EPOCHS, IMAGE_TRANSFORM, VIDEO_TRANSFORM, DEVICE
from torch.utils.data import DataLoader
from model import ImageBindModel, ModalityType

import torch.nn.functional as F
from tqdm import tqdm
import matplotlib.pyplot as plt

import clip



In [2]:
image_ds = ImageText_DataLoader(image_paths = "Data/Image_Data", transform = IMAGE_TRANSFORM)
image_dl = DataLoader(image_ds, batch_size=3, shuffle=True)

video_ds = VideoText_DataLoader(video_paths = "Data/Video_Data", transform = VIDEO_TRANSFORM)
video_dl = DataLoader(video_ds, batch_size=3, shuffle=True)

model = ImageBindModel()

In [29]:
from info_nce import InfoNCE, info_nce

loss_fn = InfoNCE()
batch_size, embedding_size = 32, 128
query = torch.randn(batch_size, embedding_size)
positive_key = torch.randn(batch_size, embedding_size)
output = loss_fn(query, positive_key)

In [11]:
for name, param in model.named_parameters():
    if param.requires_grad == False:
        print(name, param.requires_grad)

In [12]:
for name, param in model.named_parameters():
    if param.grad is not None:
        print(f"{name}: {param.grad.abs().mean()}")
        break

In [7]:
def initialize_model_weights(model):
    for name, param in model.named_parameters():
        if 'weight' in name:
            if param.dim() > 1:
                nn.init.xavier_uniform_(param)
            else:
                nn.init.uniform_(param, -0.01, 0.01)
        elif 'bias' in name:
            nn.init.constant_(param, 0.0)
    return model

model = initialize_model_weights(model)

In [None]:
for name, param in model.named_parameters():
    if param.requires_grad:
        print(f"{name}: grad={None if param.grad is None else param.grad.abs().mean()}")

In [32]:
model = ImageBindModel(out_embed_dim=128)

In [None]:
import torch
from torch.nn.functional import normalize, cross_entropy
from tqdm import tqdm

DEVICE = "cpu"

def contrastive_loss(x, y, temperature=0.07):
    """
    InfoNCE-style contrastive loss between x and y.
    """
    logits = (x @ y.T) / temperature
    labels = torch.arange(x.size(0)).to(x.device)
    loss_i2t = cross_entropy(logits, labels)
    loss_t2i = cross_entropy(logits.T, labels)
    return (loss_i2t + loss_t2i) / 2

optimizer = torch.optim.AdamW(model.parameters(), lr=5e-3, weight_decay=0.01)
loss_fn = nn.MSELoss()

# Training loop
for epoch in range(EPOCHS):
    model.train()
    total_loss = 0.0
    step = 0

    pbar = tqdm(image_dl, desc=f"Epoch {epoch+1}/{EPOCHS}")

    for image, text in pbar:
        # Move data to device
        image = image.to(DEVICE)
        text = text.to(DEVICE)

        # Forward pass
        inputs = {"vision": image, "text": text}
        embeddings = model(inputs)

        # Normalize embeddings
        img_emb = normalize(embeddings["vision"], dim=-1)
        txt_emb = normalize(embeddings["text"], dim=-1)

        # Compute contrastive loss
        loss = contrastive_loss(img_emb, txt_emb)
        print(loss)

        # Backpropagation
        optimizer.zero_grad()
        loss.backward()
        optimizer.step()

        # Logging
        total_loss += loss.item()
        step += 1
        pbar.set_postfix(loss=total_loss / step)

    print(f"Epoch {epoch+1}: Avg Loss = {total_loss / step:.4f}")

In [25]:
for name, param in model.named_parameters():
    if param.grad is not None:
        print(f"{name}: grad mean {param.grad.abs().mean():.6f}")

modality_preprocessors.vision.cls_tokens: grad mean 0.000000
modality_preprocessors.vision.rgbt_stem.proj.1.weight: grad mean 0.000001
modality_preprocessors.vision.pos_embed_helper.pos_embed: grad mean 0.000000
modality_preprocessors.text.pos_embed: grad mean 0.000002
modality_preprocessors.text.token_embedding.weight: grad mean 0.000000
modality_trunks.vision.pre_transformer_layer.0.weight: grad mean 0.000010
modality_trunks.vision.pre_transformer_layer.0.bias: grad mean 0.000050
modality_trunks.vision.blocks.0.attn.in_proj_weight: grad mean 0.000011
modality_trunks.vision.blocks.0.attn.in_proj_bias: grad mean 0.002267
modality_trunks.vision.blocks.0.attn.out_proj.weight: grad mean 0.000025
modality_trunks.vision.blocks.0.attn.out_proj.bias: grad mean 0.006823
modality_trunks.vision.blocks.0.norm1.weight: grad mean 0.000010
modality_trunks.vision.blocks.0.norm1.bias: grad mean 0.000058
modality_trunks.vision.blocks.0.mlp.fc1.weight: grad mean 0.000000
modality_trunks.vision.blocks.0.

In [14]:
output = model(inputs)
loss = loss_fn(output['vision'], output['text'])
loss.backward()

# Check gradients
for name, param in model.named_parameters():
    if param.grad is not None:
        print(f"✅ {name} has grad with mean {param.grad.mean().item():.5f}")
    else:
        print(f"❌ {name} has NO grad")

✅ modality_preprocessors.vision.cls_tokens has grad with mean 0.00000
✅ modality_preprocessors.vision.rgbt_stem.proj.1.weight has grad with mean -0.00000
✅ modality_preprocessors.vision.pos_embed_helper.pos_embed has grad with mean -0.00000
✅ modality_preprocessors.text.pos_embed has grad with mean 0.00000
✅ modality_preprocessors.text.token_embedding.weight has grad with mean 0.00000
✅ modality_trunks.vision.pre_transformer_layer.0.weight has grad with mean -0.00000
✅ modality_trunks.vision.pre_transformer_layer.0.bias has grad with mean 0.00000
✅ modality_trunks.vision.blocks.0.attn.in_proj_weight has grad with mean -0.00000
✅ modality_trunks.vision.blocks.0.attn.in_proj_bias has grad with mean -0.00000
✅ modality_trunks.vision.blocks.0.attn.out_proj.weight has grad with mean -0.00000
✅ modality_trunks.vision.blocks.0.attn.out_proj.bias has grad with mean 0.00000
✅ modality_trunks.vision.blocks.0.norm1.weight has grad with mean -0.00000
✅ modality_trunks.vision.blocks.0.norm1.bias ha

In [4]:
from preprocessing import load_and_transform_vision_data, load_and_transform_text

text_list = ["close up of a brown and white pet dog", "little kitten playing his toy mouse", "video of funny cat"]

images = load_and_transform_vision_data(image_paths = "Test Data/Image_Data", device = "cpu")
texts = load_and_transform_text(texts = text_list, device = "cpu")

print(f"Loaded Images shape: {images.shape}")
print(f"Loaded Texts shape: {texts.shape}")

Loaded Images shape: torch.Size([3, 3, 224, 224])
Loaded Texts shape: torch.Size([3, 77])


In [5]:
with torch.no_grad():
    embeddings = model(inputs)

In [6]:
embeddings[ModalityType.VISION] @ embeddings[ModalityType.TEXT].T

tensor([[1.3306, 1.3307, 1.3306],
        [1.3313, 1.3314, 1.3313],
        [1.3306, 1.3306, 1.3306]])

In [22]:
vision_x_text = torch.softmax(embeddings[ModalityType.VISION] @ embeddings[ModalityType.TEXT].T, dim=-1)
vision_x_text

tensor([[0.3333, 0.3333, 0.3333],
        [0.3333, 0.3333, 0.3333],
        [0.3333, 0.3333, 0.3333]])