In [1]:
import torch
from torch import Tensor
from torch import nn
from torch.utils.data import DataLoader, Dataset
from torchvision import datasets, transforms
from sklearn.model_selection import train_test_split 
import matplotlib.pyplot as plt
import pandas as pd
import numpy as np
import random
import timeit
from tqdm import tqdm
import transformers

# Handling Data for Training


In [2]:
import pickle

def load_pickle_to_df(path):
    with open(path, 'rb') as f:
        data = pickle.load(f)
    return pd.DataFrame(data)

train_df = load_pickle_to_df('../data/df_train.pkl')
val_df = load_pickle_to_df('../data/df_val.pkl')
test_df = load_pickle_to_df('../data/df_test.pkl')

In [3]:
'''
Is it better to split data up into new rows by the multiple captions or would it be more helpful to a function to iterate over the list inside the df for when creating batches? 
Is this second, harder method actually going to save me from having to convert an image into a tensor -> embedding multiple times over?
Verdict: For now will convert into 5x dataframe for each so that I can just iterate over each value in training. 
'''

#Honestly I could have turned this into a function, but i'm lazy so I just copy-paste this two more times
train_copy = train_df.copy()
separated_rows = []
for _,row in train_copy.iterrows():
    # print(row['caption'],"\n\n")
    for idx in range(0,len(row['caption'])):#iterate over caption idx and use it to create new rows
         #make copy and modify caption, then append that to separated_rows list 
        # print(row['caption'][idx])
        dupe = row.copy()
        dupe['caption'] = row['caption'][idx]
        dupe['sentids'] = row['sentids'][idx]
        separated_rows.append(dupe)

sep_train_df = pd.DataFrame(separated_rows)
sep_train_df.reset_index(drop=True, inplace=True)
sep_train_df.head()

Unnamed: 0,image,caption,sentids,split,img_id,filename
0,<PIL.JpegImagePlugin.JpegImageFile image mode=...,Two young guys with shaggy hair look at their ...,0,train,0,1000092795.jpg
1,<PIL.JpegImagePlugin.JpegImageFile image mode=...,"Two young, White males are outside near many b...",1,train,0,1000092795.jpg
2,<PIL.JpegImagePlugin.JpegImageFile image mode=...,Two men in green shirts are standing in a yard.,2,train,0,1000092795.jpg
3,<PIL.JpegImagePlugin.JpegImageFile image mode=...,A man in a blue shirt standing in a garden.,3,train,0,1000092795.jpg
4,<PIL.JpegImagePlugin.JpegImageFile image mode=...,Two friends enjoy time spent together.,4,train,0,1000092795.jpg


In [4]:
val_copy = val_df.copy()
separated_rows = []
for _,row in val_copy.iterrows():
    # print(row['caption'],"\n\n")
    for idx in range(0,len(row['caption'])):#iterate over caption idx and use it to create new rows
         #make copy and modify caption, then append that to separated_rows list 
        # print(row['caption'][idx])
        dupe = row.copy()
        dupe['caption'] = row['caption'][idx]
        dupe['sentids'] = row['sentids'][idx]
        separated_rows.append(dupe)

sep_val_df = pd.DataFrame(separated_rows)
sep_val_df.reset_index(drop=True, inplace=True)

test_copy = test_df.copy()
separated_rows = []
for _,row in test_copy.iterrows():
    # print(row['caption'],"\n\n")
    for idx in range(0,len(row['caption'])):#iterate over caption idx and use it to create new rows
         #make copy and modify caption, then append that to separated_rows list 
        # print(row['caption'][idx])
        dupe = row.copy()
        dupe['caption'] = row['caption'][idx]
        dupe['sentids'] = row['sentids'][idx]
        separated_rows.append(dupe)

sep_test_df = pd.DataFrame(separated_rows)
sep_test_df.reset_index(drop=True, inplace=True)

#fix the dataframes
train_df = sep_train_df
val_df = sep_val_df
test_df = sep_test_df


# Import Models for Image and Text Encoding


In [None]:
device = 'cuda'
clip_tokenizer = transformers.CLIPTokenizer.from_pretrained('openai/clip-vit-base-patch32')
vision_transformer = transformers.ViTModel.from_pretrained('google/vit-base-patch16-224-in21k').to(device)

AttributeError: CLIPTokenizer has no attribute to

In [6]:
params = lambda m: sum(p.numel() for p in m.parameters())
print(f"CLIP: \n {clip_tokenizer}")
print(f"ViT: \n {vision_transformer}")


CLIP: 
 CLIPTokenizer(name_or_path='openai/clip-vit-base-patch32', vocab_size=49408, model_max_length=77, is_fast=False, padding_side='right', truncation_side='right', special_tokens={'bos_token': '<|startoftext|>', 'eos_token': '<|endoftext|>', 'unk_token': '<|endoftext|>', 'pad_token': '<|endoftext|>'}, clean_up_tokenization_spaces=False, added_tokens_decoder={
	49406: AddedToken("<|startoftext|>", rstrip=False, lstrip=False, single_word=False, normalized=True, special=True),
	49407: AddedToken("<|endoftext|>", rstrip=False, lstrip=False, single_word=False, normalized=False, special=True),
}
)
ViT: 
 ViTModel(
  (embeddings): ViTEmbeddings(
    (patch_embeddings): ViTPatchEmbeddings(
      (projection): Conv2d(3, 768, kernel_size=(16, 16), stride=(16, 16))
    )
    (dropout): Dropout(p=0.0, inplace=False)
  )
  (encoder): ViTEncoder(
    (layer): ModuleList(
      (0-11): 12 x ViTLayer(
        (attention): ViTAttention(
          (attention): ViTSelfAttention(
            (query): 

In [7]:
import sentencepiece as spm
# text_transformer = transformers.ViTModel.from_pretrained('google/sentencepiece')
sp = spm.SentencePieceProcessor(model_file='ext_model/test_model.model')
sp.encode('This is a test.')

[284, 47, 11, 4, 15, 400, 6]

# Model Definition


In [8]:
from PIL import Image

import torch
from torch.utils.data import Dataset

class ImageCaptionDataset(Dataset):
    def __init__(self, dataframe, clip_model, feature_extractor, vision_transformer, device='cuda' if torch.cuda.is_available() else 'cpu'):
        self.df = dataframe
        self.clip_model = clip_model
        self.feature_extractor = feature_extractor
        self.vision_transformer = vision_transformer
        self.device = device

    def __len__(self):
        return len(self.df)

    def __getitem__(self, idx):
        row = self.df.iloc[idx]
        image = row['image']      # Already a PIL.Image object
        caption = row['caption']  # Text string

        # Get image patch embeddings using ViT
        image_feats = self.get_image_patches(image)

        # Get text token IDs using CLIP tokenizer
        inputs = self.clip_model.processor(text=caption, return_tensors="pt", padding=True).to(self.device)
        caption_ids = clip_tokenizer(caption, return_tensors="pt")["input_ids"].squeeze(0)  # Shape: (seq_len,)

        return image_feats, caption_ids

    def get_image_patches(self, image):
        inputs = self.feature_extractor(images=image, return_tensors="pt").to(self.device)
        with torch.no_grad():
            outputs = self.vision_transformer(**inputs)
        return outputs.last_hidden_state.squeeze(0)  # shape: (num_patches + 1, hidden_dim)



class CaptionDecoder(nn.Module):
    def __init__(self, vocab_size, d_model, nhead, num_layers, dim_feedforward=2048, dropout=0.1):
        super().__init__()
        self.embedding = nn.Embedding(vocab_size, d_model)
        self.positional_encoding = nn.Parameter(torch.zeros(1, 512, d_model))  # max length 512

        decoder_layer = nn.TransformerDecoderLayer(d_model, nhead, dim_feedforward, dropout)
        self.decoder = nn.TransformerDecoder(decoder_layer, num_layers)

        self.fc_out = nn.Linear(d_model, vocab_size)

    def forward(self, tgt, memory):
        """
        tgt: (T, B) token ids
        memory: (S, B, d_model) image features
        """
        tgt_emb = self.embedding(tgt) + self.positional_encoding[:, :tgt.size(0)]
        output = self.decoder(tgt_emb, memory)
        return self.fc_out(output)

In [9]:
import torch
import torch.nn as nn
import torch.nn.functional as F
import math

class PositionalEncoding(nn.Module):
    def __init__(self, d_model, max_len=512):
        super().__init__()
        pe = torch.zeros(max_len, d_model)
        pos = torch.arange(0, max_len, dtype=torch.float).unsqueeze(1)
        div_term = torch.exp(torch.arange(0, d_model, 2).float() * (-math.log(10000.0) / d_model))
        pe[:, 0::2] = torch.sin(pos * div_term)
        pe[:, 1::2] = torch.cos(pos * div_term)
        self.register_buffer('pe', pe.unsqueeze(1))  # shape: (max_len, 1, d_model)

    def forward(self, x):  # x: (batch, seq_len, d_model)
        return x + self.pe[:x.size(1)].transpose(0, 1)  # (1, seq_len, d_model)


In [10]:
class MaskedSelfAttention(nn.Module):
    def __init__(self, d_model, n_heads):
        super().__init__()
        assert d_model % n_heads == 0
        self.head_dim = d_model // n_heads
        self.n_heads = n_heads

        self.qkv_proj = nn.Linear(d_model, 3 * d_model)
        self.out_proj = nn.Linear(d_model, d_model)

    def forward(self, x):
        # x: (batch, seq_len, d_model)
        B, T, D = x.size()
        H = self.n_heads
        qkv = self.qkv_proj(x)  # (B, T, 3*D)
        qkv = qkv.view(B, T, H, 3 * self.head_dim)
        q, k, v = qkv.chunk(3, dim=-1)  # each: (B, T, H, head_dim)

        # Rearrange to (B, H, T, head_dim)
        q = q.permute(0, 2, 1, 3)
        k = k.permute(0, 2, 1, 3)
        v = v.permute(0, 2, 1, 3)

        attn_scores = torch.matmul(q, k.transpose(-2, -1)) / math.sqrt(self.head_dim)  # (B, H, T, T)

        # Causal mask
        mask = torch.tril(torch.ones(T, T, device=x.device)).unsqueeze(0).unsqueeze(0)  # (1, 1, T, T)
        attn_scores = attn_scores.masked_fill(mask == 0, float('-inf'))

        attn = F.softmax(attn_scores, dim=-1)
        out = torch.matmul(attn, v)  # (B, H, T, head_dim)

        out = out.permute(0, 2, 1, 3).contiguous().view(B, T, D)  # (B, T, D)
        return self.out_proj(out)


In [11]:
class CustomDecoderBlock(nn.Module):
    def __init__(self, d_model, n_heads, dim_ff):
        super().__init__()
        self.self_attn = MaskedSelfAttention(d_model, n_heads)
        self.norm1 = nn.LayerNorm(d_model)
        self.ff = nn.Sequential(
            nn.Linear(d_model, dim_ff),
            nn.ReLU(),
            nn.Linear(dim_ff, d_model)
        )
        self.norm2 = nn.LayerNorm(d_model)

    def forward(self, x):
        # Self-attention
        attn_out = self.self_attn(x)
        x = self.norm1(x + attn_out)

        # Feed-forward
        ff_out = self.ff(x)
        x = self.norm2(x + ff_out)
        return x


In [12]:
class CustomDecoder(nn.Module):
    def __init__(self, vocab_size, d_model=768, n_heads=8, dim_ff=2048, num_layers=3, max_len=512):
        super().__init__()
        self.embedding = nn.Embedding(vocab_size, d_model)
        self.pos_enc = PositionalEncoding(d_model, max_len)
        self.layers = nn.ModuleList([
            CustomDecoderBlock(d_model, n_heads, dim_ff) for _ in range(num_layers)
        ])
        self.output_proj = nn.Linear(d_model, vocab_size)

    def forward(self, input_ids):
        # input_ids: (batch, seq_len)
        x = self.embedding(input_ids)      # (batch, seq_len, d_model)
        x = self.pos_enc(x)                # (batch, seq_len, d_model)
    
        for layer in self.layers:
            x = layer(x)                   # assuming layer supports (batch, seq_len, d_model)
    
        logits = self.output_proj(x)       # (batch, seq_len, vocab_size)
        return F.log_softmax(logits, dim=-1)


In [13]:
from torch.nn.utils.rnn import pad_sequence

def collate_fn(batch):
    image_feats, captions = zip(*batch)

    # Stack image features: shape -> (batch_size, num_patches+1, hidden_dim)
    image_feats = torch.stack(image_feats)

    # Pad captions
    captions_padded = pad_sequence(captions, batch_first=True, padding_value=0)

    return image_feats, captions_padded

In [None]:
from transformers import ViTFeatureExtractor
feature_extractor = ViTFeatureExtractor.from_pretrained('google/vit-base-patch16-224-in21k')
device = 'cuda' if torch.cuda.is_available() else 'cpu'
# print(device)
dataset = ImageCaptionDataset(
    dataframe=train_df,
    clip_model=clip_tokenizer,
    feature_extractor=feature_extractor,
    vision_transformer=vision_transformer,
    device=device
)

dataloader = DataLoader(
    dataset,
    batch_size=8,
    shuffle=True,
    collate_fn=collate_fn
)



AttributeError: 'ViTFeatureExtractor' object has no attribute 'to'

In [15]:
# dataloader
for image_feats, caption_tokens in dataloader:
    print(image_feats.shape)       # (batch_size, num_patches+1, hidden_dim)
    print(caption_tokens.shape)    # (batch_size, max_caption_len)
    break

RuntimeError: Input type (torch.cuda.FloatTensor) and weight type (torch.FloatTensor) should be the same

In [18]:
import torch.nn as nn
from tqdm import tqdm

# Setup
device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')

vocab_size = clip_tokenizer.vocab_size
model = CustomDecoder(vocab_size=vocab_size)
model = model.to(device)
optimizer = torch.optim.Adam(model.parameters(), lr=1e-4)
criterion = nn.CrossEntropyLoss(ignore_index=0)  # padding index is 0

num_epochs = 5


for epoch in range(num_epochs):
    model.train()
    epoch_loss = 0

    for image_feats, captions in tqdm(dataloader):
        # Move to device
        image_feats = image_feats.to(device)
        captions = captions.to(device)
 
        print(image_feats.device, captions.device, next(model.parameters()).device)
        # break
        
        # Prepare inputs and targets
        inputs = captions[:, :-1]      # remove <end> token
        targets = captions[:, 1:]      # shift left

        # Forward pass
        outputs = model(image_feats, inputs)  # shape: (B, seq_len, vocab_size)

        # Compute loss
        outputs = outputs.view(-1, outputs.size(-1))    # (B * seq_len, vocab_size)
        targets = targets.reshape(-1)                   # (B * seq_len)
        loss = criterion(outputs, targets)

        # Backprop + optimize
        optimizer.zero_grad()
        loss.backward()
        optimizer.step()

        epoch_loss += loss.item()

    print(f"Epoch {epoch+1}: Loss = {epoch_loss:.4f}")


  0%|          | 0/18125 [00:00<?, ?it/s]


RuntimeError: Input type (torch.cuda.FloatTensor) and weight type (torch.FloatTensor) should be the same

In [None]:
def generate_caption(model, image_feats, tokenizer, max_len=20):
    model.eval()
    image_feats = image_feats.unsqueeze(0).to(device)  # add batch dim

    caption_ids = [tokenizer.bos_token_id]
    for _ in range(max_len):
        input_ids = torch.tensor(caption_ids, dtype=torch.long).unsqueeze(0).to(device)
        with torch.no_grad():
            outputs = model(image_feats, input_ids)
        next_token_logits = outputs[0, -1]  # last time step
        next_token = next_token_logits.argmax().item()
        if next_token == tokenizer.eos_token_id:
            break
        caption_ids.append(next_token)

    return tokenizer.decode(caption_ids, skip_special_tokens=True)
