Overall Steps

In [None]:
# Get hugging face dataset - Done
# Resize all images to 448 by 448 - Done
# For each image, split into a grid of 16 x 16 pixel patches - the sequence of patches represents the image - Done
# Convert each patch to 1 dimensional vector - this will be the pre embedding of the patch - Done
# Pass pre embedding to embedding layer to get the embedding of the patch 
# This will input to the encoder 

# Tokenise each caption in the dataset - Done
# Create two copies of each tokenised caption, 
#  - one with start of sentence token at the start - input to decoder
#  - one with end of sentence token at the end - label for the loss function 

In [18]:
import ast  # For converting string representation of list to list
import math

import numpy as np
import pandas as pd
import sentencepiece as spm
import torch
import torch.nn as nn
from PIL import Image
from tqdm import tqdm


Split up image into a grid of 16 x 16 pixel patches and convert to a list of 1D arrays

In [6]:

# Sample image used to try out the patch extraction in this cell.
# The path is relative, so it is resolved against the notebook's working directory.
image_path = 'flickr30k/flickr30k-images/testImage.jpg'


def convertImageTo1DPatchesVectorList(image_path, image_size=448, patch_size=16):
    """Load an image and cut it into flattened, non-overlapping square patches.

    The image is converted to RGB, resized to ``image_size`` x ``image_size``
    pixels and then walked in row-major order (left to right, top to bottom)
    in ``patch_size`` x ``patch_size`` tiles. Each tile is flattened to 1D.

    Args:
        image_path: Path (or file object) passed to ``PIL.Image.open``.
        image_size: Side length, in pixels, the image is resized to.
        patch_size: Side length, in pixels, of each square patch. Must divide
            ``image_size`` exactly so that every patch has the same length.

    Returns:
        A list of ``(image_size // patch_size) ** 2`` 1D NumPy arrays, each of
        length ``patch_size * patch_size * 3``. With the defaults this is
        784 vectors of length 768.

    Raises:
        ValueError: If either size is not positive, or ``patch_size`` does not
            divide ``image_size``.
    """
    if image_size <= 0 or patch_size <= 0 or image_size % patch_size != 0:
        raise ValueError(
            f"patch_size ({patch_size}) must be a positive divisor of "
            f"image_size ({image_size})"
        )

    # The context manager guarantees the underlying file handle is released.
    with Image.open(image_path) as image:
        # Force three channels before resizing: grayscale, RGBA or palette
        # images would otherwise produce patch vectors of a different length
        # than the patch_size * patch_size * 3 the rest of the pipeline expects.
        resized_image = image.convert("RGB").resize((image_size, image_size))

        # Convert the image to a numpy array for easier manipulation
        image_np = np.array(resized_image)

    # Collect one flattened vector per patch, scanning rows first.
    patch_vectors = []
    for y in range(0, image_size, patch_size):
        for x in range(0, image_size, patch_size):
            patch = image_np[y:y + patch_size, x:x + patch_size]
            patch_vectors.append(patch.flatten())

    return patch_vectors

# Extract the flattened patches for the sample image.
patch_vectors = convertImageTo1DPatchesVectorList(image_path)

# A 448 x 448 image cut into 16 x 16 patches gives 28 * 28 = 784 patches.
print(f"Total patches extracted: {len(patch_vectors)}")



Total patches extracted: 784


SentencePiece subword encoding

In [7]:
# Tokenise the captions 
# Create the copies (one for the decoder, the second for the loss function label)


def encode_sentences(csv_path, sp_model_path):
    """Encode every caption listed in an annotations CSV into token ids.

    Each CSV row is expected to carry an 'img_id' column and a 'raw' column
    whose value is the string representation of a list of caption strings.

    Args:
        csv_path: Path of the annotations CSV file.
        sp_model_path: Path of the SentencePiece model file.

    Returns:
        A DataFrame with one row per CSV row and two columns: 'img_id' and
        'encoded_sentences', the latter holding a list of token-id lists
        (one inner list per caption, no padding applied).
    """
    tokenizer = spm.SentencePieceProcessor(model_file=sp_model_path)
    annotations = pd.read_csv(csv_path)

    image_ids = []
    token_ids_per_image = []

    for _, annotation in annotations.iterrows():
        # The 'raw' cell stores a Python list literal; parse it back into a list.
        captions = ast.literal_eval(annotation['raw'])

        token_ids_per_image.append(
            [tokenizer.encode(caption, out_type=int) for caption in captions]
        )
        image_ids.append(annotation['img_id'])

    # Assemble the two parallel lists into a frame for easy handling/viewing.
    return pd.DataFrame({'img_id': image_ids, 'encoded_sentences': token_ids_per_image})

# Usage example: encode every caption in the annotations CSV with the SentencePiece model file.
csv_path = 'flickr_annotations_30k.csv'
sp_model_path = 'spm.model'
# The result is a DataFrame with one row per CSV row and the columns 'img_id' and
# 'encoded_sentences'. Each 'encoded_sentences' cell is a list of token-id lists,
# one per caption. No padding is applied here, so the inner lists can differ in length.
encoded_captions = encode_sentences(csv_path, sp_model_path)

31014


Create caption copies - one for the decoder input and one for the loss label

In [17]:
# Number of rows in the encoded DataFrame.
print(len(encoded_captions))
# Token-id lists for every caption of the row with index label 1.
print((encoded_captions["encoded_sentences"][1]))

31014
[[332, 351, 40, 7, 336, 297, 17, 1438, 4, 905, 4167, 2906, 5], [1066, 195, 44, 73, 56, 345, 9, 4, 330, 13, 471, 5], [19, 40, 125, 9, 4, 430, 21, 336, 297, 5], [164, 40, 9, 114, 13, 4, 457, 552, 5], [65, 50, 40, 9, 4, 63, 4932, 5]]


In [None]:
# Add the SOS token to the start of each tokenised caption (decoder input) and the EOS token to the end of each tokenised caption (label)

Positional Encoding

In [None]:
class PositionalEncoding(nn.Module):
    """Adds fixed sinusoidal positional encodings to a batch of embeddings.

    Args:
        d_model: Size of the embedding (last) dimension. Odd values are
            supported.
        max_length: Number of positions to precompute. Inputs passed to
            ``forward`` must not have more positions than this.
    """

    def __init__(self, d_model, max_length):
        super().__init__()
        self.pos_encoding = self.create_pos_encoding(d_model, max_length)

    def create_pos_encoding(self, d_model, max_length):
        """Build the sinusoidal table of shape (1, max_length, d_model).

        Even columns hold ``sin(position * frequency)`` and odd columns hold
        ``cos(position * frequency)``, with frequencies decaying geometrically
        from 1 towards 1/10000 across the embedding dimension.

        Returns:
            A frozen ``nn.Parameter`` (``requires_grad=False``) holding the table.
        """
        position = torch.arange(max_length).unsqueeze(1)
        div_term = torch.exp(torch.arange(0, d_model, 2) * -(math.log(10000.0) / d_model))
        angles = position * div_term

        pe = torch.zeros(max_length, d_model)
        pe[:, 0::2] = torch.sin(angles)
        # With an odd d_model there is one fewer odd column than even columns,
        # so the last frequency is dropped to keep the shapes aligned.
        pe[:, 1::2] = torch.cos(angles[:, : d_model // 2])

        # Leading singleton axis lets the table broadcast over the batch.
        pe = pe.unsqueeze(0)
        return nn.Parameter(pe, requires_grad=False)

    def forward(self, embeddings):
        """Return ``embeddings`` with the positional table added.

        Args:
            embeddings: Tensor whose dimension 1 is the sequence axis and whose
                remaining shape broadcasts against (1, seq_len, d_model).

        Returns:
            ``embeddings`` plus the first ``seq_len`` rows of the table.
        """
        # Use embeddings to determine sequence length and to get device information
        seq_len = embeddings.size(1)
        pos_encoding = self.pos_encoding[:, :seq_len, :].to(embeddings.device)
        return embeddings + pos_encoding