Overall Steps

In [None]:
# Get the Hugging Face dataset
# For each image, split it into a grid of 16 x 16 pixel patches - the sequence of patches represents the image
# Flatten each patch into a 1-dimensional vector - this is the pre-embedding of the patch
# Pass the pre-embedding through an embedding layer to get the embedding of the patch
# The patch embeddings are the input to the encoder

# Tokenise each caption in the dataset 
# Create two copies of each tokenised caption:
#  - one with a start-of-sentence token at the start - input to the decoder
#  - one with an end-of-sentence token at the end - label for the loss function

Split the image into a grid of 16 x 16 pixel patches and convert each patch to a 1D array

In [3]:
from PIL import Image
import numpy as np

# Path to your image
image_path = 'flickr30k/flickr30k-images/testImage.jpg'

# Load the image.
# Image.open is lazy and keeps the underlying file open, so read it inside a
# context manager to release the handle deterministically. convert('RGB')
# forces the pixel data to load and returns an independent 3-channel image,
# which is what the 16*16*3 patch layout used by the patch extraction assumes
# (grayscale / palette / CMYK files would otherwise give a different layout).
with Image.open(image_path) as _opened_image:
    image = _opened_image.convert('RGB')

def convertImageTo1DPatchesVectorList(image, patch_size=16):
    """Split an image into square patches and flatten each patch to a 1D vector.

    The image is walked row by row, left to right, in ``patch_size`` steps.
    If the height or width is not a multiple of ``patch_size``, the bottom and
    right edges are zero-padded first so that every patch is complete and all
    returned vectors have the same length (ragged edge patches cannot be
    stacked into a single tensor for an embedding layer).

    Args:
        image: Anything ``np.asarray`` can convert to an array whose first two
            axes are (height, width), e.g. a PIL image or a numpy array. Any
            trailing axes (such as colour channels) are kept and flattened
            into each patch vector.
        patch_size: Side length of each square patch in pixels. Defaults to 16.

    Returns:
        A list of 1D numpy arrays, one per patch, each of length
        ``patch_size * patch_size * (product of trailing axes)``.

    Raises:
        ValueError: If ``patch_size`` is not a positive integer.
    """
    if patch_size <= 0:
        raise ValueError(f"patch_size must be a positive integer, got {patch_size}")

    # Convert the image to a numpy array for easier manipulation
    image_np = np.asarray(image)
    height, width = image_np.shape[0], image_np.shape[1]

    # Amount of padding needed to reach the next multiple of patch_size
    pad_bottom = (-height) % patch_size
    pad_right = (-width) % patch_size
    if pad_bottom or pad_right:
        # Pad only the two spatial axes; leave channel (or other) axes untouched
        pad_widths = [(0, pad_bottom), (0, pad_right)] + [(0, 0)] * (image_np.ndim - 2)
        image_np = np.pad(image_np, pad_widths, mode='constant')

    # Initialize a list to hold the 1D vector for each patch
    patch_vectors = []

    # Iterate over the (padded) image in patch_size x patch_size blocks
    for y in range(0, image_np.shape[0], patch_size):
        for x in range(0, image_np.shape[1], patch_size):
            patch = image_np[y:y + patch_size, x:x + patch_size]
            # flatten() copies, so each vector is independent of the image array
            patch_vectors.append(patch.flatten())

    return patch_vectors

# Split the loaded test image into flattened patches (one 1D vector per patch)
patch_vectors = convertImageTo1DPatchesVectorList(image)

# Report how many patches the image produced
print(f"Total patches extracted: {len(patch_vectors)}")



Total patches extracted: 768


SentencePiece subword encoding

In [7]:
# Tokenise the captions 
# Create the copies (one for the decoder, the second for the loss function label)
from tqdm import tqdm
import pandas as pd
import sentencepiece as spm
import ast  # For converting string representation of list to list

def encode_sentences(csv_path, sp_model_path):
    """Encode every caption listed in an annotations CSV into token ids.

    Args:
        csv_path: Path to a CSV file with an ``img_id`` column and a ``raw``
            column whose cells hold the string representation of a list of
            sentences.
        sp_model_path: Path to the SentencePiece model file used for encoding.

    Returns:
        A DataFrame with two columns: ``img_id`` (copied from the CSV) and
        ``encoded_sentences`` (for each row, a list containing one list of
        integer token ids per sentence).
    """
    tokenizer = spm.SentencePieceProcessor(model_file=sp_model_path)
    annotations = pd.read_csv(csv_path)

    # Collected column by column, then assembled into a frame at the end
    image_ids = []
    token_id_lists = []

    for _, record in annotations.iterrows():
        # The 'raw' cell is a stringified Python list; parse it back into a list
        captions = ast.literal_eval(record['raw'])

        token_id_lists.append(
            [tokenizer.encode(caption, out_type=int) for caption in captions]
        )
        image_ids.append(record['img_id'])

    return pd.DataFrame({'img_id': image_ids, 'encoded_sentences': token_id_lists})

# Usage example
# Annotations CSV; encode_sentences reads its 'img_id' and 'raw' columns
csv_path = 'flickr_annotations_30k.csv'
# SentencePiece model file used to tokenise the captions
sp_model_path = 'spm.model'
# One row per CSV row: the image id plus the token ids of each of its captions
encoded_df = encode_sentences(csv_path, sp_model_path)

31014it [00:08, 3734.85it/s]

   img_id                                  encoded_sentences
0       0  [[19, 29, 373, 14, 2280, 124, 195, 20, 74, 177...
1       1  [[332, 351, 40, 7, 336, 297, 17, 1438, 4, 905,...
2       2  [[6, 59, 7, 4, 100, 128, 11, 261, 56, 4, 380, ...
3       3  [[608, 7, 4, 32, 25, 10, 71, 11, 37, 9, 3093, ...
4       4  [[19, 40, 15, 54, 7, 4, 127, 25, 15, 54, 7, 4,...



