In [None]:
# Prerequisites (already set up): a conda environment running Python 3.12.9,
# with all required packages installed via "pip install" - see the pip freeze output at the bottom.

In [None]:
# Warning control
#import warnings
#warnings.filterwarnings('ignore')

In [None]:
# Fine-tune a Vision Transformer (encoder) plus GPT (decoder) into an image captioning system,
# using this Kaggle dataset: https://www.kaggle.com/datasets/adityajn105/flickr8k

from transformers import ViTModel
from transformers import AutoFeatureExtractor #which converts images into tensors

from transformers import VisionEncoderDecoderModel, GPT2TokenizerFast, AutoFeatureExtractor, \
                         AutoTokenizer, TrainingArguments, Trainer

from sklearn.feature_extraction.text import CountVectorizer

from PIL import Image #pip install Pillow
import os

import matplotlib.pyplot as plt
import numpy as np

from datasets import Dataset

import torch
import numpy as np
from torchvision.transforms import RandomResizedCrop, Compose, Normalize, ToTensor, Resize
import pandas as pd

import requests
from io import BytesIO

# The feature extractor turns a PIL image into the pixel array the ViT encoder consumes.
feature_extractor = AutoFeatureExtractor.from_pretrained("google/vit-base-patch16-224-in21k")

# Forward slashes work on Windows as well as POSIX, and avoid the invalid "\k" escape
# sequence that the backslash spelling of this path produces inside a normal string literal.
img = Image.open('./kobe.jpeg')
display(img)

# Run the extractor once and reuse the result for both the shape check and the plot.
pixel_values = feature_extractor(img).pixel_values[0]
print(pixel_values.shape)
# (3,224,224) # 3 due to color, 224 due to feature_extractor resized the image

plt.imshow(pixel_values.transpose(1, 2, 0))
# the extracted image is (3,224,224), i.e. channels first
# .transpose(1, 2, 0) moves the color dimension (index 0, as it is 0-based) to the end,
# .. giving the (height, width, channels) layout that imshow expects

In [None]:
# Show the feature extractor's repr to inspect its preprocessing settings
# (its size, image_mean and image_std attributes are read again in later cells).
feature_extractor

In [None]:
# Assemble the captioning model from two pretrained halves: a Vision Transformer encoder
# and a GPT-2 style decoder. Many weights start out random - namely the cross-attention
# weights that connect the two - so they have to be learned during fine-tuning.
model = VisionEncoderDecoderModel.from_encoder_decoder_pretrained(
    # encoder: 224 is the image size, "in" stands for ImageNet, 21k is the dataset variant
    'google/vit-base-patch16-224-in21k',
    # decoder: a better/newer gemma3:1b or gemma3:4b was considered
    # (https://huggingface.co/docs/transformers/en/model_doc/gemma3)
    # .. but rejected, because even this small distilgpt2 already took hours to train below
    'distilgpt2',
)
# The call above needed this install (per https://pytorch.org/get-started/locally/):
# pip3 install torch torchvision torchaudio --index-url https://download.pytorch.org/whl/cu118

# From a practical point of view nowadays, this (and others) will do much better at the task here:
# https://ollama.com/x/llama3.2-vision

# Confirm which concrete classes ended up as the encoder and the decoder
print(type(model.encoder), type(model.decoder), sep="\n")

# Count every scalar weight across both halves
total_params = sum(weights.numel() for weights in model.parameters())
print(f"Our model has a combined {total_params:,} parameters")

# Tokenizer matching the decoder checkpoint
gpt2_tokenizer = GPT2TokenizerFast.from_pretrained('distilgpt2')

In [None]:
# Display the full architecture; note the "patch embeddings" in the output below,
# versus the word embeddings of a text-based transformer.
model

In [None]:
# Built with os.path.join so the folder resolves on both Windows and POSIX; a literal
# backslash path is only a valid relative path on Windows.
IMAGES_PATH = os.path.join('.', 'flicker_images')
# Captions whose length in characters falls outside [MIN_CAPTION, MAX_CAPTION] are dropped.
MIN_CAPTION, MAX_CAPTION = 10, 50

def load_captions_data(filename):
    """Parse a Flickr8k-style token file into caption data.

    Every non-blank line is expected to hold an image name with a ``#<caption_number>``
    suffix, then a tab character, then the caption text. Each image appears once per caption.

    Args:
        filename: Path of the token file to read (decoded as UTF-8).

    Returns:
        A ``(caption_mapping, text_data)`` tuple where ``caption_mapping`` maps
        ``os.path.join(IMAGES_PATH, image_name)`` to the list of captions kept for that image,
        and ``text_data`` is the flat list of all kept captions in file order.

    Raises:
        ValueError: If a non-blank line contains no tab separator.
    """
    caption_mapping = {}
    text_data = []

    # Explicit encoding: the default used by open() depends on the platform/locale.
    with open(filename, encoding="utf-8") as caption_file:
        for line_number, line in enumerate(caption_file, start=1):
            line = line.rstrip("\n")
            if not line.strip():
                # Blank lines (for example at the end of the file) carry no data.
                continue

            # Split on the first tab only, so a caption that itself contains a tab stays intact.
            img_name, separator, caption = line.partition("\t")
            if not separator:
                raise ValueError(
                    f"{filename}, line {line_number}: expected '<image>#<n><TAB><caption>', got {line!r}"
                )

            # Drop the `#(caption_number)` suffix and anchor the name in the image folder.
            img_name = img_name.split("#")[0]
            img_name = os.path.join(IMAGES_PATH, img_name.strip())

            if not img_name.endswith("jpg"):
                continue

            # Remove the detached " ." the captions end with, then trim whitespace.
            caption = caption.replace(' .', '').strip()
            if len(caption) < MIN_CAPTION or len(caption) > MAX_CAPTION:
                continue

            text_data.append(caption)
            caption_mapping.setdefault(img_name, []).append(caption)

    return caption_mapping, text_data

# Load the dataset
# captions_mapping: image path -> list of the captions kept for that image
# text_data: flat list of every kept caption
captions_mapping, text_data = load_captions_data("./Flickr8k.token.txt")

In [None]:
# Preview the first three images: each one carries multiple captions, and all of them are considered correct
list(captions_mapping.items())[:3]

In [None]:
# Normalize pixel values with the mean/std stored on the feature extractor,
# so that inputs look like the images seen during pre-training.
normalize = Normalize(feature_extractor.image_mean, feature_extractor.image_std)

# Training-time pipeline, applied in order:
#   1. data augmentation - randomly crop the image, then resize it to the extractor's height x width
#   2. convert to a pytorch tensor
#   3. normalize
_transforms = Compose([
    RandomResizedCrop((feature_extractor.size['height'], feature_extractor.size['width'])),
    ToTensor(),
    normalize,
])

In [None]:
# One row per (image, caption) pair: every caption of an image is used,
# which goes together with the data augmentation applied to the images.
rows = [
    {'path': image_path, 'caption': text}
    for image_path, image_captions in captions_mapping.items()
    for text in image_captions
]

# Tabular view first, then the Hugging Face dataset built from it
image_df = pd.DataFrame(rows)
image_dataset = Dataset.from_pandas(image_df)

In [None]:
# Sanity check: (number of rows, number of columns), followed by the first few path/caption rows
print(image_df.shape)
print(image_df.head())

In [None]:
# Debug aid only: check that the notebook runs from the expected folder
# and that the image folder holds what later cells rely on.
current_directory = os.getcwd()
#print("current directory is:", current_directory)

# Everything that sits in the working directory
entries = os.listdir(current_directory)
for entry in entries:
    print(entry)

# The first ten files of the image folder
entries = os.listdir("./flicker_images")
for entry in entries[:10]:
    print(entry)

# Report whether the sample image captioned near the end of the notebook is present
print(
    "yes, 1000268201_693b08cb0e.jpg, which will be used later for both fine-tuned and non-fine-tuned models, is in the folder"
    if "1000268201_693b08cb0e.jpg" in entries
    else "no, 1000268201_693b08cb0e.jpg, which will be used later for both fine-tuned and non-fine-tuned models, is NOT in the folder"
)

In [None]:
# For debugging purposes only. The code below is disabled: it is wrapped in a triple-quoted string, so none of it runs.
'''
print(image_dataset[0]) # image_dataset[0] is a dictionary with keys: 'path' and 'caption'
print(image_dataset[0]['path']) # image_dataset[0] is a dictionary with keys: 'path' and 'caption'

file_path = image_dataset[0]['path']

# Check if the file exists
if os.path.exists(file_path):
    # Check if the file is readable
    if os.access(file_path, os.R_OK):
        print(f"The file '{file_path}' exists and is readable.")
        # You can now safely open and read the file
        try:
            with open(file_path, 'rb') as file:
            #with open(file_path, 'r') as file:
                # Perform operations on the file (e.g., read its contents)
                contents = file.read()
                print("File contents:")
                print(contents)
        except IOError as e:
            print(f"An error occurred while reading the file: {e}")
    else:
        print(f"The file '{file_path}' exists but is not readable.")
else:
    print(f"The file '{file_path}' does not exist.")

tmp_imgimg = Image.open(image_dataset[0]['path'])
display(tmp_img)

normalize = Normalize(
    mean=feature_extractor.image_mean,
    std=feature_extractor.image_std
)
print(feature_extractor.image_mean)
_transforms = Compose(
    [
        RandomResizedCrop(size=(feature_extractor.size['height'], feature_extractor.size['width'])), # Data augmentation. Randomly crop the image, then resize to 224x224
        ToTensor(),                                  # Convert to pytorch tensor
        normalize                                    # normalize pixel values to look like images during pre-training
    ]
)
_transforms(Image.open(file_path))
'''

In [None]:
# Reuse the end-of-sequence token as the padding token; image_preprocess below pads with
# padding='max_length' (presumably the GPT-2 tokenizer has no pad token of its own - confirm).
gpt2_tokenizer.pad_token = gpt2_tokenizer.eos_token

def _load_pixel_values(path):
    """Open one image file, force it to RGB, and run the training transforms on it."""
    # The context manager closes the file handle that a bare Image.open() would leave open.
    with Image.open(path) as image_file:
        # `normalize` uses the extractor's per-channel mean/std (presumably three values),
        # so grayscale / palette / RGBA images are converted to RGB before ToTensor.
        return _transforms(image_file.convert('RGB'))


def image_preprocess(examples, max_length=10):
    """Turn a batch of path/caption rows into model inputs.

    Args:
        examples: Batch with a 'path' column (image file paths) and a 'caption' column
            (caption strings), as supplied by ``Dataset.map(..., batched=True)``.
        max_length: Number of tokens every caption is padded or truncated to.

    Returns:
        The same batch object with 'pixel_values' (transformed images) and 'labels'
        (caption token ids, padding replaced by -100) added, and with the 'path' and
        'caption' columns removed.
    """
    # ViT expects pixel_values instead of input_ids
    examples['pixel_values'] = [_load_pixel_values(path) for path in examples['path']]

    # We are padding tokens here instead of using a datacollator
    tokenized = gpt2_tokenizer(
        examples['caption'], padding='max_length', max_length=max_length, truncation=True
    )['input_ids']

    # The output captions. Pad positions become -100 so the "loss" is NOT calculated for them,
    # as padding is NOT part of the caption.
    # NOTE(review): the pad token is set to the eos token elsewhere in this notebook, so any
    # eos id present in the labels would be masked out as well.
    pad_id = gpt2_tokenizer.pad_token_id
    examples['labels'] = [
        [token_id if token_id != pad_id else -100 for token_id in token_ids]
        for token_ids in tokenized
    ]

    # delete unused keys
    del examples['path']
    del examples['caption']
    return examples

# Preprocess every row in batches (images -> pixel_values, captions -> labels)
image_dataset = image_dataset.map(image_preprocess, batched=True)

# Hold out 10% for evaluation. The seed is fixed so the train/test partition - and therefore
# the evaluation numbers - are the same on every run.
# NOTE(review): rows are (image, caption) pairs, so one image can land on both sides of the split.
image_dataset = image_dataset.train_test_split(test_size=0.1, seed=42)

image_dataset

In [None]:
# We set a pad token and a start token in our combined model to be the same as gpt2

# Padding: copy both the token string and its id from the GPT-2 tokenizer
model.config.pad_token = gpt2_tokenizer.pad_token
model.config.pad_token_id = gpt2_tokenizer.pad_token_id

# Decoder start: use the tokenizer's beginning-of-sequence token string and id
model.config.decoder_start_token = gpt2_tokenizer.bos_token
model.config.decoder_start_token_id = gpt2_tokenizer.bos_token_id

In [None]:
!pip show accelerate
!pip show transformers
import accelerate
accelerate.__version__

In [None]:
# Freeze the ViT encoder except for its last two layers: layers 10 and 11
# (0-based index, 12 layers in total). Parameters are visited in order, and the loop stops
# at the first one that belongs to layer 10, leaving it and everything after it trainable.
for param_name, weights in model.encoder.named_parameters():
    if 'encoder.layer.10' not in param_name:
        weights.requires_grad = False
    else:
        break
# Note: nothing in the gpt2 model is frozen, as the cross attention weights are all over the place in gpt2

training_args = TrainingArguments(
    # where checkpoints go; existing content of the directory is overwritten
    output_dir='./v1_image_captioning',
    overwrite_output_dir=True,
    # schedule: number of epochs and batch sizes for training / evaluation
    num_train_epochs=4,
    per_device_train_batch_size=64,
    per_device_eval_batch_size=64,
    # evaluate and save once per epoch, then reload the best checkpoint when training ends
    eval_strategy='epoch',
    save_strategy='epoch',
    load_best_model_at_end=True,
    # logging
    log_level='info',
    logging_steps=50,
)

trainer = Trainer(
    args=training_args,
    model=model,
    eval_dataset=image_dataset['test'],
    train_dataset=image_dataset['train'],
)

# Evaluation metrics before any fine-tuning has happened
trainer.evaluate()

In [None]:
# Run the fine-tuning configured in training_args (4 epochs, evaluated and saved once per epoch)
trainer.train()

In [None]:
# the loss decline is starting to slow down. This is a good indication that we may want to try training on more data

In [None]:
# Persist the trained model; the next cell reloads it from './v1_image_captioning'
# (presumably save_model() without arguments writes to the trainer's output_dir - confirm)
trainer.save_model()

In [None]:
# Reload the fine-tuned model, together with its config, from the training output folder
finetuned_model = VisionEncoderDecoderModel.from_pretrained('./v1_image_captioning')

# Inference pipeline: unlike the training transforms there is no random crop, only a plain
# resize to the extractor's height x width, which makes things easier for the model.
inference_transforms = Compose([
    Resize((feature_extractor.size['height'], feature_extractor.size['width'])),
    ToTensor(),
    normalize,
])

# a helper function to caption images from the web or a file path
def caption_image(m, path):
    """Generate candidate captions for one image and display the image.

    Args:
        m: Model whose ``generate`` method is called with the image tensor.
        path: Either an ``http://`` / ``https://`` URL or a local file path.

    Returns:
        A ``(caption_options, generated, image_matrix)`` tuple: the decoded caption strings,
        the raw output of ``m.generate``, and the transformed image with a leading batch dimension.

    Raises:
        requests.HTTPError: If ``path`` is a URL and the server answers with an error status.
    """
    # Only treat the argument as a URL when it starts with a web scheme; a plain substring
    # check would also match local paths that merely contain "http".
    if path.lower().startswith(('http://', 'https://')):
        response = requests.get(path, timeout=30)  # do not hang forever on a dead server
        response.raise_for_status()  # fail here instead of handing an error page to PIL
        source = BytesIO(response.content)
    else:
        source = path

    # Close the underlying file once decoded, and force RGB so that grayscale / palette / RGBA
    # images (common on the web) have the channels the normalization step is configured for.
    with Image.open(source) as opened:
        img = opened.convert('RGB')

    image_matrix = inference_transforms(img).unsqueeze(0) #.unsqueeze(0) to add a "batch" dimension in the beginning

    generated = m.generate(
        image_matrix,
        num_beams=5, #default is 1, setting to 5 is asking GPT to think ahead more on what words could be used in the future
        max_length=20,
        early_stopping=True,
        do_sample=True,
        top_k=10,
        num_return_sequences=5, #must be smaller or equal to num_beams, otherwise generate raises a ValueError
        pad_token_id=gpt2_tokenizer.eos_token_id
    )

    caption_options = [gpt2_tokenizer.decode(g, skip_special_tokens=True).strip() for g in generated]
    display(img)
    return caption_options, generated, image_matrix

In [None]:
# Caption a local photo with the fine-tuned model; the cell output is the list of 5 candidate captions
captions, generated, image_matrix = caption_image(finetuned_model, './kobe.jpeg')
captions

In [None]:
# Baseline for comparison: the same encoder/decoder checkpoints, joined but never fine-tuned
non_finetuned = VisionEncoderDecoderModel.from_encoder_decoder_pretrained('google/vit-base-patch16-224-in21k', 'distilgpt2')

# Caption the same local photo with the baseline model
captions, generated, image_matrix = caption_image(non_finetuned, './kobe.jpeg')
captions

In [None]:
# Fine-tuned model on an image taken from the Flickr image folder
captions, generated, image_matrix = caption_image(
    finetuned_model, './flicker_images/1000268201_693b08cb0e.jpg'
)
captions

In [None]:
# Non-fine-tuned baseline on the same Flickr image, for a side-by-side comparison
captions, generated, image_matrix = caption_image(
    non_finetuned, './flicker_images/1000268201_693b08cb0e.jpg'
)
captions

In [None]:
!pip freeze