In [None]:
# Step 1: Import Libraries
import os
import math
import numpy as np
import pandas as pd
import torch
import torch.nn as nn
import torch.optim as optim
from torch.utils.data import Dataset, DataLoader
from torchvision import transforms, models
from PIL import Image
from nltk.translate.bleu_score import corpus_bleu
# from nltk.translate.meteor_score import single_meteor_score
from gensim.models import KeyedVectors
from torch.nn.utils.rnn import pad_sequence

from sklearn.model_selection import train_test_split
import matplotlib.pyplot as plt

import torchvision.datasets as datasets
import torchvision.transforms as transforms
import torchvision

from transformers import AutoTokenizer
os.environ["TOKENIZERS_PARALLELISM"] = "false"

from tqdm.notebook import trange, tqdm

from torch.distributions import Categorical

torch.backends.cuda.matmul.allow_tf32 = True

In [None]:
import os
import nltk
from nltk.corpus import wordnet

!unzip /usr/share/nltk_data/corpora/wordnet.zip -d /usr/share/nltk_data/corpora/


In [None]:
!pip install -U nltk rouge-score

In [None]:
from nltk.translate.bleu_score import sentence_bleu, SmoothingFunction
from nltk.translate.meteor_score import meteor_score
from rouge_score import rouge_scorer


In [None]:
!pip install transformers

In [None]:
from transformers import ViTModel, GPT2LMHeadModel, GPT2Config, VisionEncoderDecoderModel, ViTFeatureExtractor

# Step 2: Load Dataset

In [None]:
vit_model = ViTModel.from_pretrained("google/vit-base-patch16-224-in21k")

In [None]:
feature_extractor = ViTFeatureExtractor.from_pretrained("google/vit-base-patch16-224-in21k")

In [None]:
work_directory = "/kaggle/input/deep-learning-ic-dataset/"
data_path = os.path.join(work_directory, "captions.csv")
data = pd.read_csv(data_path)

In [None]:
from pathlib import Path
temp_directory = Path('../temp')
temp_directory.mkdir(exist_ok=True)

In [None]:
data.head(5)

In [None]:
data['filepath'] = data['filepath'].apply(lambda x: os.path.join(work_directory, x))

In [None]:
# Split into train and validation sets
train_data, val_data = train_test_split(data, test_size=0.2, random_state=42)

In [None]:
# Save the split datasets for easier access later (optional)
train_csv_path = os.path.join(temp_directory, "train_captions.csv")
val_csv_path = os.path.join(temp_directory, "val_captions.csv")

In [None]:
train_data.to_csv(train_csv_path, index=False)
val_data.to_csv(val_csv_path, index=False)

In [None]:
class CustomImageCaptionDataset(Dataset):
    """Dataset yielding ``(image, caption)`` pairs from a captions data frame.

    Args:
        data_frame: Table with a ``filepath`` column (image location) and a
            ``caption`` column (ground-truth text).
        transform: Optional callable applied to the loaded RGB ``PIL.Image``
            before it is returned. When ``None`` (the default) the PIL image
            is returned unchanged.
    """

    def __init__(self, data_frame, transform=None):
        self.data = data_frame
        # Previously this argument was accepted but silently dropped.
        self.transform = transform

    def __len__(self):
        """Return the number of rows (samples) in the data frame."""
        return len(self.data)

    def __getitem__(self, idx):
        """Load the image at positional row ``idx`` and return it with its caption."""
        row = self.data.iloc[idx]

        # The context manager releases the file handle once the pixels have
        # been decoded by ``convert``.
        with Image.open(row['filepath']) as handle:
            image = handle.convert('RGB')  # always hand back a 3-channel PIL.Image

        if self.transform is not None:
            image = self.transform(image)

        return image, row['caption']

In [None]:
image_size = 128
batch_size = 32

In [None]:
# Wrap each split in the caption dataset (images stay as raw PIL objects)
train_dataset = CustomImageCaptionDataset(data_frame=train_data)
val_dataset = CustomImageCaptionDataset(data_frame=val_data)

In [None]:
# Load the pre-built tokenizer of the DistilBERT (uncased) model.
# Background reading on BERT:
# https://towardsdatascience.com/bert-explained-state-of-the-art-language-model-for-nlp-f8b21a9b6270
# NOTE(review): `tokenizer` is rebound to a GPT-2 tokenizer in the later
# "tokenizer setup" cell, so this instance only serves the exploratory
# tokenization cells that run before that point.
tokenizer = AutoTokenizer.from_pretrained("distilbert-base-uncased")

In [None]:
def collate_fn(batch):
    """Collate ``(image, caption)`` samples without stacking or tokenizing.

    Args:
        batch: Sequence of ``(image, caption)`` pairs.

    Returns:
        ``(images, captions)`` where ``images`` is a tuple and ``captions`` is
        a list of raw strings, both in batch order.
    """
    # Nothing is tokenized or converted to tensors here; both are left to the
    # code that consumes the batch.
    images, captions = zip(*batch)
    return images, list(captions)

In [None]:
# Batch iterators: training batches are reshuffled, validation keeps its order.
# Both use collate_fn so images/captions arrive un-stacked.
data_loader_train = DataLoader(
    train_dataset,
    collate_fn=collate_fn,
    shuffle=True,
    batch_size=batch_size,
)
data_loader_val = DataLoader(
    val_dataset,
    collate_fn=collate_fn,
    shuffle=False,
    batch_size=batch_size,
)

In [None]:
dataiter = next(iter(data_loader_val))
test_images, test_captions = dataiter

In [None]:
# Convert one PIL image to a tensor using ViTFeatureExtractor
inputs = feature_extractor(images=[test_images[1]], return_tensors="pt")  # Process a single image

# Extract the image tensor (first and only item of the batch)
image_tensor = inputs["pixel_values"][0]  # presumably [C, H, W] — permuted to channels-last below

# The extractor may have mean/std-normalized the pixels, which pushes values
# outside the [0, 1] range that imshow expects for floats (it would clip them).
# Undo that normalization on a copy used only for display.
display_image = image_tensor
if getattr(feature_extractor, "do_normalize", False):
    pixel_mean = torch.tensor(feature_extractor.image_mean).view(-1, 1, 1)
    pixel_std = torch.tensor(feature_extractor.image_std).view(-1, 1, 1)
    display_image = image_tensor * pixel_std + pixel_mean
display_image = display_image.clamp(0, 1)

# Visualize the image
plt.figure(figsize=(3, 3))
plt.imshow(display_image.permute(1, 2, 0).numpy())  # Permute to [H, W, C] for visualization
plt.show()

# Print the corresponding caption
caption = test_captions[1]
print(caption)

# Step 3: Start Modelling

In [None]:
tokenizer.vocab_size

In [None]:
tokens = tokenizer(test_captions, padding=True, truncation=True, return_tensors="pt")

In [None]:
tokens['attention_mask']

In [None]:
token_ids = tokens['input_ids'][0]
tokens['input_ids']

In [None]:
# Prepare a GPT-2 decoder configuration that includes cross-attention.

# Start from the stock "gpt2" configuration...
decoder_config = GPT2Config.from_pretrained("gpt2")
# ...and enable cross-attention. This is crucial: the decoder is later paired
# with the ViT encoder in a VisionEncoderDecoderModel.
# NOTE(review): the added cross-attention weights are presumably not part of
# the pretrained "gpt2" checkpoint and start untrained — confirm.
decoder_config.add_cross_attention = True  # This is crucial
# Load the pretrained GPT-2 language-model head with the modified config.
gpt2_decoder = GPT2LMHeadModel.from_pretrained("gpt2", config=decoder_config)


In [None]:
model = VisionEncoderDecoderModel(encoder=vit_model, decoder=gpt2_decoder)


## Tokenizer Setup

In [None]:
from transformers import GPT2TokenizerFast


# Replace the earlier tokenizer with GPT-2's own so that caption token ids
# come from the same "gpt2" checkpoint as the decoder.
tokenizer = GPT2TokenizerFast.from_pretrained("gpt2")
tokenizer.pad_token = tokenizer.eos_token  # GPT-2 doesn't have a real pad token, use eos

# Configure model start, end and padding tokens.

# Use BOS as the decoder start token when the tokenizer defines one, else fall back to EOS.
model.config.decoder_start_token_id = tokenizer.bos_token_id if tokenizer.bos_token_id is not None else tokenizer.eos_token_id
model.config.eos_token_id = tokenizer.eos_token_id
# Because pad_token was set to eos_token above, pad and EOS share one id.
model.config.pad_token_id = tokenizer.pad_token_id


## Training Loop

In [None]:
import torch
import torch.optim as optim
from tqdm.notebook import trange, tqdm
from nltk.translate.bleu_score import sentence_bleu, SmoothingFunction
from nltk.translate.meteor_score import meteor_score
from rouge_score import rouge_scorer

# Run on the GPU when one is available, otherwise on the CPU.
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
model.to(device)

# Upper bound on the number of epochs (early stopping may end training sooner).
nepochs = 50
# Per-epoch histories, appended to by the training loop and read by the plots.
training_loss_logger = []
eval_loss_logger = []
eval_bleu_logger = []
eval_meteor_logger = []
eval_rouge_logger = []  # one dict per epoch with 'rouge1' / 'rouge2' / 'rougeL'

# Token cap used when tokenizing validation captions and when generating
# predictions during evaluation.
max_length = 25  # Adjust if needed

In [None]:
# Define the EarlyStopping Class
class EarlyStopping:
    """Track a validation loss and signal when it has stopped improving.

    Each call compares the new loss with the best one seen so far. An
    improvement checkpoints the model and resets the patience counter;
    anything else increments the counter, and ``early_stop`` becomes ``True``
    once it reaches ``patience``.

    Args:
        patience: Number of consecutive non-improving calls tolerated.
        verbose: Print a message on every counter increment and checkpoint.
        delta: Minimum decrease of the loss that counts as an improvement.
        path: File that receives the best model's ``state_dict``.
    """

    def __init__(self, patience=2, verbose=False, delta=0.0, path='best_model.pt'):
        self.patience = patience
        self.verbose = verbose
        self.delta = delta
        self.counter = 0
        self.best_loss = None
        self.early_stop = False
        self.path = path
        # Not used by this class; kept so the attribute set stays unchanged.
        self.best_model_state = None

    def __call__(self, current_loss, model):
        """Record ``current_loss`` and checkpoint ``model`` if it improved."""
        if math.isnan(current_loss):
            # NaN compares False against everything, so the plain `>` test
            # used to treat a diverged epoch as an improvement, overwrite
            # best_loss with NaN and checkpoint the broken model.
            improved = False
        elif self.best_loss is None:
            improved = True
        else:
            improved = current_loss <= self.best_loss - self.delta

        if improved:
            self.best_loss = current_loss
            self.save_checkpoint(model)
            self.counter = 0
            return

        self.counter += 1
        if self.verbose:
            print(f'EarlyStopping counter: {self.counter} out of {self.patience}')
        if self.counter >= self.patience:
            self.early_stop = True

    def save_checkpoint(self, model):
        """Saves model when validation loss decreases."""
        torch.save(model.state_dict(), self.path)
        if self.verbose:
            print(f'Validation loss decreased. Saving model to {self.path}')

In [None]:
# Initialize Early Stopping with model checkpoint path
early_stopping = EarlyStopping(patience=5, verbose=True, path='best_model.pt')

In [None]:
import warnings

# Hide all warnings
warnings.filterwarnings('ignore')

# OR, to hide only a specific warning message, you can do something like:
# warnings.filterwarnings('ignore', message="The attention mask and the pad token id were not set.*")

# Rest of your code

In [None]:
# Total number of scalar weights across every parameter tensor of the model
num_model_params = sum(weights.numel() for weights in model.parameters())

# "Juta" is Indonesian for "million"; the second figure is the count in whole millions.
print("Number of Model Parameters : %d or >%d Juta Params!" % (num_model_params, num_model_params//1e6))

In [None]:

rouge = rouge_scorer.RougeScorer(['rouge1', 'rouge2', 'rougeL'], use_stemmer=True)
# Built once here instead of once per caption pair inside the metric loop.
smoothing_fn = SmoothingFunction().method1

optimizer = optim.Adam(model.parameters(), lr=1e-4)

for epoch in trange(0, nepochs, leave=False, desc="Epoch"):
    # ---------------- Training phase ----------------
    model.train()
    epoch_train_loss = 0.0
    num_train_batches = 0

    for images, captions in tqdm(data_loader_train, desc="Training", leave=False):
        # Preprocess images
        inputs = feature_extractor(images=images, return_tensors="pt").to(device)
        pixel_values = inputs["pixel_values"]

        # Tokenize captions with the same max_length cap as evaluation so the
        # training and validation losses are computed on equally truncated
        # targets (training previously had no cap).
        tokenized = tokenizer(
            captions,
            padding=True,
            truncation=True,
            max_length=max_length,
            return_tensors="pt",
        ).to(device)

        # Forward pass: the model computes the loss when labels are provided.
        # NOTE(review): the labels still contain padding ids (pad_token is set
        # to eos_token in the tokenizer setup cell), so padded positions
        # contribute to the loss. Masking them with -100 would presumably also
        # need an explicit EOS appended to each caption — confirm before changing.
        outputs = model(pixel_values=pixel_values, labels=tokenized.input_ids)
        loss = outputs.loss

        optimizer.zero_grad()
        loss.backward()
        torch.nn.utils.clip_grad_norm_(model.parameters(), max_norm=1.0)
        optimizer.step()

        epoch_train_loss += loss.item()
        num_train_batches += 1

    # Guarded like the eval average below so an empty loader cannot divide by zero.
    avg_train_loss = epoch_train_loss / num_train_batches if num_train_batches > 0 else 0.0
    training_loss_logger.append(avg_train_loss)

    # ---------------- Evaluation phase ----------------
    model.eval()
    epoch_eval_loss = 0.0
    num_eval_batches = 0
    bleu_scores = []
    meteor_scores = []
    rouge1_scores = []
    rouge2_scores = []
    rougel_scores = []

    with torch.no_grad():
        for images, captions in tqdm(data_loader_val, desc="Eval", leave=False):
            inputs = feature_extractor(images=images, return_tensors="pt").to(device)
            pixel_values = inputs["pixel_values"]

            tokenized = tokenizer(captions, padding=True, truncation=True, max_length=max_length, return_tensors="pt").to(device)

            # Compute validation loss
            outputs = model(pixel_values=pixel_values, labels=tokenized.input_ids)
            val_loss = outputs.loss.item()
            epoch_eval_loss += val_loss
            num_eval_batches += 1

            # Generate predictions
            pred_ids = model.generate(pixel_values=pixel_values, max_length=max_length)
            predicted_captions = tokenizer.batch_decode(pred_ids, skip_special_tokens=True)

            # Compute metrics, one (prediction, ground truth) pair at a time
            for pred_caption, gt_caption in zip(predicted_captions, captions):
                # BLEU and METEOR operate on lower-cased tokenizer tokens
                reference = [tokenizer.tokenize(gt_caption.lower())]
                hypothesis = tokenizer.tokenize(pred_caption.lower())

                # BLEU Score
                bleu = sentence_bleu(reference, hypothesis, smoothing_function=smoothing_fn)
                bleu_scores.append(bleu)

                # METEOR Score
                meteor = meteor_score(reference, hypothesis)
                meteor_scores.append(meteor)

                # ROUGE Scores (computed on the raw caption strings)
                scores = rouge.score(gt_caption, pred_caption)
                rouge1_scores.append(scores['rouge1'].fmeasure)
                rouge2_scores.append(scores['rouge2'].fmeasure)
                rougel_scores.append(scores['rougeL'].fmeasure)

    avg_eval_loss = epoch_eval_loss / num_eval_batches if num_eval_batches > 0 else 0.0
    eval_loss_logger.append(avg_eval_loss)

    avg_bleu_score = np.mean(bleu_scores) if bleu_scores else 0.0
    avg_meteor_score = np.mean(meteor_scores) if meteor_scores else 0.0
    avg_rouge1 = np.mean(rouge1_scores) if rouge1_scores else 0.0
    avg_rouge2 = np.mean(rouge2_scores) if rouge2_scores else 0.0
    avg_rougeL = np.mean(rougel_scores) if rougel_scores else 0.0

    eval_bleu_logger.append(avg_bleu_score)
    eval_meteor_logger.append(avg_meteor_score)
    eval_rouge_logger.append({'rouge1': avg_rouge1, 'rouge2': avg_rouge2, 'rougeL': avg_rougeL})

    print(f"Epoch {epoch + 1}/{nepochs} - "
          f"Avg Eval Loss: {avg_eval_loss:.4f} - "
          f"Avg BLEU: {avg_bleu_score:.4f} - "
          f"Avg Meteor: {avg_meteor_score:.4f} - "
          f"Avg Rouge1: {avg_rouge1:.4f}, Rouge2: {avg_rouge2:.4f}, RougeL: {avg_rougeL:.4f}")

    # Early Stopping Check
    early_stopping(avg_eval_loss, model)
    if early_stopping.early_stop:
        print("Early stopping triggered. Restoring the best model.")
        # Reload from the path the stopper actually wrote to rather than a
        # hard-coded file name that could drift from its `path` argument.
        model.load_state_dict(torch.load(early_stopping.path))
        break


In [None]:
# Save the model's state dict
torch.save(model.state_dict(), "caption_model_state_dict.pth")

# Save model and optimizer state dicts together with training progress
torch.save({
    # Epochs actually completed: one training-loss entry is logged per epoch,
    # and early stopping can end training before `nepochs` is reached.
    'epoch': len(training_loss_logger),
    'model_state_dict': model.state_dict(),
    'optimizer_state_dict': optimizer.state_dict(),
    # Last logged average training loss (None if no epoch has run)
    'loss': training_loss_logger[-1] if training_loss_logger else None,
}, "checkpoint.pth")

In [None]:
from IPython.display import FileLink
FileLink(r'caption_model_state_dict.pth')


In [None]:
# Just check the lengths of the logs
print(len(training_loss_logger), len(eval_loss_logger), len(eval_bleu_logger))

In [None]:
import numpy as np
import matplotlib.pyplot as plt

# Epoch-level loss histories recorded by the training loop
avg_training_loss_logger = training_loss_logger
avg_eval_loss_logger = eval_loss_logger

# Common y-range covering both curves
y_min = min(min(avg_training_loss_logger), min(avg_eval_loss_logger))
y_max = max(max(avg_training_loss_logger), max(avg_eval_loss_logger))
# 11 evenly spaced ticks from y_min to y_max. linspace replaces
# arange(y_min, y_max + step, step), whose computed step is zero (invalid)
# when all values are equal and whose tick count is sensitive to rounding.
y_ticks = np.linspace(y_min, y_max, 11)

# Create the plot
plt.figure(figsize=(10, 5))

# Plot average training loss per epoch (x axis starts at epoch 1)
plt.plot(range(1, len(avg_training_loss_logger) + 1), avg_training_loss_logger,
         label="Training Loss", color='tab:blue')

# Plot average evaluation loss per epoch
plt.plot(range(1, len(avg_eval_loss_logger) + 1), avg_eval_loss_logger,
         label="Evaluation Loss", color='tab:orange')

# Apply custom y-ticks
plt.yticks(y_ticks)

# Add labels, title, legend, and grid
plt.title("Training and Evaluation Loss per Epoch")
plt.xlabel("Epochs")
plt.ylabel("Loss")
plt.legend()
plt.grid(True)

# Show the plot
plt.show()


In [None]:
# y-range of the per-epoch BLEU averages
y_min, y_max = min(eval_bleu_logger), max(eval_bleu_logger)
# 11 evenly spaced ticks. linspace replaces arange with a computed step,
# which is zero (invalid) when only one epoch was logged or all scores match.
y_ticks = np.linspace(y_min, y_max, 11)

_ = plt.figure(figsize=(10, 5))
_ = plt.plot(eval_bleu_logger[:])
_ = plt.title("Bleu Score")
_ = plt.xlabel("Epochs")
_ = plt.ylabel("BLEU")

# Apply custom y-ticks
_ = plt.yticks(y_ticks)
plt.show()

In [None]:
import numpy as np
import matplotlib.pyplot as plt

# Common y-range covering the BLEU and METEOR histories
y_min = min(min(eval_bleu_logger), min(eval_meteor_logger))
y_max = max(max(eval_bleu_logger), max(eval_meteor_logger))
# 11 evenly spaced ticks. linspace replaces arange with a computed step,
# which is zero (invalid) when every logged value is identical.
y_ticks = np.linspace(y_min, y_max, 11)

# Create the plot
_ = plt.figure(figsize=(10, 5))

# Plot per-epoch BLEU
_ = plt.plot(eval_bleu_logger, label="Bleu", color='tab:blue')

# Plot per-epoch METEOR
_ = plt.plot(eval_meteor_logger, label="Meteor", color='tab:orange')

# Apply custom y-ticks
_ = plt.yticks(y_ticks)

# Add labels, title, legend, and grid
_ = plt.title("Bleu and Meteor Value")
_ = plt.xlabel("Epochs")
# These curves are metric scores, not losses (label was copied from the loss plot)
_ = plt.ylabel("Score")
_ = plt.legend()
_ = plt.grid(True)

# Show the plot
plt.show()


In [None]:
import numpy as np
import matplotlib.pyplot as plt

# Split the per-epoch ROUGE dicts into one series per variant
rouge1_scores = [entry['rouge1'] for entry in eval_rouge_logger]
rouge2_scores = [entry['rouge2'] for entry in eval_rouge_logger]
rougeL_scores = [entry['rougeL'] for entry in eval_rouge_logger]

# Common y-range across the three series
all_rouge_scores = rouge1_scores + rouge2_scores + rougeL_scores
y_min, y_max = min(all_rouge_scores), max(all_rouge_scores)
# 11 evenly spaced ticks. linspace replaces arange with a computed step,
# which is zero (invalid) when every logged value is identical.
y_ticks = np.linspace(y_min, y_max, 11)

# Create the plot
_ = plt.figure(figsize=(10, 5))

# Plot ROUGE scores
_ = plt.plot(rouge1_scores, label="ROUGE-1", color='tab:green')
_ = plt.plot(rouge2_scores, label="ROUGE-2", color='tab:red')
_ = plt.plot(rougeL_scores, label="ROUGE-L", color='tab:purple')

# Apply custom y-ticks
_ = plt.yticks(y_ticks)

# Add labels, title, legend, and grid
_ = plt.title("ROUGE Scores Across Epochs")
_ = plt.xlabel("Epochs")
_ = plt.ylabel("ROUGE Score")
_ = plt.legend()
_ = plt.grid(True)

# Show the plot
plt.show()

In [None]:
import numpy as np
import matplotlib.pyplot as plt

# Split the per-epoch ROUGE dicts into one series per variant
rouge1_scores = [entry['rouge1'] for entry in eval_rouge_logger]
rouge2_scores = [entry['rouge2'] for entry in eval_rouge_logger]
rougeL_scores = [entry['rougeL'] for entry in eval_rouge_logger]

# Common y-range across every metric series
all_metrics = eval_bleu_logger + eval_meteor_logger + rouge1_scores + rouge2_scores + rougeL_scores
y_min, y_max = min(all_metrics), max(all_metrics)
# 11 evenly spaced ticks. linspace replaces arange with a computed step,
# which is zero (invalid) when every logged value is identical.
y_ticks = np.linspace(y_min, y_max, 11)

# Create the plot
_ = plt.figure(figsize=(12, 6))

# Plot BLEU and Meteor scores
_ = plt.plot(eval_bleu_logger, label="BLEU", color='tab:blue')
_ = plt.plot(eval_meteor_logger, label="Meteor", color='tab:orange')

# Plot ROUGE scores
_ = plt.plot(rouge1_scores, label="ROUGE-1", color='tab:green')
_ = plt.plot(rouge2_scores, label="ROUGE-2", color='tab:red')
_ = plt.plot(rougeL_scores, label="ROUGE-L", color='tab:purple')

# Apply custom y-ticks
_ = plt.yticks(y_ticks)

# Add labels, title, legend, and grid
_ = plt.title("Evaluation Metrics Across Epochs")
_ = plt.xlabel("Epochs")
_ = plt.ylabel("Metric Value")
_ = plt.legend()
_ = plt.grid(True)

# Show the plot
plt.show()

In [None]:
# For inference on a new image: take the first validation batch
dataiter = next(iter(data_loader_val))
test_images, test_captions = dataiter

index = 2
# Preprocess with the same feature extractor the training and evaluation
# loops use. The previous hand-written torchvision Resize/Normalize pipeline
# is not guaranteed to match the extractor's resizing and mean/std, which
# would feed the model inputs unlike those it was trained on.
test_image = feature_extractor(
    images=[test_images[index]], return_tensors="pt"
)["pixel_values"]  # batch of one image

In [None]:
plt.figure(figsize=(3,3))
out = torchvision.utils.make_grid(test_image, 1, normalize=True)
_ = plt.imshow(out.permute(1, 2, 0).numpy())
print("Ground Truth Caption:", test_captions[index])



In [None]:
# Generate a caption for `test_image` with the trained model.
model.eval()
with torch.no_grad():
    # NOTE(review): `test_image` is prepared in an earlier cell; its resizing
    # and normalization should match what the feature extractor produced
    # during training — confirm.
    pixel_values = test_image.to(device)
    # Alternative kept for reference: build pixel_values through the feature extractor.
    # If using the feature extractor:
    # inputs = feature_extractor(images=[transforms.ToPILImage()(test_image.squeeze(0))], return_tensors="pt").to(device)
    # pixel_values = inputs["pixel_values"]

    # Generation is capped at 50 tokens here, whereas the evaluation loop
    # generates with `max_length` (25).
    generated_ids = model.generate(pixel_values=pixel_values, max_length=50)
    # Decode the first (only) sequence, dropping special tokens.
    pred_text = tokenizer.decode(generated_ids[0], skip_special_tokens=True)

print("Predicted Caption:", pred_text)


In [None]:
# Save the model if needed
torch.save(model.state_dict(), "/kaggle/working/ViT-GPT2_model.pth")