In [30]:
import os
import numpy as np
import tensorflow as tf
from tensorflow.keras.preprocessing.text import Tokenizer
from tensorflow.keras.preprocessing.sequence import pad_sequences
from tensorflow.keras.layers import Input, LSTM, Embedding, Dense, Add
from tensorflow.keras.models import Model
from tensorflow.keras.optimizers import Adam
from sklearn.model_selection import train_test_split
from transformers import ViTModel, ViTFeatureExtractor
from nltk.translate.bleu_score import corpus_bleu
from PIL import Image
import matplotlib.pyplot as plt
import torch
from tensorflow.keras.preprocessing.image import load_img, img_to_array

In [31]:
import tensorflow as tf

# Ask TensorFlow which physical GPU devices it can see and report how many.
gpu_devices = tf.config.experimental.list_physical_devices('GPU')
print("Num GPUs Available: ", len(gpu_devices))

Num GPUs Available:  1


In [32]:
# Function to load image paths and captions
def load_data(caption_file, image_dir):
    """Read a captions file into parallel lists of image paths and captions.

    Every non-blank line must have the form ``<image_filename><TAB><caption>``.
    Only the first tab separates the filename from the caption, so a caption
    may itself contain tab characters. Blank lines are skipped.

    Args:
        caption_file: Path of the text file to read (decoded as UTF-8).
        image_dir: Directory that is joined in front of every image filename.

    Returns:
        A tuple ``(image_paths, image_captions)`` of two equally long lists,
        in file order.

    Raises:
        ValueError: If a non-blank line contains no tab separator.
    """
    image_paths = []
    image_captions = []

    # Explicit encoding so the result does not depend on the platform locale.
    with open(caption_file, 'r', encoding='utf-8') as f:
        for line_no, raw_line in enumerate(f, start=1):
            line = raw_line.strip()
            if not line:
                # A trailing or stray blank line is not an error.
                continue

            # partition() splits on the FIRST tab only, so extra tabs stay in the caption.
            image_filename, separator, caption = line.partition('\t')
            if not separator:
                raise ValueError(
                    f"{caption_file}:{line_no}: expected '<image_filename>\\t<caption>', got {line!r}"
                )

            image_paths.append(os.path.join(image_dir, image_filename))  # Full path to the image
            image_captions.append(caption)

    return image_paths, image_captions

In [33]:
# Function to preprocess images
def preprocess_image(image_path, target_size=(224, 224)):
    """Load one image from disk and return it as a scaled array.

    Args:
        image_path: Location of the image file, passed to Keras ``load_img``.
        target_size: Size the image is resized to while loading.

    Returns:
        The array produced by ``img_to_array`` with every value divided by 255.
    """
    pixels = img_to_array(load_img(image_path, target_size=target_size))
    # Rescale the pixel values by 1/255.
    return pixels / 255.0

In [34]:
# Function to tokenize captions
def tokenize_captions(captions, max_vocab_size=10000, max_len=40):
    """Fit a Keras ``Tokenizer`` on the captions and return padded id sequences.

    The tokenizer's default ``filters`` remove ``<`` and ``>``, which would turn
    marker tokens such as ``<start>`` / ``<end>`` into the ordinary words
    ``start`` / ``end``. The filter set used here is the Keras default minus
    those two characters, so angle-bracket markers survive as distinct tokens.

    Args:
        captions: Iterable of caption strings.
        max_vocab_size: Passed as ``num_words``; rarer words are encoded as the
            out-of-vocabulary token ``<unk>``.
        max_len: Length every sequence is padded to (zeros are appended at the
            end). Longer sequences are cut by ``pad_sequences`` using its
            default truncation setting.

    Returns:
        ``(tokenizer, sequences)`` where ``sequences`` is the 2-D integer array
        returned by ``pad_sequences``.
    """
    # Keras default filters with '<' and '>' removed (see docstring).
    filters = '!"#$%&()*+,-./:;=?@[\\]^_`{|}~\t\n'
    tokenizer = Tokenizer(num_words=max_vocab_size, oov_token='<unk>', filters=filters)
    tokenizer.fit_on_texts(captions)
    sequences = tokenizer.texts_to_sequences(captions)
    sequences = pad_sequences(sequences, maxlen=max_len, padding='post')
    return tokenizer, sequences

In [35]:
# Function to build the image captioning model
def build_model(vit_model, vocab_size, max_len=40, embedding_dim=256, lstm_units=512):
    """Build and compile the ViT + LSTM captioning model.

    The model takes an image and a caption prefix of ``max_len - 1`` token ids
    and predicts, for every position of the prefix, a distribution over the
    next token. Its output therefore has shape
    ``(batch, max_len - 1, vocab_size)``, which matches integer targets of
    shape ``(batch, max_len - 1)`` under ``sparse_categorical_crossentropy``.

    Args:
        vit_model: Callable image encoder. It is called on the symbolic image
            input and element ``[0]`` of its result is averaged over axis 1.
            NOTE(review): it must therefore accept a Keras symbolic tensor of
            shape (batch, 224, 224, 3) and return (batch, tokens, hidden) as
            its first element -- confirm for the encoder actually used. The
            notebook's recorded ``'KerasTensor' object has no attribute 'to'``
            error suggests a non-Keras model was passed at that point.
        vocab_size: Size of the embedding table and of the softmax output.
        max_len: Padded caption length; the caption input has ``max_len - 1``
            columns.
        embedding_dim: Width of the token embedding.
        lstm_units: Width of the LSTM and of the projected image features.

    Returns:
        A compiled ``tf.keras`` ``Model`` with inputs ``[image, caption]``.

    Raises:
        ValueError: If ``max_len`` is smaller than 2 (no caption positions).
    """
    if max_len < 2:
        raise ValueError(f"max_len must be at least 2, got {max_len}")
    steps = max_len - 1  # number of caption positions fed to / predicted by the model

    # Define the input layers
    image_input = Input(shape=(224, 224, 3))  # Image shape for ViT
    caption_input = Input(shape=(steps,))  # Caption sequence input

    # Image feature extraction with ViT (Vision Transformer)
    vit_output = vit_model(image_input)[0]  # expected (batch_size, sequence_length, hidden_dim)
    image_features = tf.reduce_mean(vit_output, axis=1)  # Average over the token axis

    # Add() needs both operands to have the same shape, and the encoder's hidden
    # width is not guaranteed to equal lstm_units, so project explicitly ...
    image_features = Dense(lstm_units, activation='relu')(image_features)
    # ... and repeat the image vector once per caption position.
    image_features = tf.keras.layers.RepeatVector(steps)(image_features)

    # Caption sequence processing (embedding and LSTM).
    # return_sequences=True keeps one output per position so the model can be
    # trained against per-position next-token targets.
    embedding = Embedding(input_dim=vocab_size, output_dim=embedding_dim)(caption_input)
    caption_features = LSTM(lstm_units, return_sequences=True)(embedding)

    # Combine the image features and caption features: (batch, steps, lstm_units)
    combined = Add()([image_features, caption_features])

    # Output layer (next-token distribution at every position)
    output = Dense(vocab_size, activation='softmax')(combined)

    # Define the model
    model = Model(inputs=[image_input, caption_input], outputs=output)
    model.compile(optimizer='adam', loss='sparse_categorical_crossentropy', metrics=['accuracy'])

    return model

In [36]:
# Path to captions file and images directory
caption_file = "Images/captions.txt"  # Path to captions.txt
image_dir = "Images/"  # Directory containing images

# Load the image paths and captions
image_paths, captions = load_data(caption_file, image_dir)
if not captions:
    # Fail with a readable message instead of max() complaining about an empty sequence.
    raise ValueError(f"No captions were loaded from {caption_file}")

# Preprocess images
image_data = np.array([preprocess_image(img_path) for img_path in image_paths])

# generate_caption seeds decoding with '<start>' and stops on '<end>', so the
# training captions must contain those markers. `captions` itself is left
# untouched so the raw text stays available (e.g. as evaluation references).
captions_marked = [f"<start> {caption} <end>" for caption in captions]

# +1 leaves at least one trailing padding position after the longest caption.
max_len = max(len(caption.split()) for caption in captions_marked) + 1

# Tokenize captions
tokenizer, sequences = tokenize_captions(captions_marked, max_len=max_len)

In [37]:
# Split data into training, validation, and testing sets:
# first hold out 20% for testing, then a quarter of the remainder for validation.
X_train_img, X_test_img, y_train, y_test = train_test_split(
    image_data, sequences, test_size=0.2, random_state=42
)
X_train_img, X_val_img, y_train, y_val = train_test_split(
    X_train_img, y_train, test_size=0.25, random_state=42
)

# Width of the shifted sequences (one column shorter than the padded captions)
target_width = max_len - 1

# Decoder inputs drop the last column; targets drop the first column.
# The reshape keeps the targets as a 2-D (samples, max_len - 1) array.
X_train_text = y_train[:, :-1]
y_train_target = y_train[:, 1:].reshape(-1, target_width)

X_val_text = y_val[:, :-1]
y_val_target = y_val[:, 1:].reshape(-1, target_width)

X_test_text = y_test[:, :-1]
y_test_target = y_test[:, 1:].reshape(-1, target_width)

In [41]:
import tensorflow_hub as hub

# Identifiers of the pre-trained assets used by this cell
VIT_HUB_URL = "https://tfhub.dev/google/vit_base_patch16_224/1"
VIT_PROCESSOR_NAME = "google/vit-base-patch16-224-in21k"

# Load pre-trained ViT model from TensorFlow Hub
# NOTE(review): the recorded output of this cell is an ImportError raised while
# importing tensorflow_hub (tf_keras missing / TensorFlow version mismatch), so
# this cell did not complete in the recorded run -- fix the environment first.
vit_model = hub.load(VIT_HUB_URL)

# NOTE(review): the feature extractor is not referenced by any other cell
# visible in this notebook -- confirm whether it is still needed.
vit_feature_extractor = ViTFeatureExtractor.from_pretrained(VIT_PROCESSOR_NAME)



Failed to import tf_keras. Please note that tf_keras is not installed by default when you install tensorflow_hub. This is so that users can decide which tf_keras package to use. To use tensorflow_hub, please install a current version of tf_keras.




ImportError: cannot import name 'check_pinned' from 'tensorflow.python.ops.gen_experimental_dataset_ops' (/Users/prihandana/Developer/Deep Learning/Unsplash_Image_Scrapper/Unsplash_Image_Captioning/Scrapping/.venv/lib/python3.9/site-packages/tensorflow/python/ops/gen_experimental_dataset_ops.py)

In [39]:
# Build the image captioning model
vocab_size = len(tokenizer.word_index) + 1  # +1 because index 0 is reserved for padding
# Pass the data-derived max_len: build_model otherwise falls back to its default
# of 40, and the caption input would not match the max_len - 1 columns of X_*_text.
model = build_model(vit_model, vocab_size, max_len=max_len)

AttributeError: 'KerasTensor' object has no attribute 'to'

In [None]:
# Train the model on (image, caption prefix) pairs and keep the per-epoch history
train_inputs = [X_train_img, X_train_text]
val_inputs = [X_val_img, X_val_text]

history = model.fit(
    train_inputs,
    y_train_target,
    validation_data=(val_inputs, y_val_target),
    epochs=10,
    batch_size=32,
)

In [None]:
# Shared implementation for the accuracy and loss curves
def _plot_history_metric(history, metric, label):
    """Plot one metric from a Keras ``History`` object over the epochs.

    Args:
        history: Object whose ``history`` attribute maps metric names to
            per-epoch value lists (as returned by ``model.fit``).
        metric: Key of the training series, e.g. ``'accuracy'``; the validation
            series is looked up under ``'val_' + metric``.
        label: Human-readable metric name used for the legend, axis and title.
    """
    plt.figure(figsize=(8, 6))
    plt.plot(history.history[metric], label=f'Training {label}')

    # The validation series only exists when fit() was given validation data;
    # skip it instead of failing with a KeyError.
    val_key = f'val_{metric}'
    if val_key in history.history:
        plt.plot(history.history[val_key], label=f'Validation {label}')

    plt.xlabel('Epochs')
    plt.ylabel(label)
    plt.title(f'Training and Validation {label}')
    plt.legend()
    plt.show()


# Plot Training and Validation Accuracy
def plot_accuracy(history):
    """Show the training (and, if recorded, validation) accuracy per epoch."""
    _plot_history_metric(history, 'accuracy', 'Accuracy')


# Plot Training and Validation Loss
def plot_loss(history):
    """Show the training (and, if recorded, validation) loss per epoch."""
    _plot_history_metric(history, 'loss', 'Loss')

In [None]:
# Plot the accuracy curves recorded in `history` by model.fit
plot_accuracy(history)


In [None]:
# Plot the loss curves recorded in `history` by model.fit
plot_loss(history)

In [None]:
# Save the trained model to a single HDF5 file.
# NOTE(review): whether the ViT encoder is serialised correctly in this format
# depends on how vit_model was created -- verify by reloading the file.
MODEL_SAVE_PATH = "image_captioning_with_vit_model.h5"
model.save(MODEL_SAVE_PATH)

In [None]:
def generate_caption(model, tokenizer, image_path, max_len=max_len):
    """Greedily decode a caption for a single image.

    Decoding starts from the ``<start>`` marker in position 0 of the caption
    input and appends the most likely next token one position at a time. It
    stops when the model predicts the ``<end>`` marker, predicts padding
    (id 0), or the caption input is full.

    Args:
        model: Captioning model taking ``[image_batch, caption_ids]``. Both a
            last-step output ``(1, vocab)`` and a per-position output
            ``(1, max_len - 1, vocab)`` are supported.
        tokenizer: Fitted Keras ``Tokenizer`` used for the training captions.
        image_path: Path of the image, passed to ``preprocess_image``.
        max_len: Padded caption length used in training; the caption input has
            ``max_len - 1`` positions. The default is the value the global
            ``max_len`` had when this function was defined.

    Returns:
        The generated words joined by single spaces (possibly an empty string).
    """
    steps = max_len - 1
    if steps < 1:
        return ''

    image = preprocess_image(image_path)
    image = np.expand_dims(image, axis=0)  # Add batch dimension

    # Encode the markers through the tokenizer so they receive exactly the same
    # lower-casing/filtering as during training, then compare by id. A string
    # comparison against '<end>' can never match when the tokenizer strips
    # the angle brackets.
    oov_idx = tokenizer.word_index.get(tokenizer.oov_token)
    end_ids = tokenizer.texts_to_sequences(['<end>'])[0]
    # If '<end>' is unknown to the tokenizer it encodes as the OOV id; stopping
    # on that id would end the caption at the first unknown word, so ignore it.
    end_idx = end_ids[0] if end_ids and end_ids[0] != oov_idx else None

    # Start with the <start> token in position 0, zero-padded to the input width
    sequence = tokenizer.texts_to_sequences(['<start>'])
    sequence = pad_sequences(sequence, maxlen=steps, padding='post')

    caption = []
    for i in range(steps):
        pred = np.asarray(model.predict([image, sequence], verbose=0))
        # Per-position models: output i is the prediction that follows input
        # position i. Last-step models: the single output row is the prediction.
        probs = pred[0, i] if pred.ndim == 3 else pred[0]
        next_word_idx = int(np.argmax(probs))

        # Id 0 is padding, i.e. "nothing follows"; it is not a real word.
        if next_word_idx == 0 or next_word_idx == end_idx:
            break

        caption.append(tokenizer.index_word.get(next_word_idx, '<unk>'))

        # Position 0 holds the start marker, so prediction i belongs at i + 1.
        if i + 1 >= steps:
            break  # caption input is full
        sequence[0, i + 1] = next_word_idx

    return ' '.join(caption)


In [None]:
# BLEU Score Evaluation
def evaluate_bleu(model, tokenizer, image_paths, actual_captions, max_len=40):
    """Generate a caption per image and score them with corpus-level BLEU.

    Args:
        model: Captioning model passed on to ``generate_caption``.
        tokenizer: Fitted Keras ``Tokenizer`` passed on to ``generate_caption``
            and used to normalise the reference captions.
        image_paths: Iterable of image paths, one per reference caption.
        actual_captions: Iterable of reference caption strings, aligned with
            ``image_paths``.
        max_len: Padded caption length forwarded to ``generate_caption``.
            NOTE(review): this should be the value the model was built with --
            confirm at the call site.

    Returns:
        The BLEU score returned by ``nltk``'s ``corpus_bleu``.
    """
    # Local import: the notebook's import cell only pulls in Tokenizer from this module.
    from tensorflow.keras.preprocessing.text import text_to_word_sequence

    predicted_captions = [
        generate_caption(model, tokenizer, image_path, max_len).split()
        for image_path in image_paths
    ]

    # Generated words come from the tokenizer vocabulary, so split the
    # references with the tokenizer's own settings; otherwise case and
    # punctuation differences would count as mismatches.
    # corpus_bleu expects, per hypothesis, a LIST of tokenised references,
    # hence the extra pair of brackets around the single reference.
    references = [
        [
            text_to_word_sequence(
                caption,
                filters=tokenizer.filters,
                lower=tokenizer.lower,
                split=tokenizer.split,
            )
        ]
        for caption in actual_captions
    ]

    # Calculate BLEU score
    bleu_score = corpus_bleu(references, predicted_captions)
    print(f'BLEU Score: {bleu_score}')
    return bleu_score

In [None]:
# Example: Evaluate on test data
# evaluate_bleu needs image PATHS and caption STRINGS, not pixel arrays and
# token ids. Re-splitting the raw lists with the same test_size and
# random_state as the array split selects the same test samples.
_, test_image_paths, _, test_captions = train_test_split(
    image_paths, captions, test_size=0.2, random_state=42
)

# Use the data-derived max_len so the decoder input matches the model's caption input.
bleu_score = evaluate_bleu(model, tokenizer, test_image_paths, test_captions, max_len=max_len)
print(f'Test BLEU Score: {bleu_score}')