In [19]:
import json
import numpy as np
import torch
from tensorflow.keras.models import load_model
from tensorflow.keras.preprocessing.text import tokenizer_from_json
from transformers import ViTFeatureExtractor, ViTModel
from PIL import Image

In [20]:
# Load the ViT model and feature extractor
def load_vit_model():
    feature_extractor = ViTFeatureExtractor.from_pretrained("google/vit-base-patch16-224-in21k")
    vit_model = ViTModel.from_pretrained("google/vit-base-patch16-224-in21k")
    return feature_extractor, vit_model

In [21]:
# Extract features from a single image using ViT
def extract_features_vit(image_path, feature_extractor, vit_model):
    # Load the image
    image = Image.open(image_path).convert("RGB")
    
    # Process the image and get the features
    inputs = feature_extractor(images=image, return_tensors="pt")
    
    # Use no_grad() to avoid computing gradients
    with torch.no_grad():
        outputs = vit_model(**inputs)
        image_features = outputs.last_hidden_state.numpy().flatten()  # Flatten the features
    return image_features

In [22]:
# Load the image captioning model
def load_captioning_model(model_path):
    model = load_model(model_path)
    return model

In [27]:
# Load the tokenizer
def load_tokenizer(tokenizer_path):
    with open(tokenizer_path, 'r') as f:
        tokenizer_data = json.load(f)  # Load the tokenizer as a dictionary
    tokenizer_json = json.dumps(tokenizer_data)  # Convert dictionary back to a JSON string
    tokenizer = tokenizer_from_json(tokenizer_json)  # Pass the JSON string to tokenizer_from_json
    return tokenizer

In [24]:
# Generate a caption from the image features
def generate_caption(model, tokenizer, image_features, max_len=31):
    # Prepare the image feature (flattened) and text input (empty start sequence)
    image_feature = np.array(image_features).reshape(1, -1)
    seq = np.zeros((1, max_len))  # Start with a sequence of zeros (the <start> token)

    # Predict the next word based on the image and the initial sequence
    caption = []
    for i in range(max_len):
        pred = model.predict([image_feature, seq], verbose=0)
        next_word_idx = np.argmax(pred)
        next_word = tokenizer.index_word.get(next_word_idx, '')
        
        # If <end> token is generated, break the loop
        if next_word == "<end>" or next_word == '':
            break
        
        caption.append(next_word)
        seq[0, i] = next_word_idx  # Update the sequence with the predicted word
    
    return " ".join(caption)

In [25]:
# Example: Load the models and tokenizer, extract features, and generate a caption
def test_image_captioning(image_path, model_path, tokenizer_path):
    # Load the ViT model and feature extractor
    feature_extractor, vit_model = load_vit_model()
    
    # Extract features from the image
    image_features = extract_features_vit(image_path, feature_extractor, vit_model)
    
    # Load the captioning model and tokenizer
    model = load_captioning_model(model_path)
    tokenizer = load_tokenizer(tokenizer_path)
    
    # Generate the caption for the image
    caption = generate_caption(model, tokenizer, image_features)
    print("Generated Caption:", caption)

In [28]:
# Run the test on one image
image_path = "test/test.jpg"  # Replace with your image path
model_path = "image_captioning_model.h5"  # Replace with your saved image captioning model
tokenizer_path = "tokenizer.json"  # Replace with your tokenizer json file

test_image_captioning(image_path, model_path, tokenizer_path)



ValueError: Input 0 of layer "functional_15" is incompatible with the layer: expected shape=(None, 768), found shape=(1, 151296)