In [47]:
import numpy as np
import pandas as pd
import tensorflow as tf
from tensorflow.keras.applications import VGG16, ResNet50
from tensorflow.keras.preprocessing.image import load_img, img_to_array
from tensorflow.keras.layers import Embedding, LSTM, Dense, Add, Input
from tensorflow.keras.models import Model
from tensorflow.keras.preprocessing.text import Tokenizer
from sklearn.preprocessing import StandardScaler
import matplotlib.pyplot as plt
resnet_model = ResNet50(weights='imagenet', include_top=False, input_shape=(224, 224, 3))
resnet_model.trainable = False


In [48]:

def extract_features(image_path):
    image = load_img(image_path, target_size=(224, 224))
    image = img_to_array(image)
    image = np.expand_dims(image, axis=0)
    image = tf.keras.applications.resnet50.preprocess_input(image)
    features = resnet_model.predict(image)
    features = features.flatten()
    return features


In [49]:

captions = ["a dog running", "a person walking", "a child playing with a ball"]  
tokenizer = Tokenizer()
tokenizer.fit_on_texts(captions)
vocab_size = len(tokenizer.word_index) + 1 
sequences = tokenizer.texts_to_sequences(captions)
max_caption_length = max([len(seq) for seq in sequences])


In [50]:

from tensorflow.keras.preprocessing.sequence import pad_sequences
image_paths = ['/workspace/MyDailyWork/Imagecaption/images.jpg', '/workspace/MyDailyWork/Imagecaption/img2.jpg', 
                '/workspace/MyDailyWork/Imagecaption/img3.jpg', '/workspace/MyDailyWork/Imagecaption/is.jpg',
                '/workspace/MyDailyWork/Imagecaption/Kazakhstan.jpg'] 

X_image = []
X_caption = []
y = []

for image_path, caption in zip(image_paths, captions):
    features = extract_features(image_path)

    sequence = tokenizer.texts_to_sequences([caption])[0]
    
    for i in range(1, len(sequence)):
        X_caption.append(sequence[:i]) 
        y.append(sequence[i]) 

        X_image.append(features)

X_image = np.array(X_image)
X_caption = pad_sequences(X_caption, maxlen=max_caption_length)  # Padding sequences to the same length
y = np.array(y)


[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m1s[0m 1s/step
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 120ms/step
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 160ms/step


In [51]:

image_input = Input(shape=(100352,))  
image_features = Dense(256, activation='relu')(image_input)
caption_input = Input(shape=(max_caption_length,))
embedding = Embedding(vocab_size, 256)(caption_input)
lstm = LSTM(256)(embedding)
merged = Add()([image_features, lstm])
output = Dense(vocab_size, activation='softmax')(merged)
model = Model(inputs=[image_input, caption_input], outputs=output)
model.compile(loss='sparse_categorical_crossentropy', optimizer='adam', metrics=['accuracy'])


In [52]:

def generate_caption(image_path):
    features = extract_features(image_path)

    features = features.reshape((1, features.shape[0]))

    sequence = tokenizer.texts_to_sequences(['startseq'])[0]  

    for i in range(max_caption_length):
        padded_sequence = pad_sequences([sequence], maxlen=max_caption_length)
        predicted_probs = model.predict([features, padded_sequence])
        predicted_word_index = np.argmax(predicted_probs)
        predicted_word = tokenizer.index_word[predicted_word_index]
        sequence.append(predicted_word_index) 

        if predicted_word == 'endseq':
            break
            
    return ' '.join([tokenizer.index_word[idx] for idx in sequence[1:]]) 

generated_caption = generate_caption('/workspace/MyDailyWork/Imagecaption/img3.jpg')
print(generated_caption)


[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 136ms/step
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 150ms/step
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 32ms/step
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 33ms/step
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 31ms/step
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 32ms/step
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 31ms/step
walking walking walking walking walking
