In [3]:
!pip install numpy pandas matplotlib keras tensorflow nltk gradio



In [4]:
import warnings

# Ignore all warnings
warnings.filterwarnings("ignore")

In [5]:
import os
import random
import numpy as np
import pandas as pd
import zipfile
import tensorflow as tf
from tensorflow.keras.models import Model
from tensorflow.keras.layers import Input, Conv2D, MaxPooling2D, Flatten, Dense, Embedding, LSTM, add
from tensorflow.keras.preprocessing.text import Tokenizer
from tensorflow.keras.preprocessing.sequence import pad_sequences
from tensorflow.keras.layers import RepeatVector
from tensorflow.keras.utils import to_categorical
import matplotlib.pyplot as plt
from PIL import Image
import scipy.sparse
import gradio as gr

In [6]:
def unzip_files():
    with zipfile.ZipFile('Flickr8k_Dataset.zip', 'r') as zip_ref:
        zip_ref.extractall('Flickr8k_Dataset')
    with zipfile.ZipFile('Flickr8k_text.zip', 'r') as zip_ref:
        zip_ref.extractall('Flickr8k_text')

def select_random_images(image_directory, num_images=500):
    all_images = [img for img in os.listdir(image_directory) if img.endswith('.jpg')]
    selected_images = random.sample(all_images, num_images)
    selected_images_with_suffixes = [f"{img}#{i}" for img in selected_images for i in range(5)]
    return selected_images, selected_images_with_suffixes

def load_captions(captions_file):
    captions = pd.read_csv(captions_file, sep='\t', header=None)
    captions.columns = ['image', 'caption']
    return captions

def preprocess_captions(captions_df, selected_images_with_suffixes):
    filtered_captions = captions_df[captions_df['image'].isin(selected_images_with_suffixes)]
    filtered_captions['caption'] = filtered_captions['caption'].apply(lambda x: f"<start> {x} <end>")
    return filtered_captions

def load_and_preprocess_images(selected_images, image_directory):
    processed_images = []
    for img_name in selected_images:
        img_path = os.path.join(image_directory, img_name)
        if os.path.exists(img_path):
            img = Image.open(img_path).convert('RGB').resize((128, 128))
            img_array = np.array(img) / 255.0  # Normalize
            processed_images.append(img_array)
        else:
            print(f"Image not found: {img_path}")
    return np.array(processed_images)

def create_custom_cnn(input_shape):
    inputs = Input(shape=input_shape)
    x = Conv2D(32, (3, 3), activation='relu')(inputs)
    x = MaxPooling2D(pool_size=(2, 2))(x)
    x = Conv2D(64, (3, 3), activation='relu')(x)
    x = MaxPooling2D(pool_size=(2, 2))(x)
    x = Conv2D(128, (3, 3), activation='relu')(x)
    x = MaxPooling2D(pool_size=(2, 2))(x)
    x = Flatten()(x)
    x = Dense(512, activation='relu')(x)
    model = Model(inputs, x)
    return model

In [7]:
def prepare_sequences(filtered_captions, selected_images, processed_images):
    # Add special tokens to the captions
    filtered_captions['caption'] = filtered_captions['caption'].apply(lambda x: f"<start> {x} <end>")
    
    tokenizer = Tokenizer(oov_token="<unk>")
    tokenizer.fit_on_texts(filtered_captions['caption'])
    
    # Manually add <start> and <end> tokens if they're not already in the vocabulary
    if '<start>' not in tokenizer.word_index:
        tokenizer.word_index['<start>'] = len(tokenizer.word_index) + 1
    if '<end>' not in tokenizer.word_index:
        tokenizer.word_index['<end>'] = len(tokenizer.word_index) + 1
    
    sequences = tokenizer.texts_to_sequences(filtered_captions['caption'])
    vocab_size = len(tokenizer.word_index) + 1
    
    max_length = max(len(seq) for seq in sequences)
    X_caption = pad_sequences(sequences, maxlen=max_length, padding='post')
    
    X_image = np.repeat(np.array(processed_images), len(filtered_captions) // len(processed_images), axis=0)

    y_caption = [seq[1:] for seq in sequences]  # Shift sequences by one
    y_caption = pad_sequences(y_caption, maxlen=max_length, padding='post')
    y_caption = np.array([to_categorical(seq, num_classes=vocab_size) for seq in y_caption])
    
    return X_image, X_caption, y_caption, vocab_size, max_length, tokenizer

In [8]:
def define_model(vocab_size, max_length):
    inputs_image = Input(shape=(128, 128, 3))
    cnn_model = create_custom_cnn((128, 128, 3))
    features = cnn_model(inputs_image)

    inputs_caption = Input(shape=(max_length,))
    embedding = Embedding(vocab_size, 256, mask_zero=True)(inputs_caption)
    lstm = LSTM(256, return_sequences=True)(embedding)

    features = Dense(256)(features)
    features = RepeatVector(max_length)(features)
    
    decoder = add([features, lstm])
    decoder = Dense(256, activation='relu')(decoder)
    outputs = Dense(vocab_size, activation='softmax')(decoder)

    model = Model([inputs_image, inputs_caption], outputs)
    model.compile(optimizer='adam', loss='categorical_crossentropy')
    return model

In [9]:
def train_model(model, X_image, X_caption, y_caption, vocab_size):
    model.fit(
        [X_image, X_caption], y_caption,
        epochs=30,
        batch_size=32,
        validation_split=0.2
    )


In [10]:
def create_gradio_interface(model, tokenizer, max_length):
    def predict_caption(image):
        img_array = np.array(image.resize((128, 128))) / 255.0
        img_array = np.expand_dims(img_array, axis=0)

        input_seq = [tokenizer.word_index['<start>']]
        for _ in range(max_length):
            sequence = pad_sequences([input_seq], maxlen=max_length, padding='post')
            pred = model.predict([img_array, sequence])
            pred = np.argmax(pred[0], axis=-1)
            word = tokenizer.index_word.get(pred[-1], '<unknown>')
            if word == '<end>':
                break
            input_seq.append(pred[-1])
        
        return ' '.join([tokenizer.index_word.get(idx, '<unknown>') for idx in input_seq[1:]])

    iface = gr.Interface(fn=predict_caption, inputs="image", outputs="text")
    return iface

In [11]:
# Set up paths
images_directory = "Flickr8k_Dataset/Flicker8k_Dataset"
captions_file = "Flickr8k_text/Flickr8k.token.txt"

# Unzip files if necessary
# unzip_files()

# Select random images
selected_images, selected_images_with_suffixes = select_random_images(images_directory)

# Load and preprocess captions
captions_df = load_captions(captions_file)
filtered_captions = preprocess_captions(captions_df, selected_images_with_suffixes)

# Load and preprocess images
processed_images = load_and_preprocess_images(selected_images, images_directory)

# Prepare sequences
X_image, X_caption, y_caption, vocab_size, max_length, tokenizer = prepare_sequences(filtered_captions, selected_images, processed_images)

# Define and train the model
model = define_model(vocab_size, max_length)
train_model(model, X_image, X_caption, y_caption, vocab_size)

# Create Gradio interface
gradio_interface = create_gradio_interface(model, tokenizer, max_length)

Epoch 1/30
[1m63/63[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m47s[0m 616ms/step - loss: 3.0365 - val_loss: 1.6584
Epoch 2/30
[1m63/63[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m35s[0m 557ms/step - loss: 1.5539 - val_loss: 1.5296
Epoch 3/30
[1m63/63[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m35s[0m 554ms/step - loss: 1.4200 - val_loss: 1.4622
Epoch 4/30
[1m63/63[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m29s[0m 459ms/step - loss: 1.3313 - val_loss: 1.3747
Epoch 5/30
[1m63/63[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m30s[0m 481ms/step - loss: 1.2199 - val_loss: 1.3481
Epoch 6/30
[1m63/63[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m35s[0m 555ms/step - loss: 1.1331 - val_loss: 1.3342
Epoch 7/30
[1m63/63[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m31s[0m 489ms/step - loss: 1.0652 - val_loss: 1.3638
Epoch 8/30
[1m63/63[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m34s[0m 546ms/step - loss: 0.9351 - val_loss: 1.4140
Epoch 9/30
[1m63/63[0m [32m━━

In [35]:
def create_gradio_interface(model, tokenizer, max_length):
    def predict_caption(image):
        try:
            # Ensure the image is in RGB format
            img = Image.fromarray(image).convert('RGB')
            img_array = np.array(img.resize((128, 128))) / 255.0
            img_array = np.expand_dims(img_array, axis=0)

            input_seq = [tokenizer.word_index['<start>']]
            for i in range(max_length):
                sequence = pad_sequences([input_seq], maxlen=max_length, padding='post')
                pred = model.predict([img_array, sequence], verbose=0)
                pred = np.argmax(pred[0], axis=-1)
                word = tokenizer.index_word.get(pred[i], '<unknown>')
                if word == '<end>':
                    break
                if word != '<start>' and word != '<unknown>':
                    input_seq.append(pred[i])
            
            caption = ' '.join([tokenizer.index_word.get(idx, '<unknown>') for idx in input_seq[1:] if idx not in [tokenizer.word_index['start'], 
                                                                                                                   tokenizer.word_index['end']]])
            return caption
        except Exception as e:
            return f"An error occurred: {str(e)}"

    iface = gr.Interface(
        fn=predict_caption,
        inputs=gr.Image(type="numpy"),
        outputs="text",
        live=False
    )
    return iface

# Create and launch the interface
gradio_interface = create_gradio_interface(model, tokenizer, max_length)
gradio_interface.launch()

* Running on local URL:  http://127.0.0.1:7864

To create a public link, set `share=True` in `launch()`.


