In [None]:
import os
import pickle
import numpy as np
from tqdm.notebook import tqdm
import pandas as pd
from tensorflow.keras.applications.vgg16 import VGG16, preprocess_input
from tensorflow.keras.preprocessing.image import load_img, img_to_array
from tensorflow.keras.preprocessing.text import Tokenizer
from tensorflow.keras.preprocessing.sequence import pad_sequences
from tensorflow.keras.models import Model
from tensorflow.keras.utils import to_categorical, plot_model
from tensorflow.keras.layers import Input, Dense, LSTM, Embedding, Dropout, add
from PIL import Image

from nltk.stem import PorterStemmer
import re
from sklearn.model_selection import train_test_split
from nltk.corpus import stopwords
from nltk.stem import WordNetLemmatizer

In [None]:
# Read the Twitter captions spreadsheet, keeping only the caption text and the image id
labels = pd.read_excel("Captions.xlsx").loc[:, ["Caption", "image_id"]]
labels

In [None]:
# Print pandas' built-in summary of the `labels` frame (output of DataFrame.info()).
labels.info()

In [None]:
# Feature Extraction
# Instantiate VGG16, then re-wrap it so that the model's output is taken from
# its second-to-last layer instead of the final one.
base_vgg = VGG16()
vgg = Model(inputs=base_vgg.inputs, outputs=base_vgg.layers[-2].output)
vgg.summary()

In [None]:
# Extract image features
# Runs every file in BASE_DIR through the truncated VGG network and stores the
# resulting feature array under the file name without its extension.
features = {}
BASE_DIR = "./Images/"
# Sorted so the dictionary (and everything derived from it) has a stable order
# across runs; os.listdir returns entries in arbitrary order.
images = sorted(os.listdir(BASE_DIR))

for img in tqdm(images):
    try:
        image_path = os.path.join(BASE_DIR, img)
        # `with` releases the file handle promptly. convert("RGB") guarantees
        # three channels, so grayscale / RGBA / palette images are processed
        # instead of failing on a channel mismatch and being dropped.
        with Image.open(image_path) as raw_image:
            image = raw_image.convert("RGB").resize((224, 224))

        image = img_to_array(image)
        image = np.expand_dims(image, axis=0)  # add the batch dimension
        image = preprocess_input(image)

        feature = vgg.predict(image, verbose=0)

        # splitext strips only the final extension, so "a.b.jpg" is keyed as
        # "a.b" and still matches the "<key>.jpg" lookup done later.
        image_idx = os.path.splitext(str(img))[0]
        features[image_idx] = feature
    except Exception as e:
        # Best-effort: report unreadable / non-image files and keep going.
        print(f"Error processing {img}: {e}")

# Save features
with open("image_features.pkl", "wb") as f:
    pickle.dump(features, f)

In [None]:
# Preprocess Twitter captions
class Preprocessor:
    """Text-cleaning helpers for tweet captions.

    Every method takes one tweet as a string and returns a new string; none of
    them modifies its argument.
    """

    # Single characters deleted by remove_punctuations.
    _PUNCT_TABLE = str.maketrans("", "", ".,!?:;-_()[]{}<>/\\|@$%^&*+=~`")
    # "RT" as a standalone word only (presumably the retweet marker), so that
    # words which merely contain the letters, e.g. "ARTIST", are left intact.
    _RETWEET_RE = re.compile(r"\bRT\b")

    def __init__(self):
        """Create the lemmatizer and stemmer and load the English stop words once."""
        self.lemmatizer = WordNetLemmatizer()
        self.stemmer = PorterStemmer()
        self.stopwords = set(stopwords.words("english"))

    def remove_username(self, tweet):
        """Delete every ``@name`` mention (an ``@`` followed by word characters)."""
        return re.sub(r"@(\w+)", "", tweet)

    def remove_punctuations(self, tweet):
        """Delete punctuation characters and standalone ``RT`` words.

        Punctuation is stripped first so that forms such as ``"RT:"`` are
        reduced to a bare ``"RT"`` before the whole-word match runs.
        """
        tweet = tweet.translate(self._PUNCT_TABLE)
        return self._RETWEET_RE.sub("", tweet)

    def remove_stopwords(self, tweet):
        """Drop space-separated tokens whose lowercase form is a stop word.

        Uses the set loaded in ``__init__`` rather than reloading the corpus
        on every call. The result is stripped of leading/trailing whitespace.
        """
        kept = [t for t in tweet.split(" ") if t.lower() not in self.stopwords]
        return " ".join(kept).strip()

    def lemmatize(self, tweet):
        """Lemmatize each space-separated token and re-join with single spaces."""
        lemmatized = [self.lemmatizer.lemmatize(t) for t in tweet.split(" ")]
        return " ".join(lemmatized)

    def stem_words(self, tweet):
        """Stem each space-separated token and re-join with single spaces."""
        stemmed = [self.stemmer.stem(t) for t in tweet.split(" ")]
        return " ".join(stemmed)

In [None]:
# Apply preprocessing
preprocessor = Preprocessor()

# Keep the untouched text in "Caption_raw" the first time this cell runs and
# always derive "Caption" from it. Re-running the cell is then idempotent: it
# no longer re-cleans already-cleaned text or stacks extra <start>/<end> markers.
if "Caption_raw" not in labels.columns:
    labels["Caption_raw"] = labels["Caption"]

labels["Caption"] = (
    labels["Caption_raw"]
    .fillna("")   # missing captions become empty strings
    .astype(str)
    .apply(preprocessor.remove_username)
    .apply(preprocessor.remove_punctuations)
    .apply(preprocessor.remove_stopwords)
    .apply(preprocessor.lemmatize)
    # Add start/end tokens (after punctuation removal, which would strip "<" and ">")
    .apply(lambda x: "<start> " + x + " <end>")
)

In [None]:
# Read the pickled feature dictionary back from disk
with open("image_features.pkl", "rb") as feature_file:
    image_features = pickle.load(feature_file)

In [None]:
# Tokenize captions
# Fit a Tokenizer on all caption strings. The vocabulary size is the number of
# distinct words plus one (presumably to leave room for index 0, the value
# pad_sequences fills with -- confirm against the Keras docs).
caption_texts = labels["Caption"].values
tokenizer = Tokenizer()
tokenizer.fit_on_texts(caption_texts)
vocab_size = 1 + len(tokenizer.word_index)

print("Vocab size: ", vocab_size)

In [None]:
# Prepare data for model
def generate_data(features, labels, max_length, tokenizer, vocab_size):
    """Expand every captioned image into next-word training samples.

    For a caption tokenized as ``[t0, t1, ..., tn]`` one sample is emitted per
    position ``i >= 1``: the image feature, the prefix ``[t0 .. t(i-1)]``
    post-padded to ``max_length``, and ``ti`` one-hot encoded.

    Args:
        features: dict mapping an image key to an array whose first row is
            the feature vector for that image.
        labels: DataFrame with ``"image_id"`` and ``"Caption"`` columns; an
            image with key ``k`` is matched to the row whose image_id is
            ``"k.jpg"``. If several rows match, the first one is used.
        max_length: length every input sequence is padded to.
        tokenizer: object providing ``texts_to_sequences``.
        vocab_size: number of classes for the one-hot targets.

    Returns:
        Tuple ``(image_features, input_sequences, output_words)`` of numpy
        arrays with one row per generated sample.

    Images that have no caption row (or a non-string caption) are skipped and
    reported in a single printed line instead of raising ``IndexError``.
    """
    # Build the id -> caption lookup once instead of boolean-masking the whole
    # frame for every image. setdefault keeps the first caption per id.
    caption_by_id = {}
    for image_id, caption in zip(labels["image_id"], labels["Caption"]):
        caption_by_id.setdefault(image_id, caption)

    image_features, i_sequence, o_sequence = [], [], []
    skipped = []

    for key, feature in features.items():
        label = caption_by_id.get(f"{key}.jpg")
        if not isinstance(label, str):
            skipped.append(key)
            continue

        image_feature = feature[0]  # (4096,)
        label_split = tokenizer.texts_to_sequences([label])[0]

        for i in range(1, len(label_split)):
            prev, next_word = label_split[:i], label_split[i]

            in_seq = pad_sequences([prev], maxlen=max_length, padding='post')[0]
            out_seq = to_categorical([next_word], num_classes=vocab_size)[0]

            image_features.append(image_feature)
            i_sequence.append(in_seq)
            o_sequence.append(out_seq)

    if skipped:
        print(f"Skipped {len(skipped)} image(s) without a caption, e.g. {skipped[:5]}")

    return np.array(image_features), np.array(i_sequence), np.array(o_sequence)

# Find max caption length
# Measure length in tokenizer tokens rather than whitespace-separated words:
# the sequences fed to the model come from the tokenizer, and the two counts
# can differ. Using the tokenizer's own count ensures pad_sequences never has
# to truncate a caption prefix.
max_length = max(len(seq) for seq in tokenizer.texts_to_sequences(labels["Caption"].tolist()))
img_features, i_sequence, o_sequence = generate_data(image_features, labels,
                                                    max_length=max_length,
                                                    tokenizer=tokenizer,
                                                    vocab_size=vocab_size)

print(f"Image features: {img_features.shape}, Input sequence: {i_sequence.shape}, Output sequence: {o_sequence.shape}")

In [None]:
# Model definition
def caption_generator_model(vocab_size, max_length, feature_dim=4096,
                            embedding_dim=256, units=256, dropout_rate=0.5):
    """Build and compile the two-branch caption model.

    One branch projects the image feature vector to ``units`` dimensions; the
    other embeds the partial caption and encodes it with an LSTM of the same
    width. The two are summed element-wise and decoded into a softmax over
    the vocabulary.

    Args:
        vocab_size: size of the output softmax and of the embedding table.
        max_length: length of the (padded) input token sequence.
        feature_dim: length of the image feature vector.
        embedding_dim: width of the word embedding.
        units: width of the image projection, the LSTM and the decoder layer
            (the two branches must match because they are added).
        dropout_rate: dropout applied to both branch inputs.

    Returns:
        A compiled Keras ``Model`` taking ``[image_features, token_sequence]``.
    """
    # Image feature branch
    image_input = Input(shape=(feature_dim,))
    image_dropout = Dropout(dropout_rate)(image_input)
    image_dense = Dense(units, activation='relu')(image_dropout)

    # Text branch
    text_input = Input(shape=(max_length,))
    # mask_zero=True makes the LSTM skip the zero padding instead of treating
    # it as real timesteps, so its output reflects the last real word rather
    # than a run of padding tokens.
    text_embedding = Embedding(input_dim=vocab_size, output_dim=embedding_dim,
                               mask_zero=True)(text_input)
    text_dropout = Dropout(dropout_rate)(text_embedding)
    text_lstm = LSTM(units)(text_dropout)

    # Combined model
    combined = add([image_dense, text_lstm])
    dense = Dense(units, activation='relu')(combined)
    output = Dense(vocab_size, activation='softmax')(dense)

    model = Model(inputs=[image_input, text_input], outputs=output)
    model.compile(loss='categorical_crossentropy', optimizer='adam')

    return model

# Build the captioning network and print its layer summary
model = caption_generator_model(vocab_size=vocab_size, max_length=max_length)
model.summary()

# Fit on [image features, partial caption] -> next word
train_inputs = [img_features, i_sequence]
model.fit(train_inputs, o_sequence, epochs=10, batch_size=32)

In [None]:
# Write the trained network to an .h5 file
model.save("twitter_caption_model.h5")

# Pickle the fitted tokenizer next to it
with open('tokenizer.pkl', 'wb') as tokenizer_file:
    pickle.dump(tokenizer, tokenizer_file)

In [None]:
# Function to generate captions
def generate_caption(image_path, model, tokenizer, max_length):
    """Greedily decode a caption for one image.

    The image is encoded with the module-level ``vgg`` feature extractor, then
    the caption is grown one word at a time by feeding the current partial
    caption back into ``model`` and taking the arg-max word.

    Args:
        image_path: path of the image file to caption.
        model: trained caption model taking ``[features, token_sequence]``.
        tokenizer: fitted tokenizer providing ``texts_to_sequences`` and
            ``index_word``.
        max_length: sequence length the model was trained with; also the
            maximum number of words generated.

    Returns:
        The generated caption as a space-separated string, without the
        start/end markers (possibly empty).
    """
    # Load and preprocess image. convert("RGB") guarantees three channels for
    # grayscale / RGBA inputs; `with` closes the underlying file handle.
    with Image.open(image_path) as raw_image:
        rgb_image = raw_image.convert("RGB").resize((224, 224))
    pixels = img_to_array(rgb_image)
    pixels = np.expand_dims(pixels, axis=0)
    pixels = preprocess_input(pixels)

    # Extract features
    feature = vgg.predict(pixels, verbose=0)

    # Generate caption word by word
    words = []
    for _ in range(max_length):
        # Tokenize the caption so far, always led by the start marker
        partial = " ".join(["<start>"] + words)
        sequence = tokenizer.texts_to_sequences([partial])[0]
        # Must be post-padded: the training sequences were built with
        # padding='post', so the default 'pre' padding would present the model
        # with inputs laid out differently from anything it was trained on.
        sequence = pad_sequences([sequence], maxlen=max_length, padding='post')

        # Predict next word
        pred = model.predict([feature, sequence], verbose=0)
        next_word_idx = int(np.argmax(pred[0]))

        # Get word from index (None for an index with no word, e.g. 0)
        next_word = tokenizer.index_word.get(next_word_idx)

        # Stop on the end token, or when the model predicts an index without a
        # word -- continuing would only append empty strings.
        if not next_word or next_word == 'end':
            break

        words.append(next_word)

    return " ".join(words)

# Smoke-test the trained model on a single image
test_image = "test.jpg"  # Replace with your test image path
generated_caption = generate_caption(
    image_path=test_image,
    model=model,
    tokenizer=tokenizer,
    max_length=max_length,
)
print("Generated Caption:", generated_caption)