In [78]:
import os
import pickle
import numpy as np
from tqdm.notebook import tqdm
import pandas as pd
from tensorflow.keras.applications.vgg16 import VGG16, preprocess_input
from tensorflow.keras.preprocessing.image import load_img, img_to_array
from tensorflow.keras.preprocessing.text import Tokenizer
from tensorflow.keras.preprocessing.sequence import pad_sequences
from tensorflow.keras.models import Model
from tensorflow.keras.utils import to_categorical, plot_model
from tensorflow.keras.layers import Input, Dense, LSTM, Embedding, Dropout, add
from PIL import Image

from nltk.stem import PorterStemmer
import re
from sklearn.model_selection import train_test_split
from nltk.corpus import stopwords
from nltk.stem import WordNetLemmatizer

In [79]:
# Keep only the two columns the pipeline needs: the tweet text and the
# file name of the image it belongs to.
caption_columns = ["Caption", "image_id"]
labels = pd.read_excel("Captions.xlsx")[caption_columns]
labels

Unnamed: 0,Caption,image_id
0,How I feel today #legday #jelly #aching #gym,1.jpg
1,@ArrivaTW absolute disgrace two carriages from...,10.jpg
2,This is my Valentine's from 1 of my nephews. I...,100.jpg
3,betterfeelingfilms: RT via Instagram: First da...,1000.jpg
4,Zoe's first love #Rattled @JohnnyHarper15,1001.jpg
...,...,...
4864,OMG. Well done #Eskom! 'Man dies during #LoadS...,995.jpg
4865,Feelin' the love in here! #ValentinesDay #caring,996.jpg
4866,#blue #eyes can't be #beaten,997.jpg
4867,LA CHUCHA LOUUU TE CHUPO LOS OJOS..!,998.jpg


In [80]:
# Sanity check: column dtypes and non-null counts of the caption table.
labels.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 4869 entries, 0 to 4868
Data columns (total 2 columns):
 #   Column    Non-Null Count  Dtype 
---  ------    --------------  ----- 
 0   Caption   4869 non-null   object
 1   image_id  4869 non-null   object
dtypes: object(2)
memory usage: 76.2+ KB


In [81]:
# Feature Extraction
# Build the full VGG16 network, then expose the output of its second-to-last
# layer so the model yields a feature vector instead of class scores.
base_vgg = VGG16()
feature_layer = base_vgg.layers[-2]
vgg = Model(inputs=base_vgg.inputs, outputs=feature_layer.output)
vgg.summary()

KeyboardInterrupt: 

In [None]:
# Run every image in BASE_DIR through the truncated VGG16 model and cache the
# resulting feature vectors on disk, keyed by file name without extension.
features = {}
BASE_DIR = "./Images/"
# os.listdir returns entries in arbitrary order; sort so the feature dict
# (and every sample list derived from it) is reproducible across runs.
images = sorted(os.listdir(BASE_DIR))

for img in tqdm(images, desc="Extracting VGG16 features"):
    image_path = os.path.join(BASE_DIR, img)
    if not os.path.isfile(image_path):
        # Skip sub-directories; only regular files can be images.
        continue

    # convert("RGB") guarantees 3 channels: grayscale, palette and RGBA files
    # would otherwise produce arrays VGG16 cannot consume. The context manager
    # releases the file handle as soon as the pixels are loaded.
    with Image.open(image_path) as pil_image:
        image = pil_image.convert("RGB").resize((224, 224))

    image = img_to_array(image)
    image = np.expand_dims(image, axis=0)  # add the batch dimension
    image = preprocess_input(image)

    feature = vgg.predict(image, verbose=0)

    # splitext keeps dotted stems intact ("a.b.jpg" -> "a.b"), whereas
    # split(".")[0] would truncate them to "a".
    image_idx = os.path.splitext(img)[0]
    features[image_idx] = feature

# Close the file deterministically so the pickle is fully flushed to disk.
with open("image_features.pkl", "wb") as f:
    pickle.dump(features, f)

In [82]:
# Preprocess Twitter captions
class Preprocessor:
    """Text-cleaning helpers for tweet captions.

    Every public method takes one tweet string and returns a new string, so
    the bound methods can be passed directly to ``Series.apply``.
    """

    # Characters deleted outright by remove_punctuations. "#" and quotes are
    # intentionally absent, so hashtags and apostrophes survive.
    PUNCTUATION = ".,!?:;-_()[]{}<>/\\|@$%^&*+=~`"
    _PUNCTUATION_TABLE = str.maketrans("", "", PUNCTUATION)

    def __init__(self):
        # Built once and shared by all calls; constructing these per tweet
        # (as apply() would otherwise do) is needlessly slow.
        self.lemmatizer = WordNetLemmatizer()
        self.stopwords = set(stopwords.words("english"))
        self.stemmer = PorterStemmer()

    def remove_username(self, tweet):
        """Delete @mentions (an "@" followed by word characters)."""
        return re.sub(r"@(\w+)", "", tweet)

    def remove_punctuations(self, tweet):
        """Delete the characters in PUNCTUATION and the retweet marker "RT".

        "RT" is removed only when it stands alone as a word; a plain substring
        replace would also mangle words such as "HEART" or "START".
        """
        tweet = tweet.translate(self._PUNCTUATION_TABLE)
        return re.sub(r"\bRT\b", "", tweet)

    def remove_stopwords(self, tweet):
        """Drop space-separated tokens whose lowercase form is a stopword."""
        kept = [t for t in tweet.split(" ") if t.lower() not in self.stopwords]
        return " ".join(kept).strip()

    def lemmatize(self, tweet):
        """Lemmatize each space-separated token."""
        return " ".join(self.lemmatizer.lemmatize(t) for t in tweet.split(" "))

    def stem_words(self, tweet):
        """Stem each space-separated token."""
        return " ".join(self.stemmer.stem(t) for t in tweet.split(" "))

In [83]:
# Clean every caption by running it through the preprocessing steps in order.
preprocessor = Preprocessor()
cleaning_steps = (
    preprocessor.remove_username,
    preprocessor.remove_punctuations,
    preprocessor.remove_stopwords,
    preprocessor.lemmatize,
)

# Missing captions become empty strings so every step receives a str.
cleaned_captions = labels["Caption"].fillna("").astype(str)
for cleaning_step in cleaning_steps:
    cleaned_captions = cleaned_captions.apply(cleaning_step)
labels["Caption"] = cleaned_captions

In [84]:
# Reload the cached VGG16 features written by the extraction cell.
# NOTE: pickle can execute arbitrary code on load; only open files you created.
features_path = os.path.join("./", "image_features.pkl")
with open(features_path, "rb") as features_file:
    image_features = pickle.load(features_file)

In [85]:
# Wrap each caption in start/end markers. The guard makes the cell idempotent:
# re-running it must not produce "<start> <start> ... <end> <end>". Cleaned
# captions cannot contain "<" or ">" themselves because remove_punctuations
# strips both characters.
labels["Caption"] = labels["Caption"].apply(
    lambda x: x
    if x.startswith("<start> ") and x.endswith(" <end>")
    else "<start> " + x + " <end>"
)

In [86]:
# Fit the word index on every caption.
# NOTE(review): Tokenizer()'s default `filters` include "<", ">" and "#", and
# it lowercases text, so "<start>"/"<end>" are indexed as "start"/"end" and
# hashtags lose their "#". Confirm downstream cells look up the stripped forms.
tokenizer = Tokenizer()
tokenizer.fit_on_texts(labels["Caption"].tolist())

# word_index starts at 1; the extra slot accounts for index 0.
vocab_size = 1 + len(tokenizer.word_index)

print("Vocab size: ", vocab_size)

Vocab size:  15173


In [87]:
# Inspect the captions after cleaning and <start>/<end> wrapping.
labels

Unnamed: 0,Caption,image_id
0,<start> feel today #legday #jelly #aching #gym...,1.jpg
1,<start> absolute disgrace two carriage Bangor ...,10.jpg
2,<start> Valentine's 1 nephew elated sometimes ...,100.jpg
3,<start> betterfeelingfilms via Instagram Firs...,1000.jpg
4,<start> Zoe's first love #Rattled <end>,1001.jpg
...,...,...
4864,<start> OMG Well done #Eskom 'Man dy #LoadShed...,995.jpg
4865,<start> Feelin' love #ValentinesDay #caring <end>,996.jpg
4866,<start> #blue #eyes can't #beaten <end>,997.jpg
4867,<start> LA CHUCHA LOUUU TE CHUPO LOS OJOS <end>,998.jpg


In [None]:
# Build (image feature, partial caption, next word) training samples
def generate_data(features, labels, max_length, tokenizer, vocab_size):
    """
    Organize the data into image features, input sequence features and the label (output sequence).

    For a tokenized caption [w0, w1, ..., wn] one sample is produced per
    position i >= 1: the image feature vector, the prefix [w0 .. w(i-1)]
    post-padded to `max_length`, and a one-hot encoding of w(i).

    features - dict mapping an image key (file name without ".jpg") to that
               image's feature array; only row 0 of each array is used
    labels - caption dataframe with "image_id" and "Caption" columns
    max_length - maximum length of the caption (input sequences are padded to it)
    tokenizer - tokenizer object
    vocab_size - total # of classes the model can predict

    Returns three row-aligned numpy arrays: image features, input sequences
    and one-hot output words. Images without a caption row are skipped and
    reported through a single warning instead of raising IndexError.

    Note: the one-hot output is dense (n_samples x vocab_size), which
    dominates memory use for large vocabularies.
    """
    import warnings

    # Build the image_id -> caption lookup once; filtering the dataframe for
    # every image made the loop quadratic. setdefault keeps the first caption
    # when an image_id appears more than once.
    caption_by_image = {}
    for image_id, caption in zip(labels["image_id"], labels["Caption"]):
        caption_by_image.setdefault(image_id, caption)

    image_features, i_sequence, o_sequence = [], [], []
    missing_keys = []

    for key, feature in features.items():
        caption = caption_by_image.get(f"{key}.jpg")
        if caption is None:
            missing_keys.append(key)
            continue

        image_feature = feature[0]  # drop the batch axis, e.g. (1, 4096) -> (4096,)
        tokens = tokenizer.texts_to_sequences([caption])[0]

        for i in range(1, len(tokens)):
            prefix, next_token = tokens[:i], tokens[i]

            in_seq = pad_sequences([prefix], maxlen=max_length, padding='post')[0]
            out_seq = to_categorical([next_token], num_classes=vocab_size)[0]

            image_features.append(image_feature)
            i_sequence.append(in_seq)
            o_sequence.append(out_seq)

    if missing_keys:
        warnings.warn(
            f"Skipped {len(missing_keys)} image(s) with no caption, "
            f"e.g. {missing_keys[:5]}"
        )

    return np.array(image_features), np.array(i_sequence), np.array(o_sequence)

# NOTE(review): 235 is hard-coded; confirm it matches the longest tokenized
# caption and the input length the model expects.
MAX_CAPTION_LENGTH = 235

img_features, i_sequence, o_sequence = generate_data(
    image_features,
    labels,
    MAX_CAPTION_LENGTH,
    tokenizer,
    vocab_size,
)
print(f"Image features: {img_features.shape}, Input sequence: {i_sequence.shape}, Output sequence: {o_sequence.shape}")

Image features: (44972, 4096), Input sequence: (44972, 235), Output sequence: (44972, 15173)


In [None]:
# Model