# <font color='blue'>Computer Vision and Natural Language Processing for Automatic Subtitle Generation From an Image</font>

## Installing and Loading Packages

In [None]:
# Python Version
from platform import python_version
print('Python Version in this Jupyter Notebook:', python_version())

In [None]:
# To update a package, execute the following command in the terminal or command prompt:
# pip install -U pack_name

# To install the exact version of a package, execute the following command in the terminal or command prompt:
# pip install pack_name==desired_version

# After installing or updating the package, restart the jupyter notebook.

# Watermark package (provides the %watermark magic used below)
# !pip install -q -U watermark

In [None]:
# Imports
import re
import cv2
import pickle
import matplotlib
import tensorflow
import collections
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
from tensorflow.keras.preprocessing import image
from tensorflow.keras.preprocessing.sequence import pad_sequences
from tensorflow.keras.utils import to_categorical
from tensorflow.keras.models import Model, load_model
from tensorflow.keras.applications.xception import Xception, preprocess_input, decode_predictions
from tensorflow.keras.layers import Add
from tensorflow.keras.layers import Input, Dense, Dropout, Embedding, LSTM
%matplotlib inline

In [None]:
# Packages versions in this Jupyter Notebook 
%reload_ext watermark
%watermark -a "Joao Salero" --iversions

## Loading and Understanding the Subtitle Dataset

In [None]:
# Function to open and read a file
def read_file(path, encoding = "utf-8"):
    """Return the whole content of a text file as a single string.

    Args:
        path: Location of the text file to read.
        encoding: Text encoding used to decode the file. Defaults to UTF-8 so
            the result does not depend on the platform's locale encoding.

    Returns:
        The file content as one string, newlines included.

    Raises:
        OSError: If the file cannot be opened.
        UnicodeDecodeError: If the content is not valid in ``encoding``.
    """
    # The context manager closes the handle even when reading fails
    with open(path, encoding = encoding) as file:
        return file.read()

In [None]:
# Reading subtitles from Flickr8k.token.txt file
data_captions = read_file("/media/datasets/ComputerVision/Cap10/dados/texto/tokens.txt")

In [None]:
# Extract each line from the file
caption = data_captions.split('\n')

In [None]:
# Remove the trailing empty entry that split('\n') produces when the file ends with a newline.
# The check keeps the cell idempotent: re-running it never discards a real caption line.
if caption and caption[-1] == '':
    caption = caption[:-1]

In [None]:
# Print an example subtitle
print(caption[100])

In [None]:
print("Total of subtitles = " + str(len(caption)))

## Storing Captions in a Dictionary

In [None]:
# Empty Dictionary
dict_content = {}

In [None]:
# Loop through the caption lines; each line holds an image reference and a caption separated by a tab
for line in caption:

    # Split on the first tab only, so a caption that itself contains a tab stays intact.
    # The caption text gets its own name: unpacking into `caption` would rebind the list
    # being iterated and break every later `caption[...]` lookup.
    ID, caption_text = line.split('\t', 1)

    # The image id is the part of the reference that precedes the first dot
    imageID = ID.split('.')[0]

    # Create the caption list the first time an image id is seen, then append
    dict_content.setdefault(imageID, []).append(caption_text)

In [None]:
# Print the subtitles of the index image 1
print(dict_content[caption[1].split('.')[0]])

In [None]:
# Another example: Print image captions from ID=1002674143_1b742ab4b8
dict_content["1002674143_1b742ab4b8"]

Let's check whether the captions were mapped to the images correctly.

In [None]:
# Images path
image_path = "/media/datasets/ComputerVision/Cap10/dados/imagens/"

In [None]:
# Gets the image of ID equal to 15
image_id = caption[15].split('.')[0]

In [None]:
# Associating images and subtitles
img_file = image_path + image_id + ".jpg"
img = cv2.imread(img_file)

# cv2.imread signals an unreadable file by returning None instead of raising,
# so fail here with a clear message rather than inside cvtColor
if img is None:
    raise FileNotFoundError("Could not read image: " + img_file)

# Reorder the channels from BGR to RGB before handing the image to matplotlib
img = cv2.cvtColor(img, cv2.COLOR_BGR2RGB)
plt.imshow(img)
plt.axis("off")
plt.show()

print("Caption:")
# The loop name differs from `caption` so that this loop does not rebind that variable
for image_caption in dict_content[image_id]:
    print(image_caption)

## Subtitle Data Cleanup

In [None]:
# Function Data clearing 
def caption_cleaner(data):
    """Normalize a caption to lower-case words made of the letters a-z only.

    Args:
        data: The raw caption string.

    Returns:
        The caption in lower case, with every run of characters other than a-z
        treated as a separator, single-character tokens removed, and the
        remaining tokens joined by single spaces.
    """
    # Lower-case first, then turn every run of non-letter characters into one space
    lowered = re.sub("[^a-z]+", " ", data.lower())

    # Keep only tokens longer than one character and join them back into a sentence
    return " ".join(token for token in lowered.split() if len(token) > 1)

In [None]:
# Function Testing
print(caption_cleaner("The white 3 and brown # dog is running over the surface of the snow."))

In [None]:
# Clean every caption of every image, replacing the contents of each caption list in place
for captions in dict_content.values():
    captions[:] = [caption_cleaner(text) for text in captions]

In [None]:
# Result
print(dict_content["1000268201_693b08cb0e"])

In [None]:
# Store the clean data on disk.
# str(dict_content) writes the dictionary's Python text representation to the file.
# NOTE(review): this path is relative to the working directory, unlike the absolute
# dataset paths used for reading — confirm the "dados/texto" folder exists there.
with open ("dados/texto/tokens_clean.txt", "w") as file:
    file.write(str(dict_content))

## Vocabulary Preparation

In [None]:
# Create a Python set for the vocabulary (sets are unordered objects that accept any data type)
vocab = set()

In [None]:
# Add every word of every caption to the vocabulary set
for captions in dict_content.values():
    for sentence in captions:
        vocab.update(sentence.split())

In [None]:
print("Vocabulary Size (Number of Individual Words): %d"% len(vocab))

In [None]:
# View a sample of the vocabulary (sorted so the preview is the same on every run);
# the whole set is too large to be useful as cell output
sorted(vocab)[:20]

In [None]:
# Every word occurrence of every caption, in reading order (duplicates kept)
total_words = [
    word
    for captions in dict_content.values()
    for sentence in captions
    for word in sentence.split()
]

In [None]:
print("Total Occurrences of Words:" , len(total_words))

In [None]:
# View the first word occurrences only (the full list has one entry per word occurrence)
total_words[:20]

## Applying a filter to vocabulary based on word frequency.

In [None]:
# Creating a counter from total words
counter = collections.Counter(total_words)

# Show only the most frequent words instead of dumping the whole counter
counter.most_common(20)

In [None]:
type(counter)

In [None]:
# Convert count to dictionary
freq_cnt = dict(counter)
print(len(freq_cnt.keys()))

In [None]:
# View a sample of the data (first word/count pairs only)
list(freq_cnt.items())[:20]

In [None]:
# Sort the (word, count) pairs by count, most frequent first
sorted_freq_cnt = sorted(freq_cnt.items(), reverse = True, key = lambda x:x[1])

# Display only the head of the ranking
sorted_freq_cnt[:20]

In [None]:
# Apply the filter returning only words with a frequency greater than 10
threshold = 10
# Strict comparison: a word that occurs exactly `threshold` times is dropped
sorted_freq_cnt  = [x for x in sorted_freq_cnt if x[1] > threshold]
# NOTE: total_words is rebound here. It previously held every word occurrence;
# from this point on it holds the retained words only, one entry per word,
# ordered like sorted_freq_cnt
total_words = [x[0] for x in sorted_freq_cnt]

In [None]:
print(len(total_words))

### Reading Training and Test Data

In [None]:
# Reading image ids
training_files = read_file("/media/datasets/ComputerVision/Cap10/dados/texto/trainImages.txt")
testing_files = read_file("/media/datasets/ComputerVision/Cap10/dados/texto/testImages.txt")

In [None]:
# Separate and list sentences
training_data = [row.split(".")[0] for row in training_files.split("\n")[:-1]]
testing_data = [row.split(".")[0] for row in testing_files.split("\n")[:-1]]

In [None]:
# Visualize data (image ids).
# training_data is the list of ids; training_files is the raw file content (one string),
# so slicing it would show the first 10 characters instead of 10 ids
training_data[:10]

In [None]:
# Dictionary for training content
training_content = {}

In [None]:
# Loop through the training image ids.
# Iterates training_data (the list of ids); training_files is the raw text of the
# file, and iterating it would yield single characters.
for img_id in training_data:

    # Wrap every caption of the image with the start/end markers the model learns from
    training_content[img_id] = [
        "startseq " + cap + " endseq" for cap in dict_content[img_id]
    ]

In [None]:
# Example
training_content["2638369467_8fc251595b"]

## Computer Vision - Xception Model for Feature Extraction

[Xception](https://openaccess.thecvf.com/content_cvpr_2017/papers/Chollet_Xception_Deep_Learning_CVPR_2017_paper.pdf): Deep Learning with Depthwise Separable Convolutions

In [None]:
# Transfer learning from Xception model with imagenet weights
model = Xception(weights = 'imagenet', input_shape = (299, 299, 3))

In [None]:
# Model Summary
model.summary()

In [None]:
# Removes the "head" of the original model: the new model goes from the original input to the
# output of the second-to-last layer (model.layers[-2]), so only the final layer is dropped
new_model = Model(model.input, model.layers[-2].output)

In [None]:
#  Summary of the new model
new_model.summary()

## Pre-Processing Images in the Xception Model Pattern

In [None]:
# Function for image processing
def image_processing(img):
    """Load an image file and prepare it as a one-image batch for Xception.

    Args:
        img: Path of the image file, passed to ``image.load_img``.

    Returns:
        The result of ``preprocess_input`` applied to the image after it was
        loaded at 299x299, converted to an array and given a leading batch axis.
    """
    # Load at the target size and convert to an array in one step
    pixels = image.img_to_array(image.load_img(img, target_size = (299, 299)))

    # Add the leading batch axis (3D tensor -> 4D tensor)
    batch = np.expand_dims(pixels, axis = 0)

    # Normalize according to the Xception preprocessing
    return preprocess_input(batch)

In [None]:
# Testing the image processing function
img = image_processing("/media/datasets/ComputerVision/Cap10/dados/imagens/2638369467_8fc251595b.jpg")
print(img.shape)

# The Xception preprocess_input rescales pixel values to the [-1, 1] range, which imshow
# would clip; map them back to [0, 1] for display only (img itself is left untouched)
plt.imshow((img[0] + 1) / 2)
plt.axis('off')
plt.show()

### Images Encoding 

In [None]:
# Function that receives an image path and returns the image encoding (feature vector)
# Note that we use the predictions of "new_model" to generate the feature vector
def encode_image(img):
    """Encode an image file as a flat feature vector.

    Args:
        img: Path of the image file, forwarded to ``image_processing``.

    Returns:
        The prediction of ``new_model`` for the processed image, flattened to
        one dimension.
    """
    batch = image_processing(img)

    # reshape((-1,)) collapses the prediction into a 1-D vector
    return new_model.predict(batch).reshape((-1,))

In [None]:
# Testing the encoding function
encode_image(image_path + "1000268201_693b08cb0e.jpg")

In [None]:
# Dictionary for image ids and training feature vectors
encoding_treino = {}

In [None]:
%%time

# Looping and encoding training data
failed_training_ids = []

for img_id in training_data:
    PATH = image_path + img_id + ".jpg"
    try:
        encoding_treino[img_id] = encode_image(PATH)
    except Exception as error:
        # Best effort: keep encoding the remaining images, but record and report the failure.
        # Catching Exception (not a bare except) lets KeyboardInterrupt stop this long loop.
        failed_training_ids.append(img_id)
        print("Could not encode", PATH, "->", repr(error))

print("Encoded training images:", len(encoding_treino), "| failures:", len(failed_training_ids))

In [None]:
# Storing the encoding result on disk 
with open("dados/encoders/atributos_treino_encoded.pkl", "wb") as file:
    pickle.dump(encoding_treino, file)

In [None]:
# Dictionary for image ids and test feature vectors
encoding_teste = {}

In [None]:
%%time

# Loop and encoding in test data
failed_testing_ids = []

for img_id in testing_data:
    PATH = image_path + img_id + ".jpg"
    try:
        encoding_teste[img_id] = encode_image(PATH)
    except Exception as error:
        # Best effort: keep encoding the remaining images, but record and report the failure.
        # Catching Exception (not a bare except) lets KeyboardInterrupt stop this long loop.
        failed_testing_ids.append(img_id)
        print("Could not encode", PATH, "->", repr(error))

print("Encoded test images:", len(encoding_teste), "| failures:", len(failed_testing_ids))

In [None]:
# Storing the encoding result on disk
with open("dados/encoders/atributos_teste_encoded.pkl", "wb") as file:
    pickle.dump(encoding_teste, file)

### Preparing Subtitles for Training

In [None]:
# Vocabulary
len(total_words)

In [None]:
# Word/Index/Word Mapping Dictionaries
word_to_index = {}
index_to_word = {}

In [None]:
# Fill both mappings; indexes start at 1, so index 0 is never assigned to a word
for position, vocab_word in enumerate(total_words, start = 1):
    word_to_index[vocab_word] = position
    index_to_word[position] = vocab_word

In [None]:
print(len(index_to_word))

In [None]:
# We also add the startseq and endseq tokens to the mappings to mark the beginning and end of sentences.
# Their indexes are derived from the current size of the mapping instead of being hard-coded,
# so they always come right after the last vocabulary index whatever the vocabulary size is.
# The membership check makes the cell safe to run more than once.
for token in ('startseq', 'endseq'):
    if token not in word_to_index:
        token_index = len(word_to_index) + 1
        word_to_index[token] = token_index
        index_to_word[token_index] = token

In [None]:
# Vocabulary size (number of mapped words + 1, because index 0 is not assigned to any word)
len(word_to_index) + 1

In [None]:
# Length, in words, of the longest training caption (0 when there are no captions)
max_len = max(
    (len(cap.split()) for caps in training_content.values() for cap in caps),
    default = 0,
)

print(max_len)

## Word Embeddings
Loading the Word Embeddings (numeric arrays that represent the words).

In [None]:
# Opens the GloVe text file with 50-dimensional word embeddings (the "50d" in the file name)
# NOTE(review): "6B" presumably refers to the size of the training corpus in tokens, not to
# 6 billion distinct words — confirm against the GloVe documentation
# The handle is bound to `arquivo` and stays open until arquivo.close() is called
arquivo = open("/media/datasets/ComputerVision/Cap10/dados/glove/glove.6B.50d.txt", encoding = 'utf8')

In [None]:
# Dictionary to map words to Embeddings
word_to_embedding = {}

In [None]:
# Dictionary for Mapping Embeddings to Indexes
embedding_index = {}

In [None]:
# Loop to fill embedding_index.
# Reads from `arquivo`, the GloVe handle opened above. The name `file` must not be used here:
# it is a leftover from earlier `with open(...) as file` cells and refers to a closed handle.
for line in arquivo:

    # Line split extracted from Glove: the word followed by its vector components
    values = line.split()

    # Skip blank lines instead of failing on values[0]
    if not values:
        continue

    # The first field is the word itself
    word = values[0]

    # The remaining fields are the components of the word embedding
    word_embedding = np.array(values[1:], dtype = 'float')

    # Map the word to its embedding
    embedding_index[word] = word_embedding

In [None]:
# Closing files
arquivo.close()

In [None]:
# Testing
embedding_index['apple']

In [None]:
# Embeddings Dimension
EMBEDDING_DIM = 50

In [None]:
# Vocabulary Size
VOCAB_SIZE = len(word_to_index) + 1
print("Vocabulary Size:", VOCAB_SIZE)

In [None]:
# Generate Embeddings Matrix
def get_embedding_matrix():
    """Build the embedding matrix for the vocabulary.

    Reads the module-level VOCAB_SIZE, EMBEDDING_DIM, word_to_index and
    embedding_index.

    Returns:
        An array of shape (VOCAB_SIZE, EMBEDDING_DIM). The row at a word's index
        holds that word's vector from embedding_index; rows of words without a
        vector, and rows no word maps to, stay filled with zeros.
    """
    matrix = np.zeros((VOCAB_SIZE, EMBEDDING_DIM))

    for token, row in word_to_index.items():
        vector = embedding_index.get(token)

        # Words without an embedding keep their all-zero row
        if vector is None:
            continue

        matrix[row] = vector

    return matrix

In [None]:
# Execute the function and get the Embeddings array
embedding_matrix = get_embedding_matrix()

In [None]:
# Embeddings matrix shape
print(embedding_matrix.shape)

## Final Model Architecture

The final model has two parts that are merged into a single network:
- Part 1 - Image Prediction
- Part 2 - Subtitle Predictions

#### Part 1

In [None]:
# Input layer of images with shape 2048
input_img_features = Input(shape = (2048,))

In [None]:
# Dropout layer to smooth the model and avoid overfitting
inp_img1 = Dropout(0.3)(input_img_features)

In [None]:
# Fully connected dense layer for image predictions
inp_img2 = Dense(256, activation = 'relu')(inp_img1)

#### Part 2

In [None]:
# Subtitles input layer with max_len shape
input_captions = Input(shape = (max_len,))

In [None]:
# Embedding Layer
inp_cap1 = Embedding(input_dim = VOCAB_SIZE, output_dim = EMBEDDING_DIM, mask_zero = True)(input_captions)

In [None]:
# Dropout layer to smooth the model and avoid overfitting
inp_cap2 = Dropout(0.3)(inp_cap1)

In [None]:
# LSTM Layer
inp_cap3 = LSTM(256)(inp_cap2)

### Inputs Decode 
- 1- An image (299x299x3) passes through the model.

- 2- The final output is inp_img2 which now goes through the decoder (cell below).

- 3- Similarly for subtitles that initially have a shape (batch_size x max_len).

- 4- Next, the subtitles pass through the Embedding layer, which outputs a tensor of shape (batch_size x max_len x 50), where 50 is the embedding size. That tensor then passes through the LSTM layer above, whose output is inp_cap3 (a 256-dimensional vector).

In [None]:
# Associating images with captions for model learning
decoder1 = Add()([inp_img2, inp_cap3])

In [None]:
# Image/caption forecast result
decoder2 = Dense(256, activation = 'relu')(decoder1)

In [None]:
# Model output
outputs = Dense(VOCAB_SIZE, activation = 'softmax')(decoder2)

In [None]:
# Builds the final model from its two inputs (image features and captions) and the output layer
final_model = Model(inputs = [input_img_features, input_captions], outputs = outputs)

In [None]:
# Summary of the model
final_model.summary()

In [None]:
# Pre-initializes the Embedding layer with the pre-computed embedding matrix and freezes it.
# The layer is looked up by type instead of by position: an index such as layers[2] silently
# targets another layer if the ordering of final_model.layers differs from what was assumed.
embedding_layer = next(layer for layer in final_model.layers if isinstance(layer, Embedding))
embedding_layer.set_weights([embedding_matrix])
embedding_layer.trainable = False

In [None]:
# Compile the model
final_model.compile(optimizer = "adam", loss = "categorical_crossentropy")

### Data Loader (Data Generator to Train the Model)

In [None]:
# Data generator
def data_gen(conteudo_treino, encoding_treino, word_to_idx, max_len, batch_size):
    """Endlessly yield training batches for the caption model.

    Each caption is expanded into one sample per word after the first: the
    input is the sequence of word indexes seen so far (zero-padded at the end
    up to ``max_len``) and the target is the one-hot vector of the next word.

    Args:
        conteudo_treino: Mapping of image id to the list of caption strings.
        encoding_treino: Mapping of image id to the encoding of that image.
            Images missing from it are skipped.
        word_to_idx: Mapping of word to integer index. Caption words missing
            from it are ignored.
        max_len: Length the input sequences are padded to.
        batch_size: Number of images (not samples) that make up one batch.

    Yields:
        A tuple ``([images, sequences], targets)`` of arrays with one row per
        sample. The one-hot targets have the module-level ``VOCAB_SIZE`` classes.

    Raises:
        ValueError: If no image of ``conteudo_treino`` has an encoding, since
            the generator could then never produce a batch.
    """
    # Without a single usable image the endless loop below would spin forever
    if not any(key in encoding_treino for key in conteudo_treino):
        raise ValueError("None of the training images has an encoding")

    # image, subtitle, label
    X1, X2, y = [], [], []

    # Number of images accumulated in the current batch
    n = 0

    while True:

        for key, desc_list in conteudo_treino.items():

            # Skip images without an encoding. Falling through would pair these captions
            # with the encoding left over from the previous image.
            photo = encoding_treino.get(key)
            if photo is None:
                continue

            n += 1

            for desc in desc_list:

                # Caption as a sequence of word indexes (unknown words are dropped)
                seq = [word_to_idx[word] for word in desc.split() if word in word_to_idx]

                for i in range(1, len(seq)):

                    # Input: the first i words, zero-padded at the end up to max_len.
                    # pad_sequences returns a 2D array, so only its first row is kept.
                    xi = pad_sequences([seq[0:i]], maxlen = max_len, value = 0, padding = 'post')[0]

                    # Target: the next word in One Hot vector notation
                    yi = to_categorical([seq[i]], num_classes = VOCAB_SIZE)[0]

                    X1.append(photo)
                    X2.append(xi)
                    y.append(yi)

            # Flush once all captions of the batch's last image were added, so the samples
            # of one image are never split across two batches
            if n >= batch_size and X1:
                yield ([np.array(X1), np.array(X2)], np.array(y))
                X1, X2, y = [], [], []
                n = 0

## Model Training

In [None]:
# Hyperparameters
num_epochs = 50
batch_size = 3
steps = len(training_content) // batch_size 

In [None]:
# Create the data generator
generator = data_gen(training_content, encoding_treino, word_to_index, max_len, batch_size)

Training Model

In [None]:
%%time
final_model.fit(generator, epochs = num_epochs, steps_per_epoch = steps, verbose = 1)
print("\nTraining Completed!")

In [None]:
# Save model
final_model.save('modelos/modelo_final.h5')

## Predictions

In [None]:
# Function to predict subtitles from images
def caption_generator(img):
    """Generate a caption for an encoded image, one word at a time.

    Starting from 'startseq', the text produced so far is fed back into
    ``final_model`` and the most probable next word is appended, until
    'endseq' is predicted or ``max_len`` words were generated.

    Args:
        img: The encoded image, passed to ``final_model.predict`` together
            with the padded word-index sequence.

    Returns:
        The generated caption as a string, without the 'startseq' and
        'endseq' markers.
    """
    # Mark start of text
    in_text = 'startseq'

    for _ in range(max_len):

        # Text produced so far as a padded sequence of word indexes
        sequence = [word_to_index[w] for w in in_text.split() if w in word_to_index]
        sequence = pad_sequences([sequence], maxlen = max_len, padding = 'post')

        # Index of the highest probability prediction
        ypred = int(final_model.predict([img, sequence]).argmax())

        # An index without a word (e.g. 0, which is never assigned to a word) cannot
        # continue the sentence; stop instead of raising KeyError
        word = index_to_word.get(ypred)
        if word is None or word == "endseq":
            break

        in_text += (' ' + word)

    # Only 'startseq' has to be removed: 'endseq' is never appended, so the last
    # generated word is kept even when the loop ends by reaching max_len
    legenda_final = " ".join(in_text.split()[1:])

    return legenda_final

In [None]:
# Preview and generation of captions for images

# Style of images: newer matplotlib releases only ship the seaborn style under the
# "seaborn-v0_8" name, so prefer it when available and fall back to the old name otherwise
plt.style.use("seaborn-v0_8" if "seaborn-v0_8" in plt.style.available else "seaborn")

# Number of images to preview
num_previews = 10

# Ids of the encoded test images (built once, outside the loop)
all_img_names = list(encoding_teste.keys())

# Loop through the previews
for _ in range(num_previews):

    # Pick a random position within the images that were actually encoded
    idx = np.random.randint(0, len(all_img_names))

    # Image ID
    img_name = all_img_names[idx]

    # Encoding of the image as a one-row batch for the model
    photo_2048 = encoding_teste[img_name].reshape((1,2048))

    # Load image from disk
    preview_img = plt.imread(image_path + img_name + ".jpg")

    # Subtitle preview
    legenda = caption_generator(photo_2048)

    # Print
    plt.title(legenda)
    plt.imshow(preview_img)
    plt.axis("off")
    plt.show()