<h1>Real-Time Video Captioning</h1>

<h3>Importing Libraries and Dependecies</h3>

In [50]:
import os
import time
import sys
import string
import pickle

import numpy as np
import pandas as pd
import cv2
import matplotlib.pyplot as plt

import nltk
from nltk.tokenize import word_tokenize
from nltk.stem import WordNetLemmatizer
from nltk.corpus import stopwords

from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.preprocessing import OneHotEncoder

from tensorflow.keras.utils import to_categorical
from keras.applications.xception import Xception

from tensorflow.keras.models import Model
from tensorflow.keras.utils import to_categorical, plot_model
from tensorflow.keras.layers import Input, Dense, LSTM, Embedding, Dropout, add, Concatenate, Flatten, Reshape
from tensorflow.keras.optimizers import Adam
from tensorflow.keras.utils import pad_sequences

<h3>Data Extraction</h3>

In [11]:
df = pd.read_csv("captions.txt", nrows=10000)

In [12]:
df.head(6)

Unnamed: 0,image,caption
0,1000268201_693b08cb0e.jpg,A child in a pink dress is climbing up a set o...
1,1000268201_693b08cb0e.jpg,A girl going into a wooden building .
2,1000268201_693b08cb0e.jpg,A little girl climbing into a wooden playhouse .
3,1000268201_693b08cb0e.jpg,A little girl climbing the stairs to her playh...
4,1000268201_693b08cb0e.jpg,A little girl in a pink dress going into a woo...
5,1001773457_577c3a7d70.jpg,A black dog and a spotted dog are fighting


<h3>Data Exploratory Analysis</h3>

In [13]:
df.shape

(10000, 2)

In [5]:
df.columns

Index(['image', 'caption'], dtype='object')

In [6]:
df.dtypes

image      object
caption    object
dtype: object

In [6]:
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 40455 entries, 0 to 40454
Data columns (total 2 columns):
 #   Column   Non-Null Count  Dtype 
---  ------   --------------  ----- 
 0   image    40455 non-null  object
 1   caption  40455 non-null  object
dtypes: object(2)
memory usage: 632.2+ KB


In [14]:
df.describe()

Unnamed: 0,image,caption
count,10000,10000
unique,2000,9977
top,1000268201_693b08cb0e.jpg,Two dogs play in the grass .
freq,5,3


<h3>Data Preprocessing Steps</h3>

<h4>1. Validation and Cleansing</h4>

In [9]:
df.isna().sum()

image      0
caption    0
dtype: int64

In [8]:
df.duplicated().sum()

10

In [9]:
df.drop_duplicates(inplace=True)

In [10]:
df.duplicated().sum()

0

<h4>2. Image Resizing and Reshapping</h4>

In [15]:
images = []
for counter, img in enumerate(df['image']):
    image = cv2.imread(f"../dataset/images/{img}")
    image_rgb = cv2.cvtColor(image, cv2.COLOR_BGR2RGB)
    resized_image = cv2.resize(image_rgb, (224, 224))
    images.append(resized_image)
    
    if counter % 1000 == 0:
        print(f"Processing {counter} of {df['image'].shape[0]}...\n")

Processing 0 of 10000...

Processing 1000 of 10000...

Processing 2000 of 10000...

Processing 3000 of 10000...

Processing 4000 of 10000...

Processing 5000 of 10000...

Processing 6000 of 10000...

Processing 7000 of 10000...

Processing 8000 of 10000...

Processing 9000 of 10000...



In [16]:
len(images)

10000

In [17]:
# np.save("images_array.npy", np.array(images))

In [18]:
images = np.load("../dataset/images_array.npy")

In [19]:
images.shape

(10000, 224, 224, 3)

<h4>3. Caption Normalization</h4>

In [20]:
df['caption'] = df['caption'].str.lower()

In [21]:
df.head(3)

Unnamed: 0,image,caption
0,1000268201_693b08cb0e.jpg,a child in a pink dress is climbing up a set o...
1,1000268201_693b08cb0e.jpg,a girl going into a wooden building .
2,1000268201_693b08cb0e.jpg,a little girl climbing into a wooden playhouse .


In [22]:
df['caption'] = df['caption'].str.translate(str.maketrans('', '', string.punctuation))

In [23]:
tokenized_caption = [word_tokenize(caption) for caption in df['caption']]

In [24]:
stop_words = stopwords.words('english')

In [25]:
filtered_caption = [[word for word in caption if word not in stop_words] for caption in tokenized_caption]

In [26]:
vocab = {}
for caption_tokens in filtered_caption:
    for token in caption_tokens:
        vocab[token] = vocab.get(token, 0) + 1

In [27]:
vocab_size = len(vocab)

In [28]:
vocab_size

4437

In [29]:
tokenized_tokens = [[vocab.get(word) for word in caption] for caption in filtered_caption]

In [30]:
len(tokenized_tokens)

10000

In [31]:
max_length = np.max([len(caption_tokens) for caption_tokens in filtered_caption])
max_length

21

In [32]:
padded_tokens = pad_sequences(tokenized_tokens, maxlen=max_length, padding="pre")

In [33]:
df = pd.DataFrame(padded_tokens)
pd.get_dummies(df)

Unnamed: 0,0,1,2,3,4,5,6,7,8,9,...,11,12,13,14,15,16,17,18,19,20
0,0,0,0,0,0,0,0,0,0,0,...,0,0,486,180,78,189,20,19,1,23
1,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,939,34,81,84
2,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,556,939,189,81,4
3,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,556,939,189,19,4
4,0,0,0,0,0,0,0,0,0,0,...,0,0,0,556,939,180,78,34,81,2
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
9995,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,1150,601,464,77,26
9996,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,1150,601,464,12,26
9997,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,1150,24,26
9998,0,0,0,0,0,0,0,0,0,0,...,0,0,0,1150,22,26,16,31,11,32


In [34]:
padded_tokens.shape

(10000, 21)

In [35]:
# target_captions = to_categorical(padded_tokens[:32], num_classes=vocab_size)

In [36]:
# Function to load GloVe embeddings into a dictionary
def load_glove_embeddings(file_path):
    embeddings_index = {}
    with open(file_path, encoding='utf-8') as file:
        for line in file:
            values = line.split()
            word = values[0]
            coefs = np.asarray(values[1:], dtype='float32')
            embeddings_index[word] = coefs
    return embeddings_index

# Provide the path to your downloaded GloVe file
glove_file_path = '../dataset/embed/glove.6B.100d.txt'  # Change the path and dimensionality accordingly

# Load GloVe embeddings
glove_embeddings = load_glove_embeddings(glove_file_path)

In [37]:
# Create an embedding matrix for your vocabulary
embedding_dim = 100  # Change the dimensionality based on your GloVe model

embedding_matrix = np.zeros((vocab_size, embedding_dim))
for word, i in vocab.items():
    if i < vocab_size:
        embedding_vector = glove_embeddings.get(word)
        if embedding_vector is not None:
            embedding_matrix[i] = embedding_vector


In [40]:
embedding_matrix

array([[ 0.        ,  0.        ,  0.        , ...,  0.        ,
         0.        ,  0.        ],
       [-0.30004001, -0.24823999,  0.41365001, ...,  0.41650999,
        -0.65367001, -0.20812   ],
       [ 0.44822001,  0.48396   , -0.37865001, ...,  0.44064   ,
        -0.055766  ,  0.039926  ],
       ...,
       [ 0.        ,  0.        ,  0.        , ...,  0.        ,
         0.        ,  0.        ],
       [ 0.        ,  0.        ,  0.        , ...,  0.        ,
         0.        ,  0.        ],
       [ 0.        ,  0.        ,  0.        , ...,  0.        ,
         0.        ,  0.        ]])

In [41]:
embedding_matrix.shape

(4437, 100)

# Model Development

In [51]:
img_model = Xception( include_top=False, pooling='avg' )

In [52]:
img_model.summary()

Model: "xception"
__________________________________________________________________________________________________
 Layer (type)                Output Shape                 Param #   Connected to                  
 input_2 (InputLayer)        [(None, None, None, 3)]      0         []                            
                                                                                                  
 block1_conv1 (Conv2D)       (None, None, None, 32)       864       ['input_2[0][0]']             
                                                                                                  
 block1_conv1_bn (BatchNorm  (None, None, None, 32)       128       ['block1_conv1[0][0]']        
 alization)                                                                                       
                                                                                                  
 block1_conv1_act (Activati  (None, None, None, 32)       0         ['block1_conv1_bn[0][0]

                                                                                                  
 block4_sepconv2_act (Activ  (None, None, None, 728)      0         ['block4_sepconv1_bn[0][0]']  
 ation)                                                                                           
                                                                                                  
 block4_sepconv2 (Separable  (None, None, None, 728)      536536    ['block4_sepconv2_act[0][0]'] 
 Conv2D)                                                                                          
                                                                                                  
 block4_sepconv2_bn (BatchN  (None, None, None, 728)      2912      ['block4_sepconv2[0][0]']     
 ormalization)                                                                                    
                                                                                                  
 conv2d_2 

                                                                                                  
 block7_sepconv1 (Separable  (None, None, None, 728)      536536    ['block7_sepconv1_act[0][0]'] 
 Conv2D)                                                                                          
                                                                                                  
 block7_sepconv1_bn (BatchN  (None, None, None, 728)      2912      ['block7_sepconv1[0][0]']     
 ormalization)                                                                                    
                                                                                                  
 block7_sepconv2_act (Activ  (None, None, None, 728)      0         ['block7_sepconv1_bn[0][0]']  
 ation)                                                                                           
                                                                                                  
 block7_se

 ormalization)                                                                                    
                                                                                                  
 add_7 (Add)                 (None, None, None, 728)      0         ['block9_sepconv3_bn[0][0]',  
                                                                     'add_6[0][0]']               
                                                                                                  
 block10_sepconv1_act (Acti  (None, None, None, 728)      0         ['add_7[0][0]']               
 vation)                                                                                          
                                                                                                  
 block10_sepconv1 (Separabl  (None, None, None, 728)      536536    ['block10_sepconv1_act[0][0]']
 eConv2D)                                                                                         
          

 block12_sepconv3_act (Acti  (None, None, None, 728)      0         ['block12_sepconv2_bn[0][0]'] 
 vation)                                                                                          
                                                                                                  
 block12_sepconv3 (Separabl  (None, None, None, 728)      536536    ['block12_sepconv3_act[0][0]']
 eConv2D)                                                                                         
                                                                                                  
 block12_sepconv3_bn (Batch  (None, None, None, 728)      2912      ['block12_sepconv3[0][0]']    
 Normalization)                                                                                   
                                                                                                  
 add_10 (Add)                (None, None, None, 728)      0         ['block12_sepconv3_bn[0][0]', 
          

In [53]:
# img_features = img_model.predict(images)



In [54]:
# np.save("img_features.npy", img_features)

In [57]:
img_features = np.load("../dataset/img_features.npy")

In [None]:
# img_features.shape

In [63]:
img_model.output.shape[1]

2048

In [73]:
image_input = Input(shape=img_model.output.shape[1],)
dropout_image = Dropout(rate=0.2) (image_input)
dense_image = Dense(256, activation="relu")(flatten_image)

caption_input = Input(shape=(max_length,))
embedding_layer = Embedding(input_dim=vocab_size, output_dim=embedding_dim, input_length=max_length,
                           weights=[embedding_matrix], trainable=False)(caption_input)
lstm = LSTM(256, return_sequences=True)(embedding_layer)


merged_layer = add([dense_image, lstm])
decoder = Dense(256, activation='relu')(merged_layer)
output = Dense(vocab_size, activation='softmax')(lstm)

# Create the caption generation model
caption_model = Model(inputs=[image_input, caption_input], outputs=output)

caption_model.compile(optimizer='adam', loss='categorical_crossentropy', metrics=['accuracy'])

caption_model.summary()


Model: "model_3"
__________________________________________________________________________________________________
 Layer (type)                Output Shape                 Param #   Connected to                  
 input_11 (InputLayer)       [(None, 21)]                 0         []                            
                                                                                                  
 embedding_2 (Embedding)     (None, 21, 100)              443700    ['input_11[0][0]']            
                                                                                                  
 lstm_2 (LSTM)               (None, 21, 256)              365568    ['embedding_2[0][0]']         
                                                                                                  
 input_10 (InputLayer)       [(None, 2048)]               0         []                            
                                                                                            

In [74]:
def train_model(model, inputs, target, batch_size=32, epochs=5):
    print("Model training started...\n")
    for epoch in range(epochs):
        start = 0
        end = batch_size
        
        total_batch_count = inputs[0].shape[0] // batch_size
        history = None
        for iters in range(total_batch_count):            
            target_batch = to_categorical(padded_tokens[start:end], num_classes=vocab_size)
            inp1 = inputs[0][start:end]
            inp2 = inputs[1][start:end]
            history = model.fit([inp1, inp2], target_batch, verbose=0, batch_size=batch_size, validation_split=0.15)
            start = end
            end = start + batch_size
            
        data = history.history
        print(f"Epoch {epoch + 1} of {epochs}:\tLoss: {data['loss']}\tAccuracy: {data['accuracy']}")
    print("\nModel training completed!!")
    return model

In [75]:
model = train_model(caption_model, [img_features, padded_tokens], padded_tokens, epochs=10)

Model training started...

Epoch 1 of 10:	Loss: [1.3293161392211914]	Accuracy: [0.7989417910575867]
Epoch 2 of 10:	Loss: [0.47866809368133545]	Accuracy: [0.9964726567268372]
Epoch 3 of 10:	Loss: [0.14594675600528717]	Accuracy: [1.0]
Epoch 4 of 10:	Loss: [0.02975761517882347]	Accuracy: [1.0]
Epoch 5 of 10:	Loss: [0.013669713400304317]	Accuracy: [1.0]
Epoch 6 of 10:	Loss: [0.00812610611319542]	Accuracy: [1.0]
Epoch 7 of 10:	Loss: [0.005410151556134224]	Accuracy: [1.0]
Epoch 8 of 10:	Loss: [0.0038291814271360636]	Accuracy: [1.0]
Epoch 9 of 10:	Loss: [0.002821696689352393]	Accuracy: [1.0]
Epoch 10 of 10:	Loss: [0.0021416358649730682]	Accuracy: [1.0]

Model training completed!!


In [77]:
with open("../dataset/caption_model.pkl", "wb") as model_file:
    pickle.dump(model, model_file)

In [None]:
sample_img = cv2.imread(f"../dataset/sample_img/sample1.jpg")
img_rgb = cv2.cvtColor(sample_img, cv2.COLOR_BGR2RGB)
resized_img = cv2.resize(img_rgb, (224, 224))

In [None]:
np.array([resized_img]).shape

In [None]:
resized_image_batch = np.expand_dims(resized_image, axis=0)
img_f = img_model.predict(resized_image_batch)

In [None]:

# Initialize an empty caption
captions = []
max_caption_length = 100

# Token for the start of the sequence
start_token = vocab['index']

# Initialize the seed caption with the start token
seed_caption_tokens = [start_token]

sequences = pad_sequences([seed_caption_tokens], maxlen=max_length, padding="pre")

In [None]:
# Generate captions iteratively
for _ in range(max_caption_length):

    # Predict the next word based on the current seed caption tokens and image features
    predictions = caption_model.predict([img_f, sequences])

    # Get the index of the predicted word
    predicted_index = np.argmax(predictions)

    # Append the predicted word to the caption
    captions.append(predicted_index)

In [None]:
p = []
for c in captions:
    for k, v in vocab.items():
        print(c, v)
        if c == v:
            p.append(k)

In [None]:
p