In [1]:
gpu_info = !nvidia-smi
gpu_info = '\n'.join(gpu_info)
if gpu_info.find('failed') >= 0:
  print('Not connected to a GPU')
else:
  print(gpu_info)

/bin/bash: line 1: nvidia-smi: command not found


In [2]:
from google.colab import drive
drive.mount('/gdrive')
%cd /gdrive/MyDrive/CSCE642

Mounted at /gdrive
/gdrive/MyDrive/CSCE642


In [3]:
import tensorflow as tf
from tensorflow import keras
from keras import layers

import json

from collections import defaultdict
import torch

import numpy as np

In [4]:
# Load the JSON data from file
with open('fixed_train_val_videodatainfo.json', 'r') as file:
    data = json.load(file)

# Filter videos with "category": 15, d stands for dictionary
cat15_video_d = [video_d for video_d in data['videos'] if video_d['category'] == 15]

# Do something with the filtered videos
video_15_ids = []
for video_d in cat15_video_d:
    video_15_ids.append(video_d['id'])


In [5]:
print(cat15_video_d[0])

{'category': 15, 'url': 'https://www.youtube.com/watch?v=sNQZGRmpPL0', 'video_id': 'video203', 'start time': 24.37, 'end time': 40.49, 'split': 'train', 'id': 203}


In [6]:
# Filter captions for videos with "category": 15
cat15_sent_d = [sent_d for sent_d in data['sentences'] if int(sent_d['video_id'][5:]) in video_15_ids]

print(cat15_sent_d[1])

{'caption': 'a group discusses hillary clinton for president', 'video_id': 'video1919', 'sen_id': 861}


In [7]:
# generate a map videoID: [senID]
videoID_senIDList = defaultdict(list)
for sent_d in cat15_sent_d:
  senID = sent_d['sen_id']
  videoID = int(sent_d['video_id'][5:])
  videoID_senIDList[videoID].append(senID)

In [8]:
print(videoID_senIDList[203])

[133680, 133681, 133682, 133683, 133684, 133685, 133686, 133687, 133688, 133689, 133690, 133691, 133692, 133693, 133694, 133695, 133696, 133697, 133698, 133699]


In [9]:
# generate a map senID : videoID
senID_videoID = {}
for sent_d in cat15_sent_d:
  senID = sent_d['sen_id']
  videoID = int(sent_d['video_id'][5:])
  senID_videoID[senID] = videoID

In [10]:
print(senID_videoID[861])

1919


In [11]:
# map sentence ID: caption
senID_caption = {}
for sent_d in cat15_sent_d:
  senID = sent_d['sen_id']
  senID_caption[senID] = "[start] " + sent_d['caption'] + " [end]"


In [12]:
print(senID_caption[861])

[start] a group discusses hillary clinton for president [end]


In [13]:
print(len(senID_caption))

2240


Note that same caption may show up for different video. Can not use caption as id for dictionary.

In [14]:
# generate states for each caption
senID_curSenList = defaultdict(list)
for senID, caption in senID_caption.items():
  words = caption.split()

  for i in range(1, len(words) + 1):  # it starts from 1 because the first state is "[start]" (empty sentence)
      cur = ' '.join(words[:i])
      senID_curSenList[senID].append(cur)

In [15]:
print(senID_curSenList[861])

['[start]', '[start] a', '[start] a group', '[start] a group discusses', '[start] a group discusses hillary', '[start] a group discusses hillary clinton', '[start] a group discusses hillary clinton for', '[start] a group discusses hillary clinton for president', '[start] a group discusses hillary clinton for president [end]']


**Note the sentences in the above cell**

In [None]:
# want to compute cider scores for each state
# pip install git+https://github.com/vrama91/cider.git
# from cider_scorer import CiderScorer

**Generate the text encoding using layer TextVectorization**

In [16]:
# collect the captions
video_15_captions = []
for caption in senID_caption.values():
    video_15_captions.append(caption)

print(len(video_15_captions))
print(video_15_captions[0])


2240
[start] 4 men with suits on sitting at a table discussing hiliary clinton [end]


In [17]:
# get to know how many vocabulary in all the captions
video_15_captions_split = [captions.split(' ') for captions in video_15_captions]
print(video_15_captions_split[0])
print('Vocabulary size for outputs is ', len(set( [c for c_list in video_15_captions_split for c in c_list] )))
print('The max length of captions is ', max([len(c_list) for c_list in video_15_captions_split]))
print('The average length of captions is ', sum([len(c_list) for c_list in video_15_captions_split]) / len(video_15_captions_split) )


['[start]', '4', 'men', 'with', 'suits', 'on', 'sitting', 'at', 'a', 'table', 'discussing', 'hiliary', 'clinton', '[end]']
Vocabulary size for outputs is  2638
The max length of captions is  52
The average length of captions is  11.650892857142857


In [18]:
# vectorize the captions
text_vectorization = layers.TextVectorization(
    max_tokens=2636 + 2,
    output_mode="int",
    output_sequence_length=50 + 2,
)

text_vectorization.adapt(video_15_captions)


In [19]:
# check the vocabulary
text_vocab = text_vectorization.get_vocabulary()
index_word_lookup = dict(zip(range(len(text_vocab)), text_vocab))
word_index_lookup = dict(zip(text_vocab, range(len(text_vocab))))

print(index_word_lookup)
print(word_index_lookup['start'])

{0: '', 1: '[UNK]', 2: 'a', 3: 'end', 4: 'start', 5: 'the', 6: 'is', 7: 'and', 8: 'of', 9: 'in', 10: 'man', 11: 'on', 12: 'about', 13: 'talking', 14: 'woman', 15: 'with', 16: 'to', 17: 'are', 18: 'people', 19: 'an', 20: 'clinton', 21: 'news', 22: 'playing', 23: 'two', 24: 'person', 25: 'hillary', 26: 'from', 27: 'water', 28: 'video', 29: 'singing', 30: 'by', 31: 'stage', 32: 'band', 33: 'at', 34: 'while', 35: 'some', 36: 'talks', 37: 'speech', 38: 'for', 39: 'giving', 40: 'women', 41: 'music', 42: 'it', 43: 'there', 44: 'mountain', 45: 'around', 46: 'men', 47: 'being', 48: 'speaking', 49: 'show', 50: 'song', 51: 'other', 52: 'that', 53: 'owl', 54: 'his', 55: 'white', 56: 'shown', 57: 's', 58: 'group', 59: 'walking', 60: 'blue', 61: 'into', 62: 'then', 63: 'discussing', 64: 'as', 65: 'performing', 66: 'hilary', 67: 'talk', 68: 'her', 69: 'beautiful', 70: 'trees', 71: 'something', 72: 'showing', 73: 'over', 74: 'black', 75: 'sitting', 76: 'very', 77: 'tv', 78: 'clip', 79: 'through', 80: 

In [20]:
senID_curSenEncodeList = defaultdict(list)
for senID, curSenList in senID_curSenList.items():
  for curSen in curSenList:
    senID_curSenEncodeList[senID].append( text_vectorization(curSen) )

In [21]:
print(senID_curSenEncodeList[861])
print(senID_curSenEncodeList[861][0])

[<tf.Tensor: shape=(52,), dtype=int64, numpy=
array([4, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
       0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
       0, 0, 0, 0, 0, 0, 0, 0])>, <tf.Tensor: shape=(52,), dtype=int64, numpy=
array([4, 2, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
       0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
       0, 0, 0, 0, 0, 0, 0, 0])>, <tf.Tensor: shape=(52,), dtype=int64, numpy=
array([ 4,  2, 58,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,
        0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,
        0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,
        0])>, <tf.Tensor: shape=(52,), dtype=int64, numpy=
array([  4,   2,  58, 132,   0,   0,   0,   0,   0,   0,   0,   0,   0,
         0,   0,   0,   0,   0,   0,   0,   0,   0,   0,   0,   0,   0,
         0,   0,   0,   0,   0,   0,   0,   0,   0,   0,   0,   0,   0,
    

**Genrate Clip Encodings for video frames**

video frames are generated in kaggle, we import the video encodings here.

In [22]:
videoID_encode = {}
for videoID in videoID_senIDList:
  video_encoding = torch.load("./cat15_encode_all_frames/" + "video" + str(videoID) + ".mp4.pt") #, map_location=torch.device('cpu'))

  # transform pytorch tensor to tensorflow tensor
  np_tensor = video_encoding.numpy()
  tf_tensor = tf.convert_to_tensor(np_tensor)

  videoID_encode[videoID] = tf_tensor

In [23]:
print(videoID_encode[203])  #shape = (n of frames, 1, 512)
print(videoID_encode[203][0]) #shape = (1, 512)

tf.Tensor(
[[[ 0.09523945  0.07599887  0.05869189 ...  0.69163275  0.14880472
    0.2557331 ]]

 [[ 0.08898377  0.02902466  0.02410539 ...  0.66155636  0.20100373
    0.26796886]]

 [[ 0.07392123  0.05944297  0.03666098 ...  1.0337691   0.16006297
    0.2670282 ]]

 ...

 [[-0.01295494  0.21560001  0.39212385 ...  1.3051381   0.01797567
    0.01183372]]

 [[ 0.01240445  0.20100361  0.34799287 ...  1.1140565   0.00324896
    0.04376515]]

 [[-0.00676813  0.1886001   0.35180557 ...  1.1540161   0.02242367
    0.04899073]]], shape=(400, 1, 512), dtype=float32)
tf.Tensor(
[[ 9.52394530e-02  7.59988651e-02  5.86918928e-02 -3.83632541e-01
   2.90457547e-01 -1.38108820e-01  1.83076814e-01  9.99676466e-01
  -1.50156230e-01  1.79839000e-01 -6.15241639e-02  1.26153484e-01
   3.66688073e-01  2.74560004e-01 -3.08425222e-02 -2.16545805e-01
  -6.98936820e-01  8.90847594e-02 -1.52132168e-01  2.77514935e-01
  -4.20033038e-01  9.28241480e-03  2.20124498e-02  9.67369974e-02
   5.53051531e-02 -1.60417527

Note that same caption may show up in different video

In [None]:
# # create a map for caption: id


# caption_id_d = {}
# count = 0
# for id, senIDList in videoID_senIDList.items():
#   for caption in senIDList:
#     if caption in caption_id_d:
#       print("bad ", id, " ", caption)
#       count += 1
#     caption_id_d[caption] = id

In [24]:
# sample number of frames according to the length of captions
senID_videoEncodeList = {}
for videoID, encode in videoID_encode.items():
  for senID in videoID_senIDList[videoID]:
    caption_split = senID_caption[senID].split(" ")
    word_count = len(caption_split)

    samples = np.round(np.linspace(0, len(encode) - 1, word_count))
    senID_videoEncodeList[senID] = [encode[int(sample)] for sample in samples]


In [25]:
# print(senID_videoEncodeList[861])
print(len(senID_videoEncodeList[861]))
print(len(senID_caption[861].split(" ")))

9
9


In [26]:
print(len(senID_caption[861].split(" ")))
print(senID_caption[861])

9
[start] a group discusses hillary clinton for president [end]


**Generate Episodes (Input Datasets)**

In [27]:
# Generate episodes

### input dataset
# video: clip encoding
# state at one step = return to go + video frame + current generated sentence = encodings_t
#
### one input = [encodings_t]_t

### output dataset
# one output = [action_t (word_t)]_t encoded using TextVectorization
#
#

In [28]:
total_reward = 20

In [29]:
input_rtg_state_list = []
action_list = []
pairs = []

for senID, caption in senID_caption.items():
  words = caption.split(" ")
  word_count = len(words)

  curSenEncodeList = senID_curSenEncodeList[senID]
  videoEncodeList  = senID_videoEncodeList[senID]
  single_reward = total_reward / (word_count - 1)

  episode_rtg_state_list = []
  episode_action_list = text_vectorization(caption)
  episode_action_list = tf.pad(episode_action_list, paddings=[[0, 565*20-52 + 1]])

  for i in range(min(20, len(curSenEncodeList)-1)):   # didn't skip [start], skip [end] b/c the last state_input is sent w/o [end]

    rtg = single_reward * (word_count - 1 - i)
    flattened_videoEncode = videoEncodeList[i].numpy().flatten()  # Flatten the nested array
    integer_array = curSenEncodeList[i].numpy()

    # Concatenate the rtg, flattened array, and integer array
    rtg_state = np.concatenate(([rtg], flattened_videoEncode, integer_array))
    episode_rtg_state_list.append(rtg_state)

  # add paddings so that they have the same length
  for j in range(len(curSenEncodeList) - 1, 20):
    rtg_state = tf.zeros(shape=(565,), dtype=tf.float64)
    episode_rtg_state_list.append(rtg_state)

  pairs.append((tf.reshape(episode_rtg_state_list, (20*565)), tf.reshape(episode_action_list,(20*565 + 1))))


In [30]:
# prepare the dataset
batch_size = 4

def format_dataset(episode_rtg_states, episode_actions):   # in here, eng means the input texts, spa means the output texts.

    return ({"encoder_input": episode_rtg_states, "decoder_input": episode_actions[:, :-1],}, episode_actions[:, 1:])

def make_dataset(pairs):
    episode_rtg_states, episode_actions = zip(*pairs)
    episode_rtg_states = list(episode_rtg_states)
    episode_actions = list(episode_actions)
    dataset = tf.data.Dataset.from_tensor_slices((episode_rtg_states, episode_actions))
    dataset = dataset.batch(batch_size)
    dataset = dataset.map(format_dataset, num_parallel_calls=4)
    return dataset.shuffle(2048).prefetch(4).cache()

train_ds = make_dataset(pairs)


In [31]:
print(train_ds)
print(train_ds.element_spec)
for element in train_ds.take(3):
    print(element)
    break

<CacheDataset element_spec=({'encoder_input': TensorSpec(shape=(None, 11300), dtype=tf.float64, name=None), 'decoder_input': TensorSpec(shape=(None, 11300), dtype=tf.int64, name=None)}, TensorSpec(shape=(None, 11300), dtype=tf.int64, name=None))>
({'encoder_input': TensorSpec(shape=(None, 11300), dtype=tf.float64, name=None), 'decoder_input': TensorSpec(shape=(None, 11300), dtype=tf.int64, name=None)}, TensorSpec(shape=(None, 11300), dtype=tf.int64, name=None))
({'encoder_input': <tf.Tensor: shape=(4, 11300), dtype=float64, numpy=
array([[20.        ,  0.14346452,  0.48076335, ...,  0.        ,
         0.        ,  0.        ],
       [20.        ,  0.14346452,  0.48076335, ...,  0.        ,
         0.        ,  0.        ],
       [20.        ,  0.14346452,  0.48076335, ...,  0.        ,
         0.        ,  0.        ],
       [20.        ,  0.14346452,  0.48076335, ...,  0.        ,
         0.        ,  0.        ]])>, 'decoder_input': <tf.Tensor: shape=(4, 11300), dtype=int64, 

In [32]:
for inputs, targets in train_ds.take(1):
    print(f"inputs['encoder_input'].shape: {inputs['encoder_input'].shape}")
    print(f"inputs['decoder_input'].shape: {inputs['decoder_input'].shape}")
    print(f"targets.shape: {targets.shape}")
    # print(inputs)

inputs['encoder_input'].shape: (4, 11300)
inputs['decoder_input'].shape: (4, 11300)
targets.shape: (4, 11300)


### **The Transformer Model**

In [33]:
class TransformerEncoder(layers.Layer):
    def __init__(self, embed_dim, dense_dim, num_heads, **kwargs):
        super().__init__(**kwargs)
        self.embed_dim = embed_dim
        self.dense_dim = dense_dim
        self.num_heads = num_heads
        self.attention = layers.MultiHeadAttention(
            num_heads=num_heads, key_dim=embed_dim)
        self.dense_proj = keras.Sequential(
            [layers.Dense(dense_dim, activation="relu"),
             layers.Dense(embed_dim),]
        )
        self.layernorm_1 = layers.LayerNormalization()
        self.layernorm_2 = layers.LayerNormalization()

    def call(self, inputs, mask=None):
        if mask is not None:
            mask = mask[:, tf.newaxis, :]
        attention_output = self.attention(
            inputs, inputs, attention_mask=mask)
        proj_input = self.layernorm_1(inputs + attention_output)
        proj_output = self.dense_proj(proj_input)
        return self.layernorm_2(proj_input + proj_output)

    def get_config(self):
        config = super().get_config()
        config.update({
            "embed_dim": self.embed_dim,
            "num_heads": self.num_heads,
            "dense_dim": self.dense_dim,
        })
        return config

In [34]:
class PositionalEmbedding(layers.Layer):
    def __init__(self, sequence_length, input_dim, output_dim, **kwargs):
        super().__init__(**kwargs)
        self.token_embeddings = layers.Embedding(
            input_dim=input_dim, output_dim=output_dim)
        self.position_embeddings = layers.Embedding(
            input_dim=sequence_length, output_dim=output_dim)
        self.sequence_length = sequence_length
        self.input_dim = input_dim
        self.output_dim = output_dim

    def call(self, inputs):
        length = tf.shape(inputs)[-1]
        positions = tf.range(start=0, limit=length, delta=1)
        embedded_tokens = self.token_embeddings(inputs)
        embedded_positions = self.position_embeddings(positions)
        return embedded_tokens + embedded_positions

    def compute_mask(self, inputs, mask=None):
        return tf.math.not_equal(inputs, 0)

    def get_config(self):
        config = super(PositionalEmbedding, self).get_config()
        config.update({
            "output_dim": self.output_dim,
            "sequence_length": self.sequence_length,
            "input_dim": self.input_dim,
        })
        return config

In [35]:
class TransformerDecoder1(layers.Layer):
    def __init__(self, embed_dim, dense_dim, num_heads, **kwargs):
        super().__init__(**kwargs)
        self.embed_dim = embed_dim
        self.dense_dim = dense_dim
        self.num_heads = num_heads
        self.attention_1 = layers.MultiHeadAttention(
            num_heads=num_heads, key_dim=embed_dim)
        self.attention_2 = layers.MultiHeadAttention(
            num_heads=num_heads, key_dim=embed_dim)
        self.dense_proj = keras.Sequential(
            [layers.Dense(dense_dim, activation="relu"),
             layers.Dropout(0.5),                         # add a dropout layer to avoid overfitting
             layers.Dense(embed_dim),]
        )
        self.layernorm_1 = layers.LayerNormalization()
        self.layernorm_2 = layers.LayerNormalization()
        self.layernorm_3 = layers.LayerNormalization()
        self.supports_masking = True

    def get_config(self):
        config = super().get_config()
        config.update({
            "embed_dim": self.embed_dim,
            "num_heads": self.num_heads,
            "dense_dim": self.dense_dim,
        })
        return config

    def get_causal_attention_mask(self, inputs):
        input_shape = tf.shape(inputs)
        batch_size, sequence_length = input_shape[0], input_shape[1]
        i = tf.range(sequence_length)[:, tf.newaxis]
        j = tf.range(sequence_length)
        mask = tf.cast(i >= j, dtype="int32")
        mask = tf.reshape(mask, (1, input_shape[1], input_shape[1]))
        mult = tf.concat(
            [tf.expand_dims(batch_size, -1),
             tf.constant([1, 1], dtype=tf.int32)], axis=0)
        return tf.tile(mask, mult)

    def call(self, inputs, encoder_outputs, mask=None):
        causal_mask = self.get_causal_attention_mask(inputs)
        if mask is not None:
            padding_mask = tf.cast(
                mask[:, tf.newaxis, :], dtype="int32")
            padding_mask = tf.minimum(padding_mask, causal_mask)
        attention_output_1 = self.attention_1(
            query=inputs,
            value=inputs,
            key=inputs,
            attention_mask=causal_mask)
        attention_output_1 = self.layernorm_1(inputs + attention_output_1)
        attention_output_2 = self.attention_2(
            query=attention_output_1,
            value=encoder_outputs,
            key=encoder_outputs,
            attention_mask=padding_mask,
        )
        attention_output_2 = self.layernorm_2(
            attention_output_1 + attention_output_2)
        proj_output = self.dense_proj(attention_output_2)
        return self.layernorm_3(attention_output_2 + proj_output)

In [37]:
print(train_ds)

<CacheDataset element_spec=({'encoder_input': TensorSpec(shape=(None, 11300), dtype=tf.float64, name=None), 'decoder_input': TensorSpec(shape=(None, 11300), dtype=tf.int64, name=None)}, TensorSpec(shape=(None, 11300), dtype=tf.int64, name=None))>


In [38]:
embed_dim = 565 * 2
dense_dim = 565 * 2
num_heads = 8

encoder_inputs = keras.Input(shape=(None,), dtype="float64", name="encoder_input")
x = PositionalEmbedding(11300, 1024, embed_dim)(encoder_inputs)
encoder_outputs = TransformerEncoder(embed_dim, dense_dim, num_heads)(x)

decoder_inputs = keras.Input(shape=(None,), dtype="int64", name="decoder_input")
x = PositionalEmbedding(11301, 2638, embed_dim)(decoder_inputs)
x = TransformerDecoder1(embed_dim, dense_dim, num_heads)(x, encoder_outputs)
x = layers.Dropout(0.5)(x)
decoder_outputs = layers.Dense(2638, activation="softmax")(x)
transformer = keras.Model([encoder_inputs, decoder_inputs], decoder_outputs)

In [None]:
transformer.compile(
    optimizer="rmsprop",
    loss="sparse_categorical_crossentropy",
    metrics=["accuracy"])
transformer.fit(train_ds, epochs=10, \
                callbacks = keras.callbacks.ModelCheckpoint(
          filepath="model3.keras",
          save_best_only=True,),)

Epoch 1/10
