# <font size=6 color='green'><center>**Dynamic Architecture**</center></font>
### **<center>Part 2<br/>**

This file contains the code to train the dynmaic network architecture to learn the modality common network. The network trained here consists of five inputs (face1, face2, video1, video2, audio) and one output label, where label = 0 if audio matches with face1, video1 and label = 1 if audio matches with face2, video2

In [6]:
# importing required libraries
import tensorflow as tf
from tensorflow import keras
from keras import layers
import matplotlib.pyplot as plt

Loading the data using tensorflow data generators for efficient memory usage

In [2]:
# function that parses the tf record
def parser(tfRecord):
   keys_to_features = {
        "img_raw1": tf.io.FixedLenFeature([], tf.string),
        "img_raw2": tf.io.FixedLenFeature([], tf.string),
        "video_raw1": tf.io.FixedLenFeature([], tf.string),
        "video_raw2": tf.io.FixedLenFeature([], tf.string),
        "audio_raw": tf.io.FixedLenFeature([], tf.string),
        "label":     tf.io.FixedLenFeature([], tf.int64)
    }
   
   parsed = tf.io.parse_single_example(tfRecord, keys_to_features)
   image1 = tf.io.parse_tensor(parsed['image1_raw'], out_type=tf.float64)
   image2 = tf.io.parse_tensor(parsed['image2_raw'], out_type=tf.float64)
   video1 = tf.io.parse_tensor(parsed['video_raw1'], out_type=tf.float64)
   video2 = tf.io.parse_tensor(parsed['video_raw2'], out_type=tf.float64)
   audio = tf.io.parse_tensor(parsed['audio_raw'], out_type=tf.double)
   audio = tf.expand_dims(audio,axis=2)
   label = tf.cast(parsed['label'], tf.int32)
   label = tf.one_hot(label,2)
   return {'faceInput1':image1,'faceInput2':image2,'videoInput1':video1,'videoInput2':video2,'voiceInput':audio}, label
 
# function to load dataset from the tfrecords file
def get_train_data(filenames):
  dataset = tf.data.TFRecordDataset(filenames=filenames, num_parallel_reads=40)
  dataset = dataset.map(parser, num_parallel_calls=12)
  dataset = dataset.batch(batch_size=32)
  dataset = dataset.prefetch(buffer_size=2)
  return dataset

In [None]:
# change this path to the path file of tensorflow records
inputDataPath = 'train2.tfrecords'
trainDataset = get_train_data(inputDataPath)

In [None]:
# show a sample images and mfccs
X_data, labels = next(iter(trainDataset))

# plotting the image input
plt.imshow(X_data['faceInput1'][0])
plt.show()
plt.imshow(X_data['faceInput2'][0])
plt.show()

mfcc=X_data['voiceInput'][0]
print(f"Shape of the mfcc co-efficients: {mfcc.shape}")

In [4]:
# defining the face and voice subnetworks as sequential models
faceSubnet = keras.models.Sequential()

faceSubnet.add(keras.layers.Conv2D(filters=96, kernel_size=(7,7), trainable=False, strides=(2,2), padding="same", activation="relu", name='flayer1i',input_shape=(224,224,3)))

faceSubnet.add(keras.layers.MaxPool2D(pool_size=(2,2),padding="valid",name="flayer1o", trainable=False))

faceSubnet.add(keras.layers.Conv2D(filters=256, kernel_size=(5,5), trainable=False, strides=(2,2), padding="same", activation="relu",name='flayer2i'))

faceSubnet.add(keras.layers.MaxPool2D(pool_size=(2,2), trainable=False, padding="valid",name='flayer2o'))

faceSubnet.add(keras.layers.Conv2D(filters=256, kernel_size=(3,3), trainable=False, strides=(1,1),padding="same", activation="relu",name='flayer3i'))

faceSubnet.add(keras.layers.MaxPool2D(pool_size=(2,2), trainable=False, padding="valid",name='flayer3o'))

faceSubnet.add(keras.layers.Dense(units=4096, trainable=False, activation='relu',name='flayer4i'))

faceSubnet.add(keras.layers.Flatten(name='flayer40', trainable=False))

faceSubnet.add(keras.layers.Dense(units=1024, trainable=False, activation='relu',name='flayer5i'))

# defining the voice subnet

voiceSubnet = keras.models.Sequential()

voiceSubnet.add(keras.layers.Conv2D(filters=96, kernel_size=(7,7), strides=(2,2), padding="same", activation="relu",name='vlayer1i',input_shape=(20,130,1)))

voiceSubnet.add(keras.layers.MaxPool2D(pool_size=(2,2),padding="valid",name='vlayer1o'))

voiceSubnet.add(keras.layers.Conv2D(filters=256, kernel_size=(5,5), strides=(2,2), padding="same", activation="relu",name='vlayer2i'))

voiceSubnet.add(keras.layers.MaxPool2D(pool_size=(2,2), padding="valid",name='vlayer2o'))

voiceSubnet.add(keras.layers.Conv2D(filters=256, kernel_size=(3,3), strides=(1,1),padding="same", activation="relu",name='vlayer3i'))

voiceSubnet.add(keras.layers.MaxPool2D(pool_size=(1,2), padding="valid",name='vlayer3o'))

voiceSubnet.add(keras.layers.Dense(units=4096, activation='relu',name='vlayer4i'))

voiceSubnet.add(keras.layers.Flatten(name='valyer4o'))

voiceSubnet.add(keras.layers.Dense(units=1024, activation='relu',name='vlayer5i'))

In [5]:
class Conv3D(keras.layers.Layer):
  # name - name given to the sequential model, used to load pre-trained weights
  def __init__(self, filters, kernel_size, padding,name=None):
    """
      A sequence of convolutional layers that first apply the convolution operation over the
      spatial dimensions, and then the temporal dimension. 
    """
    super().__init__()
    self.seq = keras.Sequential([  
        # Spatial decomposition
        layers.Conv3D(filters=filters,
                      kernel_size=(1, kernel_size[1], kernel_size[2]),
                      padding=padding),
        # Temporal decomposition
        layers.Conv3D(filters=filters, 
                      kernel_size=(kernel_size[0], 1, 1),
                      padding=padding)
        ],name=name)
  
  def call(self, x):
    return self.seq(x)
  
class ResidualMain(keras.layers.Layer):
  """
    Residual block of the model with convolution, layer normalization, and the
    activation function, ReLU.
  """
  def __init__(self, filters, kernel_size,name):
    super().__init__()
    self.seq = keras.Sequential([
        Conv3D(filters=filters,
                    kernel_size=kernel_size,
                    padding='same'),
        layers.LayerNormalization(),
        layers.ReLU(),
        Conv3D(filters=filters, 
                    kernel_size=kernel_size,
                    padding='same'),
        layers.LayerNormalization()
    ], name=name)
    
  def call(self, x):
    return self.seq(x)

In [7]:
videoSubnet = keras.models.Sequential(name='videoSubnet')

videoSubnet.add(Conv3D(filters=32, kernel_size=(3, 7, 7), padding='same',name='videolayer1'))
videoSubnet.add(layers.BatchNormalization())
videoSubnet.add(layers.ReLU())

videoSubnet.add(ResidualMain(64, (3,3,3),name='videolayer2'))
videoSubnet.add(layers.ReLU())

videoSubnet.add(ResidualMain(32, (3,3,3),name='videolayer3'))
videoSubnet.add(layers.ReLU())

videoSubnet.add(ResidualMain(64, (3,3,3),name='videolayer4'))
videoSubnet.add(layers.ReLU())
videoSubnet.add(layers.GlobalAveragePooling3D())
videoSubnet.add(layers.Flatten())
videoSubnet.add(layers.Dense(1024, activation='relu', name='vidoelyaer5'))


In [9]:
# load the layer weights on the basis of layer names
# from the model built in part 1
faceSubnet.load_weights('dynamicModel1.h5',by_name=True)
voiceSubnet.load_weights('dynamicModel1.h5',by_name= True)
videoSubnet.load_weights('dynamicModel1.h5',by_name=True)
faceSubnet.trainable=False
voiceSubnet.trainable=False
videoSubnet.trainable=False

ValueError: Unable to load weights saved in HDF5 format into a subclassed Model which has not created its variables yet. Call the Model first, then load the weights.

In [11]:
Num_Frames_Per_Video = 10
faceInput1 = keras.layers.Input(shape=(224,224,3), name='faceInput1')
faceInput2 = keras.layers.Input(shape=(224,224,3), name='faceInput2')
videoInput1 = keras.layers.Input(shape=(Num_Frames_Per_Video, 224,224,3), name='videoInput1')
videoInput2 = keras.layers.Input(shape=(Num_Frames_Per_Video, 224,224,3), name='videoInput2')
voiceInput = keras.layers.Input(shape=(20,130,1), name='voiceInput')
faceFeatures1 = faceSubnet(faceInput1)
faceFeatures2 = faceSubnet(faceInput2)
videoFeatures1 = videoSubnet(videoInput1)
videoFeatures2 = videoSubnet(videoInput2)
voiceFeatures = voiceSubnet(voiceInput)

In [12]:
concatenatedLayers = keras.layers.concatenate([faceFeatures1, faceFeatures2,videoFeatures1, videoFeatures2, voiceFeatures])
clayer1 = keras.layers.Dense(units=2048, activation='relu',name='clayer1')
clayer2 = keras.layers.Dense(units=512, activation='relu', name='clayer2')
clayer3 = keras.layers.Dense(units=2, activation='sigmoid',name='clayer3')
finalOutput = clayer3(clayer2(clayer1(concatenatedLayers)))

In [13]:
model = keras.Model(
  inputs = [faceInput1, faceInput2, videoInput1, videoInput2, voiceFeatures],
  outputs = finalOutput
)

In [14]:
model.compile()

In [15]:
model.summary()

Model: "model"
__________________________________________________________________________________________________
 Layer (type)                   Output Shape         Param #     Connected to                     
 faceInput1 (InputLayer)        [(None, 224, 224, 3  0           []                               
                                )]                                                                
                                                                                                  
 faceInput2 (InputLayer)        [(None, 224, 224, 3  0           []                               
                                )]                                                                
                                                                                                  
 videoInput1 (InputLayer)       [(None, 10, 224, 22  0           []                               
                                4, 3)]                                                        

Final architecture looks like this

In [16]:
keras.utils.plot_model(model, show_shapes=True)

You must install pydot (`pip install pydot`) and install graphviz (see instructions at https://graphviz.gitlab.io/download/) for plot_model to work.


In [None]:
model.compile(
    optimizer = keras.optimizers.Adam(),
    loss = keras.losses.CategoricalCrossentropy(),
    metrics = ['accuracy']
)

Saving the model

In [None]:
model.save_weights('dynamicModel2.h5')

---