# <font size=6 color='green'><center>**Dynamic Architecture**</center></font>
### <center>**Part 1**</center><br/>

This file contains the code to train the dynamic network architecture that learns the modality-specific weights. The network trained here takes three inputs (face, voice, video) and produces one output label: the label is 1 if the face, video, and voice all belong to the same person, and 0 otherwise.

In [1]:
# importing required libraries
import pandas as pd
import numpy as np
from tensorflow import keras
from keras import layers
import tensorflow as tf

Loading the data with a TensorFlow `tf.data` input pipeline (streamed from TFRecords) for efficient memory usage

In [23]:
# function that parses the tf record
def parser(tfRecord):
    """Decode one serialized example into the model's inputs and its label.

    Args:
        tfRecord: a single serialized ``tf.train.Example`` (scalar string
            tensor) carrying the features ``image_raw``, ``video_raw``,
            ``audio_raw`` (serialized float64 tensors) and ``label`` (int64).

    Returns:
        A ``(inputs, label)`` pair. ``inputs`` is a dict keyed by
        ``'faceInput'``, ``'voiceInput'`` and ``'videoInput'``; ``label`` is
        the integer label one-hot encoded over 2 classes.
    """
    feature_spec = {
        "image_raw": tf.io.FixedLenFeature([], tf.string),
        "video_raw": tf.io.FixedLenFeature([], tf.string),
        "audio_raw": tf.io.FixedLenFeature([], tf.string),
        "label": tf.io.FixedLenFeature([], tf.int64),
    }
    example = tf.io.parse_single_example(tfRecord, feature_spec)

    # Every modality was stored as a serialized float64 tensor.
    face = tf.io.parse_tensor(example['image_raw'], out_type=tf.float64)
    voice = tf.io.parse_tensor(example['audio_raw'], out_type=tf.float64)
    clip = tf.io.parse_tensor(example['video_raw'], out_type=tf.float64)

    # Append an axis at index 2 to the audio features.
    # NOTE(review): presumably this turns a 2-D MFCC matrix into a
    # single-channel "image" for the Conv2D voice subnetwork - confirm.
    voice = tf.expand_dims(voice, axis=2)

    one_hot_label = tf.one_hot(tf.cast(example['label'], tf.int32), 2)

    model_inputs = {'faceInput': face, 'voiceInput': voice, 'videoInput': clip}
    return model_inputs, one_hot_label
 
# function to load dataset from the tfrecords file
def get_train_data(filenames, *, batch_size=32, num_parallel_reads=40,
                   num_parallel_calls=12, prefetch_buffer=2):
    """Build the batched ``tf.data`` training pipeline from TFRecord files.

    The defaults reproduce the previously hard-coded settings, so existing
    calls of the form ``get_train_data(path)`` behave exactly as before.

    Args:
        filenames: path (or collection of paths) of the TFRecord file(s).
        batch_size: number of examples per batch.
        num_parallel_reads: number of files read in parallel.
        num_parallel_calls: number of records decoded in parallel by
            ``parser``.
        prefetch_buffer: number of batches prepared ahead of the consumer.

    Returns:
        A ``tf.data.Dataset`` yielding ``(inputs_dict, one_hot_label)``
        batches as produced by ``parser``.
    """
    dataset = tf.data.TFRecordDataset(
        filenames=filenames, num_parallel_reads=num_parallel_reads
    )
    dataset = dataset.map(parser, num_parallel_calls=num_parallel_calls)
    dataset = dataset.batch(batch_size=batch_size)
    # Overlap input decoding with training on the previous batch.
    dataset = dataset.prefetch(buffer_size=prefetch_buffer)
    return dataset

In [24]:
# Path of the training TFRecords file. Change this to wherever the file lives
# in your environment (the value below is a Kaggle input path).
inputDataPath = '/kaggle/input/tf-records-for-data/train.tfrecords'
# Build the input pipeline. The pipeline is lazy: the file is only opened when
# the dataset is first iterated, so a wrong path surfaces later as a
# NotFoundError rather than on this line.
trainDataset = get_train_data(inputDataPath)

In [25]:
# Show one sample face image and the shape of its MFCC features.
# matplotlib is imported here because the notebook's import cell does not
# provide `plt`; without it this cell fails with a NameError.
import matplotlib.pyplot as plt

# Pull a single batch from the training pipeline.
X_data, labels = next(iter(trainDataset))

# Plot the first face image of the batch.
plt.imshow(X_data['faceInput'][0])
plt.show()

# Report the shape of the first voice sample's MFCC features.
mfcc = X_data['voiceInput'][0]
print(f"Shape of the mfcc co-efficients: {mfcc.shape}")

NotFoundError: {{function_node __wrapped__IteratorGetNext_output_types_4_device_/job:localhost/replica:0/task:0/device:CPU:0}} NewRandomAccessFile failed to Create/Open: /kaggle/input/tf-records-for-data/train.tfrecords : The system cannot find the path specified.
; No such process [Op:IteratorGetNext]

3D CNN Layer for processing videos

In [6]:
class Conv3D(keras.layers.Layer):
    """Factorised 3-D convolution: spatial convolution, then temporal.

    The first convolution uses a ``(1, k_h, k_w)`` kernel and therefore only
    mixes the two spatial axes; the second uses a ``(k_t, 1, 1)`` kernel and
    only mixes the temporal axis.

    Args:
        filters: number of output channels of both convolutions.
        kernel_size: 3-element sequence ``(k_t, k_h, k_w)``.
        padding: padding mode forwarded to both convolutions.
        name: name given to the inner ``Sequential`` model (not to this
            layer itself); used to load pre-trained weights.
    """

    def __init__(self, filters, kernel_size, padding, name=None):
        super().__init__()
        spatial_kernel = (1, kernel_size[1], kernel_size[2])
        temporal_kernel = (kernel_size[0], 1, 1)

        # Spatial decomposition
        spatial_conv = layers.Conv3D(filters=filters,
                                     kernel_size=spatial_kernel,
                                     padding=padding)
        # Temporal decomposition
        temporal_conv = layers.Conv3D(filters=filters,
                                      kernel_size=temporal_kernel,
                                      padding=padding)

        self.seq = keras.Sequential([spatial_conv, temporal_conv], name=name)

    def call(self, x):
        """Apply the spatial and then the temporal convolution to ``x``."""
        return self.seq(x)

In [7]:
class ResidualMain(keras.layers.Layer):
    """Main branch of a residual block.

    Stacks ``Conv3D -> LayerNormalization -> ReLU -> Conv3D ->
    LayerNormalization``, where ``Conv3D`` is the factorised layer defined in
    this file. No shortcut is added here: ``call`` returns only the output of
    this stack.

    Args:
        filters: number of output channels of both convolutions.
        kernel_size: kernel size forwarded to both ``Conv3D`` layers.
        name: name given to the inner ``Sequential`` model.
    """

    def __init__(self, filters, kernel_size, name):
        super().__init__()
        conv_kwargs = dict(filters=filters,
                           kernel_size=kernel_size,
                           padding='same')
        stages = [
            Conv3D(**conv_kwargs),
            layers.LayerNormalization(),
            layers.ReLU(),
            Conv3D(**conv_kwargs),
            layers.LayerNormalization(),
        ]
        self.seq = keras.Sequential(stages, name=name)

    def call(self, x):
        """Run ``x`` through the convolution/normalisation stack."""
        return self.seq(x)

In [22]:
# Video subnetwork: a stack of factorised 3-D convolutions that maps a clip of
# Num_Frames_Per_Video RGB frames (224x224) to a 1024-unit feature vector.
Num_Frames_Per_Video = 10
videoInput = layers.Input(shape=(Num_Frames_Per_Video, 224, 224, 3),
                          name='videoInput')

# Stem: factorised convolution, batch normalisation, ReLU.
x = Conv3D(filters=32, kernel_size=(3, 7, 7), padding='same',
           name='videolayer1')(videoInput)
x = layers.BatchNormalization()(x)
x = layers.ReLU()(x)

# Three ResidualMain stages, each followed by a ReLU.
# NOTE(review): the stage input is never added back to its output, so these
# are plain convolutional stages rather than true residual blocks.
for block_filters, block_name in ((64, 'videolayer2'),
                                  (32, 'videolayer3'),
                                  (64, 'videolayer4')):
    x = ResidualMain(block_filters, (3, 3, 3), name=block_name)(x)
    x = layers.ReLU()(x)

# Pool over time and space, then project to the embedding size.
x = layers.GlobalAveragePooling3D()(x)
x = layers.Flatten()(x)
videoFeatures = x = layers.Dense(1024, name='vidoelyaer5')(x)

In [16]:
# Face subnetwork: three convolution/max-pool stages followed by two dense
# layers, mapping a 224x224 RGB face image to a 1024-unit feature vector.
# NOTE(review): w_decay is not referenced anywhere in the visible code.
w_decay = 0.001

faceInput = keras.layers.Input(shape=(224, 224, 3), name='faceInput')

# Stage 1: 7x7 convolution (stride 2) + 2x2 max-pool.
flayer1 = keras.layers.Conv2D(
    filters=96, kernel_size=(7, 7), strides=(2, 2),
    padding="same", activation="relu", name='flayer1i',
)
fconv1 = flayer1(faceInput)
fout1 = keras.layers.MaxPool2D(
    pool_size=(2, 2), padding="valid", name="flayer1o",
)(fconv1)

# Stage 2: 5x5 convolution (stride 2) + 2x2 max-pool.
flayer2 = keras.layers.Conv2D(
    filters=256, kernel_size=(5, 5), strides=(2, 2),
    padding="same", activation="relu", name='flayer2i',
)
fconv2 = flayer2(fout1)
fout2 = keras.layers.MaxPool2D(
    pool_size=(2, 2), padding="valid", name='flayer2o',
)(fconv2)

# Stage 3: the SAME 3x3 convolution layer is applied three times in a row, so
# the three passes share one set of weights.
flayer3 = keras.layers.Conv2D(
    filters=256, kernel_size=(3, 3), strides=(1, 1),
    padding="same", activation="relu", name='flayer3i',
)
fconv3 = fout2
for fpass in range(3):
    fconv3 = flayer3(fconv3)
fout3 = keras.layers.MaxPool2D(
    pool_size=(2, 2), padding="valid", name='flayer3o',
)(fconv3)

# The 4096-unit dense layer runs BEFORE Flatten, so it acts on the channel
# axis at every spatial position; the result is flattened afterwards.
# NOTE(review): confirm this ordering is intended.
flayer4 = keras.layers.Dense(units=4096, activation='relu', name='flayer4i')
fdense4 = flayer4(fout3)
fout4 = keras.layers.Flatten(name='flayer40')(fdense4)

# Final projection to the face embedding.
flayer5 = keras.layers.Dense(units=1024, activation='relu', name='flayer5i')
faceFeatures = flayer5(fout4)
    

# Voice subnetwork: same layout as the face subnetwork, applied to a
# single-channel 20x130 input, producing a 1024-unit feature vector.
voiceInput = keras.layers.Input(shape=(20, 130, 1), name='voiceInput')

# Stage 1: 7x7 convolution (stride 2) + 2x2 max-pool.
vlayer1 = keras.layers.Conv2D(
    filters=96, kernel_size=(7, 7), strides=(2, 2),
    padding="same", activation="relu", name='vlayer1i',
)
vconv1 = vlayer1(voiceInput)
vout1 = keras.layers.MaxPool2D(
    pool_size=(2, 2), padding="valid", name='vlayer1o',
)(vconv1)

# Stage 2: 5x5 convolution (stride 2) + 2x2 max-pool.
vlayer2 = keras.layers.Conv2D(
    filters=256, kernel_size=(5, 5), strides=(2, 2),
    padding="same", activation="relu", name='vlayer2i',
)
vconv2 = vlayer2(vout1)
vout2 = keras.layers.MaxPool2D(
    pool_size=(2, 2), padding="valid", name='vlayer2o',
)(vconv2)

# Stage 3: the SAME 3x3 convolution layer is applied three times in a row
# (shared weights), then pooled with a (1, 2) window, i.e. along the second
# spatial axis only.
vlayer3 = keras.layers.Conv2D(
    filters=256, kernel_size=(3, 3), strides=(1, 1),
    padding="same", activation="relu", name='vlayer3i',
)
vconv3 = vout2
for vpass in range(3):
    vconv3 = vlayer3(vconv3)
vout3 = keras.layers.MaxPool2D(
    pool_size=(1, 2), padding="valid", name='vlayer3o',
)(vconv3)

# The 4096-unit dense layer runs BEFORE Flatten, so it acts on the channel
# axis at every spatial position; the result is flattened afterwards.
# NOTE(review): confirm this ordering is intended.
vlayer4 = keras.layers.Dense(units=4096, activation='relu', name='vlayer4i')
vdense4 = vlayer4(vout3)
vout4 = keras.layers.Flatten(name='valyer4o')(vdense4)

# Final projection to the voice embedding.
vlayer5 = keras.layers.Dense(units=1024, activation='relu', name='vlayer5i')
voiceFeatures = vlayer5(vout4)

In [17]:
# Fusion head: concatenate the face, video and voice feature vectors and
# classify the triple as "same person" vs "different person".
combinedInputs = keras.layers.concatenate([faceFeatures, videoFeatures, voiceFeatures])

clayer1 = keras.layers.Dense(1024, activation='relu', name='mlayer1')

clayer2 = keras.layers.Dense(512, activation='relu', name='mlayer2')

# The output layer must emit probabilities: the model is trained with a
# cross-entropy loss on 2-element one-hot labels, and that loss (with its
# default from_logits=False) expects values in [0, 1]. A ReLU here produces
# unbounded, non-normalised outputs and can get stuck at exactly 0, so
# softmax is used instead. Layer name and weight shapes are unchanged.
clayer3 = keras.layers.Dense(2, activation='softmax', name='mlayer3')

finalOutput = clayer3(clayer2(clayer1(combinedInputs)))

In [18]:
# Assemble the full three-input model. The dataset yields its inputs as a
# dict keyed by 'faceInput', 'voiceInput' and 'videoInput', which are the
# names of these Input layers.
model = keras.Model(
    inputs=[faceInput,videoInput, voiceInput],
    outputs = finalOutput
)

In [19]:
# Print the layer-by-layer table (output shapes, parameter counts, connections).
model.summary()

Model: "model"
__________________________________________________________________________________________________
 Layer (type)                   Output Shape         Param #     Connected to                     
 input_6 (InputLayer)           [(None, 10, 224, 22  0           []                               
                                4, 3)]                                                            
                                                                                                  
 faceInput (InputLayer)         [(None, 224, 224, 3  0           []                               
                                )]                                                                
                                                                                                  
 conv3d_75 (Conv3D)             (None, 10, 224, 224  7840        ['input_6[0][0]']                
                                , 32)                                                         

In [20]:
# Draw the model graph with tensor shapes. Requires the optional `pydot`
# package and a Graphviz installation; without them Keras only prints an
# installation hint.
keras.utils.plot_model(model, show_shapes=True)

You must install pydot (`pip install pydot`) and install graphviz (see instructions at https://graphviz.gitlab.io/download/) for plot_model to work.


In [21]:
# Configure training: Adam with its default settings, binary cross-entropy
# on the 2-element one-hot labels produced by `parser`, and accuracy as the
# reported metric. BinaryCrossentropy is used with its default
# from_logits=False, i.e. it expects the model to output probabilities.
model.compile(
    optimizer = keras.optimizers.Adam(),
    loss = keras.losses.BinaryCrossentropy(),
    metrics = ['accuracy']
)

In [None]:
# Train for 5 passes over the training pipeline; `history` keeps the
# per-epoch loss and accuracy values.
history = model.fit(trainDataset,epochs = 5)

Saving the learned model weights

In [26]:
# Persist only the learned weights (not the architecture or optimizer state)
# to an HDF5 file in the current working directory.
model.save_weights('dynamicModel1.h5')

---