In [1]:
import tensorflow as tf
from tensorflow.keras.applications import MobileNetV3Large
from tensorflow.keras.models import Sequential, Model
from tensorflow.keras.layers import Conv3D, Activation, MaxPooling3D, TimeDistributed, Flatten, Bidirectional, LSTM, Dropout, Dense, Input

# --- Configuration ----------------------------------------------------------
# NOTE: despite its name, frame_rate is used below as the number of frames in
# one clip (the length of the time axis), not as a frames-per-second value.
frame_rate = 120
input_height, input_width, num_channels = 60, 80, 1  # 60x80 grayscale frames
# Width of the final softmax layer. Hard-coded here; presumably it should match
# the vocabulary of a char_to_num lookup defined elsewhere -- TODO confirm.
vocab_size = 40

frame_shape = (input_height, input_width, num_channels)  # a single frame
clip_shape = (frame_rate,) + frame_shape                 # a whole video clip

# --- Per-frame feature extractor (MobileNetV3 Large) ------------------------
# weights=None builds the network with freshly initialised weights; nothing
# pretrained is loaded. include_top=False leaves out the classification head,
# so the backbone outputs a spatial feature map.
mobilenetv3 = MobileNetV3Large(
    weights=None,
    include_top=False,
    input_shape=frame_shape,
)

# Wrap the backbone in a functional Model fed by an explicit single-channel
# Input. The backbone above was already created for 1-channel frames, so this
# wrapper does not change the backbone itself.
input_layer = Input(shape=frame_shape)
mobilenetv3_output = mobilenetv3(input_layer)
feature_extractor = Model(inputs=input_layer, outputs=mobilenetv3_output)

# Optional: to freeze the feature extractor, set layer.trainable = False on
# every layer in feature_extractor.layers. Left disabled, so it stays trainable.

# --- Full video model -------------------------------------------------------
# The layers are listed in the order they are stacked:
#   1. the feature extractor, applied independently to every frame;
#   2. a per-frame flatten of the feature map into a single vector;
#   3. two bidirectional LSTM blocks (256 units per direction) that return the
#      full sequence, each followed by 50% dropout;
#   4. a softmax over vocab_size classes at every time step.
model = Sequential([
    TimeDistributed(feature_extractor, input_shape=clip_shape),
    TimeDistributed(Flatten()),
    Bidirectional(LSTM(256, return_sequences=True, kernel_initializer='Orthogonal')),
    Dropout(0.5),
    Bidirectional(LSTM(256, return_sequences=True, kernel_initializer='Orthogonal')),
    Dropout(0.5),
    Dense(vocab_size, activation='softmax', kernel_initializer='he_normal'),
])

# Print the layer table, including each layer's trainable flag.
model.summary(show_trainable=True)


2024-05-24 16:33:39.646404: I tensorflow/core/platform/cpu_feature_guard.cc:182] This TensorFlow binary is optimized to use available CPU instructions in performance-critical operations.
To enable the following instructions: AVX2 FMA, in other operations, rebuild TensorFlow with the appropriate compiler flags.


Model: "sequential"
____________________________________________________________________________
 Layer (type)                Output Shape              Param #   Trainable  
 time_distributed (TimeDist  (None, 120, 2, 3, 960)    2996064   Y          
 ributed)                                                                   
                                                                            
 time_distributed_1 (TimeDi  (None, 120, 5760)         0         Y          
 stributed)                                                                 
                                                                            
 bidirectional (Bidirection  (None, 120, 512)          1232281   Y          
 al)                                                   6                    
                                                                            
 dropout (Dropout)           (None, 120, 512)          0         Y          
                                                        

In [2]:
import tensorflow as tf
from tensorflow.keras.applications import VGG16
from tensorflow.keras.models import Sequential, Model
from tensorflow.keras.layers import TimeDistributed, Flatten, Bidirectional, LSTM, Dropout, Dense, Input

# --- Configuration ----------------------------------------------------------
# NOTE: despite its name, frame_rate is used below as the number of frames in
# one clip (the length of the time axis), not as a frames-per-second value.
frame_rate = 120
input_height, input_width, num_channels = 60, 80, 1  # 60x80 grayscale frames
# Width of the final softmax layer. Hard-coded here; presumably it should match
# the vocabulary of a char_to_num lookup defined elsewhere -- TODO confirm.
vocab_size = 40

frame_shape = (input_height, input_width, num_channels)  # a single frame
clip_shape = (frame_rate,) + frame_shape                 # a whole video clip

# --- Per-frame feature extractor (VGG16) ------------------------------------
# weights=None builds the network with freshly initialised weights; nothing
# pretrained is loaded. include_top=False leaves out the fully connected head,
# so the backbone outputs a spatial feature map.
vgg16 = VGG16(
    weights=None,
    include_top=False,
    input_shape=frame_shape,
)

# Wrap the backbone in a functional Model fed by an explicit single-channel
# Input. The backbone above was already created for 1-channel frames, so this
# wrapper does not change the backbone itself.
input_layer = Input(shape=frame_shape)
vgg16_output = vgg16(input_layer)
feature_extractor = Model(inputs=input_layer, outputs=vgg16_output)

# Optional: to freeze the feature extractor, set layer.trainable = False on
# every layer in feature_extractor.layers. Left disabled, so it stays trainable.

# --- Full video model -------------------------------------------------------
# The layers are listed in the order they are stacked:
#   1. the feature extractor, applied independently to every frame;
#   2. a per-frame flatten of the feature map into a single vector;
#   3. two bidirectional LSTM blocks (256 units per direction) that return the
#      full sequence, each followed by 50% dropout;
#   4. a softmax over vocab_size classes at every time step.
model = Sequential([
    TimeDistributed(feature_extractor, input_shape=clip_shape),
    TimeDistributed(Flatten()),
    Bidirectional(LSTM(256, return_sequences=True, kernel_initializer='Orthogonal')),
    Dropout(0.5),
    Bidirectional(LSTM(256, return_sequences=True, kernel_initializer='Orthogonal')),
    Dropout(0.5),
    Dense(vocab_size, activation='softmax', kernel_initializer='he_normal'),
])

# Print the layer table, including each layer's trainable flag.
model.summary(show_trainable=True)


Model: "sequential_1"
____________________________________________________________________________
 Layer (type)                Output Shape              Param #   Trainable  
 time_distributed_2 (TimeDi  (None, 120, 1, 2, 512)    1471353   Y          
 stributed)                                            6                    
                                                                            
 time_distributed_3 (TimeDi  (None, 120, 1024)         0         Y          
 stributed)                                                                 
                                                                            
 bidirectional_2 (Bidirecti  (None, 120, 512)          2623488   Y          
 onal)                                                                      
                                                                            
 dropout_2 (Dropout)         (None, 120, 512)          0         Y          
                                                      

In [3]:
import tensorflow as tf
from tensorflow.keras.applications import InceptionV3
from tensorflow.keras.models import Sequential, Model
from tensorflow.keras.layers import TimeDistributed, Flatten, Bidirectional, LSTM, Dropout, Dense, Input

# --- Configuration ----------------------------------------------------------
# NOTE: despite its name, frame_rate is used below as the number of frames in
# one clip (the length of the time axis), not as a frames-per-second value.
frame_rate = 120
# 75x75 grayscale frames; 75 is the minimum height/width InceptionV3 requires.
input_height, input_width, num_channels = 75, 75, 1
# Width of the final softmax layer. Hard-coded here; presumably it should match
# the vocabulary of a char_to_num lookup defined elsewhere -- TODO confirm.
vocab_size = 40

frame_shape = (input_height, input_width, num_channels)  # a single frame
clip_shape = (frame_rate,) + frame_shape                 # a whole video clip

# --- Per-frame feature extractor (InceptionV3) ------------------------------
# weights=None builds the network with freshly initialised weights; nothing
# pretrained is loaded. include_top=False leaves out the classification head,
# so the backbone outputs a spatial feature map.
inceptionv3 = InceptionV3(
    weights=None,
    include_top=False,
    input_shape=frame_shape,
)

# Wrap the backbone in a functional Model fed by an explicit single-channel
# Input. The backbone above was already created for 1-channel frames, so this
# wrapper does not change the backbone itself.
input_layer = Input(shape=frame_shape)
inceptionv3_output = inceptionv3(input_layer)
feature_extractor = Model(inputs=input_layer, outputs=inceptionv3_output)

# Optional: to freeze the feature extractor, set layer.trainable = False on
# every layer in feature_extractor.layers. Left disabled, so it stays trainable.

# --- Full video model -------------------------------------------------------
# The layers are listed in the order they are stacked:
#   1. the feature extractor, applied independently to every frame;
#   2. a per-frame flatten of the feature map into a single vector;
#   3. two bidirectional LSTM blocks (256 units per direction) that return the
#      full sequence, each followed by 50% dropout;
#   4. a softmax over vocab_size classes at every time step.
model = Sequential([
    TimeDistributed(feature_extractor, input_shape=clip_shape),
    TimeDistributed(Flatten()),
    Bidirectional(LSTM(256, return_sequences=True, kernel_initializer='Orthogonal')),
    Dropout(0.5),
    Bidirectional(LSTM(256, return_sequences=True, kernel_initializer='Orthogonal')),
    Dropout(0.5),
    Dense(vocab_size, activation='softmax', kernel_initializer='he_normal'),
])

# Print the layer table, including each layer's trainable flag.
model.summary(show_trainable=True)


Model: "sequential_2"
____________________________________________________________________________
 Layer (type)                Output Shape              Param #   Trainable  
 time_distributed_4 (TimeDi  (None, 120, 1, 1, 2048)   2180220   Y          
 stributed)                                            8                    
                                                                            
 time_distributed_5 (TimeDi  (None, 120, 2048)         0         Y          
 stributed)                                                                 
                                                                            
 bidirectional_4 (Bidirecti  (None, 120, 512)          4720640   Y          
 onal)                                                                      
                                                                            
 dropout_4 (Dropout)         (None, 120, 512)          0         Y          
                                                      