In [None]:
%pip install tensorflow
import tensorflow as tf
import numpy as np
import cv2
import librosa
from tqdm import tqdm

Step 1: Loading Metadata

The first step is to load the metadata (labels, file paths, etc.) from a JSON file.

In [None]:
import json

def load_metadata(metadata_path):
    """Load the dataset metadata (labels, file paths, etc.) from a JSON file.

    Args:
        metadata_path: Path to the JSON metadata file.

    Returns:
        The parsed JSON content. If the file does not exist or does not
        contain valid JSON, the error is printed and an empty dict is
        returned instead of raising.
    """
    try:
        # JSON is UTF-8 by specification; do not depend on the platform's
        # default text encoding (which is not UTF-8 everywhere).
        with open(metadata_path, 'r', encoding='utf-8') as f:
            return json.load(f)
    except (FileNotFoundError, json.JSONDecodeError) as e:
        print(f"Error loading metadata: {e}")
        return {}

# Placeholder location: replace with the actual path of the metadata JSON file.
metadata_path = "/path/to/metadata.json"
# load_metadata prints the error and returns {} when the file is missing or is
# not valid JSON, so an empty dict here means loading failed.
metadata = load_metadata(metadata_path)
print(metadata)

Step 2: Video Loading and Processing

This function handles video loading, resizing, normalization, and padding.

In [None]:
import os
import numpy as np
import cv2

def load_video(video_path, max_frames=100, target_size=(224, 224)):
    """Read a video into a fixed-length array of resized, normalized frames.

    Args:
        video_path: Path of the video file, passed to ``cv2.VideoCapture``.
        max_frames: Number of frames in the returned clip. Longer videos are
            truncated; shorter ones are padded at the end with all-zero
            (black) frames.
        target_size: Output frame size as ``(height, width)``.

    Returns:
        A float array of shape ``(max_frames, height, width, 3)`` with pixel
        values scaled to [0, 1]. Channels are kept in the order OpenCV decodes
        them (BGR by OpenCV convention); no colour conversion is applied.
        If the file cannot be opened, an error is printed and an all-zero
        array of the same shape is returned.
    """
    height, width = target_size
    # Start from black frames so any slot that is never filled is the padding.
    clip = np.zeros((max_frames, height, width, 3))

    cap = cv2.VideoCapture(video_path)
    try:
        if not cap.isOpened():
            print(f"Error opening video file {video_path}")
            return clip

        frame_count = 0
        while frame_count < max_frames:
            ret, frame = cap.read()
            if not ret:
                break
            # cv2.resize expects dsize as (width, height), the reverse of the
            # (height, width) layout used for the array shape.
            frame = cv2.resize(frame, (width, height))
            clip[frame_count] = frame / 255.0  # Normalize frame values to [0, 1]
            frame_count += 1
    finally:
        # Release the capture handle even if decoding or resizing raises.
        cap.release()

    return clip

Step 3: Mel-Spectrogram Extraction (Audio)

We extract Mel-spectrogram features from audio for further processing.

In [None]:
import librosa

def extract_mel_spectrogram(audio_input, sr=16000, n_mels=128):
    """Compute a log-scaled (dB) Mel-spectrogram from an audio signal.

    Args:
        audio_input: Audio time series passed to librosa as ``y``.
        sr: Sampling rate of ``audio_input`` in Hz.
        n_mels: Number of Mel bands to generate.

    Returns:
        The Mel power spectrogram converted to decibels, with the maximum
        power of the spectrogram as the 0 dB reference.
    """
    # The signal must be passed by keyword: recent librosa releases made the
    # arguments of melspectrogram keyword-only, so a positional call raises.
    mel_spec = librosa.feature.melspectrogram(y=audio_input, sr=sr, n_mels=n_mels)
    log_mel_spec = librosa.power_to_db(mel_spec, ref=np.max)
    return log_mel_spec

Step 4: Audio Feature Extraction Models

1. CRNN (Convolutional Recurrent Neural Network)

CRNN combines CNN and RNN layers to capture both spatial and temporal features in the audio.

In [None]:
from tensorflow.keras.layers import Conv1D, GRU, TimeDistributed, Flatten

def create_crnn_model(input_shape=(100, 128)):
    """Build the CRNN audio feature extractor: Conv1D -> GRU -> Flatten.

    Args:
        input_shape: Per-sample input shape ``(time_steps, features)``.

    Returns:
        A Keras ``Model`` that maps a batch of sequences to the flattened
        per-step GRU outputs.
    """
    inputs = layers.Input(shape=input_shape)
    x = Conv1D(128, kernel_size=3, activation='relu')(inputs)
    # The GRU already iterates over the time axis of the 3-D tensor. Wrapping
    # it in TimeDistributed would hand it 2-D per-step slices, which a
    # recurrent layer rejects when the graph is built.
    x = GRU(64, return_sequences=True)(x)
    x = Flatten()(x)
    model = models.Model(inputs, x)
    return model

def extract_crnn_features(audio_input):
    """Return CRNN predictions for ``audio_input``.

    A new model is built with ``create_crnn_model()`` (default input shape)
    on every call; no weights are loaded here.
    """
    extractor = create_crnn_model()
    features = extractor.predict(audio_input)
    return features

2. WaveNet

WaveNet processes raw audio waveforms using dilated convolutions.

In [None]:
from tensorflow.keras.layers import Conv1D, Flatten

def create_wavenet_model(input_shape=(16000, 1)):  # Assuming raw audio waveform input
    """Build a small WaveNet-style extractor from two dilated causal Conv1D layers.

    Args:
        input_shape: Per-sample input shape ``(samples, channels)``.

    Returns:
        A Keras ``Model`` whose output is a 128-unit ReLU embedding computed
        from the flattened convolution output.
    """
    waveform = layers.Input(shape=input_shape)

    # (filters, dilation rate) of each causal convolution, in order.
    hidden = waveform
    for n_filters, dilation in ((64, 2), (128, 4)):
        hidden = Conv1D(
            n_filters,
            kernel_size=2,
            dilation_rate=dilation,
            padding='causal',
            activation='relu',
        )(hidden)

    flat = Flatten()(hidden)
    embedding = layers.Dense(128, activation='relu')(flat)
    return models.Model(waveform, embedding)

def extract_wavenet_features(audio_input):
    """Return WaveNet-style model predictions for ``audio_input``.

    A new model is built with ``create_wavenet_model()`` (default input
    shape) on every call; no weights are loaded here.
    """
    extractor = create_wavenet_model()
    features = extractor.predict(audio_input)
    return features

4. Custom CNN for Audio

A simple CNN model for audio feature extraction.

In [None]:
def create_audio_cnn_model(input_shape=(128, 128, 1)):  # Assuming Mel-spectrogram input
    """Build a two-stage 2-D CNN feature extractor for spectrogram-like input.

    Args:
        input_shape: Per-sample input shape ``(height, width, channels)``.

    Returns:
        A Keras ``Model`` whose output is the flattened feature map after two
        Conv2D + MaxPooling2D stages (32 then 64 filters).
    """
    spectrogram = layers.Input(shape=input_shape)

    feature_map = spectrogram
    for n_filters in (32, 64):
        feature_map = layers.Conv2D(n_filters, (3, 3), activation='relu')(feature_map)
        feature_map = layers.MaxPooling2D(pool_size=(2, 2))(feature_map)

    flat = Flatten()(feature_map)
    return models.Model(spectrogram, flat)

def extract_audio_cnn_features(audio_input):
    """Return audio-CNN predictions for ``audio_input``.

    A new model is built with ``create_audio_cnn_model()`` (default input
    shape) on every call; no weights are loaded here.
    """
    extractor = create_audio_cnn_model()
    features = extractor.predict(audio_input)
    return features

Step 5: Video Feature Extraction Models

1. C3D (3D Convolutional Neural Network)

The C3D model captures both spatial and temporal information using 3D convolutions.

In [None]:
from tensorflow.keras.layers import Conv3D, MaxPooling3D, Flatten

def create_c3d_model(input_shape=(16, 112, 112, 3)):
    """Build a two-stage C3D-style extractor using 3-D convolutions.

    Args:
        input_shape: Per-sample clip shape ``(frames, height, width, channels)``.

    Returns:
        A Keras ``Model`` whose output is the flattened feature volume after
        two Conv3D + MaxPooling3D stages.
    """
    clip = layers.Input(shape=input_shape)

    # (filters, pool size) per stage; the first stage pools only spatially.
    stages = (
        (64, (1, 2, 2)),
        (128, (2, 2, 2)),
    )
    volume = clip
    for n_filters, pool in stages:
        volume = Conv3D(n_filters, kernel_size=(3, 3, 3), activation='relu')(volume)
        volume = MaxPooling3D(pool_size=pool)(volume)

    flat = Flatten()(volume)
    return models.Model(clip, flat)

def extract_c3d_features(video_input):
    """Return C3D predictions for ``video_input``.

    A new model is built with ``create_c3d_model()`` (default input shape)
    on every call; no weights are loaded here.
    """
    extractor = create_c3d_model()
    features = extractor.predict(video_input)
    return features

2. CNN for Video

A simple CNN for extracting features from video frames.

In [None]:
def create_video_cnn_model(input_shape=(100, 224, 224, 3)):  # Assuming frames as input
    """Build a two-stage 3-D CNN feature extractor for a stack of video frames.

    Args:
        input_shape: Per-sample clip shape ``(frames, height, width, channels)``.

    Returns:
        A Keras ``Model`` whose output is the flattened feature volume after
        two Conv3D + MaxPooling3D stages (64 then 128 filters).
    """
    clip = layers.Input(shape=input_shape)

    volume = clip
    for n_filters in (64, 128):
        volume = layers.Conv3D(n_filters, (3, 3, 3), activation='relu')(volume)
        volume = layers.MaxPooling3D(pool_size=(2, 2, 2))(volume)

    flat = Flatten()(volume)
    return models.Model(clip, flat)

def extract_video_cnn_features(video_input):
    """Return video-CNN predictions for ``video_input``.

    A new model is built with ``create_video_cnn_model()`` (default input
    shape) on every call; no weights are loaded here.
    """
    extractor = create_video_cnn_model()
    features = extractor.predict(video_input)
    return features

3. TCN for Video

A TCN model for sequential video frames.

In [None]:
def create_video_tcn_model(input_shape=(100, 128)):
    """Build a small temporal convolutional network over a feature sequence.

    Args:
        input_shape: Per-sample input shape ``(time_steps, features)``.

    Returns:
        A Keras ``Model`` whose output is the flattened result of two causal
        Conv1D + batch-normalization stages (128 then 256 filters).
    """
    inputs = layers.Input(shape=input_shape)
    x = Conv1D(128, kernel_size=3, padding='causal', activation='relu')(inputs)
    # Referenced through `layers`: a bare `BatchNormalization` name is never
    # imported in this notebook and would raise NameError when called.
    x = layers.BatchNormalization()(x)
    x = Conv1D(256, kernel_size=3, padding='causal', activation='relu')(x)
    x = layers.BatchNormalization()(x)
    x = Flatten()(x)
    model = models.Model(inputs, x)
    return model

def extract_video_tcn_features(video_input):
    """Return TCN predictions for ``video_input``.

    A new model is built with ``create_video_tcn_model()`` (default input
    shape) on every call; no weights are loaded here.
    """
    extractor = create_video_tcn_model()
    features = extractor.predict(video_input)
    return features

4. Custom RNN for Video

A recurrent neural network for capturing temporal dependencies in video frames.

In [None]:
def create_video_rnn_model(input_shape=(100, 128)):
    """Build a recurrent extractor over a feature sequence: Conv1D -> LSTM.

    Args:
        input_shape: Per-sample input shape ``(time_steps, features)``.

    Returns:
        A Keras ``Model`` whose output is the final 64-unit LSTM state for
        each sequence.
    """
    inputs = layers.Input(shape=input_shape)
    # Conv1D is applied along the time axis directly. Wrapping it in
    # TimeDistributed would pass it 2-D per-step slices of this
    # (time, features) input, which Conv1D rejects when the graph is built.
    # The per-step Flatten was a no-op on those slices and is dropped.
    x = Conv1D(128, 3, activation='relu')(inputs)
    x = layers.LSTM(64, return_sequences=False)(x)
    model = models.Model(inputs, x)
    return model

def extract_video_rnn_features(video_input):
    """Return RNN predictions for ``video_input``.

    A new model is built with ``create_video_rnn_model()`` (default input
    shape) on every call; no weights are loaded here.
    """
    extractor = create_video_rnn_model()
    features = extractor.predict(video_input)
    return features

Step 6: Feature Fusion Using Attention

Attention blocks assign importance to different features before they are fused.

In [None]:
from tensorflow.keras.layers import Dense, Concatenate

def attention_block(inputs):
    """Re-weight features with a learned softmax gate of the same width.

    A Dense softmax layer produces one weight per input feature, and the
    input is multiplied element-wise by those weights.

    Args:
        inputs: Feature tensor whose last dimension is statically known.

    Returns:
        A tensor with the same shape as ``inputs``.

    Raises:
        ValueError: If the last dimension of ``inputs`` is not known.
    """
    feature_dim = inputs.shape[-1]
    if feature_dim is None:
        raise ValueError("attention_block requires inputs with a known last dimension")
    # The gate must match the input width; a fixed 256 units only multiplies
    # correctly when the input happens to have exactly 256 features.
    attention = Dense(feature_dim, activation='softmax')(inputs)
    return inputs * attention

def fuse_video_features(c3d_features, cnn_features, tcn_features, rnn_features):
    """Concatenate the four video feature tensors and pass them through ``attention_block``."""
    video_branches = [c3d_features, cnn_features, tcn_features, rnn_features]
    merged_video = Concatenate()(video_branches)
    weighted_video = attention_block(merged_video)
    return weighted_video

def fuse_audio_features(crnn_features, wavenet_features, tcn_features, cnn_features):
    """Concatenate the four audio feature tensors and pass them through ``attention_block``."""
    audio_branches = [crnn_features, wavenet_features, tcn_features, cnn_features]
    merged_audio = Concatenate()(audio_branches)
    weighted_audio = attention_block(merged_audio)
    return weighted_audio

Step 7: Final Multimodal Model

Combine the feature extractors and fuse their features into a unified model.

In [None]:
from tensorflow.keras import layers, models

def create_final_model(video_shape, audio_shape, num_classes=10):
    """Build the multimodal classifier that fuses four video and four audio branches.

    Each feature extractor is created with an input shape derived from
    ``video_shape`` / ``audio_shape`` and is called as a layer on the symbolic
    inputs, so every branch is part of one trainable graph. The ``extract_*``
    helpers are not used here because they call ``Model.predict``, which runs
    inference on concrete arrays and cannot be applied to symbolic ``Input``
    tensors.

    Args:
        video_shape: Per-sample clip shape ``(frames, height, width, channels)``.
        audio_shape: Per-sample spectrogram shape, assumed to be
            ``(n_mels, time_steps, channels)`` - TODO confirm the axis order
            against the data pipeline.
        num_classes: Number of output classes of the final softmax layer.

    Returns:
        An uncompiled Keras ``Model`` taking ``[video, audio]`` inputs and
        returning class probabilities.

    NOTE(review): the 3-D/2-D CNN extractors end in ``Flatten``, so their
    feature width grows with the input size; confirm that the chosen shapes
    give a model that fits in memory.
    """
    # Inputs for video and audio
    video_input = layers.Input(shape=video_shape)
    audio_input = layers.Input(shape=audio_shape)

    # --- Video branches -----------------------------------------------------
    # The 3-D convolutional extractors consume the raw clip.
    c3d_features = create_c3d_model(input_shape=video_shape)(video_input)
    video_cnn_features = create_video_cnn_model(input_shape=video_shape)(video_input)

    # The TCN/RNN builders expect (time, features) sequences, so each frame is
    # first reduced to its per-channel spatial mean: (frames, channels).
    video_sequence = layers.TimeDistributed(layers.GlobalAveragePooling2D())(video_input)
    video_seq_shape = (video_shape[0], video_shape[-1])
    video_tcn_features = create_video_tcn_model(input_shape=video_seq_shape)(video_sequence)
    video_rnn_features = create_video_rnn_model(input_shape=video_seq_shape)(video_sequence)

    # --- Audio branches -----------------------------------------------------
    # The 2-D CNN consumes the spectrogram as an image.
    audio_cnn_features = create_audio_cnn_model(input_shape=audio_shape)(audio_input)

    # The sequence models expect (time, features): move the time axis first
    # and fold the mel/channel axes into the feature axis.
    audio_seq_shape = (audio_shape[1], audio_shape[0] * audio_shape[2])
    audio_sequence = layers.Permute((2, 1, 3))(audio_input)
    audio_sequence = layers.Reshape(audio_seq_shape)(audio_sequence)

    crnn_features = create_crnn_model(input_shape=audio_seq_shape)(audio_sequence)
    # Only a spectrogram is available here (no raw waveform), so the dilated
    # causal convolutions run over spectrogram frames instead.
    wavenet_features = create_wavenet_model(input_shape=audio_seq_shape)(audio_sequence)
    # No audio-specific TCN builder (or `extract_tcn_features`) is defined in
    # this notebook; the TCN builder is a generic causal Conv1D stack over a
    # (time, features) sequence, so it is reused for the audio sequence.
    audio_tcn_features = create_video_tcn_model(input_shape=audio_seq_shape)(audio_sequence)

    # Fuse features with attention
    fused_video_features = fuse_video_features(
        c3d_features, video_cnn_features, video_tcn_features, video_rnn_features
    )
    fused_audio_features = fuse_audio_features(
        crnn_features, wavenet_features, audio_tcn_features, audio_cnn_features
    )

    # Combine fused audio and video features
    combined_features = Concatenate()([fused_video_features, fused_audio_features])

    # Final classification layers
    x = layers.Dense(256, activation='relu')(combined_features)
    x = layers.Dropout(0.5)(x)
    output = layers.Dense(num_classes, activation='softmax')(x)

    # Create the final model
    model = models.Model(inputs=[video_input, audio_input], outputs=output)
    return model

# Example usage: build the multimodal model for the shapes below and compile it
# for multi-class classification.
# NOTE(review): categorical_crossentropy presumably expects one-hot encoded
# labels - confirm against the label format in the metadata.
video_shape = (100, 224, 224, 3)  # Shape of video input
audio_shape = (128, 128, 1)       # Shape of audio (Mel-spectrogram)
num_classes = 10                  # Number of output classes

model = create_final_model(video_shape, audio_shape, num_classes)
model.compile(optimizer='adam', loss='categorical_crossentropy', metrics=['accuracy'])