# In [None]:
import torch
import torch.nn as nn
import numpy as np
from transformers import Wav2Vec2Model, Wav2Vec2Processor
from keras.models import load_model
from keras.layers import Input, Dense, Dropout, Flatten, concatenate
from keras.models import Model
from keras.utils import to_categorical

# Load the pretrained image model (Keras) from the local HDF5 file.
# NOTE(review): the file name suggests an emotion classifier, so its output is
# presumably class scores rather than intermediate features -- confirm before
# treating it as a feature extractor.
image_model = load_model("emotion_model.h5")
for layer in image_model.layers:
    # Mark each layer non-trainable so the image model's weights stay fixed
    # when it is reused inside the multimodal model.
    layer.trainable = False  # Freeze image model layers

# Load the pretrained audio model (PyTorch)
class Wav2Vec2WithAttention(nn.Module):
    """Utterance-level classifier on top of a wav2vec 2.0 encoder.

    The encoder's frame-level hidden states pass through one multi-head
    self-attention layer, are mean-pooled over the time axis, and are then
    projected to ``num_classes`` logits.

    Args:
        pretrained_model_name: Identifier handed to
            ``Wav2Vec2Model.from_pretrained``.
        num_classes: Size of the output logit vector.
    """

    def __init__(self, pretrained_model_name, num_classes):
        super().__init__()
        self.wav2vec2 = Wav2Vec2Model.from_pretrained(pretrained_model_name)
        hidden_size = self.wav2vec2.config.hidden_size
        self.attention = nn.MultiheadAttention(embed_dim=hidden_size, num_heads=4, batch_first=True)
        self.fc = nn.Linear(hidden_size, num_classes)

    def forward(self, input_values, attention_mask=None):
        """Compute class logits for a batch of inputs.

        Args:
            input_values: Tensor forwarded unchanged to the wav2vec 2.0
                encoder (assumed to be batched raw waveforms -- TODO confirm
                against the caller).
            attention_mask: Optional sample-level padding mask forwarded to
                the encoder. When given, it is also reduced to frame
                resolution so that padded frames are neither attended to nor
                included in the pooled average.

        Returns:
            Tensor of logits with one row per batch element and
            ``num_classes`` columns.
        """
        hidden_states = self.wav2vec2(input_values, attention_mask=attention_mask).last_hidden_state

        if attention_mask is None:
            # No padding information: attend over and average every frame.
            attention_output, _ = self.attention(hidden_states, hidden_states, hidden_states)
            pooled_output = attention_output.mean(dim=1)
        else:
            # The encoder downsamples in time, so the sample-level mask has to
            # be converted to one flag per output frame (True = real audio).
            frame_mask = self.wav2vec2._get_feature_vector_attention_mask(
                hidden_states.shape[1], attention_mask
            ).bool()
            # key_padding_mask uses the opposite convention: True = ignore.
            attention_output, _ = self.attention(
                hidden_states,
                hidden_states,
                hidden_states,
                key_padding_mask=~frame_mask,
            )
            keep = frame_mask.unsqueeze(-1)
            # Zero padded frames (this also clears NaNs from fully masked rows)
            # and divide by the number of real frames, not the padded length.
            summed = attention_output.masked_fill(~keep, 0.0).sum(dim=1)
            pooled_output = summed / keep.sum(dim=1).clamp(min=1)

        return self.fc(pooled_output)

# Build the audio classifier around the pretrained wav2vec 2.0 encoder.
# NOTE(review): only the encoder is loaded from the checkpoint; the attention
# and linear head are freshly initialised here and then frozen as well --
# confirm that trained head weights are loaded elsewhere before relying on it.
audio_model = Wav2Vec2WithAttention(pretrained_model_name="facebook/wav2vec2-base", num_classes=8)
for param in audio_model.parameters():
    param.requires_grad = False  # Freeze audio model layers
# Freezing gradients does not change the module's mode: without eval() the
# dropout layers stay active and a "frozen" model gives stochastic outputs.
audio_model.eval()

# Define the multimodal model
def create_multimodal_model():
    """Assemble the three-branch (image / audio / physiological) classifier.

    Each modality gets its own input and feature branch; the branch outputs
    are concatenated and passed through a dense layer with dropout before an
    8-way softmax.

    Returns:
        An uncompiled Keras ``Model`` whose inputs are named
        ``"image_input"``, ``"audio_input"`` and ``"physio_input"``.
    """
    # Image branch: 48x48 single-channel input run through the module-level
    # image model.
    image_in = Input(shape=(48, 48, 1), name="image_input")
    image_branch = image_model(image_in)

    # Audio branch: the (1, 16000) input is flattened and projected to 128
    # units. This stands in for features from the PyTorch audio model.
    audio_in = Input(shape=(1, 16000), name="audio_input")
    audio_projection = Dense(128, activation="relu")
    audio_branch = audio_projection(Flatten()(audio_in))

    # Physiological branch: 5 scalar features projected to 64 units.
    physio_in = Input(shape=(5,), name="physio_input")
    physio_projection = Dense(64, activation="relu")
    physio_branch = physio_projection(physio_in)

    # Fusion head over the concatenated branch outputs.
    fused = concatenate([image_branch, audio_branch, physio_branch])
    hidden = Dense(256, activation="relu")(fused)
    hidden = Dropout(0.3)(hidden)
    class_probs = Dense(8, activation="softmax")(hidden)  # 8 emotion classes assumed

    return Model(
        inputs=[image_in, audio_in, physio_in],
        outputs=class_probs,
    )

# Build the fused network once at module level.
multimodal_model = create_multimodal_model()

# Training configuration: Adam optimizer, categorical cross-entropy loss
# (targets must therefore be one-hot encoded), accuracy as the metric.
multimodal_model.compile(
    loss="categorical_crossentropy",
    optimizer="adam",
    metrics=["accuracy"],
)

# Training the multimodal model
def train_multimodal_model(num_samples=100, batch_size=16, epochs=10, validation_split=0.2):
    """Fit the module-level ``multimodal_model`` on randomly generated data.

    The inputs are uniform random arrays shaped like the model's three inputs
    and the labels are random class indices, so this only exercises the
    training pipeline; it does not produce a meaningful model.

    Args:
        num_samples: Number of dummy examples to generate.
        batch_size: Mini-batch size passed to ``fit``.
        epochs: Number of passes over the dummy data.
        validation_split: Fraction of the data ``fit`` holds out for
            validation.

    Returns:
        The ``History`` object returned by ``multimodal_model.fit``, so the
        caller can inspect per-epoch loss and accuracy.
    """
    num_classes = 8  # Must match the width of the model's softmax output.

    # Example dummy data, one array per named model input.
    image_data = np.random.rand(num_samples, 48, 48, 1)
    audio_data = np.random.rand(num_samples, 1, 16000)
    physio_data = np.random.rand(num_samples, 5)
    labels = np.random.randint(0, num_classes, size=(num_samples,))

    # The model is compiled with categorical cross-entropy, which expects
    # one-hot targets rather than integer class indices.
    labels = to_categorical(labels, num_classes=num_classes)

    return multimodal_model.fit(
        {"image_input": image_data, "audio_input": audio_data, "physio_input": physio_data},
        labels,
        batch_size=batch_size,
        epochs=epochs,
        validation_split=validation_split,
    )

# Train the model: this runs immediately whenever the cell/module is executed.
train_multimodal_model()
