In [1]:
!git clone https://github.com/pliang279/MultiBench.git

Cloning into 'MultiBench'...
remote: Enumerating objects: 6943, done.[K
remote: Counting objects: 100% (154/154), done.[K
remote: Compressing objects: 100% (94/94), done.[K
remote: Total 6943 (delta 72), reused 121 (delta 60), pack-reused 6789[K
Receiving objects: 100% (6943/6943), 51.07 MiB | 17.78 MiB/s, done.
Resolving deltas: 100% (4258/4258), done.


In [2]:
%cd MultiBench

/content/MultiBench


In [3]:
!mkdir data
!pip install gdown && gdown https://drive.google.com/u/0/uc?id=1szKIqO0t3Be_W91xvf6aYmsVVUa7wDHU

Downloading...
From (original): https://drive.google.com/u/0/uc?id=1szKIqO0t3Be_W91xvf6aYmsVVUa7wDHU
From (redirected): https://drive.google.com/uc?id=1szKIqO0t3Be_W91xvf6aYmsVVUa7wDHU&confirm=t&uuid=2ba79ccf-8145-440f-80a9-d01c403c9fab
To: /content/MultiBench/mosi_raw.pkl
100% 357M/357M [00:02<00:00, 144MB/s]


In [4]:
import torch
import torch.nn as nn
import torch.nn.functional as F
from torch.autograd import Variable
from sklearn.metrics import accuracy_score
import numpy as np

In [5]:
from datasets.affect.get_data import get_dataloader

# Load the dataset
traindata, validdata, testdata = get_dataloader(
    'mosi_raw.pkl', robust_test=False, max_pad=True, data_type='mosi', max_seq_len=50)

In [6]:
class SubNet(nn.Module):
    '''
    The subnetwork that is used in LMF for video and audio in the pre-fusion stage
    '''

    def __init__(self, in_size, hidden_size, dropout):
        super(SubNet, self).__init__()
        self.norm = nn.BatchNorm1d(in_size)
        self.drop = nn.Dropout(p=dropout)
        self.linear_1 = nn.Linear(in_size, hidden_size)
        self.linear_2 = nn.Linear(hidden_size, hidden_size)
        self.linear_3 = nn.Linear(hidden_size, hidden_size)

    def forward(self, x):
        normed = self.norm(x)
        dropped = self.drop(normed)
        y_1 = F.relu(self.linear_1(dropped))
        y_2 = F.relu(self.linear_2(y_1))
        y_3 = F.relu(self.linear_3(y_2))

        return y_3


In [7]:
class TextSubNet(nn.Module):
    '''
    The LSTM-based subnetwork that is used in LMF for text
    '''

    def __init__(self, in_size, hidden_size, out_size, num_layers=1, dropout=0.2, bidirectional=False):
        super(TextSubNet, self).__init__()
        self.rnn = nn.LSTM(in_size, hidden_size, num_layers=num_layers, dropout=dropout, bidirectional=bidirectional, batch_first=True)
        self.dropout = nn.Dropout(dropout)
        self.linear_1 = nn.Linear(hidden_size, out_size)

    def forward(self, x):
        _, final_states = self.rnn(x)
        h = self.dropout(final_states[0].squeeze())
        y_1 = self.linear_1(h)
        return y_1


In [28]:
import torch
import torch.nn as nn

class LMF(nn.Module):
    def __init__(self, input_dims, hidden_dims, text_out, dropouts, output_dim, rank, use_softmax):
        super(LMF, self).__init__()

        # Unpack input_dims
        audio_dim, video_dim, text_dim = input_dims

        # Define audio network
        self.audio_net = nn.Sequential(
            nn.Linear(audio_dim, hidden_dims[0]),
            nn.BatchNorm1d(hidden_dims[0]),
            nn.ReLU(),
            nn.Dropout(dropouts[0]),
            nn.Linear(hidden_dims[0], hidden_dims[1]),
            nn.BatchNorm1d(hidden_dims[1]),
            nn.ReLU(),
            nn.Dropout(dropouts[1]),
            nn.Linear(hidden_dims[1], hidden_dims[2]),
            nn.BatchNorm1d(hidden_dims[2]),
            nn.ReLU(),
            nn.Dropout(dropouts[2])
        )

        # Define video network
        self.video_net = nn.Sequential(
            nn.Linear(video_dim, hidden_dims[0]),
            nn.BatchNorm1d(hidden_dims[0]),
            nn.ReLU(),
            nn.Dropout(dropouts[0]),
            nn.Linear(hidden_dims[0], hidden_dims[1]),
            nn.BatchNorm1d(hidden_dims[1]),
            nn.ReLU(),
            nn.Dropout(dropouts[1]),
            nn.Linear(hidden_dims[1], hidden_dims[2]),
            nn.BatchNorm1d(hidden_dims[2]),
            nn.ReLU(),
            nn.Dropout(dropouts[2])
        )

        # Define text network
        self.text_net = nn.Sequential(
            nn.Linear(text_dim, text_out),
            nn.BatchNorm1d(text_out),
            nn.ReLU(),
            nn.Dropout(dropouts[3])
        )

        # Fusion layer
        self.fusion_layer = nn.Linear(text_out, rank)

        # Output layer
        self.output_layer = nn.Linear(rank, output_dim)

        self.use_softmax = use_softmax

    def forward(self, audio_x, video_x, text_x):
        # Pass inputs through respective networks
        audio_out = self.audio_net(audio_x)
        video_out = self.video_net(video_x)
        text_out = self.text_net(text_x)

                # Concatenate audio and video features
        av_concat = torch.cat((audio_out, video_out), dim=1)

        # Concatenate audio-video features with text features
        avt_concat = torch.cat((av_concat, text_out), dim=1)

        # Fusion layer
        fusion_out = self.fusion_layer(avt_concat)

        # Output layer
        output = self.output_layer(fusion_out)

        if self.use_softmax:
          output = torch.softmax(output, dim=0)
          output = torch.argmax(output, dim=-1)

        print (output)

        return output


In [29]:
# Create the low-rank multimodal fusion model
input_dims = (35, 74, 300)
hidden_dims = (50,50,50)
text_out = 50
dropouts = (0.1, 0.1, 0.1, 0.1)
output_dim = 5
rank = 5
use_softmax = True
final_model = LMF(input_dims, hidden_dims, text_out, dropouts, output_dim, rank, use_softmax)


In [30]:
# Loss function: CrossEntropyLoss for classification tasks
Loss = torch.nn.CrossEntropyLoss()

# Optimizer: Adam optimizer for training the model parameters
optimizer = torch.optim.Adam(list(final_model.parameters())[2:], lr=0.0005, weight_decay=0.01)

# Number of epochs for training
num_epochs = 100

In [34]:
for epoch in range(num_epochs):

    print("EPOCH : ",epoch+1)
    # Training
    total_train_loss=0.0
    num_sequences=0
    final_model.train()  # Set the model to training mode
    for batch in traindata:
        # targets=targets.unsqueeze(1).repeat(1, 50, 1)
        optimizer.zero_grad()  # Zero the gradients
        outputs = final_model(*batch[:-1])
        loss = Loss(outputs, batch[-1].long())
        total_train_loss+=loss
        loss.backward()  # Backpropagation
        optimizer.step()  # Update weights
        num_sequences+=1

    average_train_loss = total_train_loss / num_sequences
    print("-------------Training----------------")
    print(f'Epoch [{epoch + 1}/{num_epochs}], Loss: {average_train_loss:.4f}')

    # Validation
    final_model.eval()  # Set the model to evaluation mode
    total_val_loss = 0.0
    num_sequences=0
    val_all_predictions = []
    val_all_targets = []
    with torch.no_grad():
        best_val_loss = np.inf
        patience=3
        current_patience = patience

        for batch in validdata:
            val_outputs = final_model(batch[:-1])
            val_targets = batch[-1]
            val_targets = torch.clamp(val_targets, min=-2, max=2)
            val_targets += 2
            val_targets = torch.round(val_targets)
            val_loss = Loss(val_outputs, val_targets)
            total_val_loss += val_loss.item()

            # Calculate R2 score
            val_targets_np = val_targets.cpu().numpy()
            val_outputs_np = val_outputs.cpu().numpy()


            val_all_predictions.extend(val_outputs_np)
            val_all_targets.extend(val_targets_np)
            num_sequences+=1

    average_val_loss = total_val_loss / num_sequences
    accuracy = accuracy_score(val_all_targets, val_all_predictions)

    print("--------------Validation----------")
    print(f'Epoch [{epoch + 1}/{num_epochs}], Validation Loss: {average_val_loss:.4f}, Accuracy Score: {accuracy:.4f}')

    # if average_val_loss < best_val_loss:
    #     best_val_loss=average_val_loss
    #     current_patience=patience
    # else:
    #     current_patience-=1
    #     if current_patience==0:
    #       print("Model performance degarding , Early stopping!!")
    #       break



    print("\n \n")


EPOCH :  1


  self.pid = os.fork()


tensor([[0, 1, 0,  ..., 1, 4, 3],
        [1, 3, 2,  ..., 2, 3, 2],
        [1, 1, 0,  ..., 3, 1, 0],
        ...,
        [2, 2, 2,  ..., 3, 0, 3],
        [1, 1, 4,  ..., 2, 3, 0],
        [0, 4, 4,  ..., 2, 4, 2]])


RuntimeError: "log_softmax_lastdim_kernel_impl" not implemented for 'Long'

In [36]:
batch[-1]

tensor([[ 0.0000],
        [-0.5000],
        [-1.0000],
        [-2.0000],
        [-0.3333],
        [ 2.2000],
        [-2.2000],
        [-1.0000],
        [-0.4000],
        [ 2.6000],
        [-0.2000],
        [ 3.0000],
        [ 0.6000],
        [-1.2000],
        [-0.6000],
        [-2.0000],
        [ 1.6000],
        [ 2.0000],
        [-2.2000],
        [-0.8000],
        [ 2.6000],
        [ 1.4000],
        [ 2.0000],
        [-1.4000],
        [ 0.2000],
        [ 1.8000],
        [-1.2000],
        [-1.6000],
        [ 0.2500],
        [-1.2000],
        [-1.4000],
        [ 1.8000]])

In [86]:
torch.argmax(outputs, dim=-1)

tensor([[0, 0, 0,  ..., 3, 3, 3],
        [3, 0, 0,  ..., 3, 3, 0],
        [0, 3, 0,  ..., 3, 3, 3],
        ...,
        [3, 3, 0,  ..., 3, 3, 3],
        [0, 3, 3,  ..., 3, 3, 3],
        [3, 0, 3,  ..., 3, 3, 3]])

In [None]:
def calculate_accuracy(model, dataloader):
    """
    Calculate accuracy for the low-rank multimodal fusion model.

    Args:
    - model: The low-rank multimodal fusion model.
    - dataloader: Dataloader containing the validation data.

    Returns:
    - accuracy: Accuracy of the model on the validation set.
    """
    model.eval()
    all_predictions = []
    all_labels = []

    with torch.no_grad():
        for batch in dataloader:
            audio_data, video_data, text_data, labels = batch
            predictions = model(audio_data, video_data, text_data)
            predictions = torch.argmax(predictions, dim=1)
            all_predictions.extend(predictions.cpu().numpy())
            all_labels.extend(labels.cpu().numpy())

    accuracy = accuracy_score(all_labels, all_predictions)
    return accuracy


In [None]:
import matplotlib.pyplot as plt

def display_results(train_losses, val_losses, accuracies, fusion_params):
    """
    Display training and validation results along with fusion parameters.

    Args:
    - train_losses (list): List of training losses for each epoch.
    - val_losses (list): List of validation losses for each epoch.
    - accuracies (list): List of accuracies for each epoch.
    - fusion_params (dict): Dictionary containing fusion parameters.

    Returns:
    - None
    """

    # Plot training and validation losses
    plt.figure(figsize=(12, 5))
    plt.subplot(1, 2, 1)
    plt.plot(train_losses, label='Training Loss', color='blue')
    plt.plot(val_losses, label='Validation Loss', color='orange')
    plt.xlabel('Epoch')
    plt.ylabel('Loss')
    plt.title('Training and Validation Losses')
    plt.legend()

    # Plot accuracies
    plt.subplot(1, 2, 2)
    plt.plot(accuracies, label='Accuracy', color='green')
    plt.xlabel('Epoch')
    plt.ylabel('Accuracy')
    plt.title('Accuracy')
    plt.legend()

    # Display fusion parameters
    print("Fusion Parameters:")
    for key, value in fusion_params.items():
        print(f"{key}: {value}")

    plt.show()

display_results(train_losses, val_losses, accuracies, fusion_params)