In [5]:
import os
import math
import json
import numpy as np
import torch
import torchaudio
import torchaudio.transforms as T
from torch.utils.data import Dataset
from pathlib import Path
from collections import defaultdict

import argparse

In [6]:

# ========== Dataset Class ==========
class SummaryDataset(Dataset):
    def __init__(self, list_file, wav_dir, label_dir, max_seq_len=250, resample_sr=2000):
        self.video_names = self.read_video_list(list_file)
        self.wav_dir = Path(wav_dir)
        self.label_dir = Path(label_dir)
        self.max_seq_len = max_seq_len
        self.resample_sr = resample_sr

        self.label_files = self.prefetch_label_files()
        self.data_size, self.index_to_chunk, self.labels = self.prefetch_and_index()

    def read_video_list(self, list_path):
        with open(list_path, 'r') as f:
            return [line.strip() for line in f]

    def prefetch_label_files(self):
        name_set = set(self.video_names)
        label_files = defaultdict(list)

        for label_file in self.label_dir.glob("**/*.json"):
            file_name = label_file.stem
            annotator_id_index = len(file_name) - file_name[::-1].find("_") - 1
            video_name = file_name[:annotator_id_index]

            if video_name in name_set:
                label_files[video_name].append(label_file)

        return label_files

    def extract_label(self, video_name):
        label_files = self.label_files.get(video_name, [])
        labels = []

        for label_file in label_files:
            with open(label_file, "r") as rf:
                data = json.load(rf)
            video_length = math.ceil(data["metadata"]["length"])
            annotator_label = np.zeros(video_length)

            for timeline in data["timelines"]:
                for t in range(timeline["start"], timeline["end"] + 1):
                    if t < video_length:
                        annotator_label[t] = 1

            labels.append(annotator_label)

        return np.array(labels)

    def prefetch_and_index(self):
        index = 0
        index_to_chunk = {}
        all_labels = {}

        for video_name in self.video_names:
            labels = self.extract_label(video_name)
            if labels is None or len(labels) == 0 or len(labels[0]) == 0:
                print(f"⚠️ Skipping {video_name}: no valid labels")
                continue

            all_labels[video_name] = labels
            chunk_count = math.ceil(len(labels[0]) / self.max_seq_len)

            for chunk_index in range(chunk_count):
                index_to_chunk[index + chunk_index] = (video_name, chunk_index)

            index += chunk_count

        return index, index_to_chunk, all_labels

    def __len__(self):
        return self.data_size

    def __getitem__(self, index):
        video_name, chunk_index = self.index_to_chunk[index]
        start = chunk_index * self.max_seq_len
        end = start + self.max_seq_len

        labels = self.labels[video_name][:, start:end]

        try:
            wav_path = self.wav_dir / f"{video_name}.wav"
            audio_data, sr = torchaudio.load(str(wav_path))
        except:
            print(f"🚫 Error loading: {wav_path}")
            return None

        resampler = T.Resample(sr, self.resample_sr, dtype=audio_data.dtype)
        audio_data = resampler(audio_data)
        audio_data = torch.mean(audio_data, axis=0).numpy()

        # Crop or pad the audio
        audio_data = audio_data[start * self.resample_sr : end * self.resample_sr]

        total_segments = self.max_seq_len
        num_frames_per_segment = len(audio_data) // total_segments
        audio_list = []

        for i in range(0, len(audio_data) - num_frames_per_segment + 1, num_frames_per_segment):
            segment = audio_data[i : i + num_frames_per_segment]

            if len(segment) < self.resample_sr:
                pad = self.resample_sr - len(segment)
                segment = np.pad(segment, (0, pad), mode="constant")
            elif len(segment) > self.resample_sr:
                segment = segment[:self.resample_sr]

            audio_list.append(segment)

        audio_array = np.vstack(audio_list)

        # Convert labels
        labels = torch.from_numpy(labels).squeeze(0)
        labels = torch.sum(labels, dim=0)
        labels = torch.min(labels, torch.ones(labels.shape[0], device=labels.device))

        return video_name, audio_array, labels


In [22]:
# ========== Dataset ==========
sd_train_av = SummaryDataset(
    list_file="/home/jovyan/EmotionDetection/video_data/av_train.txt",
    wav_dir="/home/jovyan/EmotionDetection/audio_data/av_train",
    label_dir="/home/jovyan/EmotionDetection/video_data/label"
)

sd_test_av = SummaryDataset(
    list_file="/home/jovyan/EmotionDetection/video_data/av_test.txt",
    wav_dir="/home/jovyan/EmotionDetection/audio_data/av_test",
    label_dir="/home/jovyan/EmotionDetection/video_data/label"
)

sd_test_mul = SummaryDataset(
    list_file="/home/jovyan/EmotionDetection/video_data/mul_test.txt",
    wav_dir="/home/jovyan/EmotionDetection/audio_data/mul_test",
    label_dir="/home/jovyan/EmotionDetection/video_data/label"
)

# ========== DataLoader ==========
# Custom collate function to skip None and unpack correctly
def safe_collate(batch):
    batch = [b for b in batch if b is not None]
    if len(batch) == 0:
        return None
    return tuple(zip(*batch))  # returns (video_names, inputs, labels)

dl_train_av = DataLoader(
    sd_train,
    batch_size=1,
    shuffle=True,
    num_workers=2,
    collate_fn=safe_collate)

dl_test_av = DataLoader(
    sd_test_av,
    batch_size=1,
    shuffle=False,
    num_workers=2,
    collate_fn=safe_collate)

dl_test_mul = DataLoader(
    sd_test_mul,
    batch_size=1,
    shuffle=False,
    num_workers=2,
    collate_fn=safe_collate)

# ========== Info ==========
print(f"📦 Train dataset size: {len(sd_train_av)}")
print(f"🎬 AV Test dataset size: {len(sd_test_av)}")
print(f"🎬 MUL Test dataset size: {len(sd_test_mul)}")

📦 Train dataset size: 401
🎬 AV Test dataset size: 140
🎬 MUL Test dataset size: 163


# Model Load
# Extract Emotional Feature


In [15]:
# -*- coding: cp949 -*-
from glob import glob
import os
import numpy as np
import pandas as pd
import torch
import torch.nn as nn
import torch.nn.functional as F
from torch.utils.data import TensorDataset, DataLoader, RandomSampler
import torchaudio
from torch.utils.data import Dataset, DataLoader
from transformers import AutoTokenizer, AutoModelForSequenceClassification
from torch.optim import AdamW
from sklearn.metrics import f1_score

from transformers import Wav2Vec2Processor, Wav2Vec2Model
import argparse 
from torch.nn.utils.rnn import pad_sequence

2025-04-13 07:36:19.909488: I tensorflow/core/util/port.cc:111] oneDNN custom operations are on. You may see slightly different numerical results due to floating-point round-off errors from different computation orders. To turn them off, set the environment variable `TF_ENABLE_ONEDNN_OPTS=0`.
2025-04-13 07:36:20.262358: E tensorflow/compiler/xla/stream_executor/cuda/cuda_dnn.cc:9342] Unable to register cuDNN factory: Attempting to register factory for plugin cuDNN when one has already been registered
2025-04-13 07:36:20.262437: E tensorflow/compiler/xla/stream_executor/cuda/cuda_fft.cc:609] Unable to register cuFFT factory: Attempting to register factory for plugin cuFFT when one has already been registered
2025-04-13 07:36:20.262499: E tensorflow/compiler/xla/stream_executor/cuda/cuda_blas.cc:1518] Unable to register cuBLAS factory: Attempting to register factory for plugin cuBLAS when one has already been registered
2025-04-13 07:36:20.288485: I tensorflow/core/platform/cpu_feature_g

In [16]:
# 주피터 노트북에서 명령행 인자 대신 변수를 직접 설정합니다.
args = argparse.Namespace()
args.num_labels = 7
args.seed = 1234
args.lr = 1e-5

In [17]:
class wav2vec_classifier(nn.Module):
    def __init__(self, extractor, num_labels, dropout_prob=0.1):
        super(wav2vec_classifier, self).__init__()

        self.extractor = extractor
        self.dropout = nn.Dropout(dropout_prob)
        self.nu_labels = num_labels
        self.classifier = nn.Linear(512, num_labels)
        #self.softmax = F.softmax()

    def forward(self, wav):
        extracted_wav = self.extractor(wav)
        
        #last_hidden_states = extracted_wav.last_hidden_state
        last_hidden_states = extracted_wav.extract_features
        last_hidden_states = self.dropout(last_hidden_states)
        output = self.classifier(last_hidden_states)
           
#         hidden = extracted_wav.hidden_states[-1]
        out_last = last_hidden_states[:, -1, :]
        
        return F.softmax(output[:, -1], dim=-1), last_hidden_states, out_last


extractor = Wav2Vec2Model.from_pretrained("facebook/wav2vec2-base-960h")
emotion_model = wav2vec_classifier(extractor, args.num_labels)
emotion_model.load_state_dict(torch.load('wav2vec_affective_audio/wac2vec_emotion_classification_model.pt'))
emotion_model.cuda()
######################################################

config.json:   0%|          | 0.00/1.60k [00:00<?, ?B/s]

model.safetensors:   0%|          | 0.00/378M [00:00<?, ?B/s]

Some weights of Wav2Vec2Model were not initialized from the model checkpoint at facebook/wav2vec2-base-960h and are newly initialized: ['masked_spec_embed']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


wav2vec_classifier(
  (extractor): Wav2Vec2Model(
    (feature_extractor): Wav2Vec2FeatureEncoder(
      (conv_layers): ModuleList(
        (0): Wav2Vec2GroupNormConvLayer(
          (conv): Conv1d(1, 512, kernel_size=(10,), stride=(5,), bias=False)
          (activation): GELUActivation()
          (layer_norm): GroupNorm(512, 512, eps=1e-05, affine=True)
        )
        (1-4): 4 x Wav2Vec2NoLayerNormConvLayer(
          (conv): Conv1d(512, 512, kernel_size=(3,), stride=(2,), bias=False)
          (activation): GELUActivation()
        )
        (5-6): 2 x Wav2Vec2NoLayerNormConvLayer(
          (conv): Conv1d(512, 512, kernel_size=(2,), stride=(2,), bias=False)
          (activation): GELUActivation()
        )
      )
    )
    (feature_projection): Wav2Vec2FeatureProjection(
      (layer_norm): LayerNorm((512,), eps=1e-05, elementwise_affine=True)
      (projection): Linear(in_features=512, out_features=768, bias=True)
      (dropout): Dropout(p=0.1, inplace=False)
    )
    (enc

In [18]:
optimizer = AdamW(emotion_model.parameters(), lr = args.lr,  eps = 1e-8)
criterion = nn.MultiLabelSoftMarginLoss()

In [19]:
device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')

In [25]:
import torch
from tqdm.notebook import tqdm_notebook

# train emotion data
emotion_waveform_av_train = []

for batch in tqdm_notebook(dl_train_av, total=len(dl_train_av), desc='Processing dataset'):
    if batch is None:
        continue

    video_names, inputs, labels = batch
    video_name = video_names[0]
    input_array = inputs[0]  # numpy array

    with torch.no_grad():
        emotion_model.eval()
        input_tensor = torch.tensor(input_array, dtype=torch.float32).to(device)  # 변환
        print(input_tensor.shape)

        outputs, last_hidden_states, out_last = emotion_model(input_tensor)
        emotion_waveform_av_train.append(out_last)


Processing dataset:   0%|          | 0/401 [00:00<?, ?it/s]

torch.Size([250, 2000])


  return F.conv1d(input, weight, bias, self.stride,


torch.Size([250, 2000])
torch.Size([250, 2000])
torch.Size([250, 2000])
torch.Size([250, 2000])
torch.Size([250, 2000])
torch.Size([250, 2000])
torch.Size([250, 2000])
torch.Size([250, 2000])
torch.Size([250, 2000])
torch.Size([250, 2000])
torch.Size([250, 2000])
torch.Size([250, 2000])
torch.Size([250, 2000])
torch.Size([250, 2000])
torch.Size([250, 2000])
torch.Size([250, 2000])
torch.Size([250, 2000])
torch.Size([250, 2000])
torch.Size([250, 2000])
torch.Size([250, 2000])
torch.Size([250, 2000])
torch.Size([250, 2000])
torch.Size([250, 2000])
torch.Size([250, 2000])
torch.Size([250, 2000])
torch.Size([250, 2000])
torch.Size([250, 2000])
torch.Size([250, 2000])
torch.Size([250, 2000])
torch.Size([250, 2000])
torch.Size([250, 2000])
torch.Size([250, 2000])
torch.Size([250, 2000])
torch.Size([250, 2000])
torch.Size([250, 2000])
torch.Size([250, 2000])
torch.Size([250, 2000])
torch.Size([250, 2000])
torch.Size([250, 2000])
torch.Size([250, 2000])
torch.Size([250, 2000])
torch.Size([250,

In [30]:
import numpy as np

# Move all tensors to CPU and convert to numpy
cpu_array = [t.cpu().numpy() for t in emotion_waveform_av_train]

# Save to .npy as object (since shapes might vary)
np.save("waveform_emotion_av_train.npy", np.array(cpu_array, dtype=object))


In [31]:
for i, t in enumerate(emotion_waveform_av_train):
    print(f"{i}: {t.shape}")


0: torch.Size([250, 512])
1: torch.Size([250, 512])
2: torch.Size([250, 512])
3: torch.Size([250, 512])
4: torch.Size([250, 512])
5: torch.Size([250, 512])
6: torch.Size([250, 512])
7: torch.Size([250, 512])
8: torch.Size([250, 512])
9: torch.Size([250, 512])
10: torch.Size([250, 512])
11: torch.Size([250, 512])
12: torch.Size([250, 512])
13: torch.Size([250, 512])
14: torch.Size([250, 512])
15: torch.Size([250, 512])
16: torch.Size([250, 512])
17: torch.Size([250, 512])
18: torch.Size([250, 512])
19: torch.Size([250, 512])
20: torch.Size([250, 512])
21: torch.Size([250, 512])
22: torch.Size([250, 512])
23: torch.Size([250, 512])
24: torch.Size([250, 512])
25: torch.Size([250, 512])
26: torch.Size([250, 512])
27: torch.Size([250, 512])
28: torch.Size([250, 512])
29: torch.Size([250, 512])
30: torch.Size([250, 512])
31: torch.Size([250, 512])
32: torch.Size([250, 512])
33: torch.Size([250, 512])
34: torch.Size([250, 512])
35: torch.Size([250, 512])
36: torch.Size([250, 512])
37: torch.S

In [33]:
import torch
from tqdm.notebook import tqdm_notebook

# train emotion data
waveform_emotion_av_test = []

for batch in tqdm_notebook(dl_test_av, total=len(dl_test_av), desc='Processing dataset'):
    if batch is None:
        continue

    video_names, inputs, labels = batch
    video_name = video_names[0]
    input_array = inputs[0]  # numpy array

    with torch.no_grad():
        emotion_model.eval()
        input_tensor = torch.tensor(input_array, dtype=torch.float32).to(device)  # 변환
        print(input_tensor.shape)

        outputs, last_hidden_states, out_last = emotion_model(input_tensor)
        waveform_emotion_av_test.append(out_last)


Processing dataset:   0%|          | 0/140 [00:00<?, ?it/s]

torch.Size([250, 2000])
torch.Size([250, 2000])
torch.Size([250, 2000])
torch.Size([250, 2000])
torch.Size([250, 2000])
torch.Size([250, 2000])
torch.Size([250, 2000])
torch.Size([250, 2000])
torch.Size([250, 2000])
torch.Size([250, 2000])
torch.Size([250, 2000])
torch.Size([250, 2000])
torch.Size([250, 2000])
torch.Size([250, 2000])
torch.Size([250, 2000])
torch.Size([250, 2000])
torch.Size([250, 2000])
torch.Size([250, 2000])
torch.Size([250, 2000])
torch.Size([250, 2000])
torch.Size([250, 2000])
torch.Size([250, 2000])
torch.Size([250, 2000])
torch.Size([250, 2000])
torch.Size([250, 2000])
torch.Size([250, 2000])
torch.Size([250, 2000])
torch.Size([250, 2000])
torch.Size([250, 2000])
torch.Size([250, 2000])
torch.Size([250, 2000])
torch.Size([250, 2000])
torch.Size([250, 2000])
torch.Size([254, 2000])
torch.Size([250, 2000])
torch.Size([250, 2000])
torch.Size([250, 2000])
torch.Size([250, 2000])
torch.Size([250, 2000])
torch.Size([250, 2000])
torch.Size([250, 2000])
torch.Size([250,

In [34]:
import torch
from tqdm.notebook import tqdm_notebook

# train emotion data
waveform_emotion_mul_test = []

for batch in tqdm_notebook(dl_test_mul, total=len(dl_test_mul), desc='Processing dataset'):
    if batch is None:
        continue

    video_names, inputs, labels = batch
    video_name = video_names[0]
    input_array = inputs[0]  # numpy array

    with torch.no_grad():
        emotion_model.eval()
        input_tensor = torch.tensor(input_array, dtype=torch.float32).to(device)  # 변환
        print(input_tensor.shape)

        outputs, last_hidden_states, out_last = emotion_model(input_tensor)
        waveform_emotion_mul_test.append(out_last)


Processing dataset:   0%|          | 0/163 [00:00<?, ?it/s]

torch.Size([250, 2000])
torch.Size([250, 2000])
torch.Size([250, 2000])
torch.Size([250, 2000])
torch.Size([250, 2000])
torch.Size([250, 2000])
torch.Size([250, 2000])
torch.Size([250, 2000])
torch.Size([250, 2000])
torch.Size([250, 2000])
torch.Size([250, 2000])
torch.Size([250, 2000])
torch.Size([250, 2000])
torch.Size([250, 2000])
torch.Size([250, 2000])
torch.Size([250, 2000])
torch.Size([250, 2000])
torch.Size([250, 2000])
torch.Size([250, 2000])
torch.Size([250, 2000])
torch.Size([250, 2000])
torch.Size([250, 2000])
torch.Size([250, 2000])
torch.Size([250, 2000])
torch.Size([250, 2000])
torch.Size([250, 2000])
torch.Size([250, 2000])
torch.Size([250, 2000])
torch.Size([250, 2000])
torch.Size([250, 2000])
torch.Size([250, 2000])
torch.Size([250, 2000])
torch.Size([250, 2000])
torch.Size([250, 2000])
torch.Size([250, 2000])
torch.Size([250, 2000])
torch.Size([250, 2000])
torch.Size([250, 2000])
torch.Size([250, 2000])
torch.Size([250, 2000])
torch.Size([250, 2000])
torch.Size([250,

In [35]:
waveform_emotion_av_test[0].shape, len(waveform_emotion_av_test)

(torch.Size([250, 512]), 140)

In [36]:
waveform_emotion_mul_test[0].shape, len(waveform_emotion_mul_test)

(torch.Size([250, 512]), 163)

In [39]:
import numpy as np

# Move all tensors to CPU and convert to numpy
cpu_array = [t.cpu().numpy() for t in waveform_emotion_av_test]

# Save to .npy as object (since shapes might vary)
np.save("waveform_emotion_av_test.npy", np.array(cpu_array, dtype=object))


In [40]:

import numpy as np

# Move all tensors to CPU and convert to numpy
cpu_array = [t.cpu().numpy() for t in waveform_emotion_mul_test]

# Save to .npy as object (since shapes might vary)
np.save("waveform_emotion_mul_test.npy", np.array(cpu_array, dtype=object))


# Extract Arousal, Valence Feature

In [41]:
# 주피터 노트북에서 명령행 인자 대신 변수를 직접 설정합니다.
args = argparse.Namespace()
args.seed = 1234
args.regress = 1
args.lr = 1e-5

In [43]:
class wav2vec_classifier(nn.Module):
    def __init__(self, extractor, num_labels, dropout_prob=0.1):
        super(wav2vec_classifier, self).__init__()

        self.extractor = extractor
        self.dropout = nn.Dropout(dropout_prob)
        self.nu_labels = num_labels
        self.valence_classifier = nn.Linear(512, num_labels)
        self.arousal_classifier = nn.Linear(512, num_labels)

    def forward(self, wav):
        extracted_wav = self.extractor(wav)
        
        #last_hidden_states = extracted_wav.last_hidden_state
        last_hidden_states = extracted_wav.extract_features # B, seq, 512
        last_hidden_states = self.dropout(last_hidden_states) #B, seq, 512
        
        output_valence = self.valence_classifier(last_hidden_states) # B, Seq, 1
        output_arousal = self.arousal_classifier(last_hidden_states) # B, Seq, 1
                
        out_last = last_hidden_states[:, -1, :]
        return output_valence[:, -1], output_arousal[:, -1], out_last

extractor = Wav2Vec2Model.from_pretrained("facebook/wav2vec2-base-960h")
arousal_valence_model = wav2vec_classifier(extractor,args.regress)
arousal_valence_model.load_state_dict(torch.load('wav2vec_affective_audio/wac2vec_arousal_valence_model_epoch_4.pt'))
arousal_valence_model.cuda()

Some weights of Wav2Vec2Model were not initialized from the model checkpoint at facebook/wav2vec2-base-960h and are newly initialized: ['masked_spec_embed']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


wav2vec_classifier(
  (extractor): Wav2Vec2Model(
    (feature_extractor): Wav2Vec2FeatureEncoder(
      (conv_layers): ModuleList(
        (0): Wav2Vec2GroupNormConvLayer(
          (conv): Conv1d(1, 512, kernel_size=(10,), stride=(5,), bias=False)
          (activation): GELUActivation()
          (layer_norm): GroupNorm(512, 512, eps=1e-05, affine=True)
        )
        (1-4): 4 x Wav2Vec2NoLayerNormConvLayer(
          (conv): Conv1d(512, 512, kernel_size=(3,), stride=(2,), bias=False)
          (activation): GELUActivation()
        )
        (5-6): 2 x Wav2Vec2NoLayerNormConvLayer(
          (conv): Conv1d(512, 512, kernel_size=(2,), stride=(2,), bias=False)
          (activation): GELUActivation()
        )
      )
    )
    (feature_projection): Wav2Vec2FeatureProjection(
      (layer_norm): LayerNorm((512,), eps=1e-05, elementwise_affine=True)
      (projection): Linear(in_features=512, out_features=768, bias=True)
      (dropout): Dropout(p=0.1, inplace=False)
    )
    (enc

In [44]:
optimizer = torch.optim.AdamW(arousal_valence_model.parameters(), lr = args.lr,  eps = 1e-8)
criterion = nn.MSELoss()

In [45]:
!nvidia-smi

Sun Apr 13 16:04:07 2025       
+-----------------------------------------------------------------------------+
| NVIDIA-SMI 470.182.03   Driver Version: 470.182.03   CUDA Version: 11.4     |
|-------------------------------+----------------------+----------------------+
| GPU  Name        Persistence-M| Bus-Id        Disp.A | Volatile Uncorr. ECC |
| Fan  Temp  Perf  Pwr:Usage/Cap|         Memory-Usage | GPU-Util  Compute M. |
|                               |                      |               MIG M. |
|   0  NVIDIA RTX A4000    Off  | 00000000:65:00.0 Off |                  Off |
| 41%   35C    P2    34W / 140W |   6788MiB / 16117MiB |      0%      Default |
|                               |                      |                  N/A |
+-------------------------------+----------------------+----------------------+
                                                                               
+-----------------------------------------------------------------------------+
| Proces

In [47]:
from tqdm.notebook import tqdm_notebook

waveform_arousal_valence_av_train = []
train_labels = []

for batch in tqdm_notebook(dl_train_av, total=len(dl_train_av), desc='Processing dataset'):
    if batch is None:
        continue
    video_names, inputs, labels = batch
    video_name = video_names[0]
    input_array = inputs[0]  # Unpack the tuple to get the actual array

    with torch.no_grad():
        arousal_valence_model.eval()
        input_tensor = torch.tensor(input_array, dtype=torch.float32).to(device)
        print(input_tensor.shape)
        outputs_valence, outputs_arousal, out_last = arousal_valence_model(input_tensor)
        waveform_arousal_valence_av_train.append(out_last.cpu())  # Move to CPU for later save
        train_labels.append(labels)


Processing dataset:   0%|          | 0/401 [00:00<?, ?it/s]

torch.Size([250, 2000])
torch.Size([250, 2000])
torch.Size([250, 2000])
torch.Size([250, 2000])
torch.Size([250, 2000])
torch.Size([250, 2000])
torch.Size([250, 2000])
torch.Size([250, 2000])
torch.Size([250, 2000])
torch.Size([250, 2000])
torch.Size([250, 2000])
torch.Size([250, 2000])
torch.Size([250, 2000])
torch.Size([250, 2000])
torch.Size([250, 2000])
torch.Size([250, 2000])
torch.Size([250, 2000])
torch.Size([250, 2000])
torch.Size([250, 2000])
torch.Size([250, 2000])
torch.Size([250, 2000])
torch.Size([250, 2000])
torch.Size([250, 2000])
torch.Size([250, 2000])
torch.Size([251, 2000])
torch.Size([250, 2000])
torch.Size([250, 2000])
torch.Size([250, 2000])
torch.Size([250, 2000])
torch.Size([250, 2000])
torch.Size([250, 2000])
torch.Size([250, 2000])
torch.Size([250, 2000])
torch.Size([250, 2000])
torch.Size([250, 2000])
torch.Size([250, 2000])
torch.Size([250, 2000])
torch.Size([250, 2000])
torch.Size([250, 2000])
torch.Size([250, 2000])
torch.Size([250, 2000])
torch.Size([250,

In [49]:
import numpy as np

# Move all tensors to CPU and convert to numpy
cpu_array = [t.cpu().numpy() for t in waveform_arousal_valence_av_train]

# Save to .npy as object (since shapes might vary)
np.save("Features/waveform_arousal_valence_av_train.npy", np.array(cpu_array, dtype=object))


In [50]:
from tqdm.notebook import tqdm_notebook

waveform_arousal_valence_av_test = []
av_test_labels = []

for batch in tqdm_notebook(dl_test_av, total=len(dl_test_av), desc='Processing dataset'):
    if batch is None:
        continue
    video_names, inputs, labels = batch
    video_name = video_names[0]
    input_array = inputs[0]  # Unpack the tuple to get the actual array

    with torch.no_grad():
        arousal_valence_model.eval()
        input_tensor = torch.tensor(input_array, dtype=torch.float32).to(device)
        print(input_tensor.shape)
        outputs_valence, outputs_arousal, out_last = arousal_valence_model(input_tensor)
        waveform_arousal_valence_av_test.append(out_last.cpu())  # Move to CPU for later save
        av_test_labels.append(labels)

Processing dataset:   0%|          | 0/140 [00:00<?, ?it/s]

torch.Size([250, 2000])
torch.Size([250, 2000])
torch.Size([250, 2000])
torch.Size([250, 2000])
torch.Size([250, 2000])
torch.Size([250, 2000])
torch.Size([250, 2000])
torch.Size([250, 2000])
torch.Size([250, 2000])
torch.Size([250, 2000])
torch.Size([250, 2000])
torch.Size([250, 2000])
torch.Size([250, 2000])
torch.Size([250, 2000])
torch.Size([250, 2000])
torch.Size([250, 2000])
torch.Size([250, 2000])
torch.Size([250, 2000])
torch.Size([250, 2000])
torch.Size([250, 2000])
torch.Size([250, 2000])
torch.Size([250, 2000])
torch.Size([250, 2000])
torch.Size([250, 2000])
torch.Size([250, 2000])
torch.Size([250, 2000])
torch.Size([250, 2000])
torch.Size([250, 2000])
torch.Size([250, 2000])
torch.Size([250, 2000])
torch.Size([250, 2000])
torch.Size([250, 2000])
torch.Size([250, 2000])
torch.Size([254, 2000])
torch.Size([250, 2000])
torch.Size([250, 2000])
torch.Size([250, 2000])
torch.Size([250, 2000])
torch.Size([250, 2000])
torch.Size([250, 2000])
torch.Size([250, 2000])
torch.Size([250,

In [51]:
from tqdm.notebook import tqdm_notebook

waveform_arousal_valence_mul_test = []
mul_test_labels = []

for batch in tqdm_notebook(dl_test_mul, total=len(dl_test_mul), desc='Processing dataset'):
    if batch is None:
        continue
    video_names, inputs, labels = batch
    video_name = video_names[0]
    input_array = inputs[0]  # Unpack the tuple to get the actual array

    with torch.no_grad():
        arousal_valence_model.eval()
        input_tensor = torch.tensor(input_array, dtype=torch.float32).to(device)
        print(input_tensor.shape)
        outputs_valence, outputs_arousal, out_last = arousal_valence_model(input_tensor)
        waveform_arousal_valence_mul_test.append(out_last.cpu())  # Move to CPU for later save
        mul_test_labels.append(labels)

Processing dataset:   0%|          | 0/163 [00:00<?, ?it/s]

torch.Size([250, 2000])
torch.Size([250, 2000])
torch.Size([250, 2000])
torch.Size([250, 2000])
torch.Size([250, 2000])
torch.Size([250, 2000])
torch.Size([250, 2000])
torch.Size([250, 2000])
torch.Size([250, 2000])
torch.Size([250, 2000])
torch.Size([250, 2000])
torch.Size([250, 2000])
torch.Size([250, 2000])
torch.Size([250, 2000])
torch.Size([250, 2000])
torch.Size([250, 2000])
torch.Size([250, 2000])
torch.Size([250, 2000])
torch.Size([250, 2000])
torch.Size([250, 2000])
torch.Size([250, 2000])
torch.Size([250, 2000])
torch.Size([250, 2000])
torch.Size([250, 2000])
torch.Size([250, 2000])
torch.Size([250, 2000])
torch.Size([250, 2000])
torch.Size([250, 2000])
torch.Size([250, 2000])
torch.Size([250, 2000])
torch.Size([250, 2000])
torch.Size([250, 2000])
torch.Size([250, 2000])
torch.Size([250, 2000])
torch.Size([250, 2000])
torch.Size([250, 2000])
torch.Size([250, 2000])
torch.Size([250, 2000])
torch.Size([250, 2000])
torch.Size([250, 2000])
torch.Size([250, 2000])
torch.Size([250,

In [52]:
import numpy as np

# Move all tensors to CPU and convert to numpy
cpu_array = [t.cpu().numpy() for t in waveform_arousal_valence_mul_test]

# Save to .npy as object (since shapes might vary)
np.save("Features/waveform_arousal_valence_mul_test.npy", np.array(cpu_array, dtype=object))

In [53]:
import numpy as np

# Move all tensors to CPU and convert to numpy
cpu_array = [t.cpu().numpy() for t in waveform_arousal_valence_mul_test]

# Save to .npy as object (since shapes might vary)
np.save("Features/waveform_arousal_valence_mul_test.npy", np.array(cpu_array, dtype=object))

In [54]:
emotion_train = np.load('Features/waveform_emotion_av_train.npy',allow_pickle=True)
arousal_valence_train = np.load('Features/waveform_arousal_valence_av_train.npy',allow_pickle=True)
len(emotion_train), len(arousal_valence_train),emotion_train[0].shape ,arousal_valence_train[0].shape

(401, 401, (250, 512), (250, 512))

In [55]:
waveform_emotion_av_train[0].shape, len(waveform_emotion_av_train)

(torch.Size([250, 512]), 140)

In [56]:
waveform_emotion_av_test[0].shape, len(waveform_emotion_av_test)

(torch.Size([250, 512]), 140)

In [57]:
waveform_emotion_mul_test[0].shape, waveform_emotion_mul_test[0].shape

(torch.Size([250, 512]), torch.Size([250, 512]))

# Feature Concatenate (train)

In [58]:
import os

import numpy as np
import pandas as pd
import torch

In [61]:
features_concatenate_train = []

for av_feature, emotion_feature in zip(arousal_valence_train, emotion_train):
    av_tensor = torch.tensor(av_feature)
    emo_tensor = torch.tensor(emotion_feature)

    min_len = min(av_tensor.shape[0], emo_tensor.shape[0])
    av_tensor = av_tensor[:min_len]
    emo_tensor = emo_tensor[:min_len]

    concated_feature = torch.cat((av_tensor, emo_tensor), dim=1)
    features_concatenate_train.append(concated_feature)


In [62]:
features_concatenate_train[0].shape, len(features_concatenate_train)

(torch.Size([250, 1024]), 401)

In [64]:
data_train = np.array([t.cpu().numpy() for t in features_concatenate_train])

# 저장 경로와 파일 이름 지정
file_path = 'Features/concatenate_emotion_waveform_av_train.npy'

# 넘파이 배열을 npy 파일로 저장
np.save(file_path, data_train)

In [73]:
import numpy as np

# Unpack tensors from tuple and convert to numpy
new_np_labels_train = [l[0].cpu().numpy().squeeze() for l in train_labels]

# Convert to numpy array (may still be ragged depending on shape)
labels_train = np.array(new_np_labels_train, dtype=object)

# Save
np.save('Features/labels_av_train.npy', labels_train)


In [74]:
import numpy as np

# AV TEST
np_av_test_labels = [l[0].cpu().numpy().squeeze() for l in av_test_labels]
np_av_test_labels = np.array(np_av_test_labels, dtype=object)
np.save('Features/labels_av_test.npy', np_av_test_labels)

# MUL TEST
np_mul_test_labels = [l[0].cpu().numpy().squeeze() for l in mul_test_labels]
np_mul_test_labels = np.array(np_mul_test_labels, dtype=object)
np.save('Features/labels_mul_test.npy', np_mul_test_labels)

print("✅ Saved all label arrays!")


✅ Saved all label arrays!


In [45]:
len(labels_train), len(labels_val), labels_train[0].shape, labels_val[0].shape

(293, 48, (250,), (250,))

In [75]:
from tqdm.notebook import tqdm

# label과 feature index 개수 맞추기
# feature에서 임의로 0으로 padding 해놨던 구간 인덱스 개수 맞추기
new_features_concatenate_list_train = []
new_labels_train = []

for concat, label in tqdm(zip(data_train, labels_train), total=len(labels_train), desc="Processing"):
    #print(concat.shape, label.shape)
    if label.shape == ():
         # concat 길이 줄이기
        target_length = 1 
        new_concat = concat[0:target_length,:]
        new_features_concatenate_list_train.append(new_concat)
        new_labels_train.append(label)
        print(f'scalar check1: {new_concat.shape}, {label.shape}')
        continue  # or any other logic you want to apply for scalar tensors
    
    if len(concat) == len(label):
        new_features_concatenate_list_train.append(concat)
        new_labels_train.append(label)
    elif len(concat) > len(label):
        # concat 길이 줄이기
        target_length = len(label)  
        new_concat = concat[0:target_length,:]
        new_features_concatenate_list_train.append(new_concat)
        new_labels_train.append(label)
        print(f'check1: {new_concat.shape}, {label.shape}')
    elif len(concat) < len(label):
        # concat 길이 줄이기
        target_length = len(concat.cuda())  
        new_label = label[0:target_length]
        new_labels_train.append(new_label)
        new_features_concatenate_list_train.append(concat)
        print(f'check2: {concat.shape}, {new_label.shape}')

Processing:   0%|          | 0/401 [00:00<?, ?it/s]

check1: (246, 1024), (246,)
check1: (106, 1024), (106,)
check1: (133, 1024), (133,)
check1: (102, 1024), (102,)
check1: (124, 1024), (124,)
check1: (28, 1024), (28,)
check1: (95, 1024), (95,)
check1: (69, 1024), (69,)
check1: (18, 1024), (18,)
check1: (86, 1024), (86,)
check1: (219, 1024), (219,)
check1: (79, 1024), (79,)
check1: (136, 1024), (136,)
check1: (118, 1024), (118,)
check1: (83, 1024), (83,)
check1: (159, 1024), (159,)
check1: (106, 1024), (106,)
check1: (196, 1024), (196,)
check1: (47, 1024), (47,)
check1: (201, 1024), (201,)
check1: (67, 1024), (67,)
check1: (102, 1024), (102,)
check1: (246, 1024), (246,)
check1: (134, 1024), (134,)
check1: (59, 1024), (59,)
check1: (163, 1024), (163,)
check1: (241, 1024), (241,)
check1: (71, 1024), (71,)
check1: (30, 1024), (30,)
check1: (231, 1024), (231,)
check1: (111, 1024), (111,)
check1: (93, 1024), (93,)
check1: (94, 1024), (94,)
check1: (105, 1024), (105,)
check1: (158, 1024), (158,)
check1: (109, 1024), (109,)
check1: (43, 1024), 

In [76]:
len(new_labels_train), new_labels_train[0].shape

(401, (250,))

In [83]:
data_train = np.array([t for t in new_features_concatenate_list_train], dtype=object)
file_path = 'Features/concatenate/concatenate_waveform_av_train.npy'
np.save(file_path, data_train)
print("✅ Saved with dtype=object")


✅ Saved with dtype=object


In [84]:
labels_train = np.array([t for t in new_labels_train], dtype=object)
label_path = 'Features/labels_av_train.npy'
np.save(label_path, labels_train)
print("✅ Saved with dtype=object")


✅ Saved with dtype=object


# Feature Concatenate (av_test,mul_test)

In [88]:
import torch
import numpy as np
from pathlib import Path

# Utility Function to Concatenate Feature Pairs Safely
def concatenate_features(av_list, emo_list):
    concat_list = []
    for av_feature, emotion_feature in zip(av_list, emo_list):
        av_tensor = av_feature.cpu() if av_feature.is_cuda else av_feature
        emo_tensor = emotion_feature.cpu() if emotion_feature.is_cuda else emotion_feature
        concat_tensor = torch.cat((av_tensor, emo_tensor), dim=1)
        concat_list.append(concat_tensor)
    return concat_list

# Save function
def save_npy(data_list, file_path):
    np_array = np.array([t.cpu().numpy() for t in data_list], dtype=object)
    np.save(file_path, np_array)
    print(f"✅ Saved: {file_path}")

# Output directory
Path("Features/concatenate").mkdir(parents=True, exist_ok=True)

# 1. AV TEST
features_concatenate_av_test = concatenate_features(
    waveform_arousal_valence_av_test, waveform_emotion_av_test
)
save_npy(features_concatenate_av_test, "Features/concatenate/concatenate_waveform_av_test.npy")

# 2. MUL TEST
features_concatenate_mul_test = concatenate_features(
    waveform_arousal_valence_mul_test, waveform_emotion_mul_test
)
save_npy(features_concatenate_mul_test, "Features/concatenate/concatenate_waveform_mul_test.npy")


✅ Saved: Features/concatenate/concatenate_waveform_av_test.npy
✅ Saved: Features/concatenate/concatenate_waveform_mul_test.npy


In [97]:
import numpy as np
from tqdm.notebook import tqdm
def align_features_labels(features, labels, verbose=True):
    aligned_features = []
    aligned_labels = []

    for concat, label in tqdm(zip(features, labels), total=len(labels), desc="Aligning"):
        # ⚠️ Unpack if label is a tuple like (tensor,)
        if isinstance(label, tuple) and len(label) == 1 and isinstance(label[0], torch.Tensor):
            label = label[0]
        elif isinstance(label, tuple):
            if verbose:
                print(f"⚠️ Skipping invalid label (tuple of unexpected format): {label}")
            continue

        # Check scalar
        if hasattr(label, "ndim") and label.ndim == 0:
            if verbose:
                print(f"⚠️ Skipping scalar label: {label}")
            continue

        # Tensor to numpy
        concat = concat.cpu().numpy() if hasattr(concat, "cpu") else concat
        label = label.cpu().numpy() if hasattr(label, "cpu") else label

        # Length match logic
        if len(concat) == len(label):
            aligned_features.append(concat)
            aligned_labels.append(label)
        elif len(concat) > len(label):
            aligned_features.append(concat[:len(label)])
            aligned_labels.append(label)
            if verbose:
                print(f"✂️ Trimmed feature: {len(concat)} → {len(label)}")
        elif len(concat) < len(label):
            aligned_features.append(concat)
            aligned_labels.append(label[:len(concat)])
            if verbose:
                print(f"✂️ Trimmed label: {len(label)} → {len(concat)}")

    return aligned_features, aligned_labels


In [98]:
# 예시: AV-TEST 셋에 대해 저장
features_concatenate_val_aligned, labels_val_aligned = align_features_labels(
    features_concatenate_av_test,  # ex: waveform_arousal_valence_av_test + emotion_av_test concat
    av_test_labels
)

# numpy 배열 저장
np.save("Features/concatenate/concatenate_waveform_av_test.npy", np.array(features_concatenate_val_aligned, dtype=object))
np.save("Features/labels_av_test.npy", np.array(labels_val_aligned, dtype=object))


Aligning:   0%|          | 0/140 [00:00<?, ?it/s]

✂️ Trimmed feature: 250 → 97
✂️ Trimmed feature: 250 → 94
✂️ Trimmed feature: 250 → 103
✂️ Trimmed feature: 250 → 106
✂️ Trimmed feature: 250 → 82
✂️ Trimmed feature: 250 → 105
✂️ Trimmed feature: 250 → 168
✂️ Trimmed feature: 250 → 115
✂️ Trimmed feature: 250 → 170
✂️ Trimmed feature: 250 → 93
✂️ Trimmed feature: 250 → 102
✂️ Trimmed feature: 250 → 104
✂️ Trimmed feature: 250 → 95
✂️ Trimmed feature: 250 → 111
✂️ Trimmed feature: 250 → 191
✂️ Trimmed feature: 250 → 90
✂️ Trimmed feature: 250 → 117
✂️ Trimmed feature: 250 → 169
✂️ Trimmed feature: 250 → 179
✂️ Trimmed feature: 250 → 102
✂️ Trimmed feature: 250 → 151
✂️ Trimmed feature: 250 → 97
✂️ Trimmed feature: 250 → 81
✂️ Trimmed feature: 250 → 226
✂️ Trimmed feature: 250 → 108
✂️ Trimmed feature: 250 → 191
✂️ Trimmed feature: 250 → 124
✂️ Trimmed feature: 254 → 8
✂️ Trimmed feature: 250 → 75
✂️ Trimmed feature: 250 → 161
✂️ Trimmed feature: 250 → 98
✂️ Trimmed feature: 250 → 213
✂️ Trimmed feature: 250 → 72
✂️ Trimmed feature: 250

In [102]:
# mul_test
features_concatenate_mul_aligned, labels_mul_aligned = align_features_labels(
    features_concatenate_mul_test, mul_test_labels
)
np.save("Features/concatenate/concatenate_waveform_mul_test.npy", np.array(features_concatenate_mul_aligned, dtype=object))
np.save("Features/labels_mul_test.npy", np.array(labels_mul_aligned, dtype=object))


Aligning:   0%|          | 0/163 [00:00<?, ?it/s]

✂️ Trimmed feature: 250 → 96
✂️ Trimmed feature: 250 → 107
✂️ Trimmed feature: 250 → 94
✂️ Trimmed feature: 250 → 87
✂️ Trimmed feature: 250 → 84
✂️ Trimmed feature: 250 → 125
✂️ Trimmed feature: 250 → 106
✂️ Trimmed feature: 250 → 237
✂️ Trimmed feature: 250 → 87
✂️ Trimmed feature: 250 → 162
✂️ Trimmed feature: 250 → 94
✂️ Trimmed feature: 250 → 90
✂️ Trimmed feature: 250 → 106
✂️ Trimmed feature: 250 → 177
✂️ Trimmed feature: 250 → 106
✂️ Trimmed feature: 250 → 95
✂️ Trimmed feature: 250 → 214
✂️ Trimmed feature: 250 → 231
✂️ Trimmed feature: 250 → 156
✂️ Trimmed feature: 250 → 108
✂️ Trimmed feature: 250 → 99
✂️ Trimmed feature: 250 → 185
✂️ Trimmed feature: 250 → 208
✂️ Trimmed feature: 250 → 54
✂️ Trimmed feature: 250 → 18
✂️ Trimmed feature: 250 → 181
✂️ Trimmed feature: 250 → 192
✂️ Trimmed feature: 250 → 128
✂️ Trimmed feature: 250 → 195
✂️ Trimmed feature: 250 → 181
✂️ Trimmed feature: 250 → 238
✂️ Trimmed feature: 250 → 118
✂️ Trimmed feature: 250 → 101
✂️ Trimmed feature: 2

In [103]:
len(labels_val_aligned), len(labels_mul_aligned)

(140, 163)

# Data Loader

In [104]:
from torch.utils.data import DataLoader, Dataset
class MyDataset(Dataset):
    def __init__(self, data, labels):
        self.data = data
        self.labels = labels
        
    def __len__(self):
        return len(self.data)
        
    def __getitem__(self, idx):
        return self.data[idx], self.labels[idx]

In [108]:
# 리스트 데이터 로드
features_concatenate_train = np.load('Features/concatenate/concatenate_waveform_av_train.npy',allow_pickle=True)
len(features_concatenate_train), features_concatenate_train[0].shape

(401, (250, 1024))

In [111]:
# 리스트 데이터 로드
features_concatenate_val= np.load( 'Features/concatenate/concatenate_waveform_av_test.npy',allow_pickle=True)
len(features_concatenate_val), features_concatenate_val[0].shape

(140, (97, 1024))

In [131]:
# 리스트 데이터 로드
features_concatenate_test= np.load( 'Features/concatenate/concatenate_waveform_mul_test.npy',allow_pickle=True)
len(features_concatenate_test), features_concatenate_test[0].shape

(163, (96, 1024))

In [112]:
wav2vec2_labels_train=np.load('Features/labels_av_train.npy',allow_pickle=True)
len(wav2vec2_labels_train), wav2vec2_labels_train[0].shape

(401, (250,))

In [114]:
wav2vec2_labels_val=np.load('Features/labels_av_test.npy',allow_pickle=True)
len(wav2vec2_labels_val), wav2vec2_labels_val[0].shape

(140, (97,))

In [128]:
wav2vec2_labels_test=np.load('Features/labels_mul_test.npy',allow_pickle=True)
len(wav2vec2_labels_test), wav2vec2_labels_test[0].shape

(163, (96,))

In [129]:
new_wav2vec2_labels_train =   [torch.tensor(arr) for arr in wav2vec2_labels_train]
new_wav2vec2_labels_val =   [torch.tensor(arr) for arr in wav2vec2_labels_val]
new_wav2vec2_labels_test =   [torch.tensor(arr) for arr in wav2vec2_labels_test]

In [132]:
md_train = MyDataset(features_concatenate_train,new_wav2vec2_labels_train)
md_val = MyDataset(features_concatenate_val,new_wav2vec2_labels_val)
md_test = MyDataset(features_concatenate_test,new_wav2vec2_labels_test)

In [134]:
train_dataloader = DataLoader(md_train)
val_dataloader = DataLoader(md_val)
test_dataloader = DataLoader(md_test)

In [118]:
len(wav2vec2_labels_train[2]), len(wav2vec2_labels_train[2])

(246, 246)

# Evaluation

In [135]:
import torch
import torch.nn as nn

class LSTMClassifier(nn.Module):
    def __init__(self, input_size, hidden_size, num_layers, output_size, dropout_rate=0.5):
        super(LSTMClassifier, self).__init__()
        self.hidden_size = hidden_size
        self.num_layers = num_layers
        self.lstm = nn.LSTM(input_size, hidden_size, num_layers, batch_first=True, dropout=dropout_rate if num_layers > 1 else 0)
        self.dropout = nn.Dropout(dropout_rate)
        self.fc = nn.Linear(hidden_size, output_size)
        self.sigmoid = nn.Sigmoid()
        
    def forward(self, x):
        # Set initial hidden and cell states
        h0 = torch.zeros(self.num_layers, x.size(0), self.hidden_size).to(x.device)
        c0 = torch.zeros(self.num_layers, x.size(0), self.hidden_size).to(x.device)
        
        # Forward propagate LSTM
        out, _ = self.lstm(x, (h0, c0))
        
        # Apply dropout
        out = self.dropout(out[:, -1, :])
        
        # Decode with fully connected layer
        output = self.fc(out)
        
        # Apply sigmoid activation function to output
        output = self.sigmoid(output)
        
        return output, out


In [136]:
input_size = 1024
hidden_size = 128
num_layers = 3
output_size = 1

model = LSTMClassifier(input_size, hidden_size, num_layers, output_size)
model.load_state_dict(torch.load('/home/jovyan/EmotionDetection/model/lstm_audio_waveform_emotion_160_best.pt'))

<All keys matched successfully>

In [137]:
from tqdm.notebook import tqdm_notebook

device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')

y_pred = []
lstm_emotion_waveform_multimodal_train = []

for inputs, labels in tqdm_notebook(train_dataloader,total=len(train_dataloader), desc='Processing dataset'):
    with torch.no_grad():
        inputs_dim = len(inputs.shape)
        labels_dim = len(labels.shape)
        if inputs_dim < 2 or labels_dim < 2:
            model.eval()
            print(f"Mismatched dimensions: inputs({inputs.shape}), labels({labels.shape})")
            inputs = inputs.transpose(0,1).to(device)
            labels = labels.squeeze(0).to(device)

            y_p, y_f = model(inputs.cpu())
            y_pred.append(y_p)
            lstm_emotion_waveform_multimodal_train.append(y_f)
            continue
            
        
        if (inputs.shape[1])!= (labels.shape[1]):
            print( inputs.shape[1], labels.shape[1])
            continue
        
        model.eval()
        inputs = inputs.transpose(0,1).to(device)
        labels = labels.squeeze(0).to(device)
        print(f'inputs.shape: {inputs.shape}, labels.shape: {labels.shape}')
        y_p, y_f = model(inputs.cpu())
        y_pred.append(y_p)
        lstm_emotion_waveform_multimodal_train.append(y_f)
        
    

Processing dataset:   0%|          | 0/401 [00:00<?, ?it/s]

inputs.shape: torch.Size([250, 1, 1024]), labels.shape: torch.Size([250])
inputs.shape: torch.Size([250, 1, 1024]), labels.shape: torch.Size([250])
inputs.shape: torch.Size([246, 1, 1024]), labels.shape: torch.Size([246])
inputs.shape: torch.Size([250, 1, 1024]), labels.shape: torch.Size([250])
inputs.shape: torch.Size([250, 1, 1024]), labels.shape: torch.Size([250])
inputs.shape: torch.Size([106, 1, 1024]), labels.shape: torch.Size([106])
inputs.shape: torch.Size([133, 1, 1024]), labels.shape: torch.Size([133])
inputs.shape: torch.Size([250, 1, 1024]), labels.shape: torch.Size([250])
inputs.shape: torch.Size([250, 1, 1024]), labels.shape: torch.Size([250])
inputs.shape: torch.Size([250, 1, 1024]), labels.shape: torch.Size([250])
inputs.shape: torch.Size([102, 1, 1024]), labels.shape: torch.Size([102])
inputs.shape: torch.Size([124, 1, 1024]), labels.shape: torch.Size([124])
inputs.shape: torch.Size([250, 1, 1024]), labels.shape: torch.Size([250])
inputs.shape: torch.Size([250, 1, 1024

In [124]:
import numpy as np

# 리스트 내부 tensor들을 numpy object 배열로 저장
lstm_emotion_waveform_multimodal_train_np = np.array(
    [p.cpu().numpy() for p in lstm_emotion_waveform_multimodal_train],
    dtype=object  # 다양한 shape 허용
)

# 저장
np.save("Features/lstm_emotion_waveform_av_train.npy", lstm_emotion_waveform_multimodal_train_np)


  lstm_emotion_waveform_multimodal_train  = np.array([p.cpu() for p in lstm_emotion_waveform_multimodal_train])


ValueError: setting an array element with a sequence. The requested array has an inhomogeneous shape after 1 dimensions. The detected shape was (401,) + inhomogeneous part.

In [125]:
len(lstm_emotion_waveform_multimodal_train)

401

In [115]:
np.save('../multimodal/lstm_emotion_waveform_av_train.npy',lstm_emotion_waveform_multimodal_train)

In [116]:
from tqdm.notebook import tqdm_notebook

device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')

y_pred = []
lstm_emotion_waveform_multimodal_val = []

for inputs, labels in tqdm_notebook(val_dataloader,total=len(val_dataloader), desc='Processing dataset'):
    with torch.no_grad():
        inputs_dim = len(inputs.shape)
        labels_dim = len(labels.shape)
        if inputs_dim < 2 or labels_dim < 2:
            model.eval()
            print(f"Mismatched dimensions: inputs({inputs.shape}), labels({labels.shape})")
            inputs = inputs.transpose(0,1).to(device)
            labels = labels.squeeze(0).to(device)

            y_p, y_f = model(inputs.cpu())
            y_pred.append(y_p)
            lstm_emotion_waveform_multimodal_val.append(y_f)
            continue
            
        if inputs.shape[1]!= labels.shape[1] :
            print("mismatch")
            continue
        model.eval()
        inputs = inputs.transpose(0,1).to(device)
        labels = labels.squeeze(0).to(device)
        print(f'inputs.shape: {inputs.shape}, labels.shape: {labels.shape}')
        y_p, y_f = model(inputs.cpu())
        y_pred.append(y_p)
        lstm_emotion_waveform_multimodal_val.append(y_f)
        
    

Processing dataset:   0%|          | 0/48 [00:00<?, ?it/s]

inputs.shape: torch.Size([250, 1, 1024]), labels.shape: torch.Size([250])
inputs.shape: torch.Size([250, 1, 1024]), labels.shape: torch.Size([250])
inputs.shape: torch.Size([250, 1, 1024]), labels.shape: torch.Size([250])
inputs.shape: torch.Size([55, 1, 1024]), labels.shape: torch.Size([55])
inputs.shape: torch.Size([250, 1, 1024]), labels.shape: torch.Size([250])
inputs.shape: torch.Size([250, 1, 1024]), labels.shape: torch.Size([250])
inputs.shape: torch.Size([250, 1, 1024]), labels.shape: torch.Size([250])
inputs.shape: torch.Size([72, 1, 1024]), labels.shape: torch.Size([72])
inputs.shape: torch.Size([250, 1, 1024]), labels.shape: torch.Size([250])
inputs.shape: torch.Size([60, 1, 1024]), labels.shape: torch.Size([60])
inputs.shape: torch.Size([250, 1, 1024]), labels.shape: torch.Size([250])
inputs.shape: torch.Size([250, 1, 1024]), labels.shape: torch.Size([250])
inputs.shape: torch.Size([250, 1, 1024]), labels.shape: torch.Size([250])
inputs.shape: torch.Size([182, 1, 1024]), la

In [117]:
len(lstm_emotion_waveform_multimodal_val)

48

In [118]:
lstm_emotion_waveform_multimodal_val  = np.array([p.cpu() for p in lstm_emotion_waveform_multimodal_val])

  lstm_emotion_waveform_multimodal_val  = np.array([p.cpu() for p in lstm_emotion_waveform_multimodal_val])
  lstm_emotion_waveform_multimodal_val  = np.array([p.cpu() for p in lstm_emotion_waveform_multimodal_val])


In [119]:
lstm_emotion_waveform_multimodal_val[0].shape

torch.Size([250, 128])

In [120]:
np.save('../multimodal/lstm_emotion_waveform_av_val.npy',lstm_emotion_waveform_multimodal_val)