In [2]:
import os

In [3]:
import json
import math
import librosa
from collections import defaultdict
from pathlib import Path
import torch
import numpy as np
from torch.utils.data import DataLoader, Dataset

In [4]:
class SummaryDataset(Dataset):
    def __init__(self, video_dir, wav_dir, directory, max_seq_len=250):
        self.directory = directory
        self.video_names = self.get_video_names(video_dir)
        self.wav_dir = wav_dir
        self.max_seq_len = max_seq_len

        # Because we can't use DDP with IterableDataset,
        # data must be pre-chunked to combat OOM.
        self.label_files = self.prefetch_label_files()
        self.data_size, self.index_to_chunk, self.labels = self.prefetch_and_index()

    def get_video_names(self, mp4_dir):
        # mp4 파일이 있는 디렉토리 경로
        # mp4_dir = "/workspace/EmotionShortForm/aihub/2.Validation/Video_data/VS_유튜브_04"

        video_names = []
        for filename in os.listdir(mp4_dir):
            if filename.endswith('.mp4'):
                name = os.path.splitext(filename)[0]
                video_names.append(name)
        return video_names

    def prefetch_label_files(self):
        # video_names 는 이름만 들어있을것 .mp4 제거
        name_set = set(self.video_names)

        label_files = defaultdict(list)

        for label_file in Path(self.directory).glob(f"**/*.json"):

            file_name = label_file.stem

            # 예시: [KBS]kim370_대법원 업무 과부하…상고 법원이 대안_18567498.json
            # annotator id 제거하면 비디오 이름 추출.
            # 파일 이름 reverse ([::-1]) 후 "_" 찾음.
            annotator_id_index = len(file_name) - file_name[::-1].find("_") - 1
            video_name = file_name[:annotator_id_index]

            if video_name in name_set:
                label_files[video_name].append(label_file)

        return label_files

    def prefetch_and_index(self):
        index = 0
        index_to_chunk = {}
        all_labels = {}

        for video_name in self.video_names:

            labels = self.extract_label(video_name)

            all_labels[video_name] = labels

            chunk_count = math.ceil(len(labels[0]) / self.max_seq_len)
            for chunk_index in range(0, chunk_count):
                index_to_chunk[index + chunk_index] = (video_name, chunk_index)

            index += chunk_count

        return index, index_to_chunk, all_labels

    def __len__(self):
        return self.data_size

    def __getitem__(self, index):        
        video_name, chunk_index = self.index_to_chunk[index]
        start = chunk_index * self.max_seq_len
        end = start + self.max_seq_len
        
        labels = self.labels[video_name][:,start:end]

        # audio_data: 음성 데이터, sr: sampling rate, max_seq_len: chunk 단위 길이
        audio_data, sr = librosa.load(f"{self.wav_dir}/{video_name}.wav", sr=None)
        # print(f'audio_data 길이: {len(audio_data)/sr}')
        # print(f'sr: {sr}')
        
        audio_data = audio_data[start*sr:end*sr]
        # print(f'audio_data 길이: {len(audio_data)/sr}')
        
        max_seq_len = labels.shape[-1]

        
        # 1초 단위로 MFCC 추출하여 리스트에 추가
        sec = 1
        mfcc_list = []
        for i in range(0, len(audio_data), sec*sr):
            audio_segment = audio_data[i : i + sec*sr]
                
            mfcc = librosa.feature.mfcc(y=audio_segment, sr=sr, n_mfcc=32).T
            mfcc_mean = np.mean(mfcc, axis=0)
            mfcc_list.append(mfcc_mean)

        # 리스트를 배열로 변환
        mfcc_array = np.vstack(mfcc_list)
        
        # Convert labels to 1D array
        labels = torch.from_numpy(labels)
        # majority voting
        labels = labels.squeeze(0)
        labels = torch.sum(labels, dim=0) 
        labels = torch.min(
            labels,
            torch.ones(
                labels.shape[0],
            ).to(labels.device),
        )
        return video_name, mfcc_array, labels

    def extract_label(self, video_name):

        label_files = self.label_files[video_name]
        labels = []

        for label_file in label_files:

            with open(label_file, "r") as rf:
                data = json.load(rf)

            metadata = data["metadata"]
            video_length = math.ceil(metadata["length"])
            annotator_label = np.zeros((video_length,))

            for timeline in data["timelines"]:
                for time_index in range(timeline["start"], timeline["end"] + 1):
                    # annotator_label[time_index] += 1
                    if time_index < video_length:
                        annotator_label[time_index] = 1

            labels.append(annotator_label)

        labels = np.array(labels)
        return labels


In [6]:
# 라벨링 디렉토리 경로
train_label_path = "/workspace/EmotionShortForm/data_AIHub/1.Training/Labeling_data/TL_youtube"
val_label_path = "/workspace/EmotionShortForm/data_AIHub/2.Validation/Labeling_data/VL_youtube"

# mp4 파일이 있는 디렉토리 경로
mp4_dir = "/workspace/EmotionShortForm/data_AIHub/2.Validation/Video_data/VS_유튜브_04"

# wav 파일이 있는 디렉토리 경로
wav_dir = "/workspace/EmotionShortForm/data_AIHub/2.Validation/Audio_data/VS_유튜브_04"

In [7]:
sd = SummaryDataset(mp4_dir, wav_dir, val_label_path)

In [8]:
sd[0][1].shape

(250, 32)

In [9]:
dl = DataLoader(sd,batch_size=1)

# Model Load
## Load valence model

In [10]:
import torch
import torch.nn as nn

class LSTM(nn.Module):
    def __init__(self, input_size, hidden_size, num_layers, output_size):
        super(LSTM, self).__init__()
        self.hidden_size = hidden_size
        self.num_layers = num_layers
        self.lstm = nn.LSTM(input_size, hidden_size, num_layers, batch_first=True)
        self.fc = nn.Linear(hidden_size, output_size)
    
    def forward(self, x):
        h0 = torch.zeros(self.num_layers, x.size(0), self.hidden_size).to(x.device)
        c0 = torch.zeros(self.num_layers, x.size(0), self.hidden_size).to(x.device)
        out, (hidden_state, cell_state) = self.lstm(x, (h0, c0))
        output = self.fc(out[:,-1,:])
        hidden = self.fc(hidden_state[-1])
        return output, hidden,  out[:,-1,:]

In [11]:
input_size = 32
hidden_size = 32 # 32
num_layers = 4 # 2
output_size = 1
learning_rate = 0.001
num_epochs = 100 # 200

In [12]:
# load model
model_valence = LSTM(input_size, hidden_size, num_layers, output_size)
model_valence.load_state_dict(torch.load('../model/lstm_valence_model.pt'))
criterion = nn.MSELoss()
optimizer = torch.optim.Adam(model_valence.parameters(), lr=learning_rate)


In [14]:
from tqdm.notebook import tqdm_notebook

valence_lstm_features = []
valence_lstm_labels = []

for video_name, inputs, labels in tqdm_notebook(dl):
    with torch.no_grad():
        model_valence.eval()
        outputs, hidden, out = model_valence(inputs.reshape(-1,1,32))
        valence_lstm_features.append(out)
        valence_lstm_labels.append(labels) 
        print(out.shape) # 32 size

  0%|          | 0/556 [00:00<?, ?it/s]

torch.Size([250, 32])
torch.Size([250, 32])
torch.Size([202, 32])
torch.Size([250, 32])
torch.Size([136, 32])
torch.Size([250, 32])
torch.Size([250, 32])
torch.Size([125, 32])
torch.Size([250, 32])
torch.Size([250, 32])
torch.Size([250, 32])
torch.Size([186, 32])
torch.Size([250, 32])
torch.Size([250, 32])
torch.Size([250, 32])
torch.Size([250, 32])
torch.Size([230, 32])
torch.Size([250, 32])
torch.Size([250, 32])
torch.Size([119, 32])
torch.Size([250, 32])
torch.Size([250, 32])
torch.Size([134, 32])
torch.Size([250, 32])
torch.Size([250, 32])
torch.Size([250, 32])
torch.Size([250, 32])
torch.Size([250, 32])
torch.Size([154, 32])
torch.Size([250, 32])
torch.Size([104, 32])
torch.Size([250, 32])
torch.Size([53, 32])
torch.Size([250, 32])
torch.Size([205, 32])
torch.Size([250, 32])
torch.Size([250, 32])
torch.Size([250, 32])
torch.Size([250, 32])
torch.Size([7, 32])
torch.Size([250, 32])
torch.Size([250, 32])
torch.Size([162, 32])
torch.Size([250, 32])
torch.Size([250, 32])
torch.Size([2

torch.Size([250, 32])
torch.Size([250, 32])
torch.Size([250, 32])
torch.Size([250, 32])
torch.Size([250, 32])
torch.Size([250, 32])
torch.Size([161, 32])
torch.Size([250, 32])
torch.Size([250, 32])
torch.Size([250, 32])
torch.Size([202, 32])
torch.Size([250, 32])
torch.Size([250, 32])
torch.Size([250, 32])
torch.Size([250, 32])
torch.Size([239, 32])
torch.Size([250, 32])
torch.Size([250, 32])
torch.Size([108, 32])
torch.Size([250, 32])
torch.Size([250, 32])
torch.Size([250, 32])
torch.Size([200, 32])
torch.Size([250, 32])
torch.Size([250, 32])
torch.Size([220, 32])
torch.Size([250, 32])
torch.Size([250, 32])
torch.Size([232, 32])
torch.Size([250, 32])
torch.Size([250, 32])
torch.Size([175, 32])
torch.Size([250, 32])
torch.Size([234, 32])
torch.Size([250, 32])
torch.Size([250, 32])
torch.Size([131, 32])
torch.Size([250, 32])
torch.Size([243, 32])
torch.Size([250, 32])
torch.Size([250, 32])
torch.Size([250, 32])
torch.Size([227, 32])
torch.Size([250, 32])
torch.Size([250, 32])
torch.Size

In [19]:
valence_lstm_features[:5]

[tensor([[-0.3545, -0.2704,  0.0326, -0.6426,  0.4787, -0.3702, -0.5512, -0.5260,
           0.6424, -0.5656,  0.2070, -0.6843, -0.5953,  0.4547,  0.6654, -0.6220,
          -0.6226, -0.6094,  0.6114, -0.6559, -0.5807,  0.6082, -0.6511,  0.5501,
          -0.5300,  0.6425, -0.4526, -0.5558, -0.5971,  0.5971,  0.5117, -0.6571]]),
 tensor([[-0.3546, -0.2705,  0.0326, -0.6426,  0.4788, -0.3703, -0.5512, -0.5260,
           0.6424, -0.5656,  0.2070, -0.6843, -0.5954,  0.4548,  0.6654, -0.6221,
          -0.6226, -0.6094,  0.6114, -0.6560, -0.5808,  0.6083, -0.6511,  0.5501,
          -0.5301,  0.6426, -0.4527, -0.5558, -0.5972,  0.5971,  0.5117, -0.6572]]),
 tensor([[-0.3545, -0.2704,  0.0326, -0.6426,  0.4787, -0.3702, -0.5512, -0.5260,
           0.6424, -0.5656,  0.2070, -0.6843, -0.5953,  0.4547,  0.6654, -0.6220,
          -0.6226, -0.6094,  0.6114, -0.6559, -0.5807,  0.6082, -0.6511,  0.5501,
          -0.5300,  0.6425, -0.4526, -0.5558, -0.5971,  0.5971,  0.5117, -0.6571]]),
 tensor

##  Save valence_lstm_features.npy

In [15]:
data_array = np.array([t.numpy() for t in valence_lstm_features])

# 저장 경로와 파일 이름 지정
file_path = 'valence_lstm_features.npy'

# 넘파이 배열을 npy 파일로 저장
np.save(file_path, data_array)

In [17]:
valence_lstm_labels[0].shape

torch.Size([1, 250])

In [23]:
label_list =[t.squeeze(0).numpy() for t in valence_lstm_labels]
label_array = np.array(label_list)

# 저장 경로와 파일 이름 지정
file_path = 'labels.npy'

# 넘파이 배열을 npy 파일로 저장
np.save(file_path, label_array)

In [25]:
d = np.load(file_path,allow_pickle=True)

In [26]:
d

array([array([0., 0., 0., 0., 0., 0., 0., 0., 0., 1., 1., 1., 1., 1., 1., 1., 1.,
       1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1.,
       1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1.,
       0., 1., 1., 1., 1., 1., 1., 1., 1., 0., 0., 0., 0., 0., 1., 1., 1.,
       1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 0., 0., 0., 0., 0., 0.,
       0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0.,
       0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0.,
       0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0.,
       0., 0., 0., 0., 0., 0., 1., 1., 1., 1., 1., 0., 0., 0., 0., 0., 0.,
       0., 0., 0., 0., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1.,
       1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 0., 0., 0., 0.,
       0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0.,
       0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0.,
       0., 0., 0.,

In [27]:
len(d)

556

In [28]:
d[0].shape

(250,)