In [1]:
import os

import argparse
from collections import defaultdict
import transformers
import numpy as np
import torch
import torch.nn as nn
from pytorch_lightning import LightningModule
from torch import optim
# from torchmetrics import F1
from transformers import ViTModel
import cv2
from tqdm import tqdm
import matplotlib.pyplot as plt
from pathlib import Path
import h5py
import seaborn as sns
from torch.utils.data import DataLoader, Dataset
import math
import json

In [2]:
import torch
from transformers import ViTImageProcessor
import argparse
from collections import defaultdict

import numpy as np
import torch
import torch.nn as nn
from torch import optim
# from torchmetrics import F1
from transformers import ViTModel

from pytorch_lightning import LightningModule
from tqdm import tqdm
import matplotlib.pyplot as plt
import cv2
import seaborn as sns
import numpy as np
from moviepy.editor import VideoFileClip, concatenate_videoclips

# 데이터

In [3]:
class SummaryDataset(Dataset):
    """Chunked dataset pairing pre-extracted video features with highlight labels.

    Every ``.mp4`` in ``video_dir`` is matched to the annotator JSON files found
    (recursively) under ``directory``. Each video is split into chunks of at most
    ``max_seq_len`` time steps; one dataset item is one chunk.

    Note: this notebook later defines a second class with the same name for the
    audio modality, which shadows this one from that cell onwards.

    Args:
        video_dir: Directory that contains the ``.mp4`` files.
        directory: Root directory that is searched recursively for label JSON files.
        video_features_file: Path of the HDF5 file holding one dataset per video name.
        max_seq_len: Maximum number of time steps per chunk.
    """

    def __init__(self, video_dir, directory, video_features_file, max_seq_len=250):
        self.directory = directory
        self.video_names = self.get_video_names(video_dir)
        self.video_features_file = video_features_file
        self.max_seq_len = max_seq_len

        # Because we can't use DDP with IterableDataset,
        # data must be pre-chunked to combat OOM.
        self.label_files = self.prefetch_label_files()
        self.data_size, self.index_to_chunk, self.labels = self.prefetch_and_index()

    def get_video_names(self, mp4_dir):
        """Return the extension-less names of all ``.mp4`` files in ``mp4_dir``.

        The order is whatever ``os.listdir`` yields; it is deliberately not
        re-ordered because other cells rely on iterating videos in this order.
        """
        return [
            os.path.splitext(filename)[0]
            for filename in os.listdir(mp4_dir)
            if filename.endswith('.mp4')
        ]

    def prefetch_label_files(self):
        """Map each known video name to the list of its annotator JSON paths."""
        name_set = set(self.video_names)
        label_files = defaultdict(list)

        for label_file in Path(self.directory).glob("**/*.json"):
            # Example: [KBS]kim370_<title>_18567498.json
            # Dropping the trailing "_<annotator id>" leaves the video name.
            # A stem without "_" is kept unchanged.
            video_name = label_file.stem.rsplit("_", 1)[0]

            if video_name in name_set:
                label_files[video_name].append(label_file)

        return label_files

    def prefetch_and_index(self):
        """Load all labels and build the flat item-index -> (video, chunk) table.

        Returns:
            ``(data_size, index_to_chunk, all_labels)`` where ``all_labels`` maps a
            video name to an array of shape ``(annotators, time_steps)``.
        """
        index = 0
        index_to_chunk = {}
        all_labels = {}

        for video_name in self.video_names:
            labels = self.extract_label(video_name)
            if len(labels) == 0:
                # No annotation file matched this video, so there is nothing to
                # index. Previously this crashed with IndexError on labels[0].
                continue

            all_labels[video_name] = labels

            chunk_count = math.ceil(len(labels[0]) / self.max_seq_len)
            for chunk_index in range(chunk_count):
                index_to_chunk[index + chunk_index] = (video_name, chunk_index)

            index += chunk_count

        return index, index_to_chunk, all_labels

    def __len__(self):
        return self.data_size

    @staticmethod
    def _aggregate_labels(annotator_labels):
        """Collapse ``(annotators, time_steps)`` labels into one binary vector.

        A time step is positive when at least one annotator marked it (sum over
        annotators, capped at 1) - a union, not a majority vote. The annotator
        axis is always reduced explicitly, so a video with a single annotator
        yields a vector too instead of a scalar.
        """
        labels = torch.from_numpy(np.asarray(annotator_labels))
        return labels.sum(dim=0).clamp(max=1)

    def __getitem__(self, index):
        """Return ``(video_name, video_features, labels)`` for one chunk."""
        video_name, chunk_index = self.index_to_chunk[index]
        start = chunk_index * self.max_seq_len
        end = start + self.max_seq_len

        labels = self._aggregate_labels(self.labels[video_name][:, start:end])

        with h5py.File(self.video_features_file, "r") as rf:
            # NOTE(review): nothing here checks that the feature slice has the
            # same number of rows as the label slice - confirm upstream.
            video_features = rf[video_name][start:end]

        return video_name, video_features, labels

    def extract_label(self, video_name):
        """Build one 0/1 label row per annotator file of ``video_name``.

        Returns:
            Array of shape ``(annotators, ceil(metadata["length"]))``; an empty
            array when the video has no label files.
        """
        labels = []

        for label_file in self.label_files[video_name]:
            # JSON is UTF-8 by specification; do not depend on the locale default.
            with open(label_file, "r", encoding="utf-8") as rf:
                data = json.load(rf)

            video_length = math.ceil(data["metadata"]["length"])
            annotator_label = np.zeros((video_length,))

            for timeline in data["timelines"]:
                # Inclusive [start, end] range, clipped to the valid index range
                # so out-of-range or negative values cannot wrap around.
                first = max(timeline["start"], 0)
                stop = max(min(timeline["end"] + 1, video_length), 0)
                annotator_label[first:stop] = 1

            labels.append(annotator_label)

        return np.array(labels)

In [4]:
# Directory that contains the validation mp4 files.
mp4_dir = "/workspace/EmotionShortForm/data_AIHub/2.Validation/Video_data/VS_유튜브_01"
# video_name = '유튜브_기타_21516'
# video_path = f'{mp4_dir}/{video_name}.mp4'

# Root directory of the validation label JSON files.
val_label_path = "/workspace/EmotionShortForm/data_AIHub/2.Validation/Labeling_data/VL_youtube"

# Inputs for the video SummaryDataset, bundled the way a CLI parser would return them.
args = argparse.Namespace(
    **{
        "videos": "/workspace/EmotionShortForm/data_AIHub/2.Validation/Video_data/VS_유튜브_01",
        "data_directory": "/workspace/EmotionShortForm/data_AIHub/2.Validation/Labeling_data/VL_youtube",
        "video_features_file": "/workspace/EmotionShortForm/data_AIHub/2.Validation/Video_data/VS_유튜브_01.h5",
    }
)

# Model

In [5]:
class SummaryModel(LightningModule):
    """ViT frame scorer that also returns the pooled ViT embedding.

    ``forward`` returns ``(score, feature)``: ``score`` is the raw logit produced
    by a linear layer on the ViT pooler output and ``feature`` is that pooler
    output passed through an identity layer.

    Args:
        hidden_dim: Width of the ViT pooler output fed to the scorer.
        individual_logs: Stored on the instance; not used by the methods below.
    """

    def __init__(self, hidden_dim=768, individual_logs=None):
        super().__init__()
        self.vit = ViTModel.from_pretrained("google/vit-base-patch16-224-in21k")
        self.scorer = nn.Linear(hidden_dim, 1)
        self.feature_extractor = nn.Identity()  # Feature extraction layer

        self.loss = nn.BCEWithLogitsLoss()
        # The torchmetrics F1 import is commented out in this notebook, so no
        # metric objects are created. The attributes still exist (as None) so the
        # step/epoch hooks below do not fail with AttributeError; assign metric
        # objects here to re-enable F1 tracking.
        self.train_f1 = None
        self.val_f1 = None
        self.test_f1 = None
        self.individual_logs = individual_logs
        self.tta_logs = defaultdict(list)

    def forward(self, x):
        """Return ``(score, feature)`` for a batch of images ``x``."""
        x = self.vit(x).pooler_output
        score = self.scorer(x)
        feature = self.feature_extractor(x)  # extract features
        return score, feature

    def run_batch(self, batch, batch_idx, metric, training=False):
        """Compute the BCE-with-logits loss for one ``(name, frames, labels)`` batch.

        ``labels`` may arrive either per annotator (2-D after dropping the batch
        axis) or already merged into one vector; both are reduced to a 0/1 target.
        When ``metric`` is not None it is updated with predictions thresholded at
        0.7 and its (tp, fp, fn) counts are appended to ``self.tta_logs``.
        """
        video_name, image_features, labels = batch
        video_name = video_name[0]
        image_features = image_features.squeeze(0)
        labels = labels.squeeze(0)

        # Score - aggregated labels. Only sum when an annotator axis is present;
        # summing an already merged vector would collapse it to a scalar.
        if labels.dim() > 1:
            labels = torch.sum(labels, dim=0)
        score = labels.clamp(max=1).double()

        # forward() returns (logits, features); only the logits feed the loss.
        logits, _ = self(image_features)
        logits = logits.squeeze(1)
        loss = self.loss(logits.double(), score)

        if metric is not None:
            preds = (torch.sigmoid(logits) > 0.7).int()
            metric.update(preds, score.int())
            # NOTE(review): _get_final_stats is a private metric API - confirm it
            # exists in the installed metrics library before enabling metrics.
            tp, fp, tn, fn = metric._get_final_stats()
            self.tta_logs[video_name].append((tp.item(), fp.item(), fn.item()))
        return loss

    def training_step(self, batch, batch_idx):
        loss = self.run_batch(batch, batch_idx, self.train_f1, training=True)
        self.log("train_loss", loss)
        return loss

    def training_epoch_end(self, training_step_outputs):
        if self.train_f1 is None:
            return
        self.log("train_f1", self.train_f1.compute())
        self.train_f1.reset()

    def validation_step(self, batch, batch_idx):
        loss = self.run_batch(batch, batch_idx, self.val_f1)
        self.log("val_loss", loss)
        return loss

    def validation_epoch_end(self, validation_step_outputs):
        if self.val_f1 is None:
            return
        self.log("val_f1", self.val_f1.compute())
        self.val_f1.reset()

    def test_step(self, batch, batch_idx):
        loss = self.run_batch(batch, batch_idx, self.test_f1)
        self.log("test_loss", loss)
        return loss

    def test_epoch_end(self, outputs):
        if self.test_f1 is None:
            return
        f1 = self.test_f1.compute()
        self.log("test_f1", f1)
        tp, fp, tn, fn = self.test_f1._get_final_stats()
        print(f"\nTest f1: {f1}, TP: {tp}, FP: {fp}, TN: {tn}, fn: {fn}")
        self.test_f1.reset()

    def configure_optimizers(self):
        """Optimise every parameter with AdamW at a learning rate of 1e-4."""
        optimizer = torch.optim.AdamW(self.parameters(), lr=1e-4)
        return optimizer

'''
if __name__ == "__main__":

    parser = argparse.ArgumentParser()
    parser.add_argument("-a", "--argument", help="Example argument")
    args = parser.parse_args()
    
'''

In [6]:
from transformers import ViTImageProcessor
# Image processor for 224x224 ViT input.
# NOTE(review): this loads "google/vit-base-patch16-224" while SummaryModel loads
# "google/vit-base-patch16-224-in21k" - confirm the two are meant to differ.
# NOTE(review): confirm that `device` is an argument the processor accepts;
# `preprocessor` is not referenced by any later cell visible in this notebook.
preprocessor = ViTImageProcessor.from_pretrained(
    "google/vit-base-patch16-224", size=224, device='cuda'
)

In [7]:
# Build the ViT scorer on the GPU and switch it to inference mode.
model = SummaryModel().to('cuda')
model.eval()

In [8]:
PATH = '../vit/summary.ckpt'
# Deserialize onto the CPU first: load_state_dict copies the values into the
# model's own (GPU) parameters, so this avoids a second full copy of the weights
# on the GPU and does not depend on the device the checkpoint was saved from.
# NOTE: torch.load unpickles the file - only load checkpoints from a trusted source.
checkpoint = torch.load(PATH, map_location='cpu')
model.load_state_dict(checkpoint['state_dict'])

In [9]:
from tqdm.notebook import tqdm_notebook

d = SummaryDataset(args.videos, args.data_directory, args.video_features_file)
dl = DataLoader(d, batch_size=1)

# One entry per dataset chunk: pooled ViT embeddings and sigmoid frame scores.
vit_features = []
y_pred = []

model.eval()
with torch.no_grad():
    for video_name, inputs, labels in tqdm_notebook(dl, total=len(dl), desc='Processing dataset'):
        y_pred_list = []
        vit_features_list = []
        print(f'video_name: {video_name}, inputs.shape: {inputs.shape}, labels.shape: {labels.shape}')
        # Frames are scored one at a time (batch of 1) to keep GPU memory flat.
        for f in tqdm(inputs.squeeze(0)):
            y_p, y_f = model(f.cuda().unsqueeze(0))
            y_pred_list.append(torch.sigmoid(y_p).cpu().numpy().squeeze())
            vit_features_list.append(y_f.cpu().numpy().squeeze())
        y_pred.append(np.array(y_pred_list))
        vit_features.append(np.array(vit_features_list))

# y_pred stays a list of per-chunk arrays: chunks can have different lengths
# (the last chunk of a video is usually shorter), and np.array() on such a
# ragged list raises ValueError on NumPy >= 1.24.

In [10]:
# Convert each per-chunk array to a tensor. as_tensor returns existing tensors
# unchanged, so re-running this cell is a no-op instead of a warning-emitting copy.
vit_features = [torch.as_tensor(a) for a in vit_features]

In [11]:
# Displays every per-chunk feature tensor in full.
# NOTE(review): this can be very large output; consider showing only shapes.
vit_features

In [12]:
# Shape of the first chunk's ViT feature tensor.
vit_features[0].shape

# audio 

In [13]:
import json
import math
import librosa
from collections import defaultdict
from pathlib import Path
import torch
import numpy as np
from torch.utils.data import DataLoader, Dataset

In [14]:
class SummaryDataset(Dataset):
    """Chunked dataset pairing per-second audio MFCC features with highlight labels.

    Every ``.mp4`` in ``video_dir`` is matched to the annotator JSON files found
    (recursively) under ``directory`` and to ``<wav_dir>/<video name>.wav``. Each
    video is split into chunks of at most ``max_seq_len`` seconds; one dataset
    item is one chunk.

    Note: this definition shadows the earlier video-feature class of the same
    name defined higher up in the notebook.

    Args:
        video_dir: Directory that contains the ``.mp4`` files.
        wav_dir: Directory that contains one ``.wav`` per video name.
        directory: Root directory that is searched recursively for label JSON files.
        max_seq_len: Maximum number of seconds per chunk.
    """

    def __init__(self, video_dir, wav_dir, directory, max_seq_len=250):
        self.directory = directory
        self.video_names = self.get_video_names(video_dir)
        self.wav_dir = wav_dir
        self.max_seq_len = max_seq_len

        # Because we can't use DDP with IterableDataset,
        # data must be pre-chunked to combat OOM.
        self.label_files = self.prefetch_label_files()
        self.data_size, self.index_to_chunk, self.labels = self.prefetch_and_index()

    def get_video_names(self, mp4_dir):
        """Return the extension-less names of all ``.mp4`` files in ``mp4_dir``.

        The order is whatever ``os.listdir`` yields; it is deliberately not
        re-ordered because other cells rely on iterating videos in this order.
        """
        return [
            os.path.splitext(filename)[0]
            for filename in os.listdir(mp4_dir)
            if filename.endswith('.mp4')
        ]

    def prefetch_label_files(self):
        """Map each known video name to the list of its annotator JSON paths."""
        # video_names holds bare names (the ".mp4" suffix is already removed).
        name_set = set(self.video_names)
        label_files = defaultdict(list)

        for label_file in Path(self.directory).glob("**/*.json"):
            # Example: [KBS]kim370_<title>_18567498.json
            # Dropping the trailing "_<annotator id>" leaves the video name.
            # A stem without "_" is kept unchanged.
            video_name = label_file.stem.rsplit("_", 1)[0]

            if video_name in name_set:
                label_files[video_name].append(label_file)

        return label_files

    def prefetch_and_index(self):
        """Load all labels and build the flat item-index -> (video, chunk) table.

        Returns:
            ``(data_size, index_to_chunk, all_labels)`` where ``all_labels`` maps a
            video name to an array of shape ``(annotators, time_steps)``.
        """
        index = 0
        index_to_chunk = {}
        all_labels = {}

        for video_name in self.video_names:
            labels = self.extract_label(video_name)
            if len(labels) == 0:
                # No annotation file matched this video, so there is nothing to
                # index. Previously this crashed with IndexError on labels[0].
                continue

            all_labels[video_name] = labels

            chunk_count = math.ceil(len(labels[0]) / self.max_seq_len)
            for chunk_index in range(chunk_count):
                index_to_chunk[index + chunk_index] = (video_name, chunk_index)

            index += chunk_count

        return index, index_to_chunk, all_labels

    def __len__(self):
        return self.data_size

    @staticmethod
    def _aggregate_labels(annotator_labels):
        """Collapse ``(annotators, time_steps)`` labels into one binary vector.

        A time step is positive when at least one annotator marked it (sum over
        annotators, capped at 1) - a union, not a majority vote. The annotator
        axis is always reduced explicitly, so a video with a single annotator
        yields a vector too instead of a scalar.
        """
        labels = torch.from_numpy(np.asarray(annotator_labels))
        return labels.sum(dim=0).clamp(max=1)

    def __getitem__(self, index):
        """Return ``(video_name, mfcc_array, labels)`` for one chunk.

        ``mfcc_array`` has one row per second of audio in the chunk; each row is
        the mean over frames of a 32-coefficient MFCC of that second.
        """
        video_name, chunk_index = self.index_to_chunk[index]
        start = chunk_index * self.max_seq_len

        labels = self._aggregate_labels(
            self.labels[video_name][:, start:start + self.max_seq_len]
        )

        # Read only this chunk's window (offset/duration are in seconds) at the
        # file's native sampling rate, instead of decoding the whole file for
        # every chunk and slicing afterwards.
        audio_data, sr = librosa.load(
            f"{self.wav_dir}/{video_name}.wav",
            sr=None,
            offset=float(start),
            duration=float(self.max_seq_len),
        )

        # Extract one mean MFCC vector per second of audio.
        sec = 1
        mfcc_list = []
        for i in range(0, len(audio_data), sec * sr):
            audio_segment = audio_data[i : i + sec * sr]
            mfcc = librosa.feature.mfcc(y=audio_segment, sr=sr, n_mfcc=32)
            mfcc_list.append(np.mean(mfcc, axis=1))

        # Stack the per-second vectors into a (seconds, 32) array.
        # NOTE(review): nothing here checks that the number of audio seconds
        # equals the number of label steps in the chunk - confirm upstream.
        mfcc_array = np.vstack(mfcc_list)

        return video_name, mfcc_array, labels

    def extract_label(self, video_name):
        """Build one 0/1 label row per annotator file of ``video_name``.

        Returns:
            Array of shape ``(annotators, ceil(metadata["length"]))``; an empty
            array when the video has no label files.
        """
        labels = []

        for label_file in self.label_files[video_name]:
            # JSON is UTF-8 by specification; do not depend on the locale default.
            with open(label_file, "r", encoding="utf-8") as rf:
                data = json.load(rf)

            video_length = math.ceil(data["metadata"]["length"])
            annotator_label = np.zeros((video_length,))

            for timeline in data["timelines"]:
                # Inclusive [start, end] range, clipped to the valid index range
                # so out-of-range or negative values cannot wrap around.
                first = max(timeline["start"], 0)
                stop = max(min(timeline["end"] + 1, video_length), 0)
                annotator_label[first:stop] = 1

            labels.append(annotator_label)

        return np.array(labels)



In [15]:
# Directory that contains the mp4 files.
mp4_dir = "/workspace/EmotionShortForm/data_AIHub/2.Validation/Video_data/VS_유튜브_01"

# Directory that contains the matching wav files.
wav_dir = "/workspace/EmotionShortForm/data_AIHub/2.Validation/Audio_data/VS_유튜브_01"

# Label directories for the training and validation splits.
train_label_path, val_label_path = (
    "/workspace/EmotionShortForm/data_AIHub/1.Training/Labeling_data/TL_youtube",
    "/workspace/EmotionShortForm/data_AIHub/2.Validation/Labeling_data/VL_youtube",
)

In [16]:
# Audio dataset over the validation split, served one chunk per batch.
sd = SummaryDataset(video_dir=mp4_dir, wav_dir=wav_dir, directory=val_label_path)
dl = DataLoader(dataset=sd, batch_size=1)

## model

In [17]:
import torch
import torch.nn as nn

class LSTM(nn.Module):
    """Stacked batch-first LSTM followed by a linear head.

    ``forward`` returns three tensors: the head applied to the last time step of
    the top layer's output sequence, the head applied to the final hidden state
    of the top layer, and the raw last-time-step output itself.
    """

    def __init__(self, input_size, hidden_size, num_layers, output_size):
        super().__init__()
        self.hidden_size = hidden_size
        self.num_layers = num_layers
        self.lstm = nn.LSTM(input_size, hidden_size, num_layers, batch_first=True)
        self.fc = nn.Linear(hidden_size, output_size)

    def forward(self, x):
        # Explicit all-zero initial hidden and cell states on the input's device.
        state_shape = (self.num_layers, x.size(0), self.hidden_size)
        initial_state = (
            torch.zeros(state_shape, device=x.device),
            torch.zeros(state_shape, device=x.device),
        )
        sequence_out, (final_hidden, _final_cell) = self.lstm(x, initial_state)
        last_step = sequence_out[:, -1, :]
        return self.fc(last_step), self.fc(final_hidden[-1]), last_step

In [19]:
# load model
def _load_lstm(checkpoint_path, input_size, hidden_size, num_layers, output_size):
    """Build an LSTM with the given sizes and load its weights from ``checkpoint_path``.

    Weights are mapped to the CPU so loading does not depend on the device the
    checkpoint was saved from; the models are run on CPU tensors in this notebook.
    NOTE: torch.load unpickles the file - only load checkpoints from a trusted source.
    """
    lstm_model = LSTM(input_size, hidden_size, num_layers, output_size)
    lstm_model.load_state_dict(torch.load(checkpoint_path, map_location='cpu'))
    return lstm_model


# All three models consume 32-dimensional MFCC vectors.
emotion_model = _load_lstm(
    '../model/lstm_emotion_classification_model.pt',
    input_size=32, hidden_size=128, num_layers=3, output_size=7,
)
arousal_model = _load_lstm(
    '../model/lstm_arousal_model_best.pt',
    input_size=32, hidden_size=64, num_layers=3, output_size=1,
)
valence_model = _load_lstm(
    '../model/lstm_valence_model.pt',
    input_size=32, hidden_size=32, num_layers=4, output_size=1,
)

# Kept for backward compatibility: the original cell left these globals set to
# the valence model's hyper-parameters.
input_size = 32
hidden_size = 32
num_layers = 4
output_size = 1

In [20]:
from tqdm.notebook import tqdm_notebook

# Per-chunk last-step LSTM outputs of the emotion model, plus the chunk labels.
emotion_lstm_features = []
label = []

emotion_model.eval()
with torch.no_grad():
    for video_name, inputs, labels in tqdm_notebook(dl, total=len(dl), desc='Extracting emotion features'):
        label.append(labels)
        # Each 32-dim MFCC row becomes its own length-1 sequence.
        outputs, hidden, out = emotion_model(inputs.reshape(-1, 1, 32))
        emotion_lstm_features.append(out)


In [21]:
from tqdm.notebook import tqdm_notebook

# Per-chunk last-step LSTM outputs of the arousal model.
arousal_lstm_features = []

arousal_model.eval()
with torch.no_grad():
    for video_name, inputs, labels in tqdm_notebook(dl, total=len(dl), desc='Extracting arousal features'):
        # Each 32-dim MFCC row becomes its own length-1 sequence.
        outputs, hidden, out = arousal_model(inputs.reshape(-1, 1, 32))
        arousal_lstm_features.append(out)


In [22]:
from tqdm.notebook import tqdm_notebook

# Per-chunk last-step LSTM outputs of the valence model.
valence_lstm_features = []

valence_model.eval()
with torch.no_grad():
    for video_name, inputs, labels in tqdm_notebook(dl, total=len(dl), desc='Extracting valence features'):
        # Each 32-dim MFCC row becomes its own length-1 sequence.
        outputs, hidden, out = valence_model(inputs.reshape(-1, 1, 32))
        valence_lstm_features.append(out)


In [23]:
# Join emotion, arousal and valence features along the feature axis, chunk by chunk.
concatenate_lstm_features = [
    torch.cat(chunk_features, dim=1)
    for chunk_features in zip(emotion_lstm_features, arousal_lstm_features, valence_lstm_features)
]

In [24]:
# Number of chunks for which audio features were concatenated.
len(concatenate_lstm_features)

In [25]:
# Shape of the first chunk's concatenated audio feature tensor.
concatenate_lstm_features[0].shape # emotion + arousal + valence

## Audio LSTMClassifier, multimodal LSTMClassifier

In [26]:
class LSTMClassifier(nn.Module):
    """Stacked batch-first LSTM with a sigmoid-activated linear head.

    ``forward`` returns ``(probability, last_step)``: the sigmoid of the head
    applied to the last time step of the top layer's output, and that last-step
    output itself.
    """

    def __init__(self, input_size, hidden_size, num_layers, output_size):
        super().__init__()
        self.hidden_size = hidden_size
        self.num_layers = num_layers
        self.lstm = nn.LSTM(input_size, hidden_size, num_layers, batch_first=True)
        self.fc = nn.Linear(hidden_size, output_size)
        self.sigmoid = nn.Sigmoid()

    def forward(self, x):
        # Start from all-zero hidden and cell states on the input's device.
        state_shape = (self.num_layers, x.size(0), self.hidden_size)
        zero_state = (
            torch.zeros(state_shape, device=x.device),
            torch.zeros(state_shape, device=x.device),
        )

        # Run the LSTM and keep only the top layer's output at the last time step.
        sequence_out, _ = self.lstm(x, zero_state)
        last_step = sequence_out[:, -1, :]

        # Map to the output size and squash into (0, 1).
        return self.sigmoid(self.fc(last_step)), last_step

In [63]:
# 224 = 128 + 64 + 32, the hidden sizes of the emotion, arousal and valence
# LSTMs whose last-step outputs are concatenated.
input_size = 224
hidden_size = 128
num_layers = 3
output_size = 1
audio_concat_model = LSTMClassifier(input_size, hidden_size, num_layers, output_size)
# Map weights to the CPU so loading does not depend on the device the checkpoint
# was saved from; the model is run on CPU tensors in this notebook.
# NOTE: torch.load unpickles the file - only load checkpoints from a trusted source.
audio_concat_model.load_state_dict(torch.load('concatenate_lstm_model.pt', map_location='cpu'))

In [57]:
# Per-chunk last-step outputs of the audio fusion classifier.
audio_concat_features = []

audio_concat_model.eval()
# Take the feature width from the model itself. The global `input_size` is
# rebound by other cells (e.g. to 896 for the multimodal model), so reading it
# here makes this cell fail when it is re-run after those cells.
audio_input_size = audio_concat_model.lstm.input_size

with torch.no_grad():
    for inputs in tqdm_notebook(concatenate_lstm_features, total=len(concatenate_lstm_features), desc='Extracting audio features'):
        # Each feature row becomes its own length-1 sequence.
        outputs, out = audio_concat_model(inputs.reshape(-1, 1, audio_input_size))
        audio_concat_features.append(out)

In [58]:
# Number of chunks and the shape of the first chunk's audio fusion features.
len(audio_concat_features), audio_concat_features[0].shape

## Feature concatenate w/o lstm
+ audio lstm 없이! -> model input size를 바꿔줘야 해서 나중에 비교!

In [30]:
# final_concat_lstm_feature  =  torch.cat((concatenate_lstm_feature, video_features_torch), dim=1)

In [31]:
# final_concat_lstm_feature.shape

## Feature concatenate

In [32]:
# Join ViT and audio features along the feature axis, chunk by chunk.
final_concat_lstm_features = [
    torch.cat(modalities, dim=1)
    for modalities in zip(vit_features, audio_concat_features)
]
    

In [33]:
# Number of chunks and the shape of the first chunk's fused (video + audio) features.
len(final_concat_lstm_features), final_concat_lstm_features[0].shape

## Multimodal Classification

In [72]:
# 896 = 768 + 128: the ViT scorer's hidden_dim plus the audio fusion LSTM's hidden size.
input_size = 896
hidden_size = 128
num_layers = 3
output_size = 1
multimodal_model = LSTMClassifier(input_size, hidden_size, num_layers, output_size)
# Map weights to the CPU so loading does not depend on the device the checkpoint
# was saved from; the model is run on CPU tensors in this notebook.
# NOTE: torch.load unpickles the file - only load checkpoints from a trusted source.
multimodal_model.load_state_dict(torch.load('multimodal_model.pt', map_location='cpu'))


In [73]:
# Print every weight and bias tensor of the multimodal classifier.
for name, param in multimodal_model.named_parameters():
    for keyword in ('weight', 'bias'):
        if keyword in name:
            print(f'Layer: {name} | Size: {param.size()} | Values: {param}')


In [74]:
# Per-chunk sigmoid scores of the multimodal classifier.
y_pred = []

multimodal_model.eval()
# Take the feature width from the model itself instead of the global
# `input_size`, which other cells rebind to different values.
multimodal_input_size = multimodal_model.lstm.input_size

with torch.no_grad():
    for inputs in tqdm_notebook(final_concat_lstm_features, total=len(final_concat_lstm_features), desc='Evaluation'):
        # Each fused feature row becomes its own length-1 sequence.
        outputs, out = multimodal_model(inputs.reshape(-1, 1, multimodal_input_size))
        y_pred.append(outputs)

In [75]:
# Number of chunks, and the first chunk's prediction shape before/after squeezing.
len(y_pred),y_pred[0].shape,y_pred[0].squeeze().shape

In [38]:
# Number of chunks, and the first chunk's label shape before/after squeezing.
len(label), label[0].shape, label[0].squeeze().shape

In [39]:
# Flatten per-chunk predictions into one list of Python floats.
y_pred_flat = [score for chunk in y_pred for score in chunk.reshape(-1).tolist()]

# Flatten per-chunk labels the same way. reshape(-1) always yields a list, even
# for a chunk with a single time step (squeeze().tolist() returns a bare float
# there, which broke the later concatenation), and this avoids the quadratic
# sum(list_of_lists, []). as_tensor also accepts plain numbers, so re-running
# the cell on the already flattened list gives the same result.
label = [
    value
    for chunk in label
    for value in torch.as_tensor(chunk).reshape(-1).tolist()
]

In [40]:
# The two lengths must match for the metric cells below to work.
len(y_pred_flat), len(label)

In [41]:
# Convert the flattened scores and labels to NumPy arrays for scikit-learn.
# Note that y_pred is rebound here from the per-chunk list to a flat array.
y_pred = np.array(y_pred_flat)
y_true = np.array(label)

In [42]:
# Show the flat score and label arrays.
y_pred, y_true

In [62]:
from sklearn.metrics import f1_score, accuracy_score, precision_score, recall_score

# Metrics at a single decision threshold.
THRES = 0.4

# Binarize the scores once and reuse the result for every metric.
y_pred_binary = np.where(y_pred > THRES, 1, 0)

f1 = f1_score(y_true, y_pred_binary, average='binary')
acc = accuracy_score(y_true, y_pred_binary)
prec = precision_score(y_true, y_pred_binary)
rec = recall_score(y_true, y_pred_binary)

print("F1 Score: ", f1)
print("Accuracy: ", acc)
print("Precision: ", prec)
print("Recall: ", rec)

In [68]:
import matplotlib.pyplot as plt
import seaborn as sns

# Thresholds 0.00, 0.05, ..., 1.00, computed once and shared by the sweep and the plot.
sweep_thresholds = np.arange(0, 1.05, 0.05)

# One score per threshold.
f1_scores = []
acc_scores = []
prec_scores = []
rec_scores = []

# A dedicated loop variable is used (and scores are appended directly) so the
# sweep does not overwrite the THRES / f1 / acc / prec / rec globals set by the
# single-threshold cell.
for sweep_threshold in sweep_thresholds:
    y_pred_at_threshold = np.where(y_pred > sweep_threshold, 1, 0)
    f1_scores.append(f1_score(y_true, y_pred_at_threshold, average='binary'))
    acc_scores.append(accuracy_score(y_true, y_pred_at_threshold))
    prec_scores.append(precision_score(y_true, y_pred_at_threshold))
    rec_scores.append(recall_score(y_true, y_pred_at_threshold))

# plot results
sns.set_style("whitegrid")
plt.plot(sweep_thresholds, f1_scores, label='F1 Score')
plt.plot(sweep_thresholds, acc_scores, label='Accuracy')
plt.plot(sweep_thresholds, prec_scores, label='Precision')
plt.plot(sweep_thresholds, rec_scores, label='Recall')
plt.xlabel('Threshold')
plt.ylabel('Score')
plt.legend()
plt.show()


In [69]:
import matplotlib.pyplot as plt
from sklearn.metrics import roc_curve, auc

# ROC points over all score thresholds, and the area under that curve.
fpr, tpr, thresholds = roc_curve(y_true, y_pred)
roc_auc = auc(fpr, tpr)

# Draw the curve against the chance diagonal.
fig, ax = plt.subplots()
ax.plot(fpr, tpr, label='ROC curve (area = %0.2f)' % roc_auc)
ax.plot([0, 1], [0, 1], 'k--')
ax.set_xlim([0.0, 1.0])
ax.set_ylim([0.0, 1.05])
ax.set_xlabel('False Positive Rate')
ax.set_ylabel('True Positive Rate')
ax.set_title('Receiver Operating Characteristic')
ax.legend(loc="lower right")
plt.show()


In [76]:
import matplotlib.pyplot as plt
# Plot the scores held in `outputs`.
# NOTE(review): `outputs` is not defined in this cell; it appears to be the
# variable left over from the last iteration of an earlier loop, i.e. the scores
# of a single chunk only - confirm which chunk/video this is meant to show.
plt.plot(outputs.detach().numpy())
plt.title("Output")

In [77]:
# Distribution of the scores held in `outputs`.
# NOTE(review): `outputs` appears to be a leftover loop variable covering a
# single chunk - confirm.
sns.displot(outputs.detach().numpy())

In [80]:
THRESHOLD = 0.5

# Every feature row in this notebook covers one second: the audio dataset cuts
# the waveform into 1-second MFCC segments and indexes labels in the same unit.
# The previous value of 2 mapped score index i to second 2*i.
SAMPLE_EVERY_SEC = 1

# NOTE(review): `outputs` is the variable left over from the last loop iteration,
# so these scores cover one chunk only and the printed times are relative to the
# start of that chunk - confirm it is the intended video/chunk.
selected_secs = [
    i * SAMPLE_EVERY_SEC
    for i, y_p in enumerate(outputs.detach().numpy().reshape(-1))
    if y_p >= THRESHOLD
]

for sec in selected_secs:
    print(sec)

# Total selected duration in seconds.
total_secs = len(selected_secs) * SAMPLE_EVERY_SEC

total_secs

In [81]:
# Source video from which the highlight clip is cut.
# NOTE(review): hard-coded absolute path; confirm it is the same video whose
# scores are held in `outputs`.
video_path = "/workspace/EmotionShortForm/data_AIHub/2.Validation/Video_data/VS_유튜브_01/유튜브_여행_7640.mp4"

In [83]:
clip = VideoFileClip(video_path)

# Collect one sub-clip per score at or above THRESHOLD.
subclips = []

for i, y_p in enumerate(outputs.detach().numpy().reshape(-1)):
    sec = i * SAMPLE_EVERY_SEC

    # Skip windows that start past the end of the video and clamp the last
    # window, so subclip() is never asked for time outside the clip.
    if y_p >= THRESHOLD and sec < clip.duration:
        subclip = clip.subclip(sec, min(sec + SAMPLE_EVERY_SEC, clip.duration))
        subclips.append(subclip)

if not subclips:
    # concatenate_videoclips cannot build a video from an empty list; fail with
    # an explanation instead of an obscure error from inside moviepy.
    raise ValueError(
        f"No segment scored >= {THRESHOLD}; lower THRESHOLD to produce a highlight video."
    )

result = concatenate_videoclips(subclips)

# Make sure the output directory exists before writing.
Path("videos").mkdir(parents=True, exist_ok=True)
result.write_videofile("videos/유튜브_여행_7640_result.mp4")

result.ipython_display(width=640, maxduration=240)