# Synthesis S4 (random 4)

## 1. Stitch frames

In [None]:
from utils import stitch_frames
import os
import random

dataset_dir = '/mnt/data1/jiali/avsbench_data/Single-source/s4_data/visual_frames'

# Directory layout walked here: <dataset_dir>/<split>/<category>/<video>.
# Every video entry is handed to stitch_frames together with a freshly drawn
# random integer in [1, 4] for num_with_audio_and_mask.
for split in ('test', 'train', 'val'):
    base_dir = os.path.join(dataset_dir, split)

    # Drop macOS '.DS_Store' entries from the category listing.
    categories = [name for name in os.listdir(base_dir) if '.DS_Store' not in name]

    for temp_cate in categories:
        print(split, temp_cate)
        cate_dir = os.path.join(base_dir, temp_cate)

        for temp_video in os.listdir(cate_dir):
            input_video_path = os.path.join(cate_dir, temp_video)
            stitch_frames(
                base_dir,
                input_video_path,
                split,
                save_floder_name='/avsbench_synthesis_visual_random4/',
                num_with_audio_and_mask=random.randint(1, 4),
            )

## 2. Extract audio log-mel features

In [None]:
import pickle
import os
from utils import extract_log_mel_features


def load_audio_lm(audio_lm_path):
    """Load a pickled log-mel feature object and return it detached.

    Args:
        audio_lm_path: Path to a pickle file whose payload exposes
            ``.detach()`` (the original comment notes a tensor of shape
            [5, 1, 96, 64]).

    Returns:
        The result of calling ``.detach()`` on the unpickled object.
    """
    with open(audio_lm_path, 'rb') as handle:
        features = pickle.load(handle)
    return features.detach()

dataset_dir = '/mnt/data1/jiali/avsbench_synthesis_visual_random4/Single-source/s4_data/audio_wav'

# Walk <dataset_dir>/<split>/<category>/<audio file>, extract log-mel features
# for every file, and pickle them into the mirrored '/audio_log_mel/' tree.
for split in ['test', 'train', 'val']:
    base_dir = os.path.join(dataset_dir, split)
    entries = os.listdir(base_dir)
    # Drop macOS '.DS_Store' entries from the category listing.
    categories = [entry for entry in entries if '.DS_Store' not in entry]
    for temp_cate in categories:
        print(split, temp_cate)
        audio_list = os.listdir(os.path.join(base_dir, temp_cate))
        for temp_audio in audio_list:
            input_audio_path = os.path.join(base_dir, temp_cate, temp_audio)
            log_mel_spectrogram = extract_log_mel_features(input_audio_path, n_mels=64, n_fft=2048, hop_length=512, num_frames=96)

            # Mirror the input path under '/audio_log_mel/' and swap only the
            # file extension (a plain str.replace would also rewrite any
            # '.wav' substring occurring earlier in the path).
            save_stem, _ = os.path.splitext(input_audio_path.replace('/audio_wav/', '/audio_log_mel/'))
            save_path = save_stem + '.pkl'

            # Create the directory for saving if it doesn't exist; without
            # this, open() below fails on a fresh output tree.
            os.makedirs(os.path.dirname(save_path), exist_ok=True)

            # Save the features (binary mode is required by pickle)
            with open(save_path, 'wb') as f:
                pickle.dump(log_mel_spectrogram, f)

            # Read the file back as a sanity check and report its shape.
            audio_log_mel = load_audio_lm(save_path)
            print('audio_log_mel', audio_log_mel.shape)

# Synthesis MS3 (random 4)

## 1. Stitch frames

In [None]:
import os
import random
from utils import stitch_frames_ms3
import pandas as pd

def get_split_from_video_id(csv_file, video_id):
    """Look up the ``split`` value recorded for ``video_id`` in a CSV file.

    Args:
        csv_file: Path to a CSV file with ``video_id`` and ``split`` columns.
        video_id: Identifier compared for equality against ``video_id``.

    Returns:
        The ``split`` value of the first matching row, or ``None`` when no
        row matches.
    """
    table = pd.read_csv(csv_file)
    matching_splits = table.loc[table['video_id'] == video_id, 'split']

    # Guard clause: no row carries this video id.
    if matching_splits.empty:
        return None
    return matching_splits.values[0]


base_dir = '/mnt/data1/jiali/avsbench_data/Multi-sources/ms3_data/visual_frames'
csv_file = '/mnt/data1/jiali/avsbench_data/Multi-sources/ms3_meta_data.csv'

# Read the metadata once and index it by video id, instead of re-parsing the
# CSV for every video. drop_duplicates keeps the first row per id, matching a
# "first matching row" lookup.
meta_df = pd.read_csv(csv_file).drop_duplicates(subset='video_id')
split_by_video_id = dict(zip(meta_df['video_id'], meta_df['split']))

video_list = os.listdir(base_dir)
for temp_video in video_list:
    input_video_path = os.path.join(base_dir, temp_video)
    # NOTE(review): split is None for entries that have no metadata row
    # (e.g. stray files in base_dir); it is passed through unchanged —
    # confirm stitch_frames_ms3 tolerates that.
    split = split_by_video_id.get(temp_video)
    stitch_frames_ms3(base_dir, input_video_path, split, csv_file, save_floder_name = '/avsbench_synthesis_visual_random4/',  num_with_audio_and_mask=random.randint(1, 4))

## 2. Extract audio log-mel features

In [None]:
import pickle
import os
import librosa
import numpy as np
import torch



def extract_log_mel_features(wav_path, n_mels=64, n_fft=1024, hop_length=512, num_frames=96, duration=5, num_segments=5):
    """Compute normalized log-mel features for equal-length segments of a clip.

    The audio is loaded with librosa (at most ``duration`` seconds), zero-padded
    to ``duration`` seconds if shorter, split into ``num_segments`` pieces, and
    each piece is converted to a log-mel spectrogram that is standardized
    (zero mean, unit std) and padded/truncated to ``num_frames`` time frames.

    Args:
        wav_path: Path of the audio file to load.
        n_mels: Number of mel bands.
        n_fft: FFT window size passed to librosa.
        hop_length: Hop length passed to librosa.
        num_frames: Number of time frames kept per segment.
        duration: Clip length in seconds to load / pad to.
        num_segments: Number of segments the clip is split into (default 5,
            i.e. one-second segments for the default 5-second duration).

    Returns:
        A float ``torch.Tensor`` of shape
        ``[num_segments, 1, num_frames, n_mels]``.
    """
    y, sr = librosa.load(wav_path, duration=duration)

    # Make sure the audio spans the full `duration` seconds (zero-pad the tail).
    # int() keeps the pad width valid when `duration` is a float.
    target_len = int(sr * duration)
    if len(y) < target_len:
        y = np.pad(y, (0, target_len - len(y)))

    # Split the audio into `num_segments` pieces (5 one-second pieces by default).
    y_segments = np.array_split(y, num_segments)

    log_mel_segments = []
    for segment in y_segments:
        mel_spectrogram = librosa.feature.melspectrogram(y=segment, sr=sr, n_mels=n_mels, n_fft=n_fft, hop_length=hop_length)
        log_mel = librosa.power_to_db(mel_spectrogram)

        # Standardize. A silent (or fully zero-padded) segment yields a
        # constant spectrogram whose std is 0; dividing would produce NaN/inf,
        # so such segments are only mean-centered (all zeros).
        mean = log_mel.mean()
        std = log_mel.std()
        centered = log_mel - mean
        log_mel = centered / std if std > 1e-6 else centered

        # Adjust the number of time frames: zero-pad or truncate to num_frames.
        if log_mel.shape[1] < num_frames:
            pad_width = num_frames - log_mel.shape[1]
            log_mel = np.pad(log_mel, ((0, 0), (0, pad_width)), mode='constant')
        elif log_mel.shape[1] > num_frames:
            log_mel = log_mel[:, :num_frames]

        log_mel_segments.append(log_mel)

    # Stack the segments: [num_segments, n_mels, num_frames].
    log_mel_stack = np.stack(log_mel_segments)

    # Convert to a PyTorch tensor shaped [num_segments, 1, num_frames, n_mels]
    # ([5, 1, 96, 64] with the default arguments).
    log_mel_tensor = torch.from_numpy(log_mel_stack).float().permute(0, 2, 1).unsqueeze(1)

    return log_mel_tensor

def load_audio_lm(audio_lm_path):
    """Read pickled log-mel features from disk and detach them.

    Args:
        audio_lm_path: Path to the pickle file to read; the stored object
            must provide ``.detach()`` (noted in the original as a
            [5, 1, 96, 64] tensor).

    Returns:
        The unpickled object after ``.detach()``.
    """
    with open(audio_lm_path, 'rb') as pkl_file:
        return pickle.load(pkl_file).detach()

dataset_dir = '/mnt/data1/jiali/avsbench_synthesis_visual_random4/Multi-sources/ms3_data/audio_wav'

# Walk <dataset_dir>/<split>/<audio file>, extract log-mel features for every
# file, and pickle them into the mirrored '/audio_log_mel/' tree.
for split in ['test', 'train', 'val']:
    base_dir = os.path.join(dataset_dir, split)
    audio_list = os.listdir(base_dir)
    for temp_audio in audio_list:
        input_audio_path = os.path.join(base_dir, temp_audio)
        log_mel_spectrogram = extract_log_mel_features(input_audio_path, n_mels=64, n_fft=2048, hop_length=512, num_frames=96)

        # Mirror the input path under '/audio_log_mel/' and swap only the file
        # extension (a plain str.replace would also rewrite any '.wav'
        # substring occurring earlier in the path).
        save_stem, _ = os.path.splitext(input_audio_path.replace('/audio_wav/', '/audio_log_mel/'))
        save_path = save_stem + '.pkl'

        # Create the directory for saving if it doesn't exist; without this,
        # open() below fails on a fresh output tree.
        os.makedirs(os.path.dirname(save_path), exist_ok=True)

        # Save the features (binary mode is required by pickle)
        with open(save_path, 'wb') as f:
            pickle.dump(log_mel_spectrogram, f)

        # Read the file back as a sanity check and report its shape.
        audio_log_mel = load_audio_lm(save_path)
        print('audio_log_mel', audio_log_mel.shape)

In [4]:
from utils import extract_log_mel_features


# Extract log-mel features for one standalone clip and pickle them next to
# the source file (same path with '.wav' replaced by '.pkl').
input_audio_path = '/home/jiali/AVSBench/synthesis_data/slience.wav'
save_path = input_audio_path.replace('.wav', '.pkl')

feature_kwargs = dict(n_mels=64, n_fft=2048, hop_length=512, num_frames=96)
log_mel_spectrogram = extract_log_mel_features(input_audio_path, **feature_kwargs)

# pickle needs a binary file handle
with open(save_path, 'wb') as f:
    pickle.dump(log_mel_spectrogram, f)

In [6]:
import pickle
import numpy as np
import matplotlib.pyplot as plt

# Load the pickled features back and report their shape and value range.
pkl_path = '/home/jiali/AVSBench/synthesis_data/slience.pkl'
with open(pkl_path, 'rb') as f:
    log_mel_tensor = pickle.load(f)

tensor_shape = log_mel_tensor.shape
value_min, value_max = log_mel_tensor.min(), log_mel_tensor.max()

print("Log Mel Tensor Shape:", tensor_shape)
print("Log Mel Tensor Values Range:", value_min, value_max)

Log Mel Tensor Shape: torch.Size([5, 1, 96, 64])
Log Mel Tensor Values Range: tensor(0.) tensor(0.)
