# Synthesis test dataset

In [4]:
import os
from pydub import AudioSegment
import numpy as np
import io
import random
import shutil
import pickle
import librosa
import torch

## 1. add audio noise

In [None]:
def merge_audio_files(file1_path, file2_path, output_path):
    """
    Overlay two WAV files on top of each other and write the mix as WAV.

    Both inputs are first trimmed to the length of the shorter one, so the
    exported file is exactly as long as the shorter input.

    :param file1_path: Path to the first (base) WAV file
    :param file2_path: Path to the WAV file that is overlaid on the first one
    :param output_path: Path the mixed WAV file is written to
    """
    # Decode both inputs (first file, then second file)
    first, second = (AudioSegment.from_wav(p) for p in (file1_path, file2_path))

    # Common length shared by both clips (pydub measures and slices in ms)
    shortest = min(len(first), len(second))

    # Trim, mix, and write the result in one go
    mixed = first[:shortest].overlay(second[:shortest])
    mixed.export(output_path, format="wav")

## 2. misalign audio-visual pairs

In [2]:
def copy_and_rename_audio(src_path, dst_path):
    """
    Copy a WAV file to a new location/name by decoding and re-exporting it.

    The destination directory is created when it does not exist yet, which
    matches what copy_and_rename_pkl does for .pkl files.

    :param src_path: Path to the source WAV file
    :param dst_path: Path (including the new filename) of the exported WAV file
    """
    # Load the source first so a bad source does not leave an empty directory
    audio = AudioSegment.from_wav(src_path)

    # Only path-like destinations have a parent directory to create; an empty
    # dirname means "current directory" and must not be passed to makedirs.
    if isinstance(dst_path, (str, bytes, os.PathLike)):
        dst_dir = os.path.dirname(dst_path)
        if dst_dir:
            os.makedirs(dst_dir, exist_ok=True)

    # Export the audio under the new filename
    audio.export(dst_path, format="wav")

def copy_and_rename_pkl(src_path, dst_path):
    """
    Copy a .pkl file to a new location/name, creating the target directory.

    This is a best-effort helper: failures are reported on stdout instead of
    being raised, so a batch loop keeps going after a bad file.

    :param src_path: Path to the source file
    :param dst_path: Path (including the new filename) of the copy
    """
    try:
        # Ensure the destination directory exists. A bare filename has an
        # empty dirname (current directory), which os.makedirs would reject.
        dst_dir = os.path.dirname(dst_path)
        if dst_dir:
            os.makedirs(dst_dir, exist_ok=True)

        # Copy the file contents together with its metadata
        shutil.copy2(src_path, dst_path)
        print(f"Successfully copied and renamed: {src_path} -> {dst_path}")
    except OSError as e:
        print(f"Unable to copy file. {e}")
    except Exception as e:
        # Report anything else (e.g. a wrong argument type) without needing
        # the sys module; KeyboardInterrupt/SystemExit still propagate.
        print(f"Unexpected error: {type(e).__name__}: {e}")

## 3. extract audio log-mel feature

In [10]:
def extract_log_mel_features(wav_path, n_mels=64, n_fft=1024, hop_length=512, num_frames=96, duration=5, num_segments=5):
    """
    Compute per-segment, standardized log-mel spectrograms for an audio file.

    The clip is loaded (at most ``duration`` seconds), zero-padded up to
    ``duration`` seconds, and split into ``num_segments`` nearly equal chunks.
    Each chunk is turned into a log-mel spectrogram, standardized to zero mean
    and unit variance, and then zero-padded or cropped along time to
    ``num_frames`` frames.

    :param wav_path: Path to the audio file
    :param n_mels: Number of mel bands
    :param n_fft: FFT window size
    :param hop_length: Hop length between frames, in samples
    :param num_frames: Number of time frames kept per segment
    :param duration: Clip length in seconds that is loaded / padded to
    :param num_segments: Number of chunks the clip is split into
    :return: Float tensor of shape (num_segments, 1, num_frames, n_mels)
    """
    y, sr = librosa.load(wav_path, duration=duration)

    # int() keeps the pad width a valid integer when duration is fractional
    target_len = int(sr * duration)
    if len(y) < target_len:
        y = np.pad(y, (0, target_len - len(y)))

    log_mel_segments = []
    for segment in np.array_split(y, num_segments):
        mel_spectrogram = librosa.feature.melspectrogram(y=segment, sr=sr, n_mels=n_mels, n_fft=n_fft, hop_length=hop_length)
        log_mel = librosa.power_to_db(mel_spectrogram)

        # Standardize. A silent or fully zero-padded segment is (near) constant,
        # so its std is ~0 and dividing would fill the features with NaN/inf;
        # such segments are only mean-centered (i.e. become all zeros).
        std = log_mel.std()
        log_mel = log_mel - log_mel.mean()
        if std > 1e-6:
            log_mel = log_mel / std

        # Force a fixed number of time frames: zero-pad short, crop long
        if log_mel.shape[1] < num_frames:
            pad_width = num_frames - log_mel.shape[1]
            log_mel = np.pad(log_mel, ((0, 0), (0, pad_width)), mode='constant')
        elif log_mel.shape[1] > num_frames:
            log_mel = log_mel[:, :num_frames]

        log_mel_segments.append(log_mel)

    # (segments, n_mels, frames) -> (segments, frames, n_mels) -> add channel dim
    log_mel_stack = np.stack(log_mel_segments)
    log_mel_tensor = torch.from_numpy(log_mel_stack).float().permute(0, 2, 1).unsqueeze(1)

    return log_mel_tensor

## Process!

In [None]:
# Build the misaligned S4 (single-source) test audio: every clip is replaced
# by a randomly chosen clip from a *different* category, saved under the
# original clip's filename.
s4_base_dir = '/mnt/data1/jiali/avsbench_data/Single-source/s4_data/audio_wav'
merged_output_dir = '/mnt/data1/jiali/avsbench_test_audio/Single-source/s4_data/audio_wav_merged'
# misaligned_output_dir = '/mnt/data1/jiali/avsbench_test_audio/Single-source/s4_data/audio_log_mel_misaligned'

misaligned_output_dir = '/mnt/data1/jiali/avsbench_test_audio/Single-source/s4_data/audio_wav_misaligned'

for split in ['test']: #, 'train', 'val']:
    base_dir = os.path.join(s4_base_dir, split)
    merged_split_dir = os.path.join(merged_output_dir, split)
    misaligned_split_dir = os.path.join(misaligned_output_dir, split)
    os.makedirs(merged_split_dir, exist_ok=True)
    os.makedirs(misaligned_split_dir, exist_ok=True)

    categories = [entry for entry in os.listdir(base_dir) if '.DS_Store' not in entry]

    # List each category's .wav files once per split instead of re-reading the
    # donor directory for every single clip. The source tree is never written
    # to by this loop, so the listings stay valid.
    wavs_by_category = {
        cate: [f for f in os.listdir(os.path.join(base_dir, cate)) if f.endswith('.wav')]
        for cate in categories
    }

    for temp_cate in categories:
        print(f"Processing {split}, {temp_cate}")
        category_dir = os.path.join(base_dir, temp_cate)
        merged_category_dir = os.path.join(merged_split_dir, temp_cate)
        misaligned_category_dir = os.path.join(misaligned_split_dir, temp_cate)
        os.makedirs(merged_category_dir, exist_ok=True)
        os.makedirs(misaligned_category_dir, exist_ok=True)

        audio_list = wavs_by_category[temp_cate]
        # Only needed by the (disabled) .pkl misalignment variant below; kept
        # commented so the active wav path does not depend on audio_log_mel.
        # pkl_list = [f for f in os.listdir(category_dir.replace('audio_wav', 'audio_log_mel')) if f.endswith('.pkl')]

        # Get a list of all other categories
        other_categories = [c for c in categories if c != temp_cate]

        # # For merging
        # for audio_file in audio_list:
        #     # Select a random category different from the current one
        #     random_category = random.choice(other_categories)
        #     random_category_dir = os.path.join(base_dir, random_category)

        #     random_audio_file = random.choice([f for f in os.listdir(random_category_dir) if f.endswith('.wav')])
        #     file1_path = os.path.join(category_dir, audio_file)
        #     file2_path = os.path.join(random_category_dir, random_audio_file)
        #     output_path = os.path.join(merged_category_dir, audio_file)
        #     merge_audio_files(file1_path, file2_path, output_path)
        #     log_mel_spectrogram = extract_log_mel_features(output_path, n_mels=64, n_fft=2048, hop_length=512, num_frames=96)

        #     # Create the directory for saving if it doesn't exist
        #     save_path1 = output_path.replace('/audio_wav', '/audio_log_mel')
        #     save_path = save_path1.replace('.wav', '.pkl')
        #     # Save the features
        #     directory = os.path.dirname(save_path)
        #     os.makedirs(directory, exist_ok=True)
        #     print('save_path', save_path)
        #     with open(save_path, 'wb') as f:  # Changed 'w' to 'wb' for binary write
        #         pickle.dump(log_mel_spectrogram, f)

        # For misaligning
        # for temp_pkl in pkl_list:
        #     random_category = random.choice(other_categories)
        #     random_category_dir = os.path.join(base_dir.replace('/audio_wav', '/audio_log_mel'), random_category)
        #     random_pkl_file = random.choice([f for f in os.listdir(random_category_dir) if f.endswith('.pkl')])
        #     src_path = os.path.join(random_category_dir, random_pkl_file)
        #     dst_path = os.path.join(misaligned_category_dir, temp_pkl)
        #     copy_and_rename_pkl(src_path, dst_path)
        # print(f"Processed {temp_cate} split")

        # Misalign at the wav level: pick a donor clip from another category
        # and store it under the current clip's filename.
        for temp_wav in audio_list:
            random_category = random.choice(other_categories)
            random_category_dir = os.path.join(base_dir, random_category)
            random_wav_file = random.choice(wavs_by_category[random_category])
            src_path = os.path.join(random_category_dir, random_wav_file)
            dst_path = os.path.join(misaligned_category_dir, temp_wav)
            copy_and_rename_audio(src_path, dst_path)
        # print(f"Processed {temp_cate} split")

    print(f"Processed {split} split")
print("Audio misalignment completed!")


### Extract audio log-mel features

In [13]:
# Extract log-mel features for every misaligned S4 wav and pickle them into a
# parallel directory tree ('/audio_wav...' -> '/audio_log_mel...').
input_dir = '/mnt/data1/jiali/avsbench_test_audio/Single-source/s4_data/audio_wav_misaligned/'

for split in ['test']:
    base_dir = os.path.join(input_dir, split)

    categories = [entry for entry in os.listdir(base_dir) if '.DS_Store' not in entry]

    for temp_cate in categories:
        print(temp_cate)
        category_dir = os.path.join(base_dir, temp_cate)
        audio_list = [f for f in os.listdir(category_dir) if f.endswith('.wav')]
        if not audio_list:
            # Nothing to extract, so do not create an empty feature directory
            continue

        # All files of a category share one output directory: derive and
        # create it once instead of once per file.
        feature_dir = category_dir.replace('/audio_wav', '/audio_log_mel')
        os.makedirs(feature_dir, exist_ok=True)

        for temp_wav in audio_list:
            wav_path = os.path.join(category_dir, temp_wav)
            log_mel_spectrogram = extract_log_mel_features(wav_path, n_mels=64, n_fft=2048, hop_length=512, num_frames=96)

            # Swap only the file extension; str.replace('.wav', '.pkl') would
            # also rewrite any '.wav' that appears elsewhere in the path.
            save_path = os.path.join(feature_dir, os.path.splitext(temp_wav)[0] + '.pkl')
            # print('save_path', save_path)

            # Save the features (binary mode is required for pickle)
            with open(save_path, 'wb') as f:
                pickle.dump(log_mel_spectrogram, f)

mynah_bird_singing
dog_barking
female_singing
coyote_howling
playing_glockenspiel
male_speech
playing_acoustic_guitar
lawn_mowing
playing_ukulele
cat_meowing
helicopter
race_car
driving_buses
horse_clip-clop
playing_piano
playing_tabla
playing_violin
ambulance_siren
baby_laughter
typing_on_computer_keyboard
cap_gun_shooting
lions_roaring
chainsawing_trees


In [None]:
# Build the misaligned MS3 (multi-source) audio: within each split, every wav
# is replaced by a different, randomly chosen wav from the same split, saved
# under the original filename.
ms3_base_dir = '/mnt/data1/jiali/avsbench_data/Multi-sources/ms3_data/audio_wav'
merged_output_dir = '/mnt/data1/jiali/avsbench_test_synthesis_visual_random4/Multi-sources/ms3_data/audio_wav_merged/'
# misaligned_output_dir = '/mnt/data1/jiali/avsbench_test_synthesis_visual_random4/Multi-sources/ms3_data/audio_log_mel_misaligned'

# Set once, before the loop: assigning this inside the loop made the first
# split create its directory under the old audio_log_mel_misaligned root while
# the files themselves (and all later splits) went to audio_wav_misaligned.
misaligned_output_dir = '/mnt/data1/jiali/avsbench_test_synthesis_visual_random4/Multi-sources/ms3_data/audio_wav_misaligned'

for split in ['test', 'train', 'val']:
    base_dir = os.path.join(ms3_base_dir, split)
    merged_split_dir = os.path.join(merged_output_dir, split)
    misaligned_split_dir = os.path.join(misaligned_output_dir, split)
    os.makedirs(merged_split_dir, exist_ok=True)
    os.makedirs(misaligned_split_dir, exist_ok=True)

    audio_list = [f for f in os.listdir(base_dir) if f.endswith('.wav')]
    # pkl_list = [f for f in os.listdir(base_dir.replace('audio_wav', 'audio_log_mel')) if f.endswith('.pkl')]

    # # For merging
    # for temp_audio in audio_list:
    #     input_audio_path = os.path.join(base_dir, temp_audio)
    #     random_audio = random.choice([a for a in audio_list if a != temp_audio])
    #     random_audio_path = os.path.join(base_dir, random_audio)
    #     merged_output_path = os.path.join(merged_split_dir, f"{temp_audio}")
    #     merge_audio_files(input_audio_path, random_audio_path, merged_output_path)
    #     log_mel_spectrogram = extract_log_mel_features(merged_output_path, n_mels=64, n_fft=2048, hop_length=512, num_frames=96)

        # # Create the directory for saving if it doesn't exist
        # save_path1 = os.path.join(merged_output_dir.replace('/audio_wav', '/audio_log_mel'), split, temp_audio)
        # save_path = save_path1.replace('.wav', '.pkl')
        # directory = os.path.dirname(save_path)
        # os.makedirs(directory, exist_ok=True)
        # print('save_path', save_path)
        # # Save the features
        # with open(save_path, 'wb') as f:  # Changed 'w' to 'wb' for binary write
        #     pickle.dump(log_mel_spectrogram, f)

    # # For misaligning
    # for temp_pkl in pkl_list:
    #     input_pkl_path = os.path.join(base_dir, temp_pkl)
    #     misaligned_pkl = random.choice([p for p in pkl_list if p != temp_pkl])
    #     misaligned_pkl_path = os.path.join(base_dir.replace('/audio_wav', '/audio_log_mel'), misaligned_pkl)
    #     misaligned_output_path = os.path.join(misaligned_output_dir.replace('/audio_wav', '/audio_log_mel'), split, temp_pkl)
    #     copy_and_rename_pkl(misaligned_pkl_path, misaligned_output_path)

    # For misaligning
    for temp_audio in audio_list:
        # Donor clip: any other wav of the same split
        misaligned_wav = random.choice([p for p in audio_list if p != temp_audio])
        misaligned_wav_path = os.path.join(base_dir, misaligned_wav)
        misaligned_output_path = os.path.join(misaligned_split_dir, temp_audio)
        # copy_and_rename_pkl is a plain file copy, so it is reused for wavs
        copy_and_rename_pkl(misaligned_wav_path, misaligned_output_path)

    print(f"Processed {split} split")

print("Audio processing completed!")