### Data Preparation

In [None]:
import os
import glob
import json

# Root directory of the AAC audio tree; each speaker has its own sub-folder.
aac_root = "/DATA2/PMCAll/speech/vox2/aac"

# Collect every speaker folder (directories whose name starts with "id"),
# then order them alphabetically so the split below is deterministic.
all_speakers = [
    entry
    for entry in os.listdir(aac_root)
    if entry.startswith("id") and os.path.isdir(os.path.join(aac_root, entry))
]
all_speakers.sort()

# First 50 speakers go to training, the following 50 to testing.
train_speakers, test_speakers = all_speakers[:50], all_speakers[50:100]

def build_utterance_map(speaker_ids, save_path, root=None):
    """Write a JSON file mapping each speaker ID to its sorted ``.m4a`` paths.

    Files are matched exactly one directory level below each speaker folder
    (``<root>/<speaker_id>/<subdir>/<file>.m4a``); the search is not
    recursive, so deeper nesting is ignored. Speakers without any matching
    file are left out of the mapping.

    Args:
        speaker_ids: Iterable of speaker folder names to scan.
        save_path: Path of the JSON file to write (overwritten if it exists).
        root: Directory that contains the speaker folders. Defaults to the
            module-level ``aac_root``.
    """
    if root is None:
        root = aac_root

    utt_map = {}
    for speaker_id in speaker_ids:
        # Escape the fixed prefix so glob metacharacters (e.g. "[") in the
        # root or speaker folder name are matched literally.
        speaker_dir = glob.escape(os.path.join(root, speaker_id))
        m4a_files = sorted(glob.glob(os.path.join(speaker_dir, "*", "*.m4a")))
        if m4a_files:
            utt_map[speaker_id] = m4a_files

    # Save mapping to JSON
    with open(save_path, "w") as f:
        json.dump(utt_map, f, indent=2)
    print(f"Saved: {save_path} ({len(utt_map)} speakers)")

# Write one utterance map per split.
build_utterance_map(speaker_ids=train_speakers, save_path="train_utterance_map.json")
build_utterance_map(speaker_ids=test_speakers, save_path="test_utterance_map.json")


In [None]:
import json
import random

# Read back the speaker -> utterance-path mappings for both splits.
with open("train_utterance_map.json") as f:
    train_utt_map = json.loads(f.read())

with open("test_utterance_map.json") as f:
    test_utt_map = json.loads(f.read())

def generate_pairings(utt_map, num_pairs=1000, seed=None):
    """Generate random pairs of utterances from two different speakers.

    Each pairing draws two distinct speakers and then one utterance per
    speaker. Pairings are drawn independently, so the same combination can
    appear more than once.

    Args:
        utt_map: Mapping of speaker ID to a non-empty list of utterance paths.
        num_pairs: Number of pairings to generate.
        seed: Optional seed for a private random generator, making the result
            reproducible. When ``None``, the shared ``random`` module state
            is used.

    Returns:
        Dict keyed by ``mix_00001``, ``mix_00002``, ... whose values hold
        ``speaker1_id``, ``utt1_path``, ``speaker2_id`` and ``utt2_path``.

    Raises:
        ValueError: If pairings are requested but fewer than two speakers
            are available.
    """
    speaker_ids = list(utt_map)
    if num_pairs > 0 and len(speaker_ids) < 2:
        raise ValueError(
            f"Need at least 2 speakers to build a pair, got {len(speaker_ids)}"
        )

    # Without a seed, keep using the module-level generator so an external
    # random.seed() call still controls the output.
    rng = random if seed is None else random.Random(seed)

    metadata = {}
    for index in range(1, num_pairs + 1):
        # Two distinct speakers, then one utterance from each.
        spk1, spk2 = rng.sample(speaker_ids, 2)
        metadata[f"mix_{index:05d}"] = {
            "speaker1_id": spk1,
            "utt1_path": rng.choice(utt_map[spk1]),
            "speaker2_id": spk2,
            "utt2_path": rng.choice(utt_map[spk2]),
        }

    return metadata

# Draw 1000 training pairings and 200 test pairings.
train_metadata = generate_pairings(train_utt_map, 1000)
test_metadata = generate_pairings(test_utt_map, 200)

# Persist both metadata dictionaries as indented JSON.
with open("train_voxmix_metadata.json", "w") as f:
    f.write(json.dumps(train_metadata, indent=2))

with open("test_voxmix_metadata.json", "w") as f:
    f.write(json.dumps(test_metadata, indent=2))

print("Pairing metadata generated successfully!")
