# 00: Data Collection (T2, T3, T4)

Run on Thunder Compute (A100, 500GB storage). Downloads raw audio, segments into
30s clips, extracts MuQ embeddings, then syncs only embeddings + metadata back to
GDrive. Raw audio is discarded after processing.

**Tiers:**
- **T2** Competition: Chopin 2021 -- ordinal ranking signal (~2,000 segments, ~760MB embeddings)
- **T3** MAESTRO audio: cross-performer contrastive signal (~10,000 segments, ~3.8GB embeddings)
- **T4** YouTube piano: augmentation invariance signal (~50,000 segments, ~38GB embeddings) -- optional, run if needed

**Storage budget on remote:**
- MAESTRO audio download: ~200GB (temporary)
- Competition audio: ~50GB (temporary)
- YouTube audio: ~100GB (temporary, only if running T4)
- Embeddings to keep: ~4.6GB (T2+T3) or ~42GB (T2+T3+T4)

---

## 1. Setup

In [None]:
# Configure the cuBLAS workspace via environment variable. This presumably
# targets PyTorch's deterministic-algorithm mode, whose reproducibility notes
# ask for ':4096:8' or ':16:8' -- TODO confirm it is set early enough, i.e.
# before anything initializes CUDA.
import os
os.environ['CUBLAS_WORKSPACE_CONFIG'] = ':4096:8'

# Install rclone; later cells use it to talk to the `gdrive:` remote.
# NOTE(review): the `|| echo` fallback prints 'rclone installed' whenever grep
# matches nothing, which includes the case where the installer itself failed.
# If later rclone commands error out, check `rclone version` first.
!curl -fsSL https://rclone.org/install.sh | sudo bash 2>&1 | grep -E '(successfully|already)' || echo 'rclone installed'

In [None]:
!git clone https://github.com/Jai-Dhiman/crescendAI.git /workspace/crescendai
%cd /workspace/crescendai/model

!curl -LsSf https://astral.sh/uv/install.sh | sh

!uv pip install -e /workspace/crescendai/model --python {sys.executable} --system
!apt-get update && apt-get install -y ffmpeg
os.environ["HF_TOKEN"] = ""

# Sync existing data from GDrive (percepiano labels, pretrain cache metadata, etc.)
!rclone copy gdrive:crescendai_data/model_improvement/data/percepiano_cache ./data/percepiano_cache --progress
!rclone copy gdrive:crescendai_data/model_improvement/data/percepiano_midi ./data/percepiano_midi --progress

In [None]:
import sys
import subprocess
import logging
from pathlib import Path

sys.path.insert(0, 'src')

logging.basicConfig(
    level=logging.INFO,
    format='%(asctime)s [%(levelname)s] %(message)s',
    datefmt='%H:%M:%S',
)

DATA_DIR = Path('/workspace/crescendai/model/data')
print(f'Data directory: {DATA_DIR}')
print('GPU: ', end='')
!nvidia-smi --query-gpu=name,memory.total --format=csv,noheader

## 2. T2: Competition Recordings (Chopin 2021)

Scrapes metadata from Wikipedia, discovers YouTube URLs from Chopin Institute
playlists, downloads audio, segments into 30s clips, extracts MuQ embeddings.

**Expected output:** ~2,000 segments, ~760MB embeddings.

In [None]:
# T2, steps 1-2: scrape the competition results and look up the matching
# YouTube videos. The names imported here are also used by the next T2 cells.
from model_improvement.competition import (
    discover_youtube_urls,
    download_competition_audio,
    load_competition_metadata,
    scrape_chopin_results,
    segment_and_embed_competition,
)
from model_improvement.data import CompetitionPairSampler

# Cache directory for this competition, and the recordings index inside it.
comp_cache = DATA_DIR.joinpath('competition_cache', 'chopin2021')
comp_metadata = comp_cache.joinpath('recordings.jsonl')

print('Step 1: Scraping competition results...')
results = scrape_chopin_results(comp_cache)
print(f'Found {len(results)} performers')

print('Step 2: Discovering YouTube URLs...')
url_mapping = discover_youtube_urls(comp_cache, results)

# `url_mapping` is walked as two nested mappings whose innermost values are
# sized collections of videos; add up those sizes.
total_videos = 0
for rounds in url_mapping.values():
    for vids in rounds.values():
        total_videos += len(vids)
print(f'Found {total_videos} videos for {len(url_mapping)} performers')

In [None]:
# Step 3: Download audio
# Consumes `url_mapping` and `results` from the previous cell; `comp_metadata`
# is the recordings.jsonl path inside `comp_cache`.
# The message reports the return value as *new* recordings, so presumably
# already-downloaded items are skipped on re-run -- TODO confirm.
print('Step 3: Downloading audio...')
records = download_competition_audio(url_mapping, results, comp_cache, comp_metadata)
print(f'Downloaded {len(records)} new recordings')

In [None]:
# Step 4: Segment and extract MuQ embeddings
print('Step 4: Segmenting and extracting MuQ embeddings...')
n_segments = segment_and_embed_competition(comp_cache)
print(f'Processed {n_segments} new segments')

# Summary: compare the metadata records with the embedding files on disk.
all_records = load_competition_metadata(comp_cache)
emb_dir = comp_cache / 'muq_embeddings'
if emb_dir.exists():
    # One `.pt` file per embedding, directly inside the directory.
    n_emb = sum(1 for _ in emb_dir.glob('*.pt'))
else:
    n_emb = 0
print(f'Total metadata records: {len(all_records)}')
print(f'Total MuQ embeddings: {n_emb}')

# Pair statistics are only reported when there is at least one record.
if all_records:
    sampler = CompetitionPairSampler(all_records)
    print(f'Within-piece pairs: {sampler.n_within_piece_pairs}')
    print(f'Cross-round pairs: {sampler.n_cross_round_pairs}')

In [None]:
# Sync T2 embeddings + metadata to GDrive (not raw audio)
!rclone copy {comp_cache}/metadata.jsonl gdrive:crescendai_data/model_improvement/data/competition_cache/chopin2021/ --progress
!rclone copy {comp_cache}/recordings.jsonl gdrive:crescendai_data/model_improvement/data/competition_cache/chopin2021/ --progress
!rclone sync {comp_cache}/muq_embeddings/ gdrive:crescendai_data/model_improvement/data/competition_cache/chopin2021/muq_embeddings/ --progress
print('T2 synced to GDrive')

# Delete raw audio to free space for T3
import shutil
audio_dir = comp_cache / 'audio'
if audio_dir.exists():
    size_gb = sum(f.stat().st_size for f in audio_dir.rglob('*')) / 1e9
    shutil.rmtree(audio_dir)
    print(f'Deleted {size_gb:.1f}GB of raw competition audio')

## 3. T3: MAESTRO Audio (Contrastive Learning)

Downloads MAESTRO v3 audio (~200GB), segments into 30s clips, extracts MuQ
embeddings, builds piece-performer contrastive mapping.

**Expected output:** ~10,000 segments, ~3.8GB embeddings, ~150 contrastive pieces.

In [None]:
# Download MAESTRO v3 audio
maestro_raw = DATA_DIR / 'maestro_raw'
maestro_raw.mkdir(parents=True, exist_ok=True)

maestro_zip = maestro_raw / 'maestro-v3.0.0.zip'
maestro_dir = maestro_raw / 'maestro-v3.0.0'

if not maestro_dir.exists():
    print('Downloading MAESTRO v3 audio (~200GB)... this will take a while.')
    !wget -c -O {maestro_zip} https://storage.googleapis.com/magentadata/datasets/maestro/v3.0.0/maestro-v3.0.0.zip
    print('Extracting...')
    !unzip -q {maestro_zip} -d {maestro_raw}
    # Remove zip to free ~200GB
    maestro_zip.unlink(missing_ok=True)
    print('MAESTRO extracted, zip deleted')
else:
    print(f'MAESTRO already extracted at {maestro_dir}')

In [None]:
# T3, step 1: read the MAESTRO metadata and confirm the audio is on disk.
# `json` and the maestro helpers imported here are used by the next T3 cells.
import json
from model_improvement.maestro import (
    build_piece_performer_mapping,
    parse_maestro_audio_metadata,
    segment_and_embed_maestro,
)

maestro_cache = DATA_DIR.joinpath('maestro_cache')

# Step 1: Parse metadata
print('Step 1: Parsing MAESTRO metadata...')
records = parse_maestro_audio_metadata(maestro_dir)

# Each record's 'audio_filename' is resolved relative to the extracted dataset
# root; keep only the records whose audio file is actually present.
present = [r for r in records if (maestro_dir / r['audio_filename']).exists()]
n_exists = len(present)
print(f'Found {len(records)} records, {n_exists} audio files on disk')

# Stop here rather than running the embedding step over nothing.
if not n_exists:
    raise FileNotFoundError('No MAESTRO audio files found. Check download.')

In [None]:
# Step 2: Segment and extract MuQ embeddings
# Takes the extracted dataset root (`maestro_dir`) and the cache directory
# (`maestro_cache`) with a 30-second segment length. Presumably the results
# are written under `maestro_cache`, since later cells read and sync from
# there -- TODO confirm.
# The return value is reported as the number of *new* segments.
print('Step 2: Segmenting and extracting MuQ embeddings...')
n_segments = segment_and_embed_maestro(maestro_dir, maestro_cache, segment_duration=30.0)
print(f'Processed {n_segments} new segments')

In [None]:
# Step 3: Build contrastive mapping
print('Step 3: Building contrastive mapping...')
mapping = build_piece_performer_mapping(maestro_cache)

# One entry per piece; each value is a sized collection of that piece's segments.
n_pieces = len(mapping)
print(f'Contrastive pairs: {n_pieces} pieces with 2+ recordings')
n_mapped_segments = sum(map(len, mapping.values()))
print(f'Total segments in contrastive set: {n_mapped_segments}')

# Persist the mapping next to the embeddings so it is synced along with them.
mapping_path = maestro_cache / 'contrastive_mapping.json'
with mapping_path.open('w') as f:
    json.dump(mapping, f, indent=2)
print(f'Saved mapping to {mapping_path}')

In [None]:
# Sync T3 embeddings + metadata to GDrive
!rclone copy {maestro_cache}/metadata.jsonl gdrive:crescendai_data/model_improvement/data/maestro_cache/ --progress
!rclone copy {maestro_cache}/contrastive_mapping.json gdrive:crescendai_data/model_improvement/data/maestro_cache/ --progress
!rclone sync {maestro_cache}/muq_embeddings/ gdrive:crescendai_data/model_improvement/data/maestro_cache/muq_embeddings/ --progress
print('T3 synced to GDrive')

# Delete MAESTRO raw audio to free ~200GB
import shutil
if maestro_raw.exists():
    print('Deleting MAESTRO raw audio...')
    shutil.rmtree(maestro_raw)
    print('Deleted MAESTRO raw audio (~200GB freed)')

## 4. T4: YouTube Piano (Optional)

**Only run this section if robustness metrics fall below target** (augmented pairwise
accuracy drop > 10% or cross-condition Pearson r < 0.9). T4 is additive.

Downloads audio from 22 curated YouTube channels, segments, extracts clean + augmented
MuQ embedding pairs.

**Expected output:** ~50,000 segments, ~38GB embeddings (clean + augmented).

In [None]:
# T4, step 1: load the channel list and enumerate each channel's videos.
# The names imported here are also used by the following T4 cells.
from model_improvement.augmentation import augment_and_embed_piano
from model_improvement.youtube_piano import (
    discover_channel_videos,
    download_piano_audio,
    load_channel_list,
    segment_and_embed_piano,
)

yt_cache = DATA_DIR.joinpath('youtube_piano_cache')
channels_file = yt_cache.joinpath('channels.yaml')

# Step 1: Discover videos
channels = load_channel_list(channels_file)
print(f'Loaded {len(channels)} channels')

all_videos = []
for channel in channels:
    # At most 100 videos are requested per channel.
    found = discover_channel_videos(channel['url'], max_videos=100)
    # Tag every video with the channel it came from before pooling them.
    for video in found:
        video['channel'] = channel['name']
    all_videos += found
    print(f'  {channel["name"]}: {len(found)} videos')
print(f'Total videos: {len(all_videos)}')

In [None]:
# Step 2: Download audio
# Consumes `all_videos` from the discovery cell and the `yt_cache` directory.
# The message reports the return value as *new* recordings, so presumably
# already-downloaded items are skipped on re-run -- TODO confirm.
records = download_piano_audio(all_videos, yt_cache)
print(f'Downloaded {len(records)} new recordings')

In [None]:
# Step 3: Segment and extract clean MuQ embeddings
# Works from `yt_cache`; the return value is reported as the number of *new*
# segments processed.
n = segment_and_embed_piano(yt_cache)
print(f'Processed {n} new segments')

In [None]:
# Step 4: Generate augmented embeddings
# Works from `yt_cache`; the return value is reported as the number of
# augmented embeddings generated. Presumably the output lands in
# `muq_embeddings_augmented/`, which the sync cell uploads -- TODO confirm.
n_aug = augment_and_embed_piano(yt_cache)
print(f'Generated {n_aug} augmented embeddings')

In [None]:
# Step 5: Sync T4 to GDrive and cleanup
!rclone copy {yt_cache}/metadata.jsonl gdrive:crescendai_data/model_improvement/data/youtube_piano_cache/ --progress
!rclone copy {yt_cache}/recordings.jsonl gdrive:crescendai_data/model_improvement/data/youtube_piano_cache/ --progress
!rclone sync {yt_cache}/muq_embeddings/ gdrive:crescendai_data/model_improvement/data/youtube_piano_cache/muq_embeddings/ --progress
!rclone sync {yt_cache}/muq_embeddings_augmented/ gdrive:crescendai_data/model_improvement/data/youtube_piano_cache/muq_embeddings_augmented/ --progress
print('T4 synced to GDrive')

# Delete raw audio
import shutil
audio_dir = yt_cache / 'audio'
if audio_dir.exists():
    shutil.rmtree(audio_dir)
    print('Deleted YouTube raw audio')

## 5. Summary

In [None]:
import subprocess

print('Data Collection Summary')
print('=' * 60)

# T2
comp_emb = DATA_DIR / 'competition_cache' / 'chopin2021' / 'muq_embeddings'
n_comp = len(list(comp_emb.glob('*.pt'))) if comp_emb.exists() else 0
print(f'T2 Competition embeddings: {n_comp}')

# T3
maestro_emb = DATA_DIR / 'maestro_cache' / 'muq_embeddings'
n_maestro = len(list(maestro_emb.glob('*.pt'))) if maestro_emb.exists() else 0
maestro_map = DATA_DIR / 'maestro_cache' / 'contrastive_mapping.json'
n_contrastive = 0
if maestro_map.exists():
    import json
    with open(maestro_map) as f:
        n_contrastive = len(json.load(f))
print(f'T3 MAESTRO embeddings: {n_maestro} ({n_contrastive} contrastive pieces)')

# T4
yt_emb = DATA_DIR / 'youtube_piano_cache' / 'muq_embeddings'
yt_aug = DATA_DIR / 'youtube_piano_cache' / 'muq_embeddings_augmented'
n_yt = len(list(yt_emb.glob('*.pt'))) if yt_emb.exists() else 0
n_yt_aug = len(list(yt_aug.glob('*.pt'))) if yt_aug.exists() else 0
print(f'T4 YouTube clean embeddings: {n_yt}')
print(f'T4 YouTube augmented embeddings: {n_yt_aug}')

print()
print('GDrive contents after sync:')
!rclone lsd gdrive:crescendai_data/model_improvement/data/ 2>/dev/null
print()
!rclone size gdrive:crescendai_data/model_improvement/data/ 2>/dev/null

print()
print('Local disk usage:')
!du -sh {DATA_DIR}/*/