In [None]:
# Install mirdata into the runtime; it is imported and used by the cells below.
# NOTE(review): the version is unpinned. The recorded run resolved mirdata 1.0.0 —
# consider pinning it so a re-run installs the same release.
!pip install mirdata

Collecting mirdata
  Downloading mirdata-1.0.0-py3-none-any.whl.metadata (9.1 kB)
Collecting Deprecated>=1.2.14 (from mirdata)
  Downloading deprecated-1.3.1-py2.py3-none-any.whl.metadata (5.9 kB)
Collecting pretty_midi>=0.2.10 (from mirdata)
  Downloading pretty_midi-0.2.11.tar.gz (5.6 MB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m5.6/5.6 MB[0m [31m38.8 MB/s[0m eta [36m0:00:00[0m
[?25h  Preparing metadata (setup.py) ... [?25l[?25hdone
Collecting mido>=1.1.16 (from pretty_midi>=0.2.10->mirdata)
  Downloading mido-1.3.3-py3-none-any.whl.metadata (6.4 kB)
Collecting boto3>=1.9.17 (from smart_open>=5.0.0->smart_open[all]>=5.0.0->mirdata)
  Downloading boto3-1.40.73-py3-none-any.whl.metadata (6.8 kB)
Collecting azure-storage-blob (from smart_open>=5.0.0->smart_open[all]>=5.0.0->mirdata)
  Downloading azure_storage_blob-12.27.1-py3-none-any.whl.metadata (26 kB)
Collecting azure-common (from smart_open>=5.0.0->smart_open[all]>=5.0.0->mirdata)
  Downloading azure

In [None]:
# Mount Google Drive at /content/drive; every dataset and output path used in this
# notebook lives under that mount point.
from google.colab import drive
drive.mount('/content/drive')

Mounted at /content/drive


In [None]:
# Folder on the mounted Drive that is passed to mirdata as data_home for MedleyDB-Pitch.
data_dir = '/content/drive/MyDrive/MedleyDB-Pitch'

In [None]:
import mirdata

In [None]:
# Create the mirdata loader for the 'medleydb_pitch' dataset, rooted at data_dir.
medleydb_pitch = mirdata.initialize('medleydb_pitch', data_home=data_dir)

In [None]:
# In the recorded run this call printed manual-download instructions: access has to be
# requested on Zenodo and the unzipped MedleyDB-Pitch folder copied into data_dir by hand.
medleydb_pitch.download()

48.0kB [00:01, 47.7kB/s]
    To download this dataset, visit:
    https://zenodo.org/record/2620624#.XKZc7hNKh24
    and request access.

    Once downloaded, unzip the file MedleyDB-Pitch.zip
    and copy the result to:
    /content/drive/MyDrive/MedleyDB-Pitch



In [None]:

import os
import random
import numpy as np
import librosa

import IPython.display as ipd


# `dataset` is the name later cells use to reach the MedleyDB-Pitch loader.
dataset = medleydb_pitch
tracks = dataset.load_tracks()  # dict: {track_id: Track}

# Sanity check on the number of tracks found (the recorded run reports 103).
print(f"Loaded {len(tracks)} tracks from medleydb_pitch")

Loaded 103 tracks from medleydb_pitch


In [None]:
import pandas as pd

# Seed both the NumPy and the stdlib global RNGs.
np.random.seed(78)
random.seed(78)

# Output folder for the manifest and the detuned copies written by later cells.
PROJECT_ROOT = '/content/drive/MyDrive/MedleyDB-Pitch-tuning'
os.makedirs(PROJECT_ROOT, exist_ok=True)

# Detune direction tags: "+" shifts up, "-" shifts down.
TUNING_TAGS = ["+", "-"]

# One record per track: its id and raw instrument label, falling back to "Unknown"
# when the track has no `instrument` attribute or the attribute is empty.
rows = [
    {
        "track_id": tid,
        "instrument_raw": getattr(dataset.track(tid), "instrument", None) or "Unknown",
    }
    for tid in dataset.track_ids
]

df = pd.DataFrame(rows)

# Rule: instrument -> class_tag (vocal / instrument)
def map_class_tag(instr: str) -> str:
    """Map a raw instrument label to a coarse class tag.

    Args:
        instr: Instrument label of a track. ``None``, the empty string and
            non-string values (for example a pandas NaN) are treated as
            "no label".

    Returns:
        ``"vocal"`` when the label contains "male singer" (case-insensitive),
        which also matches "female singer"; ``"instrument"`` otherwise.
    """
    if not isinstance(instr, str):
        # Missing labels can arrive as None or as a float NaN; neither has .lower().
        return "instrument"
    # "female singer" contains "male singer", so a single substring test covers both.
    return "vocal" if "male singer" in instr.lower() else "instrument"

df["class_tag"] = df["instrument_raw"].apply(map_class_tag)

print("Class tag distribution:")
print(df["class_tag"].value_counts(), "\n")


def _assign_tuning_tags(tracks_df, tuning_tags, random_state=42):
    """Assign a tuning tag to every track, stratified by class_tag.

    Each class is shuffled and then cut into ``len(tuning_tags)`` consecutive,
    near-equal chunks; chunk k receives ``tuning_tags[k]``.

    Args:
        tracks_df: DataFrame with "track_id" and "class_tag" columns.
        tuning_tags: Sequence of tags to distribute.
        random_state: Seed for the per-class shuffle.

    Returns:
        DataFrame with columns "track_id", "class_tag", "tuning_tag".
    """
    records = []
    for cls, group in tracks_df.groupby("class_tag"):
        # Randomly assign: shuffle the class before cutting it into chunks.
        shuffled = group.sample(frac=1.0, random_state=random_state).reset_index(drop=True)

        # Split the row positions rather than the DataFrame itself: calling
        # np.array_split on a DataFrame goes through the deprecated
        # DataFrame.swapaxes and emits a FutureWarning. The chunk boundaries
        # are the same either way.
        chunks = np.array_split(np.arange(len(shuffled)), len(tuning_tags))

        for tuning_tag, positions in zip(tuning_tags, chunks):
            for track_id in shuffled["track_id"].iloc[positions]:
                records.append({
                    "track_id": track_id,
                    "class_tag": cls,
                    "tuning_tag": tuning_tag,   # '+' or '-'
                })

    return pd.DataFrame(records, columns=["track_id", "class_tag", "tuning_tag"])


df_plan = _assign_tuning_tags(df, TUNING_TAGS)

# See the distribution (class_tag x tuning_tag)
pivot = df_plan.pivot_table(
    index="class_tag",
    columns="tuning_tag",
    values="track_id",
    aggfunc="count",
    fill_value=0,
)
print("Stratified counts (class_tag × tuning_tag):")
print(pivot, "\n")

# Save manifest; the detuning cells read it back from this path.
csv_path = os.path.join(PROJECT_ROOT, "manifest_tuning.csv")
df_plan.to_csv(csv_path, index=False)
print(f"Saved manifest to: {csv_path}\n")

print("df_plan.head():")
print(df_plan.head())

Class tag distribution:
class_tag
instrument    52
vocal         51
Name: count, dtype: int64 

Stratified counts (class_tag × tuning_tag):
tuning_tag   +   -
class_tag         
instrument  26  26
vocal       26  25 

Saved manifest to: /content/drive/MyDrive/MedleyDB-Pitch-tuning/manifest_tuning.csv

df_plan.head():
                                  track_id   class_tag tuning_tag
0             MusicDelta_Beethoven_STEM_10  instrument          +
1             MusicDelta_LatinJazz_STEM_04  instrument          +
2             MusicDelta_Pachelbel_STEM_03  instrument          +
3  MatthewEntwistle_TheFlaxenField_STEM_02  instrument          +
4             MusicDelta_ModalJazz_STEM_02  instrument          +


  return bound(*args, **kwds)


In [None]:
import soundfile as sf
import librosa

# Configuration
PROJECT_ROOT = '/content/drive/MyDrive/MedleyDB-Pitch-tuning'
MANIFEST_CSV = os.path.join(PROJECT_ROOT, 'manifest_tuning.csv')

# Detune magnitude in cents; the direction comes from each track's tuning_tag.
DETUNE_CENTS = 25
# Sign of the shift for every tag the manifest is allowed to contain.
TAG_SIGN = {"+": 1, "-": -1}

AUDIO_OUT_DIR = os.path.join(PROJECT_ROOT, f'{DETUNE_CENTS}cents', 'audio')
PITCH_OUT_DIR = os.path.join(PROJECT_ROOT, f'{DETUNE_CENTS}cents', 'pitch')

os.makedirs(AUDIO_OUT_DIR, exist_ok=True)
os.makedirs(PITCH_OUT_DIR, exist_ok=True)

# Skip if file already exists
SKIP_IF_EXISTS = True

# Read manifest
df_plan = pd.read_csv(MANIFEST_CSV)

print("manifest_tuning.csv preview:")
print(df_plan.head(), "\n")

# Fail before any audio is processed: previously every tag other than "+"
# (a typo, a missing value) was silently detuned downwards.
unknown_tags = set(df_plan["tuning_tag"].unique()) - set(TAG_SIGN)
if unknown_tags:
    raise ValueError(f"Unexpected tuning_tag value(s) in {MANIFEST_CSV}: {unknown_tags}")

n_tracks = len(df_plan)

# Main detuning loop
for pos, (track_id, tuning_tag) in enumerate(
    zip(df_plan["track_id"], df_plan["tuning_tag"]), start=1
):
    tr = dataset.track(track_id)
    in_audio_path = tr.audio_path
    in_pitch_path = tr.pitch_path

    # Output path (keep original filename)
    audio_fname = os.path.basename(in_audio_path)
    pitch_fname = os.path.basename(in_pitch_path)

    out_audio_path = os.path.join(AUDIO_OUT_DIR, audio_fname)
    out_pitch_path = os.path.join(PITCH_OUT_DIR, pitch_fname)

    # Both outputs must exist for a track to count as done.
    if SKIP_IF_EXISTS and os.path.exists(out_audio_path) and os.path.exists(out_pitch_path):
        print(f"[{pos}/{n_tracks}] {track_id} already exists, skipping")
        continue

    # 1) Read original audio
    y, sr = sf.read(in_audio_path)  # y: (n_samples,) or (n_samples, n_channels)

    # Calculate detune parameters
    cents = TAG_SIGN[tuning_tag] * DETUNE_CENTS
    n_steps = cents / 100.0                 # semitones
    factor = 2.0 ** (cents / 1200.0)        # frequency scaling factor

    # 2) Perform pitch shift on audio (maintain length)
    if y.ndim == 1:
        # Mono
        y_shift = librosa.effects.pitch_shift(y.astype(np.float32), sr=sr, n_steps=n_steps)
    else:
        # Multi-channel, process channel by channel
        y_shift = np.zeros_like(y, dtype=np.float32)
        for ch in range(y.shape[1]):
            y_shift[:, ch] = librosa.effects.pitch_shift(
                y[:, ch].astype(np.float32),
                sr=sr,
                n_steps=n_steps
            )

    # Ensure length is exactly the same as original (pitch_shift might have 1-2 frame deviation)
    if y_shift.shape[0] > y.shape[0]:
        y_shift = y_shift[:y.shape[0], ...]
    elif y_shift.shape[0] < y.shape[0]:
        pad_width = y.shape[0] - y_shift.shape[0]
        if y_shift.ndim == 1:
            y_shift = np.pad(y_shift, (0, pad_width), mode="constant")
        else:
            y_shift = np.pad(y_shift, ((0, pad_width), (0, 0)), mode="constant")

    # 3) Write detuned audio
    sf.write(out_audio_path, y_shift, sr)

    # 4) Modify pitch CSV
    # The pitch CSV is read as two header-less columns: time_sec, f0_Hz
    pitch_df = pd.read_csv(
        in_pitch_path,
        header=None,
        names=["time", "f0"]
    )

    # Only scale where f0 > 0 (0 usually means no sound/unannotated)
    mask = pitch_df["f0"] > 0
    pitch_df.loc[mask, "f0"] = pitch_df.loc[mask, "f0"] * factor

    # Write to new directory, keeping two columns without header
    pitch_df.to_csv(out_pitch_path, index=False, header=False, float_format="%.6f")

    print(f"[{pos}/{n_tracks}] {track_id} done | cents={cents}, factor={factor:.6f}")

print(f"\nAll {DETUNE_CENTS} cents detuning done!")
print(f"Audio out dir : {AUDIO_OUT_DIR}")
print(f"Pitch out dir : {PITCH_OUT_DIR}")

manifest_tuning.csv preview:
                                  track_id   class_tag tuning_tag
0             MusicDelta_Beethoven_STEM_10  instrument          +
1             MusicDelta_LatinJazz_STEM_04  instrument          +
2             MusicDelta_Pachelbel_STEM_03  instrument          +
3  MatthewEntwistle_TheFlaxenField_STEM_02  instrument          +
4             MusicDelta_ModalJazz_STEM_02  instrument          + 

[1/103] MusicDelta_Beethoven_STEM_10 done | cents=25, factor=1.014545
[2/103] MusicDelta_LatinJazz_STEM_04 done | cents=25, factor=1.014545
[3/103] MusicDelta_Pachelbel_STEM_03 done | cents=25, factor=1.014545
[4/103] MatthewEntwistle_TheFlaxenField_STEM_02 done | cents=25, factor=1.014545
[5/103] MusicDelta_ModalJazz_STEM_02 done | cents=25, factor=1.014545
[6/103] MatthewEntwistle_FairerHopes_STEM_14 done | cents=25, factor=1.014545
[7/103] MusicDelta_Beethoven_STEM_06 done | cents=25, factor=1.014545
[8/103] SecretMountains_HighHorse_STEM_01 done | cents=25, facto

In [None]:
import matplotlib.pyplot as plt
from IPython.display import Audio, display

# Path configuration (consistent with previous cells)
# AUDIO_25_DIR / PITCH_25_DIR are the folders the 25-cent detuning cell writes to.
PROJECT_ROOT   = '/content/drive/MyDrive/MedleyDB-Pitch-tuning'
MANIFEST_CSV   = os.path.join(PROJECT_ROOT, 'manifest_tuning.csv')
AUDIO_25_DIR   = os.path.join(PROJECT_ROOT, '25cents', 'audio')
PITCH_25_DIR   = os.path.join(PROJECT_ROOT, '25cents', 'pitch')

# Re-read the manifest from disk instead of relying on the in-memory df_plan.
df_plan = pd.read_csv(MANIFEST_CSV)

print("Manifest preview:")
print(df_plan.head(), "\n")

# Utility functions

def pick_random_track(df, class_tag, seed=13):
    """Draw one manifest row of the requested class and return its id and tag.

    Args:
        df: Manifest DataFrame with "track_id", "class_tag", "tuning_tag" columns.
        class_tag: Class to draw from ("vocal" / "instrument").
        seed: random_state for the draw, so the same seed picks the same track.

    Returns:
        Tuple ``(track_id, tuning_tag)`` of the selected row.

    Raises:
        ValueError: if no row has the requested class_tag.
    """
    candidates = df.loc[df["class_tag"] == class_tag]
    if candidates.empty:
        raise ValueError(f"No tracks with class_tag={class_tag}")
    chosen = candidates.sample(n=1, random_state=seed).iloc[0]
    return chosen["track_id"], chosen["tuning_tag"]


def load_audio_pair(track_id):
    """Read a track's original audio and its 25-cent detuned counterpart.

    The detuned file is looked up in AUDIO_25_DIR under the same file name as
    the original audio.

    Returns:
        ``((y_orig, sr_orig), (y_det, sr_det), orig_audio_path, det_audio_path)``

    Raises:
        AssertionError: if the two files have different sample rates.
    """
    orig_audio_path = dataset.track(track_id).audio_path
    det_audio_path = os.path.join(AUDIO_25_DIR, os.path.basename(orig_audio_path))

    # sf.read yields a (samples, sample_rate) pair for each file.
    original = sf.read(orig_audio_path)
    detuned = sf.read(det_audio_path)

    assert original[1] == detuned[1], "Sample rate mismatch between original and detuned audio."

    return original, detuned, orig_audio_path, det_audio_path


def load_pitch_pair(track_id):
    """Read a track's original pitch annotation and its 25-cent detuned copy.

    The detuned CSV is looked up in PITCH_25_DIR under the same file name as
    the original annotation.

    Returns:
        ``(pitch_orig, pitch_det, orig_pitch_path, det_pitch_path)`` where both
        pitch objects are DataFrames with "time" and "f0" columns.
    """
    def _read_pitch(path):
        # Annotation files are header-less and read as two columns: time, f0.
        return pd.read_csv(path, header=None, names=["time", "f0"])

    orig_pitch_path = dataset.track(track_id).pitch_path
    det_pitch_path = os.path.join(PITCH_25_DIR, os.path.basename(orig_pitch_path))

    pitch_orig = _read_pitch(orig_pitch_path)
    pitch_det = _read_pitch(det_pitch_path)

    return pitch_orig, pitch_det, orig_pitch_path, det_pitch_path


def plot_pitch_overlay(pitch_orig, pitch_det, title="", semitone_window=4,
                       time_range=(50, 100), detune_label="Detuned pitch (25 cents)"):
    """
    Plots the detuned and original pitch curves on the same graph:
    - x-axis: time (s)
    - y-axis: MIDI pitch (semitone scale, 1 semitone per tick)
    Focuses on a small range for visual inspection of the 0.25 semitone offset.

    Args:
        pitch_orig: DataFrame with "time" and "f0" columns (original annotation).
        pitch_det: DataFrame with an "f0" column, frame-aligned with pitch_orig.
        title: Figure title; also prefixes the printed detune statistics.
        semitone_window: Height of the y-axis window in semitones, centred on
            the median original pitch.
        time_range: ``(start_s, end_s)`` shown on the x-axis, or ``None`` to let
            matplotlib show the full extent (useful for tracks shorter than the
            default window).
        detune_label: Legend label of the detuned curve.

    Raises:
        ValueError: if the two pitch tracks have a different number of frames.
    """
    t = pitch_orig["time"].values
    f0o = pitch_orig["f0"].values
    f0d = pitch_det["f0"].values

    if len(f0o) != len(f0d):
        raise ValueError(
            f"Pitch tracks differ in length ({len(f0o)} vs {len(f0d)} frames); "
            "they cannot be compared frame by frame."
        )

    # Align: only consider positions where both are > 0
    mask = (f0o > 0) & (f0d > 0)
    if not mask.any():
        # Without this guard the median of an empty array is NaN and the tick
        # computation below fails with an unrelated-looking error.
        print(f"{title} | no frames where both pitch tracks are voiced; nothing to plot")
        return

    t = t[mask]
    f0o = f0o[mask]
    f0d = f0d[mask]

    # Hz -> MIDI (semitone scale)
    midi_o = 69 + 12 * np.log2(f0o / 440.0)
    midi_d = 69 + 12 * np.log2(f0d / 440.0)

    # Calculate actual cents offset for sanity check
    cents_diff = 1200 * np.log2(f0d / f0o)
    med_cents  = np.median(cents_diff)
    mean_cents = np.mean(cents_diff)
    print(f"{title} | median detune \u2248 {med_cents:.2f} cents, mean \u2248 {mean_cents:.2f} cents")

    # Limit the y-axis to a small window around the median original pitch,
    # with one tick per semitone.
    center_midi = np.median(midi_o)
    half_win = semitone_window / 2.0

    y_min = np.floor(center_midi - half_win)
    y_max = np.ceil(center_midi + half_win)

    yticks = np.arange(y_min, y_max + 1, 1)  # 1 semitone per tick

    # Plotting
    plt.figure(figsize=(10, 4))
    plt.plot(t, midi_o, label="Original pitch", linewidth=1.0)
    plt.plot(t, midi_d, label=detune_label, linewidth=1.0, alpha=0.8)
    plt.xlabel("Time (s)")
    plt.ylabel("Pitch (MIDI semitones)")
    plt.title(title)
    plt.grid(True, alpha=0.3)

    plt.ylim(y_min, y_max)
    if time_range is not None:
        plt.xlim(*time_range)
    plt.yticks(yticks)

    plt.legend()
    plt.tight_layout()
    plt.show()


def audition_and_plot(track_id, class_tag, tuning_tag):
    """Show audio players for the original and 25-cent detuned track, then plot both pitch curves.

    Args:
        track_id: Track to audition.
        class_tag: Class label, used only in the printed header and plot title.
        tuning_tag: '+' or '-', used only in the printed header and plot title.
    """
    def _first_channel_f32(samples):
        # Playback uses a single channel: channel 0 of multi-channel audio.
        mono = samples[:, 0] if samples.ndim > 1 else samples
        return mono.astype(np.float32)

    print("=" * 80)
    print(f"class_tag={class_tag}, track_id={track_id}, tuning_tag={tuning_tag}")

    # ---- Audio ----
    (y_orig, sr_orig), (y_det, sr_det), orig_ap, det_ap = load_audio_pair(track_id)
    assert sr_orig == sr_det

    y_orig_play = _first_channel_f32(y_orig)
    y_det_play = _first_channel_f32(y_det)

    # Scale both clips by their common peak when it exceeds full scale.
    peak = max(np.max(np.abs(y_orig_play)), np.max(np.abs(y_det_play)))
    if peak > 1.0:
        y_orig_play = y_orig_play / peak
        y_det_play = y_det_play / peak

    players = (
        ("Original audio:", orig_ap, y_orig_play, sr_orig),
        ("Detuned audio (25 cents):", det_ap, y_det_play, sr_det),
    )
    for caption, path, samples, rate in players:
        print(caption, path)
        display(Audio(samples, rate=rate))

    # ---- Pitch ----
    pitch_orig, pitch_det, orig_pp, det_pp = load_pitch_pair(track_id)
    print("Original pitch CSV:", orig_pp)
    print("Detuned pitch CSV :", det_pp)

    plot_pitch_overlay(
        pitch_orig,
        pitch_det,
        title=f"{class_tag} | {track_id} | detune {tuning_tag}25 cents",
    )


# 1) Pick a vocal track (the fixed seed makes the pick repeatable) and audition it
vocal_tid, vocal_tuning_tag = pick_random_track(df_plan, "vocal", seed=52)
audition_and_plot(vocal_tid, "vocal", vocal_tuning_tag)

# 2) Pick an instrument track the same way
instr_tid, instr_tuning_tag = pick_random_track(df_plan, "instrument", seed=98)
audition_and_plot(instr_tid, "instrument", instr_tuning_tag)

In [None]:

# Folder holding the 25-cent detuned audio; files are converted to mono in place.
AUDIO_25_DIR = '/content/drive/MyDrive/MedleyDB-Pitch-tuning/25cents/audio'

cnt_total = 0
cnt_mono  = 0
cnt_stereo_to_mono = 0

# Sorted so the processing order is the same on every run.
for fname in sorted(os.listdir(AUDIO_25_DIR)):
    if not fname.lower().endswith(".wav"):
        continue

    in_path = os.path.join(AUDIO_25_DIR, fname)
    y, sr = sf.read(in_path)
    cnt_total += 1

    if y.ndim == 1:
        # Already mono: leave the file untouched. Rewriting it would only
        # re-encode the same samples, so re-running this cell is now cheap.
        cnt_mono += 1
        continue

    # Multi-channel -> mono by averaging the channels, then overwrite in place.
    y_out = y.mean(axis=1).astype(np.float32)
    sf.write(in_path, y_out, sr)
    cnt_stereo_to_mono += 1

print("Done!")
print(f"Total wav files        : {cnt_total}")
print(f"Already mono           : {cnt_mono}")
print(f"Converted stereo -> mono: {cnt_stereo_to_mono}")
print(f"Directory: {AUDIO_25_DIR}")

Done!
Total wav files        : 103
Already mono           : 0
Converted stereo -> mono: 103
Directory: /content/drive/MyDrive/MedleyDB-Pitch-tuning/25cents/audio


In [None]:
import os
import numpy as np
import pandas as pd
import soundfile as sf
import librosa

# Configuration
PROJECT_ROOT = '/content/drive/MyDrive/MedleyDB-Pitch-tuning'
MANIFEST_CSV = os.path.join(PROJECT_ROOT, 'manifest_tuning.csv')

# Detune magnitude in cents; the direction comes from each track's tuning_tag.
DETUNE_CENTS = 50
# Sign of the shift for every tag the manifest is allowed to contain.
TAG_SIGN = {"+": 1, "-": -1}

AUDIO_OUT_DIR = os.path.join(PROJECT_ROOT, f'{DETUNE_CENTS}cents', 'audio')
PITCH_OUT_DIR = os.path.join(PROJECT_ROOT, f'{DETUNE_CENTS}cents', 'pitch')

os.makedirs(AUDIO_OUT_DIR, exist_ok=True)
os.makedirs(PITCH_OUT_DIR, exist_ok=True)

# Skip if file already exists
SKIP_IF_EXISTS = True

# Read manifest
df_plan = pd.read_csv(MANIFEST_CSV)

print("manifest_tuning.csv preview:")
print(df_plan.head(), "\n")

# Fail before any audio is processed: previously every tag other than "+"
# (a typo, a missing value) was silently detuned downwards.
unknown_tags = set(df_plan["tuning_tag"].unique()) - set(TAG_SIGN)
if unknown_tags:
    raise ValueError(f"Unexpected tuning_tag value(s) in {MANIFEST_CSV}: {unknown_tags}")

n_tracks = len(df_plan)

# Main detuning loop
for pos, (track_id, tuning_tag) in enumerate(
    zip(df_plan["track_id"], df_plan["tuning_tag"]), start=1
):
    tr = dataset.track(track_id)
    in_audio_path = tr.audio_path
    in_pitch_path = tr.pitch_path

    # Output path (keep original filename)
    audio_fname = os.path.basename(in_audio_path)
    pitch_fname = os.path.basename(in_pitch_path)

    out_audio_path = os.path.join(AUDIO_OUT_DIR, audio_fname)
    out_pitch_path = os.path.join(PITCH_OUT_DIR, pitch_fname)

    # Both outputs must exist for a track to count as done.
    if SKIP_IF_EXISTS and os.path.exists(out_audio_path) and os.path.exists(out_pitch_path):
        print(f"[{pos}/{n_tracks}] {track_id} already exists, skipping")
        continue

    # 1) Read original audio and convert to mono
    y, sr = sf.read(in_audio_path)  # y: (n_samples,) or (n_samples, n_channels)

    if y.ndim > 1:
        # Multi-channel -> mono (average across channels)
        y = y.mean(axis=1)

    # Convert to float32 for better librosa compatibility
    y = y.astype(np.float32)

    # Calculate detune parameters
    cents = TAG_SIGN[tuning_tag] * DETUNE_CENTS
    n_steps = cents / 100.0                 # semitones
    factor = 2.0 ** (cents / 1200.0)        # frequency scaling factor

    # 2) Perform pitch shift on audio (maintain length, mono)
    y_shift = librosa.effects.pitch_shift(y, sr=sr, n_steps=n_steps)

    # Ensure length is exactly the same as original (pitch_shift might have 1-2 frame deviation)
    if y_shift.shape[0] > y.shape[0]:
        y_shift = y_shift[:y.shape[0]]
    elif y_shift.shape[0] < y.shape[0]:
        pad_width = y.shape[0] - y_shift.shape[0]
        y_shift = np.pad(y_shift, (0, pad_width), mode="constant")

    # 3) Write detuned audio (mono)
    sf.write(out_audio_path, y_shift, sr)

    # 4) Modify pitch CSV
    # The pitch CSV is read as two header-less columns: time_sec, f0_Hz
    pitch_df = pd.read_csv(
        in_pitch_path,
        header=None,
        names=["time", "f0"]
    )

    # Only scale where f0 > 0 (0 usually means no sound/unannotated)
    mask = pitch_df["f0"] > 0
    pitch_df.loc[mask, "f0"] = pitch_df.loc[mask, "f0"] * factor

    # Write to new directory, keeping two columns without header
    pitch_df.to_csv(out_pitch_path, index=False, header=False, float_format="%.6f")

    print(f"[{pos}/{n_tracks}] {track_id} done | cents={cents}, factor={factor:.6f}")

print(f"\nAll {DETUNE_CENTS} cents detuning done!")
print(f"Audio out dir : {AUDIO_OUT_DIR}")
print(f"Pitch out dir : {PITCH_OUT_DIR}")

In [None]:
import matplotlib.pyplot as plt
from IPython.display import Audio, display

# Path configuration (consistent with previous cells)
PROJECT_ROOT   = '/content/drive/MyDrive/MedleyDB-Pitch-tuning'
MANIFEST_CSV   = os.path.join(PROJECT_ROOT, 'manifest_tuning.csv')
# NOTE(review): the *_25_DIR names are carried over from the 25-cent cell, but here
# they point at the 50cents output. The helper functions below read these globals,
# so renaming them means updating those functions as well.
AUDIO_25_DIR   = os.path.join(PROJECT_ROOT, '50cents', 'audio')
PITCH_25_DIR   = os.path.join(PROJECT_ROOT, '50cents', 'pitch')

# Re-read the manifest from disk instead of relying on the in-memory df_plan.
df_plan = pd.read_csv(MANIFEST_CSV)

print("Manifest preview:")
print(df_plan.head(), "\n")

# Utility functions

def pick_random_track(df, class_tag, seed=13):
    """Select one track of the given class from the manifest.

    Args:
        df: Manifest DataFrame with "track_id", "class_tag", "tuning_tag" columns.
        class_tag: Class to select from ("vocal" / "instrument").
        seed: random_state of the draw; a fixed seed always yields the same track.

    Returns:
        Tuple ``(track_id, tuning_tag)`` of the drawn row.

    Raises:
        ValueError: if the manifest has no row with that class_tag.
    """
    in_class = df["class_tag"] == class_tag
    pool = df[in_class]
    if pool.shape[0] == 0:
        raise ValueError(f"No tracks with class_tag={class_tag}")
    picked = pool.sample(n=1, random_state=seed).iloc[0]
    track_id, tuning_tag = picked["track_id"], picked["tuning_tag"]
    return track_id, tuning_tag


def load_audio_pair(track_id):
    """Read a track's original audio and its 50-cent detuned counterpart.

    The detuned file is looked up under the same file name in the folder named
    by the AUDIO_25_DIR global, which this cell sets to the 50cents audio folder.

    Returns:
        ``((y_orig, sr_orig), (y_det, sr_det), orig_audio_path, det_audio_path)``

    Raises:
        AssertionError: if the two files have different sample rates.
    """
    orig_audio_path = dataset.track(track_id).audio_path
    det_audio_path = os.path.join(AUDIO_25_DIR, os.path.basename(orig_audio_path))

    # sf.read yields a (samples, sample_rate) pair for each file.
    original = sf.read(orig_audio_path)
    detuned = sf.read(det_audio_path)

    assert original[1] == detuned[1], "Sample rate mismatch between original and detuned audio."

    return original, detuned, orig_audio_path, det_audio_path


def load_pitch_pair(track_id):
    """Read a track's original pitch annotation and its 50-cent detuned copy.

    The detuned CSV is looked up under the same file name in the folder named
    by the PITCH_25_DIR global, which this cell sets to the 50cents pitch folder.

    Returns:
        ``(pitch_orig, pitch_det, orig_pitch_path, det_pitch_path)`` where both
        pitch objects are DataFrames with "time" and "f0" columns.
    """
    def _read_pitch(path):
        # Annotation files are header-less and read as two columns: time, f0.
        return pd.read_csv(path, header=None, names=["time", "f0"])

    orig_pitch_path = dataset.track(track_id).pitch_path
    det_pitch_path = os.path.join(PITCH_25_DIR, os.path.basename(orig_pitch_path))

    pitch_orig = _read_pitch(orig_pitch_path)
    pitch_det = _read_pitch(det_pitch_path)

    return pitch_orig, pitch_det, orig_pitch_path, det_pitch_path


def plot_pitch_overlay(pitch_orig, pitch_det, title="", semitone_window=4,
                       time_range=(50, 100), detune_label="Detuned pitch (50 cents)"):
    """
    Plots the detuned and original pitch curves on the same graph:
    - x-axis: time (s)
    - y-axis: MIDI pitch (semitone scale, 1 semitone per tick)
    Focuses on a small range for visual inspection of the 0.5 semitone offset.

    Args:
        pitch_orig: DataFrame with "time" and "f0" columns (original annotation).
        pitch_det: DataFrame with an "f0" column, frame-aligned with pitch_orig.
        title: Figure title; also prefixes the printed detune statistics.
        semitone_window: Height of the y-axis window in semitones, centred on
            the median original pitch.
        time_range: ``(start_s, end_s)`` shown on the x-axis, or ``None`` to let
            matplotlib show the full extent (useful for tracks shorter than the
            default window).
        detune_label: Legend label of the detuned curve. The default now names
            50 cents; this copy of the function previously said "25 cents".

    Raises:
        ValueError: if the two pitch tracks have a different number of frames.
    """
    t = pitch_orig["time"].values
    f0o = pitch_orig["f0"].values
    f0d = pitch_det["f0"].values

    if len(f0o) != len(f0d):
        raise ValueError(
            f"Pitch tracks differ in length ({len(f0o)} vs {len(f0d)} frames); "
            "they cannot be compared frame by frame."
        )

    # Align: only consider positions where both are > 0
    mask = (f0o > 0) & (f0d > 0)
    if not mask.any():
        # Without this guard the median of an empty array is NaN and the tick
        # computation below fails with an unrelated-looking error.
        print(f"{title} | no frames where both pitch tracks are voiced; nothing to plot")
        return

    t = t[mask]
    f0o = f0o[mask]
    f0d = f0d[mask]

    # Hz -> MIDI (semitone scale)
    midi_o = 69 + 12 * np.log2(f0o / 440.0)
    midi_d = 69 + 12 * np.log2(f0d / 440.0)

    # Calculate actual cents offset for sanity check
    cents_diff = 1200 * np.log2(f0d / f0o)
    med_cents  = np.median(cents_diff)
    mean_cents = np.mean(cents_diff)
    print(f"{title} | median detune \u2248 {med_cents:.2f} cents, mean \u2248 {mean_cents:.2f} cents")

    # Limit the y-axis to a small window around the median original pitch,
    # with one tick per semitone.
    center_midi = np.median(midi_o)
    half_win = semitone_window / 2.0
    y_min = np.floor(center_midi - half_win)
    y_max = np.ceil(center_midi + half_win)

    yticks = np.arange(y_min, y_max + 1, 1)  # 1 semitone per tick

    # Plotting
    plt.figure(figsize=(10, 4))
    plt.plot(t, midi_o, label="Original pitch", linewidth=1.0)
    plt.plot(t, midi_d, label=detune_label, linewidth=1.0, alpha=0.8)
    plt.xlabel("Time (s)")
    plt.ylabel("Pitch (MIDI semitones)")
    plt.title(title)
    plt.grid(True, alpha=0.3)

    plt.ylim(y_min, y_max)
    if time_range is not None:
        plt.xlim(*time_range)
    plt.yticks(yticks)

    plt.legend()
    plt.tight_layout()
    plt.show()


def audition_and_plot(track_id, class_tag, tuning_tag):
    """Show audio players for the original and 50-cent detuned track, then plot both pitch curves.

    Args:
        track_id: Track to audition.
        class_tag: Class label, used only in the printed header and plot title.
        tuning_tag: '+' or '-', used only in the printed header and plot title.
    """
    def _first_channel_f32(samples):
        # Playback uses a single channel: channel 0 of multi-channel audio.
        mono = samples[:, 0] if samples.ndim > 1 else samples
        return mono.astype(np.float32)

    print("=" * 80)
    print(f"class_tag={class_tag}, track_id={track_id}, tuning_tag={tuning_tag}")

    # ---- Audio ----
    (y_orig, sr_orig), (y_det, sr_det), orig_ap, det_ap = load_audio_pair(track_id)
    assert sr_orig == sr_det

    y_orig_play = _first_channel_f32(y_orig)
    y_det_play = _first_channel_f32(y_det)

    # Scale both clips by their common peak when it exceeds full scale.
    peak = max(np.max(np.abs(y_orig_play)), np.max(np.abs(y_det_play)))
    if peak > 1.0:
        y_orig_play = y_orig_play / peak
        y_det_play = y_det_play / peak

    players = (
        ("Original audio:", orig_ap, y_orig_play, sr_orig),
        ("Detuned audio (50 cents):", det_ap, y_det_play, sr_det),
    )
    for caption, path, samples, rate in players:
        print(caption, path)
        display(Audio(samples, rate=rate))

    # ---- Pitch ----
    pitch_orig, pitch_det, orig_pp, det_pp = load_pitch_pair(track_id)
    print("Original pitch CSV:", orig_pp)
    print("Detuned pitch CSV :", det_pp)

    plot_pitch_overlay(
        pitch_orig,
        pitch_det,
        title=f"{class_tag} | {track_id} | detune {tuning_tag}50 cents",
    )


# 1) Pick a vocal track (the fixed seed makes the pick repeatable) and audition it
vocal_tid, vocal_tuning_tag = pick_random_track(df_plan, "vocal", seed=150)
audition_and_plot(vocal_tid, "vocal", vocal_tuning_tag)

# 2) Pick an instrument track the same way
instr_tid, instr_tuning_tag = pick_random_track(df_plan, "instrument", seed=98)
audition_and_plot(instr_tid, "instrument", instr_tuning_tag)

In [None]:
# Point a second medleydb_pitch loader at the 25-cent detuned copy, whose audio/ and
# pitch/ files keep the original file names.
# NOTE(review): the "noise" names are misleading — this folder holds detuned data.
noise_dir = '/content/drive/MyDrive/MedleyDB-Pitch-tuning/25cents'
noise_dataset = mirdata.initialize('medleydb_pitch', data_home=noise_dir)

In [None]:
# In the recorded run this only printed the manual-download instructions for noise_dir;
# the detuned files in that folder come from the detuning cell, not from this call.
noise_dataset.download()

208kB [00:02, 84.1kB/s]                          
    To download this dataset, visit:
    https://zenodo.org/record/2620624#.XKZc7hNKh24
    and request access.

    Once downloaded, unzip the file MedleyDB-Pitch.zip
    and copy the result to:
    /content/drive/MyDrive/MedleyDB-Pitch-tuning/25cents



In [None]:
# Count the track ids known to the loader rooted at the detuned copy.
track_ids = noise_dataset.track_ids
print(f"✅ Found {len(track_ids)} tracks.")

✅ Found 103 tracks.
