# 🎼 Composer Classification — Colab Orchestrator (One-Click)
This notebook runs the whole pipeline in **one Colab runtime**. It will:

1) Mount Google Drive (optional but recommended)

2) Find your **patched notebooks** automatically (or unzip `final_report_kit.zip` if present)

3) Ensure Kaggle auth

4) Inject parameters into each notebook

5) Patch preprocessing to extract into a **writable** folder

6) Execute **Preprocessing → (Hybrid/LSTM) → Evaluation** via nbclient

7) Summarize metrics & show confusion matrices


> Tip: If you see a Kaggle 403, upload your `kaggle.json` (Kaggle API token) to Drive, then uncomment and re-run the auth cell in section 3 (its code is commented out by default).


## 0) Install dependencies (first run only)

In [1]:
# If this is your first run in a fresh Colab session, run this cell.
# Installs (unpinned, quietly) the third-party packages for the pipeline. nbclient is used
# further down to execute the notebooks; the remaining packages are presumably needed by
# the executed notebooks themselves — TODO confirm against their imports.
!pip -q install kagglehub nbclient pretty_midi mido scikit-learn matplotlib pandas
try:
    import torch, torchvision, torchaudio  # often preinstalled on Colab GPU runtimes
except Exception:
    # Fallback when the import above fails: install PyTorch from the CPU-only wheel index.
    !pip -q install torch torchvision torchaudio --index-url https://download.pytorch.org/whl/cpu


[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m5.6/5.6 MB[0m [31m23.7 MB/s[0m eta [36m0:00:00[0m
[?25h  Preparing metadata (setup.py) ... [?25l[?25hdone
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m54.6/54.6 kB[0m [31m3.5 MB/s[0m eta [36m0:00:00[0m
[?25h  Building wheel for pretty_midi (setup.py) ... [?25l[?25hdone


## 1) Mount Drive (optional) & auto-discover the `notebooks/` folder

In [2]:
from google.colab import drive
from pathlib import Path
import os, zipfile

# Mount Google Drive. A failure is only reported, not raised, so the cell can be re-run
# and the pipeline can continue without Drive.
try:
    drive.mount('/content/drive')
except Exception as mount_error:
    print("Drive mount note:", mount_error)

# Convenience: unpack an uploaded kit archive once (skipped when the notebooks folder
# it would create is already there).
_kit_zip = Path('/content/final_report_kit.zip')
_kit_notebooks = Path('/content/report/notebooks')
if _kit_zip.exists() and not _kit_notebooks.exists():
    with zipfile.ZipFile('/content/final_report_kit.zip', 'r') as archive:
        archive.extractall('/content')

# Auto-detect the notebooks directory
# File names of the four patched notebooks that must sit together in one folder.
REQUIRED = {
    "LSTM_Training_PATCHED_lstm.ipynb",
    "CNN_LSTM_Evaluation_PATCHED_eval.ipynb",
    "CNN_Hybrid_PATCHED_hybrid_CLEAN.ipynb",
    "AAI511_Final_Data_Preprocessing_2_PATCHED_preproc.ipynb",
}

def has_all(dirpath: Path) -> bool:
    """Return True when every notebook named in REQUIRED is directly inside ``dirpath``.

    Only the immediate ``*.ipynb`` entries of ``dirpath`` are considered;
    sub-folders are not searched.
    """
    present = set()
    for notebook in dirpath.glob("*.ipynb"):
        present.add(notebook.name)
    return present >= REQUIRED

# Places to look for the patched notebooks, in priority order.
CANDIDATES = [
    Path('/content/report/notebooks'),
    Path('/content/drive/MyDrive/ComposerReport/report/notebooks'),
    Path('/content/drive/MyDrive').resolve(),
    Path('/content').resolve(),
]

NOTEBOOKS_DIR = None
for base in CANDIDATES:
    if not base.is_dir():
        continue
    # Fast path: the candidate itself holds all notebooks.
    if has_all(base):
        NOTEBOOKS_DIR = base
        break
    # Otherwise walk the whole tree under the candidate (this is a full recursive
    # os.walk, so it can take a while on a large Drive) and keep the first match.
    NOTEBOOKS_DIR = next(
        (Path(root) for root, _dirs, _files in os.walk(base) if has_all(Path(root))),
        None,
    )
    if NOTEBOOKS_DIR:
        break

assert NOTEBOOKS_DIR is not None, "Couldn't find the patched notebooks. Upload `final_report_kit.zip` and run again, or set NOTEBOOKS_DIR manually."

# Put artifacts & processed_data next to the notebooks folder when it is literally named
# 'notebooks' (which may be on Drive); otherwise fall back to /content.
if NOTEBOOKS_DIR.name == 'notebooks':
    ROOT = NOTEBOOKS_DIR.parent
else:
    ROOT = Path('/content')
ARTIFACTS = ROOT / 'artifacts'
ARTIFACTS.mkdir(parents=True, exist_ok=True)
OUTPUT_DIR = ROOT / 'processed_data'
OUTPUT_DIR.mkdir(parents=True, exist_ok=True)

for _label, _location in (("NOTEBOOKS_DIR:", NOTEBOOKS_DIR),
                          ("ARTIFACTS:", ARTIFACTS),
                          ("OUTPUT_DIR:", OUTPUT_DIR)):
    print(_label, _location)


Mounted at /content/drive
NOTEBOOKS_DIR: /content/drive/MyDrive/ComposerReport/report/notebooks
ARTIFACTS: /content/drive/MyDrive/ComposerReport/report/artifacts
OUTPUT_DIR: /content/drive/MyDrive/ComposerReport/report/processed_data


## 2) Configuration

In [3]:
# Pipeline settings — adjust to your needs; the defaults are safe.
CONFIG = dict(
    # Kaggle dataset slug: replace with your actual dataset if it differs
    KAGGLE_DATASET="blanderbuss/midi-classic-music",
    # Set to e.g. "something.zip" if the dataset has multiple zips
    ZIP_FILENAME=None,
    # Composers used as classification targets
    TARGET_COMPOSERS=["bach", "beethoven", "chopin", "mozart"],
    # Which pipeline steps to run
    RUN_PREPROCESS=True,
    RUN_HYBRID_TRAINING=True,
    RUN_LSTM_TRAINING=True,
)

CONFIG


{'KAGGLE_DATASET': 'blanderbuss/midi-classic-music',
 'ZIP_FILENAME': None,
 'TARGET_COMPOSERS': ['bach', 'beethoven', 'chopin', 'mozart'],
 'RUN_PREPROCESS': True,
 'RUN_HYBRID_TRAINING': True,
 'RUN_LSTM_TRAINING': True}

## 3) Kaggle authentication (required for KaggleHub)

In [4]:
# NOTE: every line of this cell is commented out, so running it does nothing.
# If uncommented, it would copy the first kaggle.json found in ./, Drive's MyDrive/ or
# MyDrive/.kaggle/ to ~/.kaggle/kaggle.json (mode 0o600), or print a warning when none exists.
# Uncomment the lines below if Kaggle downloads fail with an authentication error.
# import os, shutil
# from pathlib import Path

# home = Path.home()/".kaggle"
# home.mkdir(parents=True, exist_ok=True)

# installed = False
# for s in [Path("./kaggle.json"),
#           Path("/content/drive/MyDrive/kaggle.json"),
#           Path("/content/drive/MyDrive/.kaggle/kaggle.json")]:
#     if s.exists():
#         shutil.copy(s, home/"kaggle.json"); os.chmod(home/"kaggle.json", 0o600)
#         print("✅ Installed kaggle.json from", s)
#         installed = True
#         break

# if not installed:
#     print("⚠️ kaggle.json not found. Public datasets may still require auth. Upload kaggle.json to Drive and re-run this cell if you hit 403 errors.")


## 4) Helpers: parameter injection, preprocessing patch, and execution

In [5]:
import nbformat, re, json, traceback, os
from nbclient import NotebookClient
from nbformat.v4 import new_code_cell
from pathlib import Path

def _render_parameter_source(params: dict) -> str:
    """Render ``params`` as Python source: one ``name = <literal>`` line per entry."""
    lines = []
    for name, value in params.items():
        if isinstance(value, os.PathLike):
            # repr(Path) is ``PosixPath('...')``, a name the target notebook does not
            # define; hand over the plain string instead.
            value = os.fspath(value)
        elif isinstance(value, tuple):
            # Tuples have always reached the target notebook as lists; keep that.
            value = list(value)
        # repr() yields valid Python for str/list/None/bool/numbers, whereas JSON
        # encoding would turn None/True/False inside a list into null/true/false.
        lines.append(f"{name} = {value!r}")
    return "\n".join(lines)


def _parameters_insert_index(nb) -> int:
    """Return the index just after the first code cell tagged ``parameters``, else 0."""
    for position, cell in enumerate(nb.cells):
        if cell.cell_type != "code":
            continue
        metadata = getattr(cell, "metadata", None) or {}
        if "parameters" in (metadata.get("tags") or ()):
            return position + 1
    return 0


def inject_parameters(nb_path: Path, params: dict):
    """Load a notebook and add a code cell that assigns ``params`` as variables.

    Args:
        nb_path: Path of the ``.ipynb`` file to read (the file itself is not modified).
        params: Mapping of variable name -> value. Values are written as Python
            literals; path-like values are passed as plain strings and tuples as lists.

    Returns:
        The in-memory notebook with one extra code cell.

    The cell is placed right after the first code cell tagged ``parameters`` so that the
    injected values override that cell's defaults. When no cell carries the tag, it is
    inserted at the very top (index 0); in that case any later assignment to the same
    name inside the notebook still wins over the injected value.
    """
    nb = nbformat.read(nb_path.as_posix(), as_version=4)
    cell = new_code_cell(_render_parameter_source(params))
    nb.cells.insert(_parameters_insert_index(nb), cell)
    return nb

def patch_preprocessing_extraction(nb):
    """Rewrite the download/extract cell so archives are unpacked under OUTPUT_DIR.

    The first code cell whose source mentions both ``kagglehub.dataset_download`` and
    ``zipfile.ZipFile`` is edited in place:

    * a ``zips = [...]`` assignment becomes a recursive ``os.walk`` search for ``.zip``
      files under ``download_root``;
    * every ``extract_path = ...`` line is pointed at ``OUTPUT_DIR/extracted_midis``;
    * every ``midi_root = ...`` line becomes ``midi_root = extract_path``;
    * ``import os`` and ``from pathlib import Path`` are prepended when missing, because
      the injected code uses both.

    The rewrite is regex based and line oriented: only the first line of a multi-line
    ``extract_path``/``midi_root`` assignment is replaced, and a ``zips`` list whose
    expression contains nested ``[...]`` is cut at the first ``]``.

    Args:
        nb: Notebook object exposing ``cells`` with ``cell_type`` and ``source``.

    Returns:
        ``(nb, patched)`` where ``patched`` is True when a matching cell was found.
    """
    zip_search = (
        "zips = []\n"
        "for _root, _dirs, _files in os.walk(download_root):\n"
        "    for _fn in _files:\n"
        "        if _fn.lower().endswith('.zip'):\n"
        "            zips.append(os.path.join(_root, _fn))"
    )
    extract_assignment = "extract_path = os.fspath((Path(OUTPUT_DIR) / 'extracted_midis').resolve())"

    patched = False
    for c in nb.cells:
        if c.cell_type != 'code':
            continue
        src = c.source
        if "kagglehub.dataset_download" in src and "zipfile.ZipFile" in src:
            # Decide about ``import os`` before injecting code that mentions ``os``.
            has_os_import = re.search(
                r"^\s*import\s+(?:[\w.]+\s*,\s*)*os\b", src, flags=re.MULTILINE
            ) is not None
            # recursive zip search; \b keeps names like ``all_zips`` untouched and the
            # optional whitespace accepts ``zips=[...]`` as well as ``zips = [...]``.
            # Callables are used as replacements so the text is inserted verbatim.
            src = re.sub(r"\bzips\s*=\s*\[.*?\]", lambda _m: zip_search, src, flags=re.DOTALL)
            # set extract_path to OUTPUT_DIR/extracted_midis ((?!=) skips ``==`` comparisons)
            src = re.sub(r"\bextract_path\s*=(?!=).*", lambda _m: extract_assignment, src)
            # midi_root uses extraction
            src = re.sub(r"\bmidi_root\s*=(?!=).*", "midi_root = extract_path", src)
            if "from pathlib import Path" not in src:
                src = "from pathlib import Path\n" + src
            if not has_os_import:
                # The injected code calls os.walk / os.fspath.
                src = "import os\n" + src
            c.source = src
            patched = True
            break
    return nb, patched

def execute_notebook(nb, name: str):
    """Run ``nb`` with nbclient and save the result as ``ARTIFACTS/<name>_EXECUTED.ipynb``.

    Args:
        nb: In-memory notebook to execute.
        name: Label used in progress messages and in the output file name.

    Returns:
        Path of the written notebook.

    Raises:
        Any exception raised during execution is re-raised after the error has been
        printed and the partially executed notebook has been written for debugging.
    """
    out_path = ARTIFACTS / f"{name}_EXECUTED.ipynb"
    print("▶️ Executing:", name)
    runner = NotebookClient(nb, timeout=None, kernel_name='python3')
    try:
        finished_nb = runner.execute()
    except Exception as err:
        print("❌ Error while executing", name, "->", type(err).__name__, err)
        traceback.print_exc()
        # Keep whatever was executed so far so the failure can be inspected.
        nbformat.write(runner.nb, out_path.as_posix())
        raise
    else:
        nbformat.write(finished_nb, out_path.as_posix())
        print("✅ Wrote:", out_path)
        return out_path


In [6]:
# --- Helper: replace fragile Hybrid loader with a safe one ---
import nbformat
from nbformat.v4 import new_code_cell

# Source text of the replacement cell that patch_hybrid_loader() puts into the Hybrid
# notebook. It runs inside the executed notebook, not here, and relies on BASE_DIR being
# defined by an earlier cell of that notebook. It unpickles BASE_DIR/lstm_data.pkl,
# accepts either a dict or a 2-/3-item sequence, and asserts that X and y are non-empty
# and of equal length.
# NOTE(review): pickle.load executes arbitrary code from the file — only use with
# lstm_data.pkl files produced by your own preprocessing run.
SAFE_HYBRID_LOADER = r"""
from pathlib import Path
import os, pickle, numpy as np
from collections import Counter

# ensure BASE_DIR is a Path
if not isinstance(BASE_DIR, Path):
    BASE_DIR = Path(BASE_DIR)

with open(BASE_DIR / 'lstm_data.pkl','rb') as f:
    data = pickle.load(f)

def first_non_none(*vals):
    for v in vals:
        if v is not None:
            return v
    return None

# Support tuple/list or dict-shaped pickles
if isinstance(data, dict):
    X_lstm = first_non_none(data.get('X'), data.get('X_lstm'), data.get('X_windows'))
    y      = first_non_none(data.get('y'), data.get('labels'))
    le     = first_non_none(data.get('le'), data.get('label_encoder'))
else:
    if len(data) == 3:
        X_lstm, y, le = data
    elif len(data) == 2:
        X_lstm, y = data
        le = None
    else:
        raise ValueError(f"Unexpected lstm_data.pkl format: len={len(data)}")

print("Type(X_lstm):", type(X_lstm))
try: print("len(X_lstm):", len(X_lstm))
except TypeError: print("X_lstm shape:", getattr(X_lstm, 'shape', 'N/A'))
print("Type(y):", type(y), "len(y):", len(y) if hasattr(y, '__len__') else 'N/A')
if getattr(le, "classes_", None) is not None:
    print("Label classes:", le.classes_)

assert hasattr(X_lstm, '__len__') and len(X_lstm) == len(y), "X and y must be same length"
assert len(X_lstm) > 0, "X_lstm is empty — upstream preprocessing likely failed."
"""

def patch_hybrid_loader(nb):
    """
    Swap the notebook's lstm_data.pkl loading cell for the SAFE_HYBRID_LOADER cell.

    The first code cell whose source contains "lstm_data.pkl" together with either
    "data.get('X')" or "Handle either tuple/list or dict" is replaced; later cells are
    left alone.
    Returns (nb, replaced: bool).
    """
    markers = ("data.get('X')", "Handle either tuple/list or dict")
    for idx in range(len(nb.cells)):
        cell = nb.cells[idx]
        if cell.cell_type != "code":
            continue
        text = cell.source
        if "lstm_data.pkl" not in text:
            continue
        if any(marker in text for marker in markers):
            nb.cells[idx] = new_code_cell(SAFE_HYBRID_LOADER)
            return nb, True
    return nb, False


## 5) Run the pipeline

In [7]:
from pathlib import Path

# 5.1 Preprocess: inject the dataset settings, redirect extraction, then execute
if CONFIG.get("RUN_PREPROCESS", True):
    preproc_params = dict(
        KAGGLE_DATASET=CONFIG["KAGGLE_DATASET"],
        ZIP_FILENAME=CONFIG["ZIP_FILENAME"],
        OUTPUT_DIR=str(OUTPUT_DIR),
        TARGET_COMPOSERS=CONFIG["TARGET_COMPOSERS"],
    )
    nb_pre, did_patch = patch_preprocessing_extraction(
        inject_parameters(
            NOTEBOOKS_DIR / "AAI511_Final_Data_Preprocessing_2_PATCHED_preproc.ipynb",
            preproc_params,
        )
    )
    if did_patch:
        print("🔧 Patched preprocessing to extract into OUTPUT_DIR/extracted_midis")
    execute_notebook(nb_pre, "PREPROCESSING")

# 5.2 Hybrid training (optional)
if CONFIG.get("RUN_HYBRID_TRAINING", False):
    from nbformat.v4 import new_code_cell

    # BASE_DIR is the very folder preprocessing wrote into
    train_params = dict(BASE_DIR=str(OUTPUT_DIR))
    nb_h, did_replace = patch_hybrid_loader(
        inject_parameters(NOTEBOOKS_DIR / "CNN_Hybrid_PATCHED_hybrid_CLEAN.ipynb", train_params)
    )
    print(
        "🔧 Replaced fragile Hybrid loader with safe loader"
        if did_replace
        else "ℹ️ Hybrid loader already safe (no replacement needed)"
    )

    # Second cell of the executed notebook: pin BASE_DIR again, list the .pkl files found
    # there, and stop early when any of the three expected pickles is missing.
    verify_cell = new_code_cell(f"""
from pathlib import Path, PurePath
BASE_DIR = Path({repr(str(OUTPUT_DIR))})
print("HYBRID BASE_DIR ->", BASE_DIR)
import os
print("Files in BASE_DIR:", sorted([f for f in os.listdir(BASE_DIR) if f.endswith(".pkl")]))
assert (BASE_DIR/"lstm_data.pkl").exists(), "lstm_data.pkl not found in " + str(BASE_DIR)
assert (BASE_DIR/"lstm_dev.pkl").exists(),  "lstm_dev.pkl not found in "  + str(BASE_DIR)
assert (BASE_DIR/"lstm_test.pkl").exists(), "lstm_test.pkl not found in " + str(BASE_DIR)
""")
    nb_h.cells.insert(1, verify_cell)

    execute_notebook(nb_h, "TRAIN_HYBRID")

# 5.3 LSTM training (optional): reads its data from OUTPUT_DIR via BASE_DIR
if CONFIG.get("RUN_LSTM_TRAINING", False):
    train_params = dict(BASE_DIR=Path(OUTPUT_DIR).as_posix())
    nb_l = inject_parameters(
        NOTEBOOKS_DIR / "LSTM_Training_PATCHED_lstm.ipynb",
        train_params,
    )
    execute_notebook(nb_l, "TRAIN_LSTM")

# 5.4 Evaluation: not gated by a CONFIG flag, runs whenever this cell gets this far
eval_params = dict(BASE_DIR=Path(OUTPUT_DIR).as_posix())
nb_e = inject_parameters(
    NOTEBOOKS_DIR / "CNN_LSTM_Evaluation_PATCHED_eval.ipynb",
    eval_params,
)
execute_notebook(nb_e, "EVALUATION")

🔧 Patched preprocessing to extract into OUTPUT_DIR/extracted_midis
▶️ Executing: PREPROCESSING
✅ Wrote: /content/drive/MyDrive/ComposerReport/report/artifacts/PREPROCESSING_EXECUTED.ipynb
🔧 Replaced fragile Hybrid loader with safe loader
▶️ Executing: TRAIN_HYBRID
❌ Error while executing TRAIN_HYBRID -> CellExecutionError An error occurred while executing the following cell:
------------------

from pathlib import Path, PurePath
BASE_DIR = Path('/content/drive/MyDrive/ComposerReport/report/processed_data')
print("HYBRID BASE_DIR ->", BASE_DIR)
import os
print("Files in BASE_DIR:", sorted([f for f in os.listdir(BASE_DIR) if f.endswith(".pkl")]))
assert (BASE_DIR/"lstm_data.pkl").exists(), "lstm_data.pkl not found in " + str(BASE_DIR)
assert (BASE_DIR/"lstm_dev.pkl").exists(),  "lstm_dev.pkl not found in "  + str(BASE_DIR)
assert (BASE_DIR/"lstm_test.pkl").exists(), "lstm_test.pkl not found in " + str(BASE_DIR)

------------------

----- stdout -----
HYBRID BASE_DIR -> /content/drive/MyD

Traceback (most recent call last):
  File "/tmp/ipython-input-1334599085.py", line 52, in execute_notebook
    executed = client.execute()
               ^^^^^^^^^^^^^^^^
  File "/usr/local/lib/python3.11/dist-packages/jupyter_core/utils/__init__.py", line 164, in wrapped
    return _runner_map[name].run(inner)
           ^^^^^^^^^^^^^^^^^^^^^^^^^^^^
  File "/usr/local/lib/python3.11/dist-packages/jupyter_core/utils/__init__.py", line 127, in run
    return fut.result(None)
           ^^^^^^^^^^^^^^^^
  File "/usr/lib/python3.11/concurrent/futures/_base.py", line 456, in result
    return self.__get_result()
           ^^^^^^^^^^^^^^^^^^^
  File "/usr/lib/python3.11/concurrent/futures/_base.py", line 401, in __get_result
    raise self._exception
  File "/usr/local/lib/python3.11/dist-packages/nbclient/client.py", line 709, in async_execute
    await self.async_execute_cell(
  File "/usr/local/lib/python3.11/dist-packages/nbclient/client.py", line 1062, in async_execute_cell
    await 

CellExecutionError: An error occurred while executing the following cell:
------------------

from pathlib import Path, PurePath
BASE_DIR = Path('/content/drive/MyDrive/ComposerReport/report/processed_data')
print("HYBRID BASE_DIR ->", BASE_DIR)
import os
print("Files in BASE_DIR:", sorted([f for f in os.listdir(BASE_DIR) if f.endswith(".pkl")]))
assert (BASE_DIR/"lstm_data.pkl").exists(), "lstm_data.pkl not found in " + str(BASE_DIR)
assert (BASE_DIR/"lstm_dev.pkl").exists(),  "lstm_dev.pkl not found in "  + str(BASE_DIR)
assert (BASE_DIR/"lstm_test.pkl").exists(), "lstm_test.pkl not found in " + str(BASE_DIR)

------------------

----- stdout -----
HYBRID BASE_DIR -> /content/drive/MyDrive/ComposerReport/report/processed_data
Files in BASE_DIR: []
------------------

[0;31m---------------------------------------------------------------------------[0m
[0;31mAssertionError[0m                            Traceback (most recent call last)
[0;32m/tmp/ipython-input-3519094063.py[0m in [0;36m<cell line: 0>[0;34m()[0m
[1;32m      4[0m [0;32mimport[0m [0mos[0m[0;34m[0m[0;34m[0m[0m
[1;32m      5[0m [0mprint[0m[0;34m([0m[0;34m"Files in BASE_DIR:"[0m[0;34m,[0m [0msorted[0m[0;34m([0m[0;34m[[0m[0mf[0m [0;32mfor[0m [0mf[0m [0;32min[0m [0mos[0m[0;34m.[0m[0mlistdir[0m[0;34m([0m[0mBASE_DIR[0m[0;34m)[0m [0;32mif[0m [0mf[0m[0;34m.[0m[0mendswith[0m[0;34m([0m[0;34m".pkl"[0m[0;34m)[0m[0;34m][0m[0;34m)[0m[0;34m)[0m[0;34m[0m[0;34m[0m[0m
[0;32m----> 6[0;31m [0;32massert[0m [0;34m([0m[0mBASE_DIR[0m[0;34m/[0m[0;34m"lstm_data.pkl"[0m[0;34m)[0m[0;34m.[0m[0mexists[0m[0;34m([0m[0;34m)[0m[0;34m,[0m [0;34m"lstm_data.pkl not found in "[0m [0;34m+[0m [0mstr[0m[0;34m([0m[0mBASE_DIR[0m[0;34m)[0m[0;34m[0m[0;34m[0m[0m
[0m[1;32m      7[0m [0;32massert[0m [0;34m([0m[0mBASE_DIR[0m[0;34m/[0m[0;34m"lstm_dev.pkl"[0m[0;34m)[0m[0;34m.[0m[0mexists[0m[0;34m([0m[0;34m)[0m[0;34m,[0m  [0;34m"lstm_dev.pkl not found in "[0m  [0;34m+[0m [0mstr[0m[0;34m([0m[0mBASE_DIR[0m[0;34m)[0m[0;34m[0m[0;34m[0m[0m
[1;32m      8[0m [0;32massert[0m [0;34m([0m[0mBASE_DIR[0m[0;34m/[0m[0;34m"lstm_test.pkl"[0m[0;34m)[0m[0;34m.[0m[0mexists[0m[0;34m([0m[0;34m)[0m[0;34m,[0m [0;34m"lstm_test.pkl not found in "[0m [0;34m+[0m [0mstr[0m[0;34m([0m[0mBASE_DIR[0m[0;34m)[0m[0;34m[0m[0;34m[0m[0m

[0;31mAssertionError[0m: lstm_data.pkl not found in /content/drive/MyDrive/ComposerReport/report/processed_data


## 6) Results summary

In [None]:
import json, pandas as pd
from pathlib import Path
from IPython.display import display

# Fields copied from each metrics_*.json into the comparison table (missing ones become None).
_METRIC_FIELDS = (
    "model",
    "accuracy",
    "f1_macro",
    "f1_weighted",
    "precision_macro",
    "recall_macro",
)

metric_files = list(Path(ARTIFACTS).glob("metrics_*.json"))
rows = []
for mf in metric_files:
    with open(mf, 'r') as f:
        d = json.load(f)
    rows.append({field: d.get(field) for field in _METRIC_FIELDS})

if not rows:
    print("No metrics_* JSON files in:", ARTIFACTS)
else:
    # Best accuracy first; the same table is shown and saved next to the other artifacts.
    df = pd.DataFrame(rows).sort_values(by="accuracy", ascending=False)
    display(df)
    out_csv = Path(ARTIFACTS) / "model_comparison.csv"
    df.to_csv(out_csv, index=False)
    print("Saved CSV ->", out_csv)


## 7) Confusion matrices

In [None]:
from IPython.display import Image, display
from pathlib import Path

# Show every confusion-matrix image found in ARTIFACTS, each preceded by its file name.
pngs = list(Path(ARTIFACTS).glob("confusion_*.png"))
if len(pngs) == 0:
    print("No confusion_*.png in:", ARTIFACTS)
else:
    for p in pngs:
        print(p.name)
        display(Image(filename=str(p)))
