In [6]:
!pip install mido pretty_midi tqdm

Collecting mido
  Downloading mido-1.3.3-py3-none-any.whl.metadata (6.4 kB)
Collecting pretty_midi
  Downloading pretty_midi-0.2.10.tar.gz (5.6 MB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m5.6/5.6 MB[0m [31m36.2 MB/s[0m eta [36m0:00:00[0m
[?25h  Preparing metadata (setup.py) ... [?25l[?25hdone
Downloading mido-1.3.3-py3-none-any.whl (54 kB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m54.6/54.6 kB[0m [31m3.2 MB/s[0m eta [36m0:00:00[0m
[?25hBuilding wheels for collected packages: pretty_midi
  Building wheel for pretty_midi (setup.py) ... [?25l[?25hdone
  Created wheel for pretty_midi: filename=pretty_midi-0.2.10-py3-none-any.whl size=5592286 sha256=7a2a1bac04d80afaae4f9c1bc18893941e036fc5932fba81b8fab141d429f920
  Stored in directory: /root/.cache/pip/wheels/e6/95/ac/15ceaeb2823b04d8e638fd1495357adb8d26c00ccac9d7782e
Successfully built pretty_midi
Installing collected packages: mido, pretty_midi
Successfully installed mido-1.3.3 p

# MIDI Archive Colab Starter

This notebook helps you build a searchable, enriched, and deduplicated MIDI file archive.

### Pipeline Overview
1. Make your MIDI files available to Colab (this notebook mounts Google Drive)
2. Deduplicate and hash files
3. Enrich metadata via Spotify or Discogs
4. Analyze musical features (key, BPM, polyphony, etc.)
5. Rename files and save historical data in JSON
6. (Optional) Generate MP3 previews for browsing

----

## Let's begin...

In [7]:
# Mount Google Drive at /content/drive so the MIDI folder stored under MyDrive
# can be read (and modified) from this Colab runtime.
from google.colab import drive
drive.mount('/content/drive')

Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).


## Step 1: De-duplicate and Hash MIDI Files

## Step 2: Enrich Metadata with Spotify

In [9]:
import os
import json
import hashlib
from pathlib import Path
from mido import MidiFile
from tqdm import tqdm
from shutil import move

# Root folder of the MIDI archive on the mounted Drive.
# NOTE(review): the visible cells only define the helpers below; nothing calls
# deduplicate_midi_keep_unique(midi_root_path) yet — confirm where it is invoked.
midi_root_path = '/content/drive/MyDrive/midi_folder'

def get_midi_hash(file_path):
    """Fingerprint a MIDI file from its note messages, independent of its name.

    Every ``note_on`` / ``note_off`` message of every track, in file order, is
    reduced to ``(type, note, velocity, time)``. The MD5 hex digest of the
    string form of that list is returned.

    If anything raises while reading or hashing the file, an ``[ERROR]`` line
    is printed and ``None`` is returned so callers can skip the file.
    """
    try:
        note_events = [
            (message.type, message.note, message.velocity, message.time)
            for track in MidiFile(file_path).tracks
            for message in track
            if message.type in ('note_on', 'note_off')
        ]
        serialized = str(note_events).encode('utf-8')
        return hashlib.md5(serialized).hexdigest()
    except Exception as exc:
        print(f"[ERROR] Skipping {file_path}: {exc}")
        return None

def deduplicate_midi_keep_unique(midi_root_path):
    """Keep one copy of each distinct MIDI/KAR file and move the rest to ``trash/``.

    Files under ``midi_root_path`` whose suffix is ``.mid`` or ``.kar`` (any
    letter case) are fingerprinted with ``get_midi_hash``. Files are visited in
    sorted path order, so the first path in that order is the copy that stays
    in place; every later file with the same hash is moved to
    ``<root>/trash``. Files that cannot be hashed are skipped.

    Outputs written to ``<root>/logs``:

    * one ``<stem>.json`` per kept file, listing every filename that shared its
      hash (written after the scan so duplicates are included);
    * ``duplicate_log.json`` with one entry per moved file, including the name
      it received inside ``trash/``.

    Name collisions never overwrite anything: a trashed file or a JSON record
    whose name is already taken gets a ``_1``, ``_2``, ... suffix.

    Args:
        midi_root_path: Root folder of the archive (``str`` or ``Path``). It
            must already exist; ``trash/`` and ``logs/`` are created inside it.

    Returns:
        None. A summary is printed.
    """
    midi_root = Path(midi_root_path)
    trash_dir = midi_root / "trash"
    trash_dir.mkdir(exist_ok=True)

    json_output_dir = midi_root / "logs"
    json_output_dir.mkdir(exist_ok=True)
    dup_log_path = json_output_dir / "duplicate_log.json"

    seen_hashes = {}
    duplicate_log = []

    # Filter up-front so the reported scan count excludes trash/ and logs/,
    # and sort so the choice of the kept copy is reproducible between runs.
    midi_suffixes = {".mid", ".kar"}
    all_midi_files = sorted(
        path
        for path in midi_root.rglob("*")
        if path.suffix.lower() in midi_suffixes
        and trash_dir not in path.parents
        and json_output_dir not in path.parents
        and path.is_file()
    )

    try:
        for midi_path in tqdm(all_midi_files, desc="Scanning MIDI and KAR files"):
            midi_hash = get_midi_hash(midi_path)
            if not midi_hash:
                continue

            record = seen_hashes.get(midi_hash)
            if record is None:
                # First file with this content: it stays where it is.
                seen_hashes[midi_hash] = {
                    "hash": midi_hash,
                    "original_filenames": [midi_path.name],
                    "status": "unique",
                    "source_folder": str(midi_path.parent)
                }
                continue

            # Duplicate content: pick a trash name that is not on disk yet so
            # same-named duplicates from different folders do not clobber
            # each other (shutil.move may overwrite an existing file).
            trash_name = _first_free_name(
                midi_path.name, lambda name: (trash_dir / name).exists()
            )
            move(str(midi_path), str(trash_dir / trash_name))

            record["original_filenames"].append(midi_path.name)
            duplicate_log.append({
                "duplicate": midi_path.name,
                "original": record["original_filenames"][0],
                "folder": str(midi_path.parent),
                "trashed_as": trash_name
            })
            print(f"[MOVED TO TRASH] {midi_path.name}")
    finally:
        # Logs are written even if the scan is interrupted, so files that were
        # already moved stay traceable.
        # Names are reserved per run (not checked on disk) so a re-run
        # refreshes existing records instead of piling up suffixed copies.
        reserved_names = {dup_log_path.name}
        for record in seen_hashes.values():
            stem = Path(record["original_filenames"][0]).stem
            json_name = _first_free_name(f"{stem}.json", reserved_names.__contains__)
            reserved_names.add(json_name)
            with open(json_output_dir / json_name, 'w') as f:
                json.dump(record, f, indent=2)

        # Write summary duplicate log
        with open(dup_log_path, 'w') as f:
            json.dump(duplicate_log, f, indent=2)

    print(f"\nDone. Scanned: {len(all_midi_files)} files")
    print(f"Unique: {len(seen_hashes)} | Duplicates moved: {len(duplicate_log)}")
    print(f"Logs stored in: {json_output_dir}")


def _first_free_name(filename, is_taken):
    """Return ``filename``, or its first ``<stem>_<n><suffix>`` variant that ``is_taken`` rejects."""
    stem, suffix = Path(filename).stem, Path(filename).suffix
    candidate, counter = filename, 1
    while is_taken(candidate):
        candidate = f"{stem}_{counter}{suffix}"
        counter += 1
    return candidate

In [None]:
# from scripts.spotify_enrichment import enrich_with_spotify
# enrich_with_spotify('/content/drive/MyDrive/midi_folder')

## Step 3: Analyze MIDI Musical Features

In [None]:
# from scripts.midi_analysis import analyze_midi
# analyze_midi('/content/drive/MyDrive/midi_folder')