In [15]:
KAGGLE_DATASET = "blanderbuss/midi-classic-music"
ZIP_FILENAME = None
OUTPUT_DIR = "./processed_data"
TARGET_COMPOSERS = ["bach","beethoven","chopin","mozart"]


In [16]:
# === Parameters (Papermill friendly) ===
KAGGLE_DATASET = KAGGLE_DATASET if 'KAGGLE_DATASET' in globals() else 'blanderbuss/midi-classic-music'
ZIP_FILENAME   = ZIP_FILENAME   if 'ZIP_FILENAME'   in globals() else None
OUTPUT_DIR     = OUTPUT_DIR     if 'OUTPUT_DIR'     in globals() else '/mnt/data/processed_data'
TARGET_COMPOSERS = TARGET_COMPOSERS if 'TARGET_COMPOSERS' in globals() else ['bach','beethoven','chopin','mozart']
print("KAGGLE_DATASET:", KAGGLE_DATASET)
print("ZIP_FILENAME:", ZIP_FILENAME)
print("OUTPUT_DIR:", OUTPUT_DIR)
print("TARGET_COMPOSERS:", TARGET_COMPOSERS)


KAGGLE_DATASET: blanderbuss/midi-classic-music
ZIP_FILENAME: None
OUTPUT_DIR: ./processed_data
TARGET_COMPOSERS: ['bach', 'beethoven', 'chopin', 'mozart']


In [17]:
# Environment setup
!pip install pretty_midi music21 numpy pandas matplotlib scikit-learn



In [18]:
# Imports
import kagglehub
import os, zipfile, pickle
import numpy as np
import pretty_midi

from sklearn.preprocessing import LabelEncoder
from tqdm import tqdm

In [19]:
# Config
SEQUENCE_LENGTH = 50 # input for LSTM and CNN
SEQ_LEN = SEQUENCE_LENGTH
MAX_SEQUENCES_PER_FILE = 50
TARGET_COMPOSERS = {'bach', 'beethoven', 'chopin', 'mozart'}

In [20]:
# Load dataset from kagglehub
dataset_path = kagglehub.dataset_download("blanderbuss/midi-classic-music")
print(f"Path to dataset files: {dataset_path}")

midi_root = "/kaggle/working/midiclassics_extracted"
print(f"Using MIDI folder: {midi_root}")

Path to dataset files: /kaggle/input/midi-classic-music
Using MIDI folder: /kaggle/working/midiclassics_extracted


In [21]:
zip_path = os.path.join(dataset_path, 'midiclassics.zip')
extract_path = '/kaggle/working/midiclassics_extracted'

with zipfile.ZipFile(zip_path, 'r') as zip_ref:
    zip_ref.extractall(extract_path)

print("✅ Unzipped to:", extract_path)

✅ Unzipped to: /kaggle/working/midiclassics_extracted


### Load MIDI Files

In [22]:
composer_dirs = [
    os.path.join(midi_root, d)
    for d in os.listdir(midi_root)
    if os.path.isdir(os.path.join(midi_root, d)) and d.lower() in TARGET_COMPOSERS
]

### Extract Features


In [23]:
def extract_note_sequence(midi):
  notes = []
  for instrument in midi.instruments:
    if not instrument.is_drum:
      for note in instrument.notes:
        notes.append({
            'start': note.start,
            'pitch': note.pitch,
            'duration': note.end - note.start,
            'velocity': note.velocity
        })
  # Sorting by pitch:
  notes.sort(key=lambda x: x['pitch'])
  return notes

### Make Sequences (LSTM/RNN)

In [24]:
def make_feature_sequences(notes, seq_len=SEQUENCE_LENGTH):
  return [notes[i:i+seq_len] for i in range(len(notes) - seq_len)]

### Make Piano Rolls (CNN)

In [25]:
# Notes to piano rolls
def notes_to_piano_roll(notes, seq_len=SEQUENCE_LENGTH, pitch_range=(21, 109)):
  roll = np.zeros((seq_len, pitch_range[1] - pitch_range[0])) # 50 * 88

  for i, note in enumerate(notes[:seq_len]):
    pitch = note['pitch']
    if pitch_range[0] <= pitch < pitch_range[1]:
      roll[i, pitch - pitch_range[0]] = 1.0
  return roll

### Normalize Features

In [26]:
def normalize(sequences):
  return [
      np.array([[n['pitch'], n['duration'], n['velocity']] for n in seq], dtype=np.float32)
      for seq in sequences
  ]

### Label Encoding

### Save model data

In [27]:
def save_model_data(X, y, le, path):
  with open(path, 'wb') as f:
    pickle.dump({'X': X, 'y': y, 'label_encoder': le}, f)
  print(f'Saved to {path}')

### Main Pipeline

In [28]:
# === KaggleHub download & extraction ===
import kagglehub, os, zipfile
from pathlib import Path

Path(OUTPUT_DIR).mkdir(parents=True, exist_ok=True)

download_root = kagglehub.dataset_download(KAGGLE_DATASET)
print("Downloaded to:", download_root)

if ZIP_FILENAME:
    candidate = os.path.join(download_root, ZIP_FILENAME)
    assert os.path.exists(candidate), f"ZIP file not found: {candidate}"
    zip_path = candidate
else:
    zips = []
for _root, _dirs, _files in os.walk(download_root):
    for _fn in _files:
        if _fn.lower().endswith('.zip'):
            zips.append(os.path.join(_root, _fn))
    assert zips, f"No .zip found under {download_root} — set ZIP_FILENAME."
    zip_path = os.path.join(download_root, zips[0])

extract_path = os.fspath((Path(OUTPUT_DIR) / 'extracted_midis').resolve())
os.makedirs(extract_path, exist_ok=True)
with zipfile.ZipFile(zip_path, 'r') as z:
    z.extractall(extract_path)
print("Extracted to:", extract_path)

midi_root = extract_path
composer_dirs = [
    os.path.join(midi_root, d)
    for d in os.listdir(midi_root)
    if os.path.isdir(os.path.join(midi_root, d)) and d.lower() in TARGET_COMPOSERS
]
print("Composer dirs:", composer_dirs[:4], " ... total:", len(composer_dirs))


Downloaded to: /kaggle/input/midi-classic-music
Extracted to: /content/processed_data/extracted_midis
Composer dirs: ['/content/processed_data/extracted_midis/Mozart', '/content/processed_data/extracted_midis/Chopin', '/content/processed_data/extracted_midis/Bach', '/content/processed_data/extracted_midis/Beethoven']  ... total: 4


In [29]:
def preprocess_dataset(composer_dirs):
  all_lstm_sequences = []
  all_cnn_sequences = []
  all_labels = []

  for composer_path in composer_dirs:
    composer = os.path.basename(composer_path).lower()
    print(f"\n🎼 Processing {composer}")

    for file in os.listdir(composer_path):
      if not file.endswith('.mid'):
        continue
      try:
        midi_path = os.path.join(composer_path, file)
        pm = pretty_midi.PrettyMIDI(midi_path)
        notes = extract_note_sequence(pm)
        if len(notes) < SEQ_LEN:
          continue
        sequences = make_feature_sequences(notes, SEQ_LEN)[:MAX_SEQUENCES_PER_FILE]

        piano_rolls = [notes_to_piano_roll(seq) for seq in sequences]
        all_cnn_sequences.extend(piano_rolls)

        normalized = normalize(sequences)
        all_lstm_sequences.extend(normalized)

        all_labels.extend([composer] * len(sequences))
      except Exception as e:
        print(f"⚠️ Skipping {file}: {e}")
        continue

  if not all_lstm_sequences:
    print("No valid sequences found.")
    return None, None, None, None

  le = LabelEncoder()
  le.fit(sorted(TARGET_COMPOSERS))
  y = le.transform(all_labels)

  X_lstm = np.array(all_lstm_sequences, dtype=np.float32)
  X_cnn = np.array(all_cnn_sequences, dtype=np.float32)[..., np.newaxis]

  print(f"\n✅ Final LSTM shape: {X_lstm.shape}")
  print(f"✅ Final CNN shape:  {X_cnn.shape}")
  print(f"✅ Labels shape:     {y.shape}")

  return X_lstm, X_cnn, y, le

In [30]:
# 🚀 Run It
X_lstm, X_cnn, y, le = preprocess_dataset(composer_dirs)

if X_lstm is not None:
    save_model_data(X_lstm, y, le, 'lstm_data.pkl')
    save_model_data(X_cnn, y, le, 'cnn_data.pkl')


🎼 Processing mozart





🎼 Processing chopin

🎼 Processing bach

🎼 Processing beethoven
⚠️ Skipping Anhang 14-3.mid: Could not decode key with 3 flats and mode 255

✅ Final LSTM shape: (23990, 50, 3)
✅ Final CNN shape:  (23990, 50, 88, 1)
✅ Labels shape:     (23990,)
Saved to lstm_data.pkl
Saved to cnn_data.pkl


### Save Preprocessed Data

### Run

In [31]:
!zip cnn_data.pkl.zip /content/cnn_data.pkl

  adding: content/cnn_data.pkl (deflated 99%)


In [32]:
# === Standardized save: split & dump expected files ===
import numpy as np, pickle, os
from pathlib import Path
from sklearn.model_selection import train_test_split

Path(OUTPUT_DIR).mkdir(parents=True, exist_ok=True)

def _ensure_numpy(x):
    return np.array(x)

X_lstm = _ensure_numpy(X_lstm)
y      = _ensure_numpy(y)

X_train_l, X_temp, y_train, y_temp = train_test_split(X_lstm, y, test_size=0.30, random_state=42, stratify=y)
X_dev_l,   X_test_l, y_dev,  y_test = train_test_split(X_temp, y_temp, test_size=0.50, random_state=42, stratify=y_temp)

with open(os.path.join(OUTPUT_DIR, 'label_encoder.pkl'), 'wb') as f:
    pickle.dump(le, f)

with open(os.path.join(OUTPUT_DIR, 'lstm_data.pkl'), 'wb') as f:
    pickle.dump((X_train_l, y_train, le), f)
with open(os.path.join(OUTPUT_DIR, 'lstm_dev.pkl'), 'wb') as f:
    pickle.dump((X_dev_l, y_dev), f)
with open(os.path.join(OUTPUT_DIR, 'lstm_test.pkl'), 'wb') as f:
    pickle.dump((X_test_l, y_test), f)

try:
    X_cnn = _ensure_numpy(X_cnn)
    with open(os.path.join(OUTPUT_DIR, 'cnn_data.pkl'), 'wb') as f:
        pickle.dump((X_cnn, y, le), f)
except Exception as e:
    print("[note] CNN tensor not available or failed to save:", e)

print("Saved standardized outputs to:", OUTPUT_DIR)


Saved standardized outputs to: ./processed_data
