In [1]:
import os
import pickle
from pathlib import Path

import numpy as np
import pretty_midi
from sklearn.preprocessing import LabelEncoder

# Fixed list of composers & splits
COMPOSERS       = ['bach', 'beethoven', 'chopin', 'mozart']
SPLITS          = ['train', 'dev', 'test']
# Number of consecutive notes in each sliding window.
SEQUENCE_LENGTH = 50

# Root data folder. Set the AAI511_DATA_DIR environment variable to run the
# notebook against another copy of the data; when it is unset the original
# absolute path is used, so existing behaviour is unchanged.
BASE_DIR = Path(
    os.environ.get(
        "AAI511_DATA_DIR",
        "/Users/Kevin/Library/Mobile Documents/com~apple~CloudDocs/"
        "MS in AI/Neural Networks and Deep Learning AAI-511/"
        "Final Project/AAI511_Final_Project_8/data",
    )
).expanduser()

# Fail fast on a wrong path: mkdir(parents=True) below would otherwise create
# an empty directory tree there and every split would silently yield no data.
if not BASE_DIR.is_dir():
    raise FileNotFoundError(f"Data folder not found: {BASE_DIR}")

OUTPUT_DIR = BASE_DIR / 'processed_data'
OUTPUT_DIR.mkdir(parents=True, exist_ok=True)

# Prepare a LabelEncoder for our four composers. It is fitted once on the
# fixed list (not on each split's labels) so the same encoder, and therefore
# the same label -> integer mapping, is used for every split.
le = LabelEncoder().fit(COMPOSERS)

# Quick sanity check
print("BASE_DIR:", BASE_DIR)
print("Contents of data folder:", [p.name for p in BASE_DIR.iterdir()])

BASE_DIR: /Users/Kevin/Library/Mobile Documents/com~apple~CloudDocs/MS in AI/Neural Networks and Deep Learning AAI-511/Final Project/AAI511_Final_Project_8/data
Contents of data folder: ['.DS_Store', 'test', 'processed_data', 'train', 'dev']


In [2]:
def extract_note_sequence(pm_obj):
    """
    Gather the notes of every non-drum instrument in ``pm_obj`` and return
    them as a list of dicts ordered by onset time.

    Each dict has the keys 'start', 'pitch', 'duration' (``end - start``)
    and 'velocity'. The sort is stable, so notes with the same start keep
    the order in which they were collected (instrument order, then the
    order of each instrument's ``notes`` list).
    """
    pitched_tracks = (track for track in pm_obj.instruments if not track.is_drum)
    events = [
        {
            'start':    note.start,
            'pitch':    note.pitch,
            'duration': note.end - note.start,
            'velocity': note.velocity,
        }
        for track in pitched_tracks
        for note in track.notes
    ]
    return sorted(events, key=lambda event: event['start'])

def make_feature_sequences(notes, seq_len=SEQUENCE_LENGTH):
    """
    Slide a window of length seq_len (stride 1) over the notes list,
    returning a list of (seq_len × 3) float32 arrays.

    Parameters
    ----------
    notes : sequence of dict
        Note dicts providing the 'pitch', 'duration' and 'velocity' keys,
        already in the order they should appear inside the windows.
    seq_len : int
        Number of consecutive notes per window; must be at least 1.

    Returns
    -------
    list of np.ndarray
        ``len(notes) - seq_len + 1`` independent arrays whose columns are
        [pitch, duration, velocity]. Empty when there are fewer than
        seq_len notes.

    Raises
    ------
    ValueError
        If seq_len is smaller than 1 (such a window has no meaning and
        previously produced malformed output without any error).
    """
    if seq_len < 1:
        raise ValueError(f"seq_len must be a positive integer, got {seq_len}")

    n_windows = len(notes) - seq_len + 1
    if n_windows <= 0:
        return []

    # Convert every note to a feature row exactly once; the windows are then
    # plain slices instead of re-reading each dict once per window it is in.
    features = np.array(
        [[n['pitch'], n['duration'], n['velocity']] for n in notes],
        dtype=np.float32,
    )
    # .copy() keeps each window an independent array (a bare slice would be a
    # view that aliases the shared feature matrix).
    return [features[i:i + seq_len].copy() for i in range(n_windows)]

In [3]:
# Build one (X, y) pickle per split: X stacks every note window extracted
# from that split's MIDI files, y holds the encoded composer of each window.
for split in SPLITS:
    all_seqs, all_labels = [], []
    data_dir = BASE_DIR / split
    print(f"\n▶ Processing split: {split}  (looking in {data_dir})")

    for composer in COMPOSERS:
        composer_dir = data_dir / composer
        if not composer_dir.is_dir():
            print(f"  ⚠️  {composer_dir} not found, skipping")
            continue

        # Match the extension case-insensitively ('.MID' and '.midi' files
        # were silently ignored by a case-sensitive '*.mid' glob) and sort so
        # the window order does not depend on filesystem listing order.
        midi_files = sorted(
            p for p in composer_dir.iterdir()
            if p.is_file() and p.suffix.lower() in ('.mid', '.midi')
        )

        for midi_file in midi_files:
            # Only parsing/reading the MIDI file is best-effort; failures in
            # the windowing code below should surface instead of being
            # reported as a bad file.
            try:
                pm    = pretty_midi.PrettyMIDI(str(midi_file))
                notes = extract_note_sequence(pm)
            except Exception as e:
                print(f"  ⚠️ Skipping {midi_file.name}: {e}")
                continue

            # Files with fewer notes than one window contribute nothing.
            if len(notes) < SEQUENCE_LENGTH:
                continue
            seqs = make_feature_sequences(notes)
            all_seqs.extend(seqs)
            all_labels.extend([composer] * len(seqs))

    # Convert and encode
    if all_seqs:
        X = np.array(all_seqs, dtype=np.float32)
    else:
        # Keep the (n_windows, SEQUENCE_LENGTH, 3) layout even with no data so
        # the shape report below and downstream loaders see a consistent rank.
        print(f"  ⚠️  No windows extracted for split: {split}")
        X = np.empty((0, SEQUENCE_LENGTH, 3), dtype=np.float32)
    y = le.transform(all_labels)

    print(f"  → {split}: extracted {X.shape[0]} windows of shape {X.shape[1:]}")

    # Save this split’s pickle
    out_path = OUTPUT_DIR / f'lstm_{split}.pkl'
    with open(out_path, 'wb') as f:
        pickle.dump((X, y), f)
    print(f"  ✅ Saved {split} to {out_path}")


▶ Processing split: train  (looking in /Users/Kevin/Library/Mobile Documents/com~apple~CloudDocs/MS in AI/Neural Networks and Deep Learning AAI-511/Final Project/AAI511_Final_Project_8/data/train)
  ⚠️  /Users/Kevin/Library/Mobile Documents/com~apple~CloudDocs/MS in AI/Neural Networks and Deep Learning AAI-511/Final Project/AAI511_Final_Project_8/data/train/beethoven not found, skipping




  → train: extracted 480044 windows of shape (50, 3)
  ✅ Saved train to /Users/Kevin/Library/Mobile Documents/com~apple~CloudDocs/MS in AI/Neural Networks and Deep Learning AAI-511/Final Project/AAI511_Final_Project_8/data/processed_data/lstm_train.pkl

▶ Processing split: dev  (looking in /Users/Kevin/Library/Mobile Documents/com~apple~CloudDocs/MS in AI/Neural Networks and Deep Learning AAI-511/Final Project/AAI511_Final_Project_8/data/dev)
  ⚠️  /Users/Kevin/Library/Mobile Documents/com~apple~CloudDocs/MS in AI/Neural Networks and Deep Learning AAI-511/Final Project/AAI511_Final_Project_8/data/dev/beethoven not found, skipping




  → dev: extracted 41034 windows of shape (50, 3)
  ✅ Saved dev to /Users/Kevin/Library/Mobile Documents/com~apple~CloudDocs/MS in AI/Neural Networks and Deep Learning AAI-511/Final Project/AAI511_Final_Project_8/data/processed_data/lstm_dev.pkl

▶ Processing split: test  (looking in /Users/Kevin/Library/Mobile Documents/com~apple~CloudDocs/MS in AI/Neural Networks and Deep Learning AAI-511/Final Project/AAI511_Final_Project_8/data/test)
  ⚠️  /Users/Kevin/Library/Mobile Documents/com~apple~CloudDocs/MS in AI/Neural Networks and Deep Learning AAI-511/Final Project/AAI511_Final_Project_8/data/test/beethoven not found, skipping




  → test: extracted 39295 windows of shape (50, 3)
  ✅ Saved test to /Users/Kevin/Library/Mobile Documents/com~apple~CloudDocs/MS in AI/Neural Networks and Deep Learning AAI-511/Final Project/AAI511_Final_Project_8/data/processed_data/lstm_test.pkl


In [4]:
# Persist the fitted LabelEncoder in OUTPUT_DIR so the integer labels stored
# in the split pickles can be mapped back to composer names later.
encoder_path = OUTPUT_DIR / 'label_encoder.pkl'
with encoder_path.open('wb') as encoder_file:
    pickle.dump(le, encoder_file)
print("✅ Saved LabelEncoder to", encoder_path)

✅ Saved LabelEncoder to /Users/Kevin/Library/Mobile Documents/com~apple~CloudDocs/MS in AI/Neural Networks and Deep Learning AAI-511/Final Project/AAI511_Final_Project_8/data/processed_data/label_encoder.pkl
