## Imports

In [15]:
import os
import shutil
import numpy as np
import pretty_midi
import pathlib
import collections
from miditok import REMI
import glob

# Loading Data

In [16]:
# Root folder of the MAESTRO v2.0.0 dataset, relative to the notebook's working directory
data_dir = pathlib.Path("maestro-v2.0.0")

# Choose a subset of the dataset

#

In [17]:
# Choose the dataset folder(s) to tokenize here: add further year sub-folders
# to this tuple to widen the selection, e.g. ("2006", "2008").
years_to_tokenize = ("2008",)

# glob.glob returns its matches in arbitrary order, so the raw result is not
# reproducible between runs/machines. Collecting the matches in a set and
# sorting them yields a stable, duplicate-free list even if a year is
# accidentally listed twice.
paths = sorted({
    midi_path
    for year in years_to_tokenize
    for midi_path in glob.glob(f"maestro-v2.0.0/{year}/*.mid*")
})

## List MIDI files with the desired length and time signature

In [18]:
# Alias of the dataset root; the loop below iterates over `paths`, not this folder
midi_files_dir = data_dir

# Selection criteria
desired_time_signature = (4, 4)  # (numerator, denominator)
min_length_seconds = 100
max_length_seconds = 800

# Paths of all files that satisfy both criteria
selected_files = []

for filepath in paths:
    # The whole per-file check stays inside the try block so that a file which
    # fails at any step is reported and skipped instead of aborting the scan.
    try:
        midi_data = pretty_midi.PrettyMIDI(filepath)

        # True if at least one time-signature event equals the desired one
        has_desired_time_signature = any(
            (ts.numerator, ts.denominator) == desired_time_signature
            for ts in midi_data.time_signature_changes
        )

        # Length of the piece in seconds
        length = midi_data.get_end_time()

        # Guard clauses: skip files that miss either criterion
        if not has_desired_time_signature:
            continue
        if not (min_length_seconds <= length <= max_length_seconds):
            continue

        selected_files.append(filepath)

    except Exception as e:
        print(f"Error processing {filepath}: {e}")

# Show the selected files, one path per line
for selected in selected_files:
    print(selected)


maestro-v2.0.0/2008\MIDI-Unprocessed_01_R1_2008_01-04_ORIG_MID--AUDIO_01_R1_2008_wav--1.midi
maestro-v2.0.0/2008\MIDI-Unprocessed_01_R1_2008_01-04_ORIG_MID--AUDIO_01_R1_2008_wav--2.midi
maestro-v2.0.0/2008\MIDI-Unprocessed_01_R1_2008_01-04_ORIG_MID--AUDIO_01_R1_2008_wav--3.midi
maestro-v2.0.0/2008\MIDI-Unprocessed_02_R1_2008_01-05_ORIG_MID--AUDIO_02_R1_2008_wav--1.midi
maestro-v2.0.0/2008\MIDI-Unprocessed_02_R1_2008_01-05_ORIG_MID--AUDIO_02_R1_2008_wav--2.midi
maestro-v2.0.0/2008\MIDI-Unprocessed_02_R1_2008_01-05_ORIG_MID--AUDIO_02_R1_2008_wav--4.midi
maestro-v2.0.0/2008\MIDI-Unprocessed_02_R1_2008_01-05_ORIG_MID--AUDIO_02_R1_2008_wav--5.midi
maestro-v2.0.0/2008\MIDI-Unprocessed_02_R2_2008_01-05_ORIG_MID--AUDIO_02_R2_2008_wav--1.midi
maestro-v2.0.0/2008\MIDI-Unprocessed_02_R2_2008_01-05_ORIG_MID--AUDIO_02_R2_2008_wav--3.midi
maestro-v2.0.0/2008\MIDI-Unprocessed_02_R2_2008_01-05_ORIG_MID--AUDIO_02_R2_2008_wav--4.midi
maestro-v2.0.0/2008\MIDI-Unprocessed_02_R2_2008_01-05_ORIG_MID--AUDIO_

## Convert MIDI files to tokens (saved as JSON)

In [19]:
from miditok import REMI
from pathlib import Path

# Folder that receives the tokenized files
tokens_dir = Path('data/only2008_2')

# MIDI files to tokenize: the ones that passed the length/time-signature filter
midi_paths = selected_files

# REMI tokenizer, constructed with sos_eos=True
tokenizer = REMI(sos_eos=True)

# Convert the MIDI files to tokens saved as JSON files in tokens_dir
tokenizer.tokenize_midi_dataset(midi_paths, tokens_dir)

# Note: no BPE vocabulary is learned in this cell; an already trained
# tokenizer is loaded from 'tokenizer_bpe2.pkl' in a later cell.


Tokenizing MIDIs (data/only2008_2): 100%|██████████| 137/137 [00:37<00:00,  3.66it/s]


## Load the trained BPE tokenizer

In [20]:
import pickle

# Restore the tokenizer object stored in 'tokenizer_bpe2.pkl'; this rebinds the
# name `tokenizer` that an earlier cell assigned to a fresh REMI instance.
# NOTE(review): unpickling can execute arbitrary code - only load pickle files
# that come from a trusted source.
tokenizer = pickle.loads(pathlib.Path('tokenizer_bpe2.pkl').read_bytes())


## Apply the BPE tokenizer to the selected MIDI files

In [21]:
from pathlib import Path

# Source folder: the token JSON files written by the tokenization step
input_path = Path('data/only2008_2')

# Destination folder for the BPE-encoded token files
output_path = Path('data/only2008_BPE_2')

# Make sure the destination exists (including missing parent folders);
# an already existing folder is left untouched.
output_path.mkdir(exist_ok=True, parents=True)

# Run BPE over every file in input_path, writing the results to output_path
tokenizer.apply_bpe_to_dataset(input_path, output_path)


Applying BPE to dataset: 100%|██████████| 137/137 [05:12<00:00,  2.28s/it]
