In [20]:
import os
import random
from tqdm import tqdm

import numpy as np

import pretty_midi
import note_seq
from note_seq.midi_io import midi_to_note_sequence, note_sequence_to_pretty_midi

import IPython.display


In [12]:
# Durations in seconds at 120 BPM, where one quarter note lasts 60 / 120 s.
NOTE_LENGTH_16TH_120BPM = (60 / 120) * 0.25  # a 16th note is a quarter of a quarter note
BAR_LENGTH_120BPM = (60 / 120) * 4.0  # a bar spans 4.0 quarter notes

In [48]:
def empty_note_sequence(qpm=120.0, total_time=0.0):
    """Create a ``NoteSequence`` that carries tempo metadata but no notes.

    Args:
        qpm: Tempo in quarter notes per minute, stored as the sequence's
            single tempo entry.
        total_time: Value assigned to the sequence's ``total_time`` field.

    Returns:
        A new ``NoteSequence`` whose ``ticks_per_quarter`` is note_seq's
        ``STANDARD_PPQ``.
    """
    sequence = note_seq.protobuf.music_pb2.NoteSequence()
    sequence.ticks_per_quarter = note_seq.constants.STANDARD_PPQ
    sequence.total_time = total_time
    tempo = sequence.tempos.add()
    tempo.qpm = qpm
    return sequence

def token_sequence_to_note_sequence(token_sequence, use_program=True, use_drums=True):
    """Decode a token sequence into a ``NoteSequence`` at 120 BPM.

    Recognised tokens: ``PIECE_START``, ``PIECE_END`` (stops decoding),
    ``TRACK_START`` (resets the bar counter), ``TRACK_END``, ``INST=<n|DRUMS>``,
    ``BAR_START`` (moves the clock to the start of the current bar),
    ``BAR_END`` (advances the bar counter), ``NOTE_ON=<pitch>``,
    ``NOTE_OFF=<pitch>``, ``TIME_DELTA=<16th notes>``, ``DENSITY=...`` and
    ``[PAD]`` (both ignored).

    Args:
        token_sequence: Either a list of tokens or a whitespace-separated string.
        use_program: When true, a numeric ``INST`` token also sets the MIDI
            program of the following notes; otherwise the default program 1
            is kept.
        use_drums: When true, ``INST=DRUMS`` marks the following notes as
            drums (program 0). When false, the notes are placed on
            instrument 0 as ordinary notes.

    Returns:
        The decoded ``NoteSequence``.

    Raises:
        AssertionError: If an unknown token is encountered.
        ValueError: If a numeric field of a token cannot be parsed.
    """
    if isinstance(token_sequence, str):
        token_sequence = token_sequence.split()

    note_sequence = empty_note_sequence()

    # Decoder state. Everything is initialised up front so that a sequence
    # which lacks TRACK_START / BAR_START / INST (e.g. a truncated model
    # sample) decodes from time 0 on instrument 0 instead of failing on an
    # unbound local variable.
    current_instrument = 0
    current_program = 1
    current_is_drum = False
    current_bar_index = 0
    current_time = 0.0
    current_notes = {}

    for token in token_sequence:

        if token == "PIECE_START":
            pass
        elif token == "PIECE_END":
            print("The end.")
            break
        elif token == "TRACK_START":
            current_bar_index = 0
        elif token == "TRACK_END":
            pass
        elif token.startswith("INST"):
            instrument_name = token.split("=")[-1]
            if instrument_name == "DRUMS":
                # Always store an integer: keeping the string "DRUMS" around
                # used to crash with ValueError at the next NOTE_ON when
                # use_drums was False.
                current_instrument = 0
                if use_drums:
                    current_program = 0
                    current_is_drum = True
            else:
                current_instrument = int(instrument_name)
                # A melodic track is never a drum track, regardless of
                # use_program; otherwise the flag would leak from a
                # preceding drum track.
                current_is_drum = False
                if use_program:
                    current_program = current_instrument
        elif token == "BAR_START":
            current_time = current_bar_index * BAR_LENGTH_120BPM
            # NOTE_OFF only matches notes started within the same bar.
            current_notes = {}
        elif token == "BAR_END":
            current_bar_index += 1
        elif token.startswith("NOTE_ON"):
            pitch = int(token.split("=")[-1])
            note = note_sequence.notes.add()
            note.start_time = current_time
            # Default length of four 16th notes; overwritten by a matching
            # NOTE_OFF in the same bar.
            note.end_time = current_time + 4 * NOTE_LENGTH_16TH_120BPM
            note.pitch = pitch
            note.instrument = current_instrument
            note.program = current_program
            note.velocity = 80
            note.is_drum = current_is_drum
            current_notes[pitch] = note
        elif token.startswith("NOTE_OFF"):
            pitch = int(token.split("=")[-1])
            if pitch in current_notes:
                current_notes[pitch].end_time = current_time
        elif token.startswith("TIME_DELTA"):
            # The delta is expressed in 16th notes.
            current_time += float(token.split("=")[-1]) * NOTE_LENGTH_16TH_120BPM
        elif token.startswith("DENSITY="):
            pass
        elif token == "[PAD]":
            pass
        else:
            # Explicit raise (same exception type and message as the former
            # `assert False, token`) so unknown tokens are still rejected
            # when Python runs with -O.
            raise AssertionError(token)

    return note_sequence

def note_seq_to_text(notes, bar_length=2.0, sixteenth_length=0.125):
    """Encode notes as one line of whitespace-separated tokens.

    Notes are expected to be grouped by instrument and ordered by start time
    within each instrument. Every change of ``note.instrument`` opens a new
    track. Each note is written as ``NOTE_ON=<pitch> TIME_DELTA=<d>
    NOTE_OFF=<pitch>``, where ``d`` is the note duration in 16th notes
    rounded to one decimal.

    The emitted structure is always balanced: every ``BAR_START`` has a
    ``BAR_END`` and every ``TRACK_START`` a ``TRACK_END``. Bars without notes
    are written as empty ``BAR_START BAR_END`` pairs so that bar indices stay
    aligned with the notes' start times.

    TODO: rests inside a bar and overlapping notes (several NOTE_ONs before a
    TIME_DELTA) are not encoded; only each note's own duration is written.

    Args:
        notes: Iterable of objects with ``instrument``, ``pitch``,
            ``start_time`` and ``end_time`` attributes (times in seconds).
        bar_length: Length of one bar in seconds. The default of 2.0 is one
            4/4 bar at 120 BPM.
        sixteenth_length: Length of a 16th note in seconds. The default of
            0.125 corresponds to 120 BPM.

    Returns:
        The token text, starting with ``PIECE_START`` and ending with
        ``PIECE_END`` followed by a newline. With no notes the result is
        ``"PIECE_START PIECE_END\\n"``.

    Raises:
        ValueError: If ``bar_length`` or ``sixteenth_length`` is not positive.
    """
    if bar_length <= 0 or sixteenth_length <= 0:
        raise ValueError("bar_length and sixteenth_length must be positive")

    tokens = ["PIECE_START"]
    instrument = None
    track_opened = False
    current_bar_end = 0.0

    for note in notes:
        if not track_opened or note.instrument != instrument:
            if track_opened:
                # Close the bar that is still open before closing the track.
                tokens += ["BAR_END", "TRACK_END"]
            tokens += ["TRACK_START", f"INST={note.instrument}", "DENSITY=0", "BAR_START"]
            instrument = note.instrument
            track_opened = True
            current_bar_end = bar_length

        # Advance bar by bar until the note falls inside the open bar, so a
        # gap of several bars produces empty bars instead of shifting the
        # following notes into the wrong bars.
        while note.start_time >= current_bar_end:
            tokens += ["BAR_END", "BAR_START"]
            current_bar_end += bar_length

        time_delta = round((note.end_time - note.start_time) / sixteenth_length, 1)
        tokens += [f"NOTE_ON={note.pitch}", f"TIME_DELTA={time_delta}", f"NOTE_OFF={note.pitch}"]

    if track_opened:
        tokens += ["BAR_END", "TRACK_END"]
    tokens.append("PIECE_END")
    return " ".join(tokens) + "\n"

In [49]:
def preprocess_files(folder, files):
    """Convert MIDI files to token text and concatenate the results.

    Args:
        folder: Directory that contains the MIDI files.
        files: File names inside ``folder``; a progress bar is shown while
            they are processed in the given order.

    Returns:
        One string holding the token text of every file, in order.
    """
    def convert(file_name):
        # Parse the file with pretty_midi, then encode its notes as tokens.
        midi_data = pretty_midi.PrettyMIDI(os.path.join(folder, file_name))
        return note_seq_to_text(midi_to_note_sequence(midi_data).notes)

    return "".join(convert(file_name) for file_name in tqdm(files))

In [50]:
# Folder with the js-fakes MIDI files. Set the JSFAKES_MIDI_DIR environment
# variable to point at your own copy instead of editing the notebook; the
# fallback is the path this notebook was originally run with.
midi_folder = os.environ.get("JSFAKES_MIDI_DIR", "/home/karlos/Documents/workspace/js-fakes/midi")
# Fraction of the files used for training; the remainder is the validation split.
train_size = 0.8

In [51]:
# os.listdir() returns entries in arbitrary, filesystem-dependent order, so the
# names are sorted before the seeded shuffle; otherwise the same seed would
# still give a different train/validation split on another machine.
random.seed(5)
midi_files = sorted(os.listdir(midi_folder))
random.shuffle(midi_files)

In [52]:
# The leading `train_size` fraction of the shuffled list is the training split;
# everything from `split_index` onwards is the validation split.
split_index = int(len(midi_files) * train_size)
train_midi_files = midi_files[:split_index]
valid_midi_files = midi_files[split_index:]
len(train_midi_files), len(valid_midi_files)

(400, 101)

In [53]:
# Encode both splits as token text; each MIDI file becomes one newline-terminated line.
train_text = preprocess_files(midi_folder, train_midi_files)
valid_text = preprocess_files(midi_folder, valid_midi_files)

100%|█████████████████████████████████████████████████████████████| 400/400 [00:03<00:00, 103.11it/s]
100%|█████████████████████████████████████████████████████████████| 101/101 [00:00<00:00, 108.55it/s]


In [54]:
# Write the training split to the working directory. The encoding is explicit
# so the file does not depend on the platform's default locale.
with open('jsfake_train.txt', 'w', encoding='utf-8') as f:
    f.write(train_text)

In [55]:
# Write the validation split to the working directory. The encoding is explicit
# so the file does not depend on the platform's default locale.
with open('jsfake_valid.txt', 'w', encoding='utf-8') as f:
    f.write(valid_text)

In [56]:
%%sh
# Print the canonical absolute path of the current working directory,
# i.e. where the text files written by the cells above ended up.
readlink -f ./

huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
	- Avoid using `tokenizers` before the fork if possible
	- Explicitly set the environment variable TOKENIZERS_PARALLELISM=(true | false)
/home/karlos/Documents/workspace/khm-generation/experiments/sp1_preprocessing_of_jsfake


## Tokenizer

In [81]:
from tokenizers.trainers import WordLevelTrainer
from tokenizers import Tokenizer
from tokenizers.models import WordLevel
from tokenizers.pre_tokenizers import Whitespace, WhitespaceSplit


In [90]:
# Special tokens are reserved at the start of the vocabulary; the first one is
# also the model's unknown-token placeholder.
special_tokens = ["[UNK]", "[CLS]", "[SEP]", "[PAD]", "[MASK]"]
tokenizer = Tokenizer(WordLevel(unk_token=special_tokens[0]))
trainer = WordLevelTrainer(special_tokens=special_tokens)

In [91]:
# Split on whitespace only, so that tokens such as "NOTE_ON=67" or
# "TIME_DELTA=4.0" stay whole vocabulary entries.
tokenizer.pre_tokenizer = WhitespaceSplit()

In [92]:
# Build the vocabulary from the training text only; the validation file is
# deliberately not part of the vocabulary.
tokenizer.train(["jsfake_train.txt"], trainer)

In [93]:
# Show the trained tokenizer serialised as JSON to inspect the learned
# vocabulary (the output is long).
tokenizer.to_str()

'{"version":"1.0","truncation":null,"padding":null,"added_tokens":[{"id":0,"content":"[UNK]","single_word":false,"lstrip":false,"rstrip":false,"normalized":false,"special":true},{"id":1,"content":"[CLS]","single_word":false,"lstrip":false,"rstrip":false,"normalized":false,"special":true},{"id":2,"content":"[SEP]","single_word":false,"lstrip":false,"rstrip":false,"normalized":false,"special":true},{"id":3,"content":"[PAD]","single_word":false,"lstrip":false,"rstrip":false,"normalized":false,"special":true},{"id":4,"content":"[MASK]","single_word":false,"lstrip":false,"rstrip":false,"normalized":false,"special":true}],"normalizer":null,"pre_tokenizer":{"type":"WhitespaceSplit"},"post_processor":null,"decoder":null,"model":{"type":"WordLevel","vocab":{"[UNK]":0,"[CLS]":1,"[SEP]":2,"[PAD]":3,"[MASK]":4,"TIME_DELTA=4.0":5,"TIME_DELTA=2.0":6,"BAR_START":7,"BAR_END":8,"NOTE_OFF=67":9,"NOTE_ON=67":10,"NOTE_OFF=62":11,"NOTE_ON=62":12,"NOTE_OFF=64":13,"NOTE_ON=64":14,"NOTE_OFF=65":15,"NOTE_ON=65

In [94]:
# Save the trained tokenizer as JSON in the current working directory.
tokenizer.save("./tokenizer-jsfake.json")