In [1]:
import pandas as pd
import numpy as np
from music21 import converter, midi, interval, pitch
from mido import MidiFile
import miditoolkit
import os
from os import walk
import json
from tokenizing_functions import extract_events, get_file_and_dirnames
#from helper_functions import get_file_and_dirnames
#from analysis_functions import analyse_data_folder
from tqdm import tqdm
import warnings
warnings.filterwarnings('ignore')

PATH_TRANSPOSED = "../0_data/4_preprocessed_sets"

In [2]:
MIN_DURATION_DENOMINATOR = 32
DURATION_STEPS = 64
POSITION_STEPS = 16
TICKS_PER_BEAT = 1024
TRIOLE_POS_1 = (TICKS_PER_BEAT/12).__round__()
TRIOLE_POS_2 = (TICKS_PER_BEAT/6).__round__()
TICKS_PER_MIN_DURATION = TICKS_PER_BEAT*4/MIN_DURATION_DENOMINATOR
DURATION_BINS = np.arange(TICKS_PER_MIN_DURATION, (TICKS_PER_MIN_DURATION*DURATION_STEPS)+1, TICKS_PER_MIN_DURATION, dtype=int)

start_position_tokens = 37 + DURATION_STEPS + 1
end_position_tokens = start_position_tokens + POSITION_STEPS

token2word = {0: "Bar_None"}
for i in range(1, 37):
    token2word[i] = f"Note-On_{i+59}"
for i in range(37, start_position_tokens-1):
    token2word[i] = f"Note-Duration_{i-36}"
token2word[start_position_tokens-1] = "Note-Duration_triole"
for i in range(start_position_tokens, end_position_tokens):
    token2word[i] = f"Position_{i-start_position_tokens+1}/{POSITION_STEPS}"
token2word[end_position_tokens] = "Position-Triole_1"
token2word[end_position_tokens+1] = "Position-Triole_2"

word2token = {v: k for k, v in token2word.items()}
word2token

{'Bar_None': 0,
 'Note-On_60': 1,
 'Note-On_61': 2,
 'Note-On_62': 3,
 'Note-On_63': 4,
 'Note-On_64': 5,
 'Note-On_65': 6,
 'Note-On_66': 7,
 'Note-On_67': 8,
 'Note-On_68': 9,
 'Note-On_69': 10,
 'Note-On_70': 11,
 'Note-On_71': 12,
 'Note-On_72': 13,
 'Note-On_73': 14,
 'Note-On_74': 15,
 'Note-On_75': 16,
 'Note-On_76': 17,
 'Note-On_77': 18,
 'Note-On_78': 19,
 'Note-On_79': 20,
 'Note-On_80': 21,
 'Note-On_81': 22,
 'Note-On_82': 23,
 'Note-On_83': 24,
 'Note-On_84': 25,
 'Note-On_85': 26,
 'Note-On_86': 27,
 'Note-On_87': 28,
 'Note-On_88': 29,
 'Note-On_89': 30,
 'Note-On_90': 31,
 'Note-On_91': 32,
 'Note-On_92': 33,
 'Note-On_93': 34,
 'Note-On_94': 35,
 'Note-On_95': 36,
 'Note-Duration_1': 37,
 'Note-Duration_2': 38,
 'Note-Duration_3': 39,
 'Note-Duration_4': 40,
 'Note-Duration_5': 41,
 'Note-Duration_6': 42,
 'Note-Duration_7': 43,
 'Note-Duration_8': 44,
 'Note-Duration_9': 45,
 'Note-Duration_10': 46,
 'Note-Duration_11': 47,
 'Note-Duration_12': 48,
 'Note-Duration_13

In [3]:
with open('vocab.json', 'w') as fp:
    json.dump(word2token, fp)

In [4]:
dir = "17_POP909-Dataset-master"
file = "111.mid"
files,_ = get_file_and_dirnames(f'{PATH_TRANSPOSED}/c)_transposed_octave/{dir}')
files.sort()
files[:10]

['002.mid',
 '003.mid',
 '004.mid',
 '005.mid',
 '006.mid',
 '007.mid',
 '008.mid',
 '009.mid',
 '010.mid',
 '011.mid']

In [5]:
path = f'{PATH_TRANSPOSED}/c)_transposed_octave/{dir}/100.mid'
"""note_items = convert_to_note_items(path)
note_items_shifts = compute_shifts(note_items)
max_time = note_items_shifts[-1]["end"]
grouped_items = group_items(note_items_shifts, max_time)
events = item2event(grouped_items)
events"""

'note_items = convert_to_note_items(path)\nnote_items_shifts = compute_shifts(note_items)\nmax_time = note_items_shifts[-1]["end"]\ngrouped_items = group_items(note_items_shifts, max_time)\nevents = item2event(grouped_items)\nevents'

In [6]:
events = extract_events(path)
tokens = [word2token[f"{e['name']}_{e['value']}"] for e in events]
tokens

[0,
 112,
 17,
 38,
 114,
 17,
 38,
 116,
 17,
 38,
 0,
 102,
 17,
 38,
 104,
 13,
 40,
 106,
 10,
 44,
 101,
 107,
 13,
 40,
 110,
 8,
 38,
 111,
 10,
 38,
 112,
 12,
 40,
 114,
 15,
 38,
 116,
 15,
 38,
 0,
 102,
 8,
 40,
 104,
 10,
 76,
 101,
 112,
 10,
 40,
 114,
 17,
 38,
 116,
 17,
 40,
 0,
 102,
 17,
 40,
 104,
 13,
 40,
 106,
 10,
 38,
 107,
 15,
 42,
 110,
 15,
 38,
 111,
 10,
 38,
 112,
 13,
 38,
 114,
 13,
 40,
 116,
 15,
 40,
 0,
 102,
 17,
 40,
 104,
 17,
 76,
 101,
 112,
 17,
 38,
 114,
 15,
 38,
 116,
 17,
 40,
 0,
 102,
 22,
 38,
 104,
 22,
 38,
 106,
 17,
 40,
 108,
 15,
 38,
 110,
 13,
 38,
 111,
 15,
 38,
 112,
 17,
 40,
 114,
 20,
 40,
 116,
 20,
 40,
 0,
 102,
 15,
 40,
 104,
 17,
 40,
 106,
 13,
 44,
 112,
 10,
 40,
 114,
 17,
 38,
 116,
 17,
 40,
 0,
 102,
 17,
 40,
 104,
 13,
 40,
 106,
 17,
 40,
 108,
 12,
 38,
 110,
 8,
 38,
 111,
 10,
 38,
 112,
 12,
 40,
 114,
 15,
 38,
 116,
 8,
 40,
 0,
 102,
 10,
 38,
 104,
 10,
 52,
 112,
 22,
 52,
 101,
 116,
 22,
 40,


In [7]:
[f"{e['name']}_{e['value']}" for e in events]

['Bar_None',
 'Position_11/16',
 'Note-On_76',
 'Note-Duration_2',
 'Position_13/16',
 'Note-On_76',
 'Note-Duration_2',
 'Position_15/16',
 'Note-On_76',
 'Note-Duration_2',
 'Bar_None',
 'Position_1/16',
 'Note-On_76',
 'Note-Duration_2',
 'Position_3/16',
 'Note-On_72',
 'Note-Duration_4',
 'Position_5/16',
 'Note-On_69',
 'Note-Duration_8',
 'Note-Duration_triole',
 'Position_6/16',
 'Note-On_72',
 'Note-Duration_4',
 'Position_9/16',
 'Note-On_67',
 'Note-Duration_2',
 'Position_10/16',
 'Note-On_69',
 'Note-Duration_2',
 'Position_11/16',
 'Note-On_71',
 'Note-Duration_4',
 'Position_13/16',
 'Note-On_74',
 'Note-Duration_2',
 'Position_15/16',
 'Note-On_74',
 'Note-Duration_2',
 'Bar_None',
 'Position_1/16',
 'Note-On_67',
 'Note-Duration_4',
 'Position_3/16',
 'Note-On_69',
 'Note-Duration_40',
 'Note-Duration_triole',
 'Position_11/16',
 'Note-On_69',
 'Note-Duration_4',
 'Position_13/16',
 'Note-On_76',
 'Note-Duration_2',
 'Position_15/16',
 'Note-On_76',
 'Note-Duration_4',

In [8]:
token_dict = {}
for file in tqdm(files):
    path = f'{PATH_TRANSPOSED}/c)_transposed_octave/{dir}/{file}'
    events = extract_events(path)
    tokens = [word2token[f"{e['name']}_{e['value']}"] for e in events]
    token_dict[file] = tokens


100%|██████████| 803/803 [00:21<00:00, 37.09it/s]


In [9]:
token_dict_words = {}
for file in tqdm(files):
    path = f'{PATH_TRANSPOSED}/c)_transposed_octave/{dir}/{file}'
    events = extract_events(path)
    words = [f"{e['name']}_{e['value']}" for e in events]
    token_dict_words[file] = words

100%|██████████| 803/803 [00:21<00:00, 37.45it/s]


In [10]:
len(token_dict_words.keys())

803

In [11]:
with open('data.json', 'w') as fp:
    json.dump(token_dict, fp)

In [12]:
with open('data_words.json', 'w') as fp:
    json.dump(token_dict_words, fp)

In [13]:
def token_to_event(tokens, token2word):
    events = []
    for token in tokens:
        event_name, event_value = token2word.get(token).split('_')
        events.append({
            "name": event_name,
            "time": None,
            "value": event_value,
            "text": None
        })
    return events

def get_position_triole(flags, position, triole_position):
    if triole_position == 0:
        st = flags[position]
    elif triole_position == 1:
        st = flags[position] + TRIOLE_POS_1
    elif triole_position == 2:
        st = flags[position] + TRIOLE_POS_2
    return st

def write_midi(tokens, token2word, output_path):
    events = token_to_event(tokens, token2word)
    # get downbeat and note (no time)
    incorrect_notes = 0
    temp_notes = []
    for i in range(len(events)-3):
        if events[i]["name"] == 'Bar' and i > 0:
            temp_notes.append('Bar')
        elif events[i]["name"] == 'Position':
            # get position bin 
            position = int(events[i]["value"].split('/')[0]) - 1
            # get triole position
            if events[i+1]["name"] == 'Position-Triole':
                triole_position = int(events[i+1]["value"])
                n = 1
            else:
                triole_position = 0
                n = 0
            if events[i+n+1]["name"] == 'Note-On' and \
            events[i+n+2]["name"] == 'Note-Duration' and events[i+n+2]["value"] != 'triole':
                # pitch
                pitch = int(events[i+n+1]["value"])
            else:
                incorrect_notes += 1
                continue
            # duration
            if events[i+n+3]["name"] == 'Note-Duration' and events[i+n+3]["value"] == 'triole':
                index = int(events[i+n+2]["value"])-1
                duration = int(DURATION_BINS[index] / 3)
            else:
                index = int(events[i+n+2]["value"])-1
                duration = DURATION_BINS[index]
            # adding
            temp_notes.append([position, triole_position ,pitch, duration])
    # get specific time for notes
    ticks_per_bar = TICKS_PER_BEAT * 4 # assume 4/4
    notes = []
    current_bar = 0
    for note in temp_notes:
        if note == 'Bar':
            current_bar += 1
        else:
            position, triole_position, pitch, duration = note
            # position (start time)
            current_bar_st = current_bar * ticks_per_bar
            current_bar_et = (current_bar + 1) * ticks_per_bar
            flags = np.linspace(current_bar_st, current_bar_et, POSITION_STEPS, endpoint=False, dtype=int)
            st = get_position_triole(flags, position, triole_position)
            # duration (end time)
            et = st + duration
            notes.append(miditoolkit.Note(100, pitch, st, et))
    # write to midi
    midi = miditoolkit.midi.parser.MidiFile()
    midi.ticks_per_beat = TICKS_PER_BEAT
    # write instrument
    inst = miditoolkit.midi.containers.Instrument(0, is_drum=False)
    inst.notes = notes
    midi.instruments.append(inst)
        
    # write
    midi.dump(output_path)
    print("midi saved in {}".format(output_path))
    print("Number of incorrect notes: {}".format(incorrect_notes))
    return incorrect_notes

In [14]:
PATH_TEST = "../0_data/99_test"

# when paths not exist, create directories
if not os.path.exists(PATH_TEST):
    os.makedirs(PATH_TEST)

In [15]:
write_midi(tokens, token2word, f"{PATH_TEST}/test.mid")

midi saved in ../0_data/99_test/test.mid
Number of incorrect notes: 0


In [16]:
test_sequences = [[  0, 103, 118,   8, 103, 102,  13,  48, 115,   5,  40, 114, 119,  18,
           8,  38, 109,  15,  52,  84, 112,  20,  44,  40, 111,  12,   8,  38,
         108, 119, 118, 114,  17, 111,  12, 119,  15,  40,  20,  38,  29,  38,
         117,  12,  78, 101, 107,  12,  29,  12,  68,  17,  40,  52, 103,  15,
          44,  13,  38,  22,  46,  15, 118, 104, 119,  11, 104,  25,  40,  77,
          65,  10,  40,  46,   8, 119,  60, 113,  19,  29,  38, 101, 117,  17,
          23,  25,  59,  52,  12, 100,  95,  10,  38, 108,  13, 115,  44, 101,
         113,  17],
        [  0,  94,  44, 101, 111, 119,  15,  52,  65,  38, 103,  22,  17, 101,
         116,  42, 113, 116,  13,  38, 114,  44, 115,  44, 101,  41,  13,  25,
         102, 116,  25,  42, 112, 119,   5,  44,  12, 102,  15,  29, 109,  10,
          10,  10, 105,  15, 112, 119,  26,  13, 107,  12,  59,  48, 101,  44,
         115,  64,  38, 110,  12,  19,  32,   0,  46,   2,   8,  19,  40, 111,
          92,  26, 104,  17, 103, 106,   8,  11,  38,  69, 112, 118,  10,  29,
           6,  56,   8, 112, 106,  15,  19,  52, 104, 119,  42,   0,  55, 104,
         119,  18],
        [  0, 103,  24,  38, 112, 107,  92,  44,   5,  40,   8,  17,  59, 101,
         108,  17,   0,   7,  48, 109, 119,  16, 103, 105,  41,  40,   0,  60,
          15,  38, 106,   5, 116,  17,  40, 107,  17,  44, 110, 108, 109,  15,
          44, 101, 117,  13,  44,  61,  42, 108,  77,  13,   0, 102,  44, 101,
          77,  38, 111,   5,  40, 110,  16,  52, 101, 107,  12,  48,  38,  77,
         109,  18,  52, 104,  84,  26, 111, 112, 118,  25,  19, 113,  20,  38,
           0,   8,  13,  15, 110,  10,  48,  65, 111, 119, 114, 107, 119,  15,
          13,  76]]

write_midi(test_sequences[0], token2word, f"{PATH_TEST}/seq_0.mid")
write_midi(test_sequences[1], token2word, f"{PATH_TEST}/seq_1.mid")
write_midi(test_sequences[2], token2word, f"{PATH_TEST}/seq_2.mid")

midi saved in ../0_data/99_test/seq_0.mid
Number of incorrect notes: 13
midi saved in ../0_data/99_test/seq_1.mid
Number of incorrect notes: 20
midi saved in ../0_data/99_test/seq_2.mid
Number of incorrect notes: 17
