## Imports

In [1]:
import os
import shutil
import numpy as np
import pretty_midi
import pathlib
import collections
from miditok import REMI
import glob
import pickle

# Loading Data

In [2]:
data_dir = pathlib.Path('maestro-v2.0.0')

# Teilmenge des Datasets wählen

#

In [3]:

paths = glob.glob("maestro-v2.0.0/2004/*.mid*")# + glob.glob("maestro-v2.0.0/2006/*.mid*") + glob.glob("maestro-v2.0.0/2008/*.mid*")

## Lists Midi files with appropriate length and Timesignature

In [5]:
midi_files_dir = data_dir

# Criteria for selection
desired_time_signature = (4, 4)  # (numerator, denominator)
min_length_seconds = 100
max_length_seconds = 800

selected_files = []

for filepath in paths:
    try:
        # Load the MIDI file
        midi_data = pretty_midi.PrettyMIDI(filepath)

        # Check time signatures
        time_signatures = midi_data.time_signature_changes
        has_desired_time_signature = any(ts.numerator == desired_time_signature[0] and
                                            ts.denominator == desired_time_signature[1]
                                            for ts in time_signatures)

        # Check length
        length = midi_data.get_end_time()  # This returns the length in seconds
        if has_desired_time_signature and min_length_seconds <= length <= max_length_seconds:
        
            # Append filepath to list
            selected_files.append(filepath)

    except Exception as e:
        print(f"Error processing {filepath}: {e}")

# Print or use the selected files
for file in selected_files:
    print(file)

maestro-v2.0.0/2004\MIDI-Unprocessed_SMF_02_R1_2004_01-05_ORIG_MID--AUDIO_02_R1_2004_06_Track06_wav.midi
maestro-v2.0.0/2004\MIDI-Unprocessed_SMF_02_R1_2004_01-05_ORIG_MID--AUDIO_02_R1_2004_08_Track08_wav.midi
maestro-v2.0.0/2004\MIDI-Unprocessed_SMF_02_R1_2004_01-05_ORIG_MID--AUDIO_02_R1_2004_10_Track10_wav.midi
maestro-v2.0.0/2004\MIDI-Unprocessed_SMF_05_R1_2004_01_ORIG_MID--AUDIO_05_R1_2004_03_Track03_wav.midi
maestro-v2.0.0/2004\MIDI-Unprocessed_SMF_05_R1_2004_02-03_ORIG_MID--AUDIO_05_R1_2004_06_Track06_wav.midi
maestro-v2.0.0/2004\MIDI-Unprocessed_SMF_07_R1_2004_01_ORIG_MID--AUDIO_07_R1_2004_02_Track02_wav.midi
maestro-v2.0.0/2004\MIDI-Unprocessed_SMF_07_R1_2004_01_ORIG_MID--AUDIO_07_R1_2004_06_Track06_wav.midi
maestro-v2.0.0/2004\MIDI-Unprocessed_SMF_07_R1_2004_01_ORIG_MID--AUDIO_07_R1_2004_12_Track12_wav.midi
maestro-v2.0.0/2004\MIDI-Unprocessed_SMF_12_01_2004_01-05_ORIG_MID--AUDIO_12_R1_2004_07_Track07_wav.midi
maestro-v2.0.0/2004\MIDI-Unprocessed_SMF_12_01_2004_01-05_ORIG_MID-

## Converts Midi to tokens (safed as Json)

In [6]:
from miditok import REMI
from pathlib import Path


# Creates the tokenizer and list the file paths
tokenizer = REMI(sos_eos=True)
midi_paths = selected_files

# Converts MIDI files to tokens saved as JSON files
tokenizer.tokenize_midi_dataset(
    midi_paths,
    Path('data/only2004')
)




Tokenizing MIDIs (data/only2004): 100%|██████████| 101/101 [00:53<00:00,  1.88it/s]


## Learns BPETokenization

In [7]:
# Learns the vocabulary with BPE
tokenizer.learn_bpe(
    'data/only2004',
    2000,
    'data/only2004_BPE'
) 


Loading token files: 100%|██████████| 101/101 [00:02<00:00, 45.53it/s]
Learning byte pair encoding: 100%|██████████| 1780/1780 [49:42<00:00,  1.68s/it, seq_len_variation=-45.40, avg_nb_token_combs=2.63, max_nb_token_combs=3] 

Mean of original lengths: 15497.50495049505
Mean length after BPE: 8460.930693069307
Variation from original: -45.40 %





([2.0,
  2.0,
  2.0,
  2.0,
  2.0,
  2.0,
  2.0,
  2.0,
  2.0,
  2.0,
  2.0,
  2.0,
  2.0,
  2.0,
  2.0,
  2.0,
  2.0,
  2.0,
  2.0,
  2.0,
  2.0,
  2.0,
  2.0,
  2.0,
  2.0,
  2.0,
  2.0,
  2.0,
  2.0,
  2.0,
  2.0,
  2.0,
  2.0,
  2.0,
  2.0,
  2.0,
  2.0,
  2.0,
  2.0,
  2.0,
  2.0,
  2.0,
  2.0,
  2.0,
  2.0,
  2.0,
  2.0,
  2.0,
  2.0,
  2.0,
  2.0,
  2.0,
  2.0,
  2.0,
  2.0,
  2.0,
  2.0,
  2.0,
  2.0,
  2.0,
  2.0,
  2.0,
  2.0,
  2.0,
  2.0,
  2.0,
  2.0,
  2.0,
  2.0,
  2.0,
  2.0,
  2.0,
  2.0,
  2.0,
  2.0,
  2.0,
  2.0,
  2.0,
  2.0,
  2.0,
  2.0,
  2.0,
  2.0,
  2.0,
  2.0,
  2.0,
  2.0,
  2.0,
  2.0,
  2.0,
  2.0,
  2.0,
  2.0,
  2.0,
  2.0105263157894737,
  2.0104166666666665,
  2.020618556701031,
  2.020408163265306,
  2.0303030303030303,
  2.04,
  2.0396039603960396,
  2.049019607843137,
  2.0485436893203883,
  2.0576923076923075,
  2.057142857142857,
  2.056603773584906,
  2.0654205607476634,
  2.074074074074074,
  2.0825688073394497,
  2.081818181818182,
  2.0900900

## save tokenizer

In [8]:
with open('tokenizer_bpe2.pkl', 'wb') as f:
    pickle.dump(tokenizer, f)


## Load tokenizer

In [9]:
# Load the tokenizer from the file
with open('tokenizer_bpe2.pkl', 'rb') as f:
    tokenizer = pickle.load(f)



## Apply Tokenizer to Chosen Midi Files

In [None]:
from pathlib import Path

# Path to the directories
output_path = Path('data/only2008_BPE_2')
input_path = Path('data/only2008_2')

# Create the output directory if it does not exist
output_path.mkdir(parents=True, exist_ok=True)

# running function
tokenizer.apply_bpe_to_dataset(input_path, output_path)


In [None]:
DATA_FOLDER = "only2006_BPE_2"
BATCH_SIZE = 64
SEQUENCE_LENGTH = 100

VOC_SIZE = 2000 + 2 # achtung +2 warum auch immer

# transfomer
DIMS = 512
MAX_SEQ_LEN = 1000
NUM_HEADS = 8
N_BLOCKS = 4



# plus lr scheduler
TRAIN_STEPS = 1000
VAL_EXAMPLES = 100
EPOCHS = 4 # usually 10
LR = 5e-5

RUN_NAME = "First Try"

In [None]:
import zipfile
import tensorflow as tf

# Set the global policy to use mixed precision
tf.keras.mixed_precision.set_global_policy('mixed_float16')

import json
import numpy as np
from transformer_architecture import *
from glob import glob
from tensorflow.keras.optimizers import Adam
from tensorflow.keras.losses import SparseCategoricalCrossentropy
from tensorflow.keras.metrics import SparseCategoricalAccuracy
from tensorflow.keras.callbacks import ModelCheckpoint, TensorBoard, LearningRateScheduler
import datetime

# ideally auf github packen und importieren
class MusicDataset:
    def __init__(self, data_folder, sequence_length, voc_size, split_ratio=0.1, batch_size=64, mode='train'):
        self.data_folder = data_folder
        self.sequence_length = sequence_length
        self.split_ratio = split_ratio
        self.batch_size = batch_size
        self.mode = mode
        self.files = glob(f"{data_folder}/*.json")
        self.voc_size = voc_size

        # Determine split index
        split_index = int(len(self.files) * (1 - split_ratio))
        if mode == 'train':
            self.files = self.files[:split_index]
            np.random.shuffle(self.files)
        else:
            self.files = self.files[split_index:]

    def generator(self):
        for filename in self.files:
            tokens = self.extract_tokens(filename)
            num_sequences = len(tokens) - self.sequence_length -1

            for i in range(num_sequences):

                try:
                  input_seq = tokens[i:i + self.sequence_length]
                  target_seq = tokens[i + 1:i + 1 + self.sequence_length]

                  if input_seq.shape != target_seq.shape:
                    tf.print("\n false x,y shapes, skipping \n skipping")

                    continue


                  if tf.reduce_max(input_seq) >= self.voc_size or tf.reduce_max(target_seq) >= self.voc_size:

                    number = max(tf.reduce_max(input_seq), tf.reduce_max(target_seq))
                    tf.print(f"\n found to high number {number } in data  skipping \n" )
                    continue

                except Exception as e:
                    tf.print(f"\n exception while creating data {e }skipping \n" )
                    continue


                yield input_seq, target_seq

    def extract_tokens(self, filename) -> tf.Tensor:
        with open(filename, 'r') as file:
            data = json.load(file)
            return tf.convert_to_tensor(data["tokens"][0], dtype=tf.int32)

    def create_dataset(self):
        dataset = tf.data.Dataset.from_generator(
            self.generator,
            output_signature=(
                tf.TensorSpec(shape=(self.sequence_length,), dtype=tf.int32),
                tf.TensorSpec(shape=(self.sequence_length,), dtype=tf.int32)
            )
        )
        return dataset.batch(self.batch_size).prefetch(tf.data.experimental.AUTOTUNE)


In [None]:
# Create training and validation datasets
train_dataset = MusicDataset(DATA_FOLDER, SEQUENCE_LENGTH, VOC_SIZE, batch_size=BATCH_SIZE, mode='train')
val_dataset = MusicDataset(DATA_FOLDER, SEQUENCE_LENGTH, VOC_SIZE, batch_size=BATCH_SIZE, mode='validation')

train_data = train_dataset.create_dataset()
val_data = val_dataset.create_dataset()

In [None]:

class MultiHeadAttention(tf.keras.layers.Layer):
    def __init__(self, dims, num_heads, attn_size, dropout_prob):
        super(MultiHeadAttention, self).__init__()
        # Initialize the parameters
        self.num_heads = num_heads
        self.attn_size = attn_size
        self.dims = dims

        # Layers for QKV projection and output projection
        self.qkv_projection = tf.keras.layers.Dense(num_heads * attn_size * 3, use_bias=False)
        self.output_projection = tf.keras.layers.Dense(dims)
        self.dropout = tf.keras.layers.Dropout(dropout_prob)

    def call(self, qs, mask=None):
        # Compute batch size and sequence length from inputs
        batch_size = tf.shape(qs)[0]
        seq_length = tf.shape(qs)[1]

        # Project queries, keys, and values
        qkv = self.qkv_projection(qs)
        qkv = tf.reshape(qkv, [batch_size, seq_length, self.num_heads * 3, self.attn_size])
        qs, ks, vs = tf.split(qkv, 3, axis=2)

        # Compute scaled dot-product attention
        qs = tf.transpose(qs, [0, 2, 1, 3])
        ks = tf.transpose(ks, [0, 2, 3, 1])
        attn_product = tf.matmul(qs, ks) / tf.math.sqrt(tf.cast(self.attn_size, tf.float32))

        # Apply mask, if provided
        if mask is not None:

            mask = tf.broadcast_to(mask, [batch_size, self.num_heads, seq_length, seq_length])
            attn_product = tf.where(mask == 0, tf.fill(tf.shape(attn_product), -1e9), attn_product)

        # Softmax and dropout
        scores = tf.nn.softmax(attn_product, axis=-1)
        scores = self.dropout(scores)

        # Weighted sum of values
        vs = tf.transpose(vs, [0, 2, 1, 3])
        res = tf.matmul(scores, vs)

        # Reshape and project to output size
        res = tf.reshape(tf.transpose(res, [0, 2, 1, 3]), [batch_size, -1, self.num_heads * self.attn_size])
        output = self.output_projection(res)

        return output

class ExpandDense(tf.keras.layers.Layer):
    def __init__(self, d_model, factor = 4):
      super(ExpandDense, self).__init__()
      self.ffn1 = tf.keras.layers.Dense(d_model * factor, "silu")
      self.ffn2 = tf.keras.layers.Dense(d_model, "silu")

    @tf.function
    def call(self, x):

        return self.ffn2(self.ffn1(x))

class SelfAttentionBlock(tf.keras.layers.Layer):
    def __init__(self, d_model, num_heads, dropout_rate):
        super(SelfAttentionBlock, self).__init__()
        self.ffn = ExpandDense(d_model, factor = 4)
        self.attention = MultiHeadAttention(d_model, num_heads,d_model//num_heads, dropout_rate)

        self.norm1 = tf.keras.layers.LayerNormalization()
        self.norm2 = tf.keras.layers.LayerNormalization()

        self.dropout1 = tf.keras.layers.Dropout(dropout_rate)
        self.dropout2 = tf.keras.layers.Dropout(dropout_rate)

    def call(self, x, mask=None):
        # First sublayer with MultiHeadAttention
        attn_output = self.attention(self.norm1(x), mask)
        x = x + self.dropout1(attn_output)  # Apply dropout after adding residual connection

        # Second sublayer with Feed-Forward Network
        ffn_output = self.ffn(self.norm2(x))
        x = x + self.dropout2(ffn_output)  # Apply dropout after adding residual connection

        return x

# we decided to use tf.keras.layers.MultiHeadAttention since it was optimzed and faster
class SelfAttentionBlockKerasAtt(tf.keras.layers.Layer):
    def __init__(self, d_model, num_heads, dropout_rate):
        super(SelfAttentionBlockKerasAtt, self).__init__()
        self.ffn = ExpandDense(d_model, factor = 4)
        self.attention = tf.keras.layers.MultiHeadAttention(num_heads,d_model//num_heads, dropout = dropout_rate)

        self.norm1 = tf.keras.layers.LayerNormalization()
        self.norm2 = tf.keras.layers.LayerNormalization()

        self.dropout1 = tf.keras.layers.Dropout(dropout_rate)
        self.dropout2 = tf.keras.layers.Dropout(dropout_rate)

    def call(self, x, mask=None):
        # First sublayer with MultiHeadAttention
        x = self.norm1(x)
        attn_output = self.attention(x,x,x, mask)
        x = x + self.dropout1(attn_output)  # Apply dropout after adding residual connection

        # Second sublayer with Feed-Forward Network
        ffn_output = self.ffn(self.norm2(x))
        x = x + self.dropout2(ffn_output)  # Apply dropout after adding residual connection

        return x




class Transformer(tf.keras.Model):
    def __init__(self, voc_size, dims, maxseqlen, num_heads, dropout_rate, n_blocks):
        super(Transformer, self).__init__()

        self.posemb = PosEncode(dims, maxseqlen)
        self.embedding = tf.keras.layers.Embedding(voc_size,dims)
        self.blocks = [
            SelfAttentionBlockKerasAtt(dims, num_heads, dropout_rate) for _ in range(n_blocks)
        ]
        self.head = tf.keras.layers.Dense(voc_size)

    def encode(self, input):
        x = self.embedding(input,)
        return x + tf.cast(self.posemb(input),  x.dtype)

    def gen_mask(self, seq_len):
        return tf.linalg.band_part(tf.ones((seq_len, seq_len)), -1, 0)[None,None,:]

    @tf.function
    def call(self, x, mask = None):

        seq_len = x.shape[1]

        x = self.encode(x)

        if mask is None:
          mask = self.gen_mask(seq_len)

        for block in self.blocks:
            x = block(x, mask)

        return self.head(x)


In [None]:
model = Transformer(VOC_SIZE, DIMS, MAX_SEQ_LEN, NUM_HEADS, 0.0, N_BLOCKS)


optimizer = Adam(LR,clipnorm=1.0)
loss_fn = SparseCategoricalCrossentropy(from_logits=True)
metrics = [SparseCategoricalAccuracy(name='accuracy')]

model.compile(optimizer=optimizer, loss=loss_fn, metrics=metrics)
x,y = list(train_data.take(1))[0]
model(x).shape

In [None]:
class ZipCallback(tf.keras.callbacks.Callback):
    def __init__(self, base_dir, run_name, logs_dir, checkpoint_dir):
        super().__init__()
        self.base_dir = base_dir
        self.run_name = run_name
        self.zip_path = os.path.join(base_dir, run_name, "run_data.zip")
        self.logs_dir = logs_dir
        self.checkpoint_dir = checkpoint_dir

    def on_epoch_end(self, epoch, logs=None):

        # Create a zip file and write directories to it
        with zipfile.ZipFile(self.zip_path, 'w', zipfile.ZIP_DEFLATED) as zipf:
            # Zip the logs directory
            for root, dirs, files in os.walk(self.logs_dir):
                for file in files:
                    file_path = os.path.join(root, file)
                    arcname = os.path.relpath(file_path, os.path.join(self.logs_dir, os.pardir))
                    zipf.write(file_path, arcname)

            # Zip the checkpoint directory
            for root, dirs, files in os.walk(self.checkpoint_dir):
                for file in files:
                    file_path = os.path.join(root, file)
                    arcname = os.path.relpath(file_path, os.path.join(self.checkpoint_dir, os.pardir))
                    zipf.write(file_path, arcname)
        zip_size_bytes = os.path.getsize(self.zip_path)
        zip_size_mb = zip_size_bytes / (1024 * 1024)  # Convert bytes to megabytes

        tf.print(f"\n Epoch {epoch + 1}: Data zipped and saved to {self.zip_path} is {zip_size_mb} mb big\n")

In [None]:
%load_ext tensorboard
%tensorboard --logdir runs

In [None]:
# Base directory for all runs
base_dir = "runs"

# Specific run directories for logs and checkpoints
run_logs_dir = os.path.join(base_dir, RUN_NAME, "logs", datetime.datetime.now().strftime("%Y%m%d-%H%M%S"))
run_checkpoint_dir = os.path.join(base_dir, RUN_NAME, "checkpoint")
zip_callback = ZipCallback(base_dir, RUN_NAME, os.path.join(base_dir, RUN_NAME, "logs"), run_checkpoint_dir )



# Create directories if they do not exist
os.makedirs(run_logs_dir, exist_ok=True)
os.makedirs(run_checkpoint_dir, exist_ok=True)

# Setup TensorBoard logging
tensorboard_callback = TensorBoard(log_dir=run_logs_dir, histogram_freq=1)

# Setup Model Checkpointing
checkpoint_filepath = os.path.join(run_checkpoint_dir, 'checkpoint.weights.h5')
model_checkpoint_callback = ModelCheckpoint(
    filepath=checkpoint_filepath,
    save_weights_only=True,
    monitor='val_loss',
    mode='min',
    save_best_only=True)

train_repeated = train_data.repeat()
val_smaller = val_data.take(VAL_EXAMPLES)     # take leads to errors in generator maybe different apporach or repeat and val steps per epoch

# Run the training loop
history = model.fit(
    train_repeated,
    steps_per_epoch= TRAIN_STEPS,
    validation_data=val_smaller,
    epochs=EPOCHS,
    callbacks=[tensorboard_callback, model_checkpoint_callback, zip_callback]
)

## Saving Model

In [None]:
import shutil

final_model_path = os.path.join(run_checkpoint_dir, 'final_model')
model.save(final_model_path, save_format='tf')
print(f"Model saved successfully in TensorFlow SavedModel format at {final_model_path}")

# Path to your saved model
saved_model_path = os.path.join(run_checkpoint_dir, 'final_model')

# Path for the zip archive
zip_path = os.path.join(run_checkpoint_dir, 'final_model.zip')

# Compress the entire directory
shutil.make_archive(zip_path.replace('.zip', ''), 'zip', saved_model_path)
print(f"Model compressed and saved as {zip_path}")