<a href="https://colab.research.google.com/github/JCalisso/JCalisso/blob/main/Lofi_AI.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

# Lofi? More like Lo-success. This isn't very good.

Ah shit, here we go again - a coding project that is *way* more difficult than I expected and I expected it to be pretty difficult. Inspired by [this guide](https://towardsdatascience.com/how-to-generate-music-using-a-lstm-neural-network-in-keras-68786834d4c5) by Sigurður Skúli, let's try to use an LSTM to generate a jazzy lo-fi instrumental melody.

**Goal:** Use machine learning to generate the fundamental building blocks of a lofi song (a drum beat and a jazzy instrumental). To be more precise, lofi music is usually 70-95 bpm with an authentic, organic, degraded, boom bap kinda sound (pioneered by J Dilla).

If you want to play around with this notebook, just duplicate it to your own colab space or look through the code and steal stuff. Not sure it's worth stealing though. The output is rather garbage.


## Set up

In [None]:
#@title Import Libraries
#@markdown Some things you'll need to do the stuff in here.
import glob
import numpy as np
import pandas as pd
import music21 as m
from google.colab import files

from keras.models import Model
from keras.layers import Input, LSTM, Dense, Dropout, Activation
from keras.optimizers import RMSprop
from keras.utils import np_utils
from keras.callbacks import ModelCheckpoint

In [None]:
#@title Helper Functions
#@markdown Just some functions because good coding practice is important or something. Idk. I'm really only doing this because I know it's going to be an exclusive and other people will see it and i don't want to be shamed.
def read_midi(path, printout = False):
  """
  Parse the MIDI file at `path` and collect its notes and chords in stream order.

  Returns a `(notes, dur)` pair of equal-length lists. `notes` holds one string
  per element: the pitch for a single note, or the dot-joined normal order for
  a chord. `dur` holds the matching quarter length as a float. When `printout`
  is True, each collected element is printed alongside its quarter length.
  """
  score = m.converter.parse(path)
  by_instrument = m.instrument.partitionByInstrument(score)

  # Use the first instrument part when partitioning found any; otherwise fall
  # back to the flattened note stream.
  elements = by_instrument.parts[0].recurse() if by_instrument else score.flat.notes

  notes, dur = [], []
  for element in elements:
    if isinstance(element, m.note.Note):
      label = str(element.pitch)
    elif isinstance(element, m.chord.Chord):
      label = '.'.join(str(pitch_class) for pitch_class in element.normalOrder)
    else:
      # anything that is neither a note nor a chord is ignored
      continue

    notes.append(label)
    dur.append(float(element.duration.quarterLength))
    if printout:
      print(label, element.duration.quarterLength)

  return notes, dur

def create_in_out(notes, dur, sequence_length):
  """
  Build sliding-window training data for the two-headed LSTM.

  Every window of `sequence_length` consecutive (note, duration) pairs becomes
  one input pattern; the pair that follows the window is its target.

  Args:
    notes: list of note/chord strings.
    dur: list of durations (quarter lengths), same length as `notes`.
    sequence_length: number of time steps per input pattern.

  Returns:
    network_input: float array of shape (n_patterns, sequence_length, 2, 1).
      Channel 0 is the note's index in the sorted vocabulary divided by the
      vocabulary size; channel 1 is the raw duration divided by the number of
      distinct durations.
    note_encode: float32 one-hot array of shape (n_patterns, n_notes).
    dur_encode: float32 one-hot array of shape (n_patterns, n_dur), where the
      class of a duration is its position in `sorted(set(dur))`.

  Raises:
    ValueError: if `notes` and `dur` differ in length, or there are not more
      than `sequence_length` elements to build at least one pattern from.
  """
  if len(notes) != len(dur):
    raise ValueError('notes and dur must have the same length '
                     '(got {} and {}).'.format(len(notes), len(dur)))

  n_patterns = len(notes) - sequence_length
  if n_patterns <= 0:
    raise ValueError('Need more than sequence_length={} notes to build a training '
                     'pattern, but only got {}.'.format(sequence_length, len(notes)))

  # vocabularies: position in the sorted unique values is the class index
  pitchnames = sorted(set(notes))
  durnames = sorted(set(dur))
  n_notes = len(pitchnames)
  n_dur = len(durnames)
  note_to_int = {note: number for number, note in enumerate(pitchnames)}
  dur_to_int = {value: number for number, value in enumerate(durnames)}

  # normalized per-step features, shape (len(notes), 2)
  note_ids = np.array([note_to_int[note] for note in notes], dtype='float64')
  dur_values = np.array(dur, dtype='float64')
  features = np.stack([note_ids / float(n_notes), dur_values / float(n_dur)], axis=1)

  # one window per pattern, then add the trailing unit axis the model is fed with
  network_input = np.stack([features[i:i + sequence_length] for i in range(n_patterns)])
  network_input = np.reshape(network_input, (n_patterns, sequence_length, 2, 1))

  # Targets are the elements right after each window. Durations are mapped to
  # their class index first: one-hot encoding the raw quarter lengths would
  # merge e.g. 0.25 and 0.5 into class 0 and overflow for lengths >= n_dur.
  note_targets = [note_to_int[note] for note in notes[sequence_length:]]
  dur_targets = [dur_to_int[value] for value in dur[sequence_length:]]

  # Fixed widths (n_notes / n_dur) keep the targets aligned with the model's
  # output layers even when the highest class never occurs as a target.
  note_encode = np.eye(n_notes, dtype='float32')[note_targets]
  dur_encode = np.eye(n_dur, dtype='float32')[dur_targets]

  return network_input, note_encode, dur_encode

def model_maker(sequence_length):
  """
  Build and compile the two-output LSTM network.

  The input has shape (sequence_length, 2). Three stacked 512-unit LSTM layers
  (with dropout between them) feed two parallel heads, each Dense(256) ->
  Dropout -> softmax: one over the distinct notes, one over the distinct
  durations. The head sizes are read from the module-level `notes` and `dur`
  lists, so those must exist before this is called.

  Returns the compiled model (categorical crossentropy, RMSprop).
  """
  note_classes = len(set(notes))
  dur_classes = len(set(dur))

  def _softmax_head(features, n_classes):
    # Dense(256) -> Dropout(0.3) -> softmax over n_classes
    hidden = Dense(256)(features)
    hidden = Dropout(0.3)(hidden)
    return Dense(n_classes, activation="softmax")(hidden)

  sequence_in = Input(shape = (sequence_length,2,))

  # recurrent trunk shared by both heads
  trunk = LSTM(512, return_sequences=True)(sequence_in)
  trunk = Dropout(0.3)(trunk)
  trunk = LSTM(512, return_sequences=True)(trunk)
  trunk = Dropout(0.3)(trunk)
  trunk = LSTM(512)(trunk)

  # note head is built before the duration head
  note_out = _softmax_head(trunk, note_classes)
  dur_out = _softmax_head(trunk, dur_classes)

  network = Model(inputs=sequence_in, outputs=[note_out, dur_out])
  network.compile(loss='categorical_crossentropy',
                  optimizer=RMSprop(learning_rate=0.01))
  return network

def model_fitter(model, network_input, note_encode, dur_encode, epochs=20, batch_size=64):
  """
  Fit `model` on `network_input` against the note and duration targets.

  A ModelCheckpoint callback monitors the training loss (mode 'min',
  save_best_only) and writes to a file named after the epoch and loss.
  Returns None.
  """
  best_loss_checkpoint = ModelCheckpoint(
      "weights-{epoch:02d}-{loss:.4f}.hdf5",
      monitor='loss',
      verbose=0,
      save_best_only=True,
      mode='min',
  )
  model.fit(
      network_input,
      [note_encode, dur_encode],
      epochs=epochs,
      batch_size=batch_size,
      callbacks=[best_loss_checkpoint],
  )

def sample(preds, temperature=1.0):
    """
    Draw a class index from a probability vector, sharpened or flattened by
    `temperature`.

    The LSTM will always get caught in a loop of notes if you just take the
    argmax, so the index is sampled from `preds ** (1 / temperature)`
    (renormalized) instead.

    Args:
        preds: array-like of non-negative probabilities; any shape, it is
            flattened first.
        temperature: values below 1 favour the most likely classes, values
            above 1 flatten the distribution. A temperature <= 0 means
            "no randomness" and returns the argmax.

    Returns:
        The sampled index into the flattened `preds`.

    Raises:
        ValueError: if `preds` is empty.
    """
    probs = np.asarray(preds, dtype='float64').reshape(-1)
    if probs.size == 0:
        raise ValueError('preds must contain at least one probability.')

    # Greedy choice instead of dividing by zero.
    if temperature <= 0:
        return np.argmax(probs)

    # log(0) is a legitimate -inf here (probability stays zero), so silence
    # the divide warning rather than letting it spam the notebook.
    with np.errstate(divide='ignore'):
        logits = np.log(probs) / temperature

    peak = np.max(logits)
    if not np.isfinite(peak):
        # every probability was zero (or invalid); nothing sensible to sample
        return np.argmax(probs)

    # Subtracting the maximum keeps exp() from underflowing every entry to 0
    # at small temperatures, which would otherwise give 0/0 = NaN.
    weights = np.exp(logits - peak)
    weights = weights / np.sum(weights)

    draw = np.random.multinomial(1, weights, 1)
    return np.argmax(draw)

def predict_notes(network_input,notes,dur,n_gen, n_temp=0.15, d_temp=1):
    """
    Generate `n_gen` (note, duration) pairs by repeatedly sampling the model.

    A random window of `network_input` seeds the generation. At every step the
    current window is fed to the module-level `model`, a note class and a
    duration class are sampled from its two outputs with `sample`, and the
    window slides forward by one step to include the new pair.

    Args:
        network_input: training input as returned by `create_in_out`, i.e.
            already normalized windows of shape (n_patterns, seq_len, 2, 1).
        notes: the full list of note strings used to build the vocabulary.
        dur: the full list of durations used to build the vocabulary.
        n_gen: number of pairs to generate.
        n_temp: sampling temperature for the note output.
        d_temp: sampling temperature for the duration output.

    Returns:
        A list of `[note_string, duration]` lists, in generation order.
    """
    # class index -> value lookups; index is the position in the sorted vocabulary
    pitchnames = sorted(set(notes))
    durnames = sorted(set(dur))
    n_notes = len(pitchnames)
    n_dur = len(durnames)

    # randint's upper bound is exclusive, so this can pick any window
    start = np.random.randint(len(network_input))

    # The seed is already in normalized feature space, so the window is kept
    # normalized throughout and never rescaled again.
    pattern = np.asarray(network_input[start], dtype='float64').reshape(-1, 2)
    prediction_output = []

    # generate notes
    for _ in range(n_gen):
        prediction = model.predict(pattern[np.newaxis, :, :], verbose=0)

        # sample (rather than argmax) a note, then a duration
        note_index = sample(prediction[0], n_temp)
        dur_index = sample(prediction[1], d_temp)
        note_result = pitchnames[note_index]
        dur_result = durnames[dur_index]
        prediction_output.append([note_result, dur_result])

        # Encode the new step exactly like the training features: note index
        # over vocabulary size, raw duration over number of distinct durations.
        next_step = [note_index / float(n_notes), float(dur_result) / float(n_dur)]

        # slide the window: drop the oldest step, append the newest
        pattern = np.vstack([pattern[1:], next_step])
    return prediction_output

def repeat_agg(preds, note_extender):
  """
  Lengthen every prediction and merge runs of the same note.

  Args:
    preds: list of `[note, duration]` pairs. It is NOT modified.
    note_extender: amount added to every duration before merging.

  Returns:
    A new list of `[note, duration]` lists in which each run of consecutive
    identical notes is collapsed into one entry whose duration is the sum of
    the (extended) durations in that run.
  """
  merged = []
  for pair in preds:
    note, extended = pair[0], pair[1] + note_extender
    if merged and merged[-1][0] == note:
      # same note as the previous entry: fold it into that entry
      merged[-1][1] += extended
    else:
      merged.append([note, extended])
  return merged

def create_midi(prediction_output, midi_name, note_extender = 4, offset_increase = 1):
  """
  Turn generated predictions into a MIDI file named `<midi_name>.mid`.

  The predictions are first passed through `repeat_agg` with `note_extender`.
  Each resulting entry becomes a chord (when its name contains a '.' or is all
  digits) or a single note, each tagged with a Piano instrument and placed
  `offset_increase` after the previous entry. Returns whatever the stream's
  `write` call returns.
  """
  def _piano_note(pitch, length):
    # one note of the given length, tagged as piano
    built = m.note.Note(pitch, quarterLength=length)
    built.storedInstrument = m.instrument.Piano()
    return built

  output_notes = []
  position = 0
  for entry in repeat_agg(prediction_output, note_extender):
    name = entry[0]
    length = float(entry[1])

    if ('.' in name) or name.isdigit():
      # chord: the name is a dot-separated list of integer pitches
      members = [_piano_note(int(part), length) for part in name.split('.')]
      element = m.chord.Chord(members)
    else:
      # single note: the name is the pitch itself
      element = _piano_note(name, length)

    element.offset = position
    output_notes.append(element)
    # advance the offset every iteration so that notes do not stack
    position += offset_increase

  midi_stream = m.stream.Stream(output_notes)
  return midi_stream.write('midi', fp='{}.mid'.format(midi_name))

## Make Your Model

In [None]:
#@title Data Preparation
#@markdown What to do:
#@markdown 1. Upload single instrument MIDI samples to your Colab notebook (keep in mind that these get deleted after every session. If you have a bunch, it might be worth mounting your drive and saving a path). Theoretically, the more samples you have, the better your output will sound.
#@markdown 2. Read your midi samples and concatenate the results into two lists: one for notes, another for durations.
#@markdown 3. Create model inputs and outputs based on `sequence_length`. You should play around with the sequence length depending on your midi imports. Longer sequences might give your machine the opportunity to learn more structural patterns. Maybe.

path = "" #@param {type:"string"}
sequence_length =  16#@param {type:"integer"}

#@markdown Do you want to see the notes as they're read?
printout = False #@param {type:"boolean"}

# An empty path means "the current working directory". Blindly appending
# '/*.mid' to it would search the filesystem root instead.
midi_pattern = path.rstrip('/') + '/*.mid' if path else '*.mid'

notes = []
dur = []
# read every midi file in your path (sorted so the note order is reproducible)
for file in sorted(glob.glob(midi_pattern)):
  file_notes, file_dur = read_midi(file, printout = printout)
  notes += file_notes
  dur += file_dur

# fail here with a readable message instead of deep inside create_in_out
if not notes:
  raise ValueError('No notes were read from "{}". Check `path` and make sure it contains .mid files.'.format(midi_pattern))

network_input, note_encode, dur_encode = create_in_out(notes, dur, sequence_length)

In [None]:
#@title Train your Model
#@markdown The following code saves a weights file each time the training loss (categorical crossentropy) improves.
#@markdown Once training is done, you should probably download the best set of weights so you don't need to train the model every time you want to make predictions.
#@markdown
#@markdown Also worth heading into notebook settings and getting a GPU accelerator to make this process a little less miserable.
#@markdown
#@markdown You can google what these mean but basically training time increases when epoch goes up and/or batch size goes down
epochs = 30 #@param {type:"slider", min:10, max:100, step:10}
batch_size = 64 #@param {type:"slider", min:12, max:128, step:4}
# Build a fresh, untrained network; model_maker sizes its outputs from the
# `notes` and `dur` lists created in the data preparation cell.
model = model_maker(sequence_length)
# Train in place; checkpoints are written by the callback inside model_fitter.
model_fitter(model, network_input, note_encode, dur_encode, epochs=epochs, batch_size=batch_size)

## Make Some Predictions

In [None]:
#@title Load model
#@markdown If you have a .hdf5 file of model weights from previous training, drop that path here.
#@markdown If you don't, train your model.
#@markdown
#@markdown Make sure you haven't altered the `model_maker` function or `sequence_length` parameter between training and loading, otherwise the model weights won't make sense.
weight_path = "" #@param {type:"string"}

# build model
# NOTE: model_maker reads the global `notes` and `dur` lists to size its
# output layers, so the data preparation cell has to be run before this one.
model = model_maker(sequence_length)

# load weights from training (this replaces whatever `model` held before)
model.load_weights(weight_path)

In [None]:
#@title Make Some Music
#@markdown Adjust the following settings, run the code block, and you'll be prompted to download a MIDI file of your sweet, sweet tunes.
#
#@markdown Higher temperature will make predictions whackier.
note_temperature = 0.5 #@param {type:"slider", min:0, max:1, step:0.01}
duration_temperature = 5 #@param {type:"slider", min:0, max:5, step:0.5}
#@markdown How many notes do you want to generate?
n_gen = 46 #@param {type:"slider", min:16, max:500, step:10}
#@markdown What do you want your midi file to be named?
midi_name = "drum loop" #@param {type:"string"}
#@markdown How much (in quarter lengths) should be added to every generated duration before repeated notes are merged?
note_extender = 4 #@param {type:"number"}
#@markdown How far apart should consecutive notes/chords be placed in the MIDI file?
offset_increase = 1 #@param {type:"number"}

# sample n_gen (note, duration) pairs from the model held in `model`
prediction_output = predict_notes(network_input,notes,dur,n_gen, n_temp=note_temperature, d_temp=duration_temperature)
# write them to '<midi_name>.mid'; note_extender / offset_increase are the
# form values above (same defaults as create_midi's own)
mf = create_midi(prediction_output, midi_name, note_extender=note_extender, offset_increase = offset_increase)

files.download(mf) 

# Archive

In [None]:
#@title Archived model training
#@markdown This is some of the code that I used while building this thing out. It's basically just a bunch of things that should've been helper functions in the first place if I had more foresight. But unfortunately, I need glasses.

# Archived, script-style version of the MIDI reading step: walks every .mid
# file in /content/samples and collects note/chord strings and durations.
# NOTE(review): `converter`, `instrument`, `note` and `chord` are used as bare
# names below, but the imports visible in this file only bind music21 as `m`.
# Presumably this cell predates that import style - confirm before running.

# load midi data into stream objects
notes = [] 
#offset = []
dur = []

for file in glob.glob("/content/samples/*.mid"):
    midi = converter.parse(file)
    notes_to_parse = None    
    parts = instrument.partitionByInstrument(midi)    
    if parts: # file has instrument parts
        notes_to_parse = parts.parts[0].recurse()
    else: # file has notes in a flat structure
        notes_to_parse = midi.flat.notes    

    # use stream data to convert to string
    for e in notes_to_parse:
        if isinstance(e, note.Note):
            # single note: store the pitch as a string
            notes.append(str(e.pitch))
#            offset.append(e.offset)
            dur.append(e.duration.quarterLength)
        elif isinstance(e, chord.Chord):
            # chord: store its normal order joined with dots
            notes.append('.'.join(str(n) for n in e.normalOrder))
#            offset.append(e.offset)
            dur.append(e.duration.quarterLength)

# durations become a float32 array (notes stays a plain list of strings)
dur = np.asarray(dur, dtype = 'float32')

# map string data to integer data for ML
# NOTE: this rebinds the global `sequence_length` to 50.
sequence_length = 50

# get all pitch names
pitchnames = sorted(set(item for item in notes))
durnames = sorted(set(item for item in dur))

# create a dictionary to map pitches to integers (for the sake of normalization)
note_to_int = dict((note, number) for number, note in enumerate(pitchnames))
network_input = []
network_output = []

# create input sequences and the corresponding outputs
# each window of `sequence_length` (note index, duration) pairs is one input;
# the pair right after the window is its target
for i in range(0, len(notes) - sequence_length, 1):
    sequence_in = notes[i:i + sequence_length]
    sequence_in = [note_to_int[char] for char in sequence_in]
    sequence_out = notes[i + sequence_length]
    sequence_out = note_to_int[sequence_out]
    dur_in = dur[i:i + sequence_length]
    dur_out = dur[i + sequence_length]
    network_input.append([(a,b) for a,b in zip(sequence_in, dur_in)])
    network_output.append((sequence_out, dur_out))
    
n_patterns = len(network_input)
network_input = np.asarray(network_input)
network_output = np.asarray(network_output)

# normalize inputs
# note indices are divided by the vocabulary size; durations are divided by
# the number of distinct durations
n_notes = len(set(notes))
n_dur = len(set(dur))
network_input[:,:,0] = network_input[:,:,0] / float(n_notes)

#network_input[:,1,:] = network_input[:,1,:] / float(n_offset)
network_input[:,:,1] = network_input[:,:,1] / float(n_dur)

# one hot encode the note output information since it is categorical
note_encode = np_utils.to_categorical(network_output[:,0])

# one hot encode the duration output information since you can argue it's categorical (due to music theory or whatever lmao)
# NOTE(review): column 1 of network_output holds the duration values
# themselves (dur_out above), not indices into `durnames` - confirm how
# to_categorical treats non-integer labels and labels >= n_dur.
dur_encode = np_utils.to_categorical(network_output[:,1], num_classes=n_dur)

# reshape data for LSTM input
network_input = np.reshape(network_input,(n_patterns, sequence_length,2,1))


# Archived, inline version of the model construction and training steps.
# Input: one window of `sequence_length` steps with 2 features per step.
input_layer = Input(shape = (sequence_length,2,))

# three stacked 512-unit LSTM layers with dropout in between
encoder = LSTM(512, return_sequences=True)(input_layer)

x = Dropout(0.3)(encoder)
x = LSTM(512, return_sequences=True)(x)
x = Dropout(0.3)(x)
x = LSTM(512)(x)

# note head: softmax over the n_notes distinct notes
note_dense = Dense(256)(x)
note_decoder = Dropout(0.3)(note_dense)
note_out=Dense(n_notes, activation="softmax")(note_decoder)

# duration head: softmax over the n_dur distinct durations
dur_dense = Dense(256)(x)
dur_dedcoder = Dropout(0.3)(dur_dense)
dur_out=Dense(n_dur, activation="softmax")(dur_dedcoder)

model = Model(inputs=input_layer, outputs=[note_out,dur_out])
# NOTE(review): the name `keras` is not bound by the imports visible in this
# file (RMSprop is imported directly) - confirm before running.
opt = keras.optimizers.RMSprop(learning_rate=0.01)
model.compile(loss='categorical_crossentropy', optimizer=opt)

# train the model
# the checkpoint callback monitors the training loss (mode 'min', save_best_only)
filepath = "weights-improvement-v1-seq50-b64-{epoch:02d}-{loss:.4f}.hdf5"    
checkpoint = ModelCheckpoint(
    filepath, monitor='loss', 
    verbose=0,        
    save_best_only=True,        
    mode='min'
) 
callbacks_list = [checkpoint]     
model.fit(network_input, [note_encode, dur_encode], 
          epochs=100, batch_size=64, 
          callbacks = callbacks_list)

# Archived: rebuild the same architecture from scratch so that saved weights
# can be loaded into it. This rebinds `model` to a new, untrained network.
input_layer = Input(shape = (sequence_length,2,))

# three stacked 512-unit LSTM layers with dropout in between
encoder = LSTM(512, return_sequences=True)(input_layer)

x = Dropout(0.3)(encoder)
x = LSTM(512, return_sequences=True)(x)
x = Dropout(0.3)(x)
x = LSTM(512)(x)

# note head: softmax over the n_notes distinct notes
note_dense = Dense(256)(x)
note_decoder = Dropout(0.3)(note_dense)
note_out=Dense(n_notes, activation="softmax")(note_decoder)

# duration head: softmax over the n_dur distinct durations
dur_dense = Dense(256)(x)
dur_dedcoder = Dropout(0.3)(dur_dense)
dur_out=Dense(n_dur, activation="softmax")(dur_dedcoder)

model = Model(inputs=input_layer, outputs=[note_out,dur_out])
# NOTE(review): the name `keras` is not bound by the imports visible in this
# file (RMSprop is imported directly) - confirm before running.
opt = keras.optimizers.RMSprop(learning_rate=0.01)
model.compile(loss='categorical_crossentropy', optimizer=opt)

# Load the weights to each node
# `weight` is left empty here; it has to be set to a saved weights file first
weight = ''
model.load_weights(weight)