# Guitar Melody Transcription Project
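This notebook contains the code for a machine learning project whose goal is to transcribe guitar melodies from a recording. The process involves generating random MIDI melodies, rendering them to audio with various soundfonts, slicing the spectrogram of each recording into short time windows, and feeding these slices into a model that recreates the MIDI notes active in each window as one-hot-style encoded arrays (one entry per pitch, so several entries may be set at once).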

## Section 1: MIDI Generation

In [ ]:
import random
from music21 import stream, note, clef, midi, instrument

# Constants
LOWEST_NOTE = 40  # MIDI number for E2
HIGHEST_NOTE = 88  # MIDI number for E6
NOTE_PROBABILITY = 0.7  # Probability of a note being played instead of a rest
NUM_MEASURES = 4
NOTES_PER_MEASURE = 16  # 16 sixteenth-notes per measure
DURATIONS = [0.25, 0.5, 1.0, 2.0, 4.0]  # 16th, 8th, quarter, half, whole
MAX_ACTIVE_NOTES = 5

def biased_note_choice():
    """Randomly select note, but certain note ranges are more favored."""
    ranges = {'low': (40, 60), 'middle': (61, 73), 'high': (74, 88)}
    probabilities = {'low': 0.5, 'middle': 0.35, 'high': 0.15}
    selected_range = random.choices(list(ranges.keys()), weights=list(probabilities.values()), k=1)[0]
    return random.randint(*ranges[selected_range])

def generate_melody():
    """Generate a random melody with constraints for note range and duration."""
    melody = stream.Part()
    melody.append(clef.TrebleClef())
    
    # Guitar instrument lines up with Bank 0 preset 24
    gen_inst = instrument.Guitar()
    melody.insert(0, gen_inst)

    current_time = 0.0
    max_time = NUM_MEASURES * 4.0  # time is measured in quarter note proportions
    
    active_notes = []  # lets us keep track of num notes active

    while current_time < max_time:
        if random.random() < NOTE_PROBABILITY:  # If note (else rest)
            note_duration = random.choice(DURATIONS)
            if current_time + note_duration > max_time:
                note_duration = max_time - current_time

            midi_note = biased_note_choice()  # returns int note id
            new_note = note.Note(midi=midi_note, quarterLength=note_duration)
            active_notes = [(start, end, n) for start, end, n in active_notes if end > current_time]
            
            if len(active_notes) < MAX_ACTIVE_NOTES:  # If we can insert note (else rest)
                melody.insert(current_time, new_note)
                active_notes.append((current_time, current_time + note_duration, new_note))
            else:
                melody.insert(current_time, note.Rest(quarterLength=0.25))
        else:
            melody.insert(current_time, note.Rest(quarterLength=0.25))
        current_time += 0.25  # increment one 16th

    return melody

def generate_random_basic(filename):
    """Generate a random MIDI file and save it to disk."""
    melody = generate_melody()

    # Save to file
    midi_filename = 'MIDIs/' + filename + '.mid'
    mf = midi.translate.music21ObjectToMidiFile(melody)
    mf.open(midi_filename, 'wb')
    mf.write()
    mf.close()

## Section 2: MIDI to WAV Conversion

In [ ]:
import subprocess

def convert(filename, soundfont_path):
    """Convert a MIDI file to WAV using FluidSynth."""
    # Paths
    midi_file_path = 'MIDIs/'+filename+'.mid'
    output_wav_path = 'WAVs/'+filename+'.wav'

    # FluidSynth command
    command = [
        'fluidsynth',
        '-ni',
        soundfont_path,
        midi_file_path,
        '-F', output_wav_path,
        '-r', '44100'
    ]

    # Run the FluidSynth command
    result = subprocess.run(command, capture_output=True, text=True)

    # Check result
    if result.returncode == 0:
        print('MIDI has been successfully converted to WAV.')
    else:
        print('Error converting MIDI to WAV:')
        print(result.stderr)

## Section 3: Tensor Generation

In [ ]:
from scipy.io import wavfile
from scipy.signal import spectrogram, butter, filtfilt
from PIL import Image
import numpy as np
import mido

# ---WAV PROCESSING---

def normalize_array(array):
    if np.ptp(array) == 0:  # Check if the array has zero range
        return np.zeros(array.shape, dtype=np.uint8)
    normalized_array = 255 * (array - np.min(array)) / np.ptp(array)
    return normalized_array.astype(np.uint8)

def bandpass_filter(data, lowcut, highcut, sample_rate, order=5):
    nyquist = 0.5 * sample_rate
    low = lowcut / nyquist
    high = highcut / nyquist
    b, a = butter(order, [low, high], btype='band')
    y = filtfilt(b, a, data)
    return y

def create_amplitude_tensors(filename, bpm):
    wav_file = 'WAVs/' + filename + '.wav'
    output_file = 'Spectrograms/' + filename + '.png'

    # Load the WAV file
    sample_rate, data = wavfile.read(wav_file)

    # If stereo, convert to mono by averaging the channels
    if len(data.shape) == 2:
        data = data.mean(axis=1)

    # Apply the band-pass filter
    lowcut = 70  # E2 frequency in Hz
    highcut = 1700  # E6 frequency in Hz
    data = bandpass_filter(data, lowcut, highcut, sample_rate)

    # Calculate the spectrogram with a larger FFT window size
    nperseg = 4094  # Larger window size for better frequency resolution
    noverlap = nperseg // 1.5  # Strange grey bars appear for values greater than 1.5

    frequencies, times, Sxx = spectrogram(data, fs=sample_rate, window='hann', nperseg=nperseg, noverlap=noverlap)

    # Convert the spectrogram (power spectral density) to decibels
    Sxx_dB = 10 * np.log10(Sxx + 1e-10)  # Adding a small number to avoid log(0)

    Sxx_dB = Sxx_dB[:][:512]

    # Normalize the values between 0 and 255
    img_array = np.uint8(255 * (Sxx_dB - np.min(Sxx_dB)) / np.ptp(Sxx_dB))
    
    # Convert to Image and save as PNG
    image = Image.fromarray(img_array)
    image.save(output_file)

    # Calculate the duration of a 32nd note in seconds
    beats_per_second = bpm / 60
    seconds_per_beat = 1 / beats_per_second
    seconds_per_32nd_note = seconds_per_beat / 8  # 32nd note duration

    # Determine the number of time slices for each 32nd note duration
    num_slices = int(np.ceil(times[-1] / seconds_per_32nd_note))

    # List to store the average values of each vertical slice
    avg_slices = []

    # Iterate over each 32nd note slice
    for i in range(num_slices):
        # Determine the start and end time for this slice
        start_time = i * seconds_per_32nd_note
        end_time = (i + 1) * seconds_per_32nd_note

        # Find the indices in the time array that correspond to this slice
        start_idx = np.searchsorted(times, start_time)
        end_idx = np.searchsorted(times, end_time)

        # Get the slice of the spectrogram for this time period
        slice_Sxx_dB = Sxx_dB[:, start_idx:end_idx]

        # Calculate the average value of each vertical pixel in this slice
        avg_values = np.mean(slice_Sxx_dB, axis=1)
        avg_slices.append(avg_values)

    # Convert the list of average slices to a numpy array for further processing
    avg_slices_array = np.array(avg_slices)
    
    return avg_slices_array

### MIDI Processing Functions

In [ ]:
def load_midi(file_path):
    """Load the MIDI file and return the messages with their cumulative times."""
    midi_file = mido.MidiFile(file_path)
    messages_with_time = []

    # Initialize the current time
    current_time = 0

    for message in midi_file:
        # Increment the current time by the time of the current message
        current_time += message.time
        # Append the message with the cumulative time to the list
        messages_with_time.append((current_time, message))

    return messages_with_time

def get_note_periods(messages_with_time):
    """Get the time periods for each note."""
    note_periods = []
    notes_on = {}

    for time, message in messages_with_time:
        if message.type == 'note_on' and message.velocity > 0:
            if message.note not in notes_on:
                notes_on[message.note] = []
            notes_on[message.note].append(time)
        elif message.type == 'note_off' or (message.type == 'note_on' and message.velocity == 0):
            if message.note in notes_on and notes_on[message.note]:
                start_time = notes_on[message.note].pop()
                note_periods.append((message.note, start_time, time))

    # If there are notes that were not turned off, handle them appropriately
    for note, times in notes_on.items():
        for start_time in times:
            note_periods.append((note, start_time, messages_with_time[-1][0]))

    return note_periods

def create_note_dict(note_periods):
    """Create a dictionary of note periods."""
    note_dict = {}
    note_id = 0

    for note, start_time, end_time in note_periods:
        note_dict[note_id] = [note, (start_time, end_time)]
        note_id += 1

    return note_dict

def get_notes_in_32nd_period(note_dict, start_time, end_time):
    """Get one-hot encoded notes for a specific 32nd-note period."""
    notes_playing = set()
    period_duration = end_time - start_time
    threshold = period_duration / 2

    for note_info in note_dict.values():
        note, (note_start, note_end) = note_info
        overlap_start = max(note_start, start_time)
        overlap_end = min(note_end, end_time)
        overlap_duration = overlap_end - overlap_start

        if overlap_duration > threshold:
            notes_playing.add(note)

    # Create a one-hot encoded array for notes 40 to 88
    one_hot_array = [0] * (88 - 40 + 1)
    for note in notes_playing:
        if 40 <= note <= 88:
            one_hot_array[note - 40] = 1

    return one_hot_array

def get_all_32nd_note_periods(note_dict, start_time, end_time, period_duration):
    """Generate one-hot encoded arrays for all 32nd-note periods."""
    current_time = start_time
    periods = []

    while current_time < end_time:
        next_time = current_time + period_duration
        one_hot_array = get_notes_in_32nd_period(note_dict, current_time, next_time)
        periods.append(one_hot_array)
        current_time = next_time

    return periods

def create_midi_tensors(file_path):
    """Main function to load the MIDI file and get one-hot encoded note periods."""
    messages_with_time = load_midi(file_path)
    note_periods = get_note_periods(messages_with_time)
    note_dict = create_note_dict(note_periods)
    
    one_hot_encoded_periods = get_all_32nd_note_periods(note_dict, 0, 8, 0.0625)
    return np.array(one_hot_encoded_periods)


### Main Function to Build Tensors

In [ ]:
def build_tensors(filename, bpm):
    midi_list = create_midi_tensors('MIDIs/' + filename + '.mid')
    amplitudes_list = create_amplitude_tensors(filename, bpm)
    if len(midi_list) > len(amplitudes_list):
        print("Error: amplitude list smaller than midi list for unknown reason, aborting...")
        return []
    master_list = [[amplitudes_list[i], midi_list[i]] for i in range(len(midi_list))]
    return master_list

## Section 4: Model Training and Prediction

In [ ]:
import tensorflow as tf
import os
import numpy as np

# Parsing function
def _parse_function(proto):
    keys_to_features = {
        'spectrogram': tf.io.FixedLenFeature([512], tf.float32),
        'notes': tf.io.FixedLenFeature([49], tf.int64)
    }
    parsed_features = tf.io.parse_single_example(proto, keys_to_features)
    return parsed_features['spectrogram'], parsed_features['notes']

# Augmentation functions
def add_noise(spectrogram, noise_factor=0.005):
    noise = np.random.randn(*spectrogram.shape) * noise_factor
    return spectrogram + noise

def amplitude_scaling(spectrogram, scale_range=(0.9, 1.1)):
    scale = np.random.uniform(scale_range[0], scale_range[1])
    return spectrogram * scale

def augment_data(spectrogram, notes):
    spectrogram = add_noise(spectrogram).astype(np.float32)
    spectrogram = amplitude_scaling(spectrogram).astype(np.float32)
    return spectrogram, notes

def tf_augment_data(spectrogram, notes):
    spectrogram, notes = tf.numpy_function(augment_data, [spectrogram, notes], [tf.float32, tf.int64])
    spectrogram.set_shape([512])  # Explicitly set the shape
    notes.set_shape([49])          # Explicitly set the shape
    return spectrogram, notes

# Directory containing TFRecord files
tfrecord_dir = 'GeneratedData'

# Get list of all TFRecord files
tfrecord_files = [os.path.join(tfrecord_dir, f) for f in os.listdir(tfrecord_dir) if f.endswith('.tfrecord')]

# Create a dataset from the TFRecord files
raw_dataset = tf.data.TFRecordDataset(tfrecord_files)

# Parse the dataset
parsed_dataset = raw_dataset.map(_parse_function)

# Augment the dataset
augmented_dataset = parsed_dataset.map(tf_augment_data)

# Shuffle, batch, and prefetch the dataset
batch_size = 32
dataset = augmented_dataset.shuffle(buffer_size=10000).batch(batch_size).prefetch(buffer_size=tf.data.AUTOTUNE)

# Define the model
model = tf.keras.Sequential([
    tf.keras.layers.Input(shape=(512, 1)),

    tf.keras.layers.Conv1D(filters=32, kernel_size=3, activation='relu', padding='same'),
    tf.keras.layers.BatchNormalization(),
    tf.keras.layers.MaxPooling1D(pool_size=2),
    tf.keras.layers.Dropout(0.2),

    tf.keras.layers.Conv1D(filters=64, kernel_size=3, activation='relu', padding='same'),
    tf.keras.layers.BatchNormalization(),
    tf.keras.layers.MaxPooling1D(pool_size=2),
    tf.keras.layers.Dropout(0.2),

    tf.keras.layers.Conv1D(filters=128, kernel_size=3, activation='relu', padding='same'),
    tf.keras.layers.BatchNormalization(),
    tf.keras.layers.MaxPooling1D(pool_size=2),
    tf.keras.layers.Dropout(0.2),

    tf.keras.layers.Flatten(),
    tf.keras.layers.Dense(512, activation='relu'),
    tf.keras.layers.Dropout(0.3),
    tf.keras.layers.Dense(256, activation='relu'),
    tf.keras.layers.Dropout(0.3),
    tf.keras.layers.Dense(49, activation='sigmoid')
])

# Compile the model
model.compile(optimizer='adam', loss='binary_crossentropy', metrics=['accuracy'])

model.summary()

# Train the model
history = model.fit(dataset, epochs=80, validation_data=dataset, validation_steps=80, callbacks=[tf.keras.callbacks.EarlyStopping(monitor='val_loss', patience=5)])

model_save_path = '*saved_tf_models/BasicConvGuitarNotePredictor(512_input).keras'

# Save the model
model.save(model_save_path)