In [49]:
import os
import time
import random
import torch
import torch.nn as nn
from torch.utils.data import IterableDataset, DataLoader
import pandas as pd
import pyarrow.parquet as pq
from sklearn.metrics import precision_recall_fscore_support, accuracy_score
import numpy as np
from audio_midi_pipeline import process_files
from music21 import converter, instrument, environment



In [5]:
inputs = process_files('music/19-HAPPY BIRTHDAY.mp3')
inputs = inputs[380:]
inputs.shape

torch.Size([1688, 513])

In [None]:
# Define the CNN model
class PitchDetectionModel(nn.Module):
    def __init__(self, num_pitches=88):
        super(PitchDetectionModel, self).__init__()

        # Reduced number of pooling layers and smaller kernels
        self.conv_layers = nn.Sequential(
            # First conv block
            nn.Conv2d(1, 32, kernel_size=(1, 3), padding=(0, 1)),
            nn.BatchNorm2d(32),
            nn.ReLU(),
            nn.MaxPool2d((1, 2)),  # Only pool frequency dimension

            # Second conv block
            nn.Conv2d(32, 64, kernel_size=(1, 3), padding=(0, 1)),
            nn.BatchNorm2d(64),
            nn.ReLU(),
            nn.MaxPool2d((1, 2)),

            # Third conv block without pooling
            nn.Conv2d(64, 128, kernel_size=(1, 3), padding=(0, 1)),
            nn.BatchNorm2d(128),
            nn.ReLU(),
        )

        # Calculate flattened feature size
        self.flattened_size = 128 * 1 * 128  # channels * height * width

        # Fully connected layers
        self.fc_layers = nn.Sequential(
            nn.Flatten(),  # Flatten all dimensions except batch
            nn.Linear(self.flattened_size, 256),
            nn.ReLU(),
            nn.Dropout(0.5),
            nn.Linear(256, num_pitches),
            nn.Sigmoid()
        )

    def forward(self, x):
        # x shape: [batch_size, channels, height, width]
        x = self.conv_layers(x)
        x = self.fc_layers(x)
        return x

def load_trained_model(model_path, device='cpu'):
    # After defining model load the model
    device = torch.device(device)
    model = torch.load(model_path, map_location=device)
    model.to(device)
    model.eval()  # Set to evaluation mode
    return model

model = load_trained_model("best_model.pth")


  model = torch.load(model_path, map_location=device)


In [34]:
# predict_function.py
def make_prediction(model, spectrogram_tensor, device='cpu', threshold=0.5):
    """
    Makes predictions using the trained model.

    Parameters:
    - model (nn.Module): Trained PyTorch model.
    - spectrogram_tensor (torch.Tensor): Input spectrogram tensor.
    - device (str): 'cpu' or 'cuda' device.
    - threshold (float): Threshold for binary classification.

    Returns:
    - active_notes (list): List of active MIDI note numbers.
    """
    device = torch.device(device)
    spectrogram_tensor = spectrogram_tensor.to(device)

    with torch.no_grad():
        outputs = model(spectrogram_tensor)
        predictions = (outputs > threshold).float()

    predictions = predictions.cpu().numpy()
    return predictions

outputs = []
iterations = 0

for input_tensor in inputs:
    iterations+=1
    input_tensor = input_tensor.view(1, 1, 1, 513)
    outputs.append(make_prediction(model, input_tensor))

binary_results = outputs
binary_results

[array([[0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0.,
         0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0.,
         0., 0., 0., 0., 0., 0., 0., 0., 0., 1., 0., 0., 0., 0., 0., 0.,
         0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0.,
         0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0.,
         0., 0., 0., 0., 0., 0., 0., 0.]], dtype=float32),
 array([[0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0.,
         0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0.,
         0., 0., 0., 0., 0., 0., 0., 0., 0., 1., 0., 0., 0., 0., 0., 0.,
         0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0.,
         0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0.,
         0., 0., 0., 0., 0., 0., 0., 0.]], dtype=float32),
 array([[0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0.,
         0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0.,
      

The following code checks for repeating values in three different window sizes. For each window size, it examines a set number of frames and calculates whether at least 40% of the notes are active. If this condition is met, it combines all the active notes in that window. This approach addresses the issue where values like 1,0,1,1,1,0,1,0,1,1,0,1,1 would generate separate notes for each 1 after a 0 in the MIDI file. Now, these notes are grouped together and treated as a single event. The 40% threshold and window sizes are adjustable and can be tweaked for different results.

In [None]:


def fill_zeros_with_ones(binary_results, large_window_size=40, medium_window_size=20, small_window_size=7,
                          threshold_large=0.4, threshold_medium=0.4, threshold_small=0.4):
    """
    Fill zeros with ones in binary data if at least a certain percentage of the values in the window are 1s.

    Parameters:
    - binary_results: 2D numpy array with shape (frames, 88)
    - large_window_size: Size of the large window for long stretches
    - medium_window_size: Size of the medium window for medium bursts
    - small_window_size: Size of the small window for short bursts
    - threshold_large: Percentage threshold for large window (default is 40%)
    - threshold_medium: Percentage threshold for medium window (default is 40%)
    - threshold_small: Percentage threshold for small window (default is 40%)

    Returns:
    - modified_results: The updated binary results array
    """
    # Ensure binary_results is a 2D array with shape (frames, 88)
    if binary_results.ndim != 2:
        raise ValueError("Input array must have 2 dimensions (frames, features)")

    if binary_results.shape[1] != 88:
        raise ValueError(f"Expected 88 columns, but found {binary_results.shape[1]}.")

    print(f"Input shape: {binary_results.shape}")

    # Calculate the threshold counts (e.g., 40% for large, medium, and small windows)
    threshold_count_large = int(large_window_size * threshold_large)
    threshold_count_medium = int(medium_window_size * threshold_medium)
    threshold_count_small = int(small_window_size * threshold_small)

    # Create a copy of the original binary results to avoid overwriting
    modified_results = binary_results.copy()

    # Iterate over each column (88 columns)
    for col in range(binary_results.shape[1]):
        print(f"Processing column {col + 1} of {binary_results.shape[1]}...")

        # Iterate through the frames and apply the large window
        for i in range(binary_results.shape[0] - large_window_size + 1):
            window = binary_results[i:i + large_window_size, col]
            if np.sum(window) >= threshold_count_large:  # Check if the large window meets the threshold
                # Fill all zeros in the window with 1 in the modified results
                modified_results[i:i + large_window_size, col] = np.where(window == 0, 1, window)

        # Apply the medium window
        for i in range(binary_results.shape[0] - medium_window_size + 1):
            window = binary_results[i:i + medium_window_size, col]
            if np.sum(window) >= threshold_count_medium:  # Check if the medium window meets the threshold
                # Fill all zeros in the window with 1 in the modified results
                modified_results[i:i + medium_window_size, col] = np.where(window == 0, 1, window)

        # Apply the small window
        for i in range(binary_results.shape[0] - small_window_size + 1):
            window = binary_results[i:i + small_window_size, col]
            if np.sum(window) >= threshold_count_small:  # Check if the small window meets the threshold
                # Fill all zeros in the window with 1 in the modified results
                modified_results[i:i + small_window_size, col] = np.where(window == 0, 1, window)

    return modified_results


# predict_function.py
def make_prediction(model, spectrogram_tensor, device='cpu', threshold=0.5):
    """
    Makes predictions using the trained model.

    Parameters:
    - model (nn.Module): Trained PyTorch model.
    - spectrogram_tensor (torch.Tensor): Input spectrogram tensor.
    - device (str): 'cpu' or 'cuda' device.
    - threshold (float): Threshold for binary classification.

    Returns:
    - active_notes (list): List of active MIDI note numbers.
    """
    device = torch.device(device)
    spectrogram_tensor = spectrogram_tensor.to(device)

    with torch.no_grad():
        outputs = model(spectrogram_tensor)
        predictions = (outputs > threshold).float()

    predictions = predictions.cpu().numpy()
    return predictions

outputs = []

for input_tensor in inputs:
    input_tensor = input_tensor.view(1, 1, 1, 513)
    binary_results = (make_prediction(model, input_tensor))

    # Example usage
    outputs.append(fill_zeros_with_ones(binary_results))


Input shape: (1, 88)
Processing column 1 of 88...
Processing column 2 of 88...
Processing column 3 of 88...
Processing column 4 of 88...
Processing column 5 of 88...
Processing column 6 of 88...
Processing column 7 of 88...
Processing column 8 of 88...
Processing column 9 of 88...
Processing column 10 of 88...
Processing column 11 of 88...
Processing column 12 of 88...
Processing column 13 of 88...
Processing column 14 of 88...
Processing column 15 of 88...
Processing column 16 of 88...
Processing column 17 of 88...
Processing column 18 of 88...
Processing column 19 of 88...
Processing column 20 of 88...
Processing column 21 of 88...
Processing column 22 of 88...
Processing column 23 of 88...
Processing column 24 of 88...
Processing column 25 of 88...
Processing column 26 of 88...
Processing column 27 of 88...
Processing column 28 of 88...
Processing column 29 of 88...
Processing column 30 of 88...
Processing column 31 of 88...
Processing column 32 of 88...
Processing column 33 of 88..

Code below actually generates the midi file from our new binary variable we created called "binssss"

In [48]:
from midiutil import MIDIFile

def create_midi_from_binary_test(outputs, output_file):
    """
    Convert binary results to a MIDI file while preserving original shape.

    Parameters:
    - binssss: 2D numpy array of binary values (0 or 1) representing key states
    - output_file: Path to the output MIDI file
    """

     # Create a new MIDI file
    midi_file = MIDIFile(1)
    midi_file.addTempo(0, 0, 120)  # Add tempo track

    for binssss in outputs:


        # Validate input shape
        if binssss.ndim != 2 or binssss.shape[1] != 88:
            raise ValueError("Input array must have shape (x, 88)")

        print(f"Original binary results shape: {binssss.shape}")

        # MIDI note numbers for piano (A0 to C8)
        base_midi_note = 21

        # Set the time increment based on your data
        time_increment_ms = 23.2198  # 0.0116099 seconds per row

        # Track the state of each note (on/off) and the start time of each note
        note_state = [False] * 88  # Assume initially all notes are off
        note_start_times = [None] * 88

        # Iterate through binary results and add note on/off events
        for time_index, frame in enumerate(binssss):
            current_time = time_index * time_increment_ms / 1000  # Convert to seconds

            for key_index, key_state in enumerate(frame):
                midi_note = base_midi_note + key_index

                # If the note is on and was previously off, start the note
                if key_state == 1 and not note_state[key_index]:
                    note_state[key_index] = True
                    note_start_times[key_index] = current_time

                # If the note is off and was previously on, end the note
                elif key_state == 0 and note_state[key_index]:
                    # Only add note if we have a valid start time
                    if note_start_times[key_index] is not None:
                        duration = current_time - note_start_times[key_index]

                        # Ensure a minimum duration to prevent zero-length notes
                        duration = max(duration, 0.1)

                        midi_file.addNote(
                            0,      # track
                            0,      # channel
                            midi_note,
                            note_start_times[key_index],
                            duration,
                            100     # velocity
                        )

                    # Reset note state
                    note_state[key_index] = False
                    note_start_times[key_index] = None

    # Write the MIDI file
    with open(output_file, "wb") as f:
        midi_file.writeFile(f)

    print("MIDI file created successfully!")

# Example usage
create_midi_from_binary_test(outputs, "masons_model_3mapped.mid")


Original binary results shape: (1, 88)
Original binary results shape: (1, 88)
Original binary results shape: (1, 88)
Original binary results shape: (1, 88)
Original binary results shape: (1, 88)
Original binary results shape: (1, 88)
Original binary results shape: (1, 88)
Original binary results shape: (1, 88)
Original binary results shape: (1, 88)
Original binary results shape: (1, 88)
Original binary results shape: (1, 88)
Original binary results shape: (1, 88)
Original binary results shape: (1, 88)
Original binary results shape: (1, 88)
Original binary results shape: (1, 88)
Original binary results shape: (1, 88)
Original binary results shape: (1, 88)
Original binary results shape: (1, 88)
Original binary results shape: (1, 88)
Original binary results shape: (1, 88)
Original binary results shape: (1, 88)
Original binary results shape: (1, 88)
Original binary results shape: (1, 88)
Original binary results shape: (1, 88)
Original binary results shape: (1, 88)
Original binary results s

In [None]:
def convert_midi_to_sheet_music(midi_path, output_xml_path, instrument_name='Piano'):
    """
    Convert a MIDI file to a MusicXML file representing sheet music.

    Parameters:
    - midi_path (str): Path to the input MIDI file.
    - output_xml_path (str): Path to save the output MusicXML file.
    - instrument_name (str): Name of the instrument to assign (default: 'Piano').

    Returns:
    - None
    """
    try:
        # Load MIDI file using music21
        midi_stream = converter.parse(midi_path)

        # Assign instrument
        inst = instrument.fromString(instrument_name)
        midi_stream.insert(0, inst)

        # Export to MusicXML
        midi_stream.write('musicxml', fp=output_xml_path)
        print(f"Sheet music written to {output_xml_path}")

    except Exception as e:
        print(f"An error occurred while converting MIDI to sheet music: {e}")

output_midi_file = midi = converter.parse("full_midi_mason_model.mid")

convert_midi_to_sheet_music(output_midi_file, "sheet_music_test.xml", instrument_name='Piano')

An error occurred while converting MIDI to sheet music: list index out of range
