# CPEN 291 Final Project

#Proposal:
Our project would involve turning YouTube videos of piano compositions into sheet music. As a person who loves to learn new songs through YouTube, I found that it can often be expensive to buy sheet music, so instead I try to learn by slowing down the video and looking at the artist's hands. However, that has proven to be quite a challenge, since the pianist often edits the video in a way that their fingers aren't visible throughout the whole performance. As a solution, I thought it would be interesting to build an ML system that would take audio as input (mostly from videos on YouTube and other streaming platforms) and create sheet music that would be readily available for download. Upon research, I found that there are quite a few companies that offer similar services, which makes me hopeful about the possibility of implementing my project. There are quite a few articles that mention exciting new ML approaches such as Magenta and Deep Watershed Detection. I would probably rely on a scraping algorithm to build a database of audio from amazing piano performances posted on all sorts of social media. Then, I would process the database by converting the audio into something more intelligible (such as waveforms) that would be analyzed by my model. The model would use the data from the waveforms to determine which note is being played, its length, etc. In order to train the model, I could find cheap or free sheet music online so that I can compare what the model generated with what it is originally supposed to look like. Finally, once the model reaches an acceptable accuracy, I would make the sheet music available on a website where we would be able to download it and learn from it. (Note that this would serve mostly as practice for beginner/intermediate students, as it would not be 100% correct.)

#Import Statements


In [None]:
!pip install pretty_midi
import pretty_midi

In [None]:
!pip install pydub
from pydub import AudioSegment

In [3]:
import pandas as pd, csv
import torch, torchvision
from torchvision import datasets, models, transforms
from torch import nn, optim, functional as F
from tqdm.auto import tqdm
import matplotlib.pyplot as plt
import numpy as np
import librosa
from librosa import display
from IPython.display import Audio,display
import os
from scipy.io import wavfile
import PIL
from google.colab import drive

# Dataset Collection


In [None]:
# Code to scrape samples using Selenium. Requires user to install chromedriver.

from selenium import webdriver
from selenium.webdriver.common.by import By
from selenium.webdriver.support.ui import WebDriverWait
from selenium.webdriver.support import expected_conditions as EC
import time

PATH_TO_CHROMEDRIVER = ''

# Loop bounds of the crawl: how many result pages are visited and how many
# entries are read from each page.
N_PAGES = 77
ROWS_PER_PAGE = 10

driver = webdriver.Chrome(PATH_TO_CHROMEDRIVER)

try:
    driver.get('https://www.mutopiaproject.org/cgibin/make-table.cgi?Instrument=Piano')

    for page in range(N_PAGES):
        try:
            tables_mid = []
            tables_ps = []

            # Collect, for every entry on the page, the table cell holding the
            # .mid link (row 4) and the one holding the .ps link (row 5).
            for j in range(1, ROWS_PER_PAGE + 1):
                tables_mid.append(driver.find_element(
                    By.XPATH, f"//table/tbody//tr[{j}]/td//table//tbody/tr[4]/td[2]"))
                tables_ps.append(driver.find_element(
                    By.XPATH, f"//table/tbody//tr[{j}]/td//table//tbody/tr[5]/td"))

            for table in tables_mid:
                time.sleep(0.5)  # pause between clicks
                table.find_element(By.PARTIAL_LINK_TEXT, '.mid').click()

            for table in tables_ps:
                time.sleep(1)
                # Clicked through JavaScript instead of WebElement.click().
                driver.execute_script(
                    "arguments[0].click();",
                    table.find_element(By.PARTIAL_LINK_TEXT, '.ps'))

            driver.find_element(By.LINK_TEXT, 'Next 10').click()

        except Exception as err:
            # Best effort: report the failure instead of hiding it, then try
            # again on the next iteration as the original loop did.
            print(f"page {page}: skipped ({type(err).__name__}: {err})")
            continue
finally:
    # Always release the browser, even if the crawl is interrupted.
    driver.quit()


# Preprocessing

In [None]:
"""
This code converts .mid files to .wav

Note: the below code requires you to manually install a new version of fluidsynth (using pip install gives you
a version that is too old). This code must also be run on a 32-bit version of Python.
"""

# import fluidsynth

# PATH_TO_SOUNDFONT = ''
# PATH_TO_MID = ''
# DIR_SAVE = ''

# entries = os.listdir(PATH_TO_ENTRIES)

# for entry in entries:
#   fs = FluidSynth(sample_rate = 22050, sound_font = PATH_TO_SOUNDFONT)
#   midi_fn = PATH_TO_MID + '/' + entry
#   fs.midi_to_audio(midi_fn, DIR_SAVE)

In [76]:
# inspired by https://stackoverflow.com/questions/5120555/how-can-i-convert-a-wav-from-stereo-to-mono-in-python

def convert_to_mono(PATH_WAV):
    """Convert audio from stereo to mono, overwriting the file(s) in place.

    PATH_WAV is either a directory, in which case every entry inside it is
    converted, or the path of a single WAV file.
    """
    if os.path.isdir(PATH_WAV):
        targets = [PATH_WAV + '/' + name for name in os.listdir(PATH_WAV)]
    else:
        targets = [PATH_WAV]

    for target in targets:
        mono = AudioSegment.from_wav(target).set_channels(1)
        mono.export(target, format='wav')

# Model, Training and Testing

In [None]:
# Mount Google Drive at /content/drive (google.colab only).
drive.mount('/content/drive')

In [40]:
# Length, in seconds, of each audio window that becomes one spectrogram/label.
sample_length_2 = 0.5

In [2]:
# Directory holding the spectrogram images (passed to Dataset_2 as PATH_SPEC).
PATH_SPEC_2 = ''
# Directory holding the MIDI files (passed to Dataset_2 as PATH_MID).
PATH_MID_2 = ''
# Location of a previously saved model, used by the optional torch.load cell.
PATH_MODEL = ''

In [4]:
# Inspired by https://github.com/jsleep/wav2mid/blob/master/examples/one_hot.py

def create_label_2(pm, time, fs=1):
    """Build the class label for the window that ends at `time`.

    The window spans [time - sample_length_2, time] seconds. Notes are visited
    instrument by instrument in the order pretty_midi stores them; the first
    note whose start or end falls inside the window decides the label:

      * note start -> its pitch
      * note end   -> its pitch + 128
      * no event   -> 0

    `pm` is the pretty_midi object of the file. `fs` is accepted but not used.
    Returns a scalar torch.long tensor.
    """
    every_note = (entry for inst in pm.instruments for entry in inst.notes)

    for entry in every_note:
        if time - sample_length_2 <= entry.start <= time:
            return torch.as_tensor(entry.pitch, dtype=torch.long)
        if time - sample_length_2 <= entry.end <= time:
            return torch.as_tensor(entry.pitch + 128, dtype=torch.long)

    return torch.as_tensor(0, dtype=torch.long)

In [43]:
def create_sample_2(signalData, fn, j):
    """Apply the CQT to one audio window, save its spectrogram, and return it.

    Args:
        signalData: numpy array with the time-domain samples of the window
            (assumed mono at 22050 Hz — TODO confirm against the caller).
        fn: name of the MIDI file the window belongs to; its '.mid' is
            replaced by '_{j}.jpg' to build the image file name.
        j: index of the window within its file.

    Returns:
        The saved spectrogram image, opened with PIL.
    """
    # PATH_SPEC_2 is the spectrogram directory defined in this notebook; the
    # previous name PATH_SPEC is not defined anywhere in it.
    path_image = PATH_SPEC_2 + '/' + fn.replace('.mid', f'_{j}.jpg')

    signalData_float = signalData.astype(float)
    f = librosa.cqt(signalData_float, fmin=46.25)
    librosa.display.specshow(librosa.amplitude_to_db(f, ref=np.max), y_axis='log', x_axis='time', sr=22050)
    plt.savefig(path_image)
    # Close the figure so that each window is drawn on a fresh one instead of
    # piling new artists onto the same axes call after call.
    plt.close()
    return PIL.Image.open(path_image)

In [44]:
def create_dataset_2(path_mid, path_wav):
    """Create and save the spectrogram images that make up the dataset.

    The MIDI files in `path_mid` and the WAV files in `path_wav` are paired by
    their position in the sorted directory listings. Every WAV file is cut
    into windows of sample_length_2 seconds and each window is rendered with
    create_sample_2. A pair whose durations (in whole seconds) differ is
    reported and skipped. Processing stops at the first file that reaches the
    sampling-rate check with a rate other than 22050 Hz.

    Returns:
        None; the function is used only for the image files it writes.
    """
    entries_mid = sorted(os.listdir(path_mid))
    entries_wav = sorted(os.listdir(path_wav))

    for i, mid_name in enumerate(entries_mid):
        print("itr: " + str(i))
        wav_name = entries_wav[i]
        pm = pretty_midi.PrettyMIDI(path_mid + '/' + mid_name)
        samplingFrequency, origSignalData = wavfile.read(path_wav + '/' + wav_name)
        end_time = pm.get_end_time()

        if int(len(origSignalData) / samplingFrequency) != int(end_time):
            print("different length, not saved")
            print("duration of " + wav_name + ": " + str(int(len(origSignalData) / samplingFrequency)))
            print("duration of " + mid_name + ": " + str(int(end_time)))
            continue

        # The rate does not change within a file, so check it once per file.
        if samplingFrequency != 22050:
            print("bad sampling freq: " + str(samplingFrequency))
            return

        time = sample_length_2
        j = 0
        while time < end_time:
            first = int(samplingFrequency * (time - sample_length_2))
            last = int(time * samplingFrequency)
            create_sample_2(origSignalData[first:last], mid_name, j)
            time += sample_length_2
            j += 1

In [45]:
def get_dataset_2(path_mid, path_spec, transform):
    """Load the spectrogram images and build their labels.

    For every MIDI file in `path_mid` the function walks the file in windows
    of sample_length_2 seconds, opens the matching '<name>_<j>.jpg' image from
    `path_spec`, crops it, applies `transform`, and pairs it with the label
    from create_label_2.

    Windows whose image is absent from `path_spec` are skipped. This covers
    the files create_dataset_2 left out because their MIDI and WAV durations
    differ.

    Returns:
        A list of (transformed image, label) tuples.
    """
    entries_mid = sorted(os.listdir(path_mid))
    # Set of existing image names, so missing windows can be skipped cheaply.
    available_specs = set(os.listdir(path_spec))
    dataset = []

    for i, mid_name in enumerate(entries_mid):
        print("itr: " + str(i))
        pm = pretty_midi.PrettyMIDI(path_mid + '/' + mid_name)
        end_time = pm.get_end_time()

        time = sample_length_2
        j = 0
        while time < end_time:
            spec_name = mid_name.replace('.mid', f'_{j}.jpg')
            if spec_name in available_specs:
                sample = PIL.Image.open(path_spec + '/' + spec_name)
                # Fixed crop box applied to every saved figure
                # (presumably isolates the plot area — TODO confirm).
                sample = sample.crop((81, 59, 576, 427))
                sample = transform(sample)
                label = create_label_2(pm, time)
                dataset.append((sample, label))
            time += sample_length_2
            j += 1

    return dataset

In [5]:
class Dataset_2():
    """Map-style dataset of (spectrogram tensor, label) pairs.

    Construction calls get_dataset_2 with a transform that resizes each image
    to 224x224 and converts it to a tensor; the resulting list is kept in
    `self.dataset`.
    """

    def __init__(self, PATH_MID, PATH_SPEC):
        resize_and_tensorize = transforms.Compose([
            transforms.Resize((224, 224)),
            transforms.ToTensor(),
        ])
        self.dataset = get_dataset_2(PATH_MID, PATH_SPEC, resize_and_tensorize)

    def __len__(self):
        """Number of stored samples."""
        return len(self.dataset)

    def __getitem__(self, i):
        """Return (sample, label) at position `i`; tensor indices are accepted."""
        index = i.item() if torch.is_tensor(i) else i
        entry = self.dataset[index]
        return entry[0], entry[1]

In [None]:
# Load every spectrogram and its label into an in-memory list.
dataset_2 = Dataset_2(PATH_MID_2, PATH_SPEC_2)

In [47]:
# 80% / 20% train/test split sizes.
n_all_2 = len(dataset_2)
n_train_2 = int(0.8 * n_all_2)
n_test_2 = n_all_2 - n_train_2
# Seeded generator so the random split is the same on every run.
rng = torch.Generator().manual_seed(1711)
dataset_train_2, dataset_test_2 = torch.utils.data.random_split(dataset_2, [n_train_2, n_test_2], rng)

# Batches of 128 samples, reshuffled on each pass, for both loaders.
ldr_train_2 = torch.utils.data.DataLoader(dataset_train_2, batch_size=128, shuffle=True)
ldr_test_2 = torch.utils.data.DataLoader(dataset_test_2, batch_size=128, shuffle=True)

In [48]:
# ResNet-18 with pretrained weights as the starting point.
model_2 = models.resnet18(pretrained=True)
# Replace the final layer with a 257-way classifier. create_label_2 emits 0 for
# "no event", the pitch for a note start and pitch + 128 for a note end.
# NOTE(review): for pitches 0..127 that is 0..255, so 257 looks one or two
# classes larger than needed — confirm.
model_2.fc = nn.Linear(512, 257)

In [None]:
# For model not pretrained on our data

def init_weights(m):
    """Xavier-uniform initialise the weight of `m` if it is a Linear layer.

    Intended for use with `Module.apply`; modules that are not (subclasses
    of) torch.nn.Linear are left untouched. The bias is not modified.
    """
    if isinstance(m, torch.nn.Linear):
        torch.nn.init.xavier_uniform_(m.weight)

# Re-initialise the replaced classification layer with Xavier-uniform weights.
model_2.fc.apply(init_weights)

In [None]:
# For Model pretrained on our data

"""
model_2 = torch.load(PATH_MODEL)
model_2.eval()
"""

In [51]:
# Cross-entropy over the event classes predicted by model_2.
criterion_2 = nn.CrossEntropyLoss()
# NOTE(review): weight_decay=2 is a very strong penalty for AdamW — confirm
# that it is intentional.
optimizer_2 = optim.AdamW(model_2.parameters(), lr=0.001, weight_decay=2)
# Multiply the learning rate by 0.2 after every 5 calls to scheduler_2.step().
scheduler_2 = optim.lr_scheduler.StepLR(optimizer_2, step_size=5, gamma=0.2)
# Use the first GPU when one is present; otherwise fall back to the CPU so the
# notebook still runs on a machine without CUDA.
device = torch.device('cuda:0' if torch.cuda.is_available() else 'cpu')

In [52]:
"""
Training and testing code. The accuracy computation assumes cross entropy loss
is used for a classification task.
"""

def run_train_2(model, opt, sched, criterion):
    """Train `model` for one pass over ldr_train_2.

    Args:
        model: network to train; it is moved to `device`.
        opt: optimizer stepped after every batch.
        sched: learning-rate scheduler, stepped once at the end of the pass.
        criterion: loss function called as criterion(outputs, labels).

    Returns:
        (mean loss per sample, accuracy) as Python floats, both computed over
        len(dataset_train_2) samples.
    """
    nsamples_train = len(dataset_train_2)
    loss_sofar, correct_sofar = 0.0, 0
    model.train()
    model = model.to(device)
    with torch.enable_grad():
        for samples, labels in ldr_train_2:
            samples = samples.to(device)
            labels = labels.to(device)
            opt.zero_grad()
            outs = model(samples)
            loss = criterion(outs, labels)
            _, preds = torch.max(outs.detach(), 1)
            loss.backward()
            opt.step()
            # Weight the batch loss by the batch size so that the final
            # division yields a per-sample mean.
            loss_sofar += loss.item() * samples.size(0)
            # .item() keeps the counter a plain int, matching run_test_2.
            correct_sofar += torch.sum(preds == labels.detach()).item()
    sched.step()
    return loss_sofar / nsamples_train, correct_sofar / nsamples_train


def run_test_2(model, criterion):
    """Evaluate `model` on ldr_test_2 without computing gradients.

    Args:
        model: network to evaluate; it is switched to eval mode and moved to
            `device`.
        criterion: loss function called as criterion(outputs, labels).

    Returns:
        (mean loss per sample, accuracy) as Python floats, both computed over
        len(dataset_test_2) samples.
    """
    nsamples_test = len(dataset_test_2)
    loss, correct = 0.0, 0
    model.eval()
    # Same placement as in run_train_2, so evaluation also works on a model
    # that has not been trained in this session.
    model = model.to(device)

    with torch.no_grad():
        for samples, labels in ldr_test_2:
            samples = samples.to(device)
            labels = labels.to(device)
            outs = model(samples)
            # .item() keeps the running loss a plain float, matching
            # run_train_2.
            loss += criterion(outs, labels).item() * samples.size(0)
            _, preds = torch.max(outs.detach(), 1)
            correct_mask = preds == labels
            correct += correct_mask.sum(0).item()
    return loss / nsamples_test, correct / nsamples_test


def run_all_2(model, optimizer, scheduler, criterion, n_epochs):
    """Alternate one training pass and one test pass for `n_epochs` epochs.

    After every epoch a line with the train/test loss and accuracy is printed.
    """
    for epoch in range(n_epochs):
        train_loss, train_acc = run_train_2(model, optimizer, scheduler, criterion)
        test_loss, test_acc = run_test_2(model, criterion)
        report = (
            f"epoch {epoch}: "
            f"train loss {train_loss:.4f} acc {train_acc:.4f}, "
            f"test loss {test_loss:.4f} acc {test_acc:.4f}"
        )
        print(report)

In [None]:
# Train and evaluate for 20 epochs.
run_all_2(model_2, optimizer_2, scheduler_2, criterion_2, 20)

# Postprocessing

In [55]:
def create_sample_post(signalData, fn, j):
    """Create the spectrogram image for one portion of an audio file.

    The figure is written to a temporary JPEG next to the audio file
    ('.wav' in `fn` replaced by '_{j}.jpg'), read back and then deleted.

    Args:
        signalData: numpy array with the time-domain samples of the portion.
        fn: path of the WAV file the portion comes from.
        j: index of the portion within the file.

    Returns:
        The spectrogram as a fully loaded PIL image.
    """
    path_image = fn.replace('.wav', f'_{j}.jpg')

    signalData_float = signalData.astype(float)
    f = librosa.cqt(signalData_float, fmin=46.25)
    librosa.display.specshow(librosa.amplitude_to_db(f, ref=np.max), y_axis='log', x_axis='time', sr=22050)
    plt.savefig(path_image)
    # Close the figure so the next portion is drawn on a fresh one.
    plt.close()

    image = PIL.Image.open(path_image)
    # Image.open is lazy: read the pixel data now, before the file is deleted,
    # so the returned image does not depend on a removed file.
    image.load()
    os.remove(path_image)
    return image

In [1]:
def split_audio(audio_file):
    """Split the user's WAV file into model-ready spectrogram tensors.

    The signal is cut into consecutive windows of sample_length_2 seconds.
    Each window is rendered with create_sample_post, cropped, resized to
    224x224, converted to a tensor and given a leading batch dimension.

    Args:
        audio_file: path of the WAV file to split.

    Returns:
        A list with one tensor per window, in chronological order.
    """
    transform = transforms.Compose([transforms.Resize((224,224)), transforms.ToTensor()])

    def to_model_input(signalData, j):
        # Same crop box and transform that get_dataset_2 applies to the
        # training images.
        sample = create_sample_post(signalData, audio_file, j)
        sample = sample.crop((81, 59, 576, 427))
        sample = transform(sample)
        return sample[None, :, :, :]  # add the batch dimension

    samplingFrequency, origSignalData = wavfile.read(audio_file)
    # Duration truncated to whole seconds.
    audio_length = int(len(origSignalData) / samplingFrequency)
    audio_split = []

    time = sample_length_2
    j = 0
    while time < audio_length:
        first = int(samplingFrequency * (time - sample_length_2))
        last = int(time * samplingFrequency)
        audio_split.append(to_model_input(origSignalData[first:last], j))
        time += sample_length_2
        j += 1

    if time - audio_length > 0:
        # Remainder after the last full window. The start index must be an
        # int: a float slice bound raises TypeError.
        tail = origSignalData[int(samplingFrequency * (time - sample_length_2)):]
        if len(tail) > 0:  # nothing to render for an empty remainder
            audio_split.append(to_model_input(tail, j))

    return audio_split


In [103]:
def get_notes(audio_file):
    """Predict the event label of every window of the user's audio file.

    Args:
        audio_file: path of the WAV file to analyse.

    Returns:
        A list of ints, one per window produced by split_audio, each being the
        class index to which model_2 assigns the highest score.
    """
    model_2.eval()
    audio_split = split_audio(audio_file)
    notes = []

    # Inference only: skip building the autograd graph for every segment.
    with torch.no_grad():
        for segment in audio_split:
            segment = segment.to(device)
            output = model_2(segment)
            _, preds = torch.max(output, 1)
            notes.append(preds.item())

    return notes

In [104]:
def convert_to_wav(path):
    """Convert an audio file (e.g. m4a) to WAV and return the WAV path.

    The output is written next to the input with the extension replaced by
    '.wav'. Only the final extension is changed, so directory names that
    happen to contain '.m4a' are left alone. A path that already ends in
    '.wav' is returned unchanged without re-encoding.

    Args:
        path: path of the input audio file.

    Returns:
        Path of the WAV file.
    """
    root, ext = os.path.splitext(path)
    if ext.lower() == '.wav':
        return path

    path_wav = root + '.wav'
    AudioSegment.from_file(path).export(path_wav, format='wav')
    return path_wav


In [105]:
def create_midi(path_input, path_output):
    """Create a MIDI file from an audio file using the model.

    The input is converted to a mono WAV, split into windows of
    sample_length_2 seconds and labelled by get_notes. Labels follow the
    encoding of create_label_2: 0 means no event, a pitch p means a note
    start, and p + 128 means a note end.

    Because the model is not perfectly accurate, a detected note start is
    paired with the next note end whose pitch is within 8 semitones of it.
    Starts with no such end are dropped. The model does not preserve the
    instrument, so the output is written for the Acoustic Grand Piano.

    Args:
        path_input: path of the audio file to transcribe.
        path_output: path of the MIDI file to write.
    """
    path_wav = convert_to_wav(path_input)
    convert_to_mono(path_wav)
    notes = get_notes(path_wav)
    mid_file = pretty_midi.PrettyMIDI()
    piano_program = pretty_midi.instrument_name_to_program('Acoustic Grand Piano')
    piano = pretty_midi.Instrument(program=piano_program)

    for i, onset in enumerate(notes):
        # Only 1..127 are usable note starts: 0 is the "no event" label and
        # values from 128 upwards encode note ends.
        if not 1 <= onset <= 127:
            continue

        # Accept an end label within +/- 8 semitones of the onset, but never
        # below 128, where labels stand for note starts again.
        lowest_end = max(128, onset + 120)
        highest_end = onset + 136

        for j in range(i + 1, len(notes)):
            if lowest_end <= notes[j] <= highest_end:
                # Window k starts at k * sample_length_2 seconds; pretty_midi
                # expects note times in seconds, not window indices.
                note = pretty_midi.Note(
                    velocity=100,
                    pitch=onset,
                    start=i * sample_length_2,
                    end=j * sample_length_2,
                )
                piano.notes.append(note)
                break

    mid_file.instruments.append(piano)
    mid_file.write(path_output)