In [1]:
import librosa
import numpy as np
from sklearn.preprocessing import StandardScaler
from sklearn.neural_network import MLPClassifier

In [2]:
#loading the audio
audio_file = 'C:/Users/joshh/OneDrive/Desktop/projects/assets/Reality (feat. Janieck Devy) [Christmas Mix].mp3'
y, sr= librosa.load(audio_file)

In [3]:
#preprocess the audio file
y = librosa.util.normalize(y)
y = librosa.effects.trim(y)[0]

In [4]:
#applying feature extraction
chroma = librosa.feature.chroma_cqt(y=y, sr=sr, hop_length = 512)
chroma_scaled = StandardScaler().fit_transform(chroma)

In [5]:
chroma.shape

(12, 6848)

In [6]:
#define the note names  
note_names = ['C','C#','D','D#','E','F','F#','G','G#','A','A#','B']

In [7]:
#define the machine learning model
model = MLPClassifier(hidden_layer_sizes=(100,50), max_iter=500, alpha=0.01, solver='adam', random_state=1)

In [8]:
#train the model based on its features
X = chroma_scaled
y_notes_encoded = np.array([note_names.index(n) for n in note_names]).flatten()
model.fit(X,y_notes_encoded)

In [9]:
#predict the notes for each time step 
note_predictions = model.predict(X)

In [10]:
#convert the output into a symbolic representation of the music
time_step_duration = librosa.get_duration(y=y,sr=sr) / len(note_predictions)
sheet_music = []
for i in range(len(note_predictions)):
    note_name = note_names[note_predictions[i]]
    note_start = i * time_step_duration
    sheet_music.append((note_name, note_start))

In [11]:
#print the music sheet
for note in sheet_music:
    print(note[0], '\t', note[1])

C 	 0.0
C# 	 13.248919123204837
D 	 26.497838246409675
D# 	 39.746757369614514
E 	 52.99567649281935
F 	 66.24459561602418
F# 	 79.49351473922903
G 	 92.74243386243386
G# 	 105.9913529856387
A 	 119.24027210884354
A# 	 132.48919123204837
B 	 145.7381103552532
