In [None]:
# Enable IPython's autoreload extension in mode 2, so edits to imported modules
# (e.g. the local helper .py files) are picked up without restarting the kernel.
%load_ext autoreload
%autoreload 2

## Test pipeline on one instance


In [79]:
from pathlib import Path

import IPython.display as ipd
import librosa
import librosa.display  # explicit: older librosa releases do not expose `display` via `import librosa`
import matplotlib.pyplot as plt
import numpy as np
import scipy

import utils as u
from pitch_estimator import PitchEstimator
from preprocessors import Preprocessor
from tuning_similarity import TuningSimilarityComputer

In [None]:
# Initialize objects and variables
data_path = Path("gamelan_music_dataset")
targets_first = data_path / "first ensemble/orchestra/target"
audio_file_path = targets_first / "demung/001.wav"
# NOTE(review): the next line replaces the dataset path assigned just above, so
# the rest of the notebook runs on "audios/tg1.wav" — confirm which input is intended.
audio_file_path = "audios/tg1.wav"
pitch_est = PitchEstimator()
pp = Preprocessor()

# Load (at most the first 30 seconds of) the input audio and display it
y, sr = librosa.load(audio_file_path, duration=30)
print("Input audio:")
display(ipd.Audio(y, rate=sr))

# Use median filtering to divide harmonic from percussive component
spectrogram = pp.compute_spectrogram(y)
harmonic, percussive = pp.apply_median_filtering(spectrogram)

# Reconstruct harmonic component audio and display it
# NOTE(review): Griffin-Lim works from a magnitude spectrogram — assumes
# `harmonic` / `percussive` are magnitudes; confirm in Preprocessor.
harmonic_audio = librosa.griffinlim(harmonic)
print("Harmonic component:")
display(ipd.Audio(harmonic_audio, rate=sr))

# Reconstruct percussive component audio and display it
percussive_audio = librosa.griffinlim(percussive)
print("Percussive component:")
display(ipd.Audio(percussive_audio, rate=sr))

In [None]:
# Plot the full, harmonic and percussive spectrograms stacked vertically.
# Each panel is converted to dB relative to its own maximum (ref=np.max).
spectrogram_panels = [
    ("Full spectrogram", spectrogram),
    ("Harmonic spectrogram", harmonic),
    ("Percussive spectrogram", percussive),
]

panel_fig, panel_axes = plt.subplots(len(spectrogram_panels), 1, figsize=(15, 10))

for panel_idx, (panel_ax, (panel_title, panel_data)) in enumerate(
    zip(panel_axes, spectrogram_panels)
):
    # Only the bottom panel carries the time axis, to avoid repeating tick labels.
    is_bottom_panel = panel_idx == len(spectrogram_panels) - 1
    panel_img = librosa.display.specshow(
        # np.abs is applied uniformly so all three panels are treated the same
        # way (it is a no-op on an already non-negative magnitude spectrogram).
        librosa.amplitude_to_db(np.abs(panel_data), ref=np.max),
        # Pass the loaded sample rate so the axes are scaled from the actual
        # audio rather than from specshow's default rate.
        sr=sr,
        y_axis="log",
        x_axis="time" if is_bottom_panel else None,
        ax=panel_ax,
    )
    panel_fig.colorbar(panel_img, ax=panel_ax)
    panel_ax.set_title(panel_title)

panel_fig.tight_layout()
plt.show()

In [None]:
# Detect onsets on the percussive reconstruction (default unit: frame indices)
onset_indexes = librosa.onset.onset_detect(y=percussive_audio, sr=sr)

fig, ax = plt.subplots(figsize=(14, 4))

# Spectrogram as a heat map with the lowest frequency bin at the bottom
ax.imshow(spectrogram, aspect="auto", origin="lower", cmap="hot")

# One blue vertical line per detected onset, spanning every frequency bin
n_freq_bins = spectrogram.shape[0]
ax.vlines(onset_indexes, 0, n_freq_bins, color="blue", linestyle="-")

# Axis labels and title
ax.set(xlabel="Time", ylabel="Frequency", title="Spectrogram with onsets")

# Tidy the layout and make sure no grid is drawn over the image
fig.tight_layout()
ax.grid(False)

plt.show()

In [None]:
# Onset positions in seconds, detected on the percussive reconstruction.
# The sample rate is passed explicitly: with units="time" the frame-to-seconds
# conversion depends on sr, and omitting it falls back to onset_detect's default
# rate, which only matches the audio when it was loaded at that same rate.
onsets = librosa.onset.onset_detect(y=percussive_audio, sr=sr, units="time")

In [None]:
# TODO: for simplicity get rid of pitch_est class and replace it with simple crepe.predict()
# Run pitch tracking on the reconstructed harmonic audio.
# NOTE(review): presumably mirrors crepe.predict()'s return tuple, i.e. per-frame
# timestamps, frequency estimates, confidences and the raw activation — confirm
# in pitch_estimator.
time, frequency, confidence, activation = pitch_est.estimate_crepe(harmonic_audio, sr)

In [None]:
# NOT IN USE
# import utils as u
# seg = u.segment_stable_frequency_regions(frequency, stdThsld=5, minNoteDur=0.5, winStable=5, fs=sr, H=512)

In [None]:
# Draw the estimated pitch track against its timestamps
crepe_fig, crepe_ax = plt.subplots()
crepe_ax.plot(time, frequency)
crepe_ax.set_xlabel("Time")
crepe_ax.set_ylabel("Frequency")
crepe_ax.set_title("Crepe Frequencies")

In [75]:
def _first_index_after(timestamps, t):
    """Return the index of the first timestamp strictly greater than ``t``.

    Returns ``len(timestamps)`` when no timestamp is later than ``t``, so the
    result is always usable as a slice bound. (``np.argmax(timestamps > t)``
    returns 0 in that situation, silently pointing at the start of the track.)
    """
    later = np.flatnonzero(np.asarray(timestamps) > t)
    return int(later[0]) if later.size else len(timestamps)


# One representative frequency per inter-onset segment: slice the pitch track
# between consecutive onsets, keep its stable part, and take the median.
tones = []
stable_segments_idxs = []


for i, onset in enumerate(onsets):
    index_a = _first_index_after(time, onset)
    if i == len(onsets) - 1:
        # Last onset: the segment runs to the end of the track. The slice bound
        # is exclusive, so use len(time) to keep the final frame.
        index_b = len(time)
    else:
        index_b = _first_index_after(time, onsets[i + 1])

    # this was an attempt to try to get the frequency with the highest confidence value. 
    #  not working :-(
    # NOTE: np.argmax over a slice is relative to the slice start, so the lookup
    # below would need frequency[index_a + most_confident_idx].
    # most_confident_idx = np.argmax(confidence[index_a:index_b])
    # fq = frequency[most_confident_idx]
    # # print(fq)
    # tones.append(fq)

    frequency_segment = frequency[index_a:index_b]
    if len(frequency_segment) == 0:
        # No pitch frames fall between these two onsets; a median over an empty
        # segment would be NaN and pollute the later grouping step.
        print(f'Onset {i}-{i+1}: no pitch frames in segment, skipped', "\n")
        print("----------")
        continue

    stable_freq_seg = u.select_stable_part(frequency_segment, threshold=0.5)
    if np.size(stable_freq_seg) == 0:
        # Same reasoning as above if the helper finds no stable region.
        print(f'Onset {i}-{i+1}: no stable region found, skipped', "\n")
        print("----------")
        continue

    
    # # Calculate time indices for stable segment
    # stable_index_a = time[index_a + np.argmax(frequency_segment == stable_freq_seg[0])]
    # stable_index_b = time[index_a + np.argmax(frequency_segment == stable_freq_seg[-1]) + 1]

    # # Store the time indices of the stable segment
    # stable_segments_idxs.append((stable_index_a, stable_index_b))

    print("original segment:", frequency_segment, "\n")
    print("stable segment:", stable_freq_seg, "\n")
    tone = np.median(stable_freq_seg)
    print(f'Onset median {i}-{i+1}: {tone}')
    print(f'Onset mean {i}-{i+1}: {np.mean(stable_freq_seg)}', "\n")

    print("----------")
    tones.append(tone)

    # if i == 57:
    # #     audio = harmonic_audio[int(onset*sr):int(onsets[i+1]*sr)]
    # #     display(ipd.Audio(audio, rate=sr))
    # #     plt.plot(audio)
    # #     plt.plot(audio)


original segment: [442.31533005 434.84344193 423.96656148 421.59354858 421.36575607
 418.88004261 418.3259738  418.85971011 417.45759787 416.3979031
 416.10625026 415.1144286  414.80815751 415.06825419 414.71331964
 414.36561108 414.00068661 413.80881529 413.36450797 413.29015399
 412.92582724 412.57053609 413.3657093  413.97660753 414.06558813
 414.18218108 413.79507136 414.13326971 414.65973927 414.46932283
 414.03419179 414.14228021 414.65258231 413.97143817 414.15033555
 413.85251225 413.70098294 413.6082182  414.07198863 414.44610749
 414.18029516 414.3767965  414.25795711 413.73415646 413.65896704
 413.74208911] 

stable segment: [442.31533005 434.84344193 423.96656148 421.59354858 421.36575607
 418.88004261 418.3259738  418.85971011 417.45759787 416.3979031
 416.10625026 415.1144286  414.80815751 415.06825419 414.71331964
 414.36561108 414.00068661 413.80881529 413.36450797 413.29015399
 412.92582724 412.57053609 413.3657093  413.97660753 414.06558813
 414.18218108 413.79507136 

In [None]:
# for t in tones:
#     audio = librosa.tone(t, duration=1)
#     print(f'Playing t {t}')
#     display(ipd.Audio(audio, rate=sr))

# Wide figure so individual onsets remain distinguishable
onset_fig, onset_ax = plt.subplots(figsize=(14, 6))

# Pitch track over time
onset_ax.plot(time, frequency)
onset_ax.set(xlabel="Time", ylabel="Frequency", title="Crepe Frequencies")

# Mark every detected onset with a red vertical line
for t in onsets:
    onset_ax.axvline(x=round(t, 2), color="r", linestyle="-", label=f"Time {t}")


# # Plot stable segments
# for idxs in stable_segments_idxs:
#     plt.axvspan(idxs[0], idxs[1], color='green', alpha=0.5)
# plt.show()

In [76]:
# Collapse the per-onset tone estimates into a shorter list of frequencies.
# NOTE(review): judging by the helper's name it clusters close frequencies and
# averages each cluster; the grouping tolerance lives in utils — confirm there.
grouped_averaged_frequencies = u.group_and_average_frequencies(tones)
print(grouped_averaged_frequencies)

[303.9454046422517, 324.12695849644547, 414.54278332379704, 445.6127408141946, 576.2408809119124, 609.8608748708061, 656.3014671575204, 842.5190825896454, 859.7537399485806, 904.8358960755265]


In [77]:
# Audition each grouped frequency as a one-second pure tone.
# The tone is synthesised at the same sample rate that is used for playback:
# without sr=sr, librosa.tone uses its own default rate, and any mismatch with
# `rate=sr` below would shift the pitch that is actually heard.
for tone in grouped_averaged_frequencies:
    audio = librosa.tone(tone, sr=sr, duration=1)
    print(f"Playing tone {tone}")
    display(ipd.Audio(audio, rate=sr))

Playing tone 303.9454046422517


Playing tone 324.12695849644547


Playing tone 414.54278332379704


Playing tone 445.6127408141946


Playing tone 576.2408809119124


Playing tone 609.8608748708061


Playing tone 656.3014671575204


Playing tone 842.5190825896454


Playing tone 859.7537399485806


Playing tone 904.8358960755265


In [83]:

# TODO: make sure grouped averaged frequencies corresponds to ding to ding one octave above
# Only the six lowest grouped tones are used: 6 tones -> 5 intervals
scale_tones = grouped_averaged_frequencies[:6]
tsc = TuningSimilarityComputer(fqs=scale_tones)

# Keep the result in `foo` and show it as the cell output
foo = tsc.compute_tuning_similarity()
foo

Estimated freqs (Hz): [303.9454046422517, 324.12695849644547, 414.54278332379704, 445.6127408141946, 576.2408809119124, 609.8608748708061]
Computed intervals (cents)  [111.29617391936381, 425.9543830259677, 125.12337368443923, 445.0578390589076, 98.1697788729507]


array([[0.46619755, 0.58522496, 0.596218  ]])