# Pitch detection using the SPICE model

メモ: あとでmusic21, pydubのインストールをしておくこと

In [None]:
import tensorflow as tf
import tensorflow_hub as hub

import numpy as np
import matplotlib.pyplot as plt
import librosa
from librosa import display as librosadisplay

import logging
import math
import statistics
import sys

from IPython.display import Audio, Javascript
from scipy.io import wavfile

from base64 import b64decode

import music21
from pydub import AudioSegment

# Restrict the root logger to ERROR and above.
logger = logging.getLogger()
logger.setLevel(logging.ERROR)

# The attribute is `__version__`; the misspelled `__verson__` raised
# AttributeError and aborted the cell.
print(f'tensorflow: {tf.__version__}')



In [None]:
# JavaScript source that record() hands to IPython's Javascript display.
# It defines a `record(time)` function that:
#   * asks for microphone access through navigator.mediaDevices.getUserMedia,
#   * records with MediaRecorder while `sleep(time)` (a setTimeout wrapper)
#     is pending,
#   * on stop, joins the collected chunks into a Blob and resolves with the
#     result of FileReader.readAsDataURL on that Blob.
RECORD = """
const sleep = time => new Promise(resolve => setTimeout(resolve, time))
const b2text = blob => new Promise(resolve => {
    const reader = new FileReader()
    reader.onloadend = e => resolve(e.srcElement.result)
    reader.readAsDataURL(blob)
})
var record = time => new Promise(async resolve => {
    stream = await navigator.mediaDevices.getUserMedia({audio: true})
    recorder = new MediaRecorder(stream)
    chunks = []
    recorder.ondataavailable = e => chunks.push(e.data)
    recorder.start()
    await sleep(time)
    recorder.onstop = async ()=>{
        blob = new Blob(chunks)
        text = await b2text(blob)
        resolve(text)
    }
    recorder.stop()
})
"""

In [None]:
def record(sec=5):
    """Record audio through the Colab front end and write it to disk.

    Args:
        sec: Recording length in seconds; it is multiplied by 1000 before
            being passed to the JavaScript ``record`` function.

    Returns:
        The name of the file the decoded recording was written to, or
        ``None`` when ``google.colab`` cannot be imported.
    """
    try:
        from google.colab import output
    except ImportError:
        print('no possible to import output from google.colab')
        return None

    print('Recording')
    display(Javascript(RECORD))
    data_url = output.eval_js('record(%d)' % (sec*1000))

    fname = 'recorded_audio.wav'
    print('Saving to', fname)

    # Only the part after the first comma is base64-decoded and stored.
    encoded_payload = data_url.split(',')[1]
    with open(fname, 'wb') as f:
        f.write(b64decode(encoded_payload))
    return fname

### Input your audio

In [None]:
# Where the audio comes from. Recognized values:
#   'RECORD'            - record from the microphone via record()
#   'UPLOAD'            - upload a file through google.colab.files
#   './drive/...'       - a path on a mounted Google Drive
#   'http...'           - a file from the web
INPUT_SOURCE = 'https://storage.googleapis.com/download.tensorflow.org/data/c-scale-metronome.wav'

print(f'You selected {INPUT_SOURCE}')

# Every recognized branch is meant to leave the chosen file's name in
# `uploaded_file_name`, which the conversion step reads afterwards.
if INPUT_SOURCE == 'RECORD':
    uploaded_file_name = record(5)
elif INPUT_SOURCE == 'UPLOAD':
    try:
        from google.colab import files
    except ImportError:
        print("ImportError: files from google.colab seems to not be available")
    else:
        uploaded = files.upload()
        for fn in uploaded:
            print(f'User uploaded file {fn} with length {len(uploaded[fn])} bytes')
        # Only the first uploaded file is used.
        uploaded_file_name = next(iter(uploaded))
        print(f'Uploaded file: {uploaded_file_name}')

elif INPUT_SOURCE.startswith('./drive/'):
    try:
        from google.colab import drive
    except ImportError:
        # This branch imports `drive`, not `files`.
        print("ImportError: drive from google.colab seems to not be available")
    else:
        drive.mount('/content/drive')
    # don't forget to change the name of the file you will use here!
    gdrive_audio_file = 'your audio file here'
    # Was assigned to `uploaded_file_Name` (capital N), which left
    # `uploaded_file_name` undefined for the rest of the notebook.
    uploaded_file_name = INPUT_SOURCE
elif INPUT_SOURCE.startswith('http'):
    # NOTE(review): the download command below is commented out, so
    # 'c-scale.wav' must already exist in the working directory.
    # wget --no-check-certificate 'https://storage.googleapis.com/download.tensorflow.org/data/c-scale-metronome.wav' -O c-scale.wav
    uploaded_file_name = 'c-scale.wav'
else:
    print('Unrecognized input format!')
    print('Please select "RECORD", "UPLOAD", or specify a file hosted on Google Drive or a file from the web to download')


### Set Audio data

In [None]:
# Sampling rate the audio is converted to before being fed to the model.
EXPECTED_SAMPLE_RATE = 16000  # 16 kHz


def convert_audio_for_model(user_file, output_file='converted_audio_file.wav'):
    """Convert user-supplied audio to the format the model expects.

    The audio is set to a frame rate of ``EXPECTED_SAMPLE_RATE`` (16 kHz)
    and to a single channel (mono), then exported as a WAV file.

    Args:
        user_file: The audio to convert, in any form accepted by
            ``AudioSegment.from_file``.
        output_file: Destination of the converted WAV file.

    Returns:
        ``output_file``, unchanged, so the call can be used inline.
    """
    audio = AudioSegment.from_file(user_file)
    audio = audio.set_frame_rate(EXPECTED_SAMPLE_RATE).set_channels(1)
    audio.export(output_file, format="wav")
    return output_file

In [None]:
# Convert the selected audio to the format the model expects. Each of the
# input methods is expected to have left the file name in the variable
# `uploaded_file_name`.
converted_audio_file = convert_audio_for_model(uploaded_file_name)

In [None]:
# Load the audio samples from the converted wav file.
# The second positional parameter of wavfile.read is `mmap`, not a file
# mode; the former 'rb' argument was truthy and silently enabled memory
# mapping, so it is dropped here.
sample_rate, audio_samples = wavfile.read(converted_audio_file)

# Show some basic information about the audio
duration = len(audio_samples) / sample_rate
print(f'Sample rate: {sample_rate} Hz')
print(f'Total duration: {duration:.2f}s')
print(f'Size of the input: {len(audio_samples)}')

# Listen to the wav file (kept as the cell's last expression so the
# notebook renders the player)
Audio(audio_samples, rate=sample_rate)

In [None]:
# Visualize the audio as a waveform.
# The return value is bound to `_`, presumably so the notebook does not
# echo it below the cell.
_ = plt.plot(audio_samples)

### Spectrogram

In [None]:
# Divisor used to scale the samples before plotting and inference
# (presumably the int16 magnitude limit, 2**15 -- confirm input is 16-bit).
MAX_ABS_INT16 = 32768.0


def plot_stft(x, sample_rate, show_black_and_white=False):
    """Draw a dB-scaled STFT spectrogram of ``x`` on a new 20x10 inch figure.

    Args:
        x: Audio samples handed to ``librosa.stft`` (``n_fft=2048``).
        sample_rate: Sampling rate forwarded to ``specshow`` as ``sr``.
        show_black_and_white: When true, draw with the ``gray_r`` colormap
            instead of the default one.
    """
    magnitude = np.abs(librosa.stft(x, n_fft=2048))
    fig, ax = plt.subplots()
    fig.set_size_inches(20, 10)
    magnitude_db = librosa.amplitude_to_db(magnitude, ref=np.max)

    # Both rendering modes share every argument except the colormap.
    specshow_kwargs = {'data': magnitude_db, 'y_axis': 'log', 'sr': sample_rate}
    if show_black_and_white:
        specshow_kwargs['cmap'] = 'gray_r'
    librosadisplay.specshow(**specshow_kwargs)

    plt.colorbar(format='%+2.0f dB')

# Plot the spectrogram of the samples scaled by 1 / MAX_ABS_INT16.
# NOTE(review): EXPECTED_SAMPLE_RATE is passed instead of the `sample_rate`
# read from the wav file; the two agree only if the conversion step
# produced audio at that rate -- confirm.
plot_stft(audio_samples / MAX_ABS_INT16 , sample_rate=EXPECTED_SAMPLE_RATE)
plt.show()

In [None]:
# Scale the samples by 1 / MAX_ABS_INT16 (assumes 16-bit PCM input -- TODO
# confirm). NOTE: this rebinds `audio_samples`, so running the cell a second
# time scales the data again.
audio_samples = audio_samples / float(MAX_ABS_INT16)

### Execute the model

In [None]:
# Load the SPICE model from the tfhub.dev URL below via hub.load.
model = hub.load("https://tfhub.dev/google/spice/2")

In [None]:
# Run the model's 'serving_default' signature on the audio samples (as a
# float32 tensor) and read the 'pitch' and 'uncertainty' entries of its output.
model_output = model.signatures['serving_default'](
    tf.constant(audio_samples, dtype=tf.float32))

pitch_outputs, uncertainty_outputs = (
    model_output['pitch'], model_output['uncertainty'])

# Confidence is taken as the complement of the reported uncertainty.
confidence_outputs = 1.0 - uncertainty_outputs

# Draw both series on one 20x10 inch figure.
fig, ax = plt.subplots()
fig.set_size_inches(20, 10)
ax.plot(pitch_outputs, label='pitch')
ax.plot(confidence_outputs, label='confidence')
ax.legend(loc='lower right')
plt.show()

In [None]:
# Plot only the pitch estimates whose confidence is at least 0.9
# (lower-confidence frames are dropped).
confidence_outputs = list(confidence_outputs)
pitch_outputs = [float(x) for x in pitch_outputs]

indices = range(len(pitch_outputs))
confident_pitch_outputs = [(i, p)
    for i, p, c in zip(indices, pitch_outputs, confidence_outputs) if c >= 0.9]
if confident_pitch_outputs:
    confident_pitch_outputs_x, confident_pitch_outputs_y = zip(*confident_pitch_outputs)
else:
    # zip(*[]) yields nothing to unpack; fall back to empty series so the
    # cell still renders an (empty) plot instead of raising ValueError.
    confident_pitch_outputs_x, confident_pitch_outputs_y = (), ()

fig, ax = plt.subplots()
# Was `set_SIZE_inches`, which raises AttributeError.
fig.set_size_inches(20, 10)
ax.set_ylim([0, 1])
plt.scatter(confident_pitch_outputs_x, confident_pitch_outputs_y)
# Was `confident_pitch_outpus_x`, which raises NameError. The second pass
# redraws the same points in red.
plt.scatter(confident_pitch_outputs_x, confident_pitch_outputs_y, c="r")

plt.show()