In [1]:
import tensorflow as tf
import numpy as np
import librosa
import IPython.display as ipd

In [2]:
# Clip to analyse and the sample rate requested from the loader.
sample_rate = 44100
audio_path = './assets/electronic_piano/HM_120_AF_EPiano5.wav'

# Decode as mono; the rate returned by librosa is discarded.
original_audio_data, _ = librosa.core.load(path=audio_path, sr=sample_rate, mono=True)
# Keep only the first quarter of the decoded samples.
original_audio_data = original_audio_data[:original_audio_data.shape[0] // 4]

In [3]:
def get_audio_input_frame_size(sequence_length, window_size, hop_size):
    """Return the length of one window followed by ``sequence_length`` hops.

    Args:
        sequence_length: Number of hops appended after the first window.
        window_size: Length of a single analysis window.
        hop_size: Step between the starts of consecutive windows.

    Returns:
        ``window_size + sequence_length * hop_size``. A non-positive
        ``sequence_length`` contributes no hops, so the result is then just
        ``window_size``.
    """
    # Closed form of "add hop_size once per step". Clamping at zero keeps the
    # result for negative sequence lengths equal to the bare window size.
    return window_size + max(sequence_length, 0) * hop_size

In [11]:
# STFT round trip: analyse a sliding window of audio, convert the last STFT
# frame to polar form and back, invert it, and overlap-add the results.
fft_size = 256
window_size = 256
hop_size = 64
sequence_length = 15

input_frame_size = get_audio_input_frame_size(sequence_length,
                                              window_size,
                                              hop_size)

audio_placeholder = tf.placeholder(dtype=tf.float32, shape=[None, input_frame_size])

stfts = tf.contrib.signal.stft(fft_length=fft_size,
                               frame_length=window_size,
                               frame_step=hop_size,
                               signals=audio_placeholder)

with tf.variable_scope("cart2polar"):
    # Only the last STFT frame of each fed window is kept.
    magnitudes = tf.abs(stfts)[:, -1]
    phases = tf.angle(stfts)[:, -1]

with tf.variable_scope("polar2cart"):
    converted_real = magnitudes * tf.cos(phases)
    converted_imag = magnitudes * tf.sin(phases)
    converted_stfts = tf.complex(converted_real, converted_imag)

# inverse_stft expects [..., frames, bins]. Re-insert the frame axis dropped by
# the [:, -1] slice so the batch axis is not mistaken for the frame axis; the
# result then has one row of window_size samples per batch item.
new_audio_tensor = tf.contrib.signal.inverse_stft(fft_length=fft_size,
                                                  frame_length=window_size,
                                                  frame_step=hop_size,
                                                  stfts=tf.expand_dims(converted_stfts, axis=1))

num_samples = len(original_audio_data)

# stft and inverse_stft both apply a periodic Hann window by default, so each
# reconstructed frame equals the input frame scaled by the window squared.
hann_window = 0.5 - 0.5 * np.cos(2.0 * np.pi * np.arange(window_size) / window_size)
squared_window = (hann_window ** 2).astype(np.float32)

# Accumulators for weighted overlap-add; both stay aligned with the input.
overlap_sum = np.zeros(num_samples, dtype=np.float32)
window_sum = np.zeros(num_samples, dtype=np.float32)

with tf.Session() as sess:

    # No variables are created in this cell; the initializer is kept as a harmless no-op.
    sess.run(tf.global_variables_initializer())

    # "+ 1" so the final full window (ending exactly at the last sample) is processed too.
    for start in range(0, num_samples - input_frame_size + 1, hop_size):
        end = start + input_frame_size

        frames = sess.run(new_audio_tensor, feed_dict={
            audio_placeholder: original_audio_data[start:end].reshape((1, -1))
        })

        # The last STFT frame covers the final window_size samples of the fed
        # window. Consecutive iterations are hop_size apart, so the frames
        # overlap and must be summed in place rather than concatenated.
        frame_start = end - window_size
        overlap_sum[frame_start:end] += frames[0]
        window_sum[frame_start:end] += squared_window

# Undo the squared-window weighting. Samples with (almost) no window coverage,
# i.e. the lead-in before the first reconstructed frame and the extreme edges,
# are left at zero instead of being divided by a near-zero weight.
covered = window_sum > 1e-3
all_audio = np.zeros(num_samples, dtype=np.float32)
all_audio[covered] = overlap_sum[covered] / window_sum[covered]
    

In [12]:
# Listen to the signal rebuilt from the STFT round trip.
ipd.Audio(data=np.asarray(all_audio).ravel(), rate=sample_rate)

In [None]:
# Reference playback of the loaded input clip, for comparison.
ipd.Audio(original_audio_data, rate=sample_rate)