In [5]:
import ParsingNote as pn
import tensorflow as tf
from tensorflow.contrib.framework.python.ops import audio_ops

# Builds the TensorFlow graph that (1) decodes a WAV file and (2) turns a slice
# of audio samples into an encoded 128x128 spectrogram image.

# Placeholder that receives the WAV file name
wav_file = tf.placeholder(tf.string)

# Read the raw bytes of the WAV file
audio_binary = tf.read_file(wav_file)

# Decode the wav as mono into a 2D tensor with time in dimension 0
# and channel along dimension 1
waveform = audio_ops.decode_wav(audio_binary, desired_channels=1)

# Placeholder that receives the (already sliced) audio samples
slice_audio=tf.placeholder(tf.float32)

# Build the spectrogram from the sliced audio
spectrogram = audio_ops.audio_spectrogram(
        slice_audio,
        window_size=1024,
        stride=64)

# Brightness multiplier, fed at run time (the cells in this notebook feed 100)
brightness = tf.placeholder(tf.float32, shape=[])
mul = tf.multiply(spectrogram, brightness)

# Cap the scaled values at 255 so they fit the uint8 pixel range used below
# (despite the name, min_const is the upper bound passed to tf.minimum)
min_const = tf.constant(255.)
minimum =  tf.minimum(mul, min_const)

# Append a trailing dimension so the tensor has the rank the resize op expects
expand_dims = tf.expand_dims(minimum, -1)

# Resize the spectrogram to the input size of the model (128 x 128 here)
resize = tf.image.resize_bilinear(expand_dims, [128, 128])

# Drop dimension 0 again (note: this is the leading dimension, not the
# trailing one) -- presumably the size-1 batch/channel axis; TODO confirm
squeeze = tf.squeeze(resize, 0)

# Tensorflow spectrogram has time along y axis and frequencies along x axis,
# so flip and transpose the image to fix that
flip = tf.image.flip_left_right(squeeze)
transpose = tf.image.transpose_image(flip)

# Convert image to 3 channels, it's still a grayscale image however
grayscale = tf.image.grayscale_to_rgb(transpose)

# Cast to uint8 and encode as PNG or as JPEG
cast = tf.cast(grayscale, tf.uint8)
png = tf.image.encode_png(cast)
jpg = tf.image.encode_jpeg(cast)

# Single-channel (depth = 1) variant of the image, encoded as JPEG
gray_cast = tf.cast(transpose, tf.uint8)
gray_jpg = tf.image.encode_jpeg(gray_cast)

# Map a time offset (in seconds) to a sample index
def find_index(audio, sample_rate, t):
    """Return the index of the sample closest to time ``t``.

    The audio is treated as spanning ``0`` to ``len(audio) / sample_rate``
    seconds, and ``t`` is mapped linearly onto the index range
    ``0`` to ``len(audio) - 1``.

    Args:
        audio: Sized sequence of samples (anything supporting ``len``).
        sample_rate: Number of samples per second.
        t: Time offset in seconds, measured from the start of ``audio``.

    Returns:
        int: The rounded sample index. It is never negative. It can exceed
        ``len(audio) - 1`` when ``t`` lies past the end of the audio, which
        is harmless when the value is used as a slice bound.
    """
    n = len(audio)
    if n == 0:
        # No samples: the total length would be 0 s and the mapping below
        # would divide by zero.
        return 0

    total_seconds = n / sample_rate
    i = round((n - 1) * t / total_seconds)

    # A negative index would silently wrap around to the end of the audio
    # when used as a slice bound, so floor it at the first sample.
    return max(int(i), 0)

# Cut a piece out of the audio: audio, sample rate, start time, length in seconds
def slicing_audio(audio, sample_rate, start_time, duration):
    """Return a copy of the samples from ``start_time`` lasting ``duration``.

    Args:
        audio: Sample container that supports slicing and whose slices
            provide ``.copy()``.
        sample_rate: Number of samples per second.
        start_time: Where the slice begins, in seconds.
        duration: Length of the slice in seconds (not an end time).

    Returns:
        An independent copy of ``audio[start:end]``, where both bounds are
        obtained from ``find_index``.
    """
    bounds = slice(
        find_index(audio, sample_rate, start_time),
        find_index(audio, sample_rate, start_time + duration),
    )
    return audio[bounds].copy()

In [36]:
# Base name shared by the input audio, the note file and the output images
audio_name = "Tooi_Sekai_no_Toki_to_Sora"

# Input WAV file
audio_path = "".join(["./Audio/", audio_name, ".wav"])

# Output directory that receives the spectrogram images
dir_name = "".join([audio_name, "_Spectrogram"])
dir_path = "".join(["./Output/", dir_name])

# Note file belonging to this audio
note_file_name = "".join([audio_name, "_Note.txt"])
note_path = "".join(["./Note/", note_file_name])

# Prepare the output directory, then load the parsed notes
pn.MakeOutputDir(dir_path)
parsing_note = pn.ParsingNoteFile(note_path)

In [37]:
# Cut the audio into consecutive 0.1 s slices and write one spectrogram per slice
with tf.Session() as sess:
    # Decode the WAV once to obtain the samples and their sample rate
    sample_rate, audio = sess.run([waveform.sample_rate, waveform.audio],
                                  feed_dict={wav_file: audio_path})

    # Number of whole slices = total length / slice length (0.1 s)
    total_time = len(audio) / sample_rate
    duration_time = 0.1
    num = int(total_time / duration_time)

    for i in range(num):
        # Cut out slice i. slicing_audio() takes a *length*, not an end time,
        # and adds it to start_time itself, so every slice must be passed the
        # constant slice length. (Passing 0.1 * (i + 1) here made slice i span
        # 0.1*i .. 0.2*i + 0.1 seconds, i.e. the slices kept growing.)
        start_time = duration_time * i
        duration = duration_time
        new_audio = slicing_audio(audio, sample_rate, start_time, duration)

        # Render the slice as an encoded JPEG spectrogram
        result_image = sess.run(jpg, feed_dict={
            wav_file: audio_path, slice_audio: new_audio, brightness: 100})

        # Classify the image: slices listed in the note file go into the
        # sub-directory named by their note value, all others go into 'N'
        image_name = str(i).zfill(4)
        label = parsing_note[image_name] if image_name in parsing_note else 'N'
        file_path = "".join([dir_path, '/', label, '/', audio_name, '_', image_name, '_output.jpg'])

        # Write the image to disk
        with open(file_path, 'wb') as f:
            f.write(result_image)

        # Progress report every 100 slices
        if i % 100 == 0:
            print("step : ", i)

    print("done")

step :  0
step :  100
step :  200
step :  300
step :  400
step :  500
step :  600
step :  700
step :  800
step :  900
step :  1000
done


In [5]:
# Render the spectrogram of one single slice
with tf.Session() as sess:
    # Decode the WAV to obtain the samples and their sample rate
    decoded = sess.run([waveform.sample_rate, waveform.audio],
                       feed_dict={wav_file: audio_path})
    sample_rate, audio = decoded

    # Cut out 0.1 s of audio starting at 15.3 s
    new_audio = slicing_audio(audio, sample_rate, start_time=15.3, duration=0.1)

    # Turn the slice into an encoded JPEG spectrogram
    image_feed = {
        wav_file: audio_path,
        slice_audio: new_audio,
        brightness: 100,
    }
    result_image = sess.run(jpg, feed_dict=image_feed)

    # Write the image to disk
    with open('output2.jpg', 'wb') as f:
        f.write(result_image)

    print("done")

done
