In [4]:
from pydub import AudioSegment

In [5]:
# Decode the MP3 clip that the rest of the notebook uses as training audio.
audio_file = AudioSegment.from_file("../Data/short.mp3", format="mp3")
# NOTE(review): the value returned by set_channels(1) is not assigned to
# anything, so `audio_file` is left exactly as loaded -- a later cell reports
# audio_file.channels == 2. If a mono clip was intended, the result has to be
# kept, and the cells that size buffers from raw_data must be re-checked.
audio_file.set_channels(1)

In [6]:
import numpy as np

# Reinterpret the clip's raw bytes as integers and show how many elements that yields.
# NOTE(review): dtype='int' is NumPy's default integer type; its width may not
# match the clip's sample width, in which case each element is not one audio
# sample -- confirm against audio_file.sample_width / audio_file.frame_width.
np.frombuffer(audio_file.raw_data,dtype='int').shape

(86447,)

In [7]:
# Length of the clip as reported by pydub (presumably in milliseconds -- confirm).
len(audio_file)

1801

In [8]:
# Frame rate (frames per second) of the loaded clip.
audio_file.frame_rate

48000

In [9]:
# Channel count of the loaded clip; used as the last axis of the tensors that
# the generator emits and the discriminator consumes.
audiochannel=audio_file.channels
audiochannel

2

In [10]:
# Length of the clip along the time axis, in frames (one frame = one sample
# per channel). frame_count() derives this from the raw byte length and the
# frame width, so the value does not depend on the byte size of NumPy's
# platform-specific default integer the way a frombuffer(..., dtype='int')
# element count does.
audiosize=int(audio_file.frame_count())
audiosize

86447

In [11]:
# Maximum number of clips per training batch; also used by the training cells
# as the number of latent noise vectors to draw.
batch_size=3

In [12]:
import tensorflow as tf

def makebatch():
    """Wrap the loaded clip in a batched ``tf.data.Dataset`` for GAN training.

    Reads the notebook-level ``audio_file``, ``audiosize``, ``audiochannel``
    and ``batch_size``. The clip's interleaved PCM samples are rescaled to
    roughly ``[-1, 1)`` so that real data lives in the same range as the
    generator's ``tanh`` output, then shaped to
    ``(1, audiosize, audiochannel)`` -- a dataset holding a single example.

    Returns:
        tf.data.Dataset: yields float32 batches of shape
        ``(n, audiosize, audiochannel)`` with ``n <= batch_size``.

    Raises:
        ValueError: if the clip does not contain exactly
            ``audiosize * audiochannel`` samples.
    """
    # get_array_of_samples() returns one integer per sample with channels
    # interleaved (frame 0 ch 0, frame 0 ch 1, frame 1 ch 0, ...).
    samples = np.asarray(audio_file.get_array_of_samples(), dtype=np.float32)

    expected = audiosize * audiochannel
    if samples.size != expected:
        raise ValueError(
            "clip has %d samples but audiosize*audiochannel is %d"
            % (samples.size, expected)
        )

    # Scale integer PCM to the generator's tanh range.
    samples /= float(audio_file.max_possible_amplitude)

    # Row-major reshape of interleaved data puts time on axis 1 and the
    # channel on axis 2, matching the discriminator's input layout.
    x_train = samples.reshape(1, audiosize, audiochannel)

    return tf.data.Dataset.from_tensor_slices(x_train).batch(batch_size)

In [13]:
from tensorflow import keras

def build_generator_model(latent_dim=100, base_filters=16):
    """Build the generator: latent vector -> waveform.

    The network projects the latent vector to a short, multi-channel sequence
    and upsamples it by a factor of 4 with two stride-2 transposed
    convolutions. Because the clip length is not necessarily a multiple of 4,
    the sequence starts at ``ceil(audiosize / 4)`` steps and any surplus
    frames are cropped from the end so the output is exactly ``audiosize``
    frames long.

    Layer widths are fixed by ``base_filters`` rather than scaled with the
    clip length. The second Dense layer still holds
    ``1024 * ceil(audiosize / 4) * base_filters`` weights and dominates the
    model size, so lower ``base_filters`` if memory is tight.

    Reads the notebook-level ``audiosize`` and ``audiochannel``.

    Args:
        latent_dim: length of the input noise vector.
        base_filters: channel count after the reshape and of the first
            transposed convolution; the second uses half of it.

    Returns:
        keras.Sequential mapping ``(batch, latent_dim)`` to
        ``(batch, audiosize, audiochannel)`` with values in ``[-1, 1]``.
    """
    upsampling = 4  # product of the strides of the transposed convolutions
    seed_len = -(-audiosize // upsampling)  # ceiling division
    surplus = seed_len * upsampling - audiosize

    model = keras.Sequential()
    model.add(keras.Input(shape=(latent_dim,)))

    model.add(keras.layers.Dense(1024, use_bias=False))
    model.add(keras.layers.BatchNormalization())
    model.add(keras.layers.LeakyReLU())

    # The unit count must equal the number of elements in the Reshape target.
    model.add(keras.layers.Dense(seed_len * base_filters, use_bias=False))
    model.add(keras.layers.BatchNormalization())
    model.add(keras.layers.LeakyReLU())

    # Reshape takes the target shape as a single tuple of integers.
    model.add(keras.layers.Reshape((seed_len, base_filters)))

    model.add(keras.layers.Conv1DTranspose(base_filters, 5,
                                        strides=1, padding='same', use_bias=False))
    model.add(keras.layers.BatchNormalization())
    model.add(keras.layers.LeakyReLU())

    model.add(keras.layers.Conv1DTranspose(max(base_filters // 2, 1), 5,
                                        strides=2, padding='same', use_bias=False))
    model.add(keras.layers.BatchNormalization())
    model.add(keras.layers.LeakyReLU())

    model.add(keras.layers.Conv1DTranspose(audiochannel, 5,
                                        strides=2, padding='same', activation='tanh'))

    if surplus:
        # Drop the frames introduced by rounding the seed length up.
        model.add(keras.layers.Cropping1D((0, surplus)))

    assert model.output_shape == (None, audiosize, audiochannel)

    return model

In [14]:
def build_discriminator_model(base_filters=16, dense_units=64):
    """Build the discriminator: waveform -> single real/fake logit.

    Two stride-2 convolutions shorten the sequence by a factor of 4, the
    result is flattened and passed through one hidden Dense layer before the
    final one-unit output. The output is a raw logit (no sigmoid), which is
    what the ``from_logits=True`` losses in this notebook expect.

    Layer widths are fixed by the arguments rather than scaled with the clip
    length; the flattened feature vector still grows with ``audiosize``, so
    the hidden Dense layer holds about
    ``(audiosize / 4) * 2 * base_filters * dense_units`` weights.

    Reads the notebook-level ``audiosize`` and ``audiochannel``.

    Args:
        base_filters: filters of the first convolution; the second uses twice
            as many.
        dense_units: width of the hidden Dense layer.

    Returns:
        keras.Sequential mapping ``(batch, audiosize, audiochannel)`` to
        ``(batch, 1)``.
    """
    model = keras.Sequential()

    # One example is a whole clip: time steps x channels.
    model.add(keras.Input(shape=(audiosize, audiochannel)))

    model.add(keras.layers.Conv1D(base_filters, 5, strides=2, padding='same'))
    model.add(keras.layers.LeakyReLU(0.2))
    model.add(keras.layers.Dropout(0.3))

    model.add(keras.layers.Conv1D(base_filters * 2, 5, strides=2, padding='same'))
    model.add(keras.layers.LeakyReLU(0.2))

    model.add(keras.layers.Flatten())

    model.add(keras.layers.Dense(dense_units))
    model.add(keras.layers.LeakyReLU(0.2))
    model.add(keras.layers.Dropout(0.3))

    model.add(keras.layers.Dense(1))

    return model

In [15]:
import tensorflow as tf

def generator_loss(fake_output):
    """Score the generator by how "real" the discriminator finds its output.

    Args:
        fake_output: discriminator logits for generated samples.

    Returns:
        Binary cross-entropy between an all-ones target and ``fake_output``,
        with ``fake_output`` interpreted as logits.
    """
    bce = keras.losses.BinaryCrossentropy(from_logits=True)
    # The generator wants every fake to be classified as real (label 1).
    wanted_labels = tf.ones_like(fake_output)
    return bce(wanted_labels, fake_output)

In [16]:
def discriminator_loss(real_output,fake_output):
    """Score the discriminator on telling real samples from generated ones.

    Args:
        real_output: discriminator logits for real samples.
        fake_output: discriminator logits for generated samples.

    Returns:
        Sum of two binary cross-entropy terms computed on logits: real
        samples against a target of ones, generated samples against zeros.
    """
    bce = keras.losses.BinaryCrossentropy(from_logits=True)

    # Real samples should be judged as 1.
    loss_on_real = bce(tf.ones_like(real_output), real_output)
    # Generated samples should be judged as 0.
    loss_on_fake = bce(tf.zeros_like(fake_output), fake_output)

    return loss_on_real + loss_on_fake

In [17]:
# Instantiate both networks and the training dataset. The training cells
# read these three names as globals.
generator=build_generator_model()
discriminator=build_discriminator_model()
batch_data=makebatch()

: 

: 

In [None]:
# Each network gets its own Adam optimizer (constructed with 1e-4) because
# train_step applies the two sets of gradients independently.
generator_optimizer=keras.optimizers.Adam(1e-4)
discriminator_optimizer=keras.optimizers.Adam(1e-4)

In [None]:
@tf.function
def train_step(images):
    """Run one simultaneous update of the generator and the discriminator.

    Uses the notebook-level ``generator``, ``discriminator`` and their two
    optimizers.

    Args:
        images: batch of real clips, shaped
            ``(batch, audiosize, audiochannel)`` as the discriminator expects.
    """
    # Draw one latent vector per real example so the fake batch always has
    # the same size as the real one, even when the dataset yields fewer than
    # `batch_size` items. The latent width is taken from the generator itself.
    noise = tf.random.normal([tf.shape(images)[0], generator.input_shape[-1]])

    # Record the forward pass on two tapes so each network's gradients can be
    # taken with respect to its own loss.
    with tf.GradientTape() as gen_tape, tf.GradientTape() as disc_tape:
        generated_images = generator(noise, training=True)

        real_output = discriminator(images, training=True)
        fake_output = discriminator(generated_images, training=True)

        gen_loss = generator_loss(fake_output)
        disc_loss = discriminator_loss(real_output, fake_output)

    gradients_of_generator = gen_tape.gradient(gen_loss, generator.trainable_variables)
    gradients_of_discriminator = disc_tape.gradient(disc_loss, discriminator.trainable_variables)

    generator_optimizer.apply_gradients(zip(gradients_of_generator, generator.trainable_variables))
    discriminator_optimizer.apply_gradients(zip(gradients_of_discriminator, discriminator.trainable_variables))

In [None]:
import sounddevice as sd

def train(epochs=1000,show_freq=5,sample_rate=None):
    """Train the GAN and periodically play a generated clip.

    Uses the notebook-level ``batch_data``, ``generator`` and ``train_step``.

    Args:
        epochs: number of full passes over ``batch_data``.
        show_freq: play a preview after every ``show_freq``-th epoch;
            ``0`` or ``None`` disables previews.
        sample_rate: playback rate in Hz. Defaults to the frame rate of the
            loaded clip so previews play at the same speed as the source.
    """
    if sample_rate is None:
        sample_rate = audio_file.frame_rate

    latent_dim = generator.input_shape[-1]

    # Epochs are numbered 1..epochs so exactly `epochs` passes are made.
    for epoch in range(1, epochs + 1):
        for image_batch in batch_data:
            train_step(image_batch)

        if show_freq and epoch % show_freq == 0:
            # Only one clip is played, so only one is generated.
            noise = tf.random.normal([1, latent_dim])
            preview = generator.predict(noise, verbose=0)
            # sounddevice accepts (frames,) or (frames, channels) arrays, so
            # strip the batch axis. play() returns immediately; training
            # continues while the clip is heard.
            sd.play(preview[0], sample_rate)
    print('Done!')

In [None]:
# Start training with the default epoch count and preview frequency.
train()