In [1]:
from __future__ import print_function
import scipy.constants as const
from IPython.core.display import HTML
from __future__ import division

from keras import backend as K
K.set_image_dim_ordering('th') # ensure our dimension notation matches

from keras.models import Sequential
from keras.layers import Dense, Dropout, Input
from keras.layers import Reshape
from keras.models import Model
from keras.layers.merge import _Merge
from keras.layers.core import Activation, Lambda
from keras.layers.normalization import BatchNormalization
from keras.layers.convolutional import UpSampling2D, Conv1D
from keras.layers.convolutional import Convolution2D, AveragePooling2D, Conv2DTranspose
from keras.layers.advanced_activations import LeakyReLU
from keras.layers.core import Flatten
from keras.optimizers import SGD, Adam
from keras import utils
import numpy as np
from scipy.io import wavfile
from PIL import Image, ImageOps
from functools import partial
import random
import argparse
import math
import wavfile24
import os
import os.path
import nnresample 

import glob

NP_RANDOM_SEED = 2000

Using TensorFlow backend.


In [2]:
# Set Model Hyperparameters
class BeatGanHyperParameters():
    def __init__(self, num_channels, batch_size, model_size, phase_shuffle_size, D_update_per_G_update):
        self.c = num_channels
        self.b = batch_size
        self.d = model_size
        self.n = phase_shuffle_size
        self.D_updates_per_G_update = D_update_per_G_update
        self.WGAN_GP_weight = 10

hp = BeatGanHyperParameters(2,64,64,2,5)

In [3]:
def get_generator():
    model = Sequential()
    model.add(Dense(input_dim=100, output_dim=256*hp.d))
    model.add(Reshape((1, 16, 16*hp.d), input_shape = (256*hp.d,)))
    model.add(Activation('relu'))
    model.add(Conv2DTranspose(8*hp.d, (1,25), strides=(1,4), padding="same", data_format='channels_last'))
    model.add(Activation('relu'))
    model.add(Conv2DTranspose(4*hp.d, (1,25), strides=(1,4), padding="same", data_format='channels_last'))
    model.add(Activation('relu'))
    model.add(Conv2DTranspose(2*hp.d, (1,25), strides=(1,4), padding="same", data_format='channels_last'))
    model.add(Activation('relu'))
    model.add(Conv2DTranspose(hp.d, (1,25), strides=(1,4), padding="same", data_format='channels_last'))
    model.add(Activation('relu'))
    model.add(Conv2DTranspose(hp.c, (1,25), strides=(1,4), padding="same", data_format='channels_last'))
    model.add(Activation('tanh'))
    model.add(Reshape((16384, hp.c), input_shape = (1, 16384, hp.c)))
    return model

def get_discriminator():
    def phase_shuffle(x):
        shuffle_amount = random.randint(-1*hp.n, hp.n)
        return K.concatenate((x[shuffle_amount:, :], x[:shuffle_amount, :]), axis=0)
    
    model = Sequential()
    model.add(Conv1D(hp.d, 25, strides=4, padding="same", input_shape=(16384, hp.c)))
    model.add(LeakyReLU(alpha=0.2))
    model.add(Lambda(lambda x: phase_shuffle(x)))
    model.add(Conv1D(2*hp.d, 25, strides=4, padding="same"))
    model.add(LeakyReLU(alpha=0.2))
    model.add(Lambda(lambda x: phase_shuffle(x)))
    model.add(Conv1D(4*hp.d, 25, strides=4, padding="same"))
    model.add(LeakyReLU(alpha=0.2))
    model.add(Lambda(lambda x: phase_shuffle(x)))
    model.add(Conv1D(8*hp.d, 25, strides=4, padding="same"))
    model.add(LeakyReLU(alpha=0.2))
    model.add(Lambda(lambda x: phase_shuffle(x)))
    model.add(Conv1D(16*hp.d, 25, strides=4, padding="same"))
    model.add(LeakyReLU(alpha=0.2))
    model.add(Reshape((256*hp.d, ), input_shape = (1, 16, 16*hp.d)))
    model.add(Dense(1))
    return model

def generator_containing_discriminator(generator, discriminator):
    model = Sequential()
    model.add(generator)
    model.add(discriminator)
    return model

def wasserstein_loss(y_true, y_pred):
    """Calculates the Wasserstein loss for a sample batch.
    The Wasserstein loss function is very simple to calculate. In a standard GAN, the discriminator
    has a sigmoid output, representing the probability that samples are real or generated. In Wasserstein
    GANs, however, the output is linear with no activation function! Instead of being constrained to [0, 1],
    the discriminator wants to make the distance between its output for real and generated samples as large as possible.
    The most natural way to achieve this is to label generated samples -1 and real samples 1, instead of the
    0 and 1 used in normal GANs, so that multiplying the outputs by the labels will give you the loss immediately.
    Note that the nature of this loss means that it can be (and frequently will be) less than 0."""
    return K.mean(y_true * y_pred)


def gradient_penalty_loss(y_true, y_pred, averaged_samples, gradient_penalty_weight):
    """Calculates the gradient penalty loss for a batch of "averaged" samples.
    In Improved WGANs, the 1-Lipschitz constraint is enforced by adding a term to the loss function
    that penalizes the network if the gradient norm moves away from 1. However, it is impossible to evaluate
    this function at all points in the input space. The compromise used in the paper is to choose random points
    on the lines between real and generated samples, and check the gradients at these points. Note that it is the
    gradient w.r.t. the input averaged samples, not the weights of the discriminator, that we're penalizing!
    In order to evaluate the gradients, we must first run samples through the generator and evaluate the loss.
    Then we get the gradients of the discriminator w.r.t. the input averaged samples.
    The l2 norm and penalty can then be calculated for this gradient.
    Note that this loss function requires the original averaged samples as input, but Keras only supports passing
    y_true and y_pred to loss functions. To get around this, we make a partial() of the function with the
    averaged_samples argument, and use that for model training."""
    gradients = K.gradients(K.sum(y_pred), averaged_samples)
    gradient_l2_norm = K.sqrt(K.sum(K.square(gradients)))
    gradient_penalty = gradient_penalty_weight * K.square(1 - gradient_l2_norm)
    return gradient_penalty

class RandomWeightedAverage(_Merge):
    """Takes a randomly-weighted average of two tensors. In geometric terms, this outputs a random point on the line
    between each pair of input points.
    Inheriting from _Merge is a little messy but it was the quickest solution I could think of.
    Improvements appreciated."""

    def _merge_function(self, inputs):
        weights = K.random_uniform((64, 1, 1))
        return (weights * inputs[0]) + ((1 - weights) * inputs[1])

In [4]:
def add_white_noise(sound):
    wn = np.random.randint(-10000,10000,(len(sound), hp.c))
    sound = sound + wn
    return np.clip(sound, -8388608, 8388608-1)
    
def load_beat_data(policy):
    print("Loading data")
    X_train = []
    normalization_factor = 8388608
    num_versions = 5
    paths = glob.glob(os.path.normpath(os.getcwd() + '/ULTIMATE_DRUM_LOOPS/*.wav'))
    for i in range(len(paths)):
        sound = wavfile24.read(paths[i])
        if policy == 0:
            X_train.append(sound)
        elif policy == 1:
            wavfile.write('temp.wav', 14700, sound[1][::3])
            temp = wavfile.read('temp.wav')
            def get_length(path):
                if "125" in path:
                    return 14112
                elif "124" in path:
                    return 14226
                return 0
            length = get_length(paths[i])
            for _ in range(num_versions):
                a = add_white_noise (temp[1][:length])/normalization_factor
                b = np.zeros((16384 - length, 2))
                normed = np.concatenate((a,b))
                X_train.append(normed)
    return np.array(X_train) if policy == 1 else X_train

In [5]:
def generate_after_training(BATCH_SIZE):
    generator = generator_model()
    generator.compile(loss='binary_crossentropy', optimizer="SGD")
    generator.load_weights('goodgenerator.h5')
    
    noise = np.zeros((BATCH_SIZE, 100))
    for i in range(BATCH_SIZE):
        noise[i, :] = np.random.uniform(-1, 1, 100)
    generated_audio = generator.predict(noise, verbose=1)
    print(generated_audio.shape)
    for audio in generated_audio:
        wavfile.write('thing.wav', 14700, audio)

def make_generator_model(X_train, generator, discriminator):
    for layer in discriminator.layers:
        layer.trainable = False
    discriminator.trainable = False
    
    generator_input = Input(shape=(100,))
    generator_layers = generator(generator_input)
    discriminator_layers_for_generator = discriminator(generator_layers)
    generator_model = Model(inputs=[generator_input], outputs=[discriminator_layers_for_generator])
    
    # We use the Adam paramaters from Gulrajani et al.
    generator_model.compile(optimizer=Adam(0.0001, beta_1=0.5, beta_2=0.9), loss=wasserstein_loss)
    return generator_model

def make_discriminator_model(X_train, generator, discriminator):
    for layer in discriminator.layers:
        layer.trainable = True
    for layer in generator.layers:
        layer.trainable = False
    discriminator.trainable = True
    generator.trainable = False
    
    real_samples = Input(shape=(16384, hp.c))
    generator_input_for_discriminator = Input(shape=(100,))
    generated_samples_for_discriminator = generator(generator_input_for_discriminator)
    discriminator_output_from_generator = discriminator(generated_samples_for_discriminator)
    discriminator_output_from_real_samples = discriminator(real_samples)
    averaged_samples = RandomWeightedAverage()([real_samples, generated_samples_for_discriminator])
    averaged_samples_out = discriminator(averaged_samples)
    
    partial_gp_loss = partial(gradient_penalty_loss,
                          averaged_samples=averaged_samples,
                          gradient_penalty_weight=10)
    partial_gp_loss.__name__ = 'gradient_penalty'  
    
    discriminator_model = Model(inputs=[real_samples, generator_input_for_discriminator],
                            outputs=[discriminator_output_from_real_samples,
                                     discriminator_output_from_generator,
                                     averaged_samples_out])
    
    discriminator_model.compile(optimizer=Adam(0.0001, beta_1=0.5, beta_2=0.9),
                            loss=[wasserstein_loss,
                                  wasserstein_loss,
                                  partial_gp_loss])
    return discriminator_model

def get_noise(shape):
    return np.random.uniform(-1, 1, shape).astype(np.float32)

def train(epochs, BATCH_SIZE):
    np.random.seed(NP_RANDOM_SEED)
    X_train = load_beat_data(1)
    np.random.shuffle(X_train)
    
    discriminator = get_discriminator()
    generator = get_generator()
    
    generator_model = make_generator_model(X_train, generator, discriminator)
    discriminator_model = make_discriminator_model(X_train, generator, discriminator)
    
    positive_y = np.ones((BATCH_SIZE, 1), dtype=np.float32)
    negative_y = -positive_y
    dummy_y = np.zeros((BATCH_SIZE, 1), dtype=np.float32)
    
    print("Number of batches", int(X_train.shape[0]/BATCH_SIZE))
    for epoch in range(epochs):
        print("Epoch is", epoch)
        dl, gl = {}, {}
        np.random.shuffle(X_train)
        for index in range(int(X_train.shape[0]/BATCH_SIZE)):       
            audio_batch = X_train[index*BATCH_SIZE:(index+1)*BATCH_SIZE].reshape(BATCH_SIZE, 16384, hp.c)
            noise = get_noise((BATCH_SIZE, 100))
            d_loss = discriminator_model.train_on_batch([audio_batch, noise], [positive_y, negative_y, dummy_y])
            dl = d_loss
            if index % hp.D_updates_per_G_update == 0:
                #print("batch %d d_loss : %s" % (index, d_loss))
                noise = get_noise((BATCH_SIZE, 100))
                g_loss = generator_model.train_on_batch(noise, positive_y)
                gl = g_loss
                #print("batch %d g_loss : %0.10f" % (index, g_loss))
        
        if epoch % 500 == 0:
            print("epoch %d d_loss : %s" % (epoch, dl))
            print("epoch %d g_loss : %0.10f" % (epoch, gl))
            generator.save_weights('weights/generator' + str(epoch) + '.h5', True)
            discriminator.save_weights('weights/discriminator' + str(epoch) + '.h5', True)
            generate_one(generator, epoch, 0)

def generate_one(generator, epoch, index):
    noise = get_noise((1,100))
    generated_audio = generator.predict(noise, verbose=1)
    q = np.array(generated_audio[0]*8388608).astype('int32')
    wavfile24.write('outputs/epoch' + ("%04d" % epoch) + 'index'+ ("%03d" % index) + '.wav', 14700, q, bitrate=24)
        
def generate_batch(generator, weights_file, batch_size):
    noise = get_noise((batch_size,100))
    generator.load_weights(weights_file)
    generated_audio = generator.predict(noise, verbose=1)
    re_normalization_factor = 8388608
    assumed_sample_length = 14112
    sample_rate = 14700
    for i in range(len(generated_audio)):
        output = generated_audio[i]
        q = np.array(output*re_normalization_factor).astype('int32')
        wavfile24.write('generated_outputs/output' + ("%03d" % i) + '.wav', sample_rate, np.concatenate((q[:assumed_sample_length], q[:assumed_sample_length])), bitrate=24)
 
# Based on qualitative analysis 0.10 is a good threshold for two samples being alike. Furthermore, audiotory analysis
# shows the scores to be relatively well correlated with the similarity of the waveforms. 
def compute_similarity_score(threshold):
    original_beats = load_beat_data(0)
    X_train = load_beat_data(1)
    generated_outputs = glob.glob(os.path.normpath('/home/narainsk/beat_gan/BeatsByGAN/generated_outputs/*.wav'))
    num_similar = 0
    normalization_factor = 8388608
    num_samples_compared = 14112
    for i in range(len(generated_outputs)):
        generated_output_file = generated_outputs[i]
        b = (wavfile24.read(generated_output_file)[1])/normalization_factor
        for i in range(len(original_beats)):
            a = X_train[i*5]
            error = np.sum(np.square(a[:num_samples_compared] - b[:num_samples_compared]))
            similarity = error/(np.sum(np.square(a[:num_samples_compared])))
            if similarity <= threshold:
                num_similar += 1
                break
    
    print (str(num_similar) + ' similar out of ' + str(len(generated_outputs)))
    return (num_similar*1.0)/len(generated_outputs)
    
#train(6100, hp.b) - this was the original training call, 6k epochs

In [6]:
generator = get_generator()
generate_batch(generator, 'weights/generator6000.h5', 40)
print (compute_similarity_score(0.15))

  This is separate from the ipykernel package so we can avoid doing imports until


Loading data




Loading data
9 similar out of 40
0.225


In [18]:
print (compute_similarity_score(0.10))

Loading data




Loading data
9 similar out of 40
0.225


In [13]:
# Test Script that lets you manually check similarity of a generated output vs the training set
original_beats = load_beat_data(0)
X_train = load_beat_data(1)
generated_outputs = glob.glob(os.path.normpath('/home/narainsk/beat_gan/BeatsByGAN/generated_outputs/*.wav'))
generated_output_file = generated_outputs[15]
print ('using file' + generated_output_file)
normalization_factor = 8388608
num_samples_compared = 14112
b = (wavfile24.read(generated_output_file)[1])/normalization_factor
for i in range(len(original_beats)):
    a = X_train[i*5]
    error = np.sum(np.square(a[:num_samples_compared] - b[:num_samples_compared]))
    similarity = error/(np.sum(np.square(a[:num_samples_compared])))
    if similarity < 0.5:
        print (i)
        print (similarity)
        print (original_beats[i][1][:6])
        wavfile24.write('similarities_test/similar' + str(i) + '.wav', 44100, original_beats[i][1] , bitrate=24)
        wavfile24.write('similarities_test/similar' + str(i) + 'downsampled.wav', 14700, original_beats[i][1][::3] , bitrate=24)

Loading data




Loading data
using file/home/narainsk/beat_gan/BeatsByGAN/generated_outputs/output015.wav
56
0.0944771932906
[[ 1412384  1528271]
 [-2175288 -2346744]
 [-2719620 -3184740]
 [ -599306 -1000781]
 [  980280   679230]
 [ 3036766  2878628]]
339
0.296423343549
[[ 1707163  1325793]
 [-2172580 -2394666]
 [-3003963 -3255791]
 [ -931994 -1307497]
 [ 1113924   747368]
 [ 3253257  3189736]]
347
0.224672853009
[[ 1848923  1415223]
 [-2055090 -2303770]
 [-2905551 -3184642]
 [ -848912 -1243150]
 [ 1181088   808796]
 [ 3342973  3266019]]


In [189]:
# Based on qualitative analysis 0.05 is a good threshold for two samples being alike. Furthermore, audiotory analysis
# shows the scores to be relatively well correlated with the similarity of the waveforms. 
def compute_similarity_score(threshold):
    original_beats = load_beat_data(0)
    X_train = load_beat_data(1)
    generated_outputs = glob.glob(os.path.normpath('/home/narainsk/beat_gan/BeatsByGAN/generated_outputs/*.wav'))
    num_similar = 0
    normalization_factor = 8388608
    num_samples_compared = 14112
    for i in range(len(generated_outputs)):
        generated_output_file = generated_outputs[i]
        b = (wavfile24.read(generated_output_file)[1])/normalization_factor
        for i in range(len(original_beats)):
            a = X_train[i*5]
            error = np.sum(np.square(a[:num_samples_compared] - b[:num_samples_compared]))
            similarity = error/(np.sum(np.square(a[:num_samples_compared])))
            if similarity <= threshold:
                num_similar += 1
    
    print (num_similar)
    print ('out of')
    print (len(generated_outputs))
    return (num_similar*1.0)/len(generated_outputs)

print (compute_similarity_score(0.05))

Loading data




Loading data
16
out of
40
0.4


In [None]:
def load_wavegan_paper_drumhit_data(policy):
    print("Loading data")
    X_train = []
    skip_list = set(['/home/narainsk/beat_gan/BeatsByGAN/drums/Roland JV 1080/MaxV - Guiro.wav'])
    normalization_factor = 32768
    paths = glob.glob(os.path.normpath(os.getcwd() + '/drums/*/*.wav'))
    for i in range(len(paths)):
        if paths[i] not in skip_list:
            sound = wavfile.read(paths[i])
            if policy == 0:
                X_train.append(sound)
            elif policy == 1:
                if sound[1].size <= 44100:
                    wavfile.write('temp.wav', 14700, sound[1][::3])
                    temp = wavfile.read('temp.wav')
                    normed = np.concatenate((temp[1], np.zeros(16384 - len(temp[1]))))/normalization_factor
                    X_train.append(normed)
    return np.array(X_train) if policy == 1 else X_train
X_train = load_wavegan_paper_drumhit_data(1)
np.random.shuffle(X_train)

In [14]:
wavfile24.write('a.wav', 44100, X_train[0][1], bitrate=24)

In [7]:
paths[-1]

NameError: name 'paths' is not defined

In [13]:
#sound = wavfile24.read(paths[0])
sound = wavfile24.read('/home/narainsk/beat_gan/BeatsByGAN/ULTIMATE_DRUM_LOOPS/tr07_drlp_124_Complete_Full.wav')
wavfile24.write('temp_start.wav', 44100, sound[1], bitrate=24)
wavfile24.write('temp_downsampled.wav', 14700, sound[1][::3], bitrate=24)
temp_downsampled = wavfile24.read('temp_downsampled.wav')
wavfile24.write('temp_reupsampled.wav', 44100, nnresample.resample(temp_downsampled[1], 44100, 14700), bitrate=24)

In [None]:
hp.n = 2
#shuffle_amount = random.randint(-1*hp.n, hp.n)
print (shuffle_amount)
x = a[1].reshape(len(a[1]),1)
print (x)
print (x[shuffle_amount:, :])
print (x[:shuffle_amount, :])
combined = np.concatenate((x[shuffle_amount:, :], x[:shuffle_amount, :]), axis=0)
print (combined)

In [None]:
generator = get_generator()
generator.compile(loss='binary_crossentropy', optimizer="SGD")
generator.load_weights('goodgenerator.h5')
noise = np.random.normal(0, 1, (1, 100))
print (noise[:,0:10])
generated_audio = generator.predict(noise, verbose=1)
q = np.array(generated_audio[0]*32768).astype('int16')
print (generated_audio[0][0:10])
print (q[0:10])
wavfile.write('thing1.wav', 14700, q)

array([[ 0.06183016,  0.11935139],
       [-0.00069547,  0.09105957],
       [ 0.03382051, -0.05227661],
       ..., 
       [ 0.        ,  0.        ],
       [ 0.        ,  0.        ],
       [ 0.        ,  0.        ]])

In [172]:
def add_white_noise(sound):
    wn = np.random.randint(-10000,10000,(len(sound), hp.c))
    sound = sound + wn
    return np.clip(sound, -8388608, 8388608-1)
    
renormed = np.array(X_train[-100]*8388608).astype('int32')
wavfile24.write('a_renormed.wav',14700, renormed, bitrate=24)
wavfile24.write('a_renormed_wn.wav',14700, add_white_noise(renormed), bitrate=24)


In [133]:
np.mean(renormed), np.min(renormed), np.max(renormed)

(1900.0692443847656, -7692914, 8378955)

In [134]:
a = add_white_noise(renormed)
np.mean(a), np.min(a), np.max(a)

(1897.4014892578125, -7694340, 8379862)

In [126]:
np.max(add_white_noise(renormed)), np.min(add_white_noise(renormed))

(8367980, -6697630)

In [None]:
normalization_factor = 32786
a = X_train[1345]
print (a)
print (min(a[1]), max(a[1]), a[1].size)
wavfile.write('a.wav', 44100, a[1])
print(a[1][::3])
wavfile.write('a_14700.wav', 14700, a[1][::3])
normed = (a[1][::3]/normalization_factor)
renormed = np.array(normed*normalization_factor).astype('int16')
print (renormed)
wavfile.write('a_renormed.wav',14700, renormed)

In [None]:
normalization_factor = 32786
a = X_train[2401]
print (a)
print (a[1].shape)
wavfile.write('a.wav', 44100, a[1])
if a[1].size <= 44100:
    wavfile.write('temp.wav', 14700, a[1][::3])
    temp = wavfile.read('temp.wav')
    normed = np.concatenate((temp[1], np.zeros(16384 - len(temp[1]))))/normalization_factor
    print (normed)
    renormed = np.array(normed*normalization_factor).astype('int16')
    wavfile.write('a_renormed.wav',14700, renormed) 

In [None]:
%matplotlib inline
import matplotlib.pyplot as plt
plt.hist([len(i) for i in x], bins="auto")
plt.show()
max_1 = 250000

In [None]:
plt.hist([len(i) for i in x if len(i) < max_1], bins="auto")
plt.show()
max_2 = 100000

In [None]:
plt.hist([len(i) for i in x if len(i) < max_2], bins="auto")
plt.show()
max_3 = 44100

In [None]:
plt.hist([len(i) for i in x if len(i) < max_3], bins="auto")
plt.show()

In [None]:
maxes = [1e99, max_1, max_2, max_3]
for m in maxes:
    print (len([len(i) for i in x if len(i) < m]))

In [None]:
paths = glob.glob(os.path.normpath(os.getcwd() + '/drums/*/*.wav'))
lens = []
for i in range(len(paths)):
    if i != 4963:
        x = wavfile.read(paths[i])
        if x[0] != 44100:
            print (x[0])

<function wavfile24.read>

In [None]:
BATCH_SIZE = 32
np.random.seed(NP_RANDOM_SEED)
np.random.shuffle(X_train)
data = X_train[:BATCH_SIZE]
data = data.reshape(BATCH_SIZE, 16384, 1)

noise = np.zeros((BATCH_SIZE, 100))
for i in range(BATCH_SIZE):
    noise[i, :] = np.random.uniform(-1, 1, 100)
generator = generator_model()
generated_data = generator.predict(noise, verbose=0)

discriminator = discriminator_model()
d_optim = Adam(lr=1e-4, beta_1=0.5, beta_2=0.9)
discriminator.compile(loss='binary_crossentropy', optimizer=d_optim)
discriminator.trainable = True
X = np.concatenate((data, generated_data))
y = [1] * BATCH_SIZE + [0] * BATCH_SIZE

for i in range(100):
    d_loss = discriminator.train_on_batch(X, y)
    print("batch %d d_loss : %0.10f" % (i, d_loss))

In [None]:
BATCH_SIZE = 32
np.random.seed(NP_RANDOM_SEED)
np.random.shuffle(X_train)
data = X_train[:BATCH_SIZE]
data = data.reshape(BATCH_SIZE, 16384, 1).astype('float32')

noise = np.zeros((BATCH_SIZE, 100))
for i in range(BATCH_SIZE):
    noise[i, :] = np.random.uniform(-1, 1, 100)
generator = generator_model()
generated_data = generator.predict(noise, verbose=0)