- RNN-GAN - Fourier-transformed audio data - complex values split into real and imaginary parts

In [5]:
import numpy as np
import tensorflow.keras as keras
import pandas as pd
import librosa
import matplotlib.pyplot as plt
import soundfile as sf

Add path to util-functions

In [6]:
import sys

#make the project's helper modules importable
#NOTE(review): this is a machine-specific absolute path; the notebook will not find the helpers on another machine
utilsDirectory = 'D:/1. Studium/7. Semester/BA/Birdvoice/utils'
sys.path.append(utilsDirectory)

Paths to the FT databases

In [7]:
#CSV files holding the Fourier-transformed recordings, one list per species
#(paths are relative to the notebook's working directory)
databasePathsParusMajor = [
    '../../../../dataPreprocessing/databases/ft/parusMajor/ParusMajorSongFT1.csv',
    '../../../../dataPreprocessing/databases/ft/parusMajor/ParusMajorSongFT5.csv',
]
databasePathsTurdusMerula = [
    '../../../../dataPreprocessing/databases/ft/turdusMerula/TurdusMerulaSongFT5.csv',
]
databasePathsCorvusCorone = [
    '../../../../dataPreprocessing/databases/ft/corvusCorone/CorvusCoroneCallFT5.csv',
]

Load data from chosen path

In [8]:
readSampleCount = 500

#number of columns per dataset-sample, by clip length
#(names suggest 1 s and 5 s clips - TODO confirm against the preprocessing step)
ONE_SECOND = 22
FIVE_SECONDS = 108

#read the first readSampleCount CSV rows, then fold every row into a 1025 x FIVE_SECONDS matrix
#NOTE(review): the reshape requires the file to contain at least readSampleCount rows
csvRows = pd.read_csv(databasePathsTurdusMerula[0], header=None, delimiter=',', index_col=None, nrows=readSampleCount)
x_train = csvRows.to_numpy().reshape([readSampleCount, 1025, FIVE_SECONDS])

#cast the values to complex128; the shape stays (samples, rows, columns)
x_train = x_train.astype('complex128')

SAMPLE_COUNT, AUDIO_ROWS, AUDIO_COLS = x_train.shape

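TODO: replace each complex value with its real part and insert the imaginary part in the column directly after it (real part at column 2*i, imaginary part at column 2*i+1)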

In [9]:
#split the complex data into a real-valued float32 array with twice as many columns:
#  column 2*i   holds the real part of original column i
#  column 2*i+1 holds the imaginary part of original column i
#The two strided slice assignments replace an element-by-element Python loop over
#SAMPLE_COUNT * AUDIO_ROWS * AUDIO_COLS values and produce the same array.
combinedArray = np.empty((SAMPLE_COUNT, AUDIO_ROWS, AUDIO_COLS*2), dtype=np.float32)
combinedArray[:, :, 0::2] = x_train.real
combinedArray[:, :, 1::2] = x_train.imag

In [10]:
#from here on x_train is the real-valued (interleaved real/imag) training data;
#it is an independent copy, so combinedArray is left untouched
x_train = np.copy(combinedArray, order='C')
SAMPLE_COUNT, AUDIO_ROWS, AUDIO_COMBINED = x_train.shape

Create optimizer

In [11]:
#Adam with learning rate 0.002 and first-moment decay (beta_1) 0.5
optimiser = keras.optimizers.Adam(learning_rate=0.002, beta_1=0.5)

Create the discriminator

In [12]:
from utils.discriminatorsFT import *

In [13]:
#build the discriminator for inputs of AUDIO_ROWS x AUDIO_COMBINED values
discriminator = buildDiscriminator1(AUDIO_ROWS, AUDIO_COMBINED)

#metrics is passed as a list: that is the documented form of the argument,
#and newer Keras releases may reject a bare string
discriminator.compile(
    loss="binary_crossentropy",
    optimizer=optimiser,
    metrics=["accuracy"],
)

Create the generator and GAN

In [14]:
from utils.generatorsFT import *

In [15]:
generator = buildGenerator4(AUDIO_ROWS, AUDIO_COMBINED)

#freeze the discriminator inside the stacked model so that training the GAN
#only updates the generator's weights
discriminator.trainable = False
gan = keras.models.Sequential([generator, discriminator])

#The stacked model gets its own optimizer instance, cloned from `optimiser` so the
#hyper-parameters stay identical. One optimizer object tracks state for the variables
#of the model it was first used with; reusing it for a second model with different
#variables fails on newer Keras releases.
ganOptimiser = type(optimiser).from_config(optimiser.get_config())

#metrics is passed as a list, the documented form of the argument
gan.compile(
    loss="binary_crossentropy",
    optimizer=ganOptimiser,
    metrics=["accuracy"],
)


Run the training:

In [16]:
from utils.trainingFT import trainFT

In [None]:
#training length and batch size
ITERATIONS = 50
BATCH_SIZE = 1

#run the adversarial training on the interleaved real/imag data
trainFT(
    data=x_train,
    columnCount=AUDIO_COMBINED,
    generator=generator,
    discriminator=discriminator,
    network=gan,
    iterations=ITERATIONS,
    batch_size=BATCH_SIZE,
)

Iteration: 0
d_loss_real: {'loss': 0.6938037872314453, 'accuracy': 0.0009756097570061684}
d_loss_gen: {'loss': 0.6964981555938721, 'accuracy': 0.0}
g_loss: [0.6916704773902893, 0.0]
Iteration: 1
d_loss_real: {'loss': 0.6923372745513916, 'accuracy': 0.0009756097570061684}
d_loss_gen: {'loss': 0.699492871761322, 'accuracy': 0.010731707327067852}
g_loss: [0.6949120163917542, 0.004878048785030842]
Iteration: 2
d_loss_real: {'loss': 0.6923531293869019, 'accuracy': 0.0}
d_loss_gen: {'loss': 0.7026169896125793, 'accuracy': 0.0}
g_loss: [0.6933498382568359, 0.0]
Iteration: 3
d_loss_real: {'loss': 0.6921148896217346, 'accuracy': 0.0019512195140123367}
d_loss_gen: {'loss': 0.709833025932312, 'accuracy': 0.0}
g_loss: [0.7006258368492126, 0.0]
Iteration: 4
d_loss_real: {'loss': 0.6961697340011597, 'accuracy': 0.0}
d_loss_gen: {'loss': 0.7039899230003357, 'accuracy': 0.0}
g_loss: [0.7116666436195374, 0.0]
Iteration: 5
d_loss_real: {'loss': 0.6925654411315918, 'accuracy': 0.0019512195140123367}
d_lo

Test of generator after training

In [None]:
#generate sound
noise = np.random.normal(0,0.01, (1, 1025, AUDIO_COMBINED))  
fake_audios = generator.predict(noise)

#combine split to complex values
complexDataArray = np.empty((1, 1025, AUDIO_COLS), dtype=np.complex128)
for lineIndex in range(complexDataArray.shape[0]):
    for rowIndex in range(complexDataArray.shape[1]):
        for colIndex in range(complexDataArray.shape[2]):
            complexDataArray[lineIndex][rowIndex][colIndex] = complex(fake_audios[lineIndex][rowIndex][colIndex*2], fake_audios[lineIndex][rowIndex][colIndex*2+1])

istftAudio = librosa.istft(complexDataArray[0])
glAudio = librosa.griffinlim(complexDataArray[0])

#show
plt.figure(figsize=(12, 6))
plt.plot(istftAudio*100)

plt.figure(figsize=(12, 6))
plt.plot(glAudio)

Save generated output to file

In [None]:
#write both reconstructions as wav files at a sample rate of 11025 Hz
#NOTE(review): the *100 gain on the inverse-STFT signal is a hand-picked scale;
#confirm the scaled samples stay within the range the writer accepts
for wavName, samples in (("tm5g4d1gen.wav", istftAudio*100), ("tm5g4d1gl.wav", glAudio)):
    sf.write(wavName, samples, 11025)