# Noize net
The following notebook contains the code used for both an LSTM and RNN used for investigating the feasibility of music generation using an RNN. 

Note that berevity of code both the RNN and LSTM are implimented in one code base. This does mean that the code is somewhat more complicated and necessitates the use of "LSTMBool" which defines throughout the code if operations will take place for an LSTM or RNN. 

Notes to reader:
* This notebook is a refactor of the main python file noizenet.py
* This work contains many code iterations and testing which means that there is legacy code scattered throughout the implimentation for different software schemes. Things like batching were tested and then removed. 


# Imports

First we must import all the necessary dependencies.

Then check if we can run on a GPU.

Additionally we can define the scaler for scaling input data and inverting a scale on output data. 

In [227]:
import matplotlib   
import numpy as np
import torch
from torch import nn
from torch.utils.data import TensorDataset, DataLoader
import librosa as lib
import os
import soundfile as sf #For writing
import matplotlib.pyplot as plt
import gc
import sklearn.utils, sklearn.preprocessing, sklearn.decomposition, sklearn.svm
import sklearn as skl
import pandas as pd
import utils
import librosa.display
import noise


# torch.autograd.detect_anomaly(True) #Check for errors and return a stack trace. (Used to debug nan loss)

# First checking if GPU is available
train_on_gpu=torch.cuda.is_available()

#Print if we are able to use a GPU
if(train_on_gpu):
    print('Processing on GPU.')
else:
    print('No GPU available.')

scaler = sklearn.preprocessing.StandardScaler()

Processing on GPU.


# Helper functions
Bellow we define some functions that will be used later in this work

## Time to FT
Convert an array of time/amplitude values to frequency domain

In [228]:
def time_to_fft(time_domain):
    # Compute FFT
    fft = np.fft.fft(time_domain)
    # Concatinate real and imaginary values
    new_input = np.concatenate((np.real(fft), np.imag(fft))) #Scheme that takes imaginary into account
    #new_input = np.real(fft) #Only consider real values
    return new_input

## FT to Time
Convert an array of fourier values to the time domain

In [229]:
def fft_to_time(ft_domain):
    # check if the input can be divinded in two and assign as needed
    if ft_domain.shape[0] % 2 == 0:
        num_elems = (int)(ft_domain.shape[0] / 2)
    else:
        ft_domain = ft_domain[0:-1]
        num_elems = (int)(ft_domain.shape[0] / 2)
    #If we took complex into account
    # Get real part 
    real = ft_domain[0:num_elems]
    # Get imaginary part
    imag = ft_domain[num_elems:]
    # Recompose real and Im parts
    composition = real + 1.0j * imag
    # ifft back to time domain
    
    #If we ignore complex values
    #composition = ft_domain
    time = np.fft.ifft(composition)
    return time

## Batching function 
This function is rarely used in this work but some testing was done with batches to check their performance.

In [230]:
def get_batches(arr, batch_size, seq_length):
    #Note this code was adapted fome Udacity course on machine learning. 
    batch_size_total = batch_size * seq_length
    # total number of batches we can make
    n_batches = len(arr)//batch_size_total
    
    # Keep only enough elements to fill a batch
    arr = arr[:n_batches * batch_size_total]
    # Reshape into batch_size rows
    arr = arr.reshape((batch_size, -1))
    
    # iterate through the array, one sequence at a time
    for n in range(0, arr.shape[1], seq_length):
        # The features
        x = arr[:, n:n+seq_length]
        # The targets, shifted by one
        y = np.zeros_like(x)
        try:
            y[:, :-1], y[:, -1] = x[:, 1:], arr[:, n+seq_length]
        except IndexError:
            y[:, :-1], y[:, -1] = x[:, 1:], arr[:, 0]
        yield x, y

## Perlin noise generator
For some input array convert each value to perlin noise. 

In [231]:
def genPerlin(x):
    tmp = []
    for xx in x:
        tmp.append(noise.pnoise1(xx)) #Generate new noise value based on single data point
    return tmp

# Defining the model
Then we can define the Model using the pytorch class definition.

The model contains the following layers:
* LSM or RNN layer 
* Dropout Layer
* Fully connected layer for translating oyutput

After defining the layers in the model we can define a forward function used for the formward pass during training and prediction. 
* Given c0 (Or None for RNN), the hidden state and some imput x.
* Pass the hidden state and input through the RNN layer and recieve some output
* Selectively choose values to drop (Note this only happens for hidden layers > 1)
* Pass the result through the fully connected layer to obtain the final result
* Return the prediction and hidden states

Additionally we define a helper funcation to generate the hidden states for an LSTM

In [232]:
class NoizeNet(nn.Module):
    def __init__(self, input_size, output_size, hidden_dim, n_layers, LSTMBool, dropout_prob):
        super(NoizeNet, self).__init__()

        self.LSTMBool = LSTMBool  #Boolean to determine if the model is RNN or LSTM
        self.hidden_dim = hidden_dim #The size of the hidden layer
        self.num_layers = n_layers #The number of hidden layers
        self.output_size = output_size #The size of the output dimension, always one for this work
        self.input_size = input_size #The size of the input dimension, always one for this work
        self.dropout_prob = dropout_prob #Prabability for dropout


        #If We want an LSTM define LSTM layer and initialise
        if LSTMBool:
            self.lstm = nn.LSTM(input_size, hidden_dim, n_layers, dropout=dropout_prob,batch_first=True)
        else:
            # define an RNN with specified parameters
            self.rnn = nn.RNN(input_size, hidden_dim, n_layers, dropout=dropout_prob, batch_first=True)
        
        #Define the dropout layer
        self.dropout = nn.Dropout(dropout_prob)

        # last, fully-connected layer
        self.fc = nn.Linear(hidden_dim, output_size)

    def forward(self, x, hidden, c0=None):
        batch_size = x.size(0)
        
        if (train_on_gpu):
            x.cuda()
        else:
            x.cpu()

        if self.LSTMBool:
            h_0 = hidden
            c_0 = c0
            #Rename variables as convention
            state = (h_0, c_0)
            # Propagate input through LSTM
            r_out, (hn, cn) = self.lstm(x, state) #lstm with input, hidden, and internal state
            hidden = hn
        else:
            # get RNN outputs
            r_out, hidden = self.rnn(x , hidden)

        r_out = self.dropout(r_out) #Dropout

        if (train_on_gpu):
            hidden.cuda()
        
        #Reshape output for fully connected layer
        r_out = r_out.view(-1, self.hidden_dim)

        # get final output
        output = self.fc(r_out)
        
        #Return states based on LSTM bool
        if self.LSTMBool:
            return output, (hidden, cn)
        else:
            return output, hidden
        
    def init_hidden(self, batch_size):
        weight = next(self.parameters()).data
        #Initialise the hidden state
        if (train_on_gpu):
            hidden = torch.autograd.Variable(torch.zeros(self.num_layers, batch_size, self.hidden_dim)).cuda() #hidden state
            c0 = torch.autograd.Variable(torch.zeros(self.num_layers, batch_size, self.hidden_dim)).cuda() #internal state
        else:
            hidden = torch.autograd.Variable(torch.zeros(self.num_layers, batch_size, self.hidden_dim)).cuda() #hidden state
            c0 = torch.autograd.Variable(torch.zeros(self.num_layers, batch_size, self.hidden_dim)).cuda() #internal state

        return hidden, c0

# Training
We can then define the training function. This function takes in the model, n_steps which is a bit of a misnoma as it is used as the number of full "frame" steps that will complete the impute sequence. An array of genreTracks which are translated into file names. Using AUDIO_DIR and file names we can read in the input songs. Step size is used to determine how large of a step to take in the training data. Duration is the duration of the input song to read in. Number of tracks is the number of songs that we will train on. Clip is the value of the griadient clipping. 

In this work we vary the training procedue among trails. However the procedure remains unchanged and is as follows:
* Ensure model is in training mode
* load validation data and perform and needed transformations/scaling
* Initialize hidden layers
* Loop over a number of input tracks
* At each loop iteration
    * Load and transform input training data
    * Define the needed variables
    * Check if input data contains a NaN value
    * For some number of steps that was calculated earlier
        * For some number of epochs 
            * Select some data
            * Ensure data has a shift in input and target
            * Convert to tensors and give the input a batch size dimension of one
            * Move data to GPU if we can
            * Get a prediction frome the LSTM or RNN
            * Detach hidden states from history
            * Zero gradients
            * Calculate loss and add it to the plot array
            * Perform backward step
            * Clip gradients
            * Take an optimizer step to optimize weights
            * Perform validation (I will not walk through this as it is the same procedure in eval mode)
    * Clean memory if the model is large
* Plot losses
* Return the trained model

In [234]:
def train(noizeNet, n_steps, AUDIO_DIR, genreTracks, LSTM ,step_size=1, duration=5, numberOfTracks=1, clip=5, fft_bool=False):
    fileCount = 0 #Used for displaying the file that is currently being trained on
    noizeNet.train() #Set the model to training mode
    lossArr = [] #Array used to plot loss over time
    val_losses = [] #Array used to store validation losses
    if(train_on_gpu):
        noizeNet.cuda() #Move the model to GPU if available

    val_file = utils.get_audio_path(AUDIO_DIR, genreTracks[-1]) #Get file name of validation data
    val_data, sr = lib.load(val_file, mono=True, duration = duration) #Load validation data
    
    
    if fft_bool:
        print("We are training on FFT data")
        val_data = time_to_fft(val_data) #FFT validation data
    val_data = scaler.fit_transform(val_data.reshape(-1,1)) #Scale data
    

    
    #Loop over all the files in our filtered list
    for id in genreTracks: 
        #Stop training after n files
        fileCount+=1
        if(fileCount > numberOfTracks):
            break 

        filename = utils.get_audio_path(AUDIO_DIR, id) #Get the actual path to the file from the id
        
        fileData, sr = lib.load(filename, mono=True, duration = duration) #Load training song
        fileData = fileData.reshape(-1,1) #Convert to correct shape
        
        if fft_bool:
            fileData = time_to_fft(fileData) #If we want to train on FFT data
        
        fileData = scaler.fit_transform(fileData) #Scale the data

        batch_size = (int)(duration*sr/n_steps) #Find the size of the window that slides across the input song
        number_of_steps = (int)(duration*sr)-batch_size #Assumes step size of one as a larger step size produced poor results
        if(number_of_steps == 0):
            number_of_steps = 1
        if(np.isnan(np.sum(fileData))):
            print("NAN ON FILE:\t", filename)
            break
        for e in range(0,100):
            if LSTM:
                    (hidden, c0) = noizeNet.init_hidden(1)
                else:
                    hidden = None
            for batch_i in (range(0,number_of_steps, step_size)):
                
                # for x, y in get_batches(fileData, 500, 50): #Note this was used in the testing of batched data
                
                # defining the training data
                data = fileData[batch_i: batch_size + batch_i]
                data = data.reshape(-1,1)   
                x = data[:-1] #Select all but the last element in the input data
                y = data[1:] #Select all but the first element in the input data. Essentially a forward shift in time

                # convert data into Tensors
                x_tensor = torch.Tensor(x).unsqueeze(0)  # unsqueeze gives a 1, batch_size dimension
                y_tensor = torch.Tensor(y)
                if(train_on_gpu):
                        x_tensor, y_tensor = x_tensor.cuda(), y_tensor.cuda() #Move tensors to GPU
                
                if LSTM:
                    prediction, (hidden, c0) = noizeNet(x_tensor, hidden, c0) #Get output from LSTM
                else:
                    prediction, hidden = noizeNet(x_tensor, hidden) #Get output from RNN
                
                #Detach hidden state from history
                hidden = hidden.data

                if LSTM:
                    c0 = c0.data #Detatch cell state from history


                # zero gradients
                optimizer.zero_grad()

                # calculate the loss
                
                loss = criterion(prediction, y_tensor)
                # if(np.isnan(loss)):
                #     break
                lossArr.append(loss.item())
                # perform backprop and update weights
                loss.backward()
                
                torch.nn.utils.clip_grad_value_(noizeNet.parameters(), clip) #Clip gradient
                optimizer.step()


                #Validation step. I won't comment the case as the logic is same as above. Note we are in eval mode though!
                if(int((batch_i)) % 1000 == 0):
                    if LSTM:
                        (val_h, val_c) = noizeNet.init_hidden(1)
                    else:
                        val_h = None

                    
                    noizeNet.eval()
                    val_x = val_data[:-1]
                    val_y = val_data[1:]

                    

                    for ep in range(0,10):
                        val_x = val_data[:-1]
                        val_y = val_data[1:]
                        val_x, val_y = torch.from_numpy(val_x), torch.from_numpy(val_y)
                        
                        
                        inputs, targets = val_x.reshape(-1,1).unsqueeze(0), val_y.reshape(-1,1)
                        if(train_on_gpu):
                            inputs, targets = inputs.cuda(), targets.cuda()
                        if LSTM:
                            output, (val_h, val_c) = noizeNet(inputs, val_h, val_c)
                        else:
                            output, val_h = noizeNet(inputs, val_h)
                        val_loss  = criterion(output, targets)

                        val_losses.append(val_loss.item())

                        if LSTM:
                            val_h = val_h.data
                            val_c = val_c.data
                        else:
                            val_h = val_h.data


                    print("Training Progress:\t", round((fileCount*e/(number_of_steps*numberOfTracks*100))*100, 2), "%"  , sep="")
                    print("P:\t", prediction.cpu().data.numpy().flatten()[-5:],"\nY:\t", y[-5:].flatten() ,"\nX:\t" , x[-5:].flatten() ,sep="")
                    print('Training Loss: ', loss.item(), "Validation loss: ", val_loss.item() , "\t num:", batch_i, "\t File:", fileCount)
                    noizeNet.train() # reset to train mode after validation
            
        #Clean unused variables to ensure memory is kept free
        del prediction
        del x_tensor
        del y_tensor
        del data
        del x
        del y
        gc.collect()
    #Plot validation loss and training loss
    fig, ax = plt.subplots(nrows=2)
    ax[0].plot(lossArr)
    ax[0].set(title="Training loss", ylabel="Loss", xlabel="Epochs")
    ax[1].plot(val_losses)
    ax[1].set(title="Validation loss", ylabel="Loss", xlabel="Epochs")
    fig.tight_layout()
    plt.show()
    return noizeNet

IndentationError: unindent does not match any outer indentation level (<tokenize>, line 47)

# Prediction and Music generation

Then we define the prediction function. 

In this work we have two prediction schemes that produce equivalent results. I will explain them independently. 

## Prediction scheme 1 
This prediction scheme itteratively runs through the sequence of input seed values and creates a prediction which is appended to the seed. This new sequence is used as a seed for the next prediction. Here we attempt to replace any knowledge of the old seed with purely knowledge from predicted values. 

This prediction sheme functions as follows
* Recieve the model, prediction seed track, duration of seed, prediction duration and LSTM bool
* Read in the seed data and perform any needed scaling/transformation
* Select some range of the seed and obtain a prediction from the model
* Record the last element in the prediction
* Append this predicted value to a music array that will become the generated song
* Shift the data window to exclude an early data point and append the predicted value
* Repeat this procedure until we have predicted enough points for the desired music output. 

In [None]:
def predict(noizeNet, genreTrack ,duration=1, n_steps=30, LSTMBool=False, predictDuration = 30, step_size=1, fft_bool=False):
    print("PREDICTING...")
    noizeNet.eval() #move to eval mode
    hidden = None #instantiate hidden
    c0 = None #Instantiate c0
    if LSTMBool:
        (hidden, c0) = noizeNet.init_hidden(1) #initialize hidden states
        
    filePath = utils.get_audio_path(AUDIO_DIR, genreTrack) #Get the actual path to the file from the id
    y, sr = lib.load(filePath, mono=True, duration = duration) #Read the input seed
    if fft_bool:
        y = time_to_fft(y) #Move to frequency domain if needed
    y = scaler.fit_transform(y.reshape(-1,1)) #Scale seed

    data = y

    # data = np.random.normal(-1,1,y.shape) #Generate normal noise
    # data = genPerlin(np.linspace(0,1,y.size)) #Generate perlin noise
    batch_size = (int)(sr*duration)

    number_of_steps = (int)(sr*predictDuration/step_size) #Calculate the number of steps

    music = [] #Instantiate the music array
    next = data[batch_size-1] #Find the "next value"

    sf.write('predictionSeed.wav', np.append( data[step_size: batch_size], next), sr,format="WAV") #Wite the seed
    print("BATCH SIZE:", batch_size ,sep="\t")
    print("NUMBER OF STEPS:", number_of_steps , sep="\t")

    
    for batch_i in (range(0, number_of_steps)):
        if LSTMBool:
            hidden, c0 = noizeNet.init_hidden(1) #init hidden states
        if(train_on_gpu):
            noizeNet.cuda() #Move to GPU
        data = data[batch_i: batch_size-1 + batch_i] #Select piece of data
        data = np.append(data,next) #Append data to music
        x = data.reshape(-1,1) #Ensure correct vector shape
        print(x.shape)
        if(np.isnan(np.sum(data))):
            print("data contains NAN", data) #Check if input seed has a NaN value

        x_tensor = torch.Tensor(x).unsqueeze(0)  # unsqueeze gives a 1, batch_size dimension
        if(train_on_gpu):
                x_tensor = x_tensor.cuda() #Move to GPU if possible

        if(LSTMBool):
            prediction, (hidden, c0) = noizeNet(x_tensor, hidden, c0) #Get prediction from LSTM
            c0 = c0.data #detach cell state from history
        else:
            prediction, hidden = noizeNet(x_tensor, hidden) #Get RNN prediction
        hidden = hidden.data #detach hidden state from history
        
        
        music = np.append(music,(prediction.cpu().data.numpy().flatten())[-1]) #Append predicted value to music
        next = prediction.cpu().data.numpy().flatten()[-1] #Set next to the predicted value
        
        #Print progress
        if(int((batch_i)) % 1000 == 0):
            print("PROGRESS:\t", round(((batch_i)/number_of_steps)*100, 2), "%"  , sep="")
            print("Prediction dimensions:\t", prediction.cpu().size(), "\t" ,prediction.cpu().data.numpy().flatten().size, "\nMusic dimensions:\t", music.size ,sep="")
       
    print(music)
    music = scaler.inverse_transform(music.reshape(-1,1)) #Invert scale
    if fft_bool:
        music = fft_to_time(music).astype('float32') #Move to time domain if needed
    
    #Write the song
    sf.write('/home/liam/Desktop/University/2021/MAM3040W/thesis/writeup/code/outputSoundFile.wav', (music), 22050,format="WAV")
    time_steps = np.linspace(0, len(music), len(music))
    plt.plot(time_steps, music,"b.",  markersize=0.1) #Plot predicted result
    plt.show()
    return music

## Prediction Scheme 2
This scheme is similar to the previous one, however, more efficient as we process the seed element by elements and then pass the hidden states to a new function that will begin predicting new values from the last predicted value from the seed and hidden states. 

In [None]:
def predict2(noizeNet, genreTrack ,duration=1, n_steps=30, LSTMBool=False, predictDuration = 30, step_size=1, fft_bool=False):
    print("PREDICTING...")
    noizeNet.eval() #Ensure we are in eval mode

    if LSTMBool:
        hidden, c0 = noizeNet.init_hidden(1) #init hidden state and cell state
    else:
        hidden = None
    filePath = utils.get_audio_path(AUDIO_DIR, genreTrack) #Get the actual path to the file from the id
    y, sr = lib.load(filePath, mono=True, duration = duration) # Load seed data
    if fft_bool:
        y = time_to_fft(y) #Move to frequency domain if needed
    y = scaler.fit_transform(y.reshape(-1,1)) #Scale data
#     y = np.random.normal(-1,1,y.shape) #Generate normal noise as seed
#     data = genPerlin(np.linspace(0,1,y.size)) #Generate perlin noise as seed
    sf.write('Predict2Seed.wav', y, 22050,format="WAV") #Write the seed
    

    number_of_steps = (int)(sr*duration/step_size) #Calculate the number of steps needed to process the seed
    music = [] #Initialise a list for the predicted music 
    for batch_i in (range(0, number_of_steps)): #Loop over the number of steps required
        if(train_on_gpu):
            noizeNet.cuda() #Move to GPU if we can
        data = y[batch_i] #Get the current prediction seed value
        data = data.reshape(-1,1) #Ensure the correct shape
        x = data 


        x_tensor = torch.Tensor(x).unsqueeze(0)  # unsqueeze gives a 1, batch_size dimension
        if(train_on_gpu):
                x_tensor = x_tensor.cuda() #Move to GPU if we can

        if(LSTMBool):
            prediction, (hidden, c0) = noizeNet(x_tensor, hidden, c0) #Get the LSTM prediction
            c0 = c0.data #Detach cell state from history
        else:
            prediction, hidden = noizeNet(x_tensor, hidden) #Get the RNN prediction
        hidden = hidden.data #Detach hidden state from history
        
        music = np.append(music,(prediction.cpu().data.numpy().flatten())[-step_size:]) #Append predicted value to music
        #Print progress
        if(int((batch_i)) % 1000 == 0):
            print("PROGRESS:\t", round(((batch_i)/number_of_steps)*100, 2), "%"  , sep="")
            print("Prediction dimensions:\t", prediction.cpu().size(), "\t" ,prediction.cpu().data.numpy().flatten().size, "\nMusic dimensions:\t", music.size ,sep="")
       
    sf.write('outputSoundFilePredict2.wav', (music), 22050,format="WAV") #Write the reproduction
    time_steps = np.linspace(0, len(music), len(music)) 
    plt.plot(time_steps, music,"b.",  markersize=1) #Plot the reproduction
    plt.show()
    if LSTMBool:
        return music, (hidden, c0) #Return hidden states and prediction
    else: 
        return music, hidden #Return hidden state and prediction

## Predict 3
This function, uses the hidden states from predict 2 to begin generating new music predictions. 

In [None]:
def predict3(noizeNet, seeded ,duration=1, n_steps=30, LSTMBool=False, predictDuration = 30, step_size=1, hidden = None, c0 = None, fft_bool=False):
    print("PREDICTING...")
    noizeNet.eval() #Ensure we are in eval mode
    number_of_steps = (int)(22050*predictDuration/step_size) #Calculate the number of predictions needed
    music = [] #Instantiate the list used for sotring the generated music
    x = [0] #Give x a value. This is only needed for noise injection (perturation)
    for batch_i in (range(0, number_of_steps)): #Loop for the number of predicted steps needed
        if(train_on_gpu):
            noizeNet.cuda()
        seeded = np.array([seeded]).reshape(-1,1) #Ensure vector is in the correct shape
        data = seeded
#         if x[0] - seeded[0] < 0.0001:
#             data = seeded + np.random.normal(-1,1)
        data = data.reshape(-1,1) #Ensure vector is in the correct shape
        x = data
        
        x_tensor = torch.Tensor(x).unsqueeze(0)  # unsqueeze gives a 1, batch_size dimension

        if(train_on_gpu):
                x_tensor = x_tensor.cuda()

        if (LSTMBool):
            prediction, (hidden, c0) = noizeNet(x_tensor, hidden, c0)  #Obtain prediction
            c0 = c0.data  #Detach cell state from history
        else:
            prediction, hidden = noizeNet(x_tensor, hidden) #Obtain RNN prediction
        hidden = hidden.data # Detach hidden state from history
        
        seeded = prediction.cpu().data.numpy()[-1:step_size:] #Get the next value for prediction 
        music = np.append(music,(prediction.cpu().data.numpy().flatten())[-step_size:]) #Append predicted value to music
        #Print progress
        if(int((batch_i)) % 100 == 0):
            print("PROGRESS:\t", round(((batch_i)/number_of_steps)*100, 2), "%"  , sep="")
            print("Prediction dimensions:\t", prediction.cpu().size(), "\t" ,prediction.cpu().data.numpy().flatten().size, "\nMusic dimensions:\t", music.size ,sep="")
       
    
    music = scaler.inverse_transform(music.reshape(-1,1)) #Invert the scaling
    if fft_bool:
        music = fft_to_time(music).astype('float32') #Move back to time domain if needed
    
    print(music)
    sf.write('outputSoundFilePrediction3.wav', (music), 22050,format="WAV") #Write generated music
    time_steps = np.linspace(0, len(music), len(music))
    plt.plot(time_steps, music,"b.",  markersize=1) #Plot generated music sample view
    plt.show()
    return music

## Saving function
A helper function used for naming a model by it's hyer parameters.

In [None]:
def generateModelName(n_steps = 30, print_every = 5, step_size =  1, duration = 5, numberOfTracks = 1, clip = 5, LSTMBool=False, hidden_dim=50,n_layers=1, fft_bool=False):
    return "n_steps="+ str(n_steps) + "__" +"print_every="+ str(print_every) + "__" +"step_size="+ str(step_size) + "__" +"duration="+ str(duration) + "__" +  \
    "numberOfTracks="+ str(numberOfTracks) + "__" +  "clip="+ str(clip) + "__" +  "LSTMBool="+ str(LSTMBool) + "hidden_dim="+ str(hidden_dim) + "__" +"n_layers="+ str(n_layers) +"__"+ "lr=" + str(lr) + "__fft_bool" + str(fft_bool)+ ".pt"


# Hyper parameters
Before we start training we must declair the hyperparameters and instantiate the model.

I believve all of these variables should be self explanatory. Note this is also where we choose if the miodel will be an LSTM or RNN.

In [None]:
# decide on hyperparameters
input_size=1
output_size=1
hidden_dim=30 
n_layers=1
LSTMBool = True
dropout_prob = 0.5

# instantiate an RNN/LSTM
noizeNet = NoizeNet(input_size, output_size, hidden_dim, n_layers, LSTMBool, dropout_prob)
print(noizeNet)

# The two criterions used in this work
criterion = nn.MSELoss()
# criterion = nn.L1Loss()
lr=0.00001

# The various optimizers used in this work
optimizer = torch.optim.Adam(noizeNet.parameters(), lr=lr)
# optimizer = torch.optim.SGD(noizeNet.parameters(),lr=lr)
# optimizer = torch.optim.Adadelta(noizeNet.parameters())

# Music filtering by genre

Here we just filter all the data in FMA for some chosen genre. 

In [None]:
#Get metadata for fma dataset
AUDIO_DIR = "data/fma_small/"

tracks = utils.load('data/fma_metadata/tracks.csv') #Load metadata

small = tracks['set', 'subset'] <= 'small' #select the FMA small dataset
genre1 = tracks['track', 'genre_top'] == 'Instrumental' #Select only instrumental songs
# genre2 = tracks['track', 'genre_top'] == 'Hip-Hop' #We can set multilpe genres bellow as (genre1 | genre2)
genreTracks = list(tracks.loc[small & (genre1),('track', 'genre_top')].index) #Get a list of filtered songs

# Training variables

Initialise the variables needed for training the network

In [None]:
#Set if we want to train new model or load and predict with saved model
TRAIN = True

fft_bool = False
n_steps = 1 #The number of full frame steps to be taken to complete training
print_every = 5 #Not actually used. This was replaced by persentage printing
step_size =  1 #The step size taken by each training frame 
duration = 0.1 #The duration of the training segment
predictDuration = 0.1 #The duration of the predicted song is seconds
numberOfTracks = 2 #The number of tracks to be trained on
clip = 1 #Gradient clipping
seedDuration = 1

# Begin Training or load model and predict
Then we can call our training and prediction functions as needed. 

In [None]:
if TRAIN:
    print("Training...")
    trained_rnn = train(noizeNet, n_steps,AUDIO_DIR, genreTracks, LSTMBool ,step_size=step_size, duration = duration, numberOfTracks=numberOfTracks, clip=clip)
    torch.save(trained_rnn.state_dict(), generateModelName(n_steps, print_every, step_size, duration, numberOfTracks, clip, LSTMBool, hidden_dim,n_layers, fft_bool))
    duration = seedDuration
    predict(trained_rnn, genreTracks[-1] ,duration=duration, n_steps=n_steps, LSTMBool=LSTMBool, predictDuration = predictDuration, step_size=step_size)
    if LSTMBool:
        predicted, (hidden, c0) = predict2(trained_rnn, genreTracks[-2] ,duration=duration, n_steps=n_steps, LSTMBool=LSTMBool, predictDuration = predictDuration, fft_bool=fft_bool)
        predict3(noizeNet, predicted[-1] ,duration=duration, n_steps=n_steps, LSTMBool=LSTMBool, predictDuration = predictDuration,  hidden=hidden, c0 =c0, fft_bool=fft_bool)
    else: 
        predicted, hidden = predict2(trained_rnn, genreTracks[-2] ,duration=duration, n_steps=n_steps, LSTMBool=LSTMBool, predictDuration = predictDuration, fft_bool=fft_bool)
        predict3(noizeNet, predicted[-1] ,duration=duration, n_steps=n_steps, LSTMBool=LSTMBool, predictDuration = predictDuration,  hidden=hidden, fft_bool=fft_bool)
    

    

else:
    # instantiate an model
    noizeNet = NoizeNet(input_size, output_size, hidden_dim, n_layers, LSTMBool, dropout_prob)
    #Load saved model
    noizeNet.load_state_dict(torch.load("n_steps=1__print_every=5__step_size=1__duration=10__numberOfTracks=100__clip=5__LSTMBool=Truehidden_dim=200__n_layers=1__lr=0.0001.pt"))
    duration = seedDuration #Set duration = seed duration for prediction seed
    predict(trained_rnn, genreTracks[-1] ,duration=duration, n_steps=n_steps, LSTMBool=LSTMBool, predictDuration = predictDuration, step_size=step_size)
    if LSTMBool:
        predicted, (hidden, c0) = predict2(noizeNet, genreTracks[-2] ,duration=duration, n_steps=n_steps, LSTMBool=LSTMBool, predictDuration = predictDuration, fft_bool=fft_bool)
        predict3(noizeNet, predicted[-1] ,duration=duration, n_steps=n_steps, LSTMBool=LSTMBool, predictDuration = predictDuration,  hidden=hidden, c0 =c0)
    else: 
        predicted, hidden = predict2(noizeNet, genreTracks[-2] ,duration=duration, n_steps=n_steps, LSTMBool=LSTMBool, predictDuration = predictDuration, fft_bool=fft_bool)
        predict3(noizeNet, predicted[-1] ,duration=duration, n_steps=n_steps, LSTMBool=LSTMBool, predictDuration = predictDuration,  hidden=hidden, fft_bool=fft_bool)