### Set Up

#### Standard library imports

In [1]:
import datetime
import os
import sys

#### Third-party imports

In [2]:
import numpy as np
import tensorflow as tf
from tensorflow.python.tools.inspect_checkpoint import print_tensors_in_checkpoint_file

#### Local imports

In [3]:
from modules.magenta.rl_tuner import rl_tuner
from modules.magenta.rl_tuner import rl_tuner_ops

#### Autoreload

In [4]:
# Load IPython's autoreload extension and use mode 2, which re-imports all
# modules before each cell executes, so edits to imported module code take
# effect without restarting the kernel.
%load_ext autoreload
%autoreload 2

#### Setting relative directories and file names

In [5]:
# Where model checkpoints and generated compositions are saved.
# Everything lives under <parent of the working directory>/outputs/.
Working_Directory = os.getcwd()
Project_Directory = os.path.abspath(os.path.join(Working_Directory, os.pardir))
Output_Directory = "{}/outputs/".format(Project_Directory)
Model_Directory = "{}models/".format(Output_Directory)
Music_Out_Directory = "{}midi/".format(Output_Directory)
Checkpoint_Directory = "{}ckpt/".format(Model_Directory)

In [6]:
# Timestamp of this session, formatted as YYYYMMDD-HHMMSS.
current_time_str = datetime.datetime.now().strftime("%Y%m%d-%H%M%S")

# Make sure a checkpoint folder named after today's date (the YYYYMMDD part of
# the timestamp) exists. makedirs also creates missing parent folders, and
# exist_ok=True makes re-running this cell harmless.
today_ckpt_dir = Checkpoint_Directory + current_time_str.split("-")[0]
try:
    os.makedirs(today_ckpt_dir, exist_ok=True)
except OSError as err:
    # Stay best-effort (do not abort the notebook), but report the failure
    # instead of swallowing it silently.
    print("Could not create checkpoint directory {}: {}".format(today_ckpt_dir, err))

# Checkpoint location used for the RL Tuner model (passed to RLTuner and to
# save_model further down). The date folder is hard-coded, not derived from
# current_time_str.
DRL_ckpt_dir_name = "20211016" + "/rl_tuner/"
DRL_ckpt_name = "Long_Train_plus_chopin"
DRL_ckpt_dir = Checkpoint_Directory + DRL_ckpt_dir_name
DRL_ckpt_file = DRL_ckpt_dir + DRL_ckpt_name

# Checkpoint of the note RNN that RLTuner is initialised from
# (note_rnn_checkpoint_dir / note_rnn_checkpoint_file). Also a hard-coded date.
DL_ckpt_dir_name = "20211006/"
DL_ckpt_name = "Long_Train_256_plus_chopin"
DL_ckpt_dir = Checkpoint_Directory + DL_ckpt_dir_name
DL_ckpt_file = DL_ckpt_dir + DL_ckpt_name

#### Compare checkpoint files

In [None]:
# Inspect the RL Tuner checkpoint (DRL_ckpt_file).
print_tensors_in_checkpoint_file(
    file_name=DRL_ckpt_file,
    tensor_name='',
    all_tensors=False,
)

In [None]:
# Inspect the note-RNN checkpoint (DL_ckpt_file) for comparison.
print_tensors_in_checkpoint_file(
    file_name=DL_ckpt_file,
    tensor_name='',
    all_tensors=False,
)

### Initialise network

In [None]:
# Model parameter settings.
# ALGORITHM, REWARD_SCALER, OUTPUT_EVERY_NTH and NUM_NOTES_IN_COMPOSITION are
# forwarded to the RLTuner constructor in the cells below.
ALGORITHM = 'q'
REWARD_SCALER = 1
OUTPUT_EVERY_NTH = 50000  # i.e. 5e4, as an int
NUM_NOTES_IN_COMPOSITION = 32
PRIME_WITH_MIDI = False  # not referenced by the cells shown in this notebook

In [None]:
# Hyper-parameter bundle handed to RLTuner through its `dqn_hparams` argument.
rl_tuner_hparams = tf.contrib.training.HParams(**{
    "random_action_probability": 0.1,
    "store_every_nth": 1,
    "train_every_nth": 5,
    "minibatch_size": 32,
    "discount_rate": 0.5,
    "max_experience": 10000,
    "target_network_update_rate": 0.01,
})

In [None]:
# Build the RL Tuner. The two positional arguments are the RL checkpoint
# directory and the MIDI output directory; the note-RNN checkpoint and the
# parameters defined above are passed by keyword.
rl_net = rl_tuner.RLTuner(
    DRL_ckpt_dir,
    Music_Out_Directory,
    note_rnn_checkpoint_dir=DL_ckpt_dir,
    note_rnn_checkpoint_file=DL_ckpt_file,
    dqn_hparams=rl_tuner_hparams,
    algorithm=ALGORITHM,
    reward_scaler=REWARD_SCALER,
    output_every_nth=OUTPUT_EVERY_NTH,
    num_notes_in_melody=NUM_NOTES_IN_COMPOSITION,
)

### Generate initial music sequence before training with RL 

In [None]:
# Baseline sample from the model before any RL training, titled 'pre_rl'.
rl_net.generate_music_sequence(
    title='pre_rl',
    length=48,
    visualize_probs=True,
)

### DRL training

In [None]:
# Train for one million steps; the exploration period is the first half.
n = 1000000
rl_net.train(
    num_steps=n,
    exploration_period=n // 2,
)

### Analyse Results

In [None]:
# Scratch check: push every gradient entry away from zero by epsilon,
# adding +epsilon where the entry is >= 0 and -epsilon where it is < 0.
grad = tf.constant([-0.1, 1., 0., 0.])
epsilon = tf.constant(0.001)

# 0/1 masks in grad's dtype, scaled by the signed epsilon.
grad_pos_eps = epsilon * tf.cast(tf.greater_equal(grad, 0), dtype=grad.dtype)
grat_neg_eps = (-epsilon) * tf.cast(tf.less(grad, 0), dtype=grad.dtype)
grad = grad + grad_pos_eps + grat_neg_eps

with tf.Session() as sess:
    print(sess.run(grad))

In [None]:
# Plot the rewards received during training.
# The curve is expected to improve as the chance of a random exploration action decreases.
rl_net.plot_rewards()

In [None]:
# Plot the rewards received during calls to the evaluation function throughout training.
# These do not include exploration or random actions.
rl_net.plot_evaluation()

### Generate sequence after training with RL 

In [None]:
# Sample again after RL training, titled 'post_rl', for comparison with the
# 'pre_rl' sequence generated earlier.
rl_net.generate_music_sequence(
    title='post_rl',
    length=96,
    visualize_probs=True,
)

### Save Model

In [None]:
# If you're happy with the model, save a version!
# It is saved under the name 'model_full_no_chopin' in DRL_ckpt_dir; the
# restore cell at the end of the notebook loads a checkpoint with this prefix.
rl_net.save_model('model_full_no_chopin', DRL_ckpt_dir)

### Compute music theory statistics

In [None]:
# Compute statistics about how well the model adheres to the music theory rules,
# requesting 100 compositions; the result is kept in `stat_dict`.
stat_dict = rl_net.evaluate_music_theory_metrics(num_compositions=100)

### Restore from directory

In [None]:
# Same hyper-parameter values as used for training, defined again here so the
# restore section can build its own RLTuner instance.
rl_tuner_hparams = tf.contrib.training.HParams(**{
    "random_action_probability": 0.1,
    "store_every_nth": 1,
    "train_every_nth": 5,
    "minibatch_size": 32,
    "discount_rate": 0.5,
    "max_experience": 10000,
    "target_network_update_rate": 0.01,
})

In [None]:
# Second RLTuner instance, built with the same arguments as `rl_net`; the
# saved checkpoint is restored into it in the next cell.
x = rl_tuner.RLTuner(
    DRL_ckpt_dir,
    Music_Out_Directory,
    note_rnn_checkpoint_dir=DL_ckpt_dir,
    note_rnn_checkpoint_file=DL_ckpt_file,
    dqn_hparams=rl_tuner_hparams,
    algorithm=ALGORITHM,
    reward_scaler=REWARD_SCALER,
    output_every_nth=OUTPUT_EVERY_NTH,
    num_notes_in_melody=NUM_NOTES_IN_COMPOSITION,
)

In [None]:
# Restore the saved RL Tuner checkpoint 'model_full_no_chopin-995000' into `x`.
# DRL_ckpt_dir is Checkpoint_Directory + DRL_ckpt_dir_name, the same directory
# the model was saved to.
# The keyword `reward_file_name` previously had no value, which made this cell
# a syntax error. It is now passed explicitly as None.
# NOTE(review): None is assumed to mean "do not load stored rewards", as in
# upstream Magenta's RLTuner - confirm against modules/magenta/rl_tuner, or
# pass the name of the saved rewards file here if the reward history is needed.
x.restore_from_directory(DRL_ckpt_dir,
                         'model_full_no_chopin-995000',
                         reward_file_name=None)