In [5]:
import numpy as np
import tensorflow as tf
import tensorflow.contrib.slim as slim
import matplotlib.pyplot as plt
import scipy.signal
import multiprocessing
import threading
from PIL import Image
from PIL import ImageDraw
from PIL import ImageFont
%matplotlib inline
import os



from random import choice
from time import sleep
from time import time


#build the ops that copy every trainable variable of from_scope into to_scope (used to sync a worker's local network with the global network, per A3C)
def update_target_graph(from_scope,to_scope):
    """Build assign ops that copy trainable variables between two scopes.

    The trainable variables collected under ``from_scope`` and ``to_scope``
    are paired positionally (``zip``), so any surplus variables on either
    side are silently ignored.

    Args:
        from_scope: scope name whose variables are read.
        to_scope: scope name whose variables are overwritten.

    Returns:
        A list with one ``to_var.assign(from_var)`` op per variable pair.
    """
    collection_key = tf.GraphKeys.TRAINABLE_VARIABLES
    source_vars = tf.get_collection(collection_key, from_scope)
    target_vars = tf.get_collection(collection_key, to_scope)

    return [target.assign(source) for source, target in zip(source_vars, target_vars)]


#define the discount function
def discount(x, gamma):
    """Return the discounted cumulative sum of ``x`` along axis 0.

    Each output element satisfies ``y[t] = x[t] + gamma * y[t + 1]`` (with
    nothing added after the last element). It is computed by running a
    first-order IIR filter over the time-reversed sequence and reversing
    the result back.

    Args:
        x: array of per-step values (rewards or TD errors).
        gamma: discount factor.

    Returns:
        Array of the same shape as ``x``.
    """
    numerator = [1]
    denominator = [1, -gamma]
    backwards = x[::-1]
    accumulated = scipy.signal.lfilter(numerator, denominator, backwards, axis=0)
    return accumulated[::-1]
    

KeyboardInterrupt: 

## Bandit Environments

In [None]:
class bandit_environments():
    """Two-armed Bernoulli bandit whose arm probabilities are redrawn on reset.

    Supported difficulties:
        'easy'        - arms pay with probabilities (p, 1 - p), p drawn from {0.9, 0.1}
        'medium'      - same, p drawn from {0.70, 0.30}
        'hard'        - same, p drawn from {0.6, 0.4}
        'uniform'     - same, p drawn uniformly from [0, 1)
        'independent' - each arm's probability drawn independently from [0, 1)

    Attributes are plain instance fields: ``num_actions`` (always 2),
    ``difficulty``, ``timestep`` (pulls since the last reset) and ``bandit``
    (numpy array holding the two payout probabilities).
    """

    # Candidate values of p for the difficulties with fixed, complementary arms.
    _FIXED_ARM_PROBS = {
        'easy': [0.9, 0.1],
        'medium': [0.70, 0.30],
        'hard': [0.6, 0.4],
    }

    def __init__(self,difficulty):
        """Store the difficulty and draw the first set of arm probabilities.

        Raises:
            ValueError: if ``difficulty`` is not one of the supported names.
        """
        self.num_actions = 2
        self.difficulty = difficulty
        self.reset()

    def reset(self):
        """Start a new episode: zero the step counter and redraw the arms.

        Raises:
            ValueError: if ``self.difficulty`` is not one of the supported names.
        """
        #Even though a timestep is not part of MetaRL, I set it as in the study in order to keep track of actions taken
        self.timestep = 0

        if self.difficulty == 'independent':
            self.bandit = np.random.uniform(size=2)
            return

        if self.difficulty in self._FIXED_ARM_PROBS:
            bandit_prob = np.random.choice(self._FIXED_ARM_PROBS[self.difficulty])
        elif self.difficulty == 'uniform':
            bandit_prob = np.random.uniform()
        else:
            # Previously an unknown difficulty fell through every branch and
            # failed with an UnboundLocalError on bandit_prob.
            raise ValueError(
                "Unknown difficulty %r; expected one of 'easy', 'medium', "
                "'hard', 'uniform', 'independent'" % (self.difficulty,))

        #the two arms are complementary: (p, 1 - p)
        self.bandit = np.array([bandit_prob,1 - bandit_prob])

    def pullArm(self,action):
        """Pull arm ``action`` once.

        Args:
            action: index of the arm to pull (0 or 1).

        Returns:
            Tuple ``(reward, done, timestep)`` where ``reward`` is 1 or 0,
            ``done`` becomes True once 100 pulls have been made since the
            last reset, and ``timestep`` is the number of pulls so far.
        """
        self.timestep += 1
        bandit = self.bandit[action]
        #Get a random number.
        result = np.random.uniform()
        #reward 1 when the uniform sample falls below the arm's payout probability, otherwise 0
        if result < bandit:
            reward = 1
        else:
            reward = 0
        #episodes last exactly 100 pulls
        done = self.timestep > 99
        return reward,done,self.timestep

In [None]:
class AC_Network():
    """Recurrent actor-critic network built inside its own variable scope.

    The network consumes the previous reward, the one-hot previous action and
    the current timestep, feeds them through a 48-unit LSTM and produces a
    softmax policy over ``a_size`` actions plus a scalar value estimate.
    When ``scope`` is anything other than ``'global'`` the constructor also
    builds the loss, the gradients w.r.t. this scope's variables and an op
    that applies those (clipped) gradients to the variables in scope
    ``'global'``.
    """
    def __init__(self,a_size,scope,trainer):
        """Build the graph.

        Args:
            a_size: number of discrete actions (width of the policy output).
            scope: variable scope name; ``'global'`` builds the forward pass
                only, any other name additionally builds loss/gradient ops.
            trainer: optimizer whose ``apply_gradients`` is called; only used
                when ``scope != 'global'``.
        """
        with tf.variable_scope(scope):
            #previous rewards, actions and the current timestep as input
            self.prev_rewards = tf.placeholder(shape=[None,1],dtype=tf.float32)
            self.prev_actions = tf.placeholder(shape=[None], dtype=tf.int32)
            self.timestep = tf.placeholder(shape =[None, 1], dtype=tf.float32)
            #utilize one-hot encoding on previous actions
            self.prev_actions_onehot = tf.one_hot(self.prev_actions,a_size,dtype=tf.float32)
            
            #per-step input vector: [prev reward, one-hot prev action, timestep] -> width a_size + 2
            hidden = tf.concat([self.prev_rewards, self.prev_actions_onehot,self.timestep],1)
            
            
            #Recurrent neural network for temporal dependencies
            #48 hidden units in the LSTM cell
            lstm_cell = tf.contrib.rnn.BasicLSTMCell(48,state_is_tuple=True)
            #zero initial state (batch of 1) that callers feed at the start of an episode
            c_init = np.zeros((1, lstm_cell.state_size.c), np.float32)
            h_init = np.zeros((1, lstm_cell.state_size.h), np.float32)
            self.state_init = [c_init, h_init]
            #placeholders through which the caller passes the recurrent state in
            c_in = tf.placeholder(tf.float32, [1, lstm_cell.state_size.c])
            h_in = tf.placeholder(tf.float32, [1, lstm_cell.state_size.h])
            self.state_in = (c_in, h_in)
            #add a leading batch dimension of 1: the fed rows become the time axis of a single sequence
            rnn_in = tf.expand_dims(hidden, [0])
            #sequence length = number of rows fed (kept as a length-1 vector, one entry per batch element)
            step_size = tf.shape(self.prev_rewards)[:1]
            state_in = tf.contrib.rnn.LSTMStateTuple(c_in, h_in)
            lstm_outputs, lstm_state = tf.nn.dynamic_rnn(
                lstm_cell, rnn_in, initial_state=state_in, sequence_length=step_size,
                time_major=False)
            lstm_c, lstm_h = lstm_state
            #final recurrent state, to be fed back in as state_in on the next call
            self.state_out = (lstm_c[:1, :], lstm_h[:1, :])
            #drop the batch dimension again: one 48-wide row per timestep
            rnn_out = tf.reshape(lstm_outputs, [-1, 48])
            #actions actually taken (only needed by the loss below)
            self.actions = tf.placeholder(shape=[None], dtype=tf.int32)
            self.actions_onehot=tf.one_hot(self.actions,a_size,dtype=tf.float32)
                    
            def normalized_columns_initializer(std=1.0):
                """Return an initializer whose weight columns each have L2 norm ``std``."""
                def _initializer(shape, dtype=None, partition_info=None):
                    """Draw standard-normal weights and rescale every column to norm ``std``."""
                    out = np.random.randn(*shape).astype(np.float32)
                    out *= std / np.sqrt(np.square(out).sum(axis=0, keepdims=True))
                    return tf.constant(out)
                return _initializer

            #Output Layers
            #policy head: softmax over a_size actions, no bias, small initial weights
            self.policy = slim.fully_connected(rnn_out,a_size,
                activation_fn = tf.nn.softmax,
                weights_initializer = normalized_columns_initializer(0.01),
                biases_initializer=None)
            #value head: single linear output, no bias
            self.value = slim.fully_connected(rnn_out,1,
                activation_fn = None,
                weights_initializer = normalized_columns_initializer(1.0),
                biases_initializer=None)
            
            #loss functions and gradients only for local networks to then apply to the global network
            
            if scope != 'global':
                #training targets fed by the worker: discounted returns and advantages
                self.target_v = tf.placeholder(shape=[None],dtype=tf.float32)
                self.advantages = tf.placeholder(shape=[None],dtype=tf.float32)
                #probability the policy assigned to the action that was actually taken
                self.responsible_outputs = tf.reduce_sum(self.policy * self.actions_onehot,[1])
                
                #loss function
                #squared error between returns and value estimates, summed over the rollout
                self.value_loss = 0.5 *tf.reduce_sum(tf.square(self.target_v -
                                                               tf.reshape(self.value,[-1])))
                #policy entropy summed over all steps and actions; 1e-7 guards against log(0)
                self.entropy = -tf.reduce_sum(self.policy *tf.log(self.policy + 1e-7))
                self.policy_loss = -tf.reduce_sum(tf.log(self.responsible_outputs + 1e-7)*self.advantages)
                #total loss; note value_loss already carries a 0.5 factor, and entropy is rewarded with weight 0.05
                self.loss = 0.5 * self.value_loss + self.policy_loss - self.entropy * 0.05
                
                #retrieve gradients from the local network (workers)
                local_vars = tf.get_collection(tf.GraphKeys.TRAINABLE_VARIABLES, scope)
                self.gradients = tf.gradients(self.loss, local_vars)
                self.var_norms = tf.global_norm(local_vars)
                #clip to a global norm of 50.0; grad_norms is the norm value returned by the clip op
                grads,self.grad_norms = tf.clip_by_global_norm(self.gradients,50.0)
                
                #apply gradients to global
                #gradients and global variables are paired positionally; this assumes the 'global'
                #network already exists with the same variables in the same order - TODO confirm at call site
                global_vars = tf.get_collection(tf.GraphKeys.TRAINABLE_VARIABLES, 'global')
                self.apply_grads = trainer.apply_gradients(zip(grads,global_vars))

In [None]:
#create each worker to interact with environment asynchronously

class Worker():
    """A single A3C worker.

    Each worker owns a bandit environment and a local copy of the
    actor-critic network. It repeatedly syncs the local network from the
    'global' scope, plays one episode, and (when training) pushes the
    resulting gradients to the global network.
    """

    def __init__(self, game, name, a_size, trainer, model_path, global_episodes):
        """Create the worker's local network and bookkeeping.

        Args:
            game: environment exposing ``reset()`` and ``pullArm(action)``.
            name: identifier appended to ``"Agent_"`` to form the scope name.
            a_size: number of actions.
            trainer: optimizer handed to the local ``AC_Network``.
            model_path: directory where checkpoints are written.
            global_episodes: shared TF variable counting finished episodes.
        """
        self.name = "Agent_" + str(name)
        #label used for the TensorBoard log directory ("train_<number>")
        self.number = 'mediumtraining_70-30'
        self.model_path = model_path
        self.trainer = trainer
        self.global_episodes = global_episodes
        self.increment = self.global_episodes.assign_add(1)
        self.episode_rewards = []
        self.episode_lengths = []
        self.episode_mean_values = []
        self.summary_writer = tf.summary.FileWriter("train_"+str(self.number))

        #copy the global parameters from the AC network and create a local copy
        self.local_AC = AC_Network(a_size, self.name, trainer)
        self.update_local_ops = update_target_graph('global', self.name)
        self.env = game

    @staticmethod
    def _sample_action(a_dist):
        """Sample an action index from a policy output of shape (1, n_actions).

        The index is drawn directly. Drawing a probability *value* and mapping
        it back with ``argmax(a_dist == value)`` always returns the first
        matching index, so tied probabilities (e.g. [0.5, 0.5]) would never
        select the later action.
        """
        probs = a_dist[0]
        return int(np.random.choice(len(probs), p=probs))

    #Training paradigm
    def train(self, rollout, sess, gamma, bootstrap_value):
        """Run one gradient update on the global network from a finished rollout.

        Args:
            rollout: list of ``[action, reward, timestep, done, value]`` rows,
                one per environment step.
            sess: TensorFlow session.
            gamma: discount factor for returns and advantages.
            bootstrap_value: value estimate appended after the final step.

        Returns:
            Tuple ``(value_loss, policy_loss, entropy, grad_norm, var_norm)``;
            the first three are divided by the rollout length.
        """
        rollout = np.array(rollout)
        actions = rollout[:,0]
        rewards = rollout[:,1]
        timesteps = rollout[:,2]
        values = rollout[:,4]

        #the network input at step t is the reward/action from step t-1 (zeros for the first step)
        prev_rewards = [0] + rewards[:-1].tolist()
        prev_actions = [0] + actions[:-1].tolist()

        self.pr = prev_rewards
        self.pa = prev_actions

        #Use the rewards and actions to generate the advantage and discounted returns
        self.rewards_plus = np.asarray(rewards.tolist() + [bootstrap_value])
        discounted_rewards = discount(self.rewards_plus, gamma)[:-1]
        self.value_plus = np.asarray(values.tolist() + [bootstrap_value])
        #one-step TD errors, then their discounted sum
        advantages = rewards + gamma * self.value_plus[1:] - self.value_plus[:-1]
        advantages = discount(advantages, gamma)

        #Now, update global network using gradients
        #the whole rollout is replayed from the zero LSTM state
        rnn_state = self.local_AC.state_init
        feed_dict = {self.local_AC.target_v:discounted_rewards,
                    self.local_AC.prev_rewards:np.vstack(prev_rewards),
                    self.local_AC.prev_actions:prev_actions,
                    self.local_AC.actions:actions,
                    self.local_AC.timestep:np.vstack(timesteps),
                    self.local_AC.advantages:advantages,
                    self.local_AC.state_in[0]:rnn_state[0],
                    self.local_AC.state_in[1]:rnn_state[1]}
        v_l,p_l,e_l,g_n,v_n,_ = sess.run([self.local_AC.value_loss,
                                         self.local_AC.policy_loss,
                                         self.local_AC.entropy,
                                         self.local_AC.grad_norms,
                                         self.local_AC.var_norms,
                                         self.local_AC.apply_grads],
                                        feed_dict=feed_dict)
        n_steps = len(rollout)
        return v_l / n_steps,p_l / n_steps, e_l / n_steps, g_n, v_n

    def work(self,gamma,sess,coord,saver,train):
        """Play episodes until the coordinator requests a stop.

        Args:
            gamma: discount factor forwarded to ``train``.
            sess: TensorFlow session.
            coord: ``tf.train.Coordinator`` controlling the worker threads.
            saver: ``tf.train.Saver`` used for periodic checkpoints.
            train: when truthy, update the global network after each episode
                and log loss statistics; otherwise only act and log rewards.
        """
        episode_count = sess.run(self.global_episodes)
        total_steps = 0
        print ("Starting worker " + str(self.number))
        with sess.as_default(), sess.graph.as_default():
            while not coord.should_stop():
                #pull the latest global weights into the local network
                sess.run(self.update_local_ops)
                episode_buffer = []
                episode_values = []
                episode_reward = [0,0]
                episode_step_count = 0
                d = False
                r = 0
                a = 0
                t = 0
                self.env.reset()
                rnn_state = self.local_AC.state_init

                while not d:
                    #take an action by using the probabilities generated from the policy network output
                    a_dist, v, rnn_state_new = sess.run([self.local_AC.policy, self.local_AC.value,self.local_AC.state_out],
                            feed_dict = {
                            self.local_AC.prev_rewards:[[r]],
                            self.local_AC.timestep:[[t]],
                            self.local_AC.prev_actions:[a],
                            self.local_AC.state_in[0]:rnn_state[0],
                            self.local_AC.state_in[1]:rnn_state[1]})
                    a = self._sample_action(a_dist)

                    rnn_state = rnn_state_new
                    r, d, t = self.env.pullArm(a)
                    episode_buffer.append([a, r, t, d, v[0,0]])
                    episode_values.append(v[0,0])

                    episode_reward[a] += r
                    total_steps += 1
                    episode_step_count += 1

                self.episode_rewards.append(np.sum(episode_reward))
                self.episode_lengths.append(episode_step_count)
                self.episode_mean_values.append(np.mean(episode_values))

                #at the end of each episode, update the local network using the experience buffer
                if train and len(episode_buffer) != 0:
                    v_l,p_l,e_l,g_n,v_n = self.train(episode_buffer,sess,gamma,0.0)

                #Save statistics and model parameters periodically
                if episode_count % 50 == 0 and episode_count != 0:
                    #only the first worker writes checkpoints, every 500 episodes
                    if episode_count % 500 == 0 and self.name == 'Agent_0' and train:
                        saver.save(sess,self.model_path+'/model-'+str(episode_count)+'.cptk')
                        print ('Saved Model')

                    #running means over the last 50 episodes
                    mean_reward = np.mean(self.episode_rewards[-50:])
                    mean_length = np.mean(self.episode_lengths[-50:])
                    mean_value = np.mean(self.episode_mean_values[-50:])
                    summary = tf.Summary()
                    summary.value.add(tag='Accuracy', simple_value = float(mean_reward))
                    summary.value.add(tag='Perf/Length', simple_value=float(mean_length))
                    summary.value.add(tag='Perf/Value', simple_value=float(mean_value))

                    if train:
                        summary.value.add(tag='Losses/Value loss', simple_value=float(v_l))
                        summary.value.add(tag='Losses/Policy Loss', simple_value=float(p_l))
                        summary.value.add(tag='Losses/Entropy', simple_value=float(e_l))
                        summary.value.add(tag='losses/Grad norm', simple_value=float(g_n))
                        summary.value.add(tag='Losses/Var Norm', simple_value=float(v_n))

                    self.summary_writer.add_summary(summary, episode_count)

                    self.summary_writer.flush()
                #only the first worker advances the shared episode counter
                if self.name == 'Agent_0':
                    sess.run(self.increment)
                episode_count += 1


In [None]:
# --- Hyperparameters ---
# Discount rate used for returns and advantage estimation.
gamma = 0.8
# Number of actions (bandit arms) the policy chooses between.
a_size = 2

# --- Run configuration ---
# Directory that checkpoints are saved to / restored from.
model_path = './model_medium-70-30'
# Restore weights from model_path before starting.
load_model = True
# Update the global network after every episode.
train = True



In [None]:
# Build the global network plus workers, then run every worker on its own thread.
tf.reset_default_graph()

if not os.path.exists(model_path):
    os.makedirs(model_path)
    
if not os.path.exists('./frames'):
    os.makedirs('./frames')

    
with tf.device("/cpu:0"): 
    global_episodes = tf.Variable(0,dtype=tf.int32,name='global_episodes',trainable=False)
    #Learning rate determined by the values in the study
    trainer = tf.train.AdamOptimizer(learning_rate=1e-3)
    master_network = AC_Network(a_size,'global',None) # Generate global network
    #num_workers = multiprocessing.cpu_count() # Set workers to number of available CPU threads
    num_workers = 1
    workers = []
    # Create worker classes
    for i in range(num_workers):
        workers.append(Worker(bandit_environments('medium'),i,a_size,trainer,model_path,global_episodes))
    saver = tf.train.Saver(max_to_keep=5)

with tf.Session() as sess:
    coord = tf.train.Coordinator()
    ckpt = None
    if load_model == True:
        print ('Loading Model...')
        # get_checkpoint_state returns None when model_path holds no checkpoint
        # (e.g. on the very first run, right after the directory was created).
        ckpt = tf.train.get_checkpoint_state(model_path)
    if ckpt is not None and ckpt.model_checkpoint_path:
        saver.restore(sess,ckpt.model_checkpoint_path)
    else:
        if load_model == True:
            print ('No checkpoint found in ' + str(model_path) + '; initializing variables from scratch')
        sess.run(tf.global_variables_initializer())
        
    worker_threads = []
    for worker in workers:
        # Bind the bound method directly: a lambda would capture the loop
        # variable by reference, so with several workers more than one thread
        # could end up running the same (last) worker.
        thread = threading.Thread(target=worker.work,
                                  args=(gamma,sess,coord,saver,train))
        thread.start()
        worker_threads.append(thread)
    coord.join(worker_threads)
    