# Simulation 1 
Ruohui HU, Xinyu HUANG, Qingyuan Yao

In [94]:
import threading
import multiprocessing
import numpy as np
import matplotlib.pyplot as plt
import tensorflow as tf
import tf_slim as slim
import scipy.signal
from PIL import Image
from PIL import ImageDraw 
from PIL import ImageFont
%matplotlib inline
from helper import *

from random import choice
from time import sleep
from time import time
tf.compat.v1.disable_eager_execution()

### Helper Functions

In [124]:
def get_random_p0():
    """Draw a baseline reward probability by rejection sampling.

    Samples uniformly between 0 and 0.5 and redraws for as long as the
    sample falls strictly inside (0.1, 0.2) or (0.3, 0.4).

    Returns:
        The first accepted sample.
    """
    while True:
        candidate = np.random.uniform(0,.5)
        in_lower_gap = 0.1 < candidate < 0.2
        in_upper_gap = 0.3 < candidate < 0.4
        if not (in_lower_gap or in_upper_gap):
            return candidate

class TwoArmbandit():
    """Two-armed bandit whose reward probability rises while an arm is repeated.

    Each episode draws a baseline reward probability ``p0`` for arm 0 and
    uses ``0.5 - p0`` for arm 1.  When the same arm is pulled on consecutive
    steps, its reward probability becomes ``1 - (1 - baseline) ** (n + 1)``,
    where ``n`` is that arm's streak counter (``nb_al`` for arm 0, ``nb_ar``
    for arm 1).
    """

    def __init__(self):
        self.num_actions = 2
        self.reset()

    def reset(self):
        """Start a new episode.

        Clears the step and streak counters, draws the episode length limit
        with ``np.random.randint(50,100)`` and draws the baseline reward
        probabilities, which are held fixed for the entire episode.
        """
        self.timestep = 0
        self.nb_al = 0
        self.nb_ar = 0
        self.nb = [0,0]
        self.timestepmax = np.random.randint(50,100)
        variance = get_random_p0()
        self.baseline_prob = [variance,0.5-variance]
        print("baseline prob",self.baseline_prob)

    def pullArm(self,action,prev_actions):
        """Pull one arm and sample a binary reward.

        Args:
            action: index of the arm chosen by the network (0 or 1).
            prev_actions: sequence of the actions taken earlier in this
                episode; only its emptiness and its last element are read.

        Returns:
            A tuple ``(reward, done, timestep)``: ``reward`` is 1 or 0,
            ``done`` is True once ``timestep`` exceeds ``timestepmax``, and
            ``timestep`` is the number of pulls made so far this episode.
        """
        # Imported here because the visible file only does
        # `from random import choice`; the module object itself is otherwise
        # available only if a star import happens to provide it.
        import random

        self.timestep += 1
        p_action = self.baseline_prob[action]
        first_pull = len(prev_actions) == 0

        # NOTE(review): the first pull of an episode increments the streak
        # counter, whereas switching arms later resets it to 0.  The opening
        # streak therefore uses an exponent one higher than a later streak of
        # the same length.  Behaviour is kept as-is; confirm this is intended.
        if action == 0 and not first_pull:
            if prev_actions[-1] == 0:
                self.nb_al += 1
                p_action = 1 - np.power((1 - self.baseline_prob[action]),self.nb_al + 1)
            else:
                self.nb_al = 0
        elif action == 1 and not first_pull:
            if prev_actions[-1] == 1:
                self.nb_ar += 1
                p_action = 1 - np.power((1 - self.baseline_prob[action]),self.nb_ar + 1)
            else:
                self.nb_ar = 0
        elif action == 0:
            self.nb_al += 1
        else:
            self.nb_ar += 1

        # Bernoulli draw: 1 with probability p_action, otherwise 0.
        reward = random.choices([1,0],weights=[p_action,1-p_action])[0]
        done = bool(self.timestep > self.timestepmax)
        return reward,done,self.timestep

      

### Actor-Critic Network

In [125]:

class AC_Network():
    """Recurrent actor-critic graph built inside one TensorFlow variable scope.

    The graph feeds the previous reward, the one-hot previous action and the
    timestep into a 48-unit LSTM, then produces a softmax policy over
    ``a_size`` actions and a scalar value estimate.  For every scope other
    than ``'global'`` it also builds the loss, the clipped gradients and an
    op that applies those gradients to the variables of the ``'global'``
    scope.
    """
    def __init__(self,a_size,scope,trainer):
        """Build the graph.

        Args:
            a_size: number of actions; used as the one-hot depth and as the
                width of the policy output.
            scope: name of the variable scope holding this network's
                variables.  The value ``'global'`` skips the training ops.
            trainer: object whose ``apply_gradients`` is called with
                (gradient, variable) pairs -- presumably a
                ``tf.compat.v1.train`` optimizer; confirm at the call site.
        """
        with tf.compat.v1.variable_scope(scope):
            # Inputs: previous reward, previous action (one-hot encoded) and timestep,
            # concatenated along axis 1 into one feature row per step.
            self.prev_rewards = tf.compat.v1.placeholder(shape=[None,1],dtype=tf.float32)
            self.prev_actions = tf.compat.v1.placeholder(shape=[None],dtype=tf.int32)
            self.timestep = tf.compat.v1.placeholder(shape=[None,1],dtype=tf.float32)
            self.prev_actions_onehot = tf.one_hot(self.prev_actions,a_size,dtype=tf.float32)
            hidden = tf.concat([self.prev_rewards,self.prev_actions_onehot,self.timestep],1)
            # Recurrent network for temporal dependencies
            lstm_cell = tf.compat.v1.nn.rnn_cell.BasicLSTMCell(48,state_is_tuple=True)
            # All-zero cell/hidden state that callers can feed at the start of a sequence.
            c_init = np.zeros((1, lstm_cell.state_size.c), np.float32)
            h_init = np.zeros((1, lstm_cell.state_size.h), np.float32)
            self.state_init = [c_init, h_init]
            c_in = tf.compat.v1.placeholder(tf.float32, [1, lstm_cell.state_size.c])
            h_in = tf.compat.v1.placeholder(tf.float32, [1, lstm_cell.state_size.h])
            self.state_in = (c_in, h_in)
            # Add a leading batch axis of size 1: the fed rows form one sequence.
            rnn_in = tf.expand_dims(hidden, [0])
            # Sequence length = number of rows fed into prev_rewards.
            step_size = tf.shape(self.prev_rewards)[:1]
            state_in = tf.compat.v1.nn.rnn_cell.LSTMStateTuple(c_in, h_in)
            lstm_outputs, lstm_state = tf.compat.v1.nn.dynamic_rnn(
                lstm_cell, rnn_in, initial_state=state_in, sequence_length=step_size,
                time_major=False)
            lstm_c, lstm_h = lstm_state
            self.state_out = (lstm_c[:1, :], lstm_h[:1, :])
            # Flatten back to one 48-wide row per step for the output layers.
            rnn_out = tf.reshape(lstm_outputs, [-1, 48])
           
            # Actions actually taken; only consumed by the loss ops below.
            self.actions = tf.compat.v1.placeholder(shape=[None],dtype=tf.int32)
            self.actions_onehot = tf.one_hot(self.actions,a_size,dtype=tf.float32)
                        
            # Output layers for policy and value estimations.
            # normalized_columns_initializer is not defined in this section
            # (presumably provided by `from helper import *` -- TODO confirm).
            self.policy = slim.fully_connected(rnn_out,a_size,
                activation_fn=tf.nn.softmax,
                weights_initializer=normalized_columns_initializer(0.01),
                biases_initializer=None)
            self.value = slim.fully_connected(rnn_out,1,
                activation_fn=None,
                weights_initializer=normalized_columns_initializer(1.0),
                biases_initializer=None)
            
            # Only the worker networks need ops for loss functions and gradient updating.
            if scope != 'global':
                
                self.target_v = tf.compat.v1.placeholder(shape=[None],dtype=tf.float32)
                self.advantages = tf.compat.v1.placeholder(shape=[None],dtype=tf.float32)
                # Probability the policy assigned to the action that was taken.
                self.responsible_outputs = tf.reduce_sum(self.policy * self.actions_onehot, [1])
                # Loss functions (the 1e-7 keeps log() away from zero).
                self.value_loss = 0.5 * tf.reduce_sum(tf.square(self.target_v - tf.reshape(self.value,[-1])))
                self.entropy = - tf.reduce_sum(self.policy * tf.math.log(self.policy + 1e-7))
                self.policy_loss = -tf.reduce_sum(tf.math.log(self.responsible_outputs + 1e-7)*self.advantages)
                # Total loss: value loss and entropy bonus are both weighted by 0.05.
                self.loss = 0.05* self.value_loss + self.policy_loss - self.entropy * 0.05
                # Get gradients from local network using local losses
                local_vars = tf.compat.v1.get_collection(tf.compat.v1.GraphKeys.TRAINABLE_VARIABLES, scope)
                self.gradients = tf.gradients(self.loss,local_vars)
                self.var_norms = tf.linalg.global_norm(local_vars)
                # Clip gradients to a global norm of 50.0 before applying them.
                grads,self.grad_norms = tf.clip_by_global_norm(self.gradients,50.0)

                # Apply local gradients to global network.
                # The pairing is positional, so it assumes the 'global' scope lists its
                # trainable variables in the same order as this scope -- TODO confirm.
                global_vars = tf.compat.v1.get_collection(tf.compat.v1.GraphKeys.TRAINABLE_VARIABLES, 'global')
                
                self.apply_grads = trainer.apply_gradients(zip(grads,global_vars))

### Worker Agent

In [126]:
class Worker():
    """One actor-critic worker: plays bandit episodes and trains from them.

    A worker owns a local ``AC_Network`` built under its own scope
    (``"worker_<name>"``), an op that copies the ``'global'`` variables into
    that scope, and a summary writer that logs to ``"train_<name>"``.
    """

    def __init__(self,game,name,a_size,trainer,model_path,global_episodes):
        """Create the worker and its local network.

        Args:
            game: environment exposing ``reset()``, ``pullArm(action,
                prev_actions)`` and ``baseline_prob``.
            name: worker identifier; used in the scope name and log directory.
            a_size: number of actions, forwarded to ``AC_Network``.
            trainer: optimizer-like object, forwarded to ``AC_Network``.
            model_path: directory prefix used when saving checkpoints.
            global_episodes: variable-like object supporting ``assign_add``;
                holds the shared episode counter.
        """
        self.name = "worker_" + str(name)
        self.number = name
        self.model_path = model_path
        self.trainer = trainer
        self.global_episodes = global_episodes
        self.increment = self.global_episodes.assign_add(1)
        self.episode_rewards = []
        self.episode_lengths = []
        self.episode_mean_values = []
        self.episode_buffers = []
        self.summary_writer = tf.compat.v1.summary.FileWriter("train_"+str(self.number))

        self.env = game
        # Local copy of the network, plus the op that copies the global
        # parameters into it.
        self.local_AC = AC_Network(a_size,self.name,trainer)
        self.update_local_ops = update_target_graph('global',self.name)

    @staticmethod
    def _sample_action(a_dist):
        """Sample an action index from the first row of a policy output.

        Sampling the index directly (rather than sampling a probability value
        and searching for it) keeps the draw unbiased when two actions have
        exactly the same probability.
        """
        probs = a_dist[0]
        return int(np.random.choice(len(probs),p=probs))

    def train(self,rollout,sess,gamma,bootstrap_value):
        """Run one gradient update from a finished rollout.

        Args:
            rollout: list of ``[action, reward, timestep, done, value]`` rows.
            sess: TensorFlow session used to run the update.
            gamma: discount factor for returns and advantages.
            bootstrap_value: value appended after the last step.

        Returns:
            A 7-tuple ``(value_loss, policy_loss, entropy, loss, grad_norm,
            var_norm, feed_dict)``; the first three are divided by the
            rollout length, and ``feed_dict`` is the dictionary that was fed.
        """
        rollout = np.array(rollout)
        actions = rollout[:,0]
        rewards = rollout[:,1]
        timesteps = rollout[:,2]
        values = rollout[:,4]
        # The network input at step t is the reward/action of step t-1;
        # the first step sees zeros.
        prev_rewards = [0] + rewards[:-1].tolist()
        prev_actions = [0] + actions[:-1].tolist()
        self.pr = prev_rewards
        self.pa = prev_actions

        # Discounted returns and advantages from the rollout's rewards and values.
        # `discount` is not defined in this section (presumably from helper).
        self.rewards_plus = np.asarray(rewards.tolist() + [bootstrap_value])
        discounted_rewards = discount(self.rewards_plus,gamma)[:-1]
        self.value_plus = np.asarray(values.tolist() + [bootstrap_value])
        advantages = rewards + gamma * self.value_plus[1:] - self.value_plus[:-1]
        advantages = discount(advantages,gamma)

        # Update the global network using gradients from the local loss.
        rnn_state = self.local_AC.state_init
        feed_dict = {self.local_AC.target_v:discounted_rewards,
            self.local_AC.prev_rewards:np.vstack(prev_rewards),
            self.local_AC.prev_actions:prev_actions,
            self.local_AC.actions:actions,
            self.local_AC.timestep:np.vstack(timesteps),
            self.local_AC.advantages:advantages,
            self.local_AC.state_in[0]:rnn_state[0],
            self.local_AC.state_in[1]:rnn_state[1]}
        v_l,p_l,e_l,g_n,l,v_n,_ = sess.run(
            [self.local_AC.value_loss,
            self.local_AC.policy_loss,
            self.local_AC.entropy,
            self.local_AC.grad_norms,
            self.local_AC.loss,
            self.local_AC.var_norms,
            self.local_AC.apply_grads],
            feed_dict=feed_dict)
        return v_l / len(rollout),p_l / len(rollout),e_l / len(rollout),l,g_n,v_n,feed_dict

    def work(self,gamma,global_AC,sess,coord,saver,train,num_episodes=1000):
        """Play episodes until the coordinator stops or the limit is reached.

        Args:
            gamma: discount factor passed to ``train``.
            global_AC: not used by this method; kept for interface
                compatibility.
            sess: TensorFlow session.
            coord: coordinator exposing ``should_stop()``.
            saver: object exposing ``save(sess, path)`` for checkpoints.
            train: when true, the network is updated after every episode and
                loss summaries are written.
            num_episodes: last episode index to run (inclusive).
        """
        episode_count = sess.run(self.global_episodes)
        # Per-episode choice and reward counts for the left (0) and right (1)
        # arm, collected by worker_0 only.
        choices_left = []
        choices_right = []
        rewards_left = []
        rewards_right = []
        with sess.as_default(), sess.graph.as_default():
            while not coord.should_stop() and episode_count <= num_episodes:
                # Sync the local network with the global parameters.
                sess.run(self.update_local_ops)

                episode_buffer = []
                episode_values = []
                episode_frames = []
                episode_reward = [0,0]
                prev_actions = []
                episode_step_count = 0
                d = False
                r = 0
                a = 0
                t = 0
                self.env.reset()
                rnn_state = self.local_AC.state_init
                nb_aL = 0
                nb_aR = 0
                nb_rL = 0
                nb_rR = 0

                while not d:
                    # Take an action using probabilities from policy network output.
                    a_dist,v,rnn_state_new = sess.run([self.local_AC.policy,self.local_AC.value,self.local_AC.state_out],
                        feed_dict={
                        self.local_AC.prev_rewards:[[r]],
                        self.local_AC.timestep:[[t]],
                        self.local_AC.prev_actions:[a],
                        self.local_AC.state_in[0]:rnn_state[0],
                        self.local_AC.state_in[1]:rnn_state[1]})
                    print("a_dist",a_dist)
                    a = self._sample_action(a_dist)
                    rnn_state = rnn_state_new
                    r,d,t = self.env.pullArm(a,prev_actions)
                    prev_actions.append(a)

                    # Count choices and rewarded choices per arm.
                    if a == 0:
                        nb_aL += 1
                        if r == 1:
                            nb_rL += 1
                    if a == 1:
                        nb_aR += 1
                        if r == 1:
                            nb_rR += 1

                    episode_buffer.append([a,r,t,d,v[0,0]])
                    episode_values.append(v[0,0])
                    episode_frames.append(set_image_bandit(episode_reward,self.env.baseline_prob,a,t))
                    episode_reward[a] += r
                    episode_step_count += 1

                self.episode_rewards.append(np.sum(episode_reward))
                self.episode_lengths.append(episode_step_count)
                self.episode_mean_values.append(np.mean(episode_values))

                # Update the network using the experience buffer at the end of the episode.
                if episode_buffer and train:
                    v_l,p_l,e_l,l,g_n,v_n,_ = self.train(episode_buffer,sess,gamma,0.0)

                # Periodically save gifs of episodes, model parameters, and summary statistics.
                if episode_count != 0:
                    if episode_count % 10000 == 0 and self.name == 'worker_0' and train:
                        saver.save(sess,self.model_path+'/model-'+str(episode_count)+'.cptk')
                        print ("Saved Model")

                    if episode_count % 100 == 0 and self.name == 'worker_0':
                        self.images = np.array(episode_frames)
                        make_gif(self.images,'./frames/image'+str(episode_count)+'.gif',
                            duration=len(self.images)*0.1,true_image=True)

                    # Running means over the last 50 episodes.
                    mean_reward = np.mean(self.episode_rewards[-50:])
                    mean_length = np.mean(self.episode_lengths[-50:])
                    mean_value = np.mean(self.episode_mean_values[-50:])
                    summary = tf.compat.v1.Summary()
                    summary.value.add(tag='Perf/Reward', simple_value=float(mean_reward))
                    summary.value.add(tag='Perf/Length', simple_value=float(mean_length))
                    summary.value.add(tag='Perf/Value', simple_value=float(mean_value))
                    if train:
                        summary.value.add(tag='Losses/Value Loss', simple_value=float(v_l))
                        summary.value.add(tag='Losses/Policy Loss', simple_value=float(p_l))
                        summary.value.add(tag='Losses/Entropy', simple_value=float(e_l))
                        summary.value.add(tag='Losses/Loss', simple_value=float(l))
                        summary.value.add(tag='Losses/Grad Norm', simple_value=float(g_n))
                        summary.value.add(tag='Losses/Var Norm', simple_value=float(v_n))
                    self.summary_writer.add_summary(summary, episode_count)

                    self.summary_writer.flush()
                # Only worker_0 advances the shared episode counter and keeps
                # the per-episode statistics.
                if self.name == 'worker_0':
                    sess.run(self.increment)
                    choices_left.append(nb_aL)
                    choices_right.append(nb_aR)
                    rewards_left.append(nb_rL)
                    rewards_right.append(nb_rR)
                if episode_count == num_episodes:
                    print("episode_count",episode_count," : ",choices_left,choices_right,rewards_left,rewards_right)
                episode_count += 1

In [127]:
gamma = 0.75 # discount rate for advantage estimation and reward discounting
a_size = 2 # number of actions: one per bandit arm (TwoArmbandit.num_actions is 2)
load_model = False # not read in this section; presumably selects restoring a saved checkpoint -- TODO confirm
train = True # presumably passed as the `train` flag of Worker.work -- TODO confirm at the call site
model_path = './model_meta' # presumably the checkpoint directory given to Worker as model_path -- TODO confirm

In [90]:
!tensorboard --logdir train_0

2023-01-11 14:20:50.412658: I tensorflow/core/platform/cpu_feature_guard.cc:193] This TensorFlow binary is optimized with oneAPI Deep Neural Network Library (oneDNN) to use the following CPU instructions in performance-critical operations:  AVX2 FMA
To enable them in other operations, rebuild TensorFlow with the appropriate compiler flags.

NOTE: Using experimental fast data loading logic. To disable, pass
    "--load_fast=false" and report issues on GitHub. More details:
    https://github.com/tensorflow/tensorboard/issues/4784

Serving TensorBoard on localhost; to expose to the network, use a proxy or pass --bind_all
TensorBoard 2.11.0 at http://localhost:6006/ (Press CTRL+C to quit)
^C
