# Prepare

## Import

In [1]:
import numpy as np
import tensorflow as tf
from tensorflow import keras
from tensorflow.keras import layers
from datetime import datetime
from scipy.stats import entropy
import scipy as sp



In [2]:
gamma = 0.9  # Discount factor applied to future rewards when computing returns
nb_steps = 100  # Upper bound on the number of trials run per episode by the training loop
nb_episodes = 10000  # Number of training episodes
learning_rate = 7e-4  # Learning rate handed to the optimizer
bootstrap_n = 10  # NOTE(review): not read anywhere in the visible code (calculate_R_t uses a local of the same name) - confirm whether it is still needed

beta_v = 0.05  # Weight of the critic (value) loss in the total loss
beta_e = 0.05  # Weight of the entropy term in the total loss

In [3]:
# Every run writes into its own directory, named after the launch time and
# the two main hyperparameters. Only the timestamp goes through strftime; the
# hyperparameter text is appended afterwards.
run_stamp = datetime.now().strftime("%m%d-%H:%M:%S")
path = f"{run_stamp}_lr={learning_rate}_epoch={nb_episodes}"

# Both sub-directories keep their trailing slash because later code appends
# file names to them by plain string concatenation.
log_dir = f"{path}/logs/"
ckpt_dir = f"{path}/ckpt/"

# TensorBoard writer used for the per-episode training summaries.
train_summary_writer = tf.summary.create_file_writer(log_dir)

2023-01-03 01:16:18.991637: I tensorflow/core/platform/cpu_feature_guard.cc:193] This TensorFlow binary is optimized with oneAPI Deep Neural Network Library (oneDNN) to use the following CPU instructions in performance-critical operations:  SSE4.1 SSE4.2
To enable them in other operations, rebuild TensorFlow with the appropriate compiler flags.


## Two Step Task

In [4]:
# Integer encoding of the three task stages; the values double as the index
# of the active entry in the one-hot observation.
S_1 = 0
S_2 = 1
S_3 = 2
nb_states = 3


class two_step_task():
    """Two-step decision task with one first-stage and two second-stage states.

    From S_1 each of the two actions leads to S_2 or S_3 with a "common"
    probability of 0.8 and a "rare" probability of 0.2 (mirrored between the
    two actions). One of the second-stage states pays a reward of 1 with
    probability 0.9, the other with probability 0.1; `possible_switch` swaps
    them with probability 0.025 per call.

    The class also keeps the counters needed for a stay-probability analysis:
    `transition_count[r, c, a]` counts first-stage choices, where
    r = 0 if the previous trial was rewarded (1 otherwise),
    c = 0 if the previous transition was common (1 otherwise),
    a = 0 if the previous first-stage action was repeated (1 otherwise).
    """

    def __init__(self):
        # start in S_1
        self.state = S_1

        # Second-stage state that currently has the high reward probability.
        # Initially random.
        self.highest_reward_second_stage = np.random.choice([S_2, S_3])

        self.num_actions = 2
        # reset() initialises the timestep and every `last_*` bookkeeping field.
        self.reset()

        # transitions[action][k] is the probability that `action`, taken in
        # S_1, leads to the k-th second-stage state (k = 0 -> S_2, k = 1 -> S_3).
        common_prob = 0.8
        self.transitions = np.array([
            [common_prob, 1 - common_prob],
            [1 - common_prob, common_prob],
        ])
        self.transition_count = np.zeros((2, 2, 2))

    def get_state(self):
        """Return the current stage as a one-hot tensor of length `nb_states`."""
        one_hot = [0.0] * nb_states
        one_hot[self.state] = 1.0
        return tf.convert_to_tensor(one_hot)

    def possible_switch(self):
        """With probability 0.025, swap which second-stage state is the rewarding one."""
        if np.random.uniform() < 0.025:
            self.highest_reward_second_stage = (
                S_2 if self.highest_reward_second_stage == S_3 else S_3
            )

    def get_rprobs(self):
        """Return a 2x2 array of the form [[p, 1-p], [1-p, p]].

        p is 0.9 while S_2 is the highly rewarded state and 0.1 otherwise.
        """
        r_prob = 0.9 if self.highest_reward_second_stage == S_2 else 0.1
        return np.array([
            [r_prob, 1 - r_prob],
            [1 - r_prob, r_prob],
        ])

    def isCommon(self, action, state):
        """Return True if `action` reaches second-stage index `state` (0 or 1) with probability >= 0.5."""
        return bool(self.transitions[action][state] >= 1 / 2)

    def updateStateProb(self, action):
        """Count `action` in the cell selected by the previous trial's outcome.

        Index 0 on each axis means rewarded / common / repeated, index 1
        means the opposite.
        """
        rewarded_idx = 0 if self.last_is_rewarded else 1
        common_idx = 0 if self.last_is_common else 1
        repeat_idx = 0 if self.last_action == action else 1
        self.transition_count[rewarded_idx, common_idx, repeat_idx] += 1

    def stayProb(self):
        """Print the raw counts and return them normalised over the last axis.

        The result has the same shape as `transition_count`. Conditions with
        no observations yet are NaN: their probability is undefined, and the
        masked division avoids the 0/0 RuntimeWarning.
        """
        print(self.transition_count)
        row_sums = self.transition_count.sum(axis=-1, keepdims=True)
        stay_prob = np.full_like(self.transition_count, np.nan)
        np.divide(self.transition_count, row_sums, out=stay_prob, where=row_sums > 0)
        return stay_prob

    def reset(self):
        """Start a new episode in S_1 and return the initial observation.

        The reward contingency and the accumulated transition counts are not
        touched.
        """
        self.timestep = 0

        # for the two-step task plots
        self.last_is_common = None
        self.last_is_rewarded = None
        self.last_action = None
        self.last_state = None

        # come back to S_1 at the end of an episode
        self.state = S_1

        return self.get_state()

    def step(self, action):
        """Advance the task by one stage.

        In S_1 the action selects the second-stage state and the reward is 0.
        In S_2/S_3 the action is ignored, a reward of 0 or 1 is drawn, and
        the task returns to S_1.

        Returns:
            (new_state, reward, done, timestep); `done` becomes True once 200
            steps have been taken since the last reset.
        """
        self.timestep += 1
        self.last_state = self.state

        if self.state == S_1:
            reward = 0
            # transitions[action][0] is the probability of landing in S_2.
            self.state = S_2 if (np.random.uniform() < self.transitions[action][0]) else S_3
            # A stay/switch decision only exists once a previous first-stage
            # action has been recorded.
            if self.last_action is not None:
                self.updateStateProb(action)
            self.last_action = action
            # book-keeping for plotting; state - 1 maps S_2/S_3 to column 0/1
            self.last_is_common = self.isCommon(action, self.state - 1)
        else:  # case S_2 or S_3
            r_prob = 0.9 if (self.highest_reward_second_stage == self.state) else 0.1
            reward = 1 if np.random.uniform() < r_prob else 0
            self.state = S_1
            # book-keeping for plotting
            self.last_is_rewarded = reward

        # new state after the decision
        new_state = self.get_state()
        done = self.timestep >= 200
        return new_state, reward, done, self.timestep

    def trial(self, action):
        """Run one full trial (two steps) and return (observation, reward, done, timestep).

        `observation` is the second-stage state reached after the first step;
        `reward` and `done` come from the second step.
        """
        # do one action in S_1, and keep track of the perceptually distinguishable state you arrive in
        observation, _, _, _ = self.step(action)
        # do the same action in the resulting state (S_2 or S_3). The action doesn't matter, the reward does
        _, reward, done, _ = self.step(action)
        return observation, reward, done, self.timestep
    
# Module-level environment instance; the training loop further down calls
# env.reset() and env.trial() on it.
env = two_step_task()

## LOSS

In [5]:

def calculate_R_t(rewards: tf.Tensor, last_value):
    """Compute discounted returns for every step of a reward sequence.

    Working backwards from the final step, each return is
    R_t = rewards[t] + gamma * R_{t+1}, with the recursion seeded by
    `last_value`. `gamma` is the module-level discount factor.

    Args:
        rewards: 1-D tensor of per-step rewards.
        last_value: value used to bootstrap the return after the last step.

    Returns:
        A float32 tensor with one return per entry of `rewards`.
    """
    n_steps = tf.shape(rewards)[0]
    returns = tf.TensorArray(dtype=tf.float32, size=n_steps)

    # Walk from the last step to the first so each return can reuse the one
    # computed just before it.
    running_return = last_value
    for t in tf.range(n_steps - 1, -1, -1):
        running_return = rewards[t] + gamma * running_return
        returns = returns.write(t, running_return)

    return returns.stack()



In [6]:
def compute_loss(
        action_probs: tf.Tensor,
        values: tf.Tensor,
        rewards: tf.Tensor,
        entropy: tf.Tensor) -> tuple:
    """Compute the combined actor-critic loss for one episode.

    The quantity to minimise is

        total = -sum_t log pi(a_t) * A_t
                + beta_v * 0.5 * sum_t (R_t - V_t)^2
                - beta_e * H

    where R_t are the discounted returns from `calculate_R_t` (bootstrapped
    from the last value estimate), A_t = R_t - V_t is the advantage, and H is
    the policy entropy passed in by the caller.

    Args:
        action_probs: probabilities the policy assigned to the actions that
            were actually taken, one per step.
        values: critic estimates, one per step.
        rewards: rewards received, one per step.
        entropy: policy entropy accumulated over the episode.

    Returns:
        (total_loss, actor_loss, critic_loss, entropy_bonus), where
        entropy_bonus is `beta_e * entropy`.
    """
    # The bootstrapped returns are regression targets: gradients must not
    # flow into the critic through them.
    R_t = tf.stop_gradient(calculate_R_t(rewards, values[-1]))
    delta = R_t - values

    # For the policy update the advantage is a fixed weight; without the stop
    # the actor term would also push gradients into the critic.
    advantage = tf.stop_gradient(delta)

    critic_loss = beta_v * 0.5 * tf.reduce_sum(tf.square(delta))
    # Negative sign: minimising this term raises the log-probability of
    # actions whose return beat the critic's estimate.
    actor_loss = -tf.reduce_sum(tf.math.log(action_probs + 1e-7) * advantage)
    entropy = beta_e * entropy

    # The entropy bonus is subtracted so that minimising the total loss
    # favours higher-entropy (more exploratory) policies.
    total_loss = actor_loss + critic_loss - entropy

    return total_loss, actor_loss, critic_loss, entropy

## LSTM

In [7]:
# Input layout, in the order the training loop concatenates it:
# state one-hot (3) + previous action one-hot (2) + previous reward (1) + timestep (1)
num_inputs = 7
num_actions = 2
num_hidden = 48

# Shapes are written as 1-tuples: `(n)` is just the integer n, which not every
# Keras version accepts as a shape.
inputs = layers.Input(shape=(num_inputs,))
state_h = layers.Input(shape=(num_hidden,))
state_c = layers.Input(shape=(num_hidden,))

# A single LSTM cell is stepped manually; its hidden and cell state enter as
# extra model inputs and leave as an extra output, so the caller carries the
# recurrent state from one step to the next.
common, states = layers.LSTMCell(num_hidden)(inputs, states=[state_h, state_c], training=True)
action = layers.Dense(num_actions, activation="softmax")(common)  # policy head
critic = layers.Dense(1)(common)  # value head

model = keras.Model(inputs=[inputs, state_h, state_c], outputs=[action, critic, states])

#model.save('init.h5')

# Train

In [None]:
optimizer = keras.optimizers.RMSprop(learning_rate=learning_rate)
running_reward = 0
episode_count = 0
#model = keras.models.load_model('init.h5')

# One gradient update per episode: the whole episode is rolled out under a
# single GradientTape, then the actor-critic loss is back-propagated through
# every step of the manually unrolled LSTM.
for episode in range(nb_episodes):
    state = env.reset()
    action_probs_history = []
    critic_value_history = []
    rewards_history = []
    episode_reward = 0
    reward = 0.0
    action_onehot = np.zeros(num_actions)
    # Recurrent state (h, c) starts at zero for every episode.
    cell_state = [tf.zeros((1, num_hidden)), tf.zeros((1, num_hidden))]

    episode_entropy = tf.zeros(())

    with tf.GradientTape() as tape:
        for timestep in range(nb_steps):
            # Give the environment its per-trial chance to swap which
            # second-stage state is the rewarding one; nothing else ever
            # changes the contingency set at construction time.
            env.possible_switch()

            # Network input: current state, previous action, previous reward, timestep.
            features = np.concatenate(
                (state.numpy(), action_onehot, [reward, timestep])
            ).astype(np.float32)
            model_input = tf.reshape(features, (1, num_inputs))

            # Predict action probabilities and estimated future rewards from environment state
            action_probs, critic_value, cell_state = model(
                [model_input, cell_state[0], cell_state[1]]
            )

            critic_value_history.append(tf.squeeze(critic_value))

            # Sample action from action probability distribution
            action_probs = tf.squeeze(action_probs)
            action = np.random.choice(num_actions, p=action_probs.numpy())
            action_probs_history.append(action_probs[action])

            # Build a fresh one-hot each step; writing into the old array
            # would leave earlier actions set and end up as [1, 1].
            action_onehot = np.zeros(num_actions)
            action_onehot[action] = 1

            # Apply the sampled action in our environment
            state, reward, done, _ = env.trial(action)
            rewards_history.append(reward)
            episode_reward += reward

            # NOTE(review): scipy works on a NumPy copy of the probabilities,
            # so this entropy is a plain number and contributes no gradient.
            # Switch to TensorFlow ops only together with a loss that rewards
            # (rather than penalises) entropy.
            step_entropy = sp.stats.entropy(action_probs.numpy())
            episode_entropy += step_entropy

            if done:
                break

        # Calculating loss values to update our network. This must happen
        # inside the tape so the loss is connected to the recorded forward passes.
        total_loss, actor_loss, critic_loss, entropy_term = compute_loss(
            tf.stack(action_probs_history),
            tf.stack(critic_value_history),
            tf.convert_to_tensor(rewards_history, dtype=tf.float32),
            episode_entropy)

    # Backpropagation happens outside the tape context, so the gradient
    # computation itself is not recorded.
    grads = tape.gradient(total_loss, model.trainable_variables)
    optimizer.apply_gradients(zip(grads, model.trainable_variables))

    with train_summary_writer.as_default():
        tf.summary.scalar('loss/total_loss', total_loss, step=episode_count)
        tf.summary.scalar('loss/actor_loss', actor_loss, step=episode_count)
        tf.summary.scalar('loss/critic_loss', critic_loss, step=episode_count)
        tf.summary.scalar('loss/entropy', episode_entropy, step=episode_count)
        tf.summary.scalar('game/reward', episode_reward, step=episode_count)
        tf.summary.histogram('game/action_probs', action_probs_history, step=episode_count)

    # Log details
    episode_count += 1
    if episode_count % 1000 == 0:
        template = "reward: {:.2f} at episode {}"
        print(template.format(episode_reward, episode_count))
        checkpoint = tf.train.Checkpoint(model)
        save_path = checkpoint.save(ckpt_dir+'checkpoints_'+str(episode_count)+'/two_steps.ckpt')

In [None]:
# Save the trained network as an HDF5 file inside the run directory.
# NOTE(review): the file name hard-codes "7e-4" and will not follow a change
# of `learning_rate`.
model.save(f"{path}/RMS+_7e-4.h5")

In [None]:
# IPython magic (only valid inside a notebook/IPython session): loads the
# TensorBoard notebook extension.
%load_ext tensorboard