In [4]:
import gym
import numpy as np
import tensorflow as tf
from tensorflow import keras
from tensorflow.keras import layers

# Configuration parameters for the whole setup
seed = 42
gamma = 0.75  # Discount factor for past rewards
max_steps_per_episode = 90 #10000
# env = gym.make("CartPole-v0")  # Create the environment
# env.seed(seed)
eps = np.finfo(np.float32).eps.item()  # Smallest number such that 1.0 + eps != 1.0

Using device:  /device:GPU:0


# Two Step Task

In [2]:
# encoding of the higher stages
S_1 = 0
S_2 = 1
S_3 = 2
nb_states = 3

class two_step_task():
    def __init__(self):
        # start in S_1
        self.state = S_1
        
        # defines what is the stage with the highest expected reward. Initially random
        self.highest_reward_second_stage = np.random.choice([S_2,S_3])
        
        self.num_actions = 2
        self.reset()
        
        # initialization of plotting variables
        common_prob = 0.8
        self.transitions = np.array([
            [common_prob, 1-common_prob],
            [1-common_prob, common_prob]
        ])
        self.transition_count = np.zeros((2,2,2))
        
        self.last_action = None
        self.last_state = None
    
    def get_state(self):
        one_hot_array = np.float32(np.zeros(nb_states))
        one_hot_array[self.state] = 1.0
        return one_hot_array

    def possible_switch(self):
        if (np.random.uniform() < 0.025):
            # switches which of S_2 or S_3 has expected reward of 0.9
            self.highest_reward_second_stage = S_2 if (self.highest_reward_second_stage == S_3) else S_3
            
    def get_rprobs(self):
        """
        probability of reward of states S_2 and S_3, in the form [[p, 1-p], [1-p, p]]
        """
        if (self.highest_reward_second_stage == S_2):
            r_prob = 0.9
        else:
            r_prob = 0.1
        
        rewards = np.array([
            [r_prob, 1-r_prob],
            [1-r_prob, r_prob]
        ])
        return rewards
            
    def isCommon(self,action,state):
        if self.transitions[action][state] >= 1/2:
            return True
        return False
        
    def updateStateProb(self,action):
        if self.last_is_rewarded: #R
            if self.last_is_common: #C
                if self.last_action == action: #Rep
                    self.transition_count[0,0,0] += 1
                else: #URep
                    self.transition_count[0,0,1] += 1
            else: #UC
                if self.last_action == action: #Rep
                    self.transition_count[0,1,0] += 1
                else: #URep
                    self.transition_count[0,1,1] += 1
        else: #UR
            if self.last_is_common:
                if self.last_action == action:
                    self.transition_count[1,0,0] += 1
                else:
                    self.transition_count[1,0,1] += 1
            else:
                if self.last_action == action:
                    self.transition_count[1,1,0] += 1
                else:
                    self.transition_count[1,1,1] += 1
                    
        
    def stayProb(self):
        print(self.transition_count)
        row_sums = self.transition_count.sum(axis=-1)
        stay_prob = self.transition_count / row_sums[:,:,np.newaxis] 
       
        return stay_prob

    def reset(self):
        self.timestep = 0
        
        # for the two-step task plots
        self.last_is_common = None
        self.last_is_rewarded = None
        self.last_action = None
        self.last_state = None
        
        # come back to S_1 at the end of an episode
        self.state = S_1
        
        return self.get_state()
        
    def step(self,action):
        self.timestep += 1
        self.last_state = self.state
        
        # get next stage
        if (self.state == S_1):
            # get reward
            reward = 0
            # update stage
            self.state = S_2 if (np.random.uniform() < self.transitions[action][0]) else S_3
            # keep track of stay probability after first action
            if (self.last_action != None):    
                self.updateStateProb(action)
            self.last_action = action
            # book-keeping for plotting
            self.last_is_common = self.isCommon(action,self.state-1)
            
        else:# case S_2 or S_3
            # get probability of reward in stage
            r_prob = 0.9 if (self.highest_reward_second_stage == self.state) else 0.1
            # get reward
            reward = 1 if np.random.uniform() < r_prob else 0
            # update stage
            self.state = S_1
            # book-keeping for plotting
            self.last_is_rewarded = reward

        # new state after the decision
        new_state = self.get_state()
        if self.timestep >= 200: 
            done = True
        else: 
            done = False
        return new_state,reward,done,self.timestep
    
    def trial(self,action):
        # do one action in S_1, and keep track of the perceptually distinguishable state you arive in
        observation,_,_,_ = self.step(action)
        # do the same action in the resulting state (S_2 or S_3). The action doesn't matter, the reward does
        _,reward,done,_ = self.step(action)
        return observation,reward,done,self.timestep
    
env = two_step_task()

# LOSS

In [3]:

def get_n_step_return(
    rewards: tf.Tensor,
    values: tf.Tensor,
    n: int,
    gamma: float):
    '''Fonction qui retourne R_t, le gamma utilisé est celui préconisé par 
    Wang et al. (2018), Methods/Simulation1
    Version AVEC bootstrap (utilisation de la valeur prédite au dernier step
    comme point de départ)
    '''
    returns = tf.TensorArray(dtype=tf.float32, size=n)
    # Start from the end of `rewards` and accumulate reward sums
  # into the `returns` array
    rewards = rewards[::-1]
    values =  values[::-1]
    
    # values is inverted
    discounted_sum = values[0]
    for i in tf.range(n):
        discounted_sum = rewards[i] + gamma * discounted_sum
# I think it is a typo in the article, I put it above in defining discounted_sum
#        discounted = discounted_sum +  values[n-1]* tf.pow(tf.constant(gamma, dtype = tf.float32),tf.cast(n,tf.float32))
        returns = returns.write(i, int(discounted_sum))
    
    return returns.stack()



In [4]:
def compute_loss(
        action_probs: tf.Tensor,
        values: tf.Tensor,
         rewards: tf.Tensor,
         entropy : tf.Tensor,  
         gamma: float = gamma,
         beta_v: float = 0.05,
         beta_e : float = 0.05 ) -> tf.Tensor:
    """Computes the combined actor-critic loss."""
    R_t = get_n_step_return(
                rewards=rewards,
                values = values, 
                n=rewards.shape[0], 
                gamma=gamma
                )
    delta = R_t - values
    delta_nogradient = tf.stop_gradient(delta)
      
# huber loss dans le tuto mais square dans l'article
#   huber_loss = tf.keras.losses.Huber(reduction=tf.keras.losses.Reduction.SUM)
#   critic_loss = huber_loss(values, R_t)
          
    critic_loss = 0.5 * tf.reduce_sum(tf.square(delta))
     
    action_log_probs = tf.math.log(action_probs + 1e-7)
          
    # careful with the sign of this one
    #no gradient through temporal difference here
    actor_loss = tf.reduce_sum(action_log_probs * delta_nogradient)
          
    # l'entropie est bien avec un +
    total_loss = actor_loss + beta_v * critic_loss + beta_e * entropy
    
#   tf.print("actor loss", actor_loss)
#   tf.print("critic loss", beta_v*critic_loss)
#   tf.print("entropy reg", beta_e*entropy)
#   tf.print("total loss", total_loss)
    
    return total_loss

# LSTM

In [5]:
num_inputs = 6 #states + reward + action = 7
num_actions = 2
num_hidden = 48

inputs = layers.Input(shape=(None,num_inputs))
common = layers.LSTM(num_hidden, activation="relu")(inputs)
action = layers.Dense(num_actions, activation="softmax")(common)
critic = layers.Dense(1)(common)

model = keras.Model(inputs=inputs, outputs=[action, critic])

2022-12-30 02:35:23.221801: I tensorflow/core/platform/cpu_feature_guard.cc:193] This TensorFlow binary is optimized with oneAPI Deep Neural Network Library (oneDNN) to use the following CPU instructions in performance-critical operations:  SSE4.1 SSE4.2
To enable them in other operations, rebuild TensorFlow with the appropriate compiler flags.


# Train

In [6]:

optimizer = keras.optimizers.Adam(learning_rate=0.01)
huber_loss = keras.losses.Huber()
action_probs_history = []
critic_value_history = []
rewards_history = []
running_reward = 0
episode_count = 0

while True:  # Run until solved
    state = env.reset()
    episode_reward = 0
    reward = 0.0
    action_probs = np.float32([1,0])
    inputs = tf.convert_to_tensor(np.float32(np.zeros((1,1,num_inputs))))
    
    episode_entropy = tf.zeros(())
    
    with tf.GradientTape() as tape:
        for timestep in range(1, max_steps_per_episode):
            # env.render(); Adding this line would show the attempts
            # of the agent in a pop up window.

            input = np.append(state, action_probs)
            input = np.append(input, reward)
            #state = tf.expand_dims(state, 1)
            input = input.reshape((1, 1, num_inputs))
            input = tf.convert_to_tensor(np.float32(input))
            #print(state)
            
         
            inputs = tf.concat([inputs, input], 1)
            #print(inputs)

            # Predict action probabilities and estimated future rewards
            # from environment state
            action_probs, critic_value = model(inputs)
            
            if np.isnan(action_probs.numpy()).any():
                print(inputs)
                print(action_probs)
                break                
            
            critic_value_history.append(critic_value[0, 0])

            # Sample action from action probability distribution
            #print(action_probs)
            action = np.random.choice(num_actions, p=np.squeeze(action_probs))
            #print(action)
            action_probs_history.append(tf.math.log(action_probs[0, action]))

            # Apply the sampled action in our environment
            state, reward, done, _ = env.step(action)
            rewards_history.append(reward)
            episode_reward += reward
            

            if done:
                break

        # Update running reward to check condition for solving
        running_reward = 0.05 * episode_reward + (1 - 0.05) * running_reward

        # Calculate expected value from rewards
        # - At each timestep what was the total reward received after that timestep
        # - Rewards in the past are discounted by multiplying them with gamma
        # - These are the labels for our critic
        # returns = []
        # discounted_sum = 0
        # for r in rewards_history[::-1]:
        #     discounted_sum = r + gamma * discounted_sum
        #     returns.insert(0, discounted_sum)

        # # Normalize
        # returns = np.array(returns)
        # returns = (returns - np.mean(returns)) / (np.std(returns) + eps)
        # returns = returns.tolist()

        # Calculating loss values to update our network
        #entropy
        entropy = -tf.math.reduce_sum(tf.math.multiply(action_probs,tf.math.log(action_probs + 1e-7)))
        episode_entropy += entropy
    
        # history = zip(action_probs_history, critic_value_history, returns)
        # actor_losses = []
        # critic_losses = []
        # for log_prob, value, ret in history:
        #     # At this point in history, the critic estimated that we would get a
        #     # total reward = `value` in the future. We took an action with log probability
        #     # of `log_prob` and ended up recieving a total reward = `ret`.
        #     # The actor must be updated so that it predicts an action that leads to
        #     # high rewards (compared to critic's estimate) with high probability.
        #     diff = ret - value
        #     actor_losses.append(-log_prob * diff)  # actor loss

        #     # The critic must be updated so that it predicts a better estimate of
        #     # the future rewards.
        #     critic_losses.append(
        #         huber_loss(tf.expand_dims(value, 0), tf.expand_dims(ret, 0))
        #     )
            
        #     #print(huber_loss(tf.expand_dims(value, 0), tf.expand_dims(ret, 0)))

        # # Backpropagation
        # loss_value = sum(actor_losses) + sum(critic_losses)
        if len(rewards_history) >= max_steps_per_episode - 1:
            loss_value = compute_loss(
                tf.convert_to_tensor(np.float32(action_probs_history)), 
                tf.convert_to_tensor(np.float32(critic_value_history)), 
                tf.convert_to_tensor(np.float32(rewards_history)), 
                tf.convert_to_tensor(episode_entropy))
            grads = tape.gradient(loss_value, model.trainable_variables)
            optimizer.apply_gradients(zip(grads, model.trainable_variables))

        # Clear the loss and reward history
        action_probs_history.clear()
        critic_value_history.clear()
        rewards_history.clear()

    # Log details
    episode_count += 1
    if episode_count % 10 == 0:
        template = "running reward: {:.2f} at episode {}"
        print(template.format(running_reward, episode_count))

    if running_reward > 100:  # Condition to consider the task solved
        print("Solved at episode {}!".format(episode_count))
        break
    
model.save('model1.h5')

running reward: 5.05 at episode 10
running reward: 7.37 at episode 20
running reward: 8.82 at episode 30
running reward: 10.00 at episode 40
running reward: 10.31 at episode 50
running reward: 10.65 at episode 60
running reward: 10.55 at episode 70
running reward: 10.94 at episode 80
running reward: 10.58 at episode 90
running reward: 11.19 at episode 100
running reward: 11.49 at episode 110
running reward: 11.47 at episode 120
running reward: 11.59 at episode 130
running reward: 11.65 at episode 140
running reward: 11.34 at episode 150
running reward: 11.18 at episode 160
running reward: 10.90 at episode 170
running reward: 10.60 at episode 180
running reward: 10.71 at episode 190
running reward: 11.25 at episode 200
running reward: 11.17 at episode 210
running reward: 10.80 at episode 220
running reward: 10.56 at episode 230
running reward: 11.71 at episode 240
running reward: 11.58 at episode 250
running reward: 11.92 at episode 260
running reward: 12.18 at episode 270
running rewar

KeyboardInterrupt: 