## This notebook solves the CartPole-v0 problem with the vanilla DQN approach.

It will serve as a benchmark for future algorithms developed by Guilherme Viveiros.


> Solved Requirements for CartPole: Considered solved when the average return is greater than or equal to 195.0 over 100 consecutive trials.

In [None]:
import gym #envrionment to test the algorithms
import numpy as np #vector calculations
import tensorflow as tf #tensor / ML operations
#!pip install jdc
import jdc #jupyter dynamic classes
import collections #collect experiences from real environment to sample within DQN
import random #random environment's
import math#math

In [None]:
#Plotting setup only - not part of the DQN algorithm itself
import matplotlib
import matplotlib.pyplot as plt

#When the active matplotlib backend name contains 'inline', also import
#IPython's display helper (PlotRewards uses it to clear the cell output)
is_ipython = 'inline' in matplotlib.get_backend()
if is_ipython:
    from IPython import display

> First, let's define the neural network that represents the policy

> Since this is the CartPole environment, a simple MLP is enough

### Simple MLP

In [None]:
from tensorflow.keras.regularizers import L2

def MLP(input_dim,hidden_layers,output_dim):
    """Build the fully connected network used as Q-function approximator.

    input_dim: length of the state vector fed to the network.
    hidden_layers: iterable with the number of units of each hidden layer
        (every hidden layer uses a ReLU activation).
    output_dim: number of linear output units.

    Returns the (uncompiled) tf.keras.Sequential model.
    """
    #every Dense layer as a (units, activation) pair: hidden ones first, linear head last
    layer_specs = [(width, 'relu') for width in hidden_layers]
    layer_specs.append((output_dim, 'linear'))

    model = tf.keras.Sequential()
    model.add(tf.keras.layers.Input(shape=(input_dim,)))
    for units, activation in layer_specs:
        model.add(tf.keras.layers.Dense(units=units,activation=activation))

    return model

### Now let's define the Replay Buffer to use within DQN

> We also use named tuples to store the experiences

1. Each experience contains: 
    1. State
    2. Action
    3. Next state
    4. Reward
    5. Done flag (whether the episode ended)

In [5]:
from collections import namedtuple

#One transition stored in the replay buffer:
#(state before the action, action, state after the action, reward, done flag)
Experience = namedtuple(
    'Experience',
    ['last_state', 'action', 'state', 'reward', 'done'],
)

In [6]:
class ReplayBuffer():
    """Fixed-capacity experience store that overwrites its oldest entries.

    Attributes:
        memory: list holding at most `max_size` experiences.
        size: total number of experiences pushed so far (keeps growing after
            the buffer is full; it is used to find the slot to overwrite).
        max_size: capacity of the buffer.
        batch_size: batch size given to the last `can_provide_batch` call
            (None until that method is called).
    """

    #initialize the parameters and the memory buffer
    def __init__(self,max_size):
        #a non-positive capacity would make the first push index an empty list
        if max_size <= 0:
            raise ValueError("max_size must be a positive integer")
        self.memory = []
        self.size = 0
        self.max_size = max_size
        self.batch_size = None

    #push an experience to the memory buffer
    def push(self,experience):
        """Store `experience`, replacing the oldest one when the buffer is full."""
        if len(self.memory) >= self.max_size:
            #size counts every push, so the modulo walks the slots in insertion order
            self.memory[self.size % self.max_size] = experience
        else:
            self.memory.append(experience)

        self.size += 1

    #check if the memory can provide a batch sample with a specific size
    def can_provide_batch(self,batch_size):
        """Remember `batch_size` and tell whether that many experiences are stored.

        The check uses the number of stored experiences, not the number of
        pushes: once old entries are overwritten the two differ, and
        random.sample cannot draw more items than the list holds.
        """
        self.batch_size = batch_size
        return len(self.memory) >= batch_size

    #return a batch sample
    def batch_sample(self,batch_size=None):
        """Return `batch_size` distinct experiences picked at random.

        batch_size: number of experiences to draw; when omitted, the value
            given to the last `can_provide_batch` call is used.

        Raises ValueError when no batch size is known (or, from
        random.sample, when it exceeds the number of stored experiences).
        """
        if batch_size is None:
            batch_size = self.batch_size
        if batch_size is None:
            raise ValueError("batch_size was not given and can_provide_batch was never called")
        return random.sample(self.memory,batch_size)

### Now let's define the CartPole Environment

In [7]:
#Every method that returns data answers with (state, reward, done), in this order
class Environment():
    """Small wrapper around the gym 'CartPole-v0' environment.

    Seeding is currently disabled (see the commented lines in __init__), so
    runs are not reproducible yet.
    """

    #initialize the environment
    def __init__(self,env_info=None):
        """Create the gym environment.

        env_info: optional dict of settings; only the "render" key is read
            (truthy -> self.env.render() is called once right after creation).
        """
        #None instead of a shared mutable {} default
        if env_info is None:
            env_info = {}
        #seed
        #self.seed = env_info.get("sedd")
        #self.rand_generator = np.random.seed(seed = self.seed)
        #environment
        self.env = gym.make('CartPole-v0')
        #rendering
        if(env_info.get("render",False)):
            self.env.render()

    #initialize the game
    def env_start(self):
        """Reset the game and return (initial state, 0, False)."""
        state = self.env.reset()
        #state,reward,done
        return (state,0,False)

    #take a step in the game
    def env_step(self,action):
        """Apply `action` and return (state, reward, done); the info value is dropped."""
        state, reward, done, info = self.env.step(action)
        return (state,reward,done)

    #end the game
    def env_end(self):
        """Close the wrapped gym environment."""
        #close the environment owned by this wrapper; the previous code went
        #through a global name and never reached the gym environment
        self.env.close()

In [8]:
#Smoke test of the Environment wrapper
env_info = {
    "render" : False
}

env = Environment(env_info)
#assert env.seed == 3

#The assertion message must be a string: print(...) returns None, so it would
#print as a side effect and leave the AssertionError without any message
state,reward,done = env.env_start()
assert reward == 0 , "Agent start error"
assert done == False , "Agent start error"

state,reward,done = env.env_step(1)
assert reward == 1 , "Agent step error"
assert done == False , "Agent step error"

### Now let's define the core part, the Agent, the brilliant agent that will solve this game :p

In [9]:
import math

class DQN_Agent():
    """Skeleton of the DQN agent.

    The three agent_* methods are placeholders that raise NotImplementedError;
    later cells attach implementations to this class with the jdc %%add_to magic.
    """

    def agent_init(self,agent_info={}):
        """Placeholder: set up the agent's parameters from `agent_info`."""
        raise NotImplementedError

    def agent_step(self,state):
        """Placeholder: choose an action for `state`."""
        raise NotImplementedError

    def agent_end(self):
        """Placeholder: handle the terminal state."""
        raise NotImplementedError

    def get_exploration_rate(self, current_step):
        """Return the exploration rate (epsilon) for `current_step`.

        The rate decays exponentially from self.start towards self.end at a
        speed set by self.decay.
        """
        span = self.start - self.end
        decay_factor = math.exp(-1. * current_step * self.decay)
        return self.end + span * decay_factor

> All the parameters needed by the agent

In [10]:
%%add_to DQN_Agent

def agent_init(self,agent_info={}):
    """Copy the agent's hyper-parameters from `agent_info` and reset the step counter.

    Each attribute is read with agent_info.get, so a missing key leaves the
    corresponding attribute set to None.
    """
    #(attribute set on the agent, key read from agent_info)
    settings = (
        ("discount", "discount"),              #discount factor
        ("step_size", "step_size"),            #step-size parameter
        ("number_actions", "number_actions"),  #number of available actions
        ("start", "epsilon_start"),            #epsilon-greedy: initial rate
        ("end", "epsilon_end"),                #epsilon-greedy: final rate
        ("decay", "epsilon_decay"),            #epsilon-greedy: decay speed
    )
    for attribute, key in settings:
        setattr(self, attribute, agent_info.get(key))

    #number of actions chosen so far (drives the epsilon decay)
    self.current_step = 0


In [11]:
%%add_to DQN_Agent

def agent_step(self,state):
    """Pick an action for `state` with an epsilon-greedy rule.

    The exploration rate is taken for the current step, then the step counter
    is advanced. With probability `rate` a random action is returned;
    otherwise the action with the highest output of the global
    `policy_network` is returned.
    """
    epsilon = self.get_exploration_rate(self.current_step)
    self.current_step += 1

    #exploration: taken whenever the drawn number is not above epsilon
    explore = not (random.random() > epsilon)
    if explore:
        return random.randrange(self.number_actions)

    #exploitation: greedy action of the policy network, evaluated on a batch of one state
    q_values = tf.stop_gradient(policy_network(state[np.newaxis]),name="Choosen-State").numpy()[0]
    return np.argmax(q_values)

### Initialize the Hyper-Parameters for the DQN Agent

In [12]:
#Network sizes: in_dim inputs, two hidden layers, out_dim outputs
in_dim, out_dim = 4, 2
hidden_layers = [64,32]

#Policy network first, then a target network with the same architecture
policy_network = MLP(in_dim,hidden_layers,out_dim)
target_network = MLP(in_dim,hidden_layers,out_dim)

#The target network is not trained directly: it is frozen and starts as a
#copy of the policy weights (the training loop refreshes this copy)
target_network.trainable = False
target_network.set_weights(policy_network.get_weights())

policy_network.summary()

Model: "sequential"
_________________________________________________________________
Layer (type)                 Output Shape              Param #   
dense (Dense)                (None, 64)                320       
_________________________________________________________________
dense_1 (Dense)              (None, 32)                2080      
_________________________________________________________________
dense_2 (Dense)              (None, 2)                 66        
Total params: 2,466
Trainable params: 2,466
Non-trainable params: 0
_________________________________________________________________


In [13]:
#epsilon-greedy schedule: initial rate, final rate and exponential decay speed
eps_start = 1
eps_end = 0.01
eps_decay = 0.0005

discount_factor = 0.99
number_actions = out_dim

#the training loop runs num_iterations iterations of num_episodes episodes each
num_iterations = 1000
num_episodes = 10

max_buffer_size = 100000
batch_size = 64 #(was 256)

step_size = 0.0007
#`learning_rate` is the documented argument name; `lr` is a deprecated alias
#that newer Keras releases no longer accept
optimizer = tf.keras.optimizers.Adam(learning_rate = step_size)
#target_update = 20

In [14]:
#Settings handed to the Environment wrapper (no rendering while training)
env_info = dict(sedd=4, render=False)

#Environment used by the training loop
env = Environment(env_info)

In [15]:
#Hyper-parameters handed to the agent, keyed by the names agent_init reads
agent_info = dict(
    discount=discount_factor,
    step_size=step_size,
    epsilon_start=eps_start,
    epsilon_end=eps_end,
    epsilon_decay=eps_decay,
    number_actions=number_actions,
)

#Build the agent, then load its parameters
agent = DQN_Agent()
agent.agent_init(agent_info)

In [16]:
class PlotRewards():
    """Keeps the per-episode returns, plots them live and detects when CartPole is solved.

    Attributes:
        epi: window length of the moving average.
        agent: agent whose exploration rate is printed after each plot.
        sum_reward_per_episode: return of every finished episode, in order.
        mean_reward: moving average over the last `epi` returns (one value
            per episode once `epi` episodes are available).
    """

    def __init__(self,plot_info,agent):
        """plot_info: moving-average window; agent: object providing
        get_exploration_rate(step) and current_step."""
        self.epi = plot_info
        #keep the agent that was passed in instead of relying on a global name
        self.agent = agent
        self.sum_reward_per_episode = []
        self.mean_reward = []

    def plot(self,reward_per_episode):
        """Record one episode return and redraw the training curves.

        Returns True (without plotting) when the average of the last 100
        returns is at least 195, i.e. the solved criterion; otherwise draws
        the curves, prints the current exploration rate and returns False.
        """
        self.sum_reward_per_episode.append(reward_per_episode)

        #check if the game requirements are completed - average of 195.0 over 100 episodes
        if(len(self.sum_reward_per_episode) >= 100):
            if(np.mean(self.sum_reward_per_episode[-100:]) >= 195.0):
                print("Completeded")
                return True

        #moving average over the last `epi` episodes, once enough are available
        has_window = len(self.sum_reward_per_episode) >= self.epi
        if(has_window):
            self.mean_reward.append(
                np.mean(
                    self.sum_reward_per_episode[-self.epi:]
                )
            )

        #the figure is created only when something is drawn, and closed below
        fig = plt.figure(figsize=(8,4))
        plt.title('Training --- Each iteration has 10 episodes inside')
        plt.xlabel('Episode')
        plt.ylabel('Duration')

        if(has_window):
            plt.plot(self.mean_reward,label='Moving Average',c='r');

        #return of every episode seen so far
        plt.plot(self.sum_reward_per_episode,label='Mean Reward per Episode');
        plt.legend(loc='upper right',fontsize='large');

        plt.pause(0.001)
        #one figure is opened per episode; close it so figures do not pile up in memory
        plt.close(fig)

        rate = self.agent.get_exploration_rate(self.agent.current_step)
        print("Episode " + str(len(self.sum_reward_per_episode)) + " With epsilon of : " + str(rate))
        if is_ipython: display.clear_output(wait=True)

        return False


### Auxiliary functions

1. Get the q value for the current state
2. Get the q value for the next state, predicted by the target network
3. Extract the tensors from the batch, respectively:
    > State\
    > Action\
    > Next state\
    > Reward\
    > Done

In [17]:
#loss functions
from keras.losses import MSE

def get_current_q(states,actions):
    """Q-values of the policy network for the actions that were actually taken.

    states: sequence of states from the sampled batch (the states in which
        the agent acted).
    actions: sequence with the action index chosen in each of those states.

    Returns a 1-D tensor with one entry per sample: the policy network output
    for that state at the index of the chosen action.

    The whole batch goes through the global `policy_network` in a single
    forward pass instead of one pass per sample.
    """
    state_batch = np.asarray(states)
    action_batch = np.asarray(actions, dtype=np.int64)

    #one row of Q-values per sample
    q_values = policy_network(state_batch)

    #a one-hot mask keeps, in every row, only the Q-value of the chosen action
    chosen = tf.one_hot(action_batch, q_values.shape[-1], dtype=q_values.dtype)
    return tf.reduce_sum(q_values * chosen, axis=1)

def get_q_prime(next_states,done):
    """Greedy value of every next state according to the target network.

    next_states: sequence of states reached after each sampled transition.
    done: sequence of flags, True where the transition ended the episode.

    Returns a 1-D tensor with one entry per sample: the largest output of the
    global `target_network` for that next state, or 0 when the transition was
    terminal.

    The whole batch is evaluated in a single forward pass and the terminal
    entries are zeroed with a mask, so no per-sample tf.Variable is created.
    """
    next_q = target_network(np.asarray(next_states))

    #greedy choice: best Q-value of each row
    best_next_q = tf.reduce_max(next_q, axis=1)

    #1.0 for ongoing transitions, 0.0 for terminal ones
    #(assumes the target network output is float32, the Keras default - TODO confirm)
    not_terminal = 1.0 - np.asarray(done, dtype=np.float32)

    return best_next_q * not_terminal

def extract_tensor(experiences):
    """Turn a batch of Experience tuples into one tuple per field.

    experiences: iterable of Experience items (the sampled batch).

    Returns (last_state, action, state, reward, done), where each element is
    a tuple with that field's value for every experience in the batch.
    """
    #zip(*...) transposes the batch; Experience gives the five columns their names
    batch = Experience(*zip(*experiences))

    return (
        batch.last_state,
        batch.action,
        batch.state,
        batch.reward,
        batch.done,
    )

### Main Program

In [18]:
#Start from an empty folder for the policy network weights.
#Done in pure Python (instead of the `rm -rf` shell command) so the cell
#also works where that command is not available.
import os
import shutil

#remove the folder left by a previous run; a missing folder is not an error
shutil.rmtree('dqn_agent_weights', ignore_errors=True)

#folder that will hold the weights of the policy network
os.makedirs('dqn_agent_weights', exist_ok=True)

#prefix of the weight files written into that folder
file = 'dqn_agent_weights/weights'

In [20]:
#have the agent and the environment all set up

#plot class with a window average of 10
plot = PlotRewards(10,agent)

#Initialize the replay buffer
replay_buffer = ReplayBuffer(max_buffer_size)

#becomes True when the plot helper reports that the game is solved; it lets
#both loops stop cleanly (quit() would shut down the notebook kernel)
solved = False

#for each iteration -> num_episodes episodes
for iteration in range(num_iterations):

    for episode in range(num_episodes):
        #all the rewards received in one episode will be appended
        rewards_per_episode = []

        #start state
        state,reward,done = env.env_start()

        #Iterate until the terminal state
        while not done:

            #Agent chooses the action according to the policy network and the current state
            action = agent.agent_step(state)

            #new state after executing the action chosen by the agent
            next_state,reward,done = env.env_step(action)

            #append the rewards
            rewards_per_episode.append(reward)

            #build an experience
            experience = Experience(state,action,next_state,reward,done)

            #add the experience to the replay buffer
            replay_buffer.push(experience)

            state = next_state

            #Once batch_size experiences are stored we can start optimizing our network
            if(replay_buffer.can_provide_batch(batch_size)):

                #sample a batch of batch_size experiences
                experiences_buffer = replay_buffer.batch_sample()

                #extract all the features of this experience batch
                states,actions,next_states,rewards,dones = extract_tensor(experiences_buffer)

                #TD target: reward + discount * greedy value of the next state.
                #It does not depend on the policy weights, so it is computed
                #outside the tape and explicitly cut from the gradient.
                q_prime = tf.convert_to_tensor(get_q_prime(next_states,dones), dtype = tf.float32)
                target_error = tf.stop_gradient(
                    tf.convert_to_tensor(rewards, dtype = tf.float32) + agent.discount * q_prime
                )

                #Only the forward pass of the policy network and the loss are recorded
                with tf.GradientTape() as tape:

                    #Get the q values for the taken states and actions
                    current_q = get_current_q(states,actions)

                    #Mean squared error between the target and the current Q-values
                    loss = MSE(target_error,current_q)

                # Backprop
                #gradients are computed and applied after leaving the tape
                #context, so these operations are not recorded themselves
                grads = tape.gradient(loss,policy_network.trainable_variables)
                optimizer.apply_gradients(zip(grads, policy_network.trainable_variables))

        #after every episode -> plot the rewards as a function of time
        episode_sum_reward = np.sum(np.asarray(rewards_per_episode))
        tag = plot.plot(episode_sum_reward)

        if(tag == True):
            print("Solved")
            solved = True
            break

    if solved:
        break

    #ended the X episodes, e.g, one iteration terminated
    #In each iteration I update the target weights
    target_network.set_weights(policy_network.get_weights())

    #if(iteration % 100 == 0):
    #    tmp = file+str(iteration)+"reward-"+str(episode_sum_reward)+'.hdf5'
    #    policy_network.save_weights(tmp)

#single shutdown point, reached both when solved and when all iterations ran
env.env_end()

KeyboardInterrupt: 

![result](images/result.png "Result")