# 0. Install Dependencies

In [None]:
!pip install tensorflow==2.3.0
!pip install gym
!pip install keras
!pip install keras-rl2

# 1. Test Random Environment with OpenAI Gym

In [2]:
from gym import Env
from gym.spaces import Discrete, Box
import numpy as np
import random
# import cv2
# import matplotlib.pyplot as plt

In [3]:
class StaircaseEnv(Env):
    """The Staircase Environment as a random walking problem in a 1D space.
    The goal is to reach the target state as fast as possible.
    """
    def __init__(self):
        ## Actions we can take
        self.action_space = Discrete(7) # action space is initially defined as [0,6]
        ## Observation array
        self.observation_space = Box(low=np.array([0]),high=np.array([20]))
        self.initial_state = 10
        self.state = self.initial_state
        self.num_step = 0 # initial num_step taken
        
        self.target = 10 + random.randint(-10,10) # a random target between [0, 20]
        self.previous_dist = abs(self.state - self.target) # initial distance to target
        
    def step(self, action):
        "Perform an action and get the next state and reward"
        self.state += action-3 # scale action space to [-3,3]
        self.num_step += 1 # increment num_step
        reward = 0 - (0.1*self.num_step) # penalty for each step taken

        ## Calculate reward based on distance to target
        self.current_dist = abs(self.state - self.target)
        if self.current_dist < self.previous_dist:
            reward += 3 # reward for getting closer to target
        else:
            reward -= 1 # penalty for getting further away from target

        self.previous_dist = self.current_dist # update previous distance
        ## Check if the target is reached
        if self.state == self.target:
            done=True
            reward += 5 # reward for reaching target
        else:
            done=False

        ## Set a placeholder for info
        info = {}

        ## Return step information to the actor
        return self.state,reward, done, info

    def render(self):
        # Implement viz
        pass
    
    def reset(self):
        "Reset the environment"
        self.state = self.initial_state
        self.num_step = 0
        return self.state

In [4]:
env = StaircaseEnv()

  logger.warn(


In [5]:
## Try walking naively without optimization for 10 times
episodes = 10
for episode in range(1, episodes+1):
    state = env.reset()
    done = False
    score = 0 
    while not done:
        #env.render()
        action = env.action_space.sample()
        n_state,reward, done, info = env.step(action)
        score+=reward
    print(f'{env.num_step} steps taken')
    print('Episode:{} Score:{}'.format(episode, score))

13 steps taken
Episode:1 Score:10.899999999999999
88 steps taken
Episode:2 Score:-334.6
5 steps taken
Episode:3 Score:6.5
162 steps taken
Episode:4 Score:-1209.3
1 steps taken
Episode:5 Score:3.9
19 steps taken
Episode:6 Score:-1.0000000000000009
182 steps taken
Episode:7 Score:-1506.3
2 steps taken
Episode:8 Score:6.699999999999999
39 steps taken
Episode:9 Score:-40.0
6 steps taken
Episode:10 Score:8.9


# 2. Create a Deep Learning Model with Keras

In [6]:
import numpy as np
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import Dense, Flatten, Dropout, LayerNormalization
from tensorflow.keras.optimizers import Adam

In [9]:
NUM_STATES = env.observation_space.shape
NUM_ACTIONS = env.action_space.n
print(NUM_STATES)

(1,)


In [10]:
def build_model(num_states, num_actions):
    "Define the model"
    initialiser = tf.keras.initializers.HeNormal(seed=42) # initialiser for the layers' kernel weights for numeric stability
    model = Sequential()    
    model.add(Dense(32, activation='relu', kernel_initializer=initialiser,input_shape=(num_states)))
    model.add(LayerNormalization())
    model.add(Dense(32, activation='relu',kernel_initializer=initialiser))
    model.add(Dropout(0.2))
    model.add(Dense(num_actions, activation='linear'))
    return model

# 3. Build Agent with Keras-RL

In [11]:
from rl.agents import DQNAgent
from rl.policy import BoltzmannQPolicy
from rl.memory import SequentialMemory

In [12]:
def build_agent(model, num_actions):
    "Define the agent to walk the environment"
    policy = BoltzmannQPolicy() # policy used to explore the environment (think of it as what the model will use to prioritize actions)
    memory = SequentialMemory(limit=500000, window_length=1) # store the experience (helps with remembering the past experience)
    dqn = DQNAgent(
        model=model, 
        memory=memory,
        policy=policy, 
        test_policy=policy, 
        enable_double_dqn=True,
        nb_actions=num_actions, 
        nb_steps_warmup=1000, 
        target_model_update=1e-2
        )
    return dqn

In [15]:
import tensorflow as tf

## training hyperparameters (performance varies based on these and other parameters of the agents like policy, memory, warmup steps, etc.)
lr = 1e-3 # learning rate
hparams = {
    'training_steps' : 100000,
    'metrics': ['mae'],
    'optimizer': Adam(lr),
}

# build and train faster under the GPU hardware
with tf.device('/device:GPU:0'): # wrapper for the model to use the GPU
    tf.keras.backend.clear_session() # clear the session before building the model each time (avoid continuing training from previous session)
    model = build_model(num_states=NUM_STATES, num_actions=NUM_ACTIONS) # build the model
    dqn = build_agent(model, num_actions=NUM_ACTIONS) # build the agent
    dqn.compile(hparams['optimizer'], metrics=hparams['metrics']) # compile the agent using Mean Absolute Error as the loss function
    history = dqn.fit(env, nb_steps=hparams['training_steps'], visualize=False, verbose=1) # Now we train the agent!

Training for 100000 steps ...
Interval 1 (0 steps performed)
   32/10000 [..............................] - ETA: 33s - reward: -0.8969

  updates=self.state_updates,


 1726/10000 [====>.........................] - ETA: 1:15 - reward: -85.7640done, took 15.825 seconds


In [None]:
with tf.device('/device:GPU:0'):
    _ = dqn.test(env, nb_episodes=20, visualize=False)

In [23]:
dqn.save_weights('models/StaircaseEnv_dqn_weights.h5f', overwrite=True)

# 4. Reloading Agent from Memory

In [52]:
ENV= StaircaseEnv() # instantiate the environment again
actions = ENV.action_space.n
states = ENV.observation_space.shape
with tf.device('/device:GPU:0'):
    tf.keras.backend.clear_session()
    MODEL = build_model(states, actions)
    DQN = build_agent(MODEL, actions)
    DQN.compile(Adam(lr=1e-3), metrics=['mae'])
    DQN.load_weights('models/StaircaseEnv_dqn_weights.h5f')

In [59]:
with tf.device('/device:GPU:0'):
    scores = DQN.test(ENV, nb_episodes=1000, visualize=False, verbose=0)

In [60]:
print(f"Mean episode reward = {np.mean(scores.history['episode_reward'])}".center(100,'-'))
print(f"Average steps taken per episode = {np.mean(scores.history['nb_steps'])}".center(100,'-'))

-----------------------------Mean episode reward = -11.428900000000002------------------------------
------------------------------Average steps taken per episode = 19.963------------------------------
