# Installation & Import

In [1]:
!pip install tensorflow==2.3.1 gym keras-rl2 gym[atari]

^C


In [14]:
import gym 
import random
import time

import numpy as np
from tensorflow import keras 
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import Dense, Flatten, Convolution2D
from tensorflow.keras.optimizers import Adam

# Environment

In [5]:
# Create the CartPole environment and record the sizes used by later cells.
env = gym.make('CartPole-v0')
actions = env.action_space.n
observations = env.observation_space.shape[0]
# Action ids 0..actions-1, used for random sampling in the baseline rollout.
action_space = list(range(actions))

In [6]:
# Sanity check: show the observation size and the number of actions.
print(observations, actions, sep='\n')
# Show an initial observation.
state = env.reset()
print(state)
# Apply action 0 once and show the resulting transition.
state, reward, done, info = env.step(0)
print(state, reward, done, info)

4
2
[0.01708181 0.03674176 0.03509429 0.00209433]
[ 0.01781664 -0.15886546  0.03513617  0.30564009] 1.0 False {}


In [4]:
# Baseline: play a few episodes with uniformly random actions and report the
# total reward collected in each one.
episodes = 5
try:
    for episode in range(1, episodes+1):
        state = env.reset()
        done = False
        score = 0 

        while not done:
            env.render()
            action = random.choice(action_space)
            n_state, reward, done, info = env.step(action)
            score += reward
        print('Episode:{} Score:{}'.format(episode, score))
finally:
    # Always release the render window, even if the rollout is interrupted
    # or an environment call raises.
    env.close()

Episode:1 Score:17.0
Episode:2 Score:12.0
Episode:3 Score:33.0
Episode:4 Score:16.0
Episode:5 Score:17.0


# KerasRL's DQN

## Import

In [3]:
from rl.agents import DQNAgent
from rl.memory import SequentialMemory
from rl.policy import LinearAnnealedPolicy, EpsGreedyQPolicy, BoltzmannQPolicy

## Model

In [8]:
def build_model(observations, actions):
    """Build the Q-network used by the DQN agent.

    Args:
        observations: Length of the 1-D observation vector, or the full
            observation shape given as a tuple/list.
        actions: Number of discrete actions; one linear output is produced
            per action.

    Returns:
        An uncompiled ``Sequential`` model.
    """
    # Derive the input shape from the argument instead of the notebook-global
    # `env`, so the function does not depend on hidden kernel state.
    if isinstance(observations, (tuple, list)):
        obs_shape = tuple(observations)
    else:
        obs_shape = (observations,)

    model = Sequential()
    # The leading 1 is kept from the original input shape; presumably it is the
    # observation-window axis (build_agent uses window_length=1) - confirm.
    model.add(Flatten(input_shape=(1,) + obs_shape))
    model.add(Dense(24, activation='tanh'))
    model.add(Dense(48, activation='tanh'))
    model.add(Dense(actions, activation='linear'))
    return model

In [9]:
# Instantiate the network with the sizes read from the environment.
model = build_model(observations, actions)

In [36]:
# Print the layer-by-layer architecture and parameter counts.
model.summary()

Model: "sequential"
_________________________________________________________________
Layer (type)                 Output Shape              Param #   
flatten (Flatten)            (None, 4)                 0         
_________________________________________________________________
dense (Dense)                (None, 24)                120       
_________________________________________________________________
dense_1 (Dense)              (None, 48)                1200      
_________________________________________________________________
dense_2 (Dense)              (None, 2)                 98        
Total params: 1,418
Trainable params: 1,418
Non-trainable params: 0
_________________________________________________________________


## DQN

In [18]:
def build_agent(model, actions):
    """Wrap ``model`` in a compiled keras-rl ``DQNAgent``.

    Args:
        model: Keras model that outputs one value per action.
        actions: Number of discrete actions available to the agent.

    Returns:
        A ``DQNAgent`` using a 50000-entry ``SequentialMemory``, a
        ``BoltzmannQPolicy`` and 2000 warm-up steps, compiled with Adam.
    """
    memory = SequentialMemory(limit=50000, window_length=1)
    policy = BoltzmannQPolicy()
    dqn = DQNAgent(model=model, nb_actions=actions, memory=memory, nb_steps_warmup=2000,
                   target_model_update=1e-2, policy=policy)
    # `learning_rate` is the current keyword; `lr` is only a legacy alias.
    dqn.compile(Adam(learning_rate=0.01, decay=0.01), metrics=['mse'])
    return dqn

In [19]:
# Build the DQN agent around the network defined above.
dqn = build_agent(model, actions)

In [24]:
# Train the agent on `env` for 10000 steps with visualization turned off.
dqn.fit(env, nb_steps=10000, visualize=False, verbose=1)

Training for 10000 steps ...
Interval 1 (0 steps performed)
done, took 76.169 seconds


<tensorflow.python.keras.callbacks.History at 0x1b85b1d8388>

In [25]:
# Persist the trained agent's weights.
# NOTE(review): confirm the target directory exists (or is created by
# save_weights) before running on a fresh checkout.
dqn.save_weights('./trained_models/KeraRL/model_10000')

## Test

In [26]:
# Restore the weights from the same path used by the save cell.
dqn.load_weights('./trained_models/KeraRL/model_10000')

In [34]:
# Run 5 rendered test episodes, then print the mean of the per-episode rewards
# recorded in the returned history.
scores = dqn.test(env, nb_episodes=5, visualize=True)
print(np.mean(scores.history['episode_reward']))

Testing for 5 episodes ...
Episode 1: reward: 200.000, steps: 200
Episode 2: reward: 200.000, steps: 200
Episode 3: reward: 200.000, steps: 200
Episode 4: reward: 200.000, steps: 200
Episode 5: reward: 200.000, steps: 200
200.0


In [48]:
# Manual evaluation: step the environment with the agent's chosen action until
# the episode ends, accumulating the reward for each episode.
episodes = 5
for episode in range(episodes):
    state = env.reset()
    score = 0
    done = False
    while not done:
        action = dqn.forward(state)
        state, reward, done, info = env.step(action)
        env.render()
        score += reward
    print('Episode: {} score: {}'.format(episode, score))

Episode: 0 score: 200.0
Episode: 1 score: 200.0
Episode: 2 score: 200.0
Episode: 3 score: 200.0
Episode: 4 score: 200.0


# Stable Baselines3

## Import

In [18]:
from stable_baselines3.common.cmd_util import make_atari_env
from stable_baselines3.common.vec_env import VecFrameStack, DummyVecEnv
from stable_baselines3 import PPO, DQN, A2C
from stable_baselines3.common.callbacks import BaseCallback
from stable_baselines3.common.evaluation import evaluate_policy
import numpy as np 
import os

## Callback

In [26]:
class SavingBestTrainingRewardCallback(BaseCallback):
    """Save the model being trained every ``check_freq`` callback calls.

    Despite the name, no reward is tracked: a checkpoint is written
    unconditionally whenever ``n_calls`` is a multiple of ``check_freq``.

    Args:
        check_freq: Number of callback calls between two checkpoints.
        save_path: Directory that receives the ``model_<n_calls>`` files.
        verbose: Verbosity level forwarded to ``BaseCallback``.
    """

    def __init__(self, check_freq: int, save_path: str, verbose=1):
        super().__init__(verbose)
        self.save_path = save_path
        self.check_freq = check_freq

    def _init_callback(self):
        """Create the checkpoint directory when a path was provided."""
        if not self.save_path:
            return
        os.makedirs(self.save_path, exist_ok=True)

    def _on_step(self):
        """Write a checkpoint when one is due; always returns True."""
        checkpoint_due = self.n_calls % self.check_freq == 0
        if checkpoint_due:
            checkpoint_name = 'model_{}'.format(self.n_calls)
            self.model.save(os.path.join(self.save_path, checkpoint_name))
        return True

In [35]:
# Output locations: model checkpoints and TensorBoard logs.
CHECKPOINT_DIR = './trained_models/StableBaselines/'
LOG_DIR = './logs/StableBaselines/'
# Checkpoint into CHECKPOINT_DIR every 1000 callback calls.
callback = SavingBestTrainingRewardCallback(check_freq=1000, save_path=CHECKPOINT_DIR)

## Train

In [54]:
# Create an A2C agent with the 'MlpPolicy' on `env`, logging to LOG_DIR.
agent = A2C('MlpPolicy', env, verbose=0, tensorboard_log=LOG_DIR)
# Alternative agents kept for experimentation.
# NOTE(review): ACER and PPO2 are not among the names imported in this
# notebook, and the 'CnnPolicy' variants presumably expect image observations;
# confirm before re-enabling any of these lines.
# agent = DQN('MlpPolicy', env, verbose=0, tensorboard_log=LOG_DIR)
#agent = ACER('CnnPolicy', env, verbose=1, tensorboard_log=LOG_DIR)
#agent = PPO2('CnnPolicy', env, minibaches=2, verbose=1, tensorboard_log=LOG_DIR)
#agent = DQN('CnnPolicy', env, verbose=1, tensorboard_log=LOG_DIR)

In [None]:
# trained_agent = A2C.load('./train/model_10000', env=env, tensorboard_log=LOG_DIR)

In [55]:
# Train for 20000 timesteps; the callback writes periodic checkpoints.
agent.learn(total_timesteps=20000, callback=callback)

<stable_baselines3.a2c.a2c.A2C at 0x1bb5fae4908>

## Test

In [64]:
# Reload the checkpoint written at step 20000. The path is built with
# os.path.join (as the callback does when saving) because CHECKPOINT_DIR
# already ends with a separator and plain concatenation yields '//'.
agent = A2C.load(os.path.join(CHECKPOINT_DIR, 'model_20000'), env=env)

In [56]:
# Evaluate the agent over 10 rendered episodes.
# NOTE(review): the displayed tuple is presumably (mean reward, std of reward);
# confirm against the evaluate_policy documentation.
evaluate_policy(agent, env, n_eval_episodes=10, render=True)

(200.0, 0.0)

In [65]:
# Manual evaluation of the A2C agent: play `episodes` episodes and print the
# total reward of each.
episodes = 5
for episode in range(episodes):
    # Store the reset observation in `obs`, the variable predict() reads.
    # Previously it went into `state`, so the first action of each episode was
    # chosen from a stale observation (or raised NameError on a fresh kernel).
    obs = env.reset()
    score = 0
    while True:
        action, states = agent.predict(obs)
        obs, reward, done, info = env.step(action)
        env.render()
        score+=reward
        if done:
            break
    print('Episode: {} score: {}'.format(episode, score))

Episode: 0 score: 197.0
Episode: 1 score: 37.0
Episode: 2 score: 191.0
Episode: 3 score: 118.0
Episode: 4 score: 199.0


# T

# T