Author: Jakub Łyskawa

In [1]:
import tensorflow as tf
import gym
import numpy as np

# Discrete env

In [2]:
sample_env = gym.make('LunarLander-v2')

In [3]:
sample_env.reset()

array([ 0.00379124,  1.4129066 ,  0.38401106,  0.08828293, -0.00438645,
       -0.08698429,  0.        ,  0.        ], dtype=float32)

In [4]:
sample_env.observation_space

Box([-inf -inf -inf -inf -inf -inf -inf -inf], [inf inf inf inf inf inf inf inf], (8,), float32)

In [5]:
sample_env.action_space

Discrete(4)

# Show env

In [6]:
sample_env.reset()
sample_env.render()

while True:
    _, _, done, _ = sample_env.step(sample_env.action_space.sample())
    sample_env.render()
    if done:
        sample_env.reset()
        break

# Utils

In [8]:
def show_agent(env, agent):
    obs = env.reset()
    env.render()

    reward_sum = 0
    
    while True:
        obs, reward, done, _ = env.step(agent.act(obs, explore=False))
        reward_sum += reward
        env.render()
        if done:
            env.reset()
            return reward_sum

In [9]:
def test_agent(env, agent):
    obs = env.reset()

    reward_sum = 0
    
    while True:
        obs, reward, done, _ = env.step(agent.act(obs, explore=False))
        reward_sum += reward
        if done:
            return reward_sum

In [10]:
def make_model(env, outputs, neurons=32):
    return tf.keras.Sequential([
        tf.keras.layers.Input(env.observation_space.shape),
        tf.keras.layers.Dense(neurons, activation='tanh'),
        tf.keras.layers.Dense(neurons, activation='tanh'),
        tf.keras.layers.Dense(outputs),
    ])

In [11]:
TEST_I = 5000
TEST_N = 5

# Q-learning

In [12]:
class QAgent:
    def __init__(self, env, model, gamma, exploration, lr):
        self.env = env
        self.obs = self.env.reset()

        self.model = model
        self.saved_model = tf.keras.models.clone_model(model)
        self.gamma = gamma
        self.optimizer = tf.keras.optimizers.Adam(learning_rate=lr)
        self.exploration = exploration

    def step(self):
        action = self.act(self.obs)
        obs, reward, done, _ = self.env.step(action)
        prev_obs = self.obs

        if done:
            self.obs = self.env.reset()
        else:
            self.obs = obs

        return prev_obs, obs, action, reward, done

    def act(self, obs, explore=True):
        if explore and np.random.rand() < self.exploration:
            return self.env.action_space.sample()
        else:
            out = self.model(obs.reshape(1, -1)).numpy()
            return np.argmax(out)

    def loss(self, obs, obs_next, actions, rewards, dones):
        batch_size = obs.shape[0]

        q_next = self.model(obs_next).numpy() * (1 - dones).reshape(batch_size, 1)
        q = self.model(obs)

        diffs = tf.gather_nd(q, actions.reshape(batch_size, 1), batch_dims=1) - (rewards + self.gamma * q_next.max(axis=-1))

        return tf.reduce_sum(diffs ** 2)

    def learning_step(self, obs, obs_next, actions, rewards, dones):
        with tf.GradientTape() as tape:
            loss = self.loss(obs, obs_next, actions, rewards, dones)

        v = self.model.trainable_variables
        self.optimizer.minimize(loss, v, tape=tape)

        return loss.numpy().mean()

    def save_model(self):
        self.saved_model.set_weights(self.model.get_weights())

    def restore_model(self):
        self.model.set_weights(self.saved_model.get_weights())

In [18]:
agent = QAgent(env, make_model(env, env.action_space.n), 0.9, 0.05, 0.001)

In [22]:
show_agent(env, agent)

-455.678304544497

In [23]:
def train_q_agent(env, gamma=0.9, exploration=0.05, lr=0.0001, batch_size=100, steps=100000, neurons=32):
    train_env = gym.make(env)
    test_env = gym.make(env)
    agent = QAgent(train_env, make_model(train_env, train_env.action_space.n, neurons), gamma, exploration, lr)

    total_loss = 0
    t = 0

    best_rewards = -np.inf

    def test():
        nonlocal best_rewards
        reward_mean = sum(test_agent(test_env, agent) for _ in range(TEST_N)) / TEST_N
        print(f'Step: {i + 1} mean reward sum: {reward_mean} mean loss: {total_loss / t if t else 0}')
        if reward_mean > best_rewards:
            agent.save_model()
            print('Saved')
            best_rewards = reward_mean

    for i in range(steps):
        obs, obs_next, action, reward, done = agent.step()

        total_loss += agent.learning_step(
            obs.reshape((1, -1)),
            obs_next.reshape((1, -1)),
            np.array(action).reshape((1, -1)),
            np.array(reward).reshape((1,)),
            np.array(done).reshape((1,)),
        )

        t += 1

        if i % TEST_I == 0:
            test()
            total_loss = 0
            t = 0
    
    test()

    return agent

In [32]:
agent1 = train_q_agent('LunarLander-v2', gamma=0.97, lr=0.003, neurons=64)

Step: 1 mean reward sum: -577.5624259495737 mean loss: 1.9993765354156494
Step: 5001 mean reward sum: -132.9692685234541 mean loss: 77.90897583691327
Step: 10001 mean reward sum: -160.18069955759742 mean loss: 48.059068123479726
Step: 15001 mean reward sum: -437.350393322355 mean loss: 20.770620531397352
Step: 20001 mean reward sum: -2359.1489176483856 mean loss: 22.181179281515675
Step: 25001 mean reward sum: -1334.8973639842793 mean loss: 12.627284348996357
Step: 30001 mean reward sum: -246.41629337096884 mean loss: 13.009079466306416
Step: 35001 mean reward sum: -133.75057802899585 mean loss: 5.696019855153421
Step: 40001 mean reward sum: -152.87909088970406 mean loss: 7.985132964754268
Step: 45001 mean reward sum: -133.162337494026 mean loss: 16.87861307651841
Step: 50001 mean reward sum: -142.32250552443446 mean loss: 7.772499423858103
Step: 55001 mean reward sum: -72.03246179312387 mean loss: 5.992745566139879
Step: 60001 mean reward sum: -115.32136350613139 mean loss: 3.54883241

In [38]:
show_agent(env, agent1)

-38.85185349665383

In [39]:
agent1.restore_model()

In [45]:
show_agent(env, agent1)

-429.5328211961983

# Experience replay

In [13]:
class Memory:
    def __init__(self, size, observation_space, action_space):
        self.obs = np.zeros((size,) + observation_space.shape)
        self.obs_next = np.zeros((size,) + observation_space.shape)
        self.actions = np.zeros((size,) + action_space.shape, dtype=action_space.dtype)
        self.dones = np.zeros((size,))
        self.rewards = np.zeros((size,))
        self.probs = np.zeros((size,))
        self.size = size
        self.cur_size = 0
        self.cur = 0

    def put(self, obs, obs_next, action, reward, done, prob=1.):
        self.obs[self.cur] = obs
        self.obs_next[self.cur] = obs_next
        self.actions[self.cur] = action
        self.dones[self.cur] = done
        self.rewards[self.cur] = reward
        self.probs[self.cur] = prob

        self.cur = (self.cur + 1) % self.size
        self.cur_size = min(self.cur_size + 1, self.size)

    def get(self, batch_size, probs=False):
        ids = np.random.choice(self.cur_size, size=batch_size)
        batch = self.obs[ids], self.obs_next[ids], self.actions[ids], self.rewards[ids], self.dones[ids]
        return batch + ((self.probs[ids],) if probs else ())

# DQN

In [14]:
def train_dqn_agent(env, gamma=0.9, exploration=0.05, lr=0.0001, batch_size=100, steps=100000, memsize=100000, neurons=32):
    train_env = gym.make(env)
    test_env = gym.make(env)
    agent = QAgent(train_env, make_model(train_env, train_env.action_space.n, neurons), gamma, exploration, lr)

    memory = Memory(memsize, train_env.observation_space, train_env.action_space)

    total_loss = 0
    t = 0
    
    best_rewards = -np.inf

    def test():
        nonlocal best_rewards
        reward_mean = sum(test_agent(test_env, agent) for _ in range(TEST_N)) / TEST_N
        print(f'Step: {i + 1} mean reward sum: {reward_mean} mean loss: {total_loss / t if t else 0}')
        if reward_mean > best_rewards:
            agent.save_model()
            print('Saved')
            best_rewards = reward_mean
        
    
    for i in range(steps):
        obs, obs_next, action, reward, done = agent.step()
        memory.put(obs, obs_next, action, reward, done)

        total_loss += agent.learning_step(*memory.get(batch_size))

        t += 1

        if i % TEST_I == 0:
            test()
            total_loss = 0
            t = 0
    
    test()

    return agent

In [49]:
agent = train_dqn_agent('LunarLander-v2', gamma=0.97, lr=0.003, neurons=64, steps=100000)

Step: 1 mean reward sum: -589.4532259495973 mean loss: 451.6076965332031
Saved
Step: 5001 mean reward sum: 25.566366433121694 mean loss: 1603.343657772827
Saved
Step: 10001 mean reward sum: -2.6532086871345277 mean loss: 1019.8448606384277
Step: 15001 mean reward sum: 91.62939103652526 mean loss: 786.3614880767823
Saved
Step: 20001 mean reward sum: 52.431662887182014 mean loss: 935.8190541107177
Step: 25001 mean reward sum: -62.101914926346254 mean loss: 1114.9148112884523
Step: 30001 mean reward sum: 144.5247762769173 mean loss: 1187.7598142807008
Saved
Step: 35001 mean reward sum: 95.73312910134895 mean loss: 1148.4959586090088
Step: 40001 mean reward sum: 105.62987031618741 mean loss: 1078.9828122619629
Step: 45001 mean reward sum: -56.76358648452176 mean loss: 1040.9512893493652
Step: 50001 mean reward sum: 188.78410902172124 mean loss: 922.8347435745239
Saved
Step: 55001 mean reward sum: -18.72456238830589 mean loss: 931.2889545135498
Step: 60001 mean reward sum: 159.9261816203336

In [53]:
show_agent(env, agent)

263.09914197710935

In [57]:
agent.restore_model()

In [58]:
show_agent(env, agent)

152.80746353685973

# Actor-Critic

In [15]:
sample_env_c = gym.make('LunarLanderContinuous-v2')

In [16]:
sample_env_c.observation_space

Box([-inf -inf -inf -inf -inf -inf -inf -inf], [inf inf inf inf inf inf inf inf], (8,), float32)

In [17]:
sample_env_c.action_space

Box([-1. -1.], [1. 1.], (2,), float32)

In [18]:
import tensorflow_probability as tfp

In [35]:
class ActorCriticAgent:
    def __init__(self, env, actor, critic, gamma, sigma, actor_lr, critic_lr, beta=0.1, b=3):
        self.env = env
        self.obs = self.env.reset()

        self.gamma = gamma
        self.actor = actor
        self.critic = critic
        self.b = b
        self.beta = beta
        self.bounds = np.maximum(self.env.action_space.low, self.env.action_space.high).reshape((1, -1))

        self.noise = tfp.distributions.MultivariateNormalDiag(
            tf.zeros(self.env.action_space.shape),
            tf.ones(self.env.action_space.shape) * sigma
        )
        
        self.actor_optimizer = tf.keras.optimizers.Adam(learning_rate=actor_lr)
        self.critic_optimizer = tf.keras.optimizers.Adam(learning_rate=critic_lr)
        
    def step(self):
        action, prob = self.act(self.obs.reshape(1, -1))
        obs, reward, done, _ = self.env.step(action)
        prev_obs = self.obs

        if done:
            self.obs = self.env.reset()
        else:
            self.obs = obs

        return prev_obs, obs, action, reward, done, prob

    def act(self, obs, explore=True):
        means = self.actor(obs.reshape((1, -1))).numpy()[0]
        
        if explore:
            noise = self.noise.sample()
            probs = self.noise.prob(noise)
            return means + noise.numpy(), probs.numpy()
        else:
            return means

    def learning_step(self, obs, obs_next, actions, rewards, dones, probs):
        with tf.GradientTape(persistent=True) as tape:
            v = self.critic(obs)
            v_next = tf.stop_gradient(self.critic(obs_next)) * (1 - dones).reshape((-1, 1))
            cur_actions = self.actor(obs)
            td = (rewards.reshape((-1, 1)) + self.gamma * v_next) - v
            
            cur_probs = self.noise.prob(actions - cur_actions)

            is_base = tf.reshape((cur_probs / probs), (-1, 1))
            is_ = np.tanh(is_base / self.b) * self.b

            critic_loss = tf.reduce_mean(td ** 2 * tf.stop_gradient(is_))
            actor_loss = tf.reduce_mean(-self.noise.log_prob(actions - cur_actions) * tf.stop_gradient(td * is_))

            bounds_penalty = tf.reduce_mean(tf.maximum(tf.abs(cur_actions) - self.bounds, 0.) ** 2 * self.beta)

            actor_loss_with_bounds = actor_loss + bounds_penalty

        self.actor_optimizer.minimize(actor_loss_with_bounds, self.actor.trainable_variables, tape=tape)
        self.critic_optimizer.minimize(critic_loss, self.critic.trainable_variables, tape=tape)
        
        return np.array([critic_loss.numpy(), actor_loss.numpy(), bounds_penalty.numpy()])

    def save_model(self):
        self.saved_actor = tf.keras.models.clone_model(self.actor)
        self.saved_critic = tf.keras.models.clone_model(self.critic)

    def restore_model(self):
        self.actor = tf.keras.models.clone_model(self.saved_actor)
        self.critic = tf.keras.models.clone_model(self.saved_critic)

In [20]:
def train_actor_critic_agent(
    env, 
    gamma=0.9, sigma=0.25, actor_lr=0.00001, critic_lr=0.00001, 
    batch_size=100, steps=100000, memsize=100000, learning_starts=1000, neurons=64):
    train_env = gym.make(env)
    test_env = gym.make(env)
    agent = ActorCriticAgent(
        train_env,
        make_model(train_env, train_env.action_space.shape[0], neurons),
        make_model(train_env, 1, neurons),
        gamma,
        sigma,
        actor_lr,
        critic_lr
    )

    memory = Memory(memsize, train_env.observation_space, train_env.action_space)

    total_loss = 0
    t = 0
    
    best_rewards = -np.inf

    def test():
        nonlocal best_rewards
        reward_mean = sum(test_agent(test_env, agent) for _ in range(TEST_N)) / TEST_N
        print(f'Step: {i + 1} mean reward sum: {reward_mean} mean loss: {total_loss / t if t else 0}')
        if reward_mean > best_rewards:
            agent.save_model()
            print('Saved')
            best_rewards = reward_mean
        
    
    for i in range(steps):
        obs, obs_next, action, reward, done, prob = agent.step()
        memory.put(obs, obs_next, action, reward, done, prob)

        if i >= learning_starts:
            total_loss += agent.learning_step(*memory.get(batch_size, True))

        t += 1

        if i % TEST_I == 0:
            test()
            total_loss = 0
            t = 0
    
    test()

    return agent

In [None]:
train_actor_critic_agent('LunarLanderContinuous-v2', gamma=0.97, actor_lr=0.001, critic_lr=0.001, sigma=0.4)

Step: 1 mean reward sum: -401.41748249522874 mean loss: 0.0
Saved
Step: 5001 mean reward sum: -679.6874770824703 mean loss: [   1.075956 -549.7446    242.49194 ]
Step: 10001 mean reward sum: -157.08632907158258 mean loss: [ 5.3108168e-01 -7.7965417e+02  3.5481546e+02]
Saved
Step: 15001 mean reward sum: -416.2332664487665 mean loss: [  1.4009084 -20.140121    1.473731 ]
Step: 20001 mean reward sum: -427.3849108320004 mean loss: [  1.4612473 -33.61339     1.1907741]
Step: 25001 mean reward sum: -403.70088146613335 mean loss: [  1.764594  -29.53783     1.8621186]
Step: 30001 mean reward sum: -241.80583687378072 mean loss: [  1.6304636 -28.923712    1.8546697]
Step: 35001 mean reward sum: -108.74096714086495 mean loss: [ 2.9188602 -5.2435503  1.555732 ]
Saved
Step: 40001 mean reward sum: -530.2717268279341 mean loss: [  2.1983721 -13.903837    1.28333  ]
Step: 45001 mean reward sum: -1044.5089603356525 mean loss: [  2.572347  -10.231569    1.3730661]
Step: 50001 mean reward sum: -150.80501

In [None]:
train_actor_critic_agent('LunarLanderContinuous-v2', gamma=0.97, actor_lr=0.0001, critic_lr=0.0001, sigma=0.4)

In [None]:
train_actor_critic_agent('LunarLanderContinuous-v2', gamma=0.97, actor_lr=0.01, critic_lr=0.01, sigma=0.4)

In [None]:
train_actor_critic_agent('LunarLanderContinuous-v2', gamma=0.97, actor_lr=0.001, critic_lr=0.001, sigma=0.25)