In [1]:
import gym
import numpy as np
import tensorflow as tf

In [2]:
sample_env = gym.make('LunarLander-v2')

In [3]:
sample_env

<TimeLimit<LunarLander<LunarLander-v2>>>

In [4]:
sample_env.observation_space

Box([-inf -inf -inf -inf -inf -inf -inf -inf], [inf inf inf inf inf inf inf inf], (8,), float32)

In [5]:
sample_env.action_space

Discrete(4)

# Show time

In [12]:
sample_env.reset()

array([ 0.00192852,  1.4072621 ,  0.1953179 , -0.16257307, -0.00222785,
       -0.04424242,  0.        ,  0.        ], dtype=float32)

In [7]:
sample_env.render()

True

In [13]:
done = False

while not done:
    _, _, done, _ = sample_env.step(sample_env.action_space.sample())
    sample_env.render()

In [14]:
sample_env.step(0)

(array([ 0.07781267, -0.03735193,  0.01773274, -0.48825493,  0.01060515,
        -3.9004784 ,  1.        ,  0.        ], dtype=float32),
 -100,
 True,
 {})

# Q-learning

In [15]:
def test_agent(env, agent):
    obs = env.reset()
    
    rewards = 0
    done = False
    
    while not done:
        obs, reward, done, _ =  env.step(agent.act(obs, explore = False))
        rewards += reward
    
    return rewards

In [16]:
def show_agent(env, agent):
    obs = env.reset()
    
    rewards = 0
    done = False
    
    while not done:
        env.render()
        obs, reward, done, _ =  env.step(agent.act(obs, explore = False))
        rewards += reward
    
    return rewards

In [21]:
def make_model(env, outputs, neurons):
    return tf.keras.Sequential([
        tf.keras.layers.Input(env.observation_space.shape),
        tf.keras.layers.Dense(neurons, activation='tanh'),
        tf.keras.layers.Dense(neurons, activation='tanh'),
        tf.keras.layers.Dense(outputs)
    ])

In [18]:
TEST_I = 5000
TEST_N = 5

In [61]:
class QAgent:
    def __init__(self, env, model, gamma, exploration, lr):
        self.env = env
        self.obs = self.env.reset()
        
        self.model = model
        self.saved_model = tf.keras.models.clone_model(model)
        self.gamma = gamma
        self.optimizer = tf.keras.optimizers.Adam(learning_rate=lr)
        self.exploration = exploration

    def step(self):
        action = self.act(self.obs, explore=True)
        obs, reward, done, _ = self.env.step(action)
        prev_obs = self.obs
        
        if done:
            self.obs = self.env.reset()
        else:
            self.obs = obs

        return prev_obs, obs, action, reward, done

    def act(self, obs, explore):
        if explore and np.random.rand() < self.exploration:
            return self.env.action_space.sample()
        else:
            q = self.model(obs.reshape((1, -1))).numpy()
            return np.argmax(q)

    def learn(self, obs, obs_next, actions, rewards, done):
        with tf.GradientTape() as tape:
            loss = self.loss(obs, obs_next, actions, rewards, done)
        
        v = self.model.trainable_variables
        self.optimizer.minimize(loss, v, tape=tape)
        
        return loss.numpy().mean()

    def loss(self, obs, obs_next, actions, rewards, done):
        q = self.model(obs)
        q_next = self.model(obs_next) * (1 - done).reshape(-1, 1)
        
        diffs =\
            tf.stop_gradient(rewards.reshape(-1, 1) + self.gamma * tf.reduce_max(q_next, axis=-1, keepdims=True))\
            - tf.reshape(tf.gather_nd(q, actions.reshape(-1, 1), batch_dims=1), (-1, 1))
        
        return tf.reduce_mean(diffs ** 2)

    def save_model(self):
        self.saved_model.set_weights(self.model.get_weights())

    def load_model(self):
        self.model.set_weights(self.saved_model.get_weights())

In [22]:
agent = QAgent(sample_env, make_model(sample_env, sample_env.action_space.n, 64), 0.97, 0.05, 0.001)

In [27]:
show_agent(sample_env, agent)

-446.3824781359425

In [40]:
def train_q_agent(env_name, gamma, exploration, lr, neurons, steps):
    test_env = gym.make(env_name)
    train_env = gym.make(env_name)
    agent = QAgent(train_env, make_model(train_env, train_env.action_space.n, neurons), gamma, exploration, lr)
    
    best_rewards = -np.inf
    total_loss = 0
    t = 0
    i = 0
    
    def test():
        nonlocal best_rewards
        reward_mean = sum(test_agent(test_env, agent) for _ in range(TEST_N)) / TEST_N
        print(f'Step: {i}; mean rewards: {reward_mean} mean loss: {total_loss / t}')
        if reward_mean > best_rewards:
            best_rewards = reward_mean
            agent.save_model()

    for i in range(steps):
        obs, obs_next, action, reward, done = agent.step()
        
        loss = agent.learn(
            obs.reshape((1, -1)),
            obs_next.reshape((1, -1)),
            np.array([action]),
            np.array([reward]),
            np.array([done])
        )
        
        total_loss += loss
        t += 1
        
        if i % TEST_I == 0:
            test()
            total_loss = 0
            t = 0

    test()
    
    return agent

In [48]:
agent = train_q_agent('LunarLander-v2', 0.97, 0.05, 0.001, 64, 75000)

Step: 0; mean rewards: -177.17759518167736 mean loss: 0.018093472346663475
Step: 5000; mean rewards: -558.6611788753227 mean loss: 111.38982937799896
Step: 10000; mean rewards: -91.03338388189124 mean loss: 89.59023281809345
Step: 15000; mean rewards: -509.3658352870933 mean loss: 172.99317659972957
Step: 20000; mean rewards: -526.6223625931985 mean loss: 186.43246589719314
Step: 25000; mean rewards: -105.56304294211853 mean loss: 162.472920534716
Step: 30000; mean rewards: -572.1939757566408 mean loss: 140.09841206688267
Step: 35000; mean rewards: -109.21074362285876 mean loss: 192.83290152246377
Step: 40000; mean rewards: -236.6381625160443 mean loss: 196.18119077835246
Step: 45000; mean rewards: -289.5913443539947 mean loss: 162.15580050305823
Step: 50000; mean rewards: -536.0540739040664 mean loss: 168.2273138917415
Step: 55000; mean rewards: -240.6857538616799 mean loss: 167.3019231625149
Step: 60000; mean rewards: -192.52857487252624 mean loss: 168.18432192580292
Step: 65000; mea

In [57]:
show_agent(sample_env, agent)

-50.57960776961261

In [53]:
agent.load_model()

In [58]:
class Memory:
    def __init__(self, size, observation_space, action_space):
        self.obs = np.zeros((size,) + observation_space.shape, dtype=observation_space.dtype)
        self.obs_next = np.zeros((size,) + observation_space.shape, dtype=observation_space.dtype)
        self.actions = np.zeros((size,) + action_space.shape, dtype=action_space.dtype)
        
        self.rewards = np.zeros((size,), dtype=np.float32)
        self.dones = np.zeros((size,), dtype=np.float32)
        
        self.size = size
        self.cur_size = 0
        self.next = 0

    def put(self, obs, obs_next, action, reward, done):
        self.obs[self.next] = obs
        self.obs_next[self.next] = obs_next
        self.actions[self.next] = action
        self.rewards[self.next] = reward
        self.dones[self.next] = done
        
        self.next = (self.next + 1) % self.size
        self.cur_size = min(self.cur_size + 1, self.size)

    def get(self, batch_size):
        ids = np.random.choice(self.cur_size, size=batch_size)
        return self.obs[ids], self.obs_next[ids], self.actions[ids], self.rewards[ids], self.dones[ids]

In [59]:
def train_q_agent(env_name, gamma, exploration, lr, neurons, steps, batch_size, mem_size):
    test_env = gym.make(env_name)
    train_env = gym.make(env_name)
    agent = QAgent(train_env, make_model(train_env, train_env.action_space.n, neurons), gamma, exploration, lr)
    
    memory = Memory(mem_size, train_env.observation_space, train_env.action_space)
    
    best_rewards = -np.inf
    total_loss = 0
    t = 0
    i = 0
    
    def test():
        nonlocal best_rewards
        reward_mean = sum(test_agent(test_env, agent) for _ in range(TEST_N)) / TEST_N
        print(f'Step: {i}; mean rewards: {reward_mean} mean loss: {total_loss / t}')
        if reward_mean > best_rewards:
            best_rewards = reward_mean
            agent.save_model()

    for i in range(steps):
        obs, obs_next, action, reward, done = agent.step()
        memory.put(obs, obs_next, action, reward, done)
        
        loss = agent.learn(*memory.get(batch_size))
        
        total_loss += loss
        t += 1
        
        if i % TEST_I == 0:
            test()
            total_loss = 0
            t = 0

    test()
    
    return agent

In [62]:
agent = train_q_agent('LunarLander-v2', 0.97, 0.05, 0.001, 64, 75000, 100, 75000)

Step: 0; mean rewards: -337.71626465031477 mean loss: 2.7995450496673584
Step: 5000; mean rewards: 72.49090555055014 mean loss: 39.592219853186606
Step: 10000; mean rewards: 8.005744037706513 mean loss: 14.014898508131504
Step: 15000; mean rewards: -53.987472826910675 mean loss: 7.9595151462435725
Step: 20000; mean rewards: 117.83277603120018 mean loss: 6.8369340963721275
Step: 25000; mean rewards: 93.03897114283225 mean loss: 7.572686772716045
Step: 30000; mean rewards: 45.401753638273455 mean loss: 8.869809253382682
Step: 35000; mean rewards: 127.78036391856662 mean loss: 9.444670363330841
Step: 40000; mean rewards: 39.39960036487311 mean loss: 9.127043556249141
Step: 45000; mean rewards: 44.698645065746256 mean loss: 9.685314298141003
Step: 50000; mean rewards: 11.17170726632415 mean loss: 10.15371111023426
Step: 55000; mean rewards: 112.00017803436006 mean loss: 9.886017969548702
Step: 60000; mean rewards: 202.0123223733297 mean loss: 10.50576485325098
Step: 65000; mean rewards: 14

In [65]:
show_agent(sample_env, agent)

210.3417547808289

In [64]:
agent.load_model()

In [66]:
sample_env.close()

In [67]:
sample_env = gym.make('LunarLanderContinuous-v2')

In [68]:
sample_env.observation_space

Box([-inf -inf -inf -inf -inf -inf -inf -inf], [inf inf inf inf inf inf inf inf], (8,), float32)

In [69]:
sample_env.action_space

Box([-1. -1.], [1. 1.], (2,), float32)

In [70]:
import tensorflow_probability as tfp

In [72]:
class ACMemory:
    def __init__(self, size, observation_space, action_space):
        self.obs = np.zeros((size,) + observation_space.shape, dtype=observation_space.dtype)
        self.obs_next = np.zeros((size,) + observation_space.shape, dtype=observation_space.dtype)
        self.actions = np.zeros((size,) + action_space.shape, dtype=action_space.dtype)
        
        self.rewards = np.zeros((size,), dtype=np.float32)
        self.probs = np.zeros((size,), dtype=np.float32)
        self.dones = np.zeros((size,), dtype=np.float32)
        
        self.size = size
        self.cur_size = 0
        self.next = 0

    def put(self, obs, obs_next, action, reward, done, probs):
        self.obs[self.next] = obs
        self.obs_next[self.next] = obs_next
        self.actions[self.next] = action
        self.rewards[self.next] = reward
        self.dones[self.next] = done
        self.probs[self.next] = probs
        
        self.next = (self.next + 1) % self.size
        self.cur_size = min(self.cur_size + 1, self.size)

    def get(self, batch_size):
        ids = np.random.choice(self.cur_size, size=batch_size)
        return (
            self.obs[ids], self.obs_next[ids], self.actions[ids],
            self.rewards[ids], self.dones[ids], self.probs[ids]
        )

In [78]:
class ActorCriticAgent:
    def __init__(self, env, actor_model, critic_model, gamma, sigma, lr, b, beta):
        self.env = env
        self.obs = self.env.reset()
        
        self.actor = actor_model
        self.critic = critic_model
        self.saved_actor = tf.keras.models.clone_model(actor_model)
        self.saved_critic = tf.keras.models.clone_model(critic_model)
        self.gamma = gamma
        self.actor_optimizer = tf.keras.optimizers.Adam(learning_rate=lr)
        self.critic_optimizer = tf.keras.optimizers.Adam(learning_rate=lr)
        self.noise = tfp.distributions.MultivariateNormalDiag(
            tf.zeros(self.env.action_space.shape, dtype=tf.float32),
            tf.ones(self.env.action_space.shape, dtype=tf.float32) * sigma,
        )
        self.b = b
        self.beta = beta

    def step(self):
        action, prob = self.act(self.obs, explore=True)
        obs, reward, done, _ = self.env.step(action)
        prev_obs = self.obs
        
        if done:
            self.obs = self.env.reset()
        else:
            self.obs = obs

        return prev_obs, obs, action, reward, done, prob

    def act(self, obs, explore):
        actions = self.actor(obs.reshape(1, -1))[0]
        
        if explore:
            noise = self.noise.sample()
            probs = self.noise.prob(noise)
            return (actions + noise).numpy(), probs.numpy()
        else:
            return actions

    
    def learn(self, obs, obs_next, actions, rewards, done, probs):
        with tf.GradientTape() as tape:
            critic_loss, actor_loss, bounds_penalty = self.loss(obs, obs_next, actions, rewards, done, probs)
                
        return np.array([critic_loss.numpy(), actor_loss.numpy(), bounds_penalty.numpy()])
    
    @tf.function
    def loss(self, obs, obs_next, actions, rewards, done, probs):
        with tf.GradientTape(persistent=True) as tape:
            v = self.critic(obs)
            v_next = self.critic(obs_next)
            
            cur_actions = self.actor(obs)
            cur_log_probs = self.noise.log_prob(actions - cur_actions)
            cur_probs = tf.exp(cur_log_probs)
            
            is_base = tf.reshape(cur_probs / probs, (-1, 1))
            is_ = tf.tanh(is_base / self.b) * self.b
            
            td = tf.reshape(rewards, (-1, 1)) + self.gamma * tf.stop_gradient(v_next) - v
            
            critic_loss = tf.reduce_mean(td ** 2 * is_)
            actor_loss = tf.reduce_mean(-tf.reshape(cur_log_probs, (-1, 1)) * td * tf.stop_gradient(is_))
            
            bounds_penalty = tf.reduce_mean(
                tf.maximum(tf.abs(cur_actions) - tf.reshape(self.env.action_space.high, (1, -1)), 0)
                ** 2 * self.beta
            )
            
            actor_loss_with_bounds = actor_loss + bounds_penalty
        
        self.actor_optimizer.minimize(actor_loss_with_bounds, self.actor.trainable_variables, tape=tape)
        self.critic_optimizer.minimize(critic_loss, self.critic.trainable_variables, tape=tape)
        
        return actor_loss, critic_loss, bounds_penalty

    def save_model(self):
        self.saved_actor.set_weights(self.actor.get_weights())
        self.saved_critic.set_weights(self.critic.get_weights())

    def load_model(self):
        self.actor.set_weights(self.saved_actor.get_weights())
        self.critic.set_weights(self.saved_critic.get_weights())

In [74]:
def train_ac_agent(env_name, gamma, sigma, lr, b, beta, neurons, steps, batch_size, mem_size):
    test_env = gym.make(env_name)
    train_env = gym.make(env_name)
    agent = ActorCriticAgent(
        train_env,
        make_model(train_env, train_env.action_space.shape[0], neurons),
        make_model(train_env, 1, neurons),
        gamma, sigma, lr, b, beta
    )
    
    memory = ACMemory(mem_size, train_env.observation_space, train_env.action_space)
    
    best_rewards = -np.inf
    total_loss = 0
    t = 0
    i = 0
    
    def test():
        nonlocal best_rewards
        reward_mean = sum(test_agent(test_env, agent) for _ in range(TEST_N)) / TEST_N
        print(f'Step: {i}; mean rewards: {reward_mean} mean loss: {total_loss / t}')
        if reward_mean > best_rewards:
            best_rewards = reward_mean
            agent.save_model()

    for i in range(steps):
        obs, obs_next, action, reward, done, probs = agent.step()
        memory.put(obs, obs_next, action, reward, done, probs)
        
        loss = agent.learn(*memory.get(batch_size))
        
        total_loss += loss
        t += 1
        
        if i % TEST_I == 0:
            test()
            total_loss = 0
            t = 0

    test()
    
    return agent

In [80]:
agent = train_ac_agent('LunarLanderContinuous-v2', 0.97, 0.3, 0.001, 3, 0.1, 64, 50000, 100, 75000)

Step: 0; mean rewards: -388.3604990078528 mean loss: [-0.05058021 12.393817    0.        ]
Step: 5000; mean rewards: -147.75382120326506 mean loss: [-0.07964392  5.8693876   0.06297314]
Step: 10000; mean rewards: -203.96583217131396 mean loss: [-0.04786463  1.9669648   0.02933767]
Step: 15000; mean rewards: 58.715670562866286 mean loss: [-0.0310049   1.1684937   0.02051632]
Step: 20000; mean rewards: -128.98516214331724 mean loss: [-0.03323267  0.5207679   0.01062053]
Step: 25000; mean rewards: -95.16010696081925 mean loss: [-0.04176664  0.41489357  0.00858743]
Step: 30000; mean rewards: -88.0114589685005 mean loss: [-0.03559497  0.45041865  0.00724451]
Step: 35000; mean rewards: -26.00342753575557 mean loss: [-0.03597514  0.34646347  0.00603455]
Step: 40000; mean rewards: -59.711151495201726 mean loss: [-0.03253345  0.2602508   0.00534563]
Step: 45000; mean rewards: 31.811532687978676 mean loss: [-0.02531077  0.25308514  0.00502698]
Step: 49999; mean rewards: -71.84593574794485 mean l

In [88]:
show_agent(sample_env, agent)

-27.669053611754848

In [89]:
sample_env.close()