In [None]:
import gym
import tensorflow as tf
import numpy as np

In [None]:
def make_model(input_shape, outputs, neurons, out_activation,
               hidden_activation='tanh', hidden_layers=2):
    """Build a fully connected Keras ``Sequential`` network.

    Args:
        input_shape: Shape passed to ``tf.keras.layers.Input``.
        outputs: Number of units in the final ``Dense`` layer.
        neurons: Number of units in every hidden ``Dense`` layer.
        out_activation: Activation given to the final ``Dense`` layer
            (e.g. ``'softmax'`` or ``None``).
        hidden_activation: Activation given to every hidden layer. Defaults to
            ``'tanh'``, the value that was previously hard-coded.
        hidden_layers: Number of hidden ``Dense`` layers to stack. Defaults to
            ``2``, the depth that was previously hard-coded.

    Returns:
        The assembled ``tf.keras.Sequential`` model.

    Raises:
        ValueError: If ``hidden_layers`` is negative.
    """
    if hidden_layers < 0:
        raise ValueError(f"hidden_layers must be >= 0, got {hidden_layers}")

    layers = [tf.keras.layers.Input(input_shape)]
    # Every hidden layer shares the same width and activation.
    layers.extend(
        tf.keras.layers.Dense(neurons, activation=hidden_activation)
        for _ in range(hidden_layers)
    )
    layers.append(tf.keras.layers.Dense(outputs, activation=out_activation))
    return tf.keras.Sequential(layers)

In [None]:
class Critic:
    """Value estimator: a single-output network paired with an Adam optimizer.

    Args:
        input_shape: Shape forwarded to ``make_model`` for the input layer.
        neurons: Hidden-layer width forwarded to ``make_model``.
        lr: Learning rate of the Adam optimizer.
    """

    def __init__(self, input_shape, neurons, lr):
        # One output unit with no output activation.
        self.model = make_model(input_shape, 1, neurons, None)
        self.optimizer = tf.keras.optimizers.Adam(learning_rate=lr)

    @tf.function
    def advantage(self, obs, obs_last, rewards, done, discount):
        """Compute ``rewards + discount * V(next) - V(obs)`` for a batch.

        The "next" value of row ``i`` is the model's value of row ``i + 1``;
        the final row uses the value of ``obs_last``, multiplied by
        ``1 - done`` so it is zeroed when ``done`` is true. Gradients are
        blocked through the discounted next-value term.

        Args:
            obs: Batch of observations fed to the model.
                NOTE(review): presumably consecutive steps of one trajectory,
                since row ``i + 1`` is used as the successor of row ``i`` --
                confirm against the caller.
            obs_last: Observation following the last row of ``obs``; it is
                reshaped to a single-row batch.
            rewards: Per-row rewards, added element-wise.
            done: Flag cast to float32 to mask the value of ``obs_last``.
            discount: Factor multiplying the next-value term.

        Returns:
            Tensor of per-row advantages.
        """
        values = self.model(obs)
        keep_bootstrap = 1 - tf.cast(done, tf.float32)
        bootstrap = self.model(tf.reshape(obs_last, (1, -1))) * keep_bootstrap

        # Drop the trailing unit axis of both model outputs.
        values = values[:, 0]
        bootstrap = bootstrap[:, 0]

        # Shift values by one step and append the bootstrap value at the end.
        next_values = tf.concat([values[1:], bootstrap], axis=0)
        target = rewards + tf.stop_gradient(discount * next_values)
        return target - values

    def minimize(self, loss, tape):
        """Run one optimizer step on the model's trainable variables.

        Args:
            loss: Loss passed to the optimizer.
            tape: Gradient tape passed to the optimizer as ``tape``.
        """
        weights = self.model.trainable_variables
        self.optimizer.minimize(loss, weights, tape=tape)

In [None]:
class DiscreteActor:
    """Policy over discrete actions: a softmax network plus an Adam optimizer.

    Args:
        input_shape: Shape forwarded to ``make_model`` for the input layer.
        actions: Number of output units (one per action).
        neurons: Hidden-layer width forwarded to ``make_model``.
        lr: Learning rate of the Adam optimizer.
    """

    def __init__(self, input_shape, actions, neurons, lr):
        self.model = make_model(input_shape, actions, neurons, 'softmax')
        self.optimizer = tf.keras.optimizers.Adam(learning_rate=lr)

    @tf.function
    def prob(self, obs, actions):
        """Return only the second element of ``probs_prob(obs, actions)``."""
        _, selected = self.probs_prob(obs, actions)
        return selected

    @tf.function
    def probs_prob(self, obs, actions):
        """Evaluate the policy and pick out the entries of the given actions.

        Args:
            obs: Batch of observations fed to the model.
            actions: Action indices; reshaped to one index per row.
                NOTE(review): assumed to be an integer tensor with one entry
                per row of ``obs`` -- confirm against the caller.

        Returns:
            A pair ``(outs, picked)``: the full model output and, for each
            row, the output entry at that row's action index.
        """
        distribution = self.model(obs)
        row_index = tf.reshape(actions, (-1, 1))
        picked = tf.gather_nd(distribution, row_index, batch_dims=1)
        return distribution, picked

    def act(self, obs, explore):
        """Choose an action for a single observation.

        Args:
            obs: Observation exposing ``reshape`` (e.g. a NumPy array); it is
                flattened into a single-row batch.
            explore: When truthy, sample an action index using the model
                output as probabilities; otherwise take the argmax.

        Returns:
            The chosen action index.
        """
        probs = self.model(obs.reshape((1, -1))).numpy()[0]
        if not explore:
            return np.argmax(probs)
        return np.random.choice(probs.shape[0], p=probs)

    def minimize(self, loss, tape):
        """Run one optimizer step on the model's trainable variables.

        Args:
            loss: Loss passed to the optimizer.
            tape: Gradient tape passed to the optimizer as ``tape``.
        """
        weights = self.model.trainable_variables
        self.optimizer.minimize(loss, weights, tape=tape)

In [None]:
def test_actor(actor, test_env, num, show=False):
    rewards_sum = 0
    for _ in range(num):
        obs = test_env.reset()
        done = False
        
        if show:
            test_env.render()

        while not done:
            action = actor.act(obs, explore=False)
            obs, reward, done, _ = test_env.step(action)

            if show:
                test_env.render()

            rewards_sum += reward
    return rewards_sum / num