In [1]:
import tensorflow as tf
import tensorflow.keras.layers as layers
import numpy as np

import gym
import cv2
import collections
import gym.spaces

import time

In [2]:
DEFAULT_ENV_NAME = "PongNoFrameskip-v4"
MEAN_REWARD_BOUND = 19.5

GAMMA = 0.99
BATCH_SIZE = 32
REPLAY_SIZE = 10000
LEARNING_RATE = 1e-4
SYNC_TARGET_FRAMES = 1000
REPLAY_START_SIZE = 10000

EPSILON_DECAY_LAST_FRAME = 10**5
EPSILON_START = 1.0
EPSILON_FINAL = 0.02

Experience = collections.namedtuple('Experience', field_names=['state', 'action', 'reward', 'done', 'new_state'])

In [3]:
class DQN(tf.keras.Model):
    def __init__(self, n_actions):
        super(DQN, self).__init__()
        self.conv = tf.keras.Sequential([
            layers.Conv2D(kernel_size=8, strides=4, filters=32),
            layers.ReLU(),
            layers.Conv2D(filters=64, kernel_size=4, strides=2),
            layers.ReLU(),
            layers.Conv2D(filters=64, kernel_size=3, strides=1),
            layers.ReLU(),
            layers.Flatten(),
            layers.Dense(512, activation="relu"),
            layers.Dense(n_actions)
        ])

        # self.fc = tf.keras.Sequential(
        #     layers.Dense(512, activation="relu"),
        #     layers.Dense(n_actions)
        # )

    def call(self, inputs, training=None, mask=None):
        # conv_out = tf.reshape(self.conv(inputs), shape=(inputs.shape()[0], -1))
        return self.conv(inputs)

In [4]:
class FireResetEnv(gym.Wrapper):
    def __init__(self, env):
        super(FireResetEnv, self).__init__(env)
        assert env.unwrapped.get_action_meanings()[1] == "FIRE"
        assert len(env.unwrapped.get_action_meanings()) >= 3

    def step(self, action):
        return self.env.step(action)

    def reset(self):
        self.env.reset()
        obs, _, done, _ = self.env.step(1)
        if done:
            self.env.reset()
        obs, _, done, _ = self.env.step(2)
        if done:
            self.env.reset()
        return obs



In [5]:
class MaxAndSkipEnv(gym.Wrapper):
    def __init__(self, env=None, skip=4):
        """Return only every `skip`-th frame"""
        super(MaxAndSkipEnv, self).__init__(env)
        # most recent raw observations (for max pooling across time steps)
        self._obs_buffer = collections.deque(maxlen=2)
        self._skip = skip

    def step(self, action):
        total_reward = 0.0
        done = None
        for _ in range(self._skip):
            obs, reward, done, info = self.env.step(action)
            self._obs_buffer.append(obs)
            total_reward += reward
            if done:
                break
        max_frame = np.max(np.stack(self._obs_buffer), axis=0)
        return max_frame, total_reward, done, info


    def reset(self):
        """Clear past frame buffer and init. to first obs. from inner env."""
        self._obs_buffer.clear()
        obs = self.env.reset()
        self._obs_buffer.append(obs)
        return obs

In [6]:
class ProcessFrame84(gym.ObservationWrapper):
    def __init__(self, env=None):
        super(ProcessFrame84, self).__init__(env)
        self.observation_space = gym.spaces.Box(low=0, high=255, shape=(84, 84, 1), dtype=np.uint8)

    def observation(self, obs):
        return ProcessFrame84.process(obs)

    @staticmethod
    def process(frame):
        if frame.size == 210 * 160 * 3:
            img = np.reshape(frame, [210, 160, 3]).astype(np.float32)
        elif frame.size == 250 * 160 * 3:
            img = np.reshape(frame, [250, 160, 3]).astype(np.float32)
        else:
            assert False, "Unknown resolution."
        img = img[:, :, 0] * 0.299 + img[:, :, 1] * 0.587 + img[:, :, 2] * 0.114
        resized_screen = cv2.resize(img, (84, 110), interpolation=cv2.INTER_AREA)
        x_t = resized_screen[18:102, :]
        x_t = np.reshape(x_t, [84, 84, 1])
        return x_t.astype(np.uint8)

In [7]:
class ImageToTensor(gym.ObservationWrapper):
    def __init__(self, env):
        super(ImageToTensor, self).__init__(env)
        old_shape = self.observation_space.shape
        self.observation_space = gym.spaces.Box(low=0.0, high=1.0, shape=(old_shape[-1], old_shape[0], old_shape[1]),
                                                dtype=np.float32)

    def observation(self, observation):
        return np.moveaxis(observation, 2, 0)


In [8]:
class ConvertToTensorflowFormat(gym.ObservationWrapper):
    def __init__(self, env):
        super(ConvertToTensorflowFormat, self).__init__(env)
        old_shape = self.observation_space.shape
        self.observation_space = gym.spaces.Box(low=0.0, high=1.0, shape=(old_shape[1], old_shape[2], old_shape[0]),
                                                dtype=np.float32)

    def observation(self, observation):
        return tf.transpose(observation, perm=[2, 1, 0])

In [9]:
class ScaledFloatFrame(gym.ObservationWrapper):
    def observation(self, obs):
        return np.array(obs).astype(np.float32) / 255.0


class BufferWrapper(gym.ObservationWrapper):
    def __init__(self, env, n_steps, dtype=np.float32):
        super(BufferWrapper, self).__init__(env)
        self.dtype = dtype
        old_space = env.observation_space
        self.observation_space = gym.spaces.Box(old_space.low.repeat(n_steps, axis=0),
                                                old_space.high.repeat(n_steps, axis=0), dtype=dtype)

    def reset(self):
        self.buffer = np.zeros_like(self.observation_space.low, dtype=self.dtype)
        return self.observation(self.env.reset())

    def observation(self, observation):
        self.buffer[:-1] = self.buffer[1:]
        self.buffer[-1] = observation
        return self.buffer

In [10]:
class ExperienceBuffer:
    def __init__(self, capacity):
        self.buffer = collections.deque(maxlen=capacity)

    def __len__(self):
        return len(self.buffer)

    def append(self, experience):
        self.buffer.append(experience)

    def sample(self, batch_size):
        indices = np.random.choice(len(self.buffer), batch_size, replace=False)
        states, actions, rewards, donnes, next_states = zip(*[self.buffer[idx] for idx in indices])
        return np.array(states), np.array(actions), np. array(rewards, dtype=np.float32), \
            np.array(donnes, dtype=np.uint8), np.array(next_states)


In [11]:
class Agent:
    def __init__(self, env, exp_buffer):
        self.env = env
        self.exp_buffer = exp_buffer
        self._reset()

    def _reset(self):
        self.state = self.env.reset()
        self.total_reward = 0.0

    def play_step(self, net, epsilon=0.0):
        done_reward = None

        if np.random.random() < epsilon:
            action = self.env.action_space.sample()
        else:
            state_a = np.array([self.state], copy=False)
            state_v = tf.convert_to_tensor(state_a)
            q_vals_v = net(state_v)
            act_v = tf.reduce_max(q_vals_v, axis=1)
            action = int(act_v.numpy())

        new_state, reward, is_done, _ = self.env.step(action)
        self.total_reward += reward

        exp = Experience(self.state, action, reward, is_done, new_state)
        self.exp_buffer.append(exp)
        self.state = new_state
        if is_done:
            done_reward = self.total_reward
            self._reset()
        return done_reward


In [34]:
def calc_loss(batch, net, tgt_net):
    states, actions, rewards, dones, next_states = batch

    states_v = tf.convert_to_tensor(states)
    next_states_v = tf.convert_to_tensor(next_states)
    actions_v = tf.convert_to_tensor(actions)
    rewards_v = tf.convert_to_tensor(rewards)
    done_mask = tf.convert_to_tensor(dones, dtype=tf.bool)
    net.build(input_shape=states_v.shape)
    tgt_net.build(input_shape=states_v.shape)
    out = net(states_v)
    # state_action_values = tf.squeeze(net(states_v).gather(1, tf.expand_dims(actions_v, axis=-1)), axis=-1)
    # print("Fucking", actions_v.numpy())
    out = tf.gather(out, axis=1, indices=actions_v.numpy())
    # state_action_values = tf.squeeze(out, axis=-1)
    # print("Done mask", done_mask)
    state_action_values = out
    next_state_values = tf.reduce_max(tgt_net(next_states_v), axis=1)
    # print(next_state_values.shape)
    # next_state_values[done_mask.numpy()] = 0.0
    tf.boolean_mask(next_state_values, done_mask)
    # next_state_values = next_state_values

    expected_state_action_values = next_state_values * GAMMA + rewards_v
    return tf.keras.losses.MSE(expected_state_action_values, state_action_values)




In [35]:
# calc_loss(batch, net, tgt_net)

<tf.Tensor: shape=(32,), dtype=float32, numpy=
array([0.09313925, 0.09335377, 0.09374555, 0.09297328, 0.09252632,
       0.09321696, 0.09295488, 0.09287617, 0.09300581, 0.09260686,
       0.09299167, 0.09290323, 0.09307529, 0.0928075 , 0.09273619,
       0.09338102, 0.09316211, 0.0931477 , 0.0931766 , 0.09287138,
       0.09293686, 0.09273384, 0.0929409 , 0.09226356, 0.09308209,
       0.09300581, 0.09316253, 0.09298011, 0.09295171, 0.09300581,
       0.09353915, 0.09360252], dtype=float32)>

In [37]:
env = gym.make(DEFAULT_ENV_NAME)
env = MaxAndSkipEnv(env)
env = FireResetEnv(env)
env = ProcessFrame84(env)
env = ImageToTensor(env)
env = BufferWrapper(env, 4)
env = ScaledFloatFrame(env)
env = ConvertToTensorflowFormat(env)

net = DQN(env.action_space.n)
tgt_net = DQN(env.action_space.n)

# tgt_net.build(tf.expand_dims(env.observation_space, axis=0).shape)
shape = list(env.observation_space.shape)
shape.insert(0, 1)
tgt_net.build(shape)
log_dir = "logs/dqn_pong"
summary = tf.summary.create_file_writer(logdir=log_dir)

buffer = ExperienceBuffer(REPLAY_SIZE)
agent = Agent(env, buffer)
epsilon = EPSILON_START

optimizer = tf.keras.optimizers.Adam(learning_rate=LEARNING_RATE)
total_rewards = []
frame_idx = 0
ts_frame = 0
ts = time.time()
best_mean_reward = None

while True:
    frame_idx += 1
    epsilon = max(EPSILON_FINAL, EPSILON_START - frame_idx / EPSILON_DECAY_LAST_FRAME)

    reward = agent.play_step(net, epsilon)
    if reward is not None:
        total_rewards.append(reward)
        speed = (frame_idx - ts_frame) / (time.time() - ts)
        ts_frame = frame_idx
        ts = time.time()
        mean_reward = np.mean(total_rewards[-100:])
        print("%d: done %d games, mean reward %.3f, eps %.2f, speed %.2f f/s" % (
                frame_idx, len(total_rewards), mean_reward, epsilon,
                speed
            ))

        with summary.as_default():
            tf.summary.scalar("epsilon", epsilon, frame_idx)
            tf.summary.scalar("speed", speed, frame_idx)
            tf.summary.scalar("reward_100", mean_reward, frame_idx)
            tf.summary.scalar("reward", reward, frame_idx)

        if best_mean_reward is None or mean_reward > best_mean_reward:
            # tf.saved_model.save(net, "models")
            net.save("models")
            if best_mean_reward is not None:
                print("Best mean reward updated %.3f -> %.3f, model saved" % (best_mean_reward, mean_reward))
            best_mean_reward = mean_reward

        if mean_reward > 199:
            print("Solved in %d frame" % frame_idx)
            break
    if len(buffer) < REPLAY_START_SIZE:
        continue

    if frame_idx % SYNC_TARGET_FRAMES == 0:
        tgt_net.set_weights(net.get_weights())

    batch = buffer.sample(BATCH_SIZE)
    with tf.GradientTape() as g:
        loss_t = calc_loss(batch, net, tgt_net)

    gradients = g.gradient(loss_t, net.trainable_variables)
    optimizer.apply_gradients(zip(gradients, net.trainable_variables))

790: done 1 games, mean reward -21.000, eps 0.99, speed 635.43 f/s
INFO:tensorflow:Assets written to: models/assets
1630: done 2 games, mean reward -20.500, eps 0.98, speed 455.60 f/s
INFO:tensorflow:Assets written to: models/assets
Best mean reward updated -21.000 -> -20.500, model saved
2572: done 3 games, mean reward -20.667, eps 0.97, speed 510.24 f/s
3501: done 4 games, mean reward -20.500, eps 0.96, speed 749.44 f/s
4263: done 5 games, mean reward -20.600, eps 0.96, speed 726.06 f/s
5114: done 6 games, mean reward -20.667, eps 0.95, speed 726.14 f/s
6197: done 7 games, mean reward -20.571, eps 0.94, speed 730.13 f/s
6987: done 8 games, mean reward -20.625, eps 0.93, speed 652.54 f/s
8012: done 9 games, mean reward -20.556, eps 0.92, speed 688.25 f/s
8822: done 10 games, mean reward -20.600, eps 0.91, speed 668.31 f/s
9734: done 11 games, mean reward -20.636, eps 0.90, speed 669.46 f/s
10496: done 12 games, mean reward -20.667, eps 0.90, speed 14.60 f/s


KeyboardInterrupt: 

In [None]:
import ptan

ptan.actions.EpsilonGreedyActionSelector
