In [None]:
!pip install Box2D
!pip install gym
!apt-get install -y xvfb python-opengl
!pip install pyvirtualdisplay

Collecting Box2D
  Downloading Box2D-2.3.10-cp37-cp37m-manylinux1_x86_64.whl (1.3 MB)
[?25l[K     |▎                               | 10 kB 27.9 MB/s eta 0:00:01[K     |▌                               | 20 kB 29.8 MB/s eta 0:00:01[K     |▊                               | 30 kB 19.9 MB/s eta 0:00:01[K     |█                               | 40 kB 11.5 MB/s eta 0:00:01[K     |█▎                              | 51 kB 9.6 MB/s eta 0:00:01[K     |█▌                              | 61 kB 11.2 MB/s eta 0:00:01[K     |█▊                              | 71 kB 11.9 MB/s eta 0:00:01[K     |██                              | 81 kB 11.2 MB/s eta 0:00:01[K     |██▎                             | 92 kB 12.3 MB/s eta 0:00:01[K     |██▌                             | 102 kB 11.5 MB/s eta 0:00:01[K     |██▊                             | 112 kB 11.5 MB/s eta 0:00:01[K     |███                             | 122 kB 11.5 MB/s eta 0:00:01[K     |███▏                            | 133 kB 11.

In [None]:
import os
from time import time

from google.colab import drive

# Mount point for Google Drive.
drive_path = '/content/drive'

drive.mount(drive_path, force_remount=True)

# Every run writes into its own sub-directory named after the current Unix
# timestamp, so earlier runs are never overwritten.
output_dir = drive_path + '/MyDrive/Colab Notebooks/lunarlander_dqn'
output_dir = f'{output_dir}/{time()}/'

# Create the directory up front: later cells write weights and rewards into
# it and must not depend on some other component creating it first.
os.makedirs(output_dir, exist_ok=True)
print(output_dir)

Mounted at /content/drive
/content/drive/MyDrive/Colab Notebooks/lunarlander_dqn/1650895970.2012837/


In [None]:
import random
from collections import deque, defaultdict
import numpy as np
import tensorflow as tf
from tensorflow import keras
import json

class DQN:
    """Q-learning agent backed by a small dense Keras network.

    The agent keeps a replay buffer of ``(state, action, reward, next_state,
    done)`` tuples, acts epsilon-greedily, and fits the network on random
    mini-batches drawn from that buffer. Optionally a second "target" network
    with periodically synchronised weights is used to compute bootstrap
    targets.

    :param input_dim: length of the flat observation vector
    :param num_actions: number of discrete actions (size of the output layer)
    :param epsilon_start: initial exploration probability
    :param epsilon_min: lower bound for the exploration probability
    :param epsilon_decay: factor applied to epsilon after every episode
    :param discount: discount factor applied to the bootstrapped next-state value
    :param experience_buffer_size: maximum number of stored transitions
    :param batch_size: default number of transitions sampled per learning step
    :param max_steps_per_episode: hard cap on environment steps per episode
    :param use_fixed_target_network: compute targets from a separate target
        network that is synced every ``update_rate`` steps
    :param loss_fn: loss *class* (not instance); it is instantiated in
        :meth:`build_model`
    :param update_rate: number of environment steps between target-network syncs
    """

    def __init__(
        self,
        input_dim,
        num_actions,
        epsilon_start=1,
        epsilon_min=0.01,
        epsilon_decay=0.995,
        discount=0.9,
        experience_buffer_size=500000,
        batch_size=80,
        max_steps_per_episode=5000,
        use_fixed_target_network=False,
        loss_fn=tf.keras.losses.MeanSquaredError,
        update_rate=1000,
    ):
        self.epsilon = epsilon_start
        self.epsilon_min = epsilon_min
        self.epsilon_decay = epsilon_decay
        self.loss_fn = loss_fn
        self.model = self.build_model(input_dim, num_actions)
        # summary() prints the table itself and returns None; wrapping it in
        # print() only adds a stray "None" line to the output.
        self.model.summary()
        self.use_fixed_target_network = use_fixed_target_network
        if use_fixed_target_network:
            # clone_model() builds new layers with freshly initialised
            # weights, so copy the online weights over explicitly to start
            # both networks from the same point.
            self.target_model = keras.models.clone_model(self.model)
            self.update_target_model()
        self.experience_buffer = deque(maxlen=experience_buffer_size)
        self.discount = discount
        self.max_steps_per_episode = max_steps_per_episode
        self.update_rate = update_rate
        self.batch_size = batch_size
        self.input_dim = input_dim

    def build_model(self, input_dim, num_actions):
        """
        Builds and compiles the Q-value network.

        :param input_dim: length of the flat observation vector
        :param num_actions: number of output units, one Q-value per action
        :return: compiled keras model (Adam optimizer, ``self.loss_fn()`` loss)
        """
        model = keras.Sequential(
            [
                keras.layers.Dense(512, input_dim=input_dim, activation='relu', name='layer1'),
                keras.layers.Dense(256, activation='relu', name='layer2'),
                keras.layers.Dense(num_actions, activation='linear', name='layer4'),
            ]
        )
        model.compile(
            optimizer=keras.optimizers.Adam(),
            loss=self.loss_fn(),
        )
        return model

    def choose_action(self, state, actions):
        """
        Chooses an action using e-greedy policy

        :param state: current state, shaped ``(1, input_dim)``
        :param actions: action space; must provide ``sample()``
        :return: action to take
        """
        if np.random.uniform() < self.epsilon:
            return actions.sample()
        # A single state is one tiny batch: predict_on_batch (already used by
        # the learning step) avoids the per-call overhead of predict().
        action_values = np.asarray(self.model.predict_on_batch(state))
        return np.argmax(action_values[0])

    def train(self, env, episodes=1000):
        """
        Runs the training loop and checkpoints after every episode.

        After each episode the model weights and the per-episode rewards are
        written to ``weights.h5`` and ``rewards.json`` under the module-level
        ``output_dir`` (defined elsewhere in the notebook; it must exist).
        Training stops early once the mean reward of the last 100 episodes
        exceeds 200.

        :param env: environment whose ``reset()`` returns an observation and
            whose ``step(action)`` returns ``(next_state, reward, done, info)``
        :param episodes: maximum number of episodes to run
        """
        episode_rewards_dict = {}
        total_step = 0
        for episode in range(episodes):
            state = np.reshape(env.reset(), [1, self.input_dim])
            done = False
            time_step = 0
            episode_reward = 0

            # Stop fitting once the last-100 mean reaches 180 to save runtime.
            # The mean only changes when an episode finishes, so evaluate the
            # gate once per episode instead of on every step.
            keep_learning = self.get_rewards_mean(episode_rewards_dict) < 180

            while not done and time_step < self.max_steps_per_episode:
                time_step += 1
                total_step += 1

                if self.use_fixed_target_network and total_step % self.update_rate == 0:
                    total_step = 0  # cycle the counter between 0 and update_rate
                    self.update_target_model()

                action = self.choose_action(state, env.action_space)
                next_state, reward, done, _ = env.step(action)
                episode_reward += reward

                next_state = np.reshape(next_state, [1, self.input_dim])
                self.experience_buffer.append((state, action, reward, next_state, done))

                # Only fit every 5th step to keep the loop fast.
                if keep_learning and time_step % 5 == 0:
                    self.learn_from_experiences()

                state = next_state

            episode_rewards_dict[episode] = episode_reward
            reward_mean = self.get_rewards_mean(episode_rewards_dict)

            # Columns: episode, episode reward, last-100 mean, epsilon, steps.
            print(
                episode,
                '{:.6f}'.format(episode_reward),
                '{:.6f}'.format(reward_mean),
                '{:.6f}'.format(self.epsilon),
                time_step,
            )

            # Checkpoint weights and rewards so an interrupted run keeps its progress.
            self.model.save_weights(f'{output_dir}/weights.h5')
            with open(f'{output_dir}/rewards.json', 'w') as outfile:
                json.dump(episode_rewards_dict, outfile, indent=4)

            if reward_mean > 200:
                print('Training Complete')
                break

            if self.epsilon > self.epsilon_min:
                # Clamp so the decay never undershoots the configured floor.
                self.epsilon = max(self.epsilon_min, self.epsilon * self.epsilon_decay)

    def get_rewards_mean(self, episode_rewards_dict):
        """
        Mean reward over the most recent (up to 100) episodes.

        :param episode_rewards_dict: mapping of episode index to total reward,
            in insertion order
        :return: the mean, or ``-inf`` when no episode has finished yet, so
            that threshold checks such as ``mean < 180`` read "not reached"
            instead of tripping over NaN (``np.mean([])`` warns and returns
            NaN, and every comparison against NaN is False)
        """
        recent_rewards = list(episode_rewards_dict.values())[-100:]
        if not recent_rewards:
            return float('-inf')
        return np.mean(recent_rewards)

    def update_target_model(self):
        """Copies the online network's weights into the target network."""
        self.target_model.set_weights(self.model.get_weights())

    def learn_from_experiences(self, batch_size=None):
        """
        Fits the online network on one random mini-batch from the replay buffer.

        Does nothing while the buffer holds fewer than ``batch_size`` transitions.

        :param batch_size: number of transitions to sample; defaults to
            ``self.batch_size``
        """
        batch_size = batch_size or self.batch_size
        if len(self.experience_buffer) < batch_size:
            return

        s_batch, a_batch, r_batch, next_s_batch, d_batch = self.sample_batch(batch_size)

        target_reward_model = self.target_model if self.use_fixed_target_network else self.model

        # Target for the taken action: r + discount * max_a' Q(s', a'),
        # with the bootstrap term zeroed for terminal transitions.
        next_q_values = np.asarray(target_reward_model.predict_on_batch(next_s_batch))
        not_done = 1.0 - d_batch.astype(np.float32)
        targets = r_batch + self.discount * np.amax(next_q_values, axis=1) * not_done

        # Start from the current predictions so only the taken action's
        # Q-value contributes to the loss. np.array() guarantees a writable copy.
        target_vec = np.array(self.model.predict_on_batch(s_batch))
        # Index with the effective batch size (the argument may differ from
        # self.batch_size).
        target_vec[np.arange(batch_size), a_batch] = targets
        self.model.fit(s_batch, target_vec, epochs=1, verbose=0)

    def sample_batch(self, batch_size):
        """
        Draws a random mini-batch from the replay buffer.

        :param batch_size: number of transitions to draw (without replacement)
        :return: ``(states, actions, rewards, next_states, dones)`` arrays;
            states are stacked along axis 0, the others have one entry per
            transition
        """
        batch = random.sample(self.experience_buffer, batch_size)
        states, actions, rewards, next_states, dones = zip(*batch)
        # Each stored state is (1, input_dim); concatenating along axis 0
        # yields (batch, input_dim) even when batch_size or input_dim is 1,
        # where an unqualified squeeze would drop the wrong axis.
        s_batch = np.concatenate(states, axis=0)
        next_s_batch = np.concatenate(next_states, axis=0)
        a_batch = np.array(actions)
        r_batch = np.array(rewards)
        d_batch = np.array(dones)
        return s_batch, a_batch, r_batch, next_s_batch, d_batch


In [None]:
from pyvirtualdisplay import Display

# Start a virtual, non-visible 400x300 display.
# NOTE(review): presumably needed so the video-recording wrapper used later
# can render frames on a machine without a screen - confirm.
display = Display(size=(400, 300), visible=0)
display.start()

<pyvirtualdisplay.display.Display at 0x7f03852b5ad0>

In [None]:
import gym
from gym.wrappers import Monitor

# Wrap the LunarLander environment in a Monitor that writes into this run's
# videos/ folder. The callable returns True for every episode id, i.e. no
# episode is skipped.
# NOTE(review): force=True presumably lets Monitor overwrite files already in
# that folder - confirm against the gym version in use.
env = Monitor(
    gym.make("LunarLander-v2"),
    f'{output_dir}/videos/',
    video_callable=lambda episode_id: True,
    force=True,
)

In [None]:
# Size the network from the environment: one input per observation component,
# one output per discrete action. The fixed target network is enabled.
dqn = DQN(
    input_dim=env.observation_space.shape[0],
    num_actions=env.action_space.n,
    use_fixed_target_network=True,
)
dqn.train(env)

Model: "sequential"
_________________________________________________________________
 Layer (type)                Output Shape              Param #   
 layer1 (Dense)              (None, 512)               4608      
                                                                 
 layer2 (Dense)              (None, 256)               131328    
                                                                 
 layer4 (Dense)              (None, 4)                 1028      
                                                                 
Total params: 136,964
Trainable params: 136,964
Non-trainable params: 0
_________________________________________________________________
None


  out=out, **kwargs)
  ret = ret.dtype.type(ret / rcount)


0 -191.967931 -191.967931 1.000000 114
1 -162.705904 -177.336917 0.995000 77
2 -386.639263 -247.104366 0.990025 94
3 -129.498266 -217.702841 0.985075 122
4 -79.641549 -190.090582 0.980150 73
5 -297.081313 -207.922371 0.975249 92
6 -125.181532 -196.102251 0.970373 82
7 -87.965331 -182.585136 0.965521 70
8 -80.459406 -171.237833 0.960693 119
9 -59.631866 -160.077236 0.955890 70
10 -81.087361 -152.896338 0.951110 88
11 -127.399470 -150.771599 0.946355 88
12 -51.948371 -143.169813 0.941623 70
13 -69.421624 -137.902085 0.936915 131
14 -193.547834 -141.611801 0.932230 143
15 -348.948040 -154.570316 0.927569 101
16 -93.840700 -150.997986 0.922931 126
17 -91.177658 -147.674634 0.918316 109
18 -217.025686 -151.324690 0.913725 83
19 -193.707126 -153.443811 0.909156 109
20 -251.034780 -158.091000 0.904610 90
21 -247.017776 -162.133127 0.900087 145
22 20.402688 -154.196787 0.895587 1000
23 -176.637605 -155.131821 0.891109 68
24 -140.243129 -154.536273 0.886654 126
25 -22.326768 -149.451292 0.88222