<a href="https://colab.research.google.com/github/KornelWitkowski/Deep-Q-Learning-with-Tensorflow/blob/main/MountainCar.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

# MountainCar

In [None]:
%%bash

apt-get install swig

git clone https://github.com/pybox2d/pybox2d
cd pybox2d
python setup.py build
python setup.py install

apt-get install -y xvfb

pip install \
    gym==0.21 \
    gym[box2d]==0.21 \
    pyglet==1.5.27 \
    pyvirtualdisplay

In [3]:
from pyvirtualdisplay import Display
Display(visible=False, size=(1400, 900)).start();

In [4]:
import tensorflow as tf
from tensorflow.keras import Sequential
from tensorflow.keras.layers import Dense, BatchNormalization
from tensorflow.keras.optimizers import Adam


class DeepQLearningModel:
  def __init__(self, observation_size, hidden_size, action_size, learning_rate=0.001):

    self.model = Sequential([Dense(observation_size, activation="relu"),
                             Dense(hidden_size, activation="relu"),
                             Dense(hidden_size, activation="relu"),
                             Dense(action_size-1)])
    self.compile()
    
  def compile(self, learning_rate=0.001):  
    self.model.compile(loss="mse",
                       optimizer=Adam(learning_rate=learning_rate))
    
  def fit(self, x, y):
    history = self.model.fit(x, y, epochs=1, verbose=0)
    loss =  history.history["loss"][0]
    return loss

In [5]:
import numpy as np

def epsilon_greedy_policy(state, environment, model, epsilon=0.0):

  if np.random.random() < epsilon:
    action = random.choice([0, 1])
  else:
    q_values = model(tf.expand_dims(state, axis=0))
    action = tf.math.argmax(q_values, axis=1)
    action = int(action)
  return action

In [6]:
from collections import deque, namedtuple
import random

class ReplayBuffer:

  def __init__(self, capacity):
    self.buffer = deque(maxlen=capacity)

  def __len__(self):
    return len(self.buffer)

  def append(self, experience):
    self.buffer.append(experience)

  def sample(self, batch_size):
    return random.sample(self.buffer, batch_size)

In [7]:
import random
from collections import deque

class ReplayBuffer:

  def __init__(self, capacity):
    self.buffer = deque(maxlen=capacity)

  def __len__(self):
    return len(self.buffer)

  def append(self, experience):
    self.buffer.append(experience)

  def sample(self, batch_size):
    return random.sample(self.buffer, batch_size)

In [8]:
import gym
from gym.wrappers import RecordVideo, RecordEpisodeStatistics, TimeLimit

def create_gym_environment(name):
  environment = gym.make(name)
  environment = TimeLimit(environment, max_episode_steps=800)
  environment = RecordVideo(environment, video_folder='./recored_episodes', episode_trigger=lambda x: x % 50 == 0)
  environment = RecordEpisodeStatistics(environment)

  return environment

In [13]:
import copy
import pandas as pd

from gym.wrappers import RecordVideo, RecordEpisodeStatistics, TimeLimit

import gym

class DeepQLearningAlgorithm:

    def __init__(self, environment_name="MountainCar-v0", policy=epsilon_greedy_policy, capacity=100_000,
                batch_size=256, learning_rate=1e-3, hidden_size=128, gamma=0.99, epsilon_start=1.0, epsilon_end=0.15,
                epsilon_last_episode=600, samples_per_epoch=2_048//8, q_net_update_rate=10):
    
      self.environment = create_gym_environment(environment_name)
      observation_size = self.environment.observation_space.shape[0]
      actions_size = self.environment.action_space.n

      self.q_net = DeepQLearningModel(observation_size, hidden_size, actions_size, learning_rate)
      self.target_q_net = copy.deepcopy(self.q_net.model)

      self.policy = policy
      self.buffer = ReplayBuffer(capacity=capacity)

       
      self.current_epoch = 0
      self.total_reward = 0


      # hyperparameters
      self.batch_size = batch_size
      self.learning_rate = learning_rate
      self.gamma = 0.99
      self.epsilon_start = epsilon_start
      self.epsilon_end = epsilon_end
      self.epsilon_last_episode = epsilon_last_episode
      self.samples_per_epoch = samples_per_epoch
      self.q_net_update_rate = 10


      while len(self.buffer) < self.samples_per_epoch:
        self.play_episode(epsilon=0)
        
    def play_episode(self, policy=None, epsilon=0.):
      state = self.environment.reset()
      done = False
      self.total_reward = 0

      while not done:
        if policy:
          action = policy(state, self.environment, self.q_net.model, epsilon=epsilon)
        else:
          action = random.choice([0, 1])

        next_state, _, done, info = self.environment.step(2*action)

        reward = (10 * next_state[1])**2

        if next_state[0] >= 0.45:
          reward += 100
        self.total_reward += reward


        exp = (state, action, reward, done, next_state)
        self.buffer.append(exp)
        state = next_state

      return

    def get_batch(self):
      sample = pd.DataFrame(self.buffer.sample(self.batch_size))

      state = np.stack(sample[0].values)
      action = np.stack(sample[1].values)
      reward = np.stack(sample[2].values)
      done = np.stack(sample[3].values)
      next_state = np.stack(sample[4].values)

      return state, action, reward, done, next_state


    def train_step(self):
      states, actions, rewards, dones, next_states = self.get_batch()

      state_action_values = tf.gather(self.q_net.model(states), actions, axis=1, batch_dims=1)

      next_action_values = tf.math.reduce_max(self.target_q_net(next_states), axis=1)
      next_action_values = next_action_values.numpy()
      next_action_values[dones] = 0.0

      expected_state_action_values = rewards + self.gamma * next_action_values

      q_net_predictions = self.q_net.model(states).numpy()
      q_net_predictions[range(self.batch_size), actions] = expected_state_action_values

      loss = self.q_net.fit(states, q_net_predictions)

      return loss

      
    def training_epoch_end(self):

        epsilon = max(self.epsilon_end,
                      self.epsilon_start - self.current_epoch / self.epsilon_last_episode)

        self.play_episode(policy=self.policy, epsilon=epsilon)

        if self.current_epoch % self.q_net_update_rate == 0:
          self.target_q_net = copy.deepcopy(self.q_net.model)
          

        return
        
    def train(self, epochs):

      for i in range(epochs):

        loss = 0

        for _ in range(self.samples_per_epoch//self.batch_size):  
          loss += self.train_step()

        self.training_epoch_end()

        if self.current_epoch % 50 == 0:
          returns = list(self.environment.return_queue)[-1]
          print(f"Epoch: {self.current_epoch}, loss: {loss}, last total reward: {self.total_reward}")

        self.current_epoch += 1


In [14]:
!rm -r /content/recored_episodes

algorithm =  DeepQLearningAlgorithm(hidden_size=32)

In [15]:
algorithm.train(2000)

Epoch: 0, loss: 0.001230426598340273, last total reward: 0.6439800962735318
Epoch: 50, loss: 8.219757000915706e-05, last total reward: 0.8422570181630848
Epoch: 100, loss: 0.00016307328769471496, last total reward: 5.858195979504444
Epoch: 150, loss: 0.00014067997108213603, last total reward: 7.989061202533485
Epoch: 200, loss: 0.0006104748463258147, last total reward: 7.712216994972469
Epoch: 250, loss: 0.001088780234567821, last total reward: 6.33884359408185
Epoch: 300, loss: 0.009495184756815434, last total reward: 6.720035123371619
Epoch: 350, loss: 0.004412907175719738, last total reward: 1.2443257794631468
Epoch: 400, loss: 0.02027062140405178, last total reward: 309.93332401415705
Epoch: 450, loss: 18.959522247314453, last total reward: 1.199041807604208
Epoch: 500, loss: 0.22823959589004517, last total reward: 0.49191510244754344
Epoch: 550, loss: 24.51347541809082, last total reward: 309.0028566574047
Epoch: 600, loss: 1.0472352504730225, last total reward: 214.9071179456203
