In [None]:
# Environment setup: installs gym (with the classic_control extra), pyglet and
# pyvirtualdisplay via pip, and x11-utils, xvfb and python-opengl via apt-get.
# Presumably these provide off-screen rendering on a machine without a display
# (e.g. a hosted notebook) -- TODO confirm.
# NOTE(review): no virtual display is started in the visible cells, and the
# packages are installed without version pins -- confirm both are intended.
!pip install gym[classic_control]
!apt-get install x11-utils
!pip install pyglet
!apt-get install -y xvfb python-opengl
!pip install gym pyvirtualdisplay

In [2]:
import gym
from keras import models
from keras import layers
from keras.optimizers import Adam
from collections import deque
import random
import numpy as np

import tensorflow as tf
tf.keras.utils.disable_interactive_logging()

import keras
# Build a TF1-style session whose device_count caps the devices TensorFlow may
# use (here: 2 GPUs, 1 CPU) and register it as the Keras backend session.
# NOTE(review): this relies on the tf.compat.v1 session API; whether Keras
# actually uses this session under TF2 eager execution should be confirmed.
config = tf.compat.v1.ConfigProto( device_count = {'GPU': 2 , 'CPU': 1} ) 
sess = tf.compat.v1.Session(config=config) 
keras.backend.set_session(sess)

In [3]:
'''To render the interaction with the environment in a Jupyter Notebook'''

import imageio
%matplotlib inline
import matplotlib as mpl
import matplotlib.pyplot as plt
import matplotlib.animation as animation
# To get smooth animations: render matplotlib animations as interactive
# JavaScript/HTML ("jshtml") when they are displayed in the notebook.
mpl.rc('animation', html='jshtml')
# Default font sizes for axis labels and tick labels.
mpl.rc('axes', labelsize=14)
mpl.rc('xtick', labelsize=12)
mpl.rc('ytick', labelsize=12)

def plot_animation(model, environment, gif_name=None, seed=1):
    """Roll out the greedy policy of ``model`` once, save it as a GIF and
    return it as a matplotlib animation.

    Args:
        model: object exposing ``predict``; passed on to ``render_policy_net``.
        environment: gym environment to roll out in. It is re-seeded with
            ``seed`` before the rollout.
        gif_name: output path of the GIF; a falsy value selects ``"DQN.gif"``.
        seed: seed applied to both the environment and ``np.random``.

    Returns:
        The ``matplotlib.animation.FuncAnimation`` built from the rollout
        frames. The backing figure is closed before returning, so nothing is
        drawn unless the caller displays the returned object.
    """
    # Seed both sources of randomness so the rollout is repeatable.
    environment.seed(seed)
    np.random.seed(seed)
    rollout_frames = render_policy_net(model, environment)

    # Persist the rollout as a GIF (30 fps).
    output_path = gif_name or "DQN.gif"
    imageio.mimsave(output_path, rollout_frames, fps=30)

    # Build the in-notebook animation on an axis-free figure.
    figure = plt.figure()
    image_artist = plt.imshow(rollout_frames[0])
    plt.axis('off')
    animation_options = dict(
        fargs=(rollout_frames, image_artist),
        frames=len(rollout_frames),
        repeat=True,
        interval=40,
    )
    rollout_animation = animation.FuncAnimation(
        figure, update_scene, **animation_options)
    # Close the current figure so the static first frame is not rendered too.
    plt.close()

    return rollout_animation

def render_policy_net(model, env, n_max_steps=200):
    """Run one greedy episode and collect the rendered frames.

    Args:
        model: object whose ``predict`` maps a batch of observations to
            per-action scores; the arg-max score is the action taken.
        env: environment exposing ``reset``, ``render(mode=...)`` and a
            four-value ``step``.
        n_max_steps: upper bound on the number of environment steps.

    Returns:
        List of frames, one per step taken, each captured *before* the step.
    """
    observation = env.reset()
    captured = []
    steps_taken = 0
    while steps_taken < n_max_steps:
        captured.append(env.render(mode="rgb_array"))
        # predict works on batches: add a leading axis, then take row 0.
        action_scores = model.predict(observation[np.newaxis])[0]
        observation, _reward, finished, _info = env.step(np.argmax(action_scores))
        steps_taken += 1
        if finished:
            break
    return captured

def update_scene(num, frames, patch):
    """Animation callback: show frame ``num`` of ``frames`` on ``patch``.

    Returns the same ``patch`` object after its data has been replaced.
    """
    current_frame = frames[num]
    patch.set_data(current_frame)
    return patch

In [4]:
# Number of training episodes run by the training loop.
NUM_EPISODES = 400
# Upper bound on steps per episode in the training loop. The inline note
# presumably refers to the environment's own 200-step limit, which would end
# the episode (done=True) before this bound is reached -- TODO confirm.
NUM_ITERATIONS = 201 #max is 200

In [5]:
class MountainCarAgentDQN:
    """DQN agent with an experience-replay buffer and a target network.

    Two identically shaped networks are kept: ``train_network`` is fitted on
    sampled mini-batches, ``target_network`` provides the bootstrap values and
    is overwritten with the training weights at the end of every episode.
    Exploration is epsilon-greedy with a linear per-episode decay.
    """

    def __init__(self, env):
        """Read the state/action sizes from ``env`` and build both networks.

        Args:
            env: environment with a 1-D ``observation_space`` and a discrete
                ``action_space`` (``.shape[0]`` and ``.n`` are read).
        """
        self.state_size = env.observation_space.shape[0]
        self.action_size = env.action_space.n

        self.gamma = 0.99           # discount factor
        self.learning_rate = 0.001  # Adam step size

        # Epsilon-greedy schedule: start fully random, subtract
        # `epsilon_decay` per episode, never go below `epsilon_min`.
        self.epsilon = 1
        self.epsilon_decay = 0.05
        self.epsilon_min = 0.01

        self.replay_buffer = deque(maxlen=20000)
        self.batch_size = 32

        # Number of episodes closed via `end_episode`; used as the default
        # episode index in checkpoint file names.
        self.episodes_finished = 0

        # networks
        self.train_network = self.create_model()
        self.target_network = self.create_model()
        self.sync_networks()

    def create_model(self):
        """Build and compile the Q-network (state -> one value per action)."""
        model = models.Sequential()
        model.add(layers.Dense(24, activation='relu', input_shape=[self.state_size]))
        model.add(layers.Dense(48, activation='relu'))
        model.add(layers.Dense(self.action_size, activation='linear'))
        # `learning_rate` is the supported keyword; `lr` is a deprecated alias.
        model.compile(loss='mse', optimizer=Adam(learning_rate=self.learning_rate))
        return model

    def sync_networks(self):
        """Copy the training network's weights into the target network."""
        self.target_network.set_weights(self.train_network.get_weights())

    def get_next_action(self, state):
        """Pick an action epsilon-greedily for a single (unbatched) state.

        With probability ``epsilon`` a uniformly random action is returned,
        otherwise the arg-max of the training network's prediction.
        """
        # Keep the clamp here as well, in case epsilon was set externally.
        self.epsilon = max(self.epsilon_min, self.epsilon)

        if np.random.rand() < self.epsilon:
            return np.random.randint(self.action_size)
        return np.argmax(self.train_network.predict(state[np.newaxis])[0])

    def store_in_replay(self, current_state, next_action, reward, new_state, done):
        """Append one transition to the replay buffer (oldest entries drop out)."""
        self.replay_buffer.append([current_state, next_action, reward, new_state, done])

    def train(self):
        """Fit the training network on one mini-batch sampled from the buffer.

        Does nothing until the buffer holds at least ``batch_size`` transitions.
        """
        # only when the replay memory has enough experience
        if len(self.replay_buffer) < self.batch_size:
            return

        states, actions, rewards, next_states, dones = self.sample_experiences()

        # Bootstrap term: best next-state value according to the target network.
        next_q_values = self.target_network.predict(next_states)
        max_next_q_values = np.max(next_q_values, axis=1)

        # Start from the current predictions so only the taken action's entry
        # contributes to the loss, then overwrite that entry with
        # r + gamma * max_a' Q_target(s', a') (no bootstrap when done).
        targets = self.train_network.predict(states)
        rows = np.arange(self.batch_size)
        taken_actions = actions.reshape(-1).astype(int)
        targets[rows, taken_actions] = (
            rewards + (1 - dones) * self.gamma * max_next_q_values)

        self.train_network.fit(states, targets, epochs=1, verbose=0)

    def sample_experiences(self):
        """Sample ``batch_size`` transitions without replacement.

        Returns:
            Tuple ``(states, actions, rewards, next_states, dones)`` of numpy
            arrays, one entry per sampled transition.
        """
        batch = random.sample(self.replay_buffer, self.batch_size)
        states, actions, rewards, next_states, dones = [
            np.array([experience[field_index] for experience in batch])
            for field_index in range(5)]
        return states, actions, rewards, next_states, dones

    def end_episode(self, success, episode=None):
        """Close an episode: checkpoint on success, decay epsilon, sync networks.

        Args:
            success: when truthy, the training network is saved to
                ``./dqnInEp{episode}.h5``.
            episode: index used in the checkpoint name. Defaults to the number
                of episodes previously closed on this agent, which matches a
                0-based training loop that calls this once per episode.
        """
        if episode is None:
            episode = self.episodes_finished
        if success:
            # Save this agent's own network rather than reaching for
            # module-level globals.
            self.train_network.save(f'./dqnInEp{episode}.h5')
        self.episodes_finished += 1
        # Clamp here so the stored epsilon never drifts below its floor.
        self.epsilon = max(self.epsilon_min, self.epsilon - self.epsilon_decay)
        self.sync_networks()

# Train the DQN agent on MountainCar-v0, then save a GIF of the final policy.

# Position at which the shaped bonus is granted and the episode counts as solved.
GOAL_POSITION = 0.5
# Extra reward added on top of the environment's reward when the goal is reached.
GOAL_BONUS = 10

env = gym.make('MountainCar-v0')
dqn = MountainCarAgentDQN(env=env)

for episode in range(NUM_EPISODES):
    # To save an animation every 50 episodes, uncomment:
    # if episode % 50 == 0:
    #     plot_animation(dqn.train_network, env, gif_name=f"DQN_ep{episode}.gif")

    current_state = env.reset()

    tot_reward = 0
    max_position = float('-inf')
    reached_goal = False

    for iteration in range(NUM_ITERATIONS):
        next_action = dqn.get_next_action(current_state)
        new_state, reward, done, info = env.step(next_action)

        # Keep track of max position
        max_position = max(max_position, new_state[0])

        # Adjust reward for task completion
        if new_state[0] >= GOAL_POSITION:
            reward += GOAL_BONUS
            reached_goal = True

        # A time-limit cut-off is not a real terminal state: storing it as
        # `done` would wrongly remove the bootstrap term for that transition.
        # gym's TimeLimit wrapper flags such cut-offs in `info`.
        terminal = done and not info.get('TimeLimit.truncated', False)
        dqn.store_in_replay(current_state, next_action, reward, new_state, terminal)
        dqn.train()

        tot_reward += reward
        current_state = new_state

        if done:
            break

    # Success is decided by actually reaching the goal, not by the step index,
    # so a goal reached on the very last allowed step still counts.
    if reached_goal:
        # `iteration` is 0-based, hence +1 for the number of steps taken.
        print(f"Success in episode {episode}, used {iteration + 1} iterations!")
    else:
        print(f"Failed to finish task in episode {episode}")
    print(f"the reward is {tot_reward} maxPosition is {max_position}")

    dqn.end_episode(reached_goal)

# Saves the final greedy rollout as a GIF; the returned animation is not shown here.
plot_animation(dqn.train_network, env, gif_name="DQN_final.gif")

env.close()

  super().__init__(name, **kwargs)
2023-01-08 14:41:04.880201: W tensorflow/tsl/platform/profile_utils/cpu_utils.cc:128] Failed to get CPU frequency: 0 Hz


Failed to finish task in episode 0
the reward is -200.0 maxPosition is -0.46404293179512024
Failed to finish task in episode 1
the reward is -200.0 maxPosition is -0.4220111668109894
Failed to finish task in episode 2
the reward is -200.0 maxPosition is -0.3520282506942749
Failed to finish task in episode 3
the reward is -200.0 maxPosition is -0.31948697566986084
Failed to finish task in episode 4
the reward is -200.0 maxPosition is -0.1712750494480133
Failed to finish task in episode 5
the reward is -200.0 maxPosition is -0.3761269152164459
Failed to finish task in episode 6
the reward is -200.0 maxPosition is -0.4116733968257904
Failed to finish task in episode 7
the reward is -200.0 maxPosition is -0.27033567428588867
Failed to finish task in episode 8
the reward is -200.0 maxPosition is -0.297061562538147
Failed to finish task in episode 9
the reward is -200.0 maxPosition is -0.26634955406188965
Failed to finish task in episode 10
the reward is -200.0 maxPosition is -0.408250123262

: 

: 

In [None]:
# Roll out the trained policy in a fresh environment, save it as a GIF and
# display the animation in the notebook.
env = gym.make('MountainCar-v0')
try:
    final_animation = plot_animation(dqn.train_network, env, gif_name="DQN_final.gif")
finally:
    # Release the environment's rendering resources even if the rollout fails.
    env.close()

# A cell only displays its last expression, so the animation must come last
# (previously `env.close()` was last and the animation was silently dropped).
final_animation