# <span style="color:teal"> Deep Reinforcement Learning for Atari Enduro-v0 </span>

### Import Required Packages

In [1]:
from time import sleep

import gym
import numpy as np
import tensorflow as tf
import matplotlib.pyplot as plt
%matplotlib inline

from PIL import Image

from keras.models import Sequential, Model
from keras.layers import Dense, Activation, Flatten, Dropout, concatenate, Permute
from keras.layers import Input, Conv2D
from keras.optimizers import Adam
from keras.activations import relu, linear
from keras.layers.advanced_activations import LeakyReLU

from rl.agents.dqn import DQNAgent
from rl.policy import LinearAnnealedPolicy, EpsGreedyQPolicy
from rl.memory import SequentialMemory
from rl.core import Processor

Using TensorFlow backend.


***
### Enduro Environment

In [2]:
# Create the Atari Enduro environment that the rest of the notebook uses.
env = gym.make('Enduro-v0')

# Reset before rendering so an initial frame exists; rendering an environment
# that has never been reset is not guaranteed to work across gym versions.
env.reset()
env.render()
sleep(1)  # keep the preview window on screen briefly
# Close the preview window.
# NOTE(review): `env` is reused for training and testing further down; confirm
# that close() only tears down the viewer in the installed gym version.
env.close()

#### 1. *Number of possible actions*

In [3]:
# Number of actions, read from the environment's action space; it is used
# below as the width of the network's output layer.
nb_actions = env.action_space.n
print('Total number of Possible actoin is :', nb_actions)

Total number of Possible actoin is : 9


#### 2. *Taking stack of 4 consecutive frames*

In [4]:
# Each network input is `window_length` frames of size `frame_shape`,
# laid out with the frame axis first.
window_length = 4
frame_shape = (84, 84)
input_shape = (window_length, *frame_shape)
print('Input Shape is :', input_shape)

Input Shape is : (4, 84, 84)


***
### Defining a class for pre-processing the game frames

In [5]:
class GameProcess(Processor):
    """Pre-processing hooks for observations, state batches and rewards.

    The target frame size is taken from the notebook-level ``frame_shape``.
    """

    def process_observation(self, observation):
        """Resize a raw frame to ``frame_shape`` and convert it to greyscale.

        Args:
            observation: image array accepted by ``PIL.Image.fromarray``.

        Returns:
            A ``uint8`` NumPy array holding the resized greyscale frame.
        """
        rows, cols = frame_shape
        img = Image.fromarray(observation)
        # PIL's resize() expects (width, height), while frame_shape is used as
        # (rows, cols) when input_shape is built, so pass the swapped order.
        # For the square 84x84 frame this is identical to the previous call.
        img = np.array(img.resize((cols, rows)).convert('L'))
        return img.astype('uint8')

    def process_state_batch(self, batch):
        """Cast a batch to ``float32`` and divide by 255.

        The division happens here rather than in ``process_observation``, so
        single observations stay ``uint8``.
        """
        Processed_batch = batch.astype('float32') / 255.
        return Processed_batch

    def process_reward(self, reward):
        """Clip the reward to the interval [-1, 1]."""
        return np.clip(reward, -1., 1.)

***
## DeepMind Architecture

In [6]:
# Convolutional Q-network: three convolution layers, a 512-unit dense layer,
# and one linear output per action.
model = Sequential([
    # Move the stacked-frame axis last: (window, rows, cols) -> (rows, cols, window).
    Permute((2, 3, 1), input_shape=input_shape),
    Conv2D(32, (8, 8), strides=(4, 4)),
    Activation('relu'),
    Conv2D(64, (4, 4), strides=(2, 2)),
    Activation('relu'),
    Conv2D(64, (3, 3), strides=(1, 1)),
    Activation('relu'),
    Flatten(),
    Dense(512),
    Activation('relu'),
    Dense(nb_actions),
    Activation('linear'),
])
# summary() prints the table itself and returns None, so it must not be
# wrapped in print() (that would emit a trailing "None").
model.summary()

Instructions for updating:
Colocations handled automatically by placer.
_________________________________________________________________
Layer (type)                 Output Shape              Param #   
permute_1 (Permute)          (None, 84, 84, 4)         0         
_________________________________________________________________
conv2d_1 (Conv2D)            (None, 20, 20, 32)        8224      
_________________________________________________________________
activation_1 (Activation)    (None, 20, 20, 32)        0         
_________________________________________________________________
conv2d_2 (Conv2D)            (None, 9, 9, 64)          32832     
_________________________________________________________________
activation_2 (Activation)    (None, 9, 9, 64)          0         
_________________________________________________________________
conv2d_3 (Conv2D)            (None, 7, 7, 64)          36928     
_________________________________________________________________
acti

***
### Configuring the Agent

#### 1. *Allocating memory for experience replay*

In [7]:
# Experience-replay buffer capped at one million entries; it is given the same
# window_length that defines the network's input stack.
memory = SequentialMemory(
    limit=1000000,
    window_length=window_length,
)

#### 2. *Policy: Epsilon-Greedy Exploration*
<span style="color:teal">*Exploration is gradually decreased during training*</span>

In [8]:
# Epsilon-greedy policy wrapped in a linear annealing schedule on its `eps`
# attribute: from value_max down to value_min over nb_steps.
# NOTE(review): value_test is presumably the epsilon used by dqn.test(); confirm.
policy = LinearAnnealedPolicy(
    EpsGreedyQPolicy(),
    attr='eps',
    value_max=1.,
    value_min=.1,
    value_test=.05,
    nb_steps=1000000,
)

#### 3. *Compiling DQN Agent*

In [9]:
# Assemble the DQN agent from the Q-network, the exploration policy, the
# replay memory and a fresh frame processor.
dqn = DQNAgent(
    model=model,
    nb_actions=nb_actions,
    policy=policy,
    memory=memory,
    processor=GameProcess(),
    nb_steps_warmup=50000,
    gamma=.99,
    target_model_update=10000,
    train_interval=4,
    delta_clip=1.,
)

In [10]:
# Compile the agent with the Adam optimizer (lr = 0.00025) and report the
# mean absolute error ('mae') as an extra metric.
dqn.compile(Adam(lr=.00025), metrics=['mae'])

***
## <span style="color:teal"> Training the model </span>

#### 1. *Check whether the agent is learning during the first 0.5M steps*

In [None]:
# Train on `env` for 500,000 steps; the returned object is kept so its
# `.history` dict can be plotted in the next section.
history = dqn.fit(env, nb_steps=500000)

Training for 500000 steps ...
Interval 1 (0 steps performed)
2 episodes - episode_reward: 0.000 [0.000, 0.000] - ale.lives: 0.000

Interval 2 (10000 steps performed)
2 episodes - episode_reward: 0.000 [0.000, 0.000] - ale.lives: 0.000

Interval 3 (20000 steps performed)
2 episodes - episode_reward: 0.000 [0.000, 0.000] - ale.lives: 0.000

Interval 4 (30000 steps performed)
3 episodes - episode_reward: 0.000 [0.000, 0.000] - ale.lives: 0.000

Interval 5 (40000 steps performed)
2 episodes - episode_reward: 0.000 [0.000, 0.000] - ale.lives: 0.000

Interval 6 (50000 steps performed)
Instructions for updating:
Use tf.cast instead.
2 episodes - episode_reward: 0.000 [0.000, 0.000] - loss: 0.001 - mean_absolute_error: 0.015 - mean_q: 0.007 - mean_eps: 0.951 - ale.lives: 0.000

Interval 7 (60000 steps performed)
2 episodes - episode_reward: 0.000 [0.000, 0.000] - loss: 0.000 - mean_absolute_error: 0.005 - mean_q: -0.006 - mean_eps: 0.942 - ale.lives: 0.000

Interval 8 (70000 steps performed)
3

3 episodes - episode_reward: 0.000 [0.000, 0.000] - loss: 0.000 - mean_absolute_error: 0.004 - mean_q: -0.004 - mean_eps: 0.717 - ale.lives: 0.000

Interval 33 (320000 steps performed)

#### 2. *Summarizing the training history*

In [None]:
# Draw one figure per logged series: the reward per episode, then the number
# of steps per episode.
for history_key, legend_label in (
    ('episode_reward', 'Episode reward'),
    ('nb_episode_steps', 'No. of episode steps'),
):
    plt.plot(history.history[history_key])
    plt.title('Training for 0.3 million steps')
    plt.legend([legend_label], loc='upper right')
    plt.show()

#### 3. *Saving the weights*

In [None]:
# Persist the agent's weights to 'dqn_atari_Enduro.h5f', replacing any
# existing file of that name (overwrite=True).
dqn.save_weights('dqn_atari_Enduro.h5f', overwrite=True)

### <span style="color:red">**-  -  Caution   -  -**</span>

### <span style="color:teal">Re-Training the model (for 2M steps)  </span>
*Loading the saved weights (of 0.3M steps)*

In [None]:
# Restore the previously saved weights into the current agent before the
# longer training run.
dqn.load_weights('dqn_atari_Enduro.h5f')

In [None]:
# Rebuild the agent around the same `model`, `policy` and `memory` objects.
# Unlike the first agent, nb_steps_warmup is not passed here, so the library
# default applies.
# NOTE(review): the saved weights were loaded through the previous `dqn`
# object in the cell above; this relies on the new agent wrapping the same
# `model` instance. Confirm, or reload the weights after compile().
dqn = DQNAgent(
    model=model,
    nb_actions=nb_actions,
    policy=policy,
    memory=memory,
    processor=GameProcess(),
    gamma=.99,
    target_model_update=10000,
    train_interval=4,
    delta_clip=1.,
)

# Same optimizer settings as the first training run.
dqn.compile(Adam(lr=0.00025), metrics=['mae'])

In [None]:
# Continue training on `env` for 2,000,000 steps and keep the returned
# history for the plots below.
history2M = dqn.fit(env, nb_steps=2000000)

*Summarizing the training history*

In [None]:
# Draw one figure per logged series of the 2M-step run: the reward per
# episode, then the number of steps per episode.
for history_key, legend_label in (
    ('episode_reward', 'Episode reward'),
    ('nb_episode_steps', 'No. of episode steps'),
):
    plt.plot(history2M.history[history_key])
    plt.title('Training for 2 million steps')
    plt.legend([legend_label], loc='upper right')
    plt.show()

#### 4. *Saving final weights*

In [None]:
# Save the final weights under the same file name as the earlier checkpoint;
# overwrite=True replaces that checkpoint.
dqn.save_weights('dqn_atari_Enduro.h5f', overwrite=True)

***
## <span style="color:teal"> Testing the model </span>

#### 1. Loading the weights for testing

In [None]:
# Load the saved weights from 'dqn_atari_Enduro.h5f' into the agent for evaluation.
dqn.load_weights('dqn_atari_Enduro.h5f')

#### 2. Resetting the environment for testing

In [None]:
# Reset the environment, then run the agent for two test episodes with
# visualize=True.
# NOTE(review): dqn.test() presumably resets the environment itself, which
# would make the explicit reset redundant; confirm before removing it.
env.reset()
dqn.test(env, nb_episodes=2, visualize=True)

In [None]:
# Release the environment once testing is finished.
env.close()

In [None]:
# Trained on: Intel® Xeon® Processor E5, 2.40 GHz, Nvidia Quadro K4200
# Bhartendu Thakur, Machine Learning & Computing
# https://in.mathworks.com/matlabcentral/profile/authors/10083740-bhartendu?&detail=fileexchange
# https://in.linkedin.com/in/bhartendu-thakur-56bb6285