## CPE 693A: Reinforcement Learning and Control
### Spring 2021
### MS Final Project
### Marie Dumaz

In [1]:
'''IMPORTS'''

import gym
import offworld_gym
import pickle
import matplotlib.pyplot as plt
import random
import numpy as np

import keras
from keras.models import Model, Sequential
from keras.layers import Dense, Activation, Flatten, Conv2D, Input, MaxPooling2D, Concatenate
from keras.optimizers import Adam

from rl.agents.dqn import DQNAgent
from rl.policy import LinearAnnealedPolicy, EpsGreedyQPolicy
from rl.memory import SequentialMemory
from rl.processors import Processor

Using TensorFlow backend.
  _np_qint8 = np.dtype([("qint8", np.int8, 1)])
  _np_quint8 = np.dtype([("quint8", np.uint8, 1)])
  _np_qint16 = np.dtype([("qint16", np.int16, 1)])
  _np_quint16 = np.dtype([("quint16", np.uint16, 1)])
  _np_qint32 = np.dtype([("qint32", np.int32, 1)])
  np_resource = np.dtype([("resource", np.ubyte, 1)])
  _np_qint8 = np.dtype([("qint8", np.int8, 1)])
  _np_quint8 = np.dtype([("quint8", np.uint8, 1)])
  _np_qint16 = np.dtype([("qint16", np.int16, 1)])
  _np_quint16 = np.dtype([("quint16", np.uint16, 1)])
  _np_qint32 = np.dtype([("qint32", np.int32, 1)])
  np_resource = np.dtype([("resource", np.ubyte, 1)])


In [2]:
# create the environment
env = gym.make('OffWorldDockerMonolithDiscreteSim-v0')
env.seed(1782)

[1782]

## Deep-Q Network

In [3]:
# Create the DQN model
def create_network():
    
    # Create input
    img_input = Input(shape=(240, 320, 4), name='img_input')

    x = img_input
    
    # Convolution to extract features from image
    x = Conv2D(filters = 4, kernel_size = 5, strides = 2)(x)
    x = MaxPooling2D((2, 2))(x)
    x = Conv2D(filters = 4, kernel_size = 5, strides = 2)(x)
    x = MaxPooling2D((2, 2))(x)
    x = Conv2D(filters = 1, kernel_size = 5, strides = 1)(x)
    x = Activation('relu')(x)
    
    # Feedforward NN
    x = Flatten()(x)
    x = Dense(128)(x)
    x = Activation('relu')(x)
    x = Dense(256)(x)
    x = Activation('relu')(x)

    # Output is which action to take
    output = Dense(nb_actions)(x)
    model = Model(inputs=[img_input], outputs=output)
    print(model.summary())

    return model

In [4]:
# Class to process observations
class RosbotProcessor(Processor):

    # Returns observation
    def process_observation(self, observation):
        return observation

    # Batch process 
    # Returns list of corresponding observation for each state in batch
    def process_state_batch(self, batch):
        imgs_batch = []
        for exp in batch:
            imgs = []
            configs = []
            for state in exp:
                imgs.append(np.expand_dims(state[0], 0))
                configs.append(np.expand_dims(100, 0))
            imgs_batch.append(np.concatenate(imgs, -1))
        imgs_batch = np.concatenate(imgs_batch, 0)

        return imgs_batch

In [5]:
nb_actions = env.action_space.n

# Parameters
memory_size = 25000
window_length = 4
total_nb_steps = 50000
exploration_anneal_nb_steps = 20000
max_eps = 0.8 # Max epsilon
min_eps = 0.1 # Minimum epsilon
learning_warmup_nb_steps = 50
target_model_update = 1e-2
learning_rate = 1e-3

## DQN

In [6]:
# Initiliaze observation processor
processor = RosbotProcessor()
# Initialize policy
policy = LinearAnnealedPolicy(EpsGreedyQPolicy(), 'eps', max_eps, min_eps, 0.0, exploration_anneal_nb_steps)

# Initialize model
model = create_network()
# Initialize memory
memory = SequentialMemory(limit = memory_size, window_length = window_length)





_________________________________________________________________
Layer (type)                 Output Shape              Param #   
img_input (InputLayer)       (None, 240, 320, 4)       0         
_________________________________________________________________
conv2d_1 (Conv2D)            (None, 118, 158, 4)       404       
_________________________________________________________________
max_pooling2d_1 (MaxPooling2 (None, 59, 79, 4)         0         
_________________________________________________________________
conv2d_2 (Conv2D)            (None, 28, 38, 4)         404       
_________________________________________________________________
max_pooling2d_2 (MaxPooling2 (None, 14, 19, 4)         0         
_________________________________________________________________
conv2d_3 (Conv2D)            (None, 10, 15, 1)         101       
_________________________________________________________________
activation_1 (Activation)    (None, 10, 15, 1)         0         
______

In [7]:
# Create DQN agent
dqn = DQNAgent(processor = processor, model = model, nb_actions = nb_actions, memory = memory, nb_steps_warmup = learning_warmup_nb_steps,
                   enable_double_dqn = False, target_model_update = target_model_update, policy = policy)

In [8]:
# Compile
dqn.compile(Adam(lr = learning_rate), metrics = ['mae'])






In [9]:
# Train
dqn_train = dqn.fit(env, action_repetition=1, nb_steps=total_nb_steps, visualize=False,  verbose = 1)

Training for 50000 steps ...
Interval 1 (0 steps performed)
206 episodes - episode_reward: 0.233 [0.000, 1.000] - loss: 0.006 - mean_absolute_error: 0.609 - mean_q: 0.846 - mean_eps: 0.624 - real_time_factor_for_move: 12.367

Interval 2 (10000 steps performed)
175 episodes - episode_reward: 0.246 [0.000, 1.000] - loss: 0.007 - mean_absolute_error: 0.924 - mean_q: 1.262 - mean_eps: 0.275 - real_time_factor_for_move: 12.112

Interval 3 (20000 steps performed)
179 episodes - episode_reward: 0.307 [0.000, 1.000] - loss: 0.005 - mean_absolute_error: 0.783 - mean_q: 1.062 - mean_eps: 0.100 - real_time_factor_for_move: 12.075

Interval 4 (30000 steps performed)
180 episodes - episode_reward: 0.339 [0.000, 1.000] - loss: 0.002 - mean_absolute_error: 0.524 - mean_q: 0.707 - mean_eps: 0.100 - real_time_factor_for_move: 12.203

Interval 5 (40000 steps performed)
done, took 40680.453 seconds


In [10]:
# Save final weights
dqn.save_weights('dqn_weigths', overwrite = True)

## Double DQN

In [9]:
# Initiliaze observation processor
processor = RosbotProcessor()
# Initialize policy
policy = LinearAnnealedPolicy(EpsGreedyQPolicy(), 'eps', max_eps, min_eps, 0.0, exploration_anneal_nb_steps)

# Initialize model
model = create_network()
# Initialize memory
memory = SequentialMemory(limit = memory_size, window_length = window_length)

_________________________________________________________________
Layer (type)                 Output Shape              Param #   
img_input (InputLayer)       (None, 240, 320, 4)       0         
_________________________________________________________________
conv2d_4 (Conv2D)            (None, 118, 158, 4)       404       
_________________________________________________________________
max_pooling2d_3 (MaxPooling2 (None, 59, 79, 4)         0         
_________________________________________________________________
conv2d_5 (Conv2D)            (None, 28, 38, 4)         404       
_________________________________________________________________
max_pooling2d_4 (MaxPooling2 (None, 14, 19, 4)         0         
_________________________________________________________________
conv2d_6 (Conv2D)            (None, 10, 15, 1)         101       
_________________________________________________________________
activation_4 (Activation)    (None, 10, 15, 1)         0         
__________

In [10]:
# Create agent
double_dqn = DQNAgent(processor = processor, model = model, nb_actions = nb_actions, memory = memory, nb_steps_warmup = learning_warmup_nb_steps,
                   enable_double_dqn = True, target_model_update = target_model_update, policy = policy)

In [11]:
# Compile
double_dqn.compile(Adam(lr = learning_rate), metrics=['mae'])

In [11]:
# Train
double_dqn_train = double_dqn.fit(env, action_repetition = 1, nb_steps = total_nb_steps, visualize = False,  verbose = 1)

Training for 50000 steps ...
Interval 1 (0 steps performed)
210 episodes - episode_reward: 0.219 [0.000, 1.000] - loss: 0.005 - mean_absolute_error: 0.442 - mean_q: 0.613 - mean_eps: 0.624 - real_time_factor_for_move: 12.257

Interval 2 (10000 steps performed)
189 episodes - episode_reward: 0.344 [0.000, 1.000] - loss: 0.003 - mean_absolute_error: 0.520 - mean_q: 0.706 - mean_eps: 0.275 - real_time_factor_for_move: 12.170

Interval 3 (20000 steps performed)
203 episodes - episode_reward: 0.438 [0.000, 1.000] - loss: 0.002 - mean_absolute_error: 0.417 - mean_q: 0.566 - mean_eps: 0.100 - real_time_factor_for_move: 12.329

Interval 4 (30000 steps performed)
198 episodes - episode_reward: 0.424 [0.000, 1.000] - loss: 0.002 - mean_absolute_error: 0.416 - mean_q: 0.566 - mean_eps: 0.100 - real_time_factor_for_move: 12.213

Interval 5 (40000 steps performed)
done, took 46954.232 seconds


In [12]:
# Save final weights
double_dqn.save_weights('double_dqn_weigths', overwrite=True)

## Duel Double DQN

In [12]:
# Initiliaze observation processor
processor = RosbotProcessor()
# Initialize policy
policy = LinearAnnealedPolicy(EpsGreedyQPolicy(), 'eps', max_eps, min_eps, 0.0, exploration_anneal_nb_steps)

# Initialize model
model = create_network()
# Initialize memory
memory = SequentialMemory(limit = memory_size, window_length = window_length)

_________________________________________________________________
Layer (type)                 Output Shape              Param #   
img_input (InputLayer)       (None, 240, 320, 4)       0         
_________________________________________________________________
conv2d_7 (Conv2D)            (None, 118, 158, 4)       404       
_________________________________________________________________
max_pooling2d_5 (MaxPooling2 (None, 59, 79, 4)         0         
_________________________________________________________________
conv2d_8 (Conv2D)            (None, 28, 38, 4)         404       
_________________________________________________________________
max_pooling2d_6 (MaxPooling2 (None, 14, 19, 4)         0         
_________________________________________________________________
conv2d_9 (Conv2D)            (None, 10, 15, 1)         101       
_________________________________________________________________
activation_7 (Activation)    (None, 10, 15, 1)         0         
__________

In [13]:
# Create agent
duel_dqn = DQNAgent(processor = processor, model = model, nb_actions = nb_actions, memory = memory, nb_steps_warmup = learning_warmup_nb_steps,
                   enable_double_dqn = True, target_model_update = target_model_update, policy = policy, enable_dueling_network = True, dueling_type = 'avg')

In [14]:
# Compile
duel_dqn.compile(Adam(lr = learning_rate), metrics = ['mae'])

In [16]:
# Train
duel_dqn_train = duel_dqn.fit(env, action_repetition = 1, nb_steps = total_nb_steps, visualize = False,  verbose = 1)

Training for 50000 steps ...
Interval 1 (0 steps performed)
185 episodes - episode_reward: 0.276 [0.000, 1.000] - loss: 0.002 - mean_absolute_error: 0.466 - mean_q: 0.645 - mean_eps: 0.624 - real_time_factor_for_move: 11.769

Interval 2 (10000 steps performed)
181 episodes - episode_reward: 0.326 [0.000, 1.000] - loss: 0.002 - mean_absolute_error: 0.473 - mean_q: 0.643 - mean_eps: 0.275 - real_time_factor_for_move: 11.924

Interval 3 (20000 steps performed)
181 episodes - episode_reward: 0.403 [0.000, 1.000] - loss: 0.002 - mean_absolute_error: 0.421 - mean_q: 0.572 - mean_eps: 0.100 - real_time_factor_for_move: 12.141

Interval 4 (30000 steps performed)
174 episodes - episode_reward: 0.310 [0.000, 1.000] - loss: 0.002 - mean_absolute_error: 0.407 - mean_q: 0.551 - mean_eps: 0.100 - real_time_factor_for_move: 11.935

Interval 5 (40000 steps performed)
done, took 47136.826 seconds


In [17]:
# Save final weights
duel_dqn.save_weights('duel_dqn_weigths', overwrite = True)

## Results

In [15]:
# Load weights saved after training
dqn.load_weights('dqn_weigths')
double_dqn.load_weights('double_dqn_weigths')
duel_dqn.load_weights('duel_dqn_weigths')

In [16]:
# Evaluate how well our model does for 100 episodes
visu = False
n = 100
print("For DQN")
dqn_test = dqn.test(env, nb_episodes = n, visualize = visu)
print("For Double DQN")
double_dqn_test = double_dqn.test(env, nb_episodes = n, visualize = visu)
print("For Duel Double DQN")
duel_dqn_test = duel_dqn.test(env, nb_episodes = n, visualize = visu)

For DQN
Testing for 100 episodes ...
Episode 1: reward: 0.000, steps: 100
Episode 2: reward: 1.000, steps: 68
Episode 3: reward: 1.000, steps: 2
Episode 4: reward: 0.000, steps: 100
Episode 5: reward: 0.000, steps: 100
Episode 6: reward: 0.000, steps: 3
Episode 7: reward: 1.000, steps: 99
Episode 8: reward: 1.000, steps: 21
Episode 9: reward: 0.000, steps: 100
Episode 10: reward: 0.000, steps: 100
Episode 11: reward: 1.000, steps: 82
Episode 12: reward: 0.000, steps: 100
Episode 13: reward: 1.000, steps: 25
Episode 14: reward: 1.000, steps: 59
Episode 15: reward: 0.000, steps: 5
Episode 16: reward: 0.000, steps: 100
Episode 17: reward: 0.000, steps: 100
Episode 18: reward: 1.000, steps: 49
Episode 19: reward: 0.000, steps: 100
Episode 20: reward: 1.000, steps: 2
Episode 21: reward: 0.000, steps: 100
Episode 22: reward: 1.000, steps: 66
Episode 23: reward: 0.000, steps: 100
Episode 24: reward: 1.000, steps: 13
Episode 25: reward: 0.000, steps: 20
Episode 26: reward: 0.000, steps: 100
Ep

Episode 20: reward: 0.000, steps: 100
Episode 21: reward: 0.000, steps: 8
Episode 22: reward: 0.000, steps: 7
Episode 23: reward: 0.000, steps: 20
Episode 24: reward: 1.000, steps: 22
Episode 25: reward: 0.000, steps: 28
Episode 26: reward: 1.000, steps: 33
Episode 27: reward: 0.000, steps: 100
Episode 28: reward: 0.000, steps: 100
Episode 29: reward: 0.000, steps: 100
Episode 30: reward: 0.000, steps: 100
Episode 31: reward: 0.000, steps: 44
Episode 32: reward: 0.000, steps: 100
Episode 33: reward: 0.000, steps: 9
Episode 34: reward: 0.000, steps: 5
Episode 35: reward: 0.000, steps: 10
Episode 36: reward: 0.000, steps: 21
Episode 37: reward: 0.000, steps: 43
Episode 38: reward: 0.000, steps: 100
Episode 39: reward: 1.000, steps: 22
Episode 40: reward: 1.000, steps: 98
Episode 41: reward: 0.000, steps: 100
Episode 42: reward: 0.000, steps: 4
Episode 43: reward: 0.000, steps: 100
Episode 44: reward: 1.000, steps: 53
Episode 45: reward: 0.000, steps: 3
Episode 46: reward: 1.000, steps: 1

In [17]:
# Reward average
dqn_success = np.mean(dqn_test.history['episode_reward'])
double_dqn_success = np.mean(double_dqn_test.history['episode_reward'])
duel_dqn_success = np.mean(duel_dqn_test.history['episode_reward'])
print(f'DQN success percentage: {dqn_success}')
print(f'Double DQN success percentage: {double_dqn_success}')
print(f'Duel DQN success percentage: {duel_dqn_success}')

DQN success percentage: 0.44
Double DQN success percentage: 0.37
Duel DQN success percentage: 0.31


In [20]:
# DQN Average number of steps
avg_all = np.mean(dqn_test.history['nb_steps'])
successes = [dqn_test.history['nb_steps'][i] for i in range(len(dqn_test.history['nb_steps'])) if dqn_test.history['episode_reward'][i] == 1]
avg_success = np.mean(successes)
quick_episodes = [i for i in dqn_test.history['nb_steps'] if i < 10]
print("For DQN")
print(f"Average from all episodes: {avg_all}")
print(f"Average from successful episodes: {avg_success}")
print(f"Percentage of quick episodes out of successful episodes: {len(quick_episodes) / len(successes)}")

For DQN
Average from all episodes: 61.14
Average from successful episodes: 33.97826086956522
Percentage of quick episodes out of successful episodes: 0.1956521739130435


In [21]:
# Double DQN Average number of steps
avg_all = np.mean(double_dqn_test.history['nb_steps'])
successes = [double_dqn_test.history['nb_steps'][i] for i in range(len(double_dqn_test.history['nb_steps'])) if double_dqn_test.history['episode_reward'][i] == 1]
avg_success = np.mean(successes)
quick_episodes = [i for i in double_dqn_test.history['nb_steps'] if i < 10]
print("For Double DQN")
print(f"Average from all episodes: {avg_all}")
print(f"Average from successful episodes: {avg_success}")
print(f"Percentage of quick episodes out of successful episodes: {len(quick_episodes) / len(successes)}")

For Double DQN
Average from all episodes: 48.48
Average from successful episodes: 29.307692307692307
Percentage of quick episodes out of successful episodes: 0.2564102564102564


In [22]:
# Duel DQN Average number of steps
avg_all = np.mean(duel_dqn_test.history['nb_steps'])
successes = [duel_dqn_test.history['nb_steps'][i] for i in range(len(duel_dqn_test.history['nb_steps'])) if duel_dqn_test.history['episode_reward'][i] == 1]
avg_success = np.mean(successes)
quick_episodes = [i for i in duel_dqn_test.history['nb_steps'] if i < 10]
print("For Duel Double DQN")
print(f"Average from all episodes: {avg_all}")
print(f"Average from successful episodes: {avg_success}")
print(f"Percentage of quick episodes out of successful episodes: {len(quick_episodes) / len(successes)}")

For Duel Double DQN
Average from all episodes: 54.47
Average from successful episodes: 23.708333333333332
Percentage of quick episodes out of successful episodes: 0.6666666666666666


Note: The callbacks.py script in the rl library was modified to return the metrics so the code below will return an error if run on another computer.

In [None]:
# Extracting metrics
# 1st value is the loss, second is the mean absolute error, third is mean q value and fourth is the mean epsilon value
dqn_loss = [i for i,j,n,m in dqn_train.history['metrics'] if i == i]
dqn_mean_absolute_error = [j for i,j,n,m in dqn_train.history['metrics'] if j == j]
dqn_mean_q = [n for i,j,n,m in dqn_train.history['metrics'] if n == n]
dqn_mean_eps = [m for i,j,n,m in dqn_train.history['metrics'] if m == m]

double_dqn_loss = [i for i,j,n,m in double_dqn_train.history['metrics'] if i == i]
double_dqn_mean_absolute_error = [j for i,j,n,m in double_dqn_train.history['metrics'] if j == j]
double_dqn_mean_q = [n for i,j,n,m in double_dqn_train.history['metrics'] if n == n]
double_dqn_mean_eps = [m for i,j,n,m in double_dqn_train.history['metrics'] if m == m]

duel_dqn_loss = [i for i,j,n,m in duel_dqn_train.history['metrics'] if i == i]
duel_dqn_mean_absolute_error = [j for i,j,n,m in duel_dqn_train.history['metrics'] if j == j]
duel_dqn_mean_q = [n for i,j,n,m in duel_dqn_train.history['metrics'] if n == n]
duel_dqn_mean_eps = [m for i,j,n,m in duel_dqn_train.history['metrics'] if m == m]

In [None]:
# Plot Loss variation during training
plt.plot(dqn_loss, label = "DQN")
plt.plot(double_dqn_loss, label = "Double DQN")
plt.plot(duel_dqn_loss, label = "Duel DQN")
plt.legend()
plt.title("Loss comparison", fontsize = 20)
plt.xlabel("Episode", fontsize = 15)
plt.ylabel("Loss", fontsize = 15)
#plt.savefig("/home/mcd0029/Documents/CPE693/loss_compare.png")

In [None]:
# Plot mean absolute error during training
plt.plot(dqn_mean_absolute_error, label = "DQN")
plt.plot(double_dqn_mean_absolute_error, label = "Double DQN")
plt.plot(duel_dqn_mean_absolute_error, label = "Duel DQN")
plt.legend()
plt.title("Mean Absolute Error comparison", fontsize = 15)
plt.xlabel("Episode", fontsize = 12)
plt.ylabel("Mean Absolute Error", fontsize = 12)
#plt.savefig("/home/mcd0029/Documents/CPE693/mean_error_compare.png")

In [None]:
# Duel DQN independent results
duel_dqn_test = duel_dqn.test(env, nb_episodes = 100, visualize = True)
duel_dqn_success = np.mean(duel_dqn_test.history['episode_reward'])
print(f'Duel DQN success percentage: {duel_dqn_success}')

# Duel DQN Average number of steps
avg_all = np.mean(duel_dqn_test.history['nb_steps'])
successes = [duel_dqn_test.history['nb_steps'][i] for i in range(len(duel_dqn_test.history['nb_steps'])) if duel_dqn_test.history['episode_reward'][i] == 1]
avg_success = np.mean(successes)
quick_episodes = [i for i in duel_dqn_test.history['nb_steps'] if i < 10]
print("For Duel Double DQN")
print(f"Average from all episodes: {avg_all}")
print(f"Average from successful episodes: {avg_success}")
print(f"Percentage of quick episodes out of successful episodes: {len(quick_episodes) / len(successes)}")