In [None]:
import gym
import tensorflow as tf
from tensorflow import keras
import random
import numpy as np
import datetime as dt
import imageio
from tensorflow.keras import layers
import time
def current_milli_time():
    """Return the current wall-clock time as an integer number of milliseconds."""
    millis = time.time() * 1000
    return int(round(millis))


In [None]:
# Directory prefix used for the TensorBoard log folder and the recorded GIFs.
STORE_PATH = 'Projetto_Reti2020'
# Upper and lower bounds of the exploration rate used by the epsilon-greedy policy.
MAX_EPSILON = 0.9
MIN_EPSILON = 0.1
# Number of steps used as the denominator of the linear epsilon decay.
EPSILON_MIN_ITER = 5000
GAMMA = 0.99 # discount factor applied to the bootstrapped next-state value
# Number of transitions drawn from the replay memory for each training step.
BATCH_SIZE = 32
# Soft-update rate for the target network (same value update_network blends with).
TAU = 0.08 
# Number of environment steps played with random actions before training starts.
DELAY_TRAINING = 25000 
NUM_FRAMES = 4 # number of consecutive frames stacked to form one network input
# A GIF of the episode is recorded every GIF_RECORDING_FREQ episodes.
GIF_RECORDING_FREQ = 100

# Create the Atari environment and read its action / observation dimensions.
env = gym.make("SpaceInvaders-v0")
num_actions = env.action_space.n
space_dim   = env.observation_space.shape

# Network input shape: frames are resized from (210, 160) to (105, 80) and
# NUM_FRAMES = 4 of them are stacked along the last axis.
POST_PROCESS_IMAGE_SIZE = (105, 80, 4)


Definition of the model:
* 3 convolutional layers (to extract relevant features)
* 1 flatten layer (reshaping only, no trainable parameters)
* 1 fully-connected layer 
* 1 output layer with n=num_actions units


In [None]:
def CNN(input_shape=(105,80,4), output_layer= 3, last_activation = None):
    """Build the convolutional Q-network as an uncompiled Sequential model.

    Args:
        input_shape: shape of a single input sample (height, width, channels).
        output_layer: number of units of the final Dense layer.
        last_activation: activation passed to the final Dense layer.

    Returns:
        A keras.Sequential model: three Conv2D layers, Flatten, Dense(128)
        and the output Dense layer.
    """
    # (filters, kernel_size, strides) of the three convolutional layers.
    conv_specs = [(16, 8, 4), (32, 4, 2), (32, 3, 1)]

    stack = [keras.Input(shape=input_shape, name="input_layer")]
    for n_filters, kernel, stride in conv_specs:
        stack.append(
            layers.Conv2D(filters=n_filters, kernel_size=kernel, strides=stride,
                          padding="valid", activation="relu")
        )
    stack.append(layers.Flatten())
    stack.append(layers.Dense(128, activation="relu"))
    stack.append(layers.Dense(units=output_layer, activation=last_activation))
    return keras.Sequential(stack)

Network initialization. \
Note: the target network starts as a copy of the primary network.\
Optimizer: Adam, loss: Huber.

In [None]:
# Build the primary (trained) network and the target network with the same architecture.
primary_network = CNN(input_shape= POST_PROCESS_IMAGE_SIZE, output_layer=num_actions)
target_network  = CNN(input_shape= POST_PROCESS_IMAGE_SIZE, output_layer= num_actions)

# Start the target network as an exact copy of the primary network.
for t, e in zip(target_network.trainable_variables, primary_network.trainable_variables):
    t.assign(e)

# Compile once: a second compile() replaces the first, so the earlier
# compile with loss='mse' had no effect and has been dropped.
primary_network.compile(optimizer=keras.optimizers.Adam(), loss=tf.keras.losses.Huber())


In [None]:
# Print the layer-by-layer output shapes and parameter counts of the primary network.
primary_network.summary()

Model: "sequential"
_________________________________________________________________
Layer (type)                 Output Shape              Param #   
conv2d (Conv2D)              (None, 25, 19, 16)        4112      
_________________________________________________________________
conv2d_1 (Conv2D)            (None, 11, 8, 32)         8224      
_________________________________________________________________
conv2d_2 (Conv2D)            (None, 9, 6, 32)          9248      
_________________________________________________________________
flatten (Flatten)            (None, 1728)              0         
_________________________________________________________________
dense (Dense)                (None, 128)               221312    
_________________________________________________________________
dense_1 (Dense)              (None, 4)                 516       
Total params: 243,412
Trainable params: 243,412
Non-trainable params: 0
__________________________________________________

Replay memory buffer: `add_sample` stores a transition and `sample` draws a random batch of `BATCH_SIZE` transitions.

In [None]:
class Memory:
    """Fixed-size replay buffer storing single frames, actions, rewards and terminal flags.

    Each slot k holds the action taken at step k together with the reward,
    terminal flag and (single-channel) frame that resulted from that action.
    Stacked states are rebuilt at sampling time from consecutive frames.
    """

    def __init__(self, max_memory):
        """Allocate storage for `max_memory` transitions."""
        self._max_memory = max_memory
        self._actions = np.zeros(max_memory, dtype=np.int32)
        self._rewards = np.zeros(max_memory, dtype=np.float32)
        self._frames = np.zeros((POST_PROCESS_IMAGE_SIZE[0], POST_PROCESS_IMAGE_SIZE[1], max_memory), dtype=np.float32)
        # np.bool_ instead of the np.bool alias, which was removed in NumPy 1.24.
        self._terminal = np.zeros(max_memory, dtype=np.bool_)
        # Next slot to write.
        self._i = 0
        # Number of slots that contain data; stays at max_memory once the
        # write pointer has wrapped, so sampling keeps using the whole buffer.
        self._filled = 0

    def add_sample(self, frame, action, reward, terminal):
        """Store one transition; only channel 0 of `frame` is kept."""
        self._actions[self._i] = action
        self._rewards[self._i] = reward
        self._frames[:, :, self._i] = frame[:, :, 0]
        self._terminal[self._i] = terminal
        self._filled = max(self._filled, self._i + 1)
        if self._i % (self._max_memory - 1) == 0 and self._i != 0:
            # Buffer is full: restart writing near the beginning.
            self._i = BATCH_SIZE + NUM_FRAMES + 1
        else:
            self._i += 1

    def sample(self):
        """Draw a random batch of BATCH_SIZE transitions.

        Returns:
            (states, actions, rewards, next_states, terminal) where states and
            next_states have shape (BATCH_SIZE, height, width, NUM_FRAMES).

        Raises:
            ValueError: if fewer than BATCH_SIZE + NUM_FRAMES + 1 samples are stored.
        """
        if self._filled < BATCH_SIZE + NUM_FRAMES + 1:
            raise ValueError("Not enough memory to extract a batch")

        # Sample over every filled slot, not only the slots below the write
        # pointer, so the buffer stays fully usable after wrap-around.
        # NOTE(review): windows that straddle the write pointer or an episode
        # boundary mix unrelated frames; they are not filtered out here.
        rand_idxs = np.random.randint(NUM_FRAMES + 1, self._filled, size=BATCH_SIZE)
        states = np.zeros((BATCH_SIZE, POST_PROCESS_IMAGE_SIZE[0], POST_PROCESS_IMAGE_SIZE[1], NUM_FRAMES),
                          dtype=np.float32)
        next_states = np.zeros((BATCH_SIZE, POST_PROCESS_IMAGE_SIZE[0], POST_PROCESS_IMAGE_SIZE[1], NUM_FRAMES),
                               dtype=np.float32)
        for i, idx in enumerate(rand_idxs):
            # Frame idx is the result of action idx, so the state the action
            # was taken in ends at frame idx - 1 and the next state ends at
            # frame idx (slice ends are exclusive).
            states[i] = self._frames[:, :, idx - NUM_FRAMES:idx]
            next_states[i] = self._frames[:, :, idx - NUM_FRAMES + 1:idx + 1]
        return states, self._actions[rand_idxs], self._rewards[rand_idxs], next_states, self._terminal[rand_idxs]

In [None]:
# Make the replay memory as large as the available RAM allows, to store more episodes.
# The frame store is a float32 array of shape (105, 80, max_memory), so
# 150000 frames need roughly 5 GB.
memory = Memory(150000)

Some utility functions.

In [None]:
def image_preprocess(image, new_size=(105,80)):
    """Convert an RGB frame to greyscale, resize it to `new_size` and divide it by 255."""
    grey = tf.image.rgb_to_grayscale(image)
    resized = tf.image.resize(grey, new_size)
    return resized / 255


def choose_action(state, primary_network, eps, step):
    """Pick an action with an epsilon-greedy policy.

    While `step` is below DELAY_TRAINING the action is always random.
    Afterwards a random action is taken with probability `eps`, otherwise
    the action with the highest output of `primary_network` is returned.
    """
    # Short-circuit keeps the random draw out of the warm-up phase.
    explore = step < DELAY_TRAINING or random.random() < eps
    if explore:
        return random.randint(0, num_actions - 1)

    # Add a leading batch axis of size 1 before querying the network.
    batch_shape = (1, POST_PROCESS_IMAGE_SIZE[0], POST_PROCESS_IMAGE_SIZE[1], NUM_FRAMES)
    network_input = tf.reshape(state, batch_shape).numpy()
    q_values = primary_network(network_input)
    return np.argmax(q_values)


def update_network(primary_network, target_network, check= False, tau=None):
    """Move the target network's weights towards the primary network's weights.

    Each target variable t is replaced by t * (1 - tau) + e * tau, where e is
    the matching primary variable.

    Args:
        primary_network: network whose trainable variables are the source.
        target_network: network whose trainable variables are updated in place.
        check: when True, do a full copy (tau = 1) regardless of `tau`.
        tau: blending factor for the soft update; defaults to the module-level
            TAU constant instead of a duplicated literal.
    """
    if check:
        tau = 1
    elif tau is None:
        tau = TAU
    for t, e in zip(target_network.trainable_variables, primary_network.trainable_variables):
        t.assign(t * (1 - tau) + e * tau)


def process_state_stack(state_stack, state):
    """Shift the stacked frames one slot towards the front and append `state`.

    Channel 0 of `state` is written into the last slot of `state_stack`;
    the same (updated in place) `state_stack` object is returned.
    """
    last_slot = state_stack.shape[-1] - 1
    # Move every frame one position towards index 0, oldest first.
    for dst in range(last_slot):
        newer_frame = state_stack[:, :, dst + 1]
        state_stack[:, :, dst].assign(newer_frame)
    # The newest frame takes the last position.
    state_stack[:, :, -1].assign(state[:, :, 0])
    return state_stack


def record_gif(frame_list, episode, fps=50):
    """Save the frames of one episode as an animated GIF under STORE_PATH."""
    gif_path = STORE_PATH + f"/SpaceInvaders_EPISODE-{episode}.gif"
    imageio.mimsave(gif_path, frame_list, fps=fps)

Training structure.

In [None]:
def train(primary_network, memory, target_network=None):
    """Run one training step of the primary network on a batch from `memory`.

    The regression target equals the primary network's current output,
    except at the taken action, where it is the reward plus (for
    non-terminal transitions) GAMMA times a next-state value. With a target
    network, the next action is chosen by the primary network and valued by
    the target network; without one, the primary network's maximum next
    Q-value is used.

    Returns:
        The loss reported by primary_network.train_on_batch.
    """
    states, actions, rewards, next_states, terminal = memory.sample()
    row_idx = np.arange(BATCH_SIZE)
    # Only non-terminal transitions receive a bootstrapped next-state value.
    not_done = np.logical_not(terminal)

    # Q(s, .) serves as the base target; Q(s', .) drives the bootstrap term.
    q_targets = primary_network(states).numpy()
    q_next_primary = primary_network(next_states).numpy()

    if target_network is not None:
        best_next = np.argmax(q_next_primary, axis=1)
        q_next_target = target_network(next_states).numpy()
        bootstrap = q_next_target[row_idx[not_done], best_next[not_done]]
    else:
        bootstrap = np.amax(q_next_primary[not_done, :], axis=1)
    rewards[not_done] += GAMMA * bootstrap

    # Overwrite only the entry of the action that was actually taken.
    q_targets[row_idx, actions] = rewards
    return primary_network.train_on_batch(states, q_targets)

GPU check for Colab. The cells below stop with an error when no GPU is available to TensorFlow.


In [None]:
# Stop with an error unless TensorFlow reports the first GPU; otherwise print its name.
device_name = tf.test.gpu_device_name()
if device_name != '/device:GPU:0':
    raise SystemError('GPU device not found')
print('Found GPU at: {}'.format(device_name))

Found GPU at: /device:GPU:0


In [None]:
# Number of environment steps to collect before batches are sampled from the
# replay memory and training starts.
DELAY_TRAINING = 25000

In [None]:
# Training requires a GPU: stop with an explanatory message when none is found.
device_name = tf.test.gpu_device_name()
if device_name != '/device:GPU:0':
    print(
      '\n\nThis error most likely means that this notebook is not '
      'configured to use a GPU.  Change this in Notebook Settings via the '
      'command palette (cmd/ctrl-shift-P) or the Edit menu.\n\n')
    raise SystemError('GPU device not found')
with tf.device('/device:GPU:0'):
    num_episodes = 10000 #note that they should be even more
    eps = MAX_EPSILON
    render = False #passing render if you want to render the game during training
    train_writer = tf.summary.create_file_writer(STORE_PATH + f"/DQN_{dt.datetime.now().strftime('%d%m%Y%H%M')}")

    steps = 0  # total environment steps across all episodes
    ep_time = current_milli_time()

    for i in range(num_episodes):
        state = env.reset()
        state = image_preprocess(state)
        # Initial stack: the first frame repeated NUM_FRAMES times along the last axis.
        state_stack = tf.Variable(np.repeat(state.numpy(), NUM_FRAMES).reshape((POST_PROCESS_IMAGE_SIZE[0],
                                                                                  POST_PROCESS_IMAGE_SIZE[1],
                                                                                  NUM_FRAMES)))
        cnt = 1
        avg_loss = 0
        tot_reward = 0
        if i % GIF_RECORDING_FREQ == 0:
            frame_list = []
        while True:
            #if render:
            #    env.render()
            # eps-greedy action
            action = choose_action(state_stack, primary_network, eps, steps)
            next_state, reward, done, info = env.step(action)
            tot_reward += reward
            if i % GIF_RECORDING_FREQ == 0:
                frame_list.append(tf.cast(tf.image.resize(next_state, (480, 320)), tf.uint8).numpy())
            next_state = image_preprocess(next_state)
            state_stack = process_state_stack(state_stack, next_state)
            # store in memory
            memory.add_sample(next_state, action, reward, done)

            if steps > DELAY_TRAINING:
                loss = train(primary_network, memory, target_network)
                # Soft target updates during the first 100 episodes; afterwards a
                # full copy on every step of each 100th episode.
                if i < 100:
                    update_network(primary_network, target_network, check= False)
                else:
                    if i % 100 == 0:
                        update_network(primary_network,target_network,check=True)

            else:
                loss = -1
            # NOTE(review): the -1 placeholder of pre-training steps is also
            # accumulated, which skews avg_loss in the episode where training starts.
            avg_loss += loss

            # Linearly decay eps over the EPSILON_MIN_ITER steps that follow the
            # warm-up. The decay progress is measured from DELAY_TRAINING;
            # comparing the raw step count with EPSILON_MIN_ITER would skip the
            # decay entirely whenever DELAY_TRAINING >= EPSILON_MIN_ITER.
            if steps > DELAY_TRAINING:
                decay_steps = steps - DELAY_TRAINING
                if decay_steps < EPSILON_MIN_ITER:
                    eps = MAX_EPSILON - (decay_steps / EPSILON_MIN_ITER) * \
                        (MAX_EPSILON - MIN_EPSILON)
                else:
                    eps = MIN_EPSILON
            steps += 1

            if done:
                ep_sec_time = int((current_milli_time()-ep_time) / 1000)

                if steps > DELAY_TRAINING:
                    avg_loss /= cnt
                    print(f"Episode: {i}, Reward: {tot_reward}, avg loss: {avg_loss:.5f}, eps: {eps:.3f}, Time: {ep_sec_time: d}")
                    #tensorboard output
                    with train_writer.as_default():
                        tf.summary.scalar('reward', tot_reward, step=i)
                        tf.summary.scalar('avg loss', avg_loss, step=i)
                else:
                    print(f"Pre-training...Episode: {i}")
                if i % GIF_RECORDING_FREQ == 0:
                    record_gif(frame_list, i)
                ep_time = current_milli_time()
                break

            cnt += 1

Pre-training...Episode: 0
Pre-training...Episode: 1
Pre-training...Episode: 2
Pre-training...Episode: 3
Pre-training...Episode: 4
Pre-training...Episode: 5
Pre-training...Episode: 6
Pre-training...Episode: 7
Pre-training...Episode: 8
Pre-training...Episode: 9
Pre-training...Episode: 10
Pre-training...Episode: 11
Pre-training...Episode: 12
Pre-training...Episode: 13
Pre-training...Episode: 14
Pre-training...Episode: 15
Pre-training...Episode: 16
Pre-training...Episode: 17
Pre-training...Episode: 18
Pre-training...Episode: 19
Pre-training...Episode: 20
Pre-training...Episode: 21
Pre-training...Episode: 22
Pre-training...Episode: 23
Pre-training...Episode: 24
Pre-training...Episode: 25
Pre-training...Episode: 26
Pre-training...Episode: 27
Pre-training...Episode: 28
Pre-training...Episode: 29
Pre-training...Episode: 30
Pre-training...Episode: 31
Pre-training...Episode: 32
Pre-training...Episode: 33
Pre-training...Episode: 34
Pre-training...Episode: 35
Pre-training...Episode: 36
Pre-trainin

Note: Actual run in "Breakout-v0" environment. 

Link to source code: \
https://adventuresinmachinelearning.com/atari-space-invaders-dueling-q/

## Creating the copy section to apply transfer learning on other environments

This part is still a work in progress!

In [None]:
# Take model_from_json from the Keras bundled with TensorFlow: target_network
# is a tensorflow.keras model, and deserialising it with the standalone
# `keras` package mixes two different Keras implementations.
from tensorflow.keras.models import model_from_json
import os

# serialize model to JSON
model_json = target_network.to_json()
with open("model.json", "w") as json_file:
    json_file.write(model_json)
# serialize weights to HDF5
target_network.save_weights("model.h5")
print("Saved model to disk")

# later...

# load json and create model (context manager closes the file even on error)
with open('model.json', 'r') as json_file:
    loaded_model_json = json_file.read()
loaded_model = model_from_json(loaded_model_json)
# load weights into new model
loaded_model.load_weights("model.h5")
print("Loaded model from disk")

Saved model to disk


ValueError: ignored

In [None]:
# Save the target network (architecture + weights) as a single HDF5 file.
tf.keras.models.save_model(
    target_network,
    "target_net.h5",
    overwrite=True,
    include_optimizer=True,
    save_format=None,
    signatures=None,
    options=None
)

# Save the same network a second time under ./saved_model.
# `save_model` and `model` were undefined names here; use the fully
# qualified function and the network saved above.
filepath = './saved_model'
tf.keras.models.save_model(target_network, filepath)

NameError: ignored

In [None]:
# Load the network trained on Space Invaders.
# NOTE(review): no visible cell writes "SpaceInv_tnet"; confirm the path
# matches where the trained network was actually saved.
primary_net_breakout = keras.models.load_model("SpaceInv_tnet")
# Remove the old output layer through the model API; popping from the
# `.layers` list does not change the model itself.
primary_net_breakout.pop()

env = gym.make("Breakout-v0")
num_actions = env.action_space.n
space_dim   = env.observation_space.shape #the image input is resized as before

# Freeze the transferred layers so that only the new output layer is trained.
for layer in primary_net_breakout.layers:
    layer.trainable = False
# New output layer with one unit per Breakout action.
primary_net_breakout.add(layers.Dense(num_actions))
primary_net_breakout.summary()



NameError: ignored