In [1]:
import collections

# Capacity of the replay buffer, in stored transitions.
REPLAY_MEMORY_SIZE = 10000 # should be 1000000
# Backward-compatible alias for the original (misspelled) constant name,
# which other cells still reference.
REAPLAY_MEMORY_SIZE = REPLAY_MEMORY_SIZE
# Number of purely random steps collected before training starts.
REPLAY_START_SIZE = 500 # should be 50000
# Bounded FIFO of transitions: once `maxlen` is reached, appending drops the
# oldest entry from the opposite end.
experience_buffer = collections.deque(maxlen=REPLAY_MEMORY_SIZE)

In [2]:
# Epsilon-greedy schedule: the training loop interpolates linearly from
# INIT_EPS to END_EPS over FINAL_EXPLORATION_FRAME agent steps and then
# holds eps at END_EPS.
INIT_EPS, END_EPS = 1, 0.1
FINAL_EXPLORATION_FRAME = REAPLAY_MEMORY_SIZE

# Multiplier applied to the bootstrapped next-state value in the TD target.
DISCOUNT_FACTOR = 0.99

In [3]:
import gym
import numpy as np
import tensorflow as tf
from random import sample
from tensorflow.keras import layers, Model

# Report the accelerator TensorFlow sees; do not crash on CPU-only machines
# (indexing an empty device list raises IndexError).
gpus = tf.config.list_physical_devices('GPU')
print(gpus[0] if gpus else "No GPU detected, running on CPU")

env = gym.make("ALE/Breakout-v5")

N_ACTIONS = env.action_space.n
# Width of the pixel value range, taken from the numeric bounds of the
# observation space (cast to int first so unsigned arithmetic cannot wrap).
OBS_RANGE = int(env.observation_space.high.max()) - int(env.observation_space.low.min())
OBS_SHAPE = env.observation_space.shape
# SKIP_FRAMES = [2, 3, 4] # possible addition --> skip randomly 2/3/4 frames instead of always 4
N_INPUT_FRAMES = 4

BATCH_SIZE = 32

# The network input is N_INPUT_FRAMES consecutive frames concatenated along
# the channel axis.
input_shape = [*OBS_SHAPE]
input_shape[-1] *= N_INPUT_FRAMES

# Shared scratch buffer that the frame-stacking code writes into.
obs = np.zeros(input_shape)
target = np.zeros(BATCH_SIZE)

# Q-network: one output per action. The head must be linear
# (classifier_activation=None); a softmax would force the outputs to be
# positive and sum to 1, which cannot represent Q-values.
# NOTE(review): Keras EfficientNet models may rescale inputs internally --
# confirm whether dividing by OBS_RANGE in preprocess_observation is wanted.
model = tf.keras.applications.efficientnet.EfficientNetB0(
    include_top=True,
    weights=None,
    input_shape=input_shape,
    classes=N_ACTIONS,
    classifier_activation=None
)

# clone_model copies the architecture only and re-initialises the weights,
# so explicitly start the target network as an exact copy of the online one.
target_model = tf.keras.models.clone_model(model)
target_model.set_weights(model.get_weights())
# target_model_weights = model.save_weights("./target_model_weights")
TARGET_MODEL_UPDATE_FREQUENCY = 1000
tr_weights = model.trainable_weights

total_params = sum(np.prod(w.get_shape().as_list()) for w in tr_weights)
print(total_params)

# `lr` is a deprecated alias; `learning_rate` is the supported keyword.
opt = tf.keras.optimizers.Adam(learning_rate=2e-4)


  import imp
2022-04-06 21:36:19.711469: I tensorflow/stream_executor/cuda/cuda_gpu_executor.cc:936] successful NUMA node read from SysFS had negative value (-1), but there must be at least one NUMA node, so returning NUMA node zero
2022-04-06 21:36:19.738816: I tensorflow/stream_executor/cuda/cuda_gpu_executor.cc:936] successful NUMA node read from SysFS had negative value (-1), but there must be at least one NUMA node, so returning NUMA node zero
2022-04-06 21:36:19.739159: I tensorflow/stream_executor/cuda/cuda_gpu_executor.cc:936] successful NUMA node read from SysFS had negative value (-1), but there must be at least one NUMA node, so returning NUMA node zero
A.L.E: Arcade Learning Environment (version 0.7.4+069f8bd)
[Powered by Stella]
  deprecation(
  deprecation(
  input_shape = imagenet_utils.obtain_input_shape(


PhysicalDevice(name='/physical_device:GPU:0', device_type='GPU')


2022-04-06 21:36:19.861869: I tensorflow/core/platform/cpu_feature_guard.cc:151] This TensorFlow binary is optimized with oneAPI Deep Neural Network Library (oneDNN) to use the following CPU instructions in performance-critical operations:  AVX2 FMA
To enable them in other operations, rebuild TensorFlow with the appropriate compiler flags.
2022-04-06 21:36:19.862905: I tensorflow/stream_executor/cuda/cuda_gpu_executor.cc:936] successful NUMA node read from SysFS had negative value (-1), but there must be at least one NUMA node, so returning NUMA node zero
2022-04-06 21:36:19.863171: I tensorflow/stream_executor/cuda/cuda_gpu_executor.cc:936] successful NUMA node read from SysFS had negative value (-1), but there must be at least one NUMA node, so returning NUMA node zero
2022-04-06 21:36:19.863283: I tensorflow/stream_executor/cuda/cuda_gpu_executor.cc:936] successful NUMA node read from SysFS had negative value (-1), but there must be at least one NUMA node, so returning NUMA node zer

4015264


  super(Adam, self).__init__(name, **kwargs)


In [4]:
def preprocess_observation(obs):
    """Scale an observation by the pixel value range.

    Args:
        obs: observation (or stack of observations) to normalise.

    Returns:
        `obs` divided by the module-level `OBS_RANGE`.
    """
    scaled = obs / OBS_RANGE
    return scaled

In [5]:
def simulate(env, action): # big but finite MDP
    """Repeat `action` for up to N_INPUT_FRAMES emulator steps and stack the frames.

    Each returned frame is written into its own channel slot of the shared
    module-level buffer `obs` (frame i occupies channels
    [i * depth, (i + 1) * depth), where depth is the frame's channel count).
    Stepping stops early if the environment reports `done`; in that case the
    remaining slots keep whatever `obs` held before the call.

    Args:
        env: environment exposing `step(action)` returning
            `(observation, reward, done, info)`.
        action: action passed unchanged to every `env.step` call.

    Returns:
        Tuple `(p_obs, k_reward, done)`: the preprocessed frame stack, the sum
        of the rewards collected over the repeated steps, and the last `done`
        flag.
    """
    i = 0
    k_reward = 0
    done = False
    while i < N_INPUT_FRAMES and not done:

        observation, reward, done, info = env.step(action)
        # Advance by the full channel depth of a frame; stepping the slice by
        # one channel would make consecutive frames overwrite each other.
        depth = observation.shape[-1]
        obs[:, :, i * depth:(i + 1) * depth] = observation
        i += 1
        k_reward += reward

    p_obs = preprocess_observation(obs)
    return p_obs, k_reward, done

In [6]:
# @tf.function
def process_gradient(g):
    """Clip a gradient element-wise to the interval [-1, 1].

    Args:
        g: gradient tensor, or None when no gradient is available for a
            variable.

    Returns:
        The clipped gradient, or None if `g` is None.
    """
    if g is None:
        return None
    # The previous maximum(1., minimum(-1., g)) had the bounds swapped and
    # evaluated to 1 for every input.
    return tf.clip_by_value(g, clip_value_min=-1.0, clip_value_max=1.0)

In [7]:
# Number of episodes to play and hard cap on agent steps within one episode.
N_EPISODES = int(1e6)
MAX_STEPS_PER_EPISODE = int(1e3)

total_frames = 0                  # agent steps taken so far (one step = one simulate() call)
target_model_update_frames = 0    # training steps since the last target-network sync

try:
    for episode in range(N_EPISODES):

        # Seed only the first reset: re-seeding with the same value on every
        # episode would replay identical environment randomness each time.
        env.reset(seed=42 if episode == 0 else None)

        # The network consumes a stack of frames, so the first state of an
        # episode is built by repeating one random action.
        action = env.action_space.sample()
        p_obs, _, is_end_state = simulate(env, action)
        # float32 halves the memory held by the replay buffer.
        # NOTE(review): full-resolution stacked frames are still large; a
        # big buffer may not fit in RAM.
        p_obs = p_obs.astype(np.float32)

        for _ in range(MAX_STEPS_PER_EPISODE):

            # Never step an environment that already reported termination.
            if is_end_state:
                break

            warming_up = total_frames < REPLAY_START_SIZE

            if warming_up:
                # random sampling the first "REPLAY_START_SIZE" steps
                action = env.action_space.sample()
            else:
                # linearly annealed epsilon-greedy policy
                fraction_frames = (total_frames - REPLAY_START_SIZE) / FINAL_EXPLORATION_FRAME
                eps = max(
                    (1 - fraction_frames) * INIT_EPS + fraction_frames * END_EPS,
                    END_EPS
                )
                if np.random.uniform() < eps:
                    action = env.action_space.sample()
                else:
                    # add a batch axis, then pick the action with the largest Q-value
                    q_values = model(p_obs[np.newaxis, ...], training=False)
                    action = int(tf.argmax(q_values, axis=1)[0].numpy())

            # next observation
            next_p_obs, reward, is_end_state = simulate(env, action)
            next_p_obs = next_p_obs.astype(np.float32)
            # deque automatically pops from the opposite side if maxlen is surpassed
            experience_buffer.append((p_obs, action, reward, next_p_obs, is_end_state))

            if not warming_up and len(experience_buffer) >= BATCH_SIZE:
                # sample BATCH_SIZE tuples from the buffer
                transition_batch = sample(experience_buffer, BATCH_SIZE)

                batched_old_observation = tf.convert_to_tensor(
                    np.stack([transition[0] for transition in transition_batch]))
                batched_action = tf.convert_to_tensor(
                    np.array([transition[1] for transition in transition_batch], dtype=np.int32))
                batched_reward = tf.convert_to_tensor(
                    np.array([transition[2] for transition in transition_batch], dtype=np.float32))
                batched_new_observation = tf.convert_to_tensor(
                    np.stack([transition[3] for transition in transition_batch]))
                batched_done = tf.convert_to_tensor(
                    np.array([transition[4] for transition in transition_batch], dtype=np.float32))

                # max Q-value of the next state according to the target network
                batch_q_values = tf.math.reduce_max(
                    target_model(batched_new_observation, training=False), axis=1)
                # Bootstrap only from non-terminal next states: for a terminal
                # transition the target is just the reward.
                masked_batch_q_values = DISCOUNT_FACTOR * (1.0 - batched_done) * batch_q_values
                target = batched_reward + masked_batch_q_values

                with tf.GradientTape() as tape:
                    # Q-value of the action that was actually taken in each
                    # sampled transition (not the max over actions).
                    q_all = model(batched_old_observation)
                    q_taken = tf.math.reduce_sum(
                        q_all * tf.one_hot(batched_action, N_ACTIONS), axis=1)
                    # summed squared TD error over the batch
                    loss = tf.math.reduce_sum(tf.math.square(target - q_taken))

                grads = tape.gradient(loss, tr_weights)
                # clip the gradients in [-1;+1]
                processed_grads = [process_gradient(g) for g in grads]
                opt.apply_gradients(zip(processed_grads, tr_weights))

                if target_model_update_frames == TARGET_MODEL_UPDATE_FREQUENCY:
                    target_model_update_frames = 0
                    # keep an on-disk checkpoint, but sync the target network
                    # in memory instead of reloading it from disk
                    model.save_weights("./weights/update_weights")
                    target_model.set_weights(model.get_weights())
                    print("Total frame", total_frames, "updating weights")
                target_model_update_frames += 1

            # The next state becomes the current state, so no emulator frames
            # (or their rewards) are thrown away between steps.
            p_obs = next_p_obs
            total_frames += 1
finally:
    # Release the emulator even if training is interrupted.
    env.close()

inside


2022-04-06 21:36:27.645531: I tensorflow/stream_executor/cuda/cuda_dnn.cc:368] Loaded cuDNN version 8302


inside
inside
inside
inside
inside
inside
inside
inside


KeyboardInterrupt: 