In [145]:
import keras
from keras import layers
import gymnasium as gym
from gymnasium.wrappers.frame_stack import FrameStack
from gymnasium.wrappers.atari_preprocessing import AtariPreprocessing
import numpy as np
import tensorflow as tf
import ale_py
import logging
from gymnasium.wrappers import RecordEpisodeStatistics
import json

In [146]:
from gymnasium.vector import SyncVectorEnv 


In [147]:
from gymnasium.spaces import Sequence, Box

In [148]:

gym.register_envs(ale_py)


In [149]:
def create_env():
    def make_env():
        env = gym.make('SpaceInvadersNoFrameskip-v4', render_mode='rgb_array')
        env = AtariPreprocessing(env)
        env = FrameStack(env, 4)
        env = RecordEpisodeStatistics(env)
        trigger = lambda t: t % 50 == 0
        env = gym.wrappers.RecordVideo(env, video_folder="./Outputs/videos", name_prefix='training', episode_trigger=trigger, disable_logger=False)
        return env
    return make_env

In [150]:
# I have 4 cores, so num_envs is going to be 2

num_envs = 2
envs = SyncVectorEnv([create_env() for _ in range(num_envs)])

  logger.warn(


In [151]:
envs

SyncVectorEnv(2)

In [152]:

gamma = 0.99 # discount_factor
epsilon = 0.5
epsilon_min = 0.1
epsilon_max = 0.5
epsilon_interval = (epsilon_max - epsilon_min)
batch_size = 64 # increased for larger data flow
max_steps_per_episode = 10000 
max_episodes = 0 
max_frames = 1e7 

#env = AtariPreprocessing(env)

#env = FrameStack(env, 4)


#env = RecordEpisodeStatistics(env)

num_actions = 4

In [153]:
#nv = gym.make('SpaceInvadersNoFrameskip-v4', render_mode="rgb_array")

In [154]:
model = keras.models.load_model("breakout_qmodel_7764.keras", safe_mode = False)


In [155]:

saved_model = keras.models.load_model("breakout_qmodel_7764.keras", safe_mode = False)


In [156]:
def create_q_model():
    return keras.Sequential(
        [
            layers.Lambda
            (
                lambda tensor: keras.ops.transpose(tensor, [0, 2, 3, 1]),
                output_shape=(84, 84, 4),
                input_shape=(4, 84, 84)
            ),
            layers.Conv2D(32, kernel_size=8, strides=4, activation="leaky_relu"),
            layers.Conv2D(64, kernel_size=4, strides=2, activation="leaky_relu"),
            layers.Conv2D(64, kernel_size=3, strides=1, activation="leaky_relu"),
            layers.Flatten(),
            layers.Dense(512, activation="leaky_relu"),
            layers.Dense(num_actions, activation="linear")
        ]
    )
model = create_q_model()
model_target = create_q_model()
optimizer = keras.optimizers.Adam(learning_rate=0.00025, clipnorm=1.0)
loss_function = keras.losses.Huber()



  super().__init__(**kwargs)


In [157]:
for layer in model.layers:
    if layer.name in [l.name for l in saved_model.layers]:
        layer.set_weights(saved_model.get_layer(layer.name).get_weights())

for layer in model_target.layers:
    if layer.name in [l.name for l in saved_model.layers]:
        layer.set_weights(saved_model.get_layer(layer.name).get_weights())

In [158]:
max_memory_length = 10000
replay_buffer = {
    "state": [],
    "state_next": [],
    "action": [],
    "reward": [],
    "done": [],
}

def add_to_replay_buffer(state, state_next, action, reward, done):
    if len(replay_buffer["state"]) >= max_memory_length:
        for key in replay_buffer.keys():
            replay_buffer[key].pop(0)
    replay_buffer["state"].append(state)
    replay_buffer["state_next"].append(state_next)
    replay_buffer["action"].append(action)
    replay_buffer["reward"].append(reward)
    replay_buffer["done"].append(done)

In [159]:

#action_history = []
#state_history = []
#state_next_history = []
#rewards_history = []
#done_history = []
episode_reward_history = []
running_reward = 0
episode_count = 7764
frame_count = 4930000
statistics = []


In [160]:
def sample_from_replay_buffer(batch_size):
    indices = np.random.choice(len(replay_buffer["state"]), size=batch_size)
    state_sample = np.array([replay_buffer["state"][i] for i in indices])
    state_next_sample = np.array([replay_buffer["state_next"][i] for i in indices])
    action_sample = np.array([replay_buffer["action"][i] for i in indices])
    reward_sample = np.array([replay_buffer["reward"][i] for i in indices])
    done_sample = np.array([float(replay_buffer["done"][i]) for i in indices])
    return state_sample, state_next_sample, action_sample, reward_sample, done_sample


In [161]:
epsilon_random_frames = 10000
# Number of frames for exploration
epsilon_greedy_frames = 1000000.0 - frame_count
# Maximum replay length
# Note: The Deepmind paper suggests 1000000 however this causes memory issues
max_memory_length = 10000
# Train the model after 4 actions
update_after_actions = 4
# How often to update the target network
update_target_network = 10000

In [162]:
while frame_count < max_frames:
    observations, infos = envs.reset()
    states = np.array(observations)
    episode_rewards = np.zeros(num_envs)

    for timestep in range(max_steps_per_episode):
        frame_count += num_envs

        # Epsilon-greedy policy
        if frame_count < 10000 or epsilon > np.random.rand(1)[0]:
            actions = np.random.choice(num_actions, size=num_envs)
        else:
            state_tensor = tf.convert_to_tensor(states)
            q_values = model(state_tensor, training=False)
            actions = tf.argmax(q_values, axis=1).numpy()

        epsilon -= epsilon_interval / epsilon_greedy_frames
        epsilon = max(epsilon, epsilon_min)

        # Take actions
        next_states, rewards, dones, _, infos = envs.step(actions)
        next_states = np.array(next_states)

        # Store in replay buffer
        for env_idx in range(num_envs):
            add_to_replay_buffer(
                states[env_idx],
                next_states[env_idx],
                actions[env_idx],
                rewards[env_idx],
                dones[env_idx],
            )

        states = next_states
        print (replay_buffer)
        episode_rewards += rewards

        # Training
        if frame_count % update_after_actions == 0 and len(replay_buffer["state"]) > batch_size:
            state_sample, state_next_sample, action_sample, reward_sample, done_sample = sample_from_replay_buffer(batch_size)

            future_rewards = model_target.predict(state_next_sample, verbose=0)
            updated_q_values = reward_sample + gamma * tf.reduce_max(future_rewards, axis=1) * (1 - done_sample)

            masks = tf.one_hot(action_sample, num_actions)

            with tf.GradientTape() as tape:
                q_values = model(state_sample)
                q_action = tf.reduce_sum(q_values * masks, axis=1)
                loss = loss_function(updated_q_values, q_action)

            grads = tape.gradient(loss, model.trainable_variables)
            optimizer.apply_gradients(zip(grads, model.trainable_variables))

        # Update target model
        if frame_count % update_target_network == 0:
            model_target.set_weights(model.get_weights())
            model_target.save(f"./Outputs/spacedefender_{int(frame_count/1000)}.keras")
            best_score = np.max(episode_reward_history) if episode_reward_history else 0
            statistics.append(
                {   
                    "best_score": best_score,
                     "running_reward": running_reward,
                     "episode": episode_count,
                    "frame_count": frame_count,
                }
            )
            with open(f"./Outputs/statistics_{episode_count}.json", "w") as json_file:
                json.dump(statistics, json_file)
            print(f"Frame: {frame_count}, Running Reward: {running_reward:.2f}, Best Score: {np.max(episode_rewards)}, Running Episode: {episode_count}")

        # Episode end
        if dones.all():
            break

    # Logging
    running_reward = np.mean(episode_rewards)
    episode_reward_history.append(episode_rewards)
    if len(episode_reward_history) > 100:
        del episode_reward_history[:1]
    episode_count += num_envs

    if running_reward > 1600:  # Condition to consider the task solved
        print("Solved at episode {}!".format(episode_count))
        break

    if (max_frames <= frame_count):
        print(f"Stopped at frame {frame_count}!")
        break




  logger.warn(


[np.float64(0.0), np.float64(0.0)]
[np.float64(0.0), np.float64(0.0), np.float64(0.0), np.float64(0.0)]
[np.float64(0.0), np.float64(0.0), np.float64(0.0), np.float64(0.0), np.float64(0.0), np.float64(0.0)]
[np.float64(0.0), np.float64(0.0), np.float64(0.0), np.float64(0.0), np.float64(0.0), np.float64(0.0), np.float64(0.0), np.float64(0.0)]
[np.float64(0.0), np.float64(0.0), np.float64(0.0), np.float64(0.0), np.float64(0.0), np.float64(0.0), np.float64(0.0), np.float64(0.0), np.float64(0.0), np.float64(0.0)]
[np.float64(0.0), np.float64(0.0), np.float64(0.0), np.float64(0.0), np.float64(0.0), np.float64(0.0), np.float64(0.0), np.float64(0.0), np.float64(0.0), np.float64(0.0), np.float64(0.0), np.float64(0.0)]
[np.float64(0.0), np.float64(0.0), np.float64(0.0), np.float64(0.0), np.float64(0.0), np.float64(0.0), np.float64(0.0), np.float64(0.0), np.float64(0.0), np.float64(0.0), np.float64(0.0), np.float64(0.0), np.float64(0.0), np.float64(0.0)]
[np.float64(0.0), np.float64(0.0), np.flo

                                                                         

MoviePy - Done !
MoviePy - video ready d:\workdir\DeepQLab\Outputs\videos\training-episode-0.mp4
[np.float64(0.0), np.float64(0.0), np.float64(0.0), np.float64(0.0), np.float64(0.0), np.float64(0.0), np.float64(0.0), np.float64(0.0), np.float64(0.0), np.float64(0.0), np.float64(0.0), np.float64(0.0), np.float64(0.0), np.float64(0.0), np.float64(0.0), np.float64(0.0), np.float64(0.0), np.float64(0.0), np.float64(0.0), np.float64(0.0), np.float64(0.0), np.float64(0.0), np.float64(0.0), np.float64(0.0), np.float64(0.0), np.float64(0.0), np.float64(0.0), np.float64(0.0), np.float64(0.0), np.float64(0.0), np.float64(0.0), np.float64(0.0), np.float64(0.0), np.float64(0.0), np.float64(0.0), np.float64(0.0), np.float64(0.0), np.float64(0.0), np.float64(0.0), np.float64(0.0), np.float64(0.0), np.float64(0.0), np.float64(0.0), np.float64(0.0), np.float64(0.0), np.float64(0.0), np.float64(0.0), np.float64(0.0), np.float64(0.0), np.float64(0.0), np.float64(0.0), np.float64(0.0), np.float64(0.0), n

KeyboardInterrupt: 