# **Import Libraries**

In [1]:
import gymnasium as gym
import numpy as np
import tensorflow as tf
import time
from collections import deque
import matplotlib.pyplot as plt
import numpy as np
from IPython.display import clear_output

from modules.agents import DQNAgent
from modules.logger import EpisodeLogger
import modules.rewards as rewards
import modules.process as process

In [2]:
# Ask TensorFlow for the GPU devices it can see and report whether any exist.
physical_devices = tf.config.list_physical_devices('GPU')

if not physical_devices:
    print("GPU is not available")
else:
    print("GPU is available")
    print(physical_devices)

GPU is available
[PhysicalDevice(name='/physical_device:GPU:0', device_type='GPU')]


# **Frogger Environment**

In [3]:
# Build the Frogger environment through gymnasium.
# Options noted by the author:
#   id          -> "ALE/Frogger-ram-v5" or "ALE/Frogger-v5"
#   obs_type    -> ram, grayscale, rgb
#   render_mode -> rgb_array or human
#   difficulty  -> [0, 1]
#   mode        -> [0, 1, 2]
env = gym.make(
    "ALE/Frogger-v5",
    obs_type="grayscale",
    render_mode="rgb_array",
    difficulty=0,
    mode=0,
)

# Reset, take one step with action 0 and show the raw step result,
# followed by the observation and action space descriptions.
env.reset()
print(env.step(0))
print(f"State Frame Size: {env.observation_space}")
print(f"Number Of Actions: {env.action_space.n}")

# One-hot row per action; both names refer to the same array object.
possible_actions = np.eye(env.action_space.n, dtype=int)
actions_space = possible_actions
print(f"Possible Actions: \n {actions_space}")

# Reset again, step once with action 1 and inspect the returned frame
# (element 0 of the step result) and its shape.
env.reset()
observation = env.step(1)
print(f"Obervation: \n {observation[0]}")
print(observation[0].shape)

(array([[  0,   0,   0, ...,   0,   0,   0],
       [  0,   0,   0, ...,   0,   0,   0],
       [  0,   0,   0, ...,   0,   0,   0],
       ...,
       [104, 104, 104, ..., 104, 104, 104],
       [104, 104, 104, ..., 104, 104, 104],
       [104, 104, 104, ..., 104, 104, 104]], dtype=uint8), 0.0, False, False, {'lives': 4, 'episode_frame_number': 4, 'frame_number': 4})
State Frame Size: Box(0, 255, (210, 160), uint8)
Number Of Actions: 5
Possible Actions: 
 [[1 0 0 0 0]
 [0 1 0 0 0]
 [0 0 1 0 0]
 [0 0 0 1 0]
 [0 0 0 0 1]]
Obervation: 
 [[  0   0   0 ...   0   0   0]
 [  0   0   0 ...   0   0   0]
 [  0   0   0 ...   0   0   0]
 ...
 [104 104 104 ... 104 104 104]
 [104 104 104 ... 104 104 104]
 [104 104 104 ... 104 104 104]]
(210, 160)


# **Hyperparameters**

In [4]:
# Environment instance used below to read the observation and action space sizes.
env = gym.make("ALE/Frogger-v5", obs_type="grayscale", render_mode="rgb_array", difficulty = 0, mode = 0)

# MODEL Hyperparameters
STATE_SIZE = env.observation_space.shape[0]  # first dimension of the observation shape
ACTIONS_SIZE = env.action_space.n
ACTIONS = list(range(0, ACTIONS_SIZE))
LEARNING_RATE = 0.001 # Learning Rate (alpha)

# AGENT Hyperparameters (epsilon greedy strategy)
EPSILON = 0.95
EPSILON_MIN = 0.001 # EPSILON value where exploration stops
# The higher the rate, the longer exploration lasts. The formula noted by the author
# (EPSILON * DECAY_RATE) is a multiplicative decay, not a linear one; the decay itself
# is applied inside DQNAgent, which is not shown here.
EPSILON_DECAY_RATE = 0.9995
GAMMA = 0.975 # Discount factor (closer to 1 -> future rewards weigh more, i.e. more long-term)

# TRAINING Hyperparameters
RENDER_INTERVAL = 10 # Interval (in episodes) at which the game is rendered
TOTAL_EPISODES = 1000
REPLAY_INTERVAL = 1 # Replay every x steps (retrain model)
MINI_BATCHES_REPLAY = 64
REPLAY_BUFFER_MEMORY = 6000
STACKED_FRAMES_SIZE = 4
AVERAGE_WINDOW = 10

# Frame stack pre-filled with zero arrays. maxlen follows STACKED_FRAMES_SIZE so the
# deque capacity cannot drift away from the configured stack size.
STACKED_FRAMES = deque(
    [np.zeros((STATE_SIZE), dtype=np.int32) for i in range(STACKED_FRAMES_SIZE)],
    maxlen=STACKED_FRAMES_SIZE,
)

# **Reward Graph**

In [5]:
%matplotlib qt
average_rewards = []
rewards_per_episode = []
distance_per_episode = []

plt.ion()

# Create a figure and two subplots
fig, (ax1, ax2) = plt.subplots(2, 1, figsize=(12, 12))

# REWARD PLOT
reward_line, = ax1.plot(rewards_per_episode, label='Reward per Episode')
average_line, = ax1.plot(average_rewards, label='Moving Average Reward')
ax1.set_xlabel('Episode')
ax1.set_ylabel('Reward')
ax1.set_title('Episode Reward and Moving Average Reward over Time')
ax1.legend()

# DISTANCE PLOT
distance_line, = ax2.plot(distance_per_episode, label='AVG. Distance')
ax2.set_xlabel('Episode')
ax2.set_ylabel('AVG. Distance Travelled')
ax2.set_title('Average Distance Travelled Per Episode')
ax2.legend()

fig.savefig('figures/training.png')
plt.show()



# **Training**

In [6]:
%matplotlib qt

agent = DQNAgent(REPLAY_BUFFER_MEMORY, MINI_BATCHES_REPLAY, EPSILON, ACTIONS_SIZE, GAMMA, EPSILON_MIN, EPSILON_DECAY_RATE, LEARNING_RATE)
logger = EpisodeLogger(log_file_path="logs/episode_logs.json")

for episode in range(TOTAL_EPISODES):
    # create env (human to render game and see actions)
    if episode % RENDER_INTERVAL == 0 and episode != 0:
        env = gym.make("ALE/Frogger-v5", obs_type="grayscale", render_mode="human", difficulty = 0, mode = 0) 
    else:
        env = gym.make("ALE/Frogger-v5", obs_type="grayscale", render_mode="rgb_array", difficulty = 0, mode = 0) 
    state = env.reset()[0]
    preprocessed_state = process.preprocess_frames(state)

    # reset episode variables
    done = False
    total_reward = 0
    step_count = 0
    is_new_episode = True
    distrance_travelled = 0
    distance_before = 0
    current_lives = 4
    lives_before = 4
    episode_max_travelled_distance = 0
    episode_step_distance = 0

    # initilize episode stack
    stacked_array_state, STACKED_FRAMES = process.stack_frames(STACKED_FRAMES, preprocessed_state, is_new_episode, STATE_SIZE, STACKED_FRAMES_SIZE)
    
    while not current_lives < lives_before:
        # update step_count
        step_count += 1

        # Predict action
        if is_new_episode and step_count <= 110:
            # jsut wait for first 100 steps because of initialization 
            action = 0
            next_state, reward, done, loss_of_live, info = env.step(0)
            state = next_state
        else:
            action = agent.predict_action(stacked_array_state)
            next_state, reward, done, loss_of_live, info = env.step(action)
            next_state_preprocessed = process.preprocess_frames(next_state)
            is_new_episode = False
            
            # Do Stacking
            stacked_array_next_state, STACKED_FRAMES = process.stack_frames(STACKED_FRAMES, next_state_preprocessed, is_new_episode, STATE_SIZE, STACKED_FRAMES_SIZE)

            # updated distance
            if reward == 1:
                distrance_travelled += 1

            #update lives
            current_lives = info["lives"]

            # update reward
            total_reward = rewards.action_based_reward(total_reward, action, distrance_travelled, distance_before, current_lives, lives_before)

            # store action infromation in memory
            agent.remember(stacked_array_state, action, reward, stacked_array_next_state, done)

            # set the next state to the current state
            stacked_array_state = stacked_array_next_state

            # Do Replay
            if step_count % REPLAY_INTERVAL == 0 and not is_new_episode and len(agent.memory.buffer) > MINI_BATCHES_REPLAY:
                agent.replay()

            distance_before = distrance_travelled
            if distrance_travelled > episode_max_travelled_distance:
                episode_max_travelled_distance = distrance_travelled

            # if frog crosses the highway or dies
            if (distrance_travelled) == 6 or (current_lives < lives_before):
                # clear output of cell for every new episode
                clear_output(wait=True)
                
                distance_per_episode.append(distrance_travelled)

                if distrance_travelled == 6:
                    print(f"👑: Distance travelled this run: {distrance_travelled} steps forward & {current_lives} lives left!")
                if current_lives < lives_before:
                    print(f"Game Over: Distance travelled this run: {distrance_travelled} steps forward & {current_lives} lives left!")
                    distrance_travelled = 0  
                    lives_before = current_lives + 1

                agent.update_target_model()

                rewards_per_episode.append(total_reward)

                if len(rewards_per_episode) >= AVERAGE_WINDOW:
                    moving_average = np.mean(rewards_per_episode[-AVERAGE_WINDOW:])
                else:
                    moving_average = np.mean(rewards_per_episode)
                average_rewards.append(moving_average)

                step_text = f"Episode: {episode}/{TOTAL_EPISODES}, Total Reward: {total_reward}, Moving AVG. Reward: {moving_average}, Max trav. dist.: {episode_max_travelled_distance}, Epsilon: {agent.EPSILON:.2}"

                # Update REWARD PLOT
                reward_line.set_data(range(episode + 1), rewards_per_episode[:episode + 1])
                average_line.set_data(range(episode + 1), average_rewards[:episode + 1])
                ax1.set_xlim(0, episode + 1)
                ax1.set_title(step_text)
                y_min = min(min(rewards_per_episode[:episode + 1]), min(average_rewards[:episode + 1])) - 10
                y_max = max(max(rewards_per_episode[:episode + 1]), max(average_rewards[:episode + 1])) + 10
                ax1.set_ylim(y_min, y_max)
                
                # Update DISTANCE PLOT
                distance_line.set_data(range(episode + 1), distance_per_episode[:episode + 1])
                ax2.set_xlim(0, episode + 1)
                y_min_distance = 0
                y_max_distance = max(distance_per_episode[:episode + 1]) + 1
                ax2.set_ylim(y_min_distance, y_max_distance)
                
                # Redraw the figure
                fig.canvas.draw()
                fig.canvas.flush_events()
                
                # Save the figure
                fig.savefig(f'figures/training.png')
                
                time.sleep(0.1)

                print(step_text)
                logger.log_episode(episode, total_reward, moving_average, episode_max_travelled_distance, agent.EPSILON, step_count, episode_step_distance, distance_per_episode)
                break


Game Over: Distance travelled this run: 2 steps forward & 3 lives left!
Episode: 42/1000, Total Reward: -19.0, Moving AVG. Reward: -24.1, Max trav. dist.: 2, Epsilon: 0.72
Replay


 28%|██▊       | 18/64 [00:10<00:27,  1.65it/s]


KeyboardInterrupt: 