# **Import Libraries**

In [1]:
import gymnasium as gym
import numpy as np
import tensorflow as tf
import time
from collections import deque
import matplotlib.pyplot as plt
import numpy as np
from IPython.display import clear_output

from modules.agents import DQNAgent
from modules.logger import EpisodeLogger
import modules.rewards as rewards
import modules.processing as processing

In [2]:
# Ask TensorFlow which GPU devices it can see and report the result.
physical_devices = tf.config.list_physical_devices('GPU')

if not physical_devices:
    # Empty list: TensorFlow found no GPU.
    print("GPU is not available")
else:
    print("GPU is available")
    print(physical_devices)

GPU is available
[PhysicalDevice(name='/physical_device:GPU:0', device_type='GPU')]


# **Freeway Environment**

In [3]:
# Create a throwaway Freeway environment (RAM observations) purely to inspect
# the observation/action spaces and to look at a raw observation.
env = gym.make(
    "ALE/Freeway-v5",         # environment variant
    obs_type="ram",           # ram, grayscale, rgb
    render_mode="rgb_array",  # rgb_array or human
    difficulty=0,             # [0, 1]
    mode=0,                   # [0]
)

# Step once with action 0 and print the raw step tuple returned by the env.
env.reset()
print(env.step(0))
print(f"State Frame Size: {env.observation_space}")
print(f"Number Of Actions: {env.action_space.n}")

# One-hot encoding of the discrete actions: row i encodes action i.
# np.identity already returns an integer ndarray, so no list round-trip is needed.
actions_space = possible_actions = np.identity(env.action_space.n, dtype=int)
print(f"Possible Actions: \n {actions_space}")

# `observation` holds the full step tuple; element 0 is the RAM observation itself.
env.reset()
observation = env.step(1)
print(f"Obervation: \n {observation[0]}")
print(observation[0].shape)

# This environment is only used for inspection; release it so the emulator
# instance does not linger once `env` is rebound by a later cell.
env.close()

(array([  0,   4, 132,   0,  15,  15,   0, 255,  74,  30,  12,   6,   0,
         8,   6,   6, 255, 255,   7,   7,  26,   0, 255,  80,  64,  48,
        32,  16, 144, 160, 176, 192, 208,   2,   1,   0,   1,   0,   0,
         0,   2,   0,   1,  80,  80,  80,  64,  48,  10, 234, 234, 218,
       218, 122, 138,  80,  80,  80,  80,  80,  80,  80,  80,  80,  80,
        80,  80,  80,  80,  80,  80,  80,  80,  80,  80,  80,  80,  26,
       216,  68, 136,  36, 130,  74,  18, 220,  66, 189, 247, 122, 247,
       122, 247,  80, 247,  80, 247,   0, 247,   0, 247,   0,   0,   0,
         0,   0,   0,   0,   1,   1,   1,   2,   3, 156, 158, 158, 159,
       159,   0,  16,   5,   1,  80, 255,  87, 246,  75, 244], dtype=uint8), 0.0, False, False, {'lives': 0, 'episode_frame_number': 4, 'frame_number': 4})
State Frame Size: Box(0, 255, (128,), uint8)
Number Of Actions: 3
Possible Actions: 
 [[1 0 0]
 [0 1 0]
 [0 0 1]]
Obervation: 
 [  0   4 132   0  14  15   0 255  74  30  12   6   0   8  10   6 25

# **Hyperparameters**

In [4]:
# Gymnasium id of the environment used for sizing the model and for training.
EVIROMENT_VARIANT = "ALE/Freeway-v5"

# Environment instance used here only to read the observation/action space sizes.
env = gym.make(EVIROMENT_VARIANT, obs_type="ram", difficulty=0, mode=0)

# MODEL hyperparameters
STATE_SIZE = env.observation_space.shape[0]  # length of the RAM observation vector
ACTIONS_SIZE = env.action_space.n            # number of discrete actions
ACTIONS = list(range(ACTIONS_SIZE))
LEARNING_RATE = 0.001  # learning rate (alpha)

# AGENT hyperparameters (epsilon-greedy strategy)
EPSILON = 0.99  # initial exploration rate
EPSILON_MIN = 0.001  # lower bound for epsilon (presumably where decay stops; see DQNAgent)
# The higher the rate, the longer exploration lasts. Presumably applied multiplicatively
# (EPSILON * EPSILON_DECAY_RATE), i.e. geometric rather than linear decay - confirm in DQNAgent.
EPSILON_DECAY_RATE = 0.9995
GAMMA = 0.975  # discount factor (closer to 1 -> future rewards weigh more, agent is more long-term)

# TRAINING hyperparameters
RENDER_INTERVAL = 10  # every RENDER_INTERVAL-th episode is rendered
TOTAL_EPISODES = 1000
REPLAY_INTERVAL = 4  # replay (retrain the model) every REPLAY_INTERVAL steps
MINI_BATCHES_REPLAY = 32
REPLAY_BUFFER_MEMORY = 6000
STACKED_FRAMES_SIZE = 4
AVERAGE_WINDOW = 10  # number of episodes in the moving-average reward

# Frame stack pre-filled with zero states. maxlen follows STACKED_FRAMES_SIZE so the
# deque capacity stays in sync with the number of frames it is initialised with.
STACKED_FRAMES = deque(
    [np.zeros((STATE_SIZE), dtype=np.int32) for _ in range(STACKED_FRAMES_SIZE)],
    maxlen=STACKED_FRAMES_SIZE,
)

# **Reward Graph**

In [5]:
%matplotlib qt
average_rewards = []
rewards_per_episode = []
distance_per_episode = []

plt.ion()

# Create a figure and two subplots
fig, (ax1, ax2) = plt.subplots(2, 1, figsize=(12, 12))

# REWARD PLOT
reward_line, = ax1.plot(rewards_per_episode, label='Reward per Episode')
average_line, = ax1.plot(average_rewards, label='Moving Average Reward')
ax1.set_xlabel('Episode')
ax1.set_ylabel('Reward')
ax1.set_title('Episode Reward and Moving Average Reward over Time')
ax1.legend()

# DISTANCE PLOT
distance_line, = ax2.plot(distance_per_episode, label='Distance')
ax2.set_xlabel('Episode')
ax2.set_ylabel('Distance')
ax2.set_title('Distance Travelled Per Episode')
ax2.legend()

fig.savefig('figures/training.png')
plt.show()

# **Training**

In [6]:
%matplotlib qt

agent = DQNAgent(REPLAY_BUFFER_MEMORY, MINI_BATCHES_REPLAY, EPSILON, ACTIONS_SIZE, GAMMA, EPSILON_MIN, EPSILON_DECAY_RATE, LEARNING_RATE)
logger = EpisodeLogger(log_file_path="logs/episode_logs.json")

for episode in range(TOTAL_EPISODES):
    print(f"EPISODE: {episode}")
    # create env (human to render game and see actions)
    if episode % RENDER_INTERVAL == 0 and episode != 0:
        env = gym.make(EVIROMENT_VARIANT, obs_type="ram", render_mode="human", difficulty=0, mode=0) 
    else:
        env = gym.make(EVIROMENT_VARIANT, obs_type="ram", difficulty=0, mode=0) 
        
    state = env.reset()[0]
    preprocessed_state = processing.preprocess_ram(state)

    # reset episode variables
    step_count = 0
    max_distance_episode = 0
    distance_before = 0
    is_new_episode = True
    total_reward = 0

    # initilize episode stack
    #stacked_array_state, STACKED_FRAMES = process.stack_frames(STACKED_FRAMES, preprocessed_state, is_new_episode, STATE_SIZE, STACKED_FRAMES_SIZE)
    
    while True:
        # update step_count
        step_count += 1

        # Predict action
        action = agent.predict_action(preprocessed_state)
        next_state, game_reward, game_done, game_loss_of_live, game_info = env.step(action)

        y_pos = env.ale.getRAM()[14]
        crashed = 1 if env.ale.getRAM()[16] != 255 else 0 # RAM(16) =:= Collision Lane

        next_state_preprocessed = processing.preprocess_ram(next_state)

        # Do Stacking
        #stacked_array_next_state, STACKED_FRAMES = process.stack_frames(STACKED_FRAMES, next_state_preprocessed, is_new_episode, STATE_SIZE, STACKED_FRAMES_SIZE)

        # updated distance
        if y_pos > max_distance_episode:
            max_distance_episode = y_pos

        # update reward
        total_reward = rewards.action_based_reward(total_reward, crashed, action, y_pos, max_distance_episode)

        # store action infromation in memory
        agent.remember(preprocessed_state, action, game_reward, next_state_preprocessed, game_done)

        # set the next state to the current state
        preprocessed_state = next_state_preprocessed

        # Do Replay
        if step_count % REPLAY_INTERVAL == 0 and len(agent.memory.buffer) > MINI_BATCHES_REPLAY:
            agent.replay()

        # END EPISODE IF CHICKEN COLLIDES
        if crashed == 1 or y_pos >= 175:
            # clear output of cell for every new episode
            clear_output(wait=True)

            # update ntework
            agent.update_target_model()

            rewards_per_episode.append(total_reward)
            distance_per_episode.append(y_pos)

            if len(rewards_per_episode) >= AVERAGE_WINDOW:
                moving_average = np.mean(rewards_per_episode[-AVERAGE_WINDOW:])
            else:
                moving_average = np.mean(rewards_per_episode)
            average_rewards.append(moving_average)

            step_text = f"Episode: {episode}/{TOTAL_EPISODES}, Total Reward: {total_reward}, Moving AVG. Reward: {moving_average}, Distance: {y_pos}, Epsilon: {agent.EPSILON:.2}"

            # Update REWARD PLOT
            reward_line.set_data(range(episode + 1), rewards_per_episode[:episode + 1])
            average_line.set_data(range(episode + 1), average_rewards[:episode + 1])
            ax1.set_xlim(0, episode + 1)
            ax1.set_title(step_text)
            y_min = min(min(rewards_per_episode[:episode + 1]), min(average_rewards[:episode + 1])) - 10
            y_max = max(max(rewards_per_episode[:episode + 1]), max(average_rewards[:episode + 1])) + 10
            ax1.set_ylim(y_min, y_max)
            
            # Update DISTANCE PLOT
            distance_line.set_data(range(episode + 1), distance_per_episode[:episode + 1])
            ax2.set_xlim(0, episode + 1)
            y_min_distance = 0
            y_max_distance = max(distance_per_episode[:episode + 1]) + 1
            ax2.set_ylim(y_min_distance, y_max_distance)
            
            # Redraw the figure
            fig.canvas.draw()
            fig.canvas.flush_events()
            
            # Save the figure
            fig.savefig(f'figures/training.png')
            
            time.sleep(0.1)

            print(step_text)
            #logger.log_episode(episode, total_reward, moving_average, agent.EPSILON, step_count, distance_per_episode)
            break

        is_new_episode = False
        distance_before = y_pos


Episode: 3/1000, Total Reward: -33.0, Moving AVG. Reward: -38.375, Distance: 31, Epsilon: 0.77
EPISODE: 4
[2 0 0 0 0 0 0 0 0 0 0 0]
[2 0 0 0 0 0 0 0 0 0 0 0]
[3 0 0 0 0 0 0 0 0 0 0 0]
[4 0 0 0 0 0 0 0 0 0 0 0]
[6 0 0 0 0 0 0 0 0 0 0 0]
Replay


100%|██████████| 32/32 [00:04<00:00,  7.12it/s]


[7 0 0 0 0 0 0 0 0 0 0 0]
[8 0 0 0 0 0 7 0 0 0 0 0]
[8 0 0 0 0 0 9 0 0 0 0 0]
[10  0  0  0  0  0 10  0  0  0  0  0]
Replay


100%|██████████| 32/32 [00:04<00:00,  6.50it/s]


[ 8  0  0  0  0  0 11  0  0  0  0  0]
[10  0  0  0  0  6 13  0  0  0  0  0]
[11  0  0  0  0  7 14  0  0  0  0  0]
[11  0  0  0  0  8 15  0  0  0  0  0]
Replay


100%|██████████| 32/32 [00:04<00:00,  7.51it/s]


[11  0  0  0  0  8 17  0  0  0  0  0]
[10  0  0  0  0  9 18  0  0  0  0  0]
[11  0  0  0  6 10 19  0  0  0  0  0]
[11  0  0  0  7 10 21  0  0  0  0  0]
Replay


100%|██████████| 32/32 [00:04<00:00,  7.62it/s]


[10  0  0  0  7 11 22  0  0  0  0  0]
[10  0  0  0  8 12 23  0  0  0  0  0]
[ 8  0  0  0  8 12 25  0  0  0  0  0]
[ 7  0  0  6  9 13 26 26  0  0  0  0]
Replay


100%|██████████| 32/32 [00:04<00:00,  7.25it/s]


[ 7  0  0  7  9 14  0 25  0  0  0  0]
[ 8  0  0  7  9 14  0 24  0  0  0  0]
[10  0  0  7 10 15  0 22  0  0  0  0]
[10  0  0  8 10 16  0 21  0  0  0  0]
Replay


 25%|██▌       | 8/32 [00:01<00:03,  6.04it/s]