# **Import Libraries**

In [1]:
import gymnasium as gym
import numpy as np
import tensorflow as tf
from tensorflow.keras import layers, models
import random
from collections import deque
from tqdm import tqdm
import matplotlib.pyplot as plt
from ipywidgets import Output
import sys

In [2]:
physical_devices = tf.config.list_physical_devices('GPU')

if physical_devices:
    print("GPU is available")
    print(physical_devices)
else:
    print("GPU is not available")

GPU is available
[PhysicalDevice(name='/physical_device:GPU:0', device_type='GPU')]


# **Frogger Enviroment**

In [3]:
# create env with gymnasium (use ram or rgb state)
env = gym.make(
    "ALE/Frogger-ram-v5", # "ALE/Frogger-ram-v5" or "ALE/Frogger-v5"
    render_mode="rgb_array", # rgb_array or human
    difficulty = 0, # [0, 1]
    mode = 0 # [0, 1, 2]
    ) 

print(f"State Frame Size: {env.observation_space}")
print(f"Number Of Actions: {env.action_space.n}")

actions_space = possible_actions = np.array(np.identity(env.action_space.n,dtype=int).tolist())
print(f"Possible Actions: \n {actions_space}")

env.reset()
observation = env.step(1)
print(f"Obervation: \n {observation}")
print(observation[0].shape)

State Frame Size: Box(0, 255, (128,), uint8)
Number Of Actions: 5
Possible Actions: 
 [[1 0 0 0 0]
 [0 1 0 0 0]
 [0 0 1 0 0]
 [0 0 0 1 0]
 [0 0 0 0 1]]
Obervation: 
 (array([143,  20,   5,  40,   6,  45, 147, 127,  28,  33,  39,  63, 100,
        85, 120,  86,  87,  52, 135,  60, 133, 119, 252,  36,  35, 213,
        19, 118, 188,  11, 164,  69, 229,  71, 217, 216, 123, 200, 184,
         6, 124, 119, 171, 154, 255,  91,  91, 195,  80,   0,   0,   4,
        40,   0,   0,   0, 234, 254, 234, 254,  29, 254,  51, 254, 157,
       254, 157, 254,  84, 254,   1,   0, 115, 115, 149,   0,   0,   0,
         0,   0,   4,   4,   1,   1,   0, 253, 155, 253, 255,  11,   0,
        75,   0,   1,   8,  63,  30,   0,   0,   0,   0,   0,   0,   0,
        46, 253,   0, 115, 255, 198,   1, 255,   0, 163,  58,  80, 255,
        80, 255,  28,  12,   0,   0,  91,   0,   0, 124, 250], dtype=uint8), 0.0, False, False, {'lives': 4, 'episode_frame_number': 4, 'frame_number': 4})
(128,)


# **Hyperparameter**

In [4]:
env = gym.make("ALE/Frogger-ram-v5", render_mode="rgb_array", difficulty = 0, mode = 0) 

# MODEL Hyperparameters
STATE_SIZE = env.observation_space.shape[0]
ACTIONS_SIZE = env.action_space.n
ACTIONS = list(range(0, ACTIONS_SIZE))
LEARNING_RATE = 0.01 # Learning Rate (alpha)

# AGENT Hyperparameters (epsilon greedy strategy)
EPSILON = 1.0
EPSILON_MIN = 0.01 # EPSELON value where exploreation stops
EPSILON_DECAY_RATE = 0.995 # the higher the longer the exploreation takes (Linear Decay: EPSELON * DECAY_RATE)
GAMMA = 0.99 # Discounting rate (lower -> agent thinks more long term)

# TRAINING Hyperparameters
RENDER_INTERVALL = 50 # Intervall when the game is rendered
TOTAL_EPISODES = 1000000
MINI_BATCHES_REPLAY = 64
REPLAY_BUFFER_MEMORY = 10000
STACKED_FRAMES_SIZE = 4
AVERAGE_WINDOW = 50

# **Deep Q-Learning Neural Network Model**

In [5]:
"""
def build_dqn(input_shape=(STATE_SIZE*STACKED_FRAMES_SIZE,)):
    model = models.Sequential()
    model.add(layers.Dense(24, input_shape=input_shape, activation='relu'))
    model.add(layers.Dense(24, activation='relu'))
    model.add(layers.Dense(ACTIONS_SIZE, activation='linear'))
    model.compile(loss='mse', optimizer=tf.keras.optimizers.Adam(learning_rate=LEARNING_RATE))
    return model

"""


def build_dqn(input_shape=(STATE_SIZE*STACKED_FRAMES_SIZE,)):
    model = models.Sequential()
    model.add(layers.Reshape((16, 32), input_shape=input_shape))  # Reshape to (16, 32)
    model.add(layers.Conv1D(32, 3, activation='relu', padding='same'))
    model.add(layers.UpSampling1D(4))  # Upsample to (64, 32)
    model.add(layers.Conv1D(64, 3, activation='relu', padding='same'))
    model.add(layers.UpSampling1D(2))  # Upsample to (128, 64)
    model.add(layers.Conv1D(4, 3, activation='linear', padding='same'))  # Output (128, 4)
    model.add(layers.Reshape((128, 4)))  # Reshape to (128, 4)
    model.compile(loss='mse', optimizer=tf.keras.optimizers.Adam(learning_rate=LEARNING_RATE))
    return model

# **Stacked Frames**

In [6]:
STACKED_FRAMES = deque([np.zeros((STATE_SIZE), dtype=np.int32) for i in range(STACKED_FRAMES_SIZE)], maxlen=4)

def stack_frames(stacked_frames, state, is_new_episode):

    if is_new_episode:
        # clear stack for new episode
        stacked_frames = deque([np.zeros((STATE_SIZE), dtype=np.int32) for i in range(STACKED_FRAMES_SIZE)], maxlen=4)
        
        # Add the same frame 4 times to the deque since its a new episode
        stacked_frames.append(state)
        stacked_frames.append(state)
        stacked_frames.append(state)
        stacked_frames.append(state)
        
        # Stack the frames with numpy (join all 4 frames)
        stacked_frames_array = np.stack(stacked_frames, axis=0)

    elif not is_new_episode:
        # append new frame and remove oldest frame
        stacked_frames.append(state)

        # Stack the frames with numpy (join all 4 frames)
        stacked_frames_array = np.stack(stacked_frames, axis=0) 

    return stacked_frames_array, stacked_frames

# **Replay Buffer**

In [7]:
class ReplayBuffer:
    def __init__(self):
        # deque that ther are only max REPLAY_BUFFER_MEMORY items in the list
        # deque = remove oldest item
        self.buffer = deque(maxlen=REPLAY_BUFFER_MEMORY)
    
    def add(self, experience):
        # add item to buffer
        self.buffer.append(experience)
    
    def sample(self):
        return random.sample(self.buffer, MINI_BATCHES_REPLAY) # 16 (MINI_BATCHES_REPLAY) samples to retrain the mdoel

# **DQN Agent**

In [8]:
class DQNAgent:
    def __init__(self):
        self.memory = ReplayBuffer()
        self.EPSILON = EPSILON
        self.model = build_dqn()
        self.target_model = build_dqn()
        self.update_target_model()
    
    def update_target_model(self):
        self.target_model.set_weights(self.model.get_weights())
    
    def remember(self, state, action, reward, next_state, done):
        flattend_array_state = np.expand_dims(state.reshape(-1), axis=0)
        flattend_array_next_state = np.expand_dims(next_state.reshape(-1), axis=0)
        self.memory.add((flattend_array_state, action, reward, flattend_array_next_state, done))
    
    def predict_action(self, stacked_array):
        # flatten array for model
        flattend_array = np.expand_dims(stacked_array.reshape(-1), axis=0)

        if np.random.rand() <= EPSILON:
            return random.randrange(ACTIONS_SIZE)
        q_values = self.model.predict(flattend_array)
        return np.argmax(q_values[0])
    
    def replay(self):
        print("Replay")
        minibatch = self.memory.sample()
        i = 0
        for state, action, reward, next_state, done in tqdm(minibatch):
            i += 1
            # Predict Target Q-Values
            target = self.model.predict(state, verbose=0)
            if done:
                # If the episode is done the target Q-value for the taken action is set to the received reward
                target[0][action] = reward
            elif not done:
                # If the episode is not done, the target Q-value for the taken action is updated using the Bellman equation
                t = self.target_model.predict(next_state, verbose=0)[0]
                target[0][action] = reward + GAMMA * np.amax(t)
            self.model.fit(state, target, epochs=1, verbose=0)
                
        if self.EPSILON > EPSILON_MIN:
            self.EPSILON *= EPSILON_DECAY_RATE

    def load(self, name):
        self.model.load_weights(name)

    def save(self, name):
        self.model.save_weights(name)

# **Reward Graph**

In [9]:
%matplotlib qt
average_rewards = []
rewards_per_episode = []

plt.figure(figsize=(12, 6))
reward_line, = plt.plot(rewards_per_episode, label='Reward per Episode')
average_line, = plt.plot(average_rewards, label='Moving Average Reward')
plt.xlabel('Episode')
plt.ylabel('Reward')
plt.title('Episode Reward and Moving Average Reward over Time')
plt.legend()

# Display plot in a new window
plt.show()

# **Training**

In [10]:
%matplotlib qt

agent = DQNAgent()

for episode in range(TOTAL_EPISODES):
    # create env (human to render game and see actions)
    if episode % RENDER_INTERVALL == 0 and episode != 0:
        env = gym.make("ALE/Frogger-ram-v5", render_mode="human", difficulty = 0, mode = 0) 
    else:
        env = gym.make("ALE/Frogger-ram-v5", render_mode="rgb_array", difficulty = 0, mode = 0) 
    state = env.reset()[0]

    # reset episode variables
    done = False
    total_reward = 0
    step_count = 0
    is_new_episode = True

    # initilize episode stack
    stacked_array_state, STACKED_FRAMES = stack_frames(STACKED_FRAMES, state, is_new_episode)
    
    while not done:
        # update step_count
        step_count += 1

        # Predict action
        if is_new_episode and step_count <= 110:
            # jsut wait for first 100 steps because of initialization 
            action = 0
            next_state, reward, done, loss_of_live, info = env.step(0)
        else:
            action = agent.predict_action(stacked_array_state)
            next_state, reward, done, loss_of_live, info = env.step(action)
            is_new_episode = False

        # Predict action
        action = agent.predict_action(stacked_array_state)
        next_state, reward, done, loss_of_live, info = env.step(action)
        
        # Do Stacking
        stacked_array_next_state, STACKED_FRAMES = stack_frames(STACKED_FRAMES, next_state, is_new_episode)

        # update reward
        total_reward += reward

        # store action infromation in memory
        next_state = np.reshape(next_state, [1, STATE_SIZE])
        agent.remember(stacked_array_state, action, reward, stacked_array_next_state, done)

        # set the next state to the current state
        stacked_array_state = stacked_array_next_state
        
        # if game finished (won or lose)
        if done:
            agent.update_target_model()

            rewards_per_episode.append(total_reward)

            if len(rewards_per_episode) >= AVERAGE_WINDOW:
                moving_average = np.mean(rewards_per_episode[-AVERAGE_WINDOW:])
            else:
                moving_average = np.mean(rewards_per_episode)
            average_rewards.append(moving_average)

            # reward graph
            reward_line.set_data(range(episode + 1), rewards_per_episode)
            average_line.set_data(range(episode + 1), average_rewards)
            plt.xlim(0, episode + 1)
            plt.ylim(0, max(max(rewards_per_episode), max(average_rewards)) + 10)
            plt.pause(0.01)

            print(f"Episode: {episode}/{TOTAL_EPISODES}, Total Reward: {total_reward}, Moving AVG. Reward: {moving_average}, Steps: {step_count}, Epsilon: {agent.EPSILON:.2}")
            break

    if episode % 4 == 0 and episode != 0:
        agent.replay()

Episode: 0/1000000, Total Reward: 5.0, Moving AVG. Reward: 5.0, Steps: 122, Epsilon: 1.0
Episode: 1/1000000, Total Reward: 4.0, Moving AVG. Reward: 4.5, Steps: 195, Epsilon: 1.0
Episode: 2/1000000, Total Reward: 10.0, Moving AVG. Reward: 6.333333333333333, Steps: 145, Epsilon: 1.0
Episode: 3/1000000, Total Reward: 6.0, Moving AVG. Reward: 6.25, Steps: 153, Epsilon: 1.0
Episode: 4/1000000, Total Reward: 9.0, Moving AVG. Reward: 6.8, Steps: 124, Epsilon: 1.0
Replay


100%|██████████| 64/64 [00:11<00:00,  5.68it/s]


Episode: 5/1000000, Total Reward: 8.0, Moving AVG. Reward: 7.0, Steps: 145, Epsilon: 0.99
Episode: 6/1000000, Total Reward: 7.0, Moving AVG. Reward: 7.0, Steps: 134, Epsilon: 0.99
Episode: 7/1000000, Total Reward: 9.0, Moving AVG. Reward: 7.25, Steps: 159, Epsilon: 0.99
Episode: 8/1000000, Total Reward: 7.0, Moving AVG. Reward: 7.222222222222222, Steps: 172, Epsilon: 0.99
Replay


100%|██████████| 64/64 [00:09<00:00,  7.08it/s]


Episode: 9/1000000, Total Reward: 8.0, Moving AVG. Reward: 7.3, Steps: 194, Epsilon: 0.99
Episode: 10/1000000, Total Reward: 8.0, Moving AVG. Reward: 7.363636363636363, Steps: 138, Epsilon: 0.99
Episode: 11/1000000, Total Reward: 9.0, Moving AVG. Reward: 7.5, Steps: 133, Epsilon: 0.99
Episode: 12/1000000, Total Reward: 7.0, Moving AVG. Reward: 7.461538461538462, Steps: 145, Epsilon: 0.99
Replay


100%|██████████| 64/64 [00:08<00:00,  7.68it/s]


Episode: 13/1000000, Total Reward: 5.0, Moving AVG. Reward: 7.285714285714286, Steps: 149, Epsilon: 0.99
Episode: 14/1000000, Total Reward: 8.0, Moving AVG. Reward: 7.333333333333333, Steps: 133, Epsilon: 0.99
Episode: 15/1000000, Total Reward: 4.0, Moving AVG. Reward: 7.125, Steps: 240, Epsilon: 0.99
Episode: 16/1000000, Total Reward: 7.0, Moving AVG. Reward: 7.117647058823529, Steps: 183, Epsilon: 0.99
Replay


100%|██████████| 64/64 [00:08<00:00,  7.67it/s]


Episode: 17/1000000, Total Reward: 7.0, Moving AVG. Reward: 7.111111111111111, Steps: 136, Epsilon: 0.98
Episode: 18/1000000, Total Reward: 8.0, Moving AVG. Reward: 7.157894736842105, Steps: 151, Epsilon: 0.98
Episode: 19/1000000, Total Reward: 7.0, Moving AVG. Reward: 7.15, Steps: 146, Epsilon: 0.98
Episode: 20/1000000, Total Reward: 7.0, Moving AVG. Reward: 7.142857142857143, Steps: 161, Epsilon: 0.98
Replay


100%|██████████| 64/64 [00:08<00:00,  7.69it/s]


Episode: 21/1000000, Total Reward: 7.0, Moving AVG. Reward: 7.136363636363637, Steps: 143, Epsilon: 0.98
Episode: 22/1000000, Total Reward: 11.0, Moving AVG. Reward: 7.304347826086956, Steps: 147, Epsilon: 0.98
Episode: 23/1000000, Total Reward: 7.0, Moving AVG. Reward: 7.291666666666667, Steps: 152, Epsilon: 0.98
Episode: 24/1000000, Total Reward: 8.0, Moving AVG. Reward: 7.32, Steps: 165, Epsilon: 0.98
Replay


100%|██████████| 64/64 [00:08<00:00,  7.62it/s]


Episode: 25/1000000, Total Reward: 10.0, Moving AVG. Reward: 7.423076923076923, Steps: 141, Epsilon: 0.97
Episode: 26/1000000, Total Reward: 5.0, Moving AVG. Reward: 7.333333333333333, Steps: 175, Epsilon: 0.97
Episode: 27/1000000, Total Reward: 6.0, Moving AVG. Reward: 7.285714285714286, Steps: 196, Epsilon: 0.97
Episode: 28/1000000, Total Reward: 10.0, Moving AVG. Reward: 7.379310344827586, Steps: 151, Epsilon: 0.97
Replay


100%|██████████| 64/64 [00:08<00:00,  7.63it/s]


Episode: 29/1000000, Total Reward: 7.0, Moving AVG. Reward: 7.366666666666666, Steps: 168, Epsilon: 0.97
Episode: 30/1000000, Total Reward: 4.0, Moving AVG. Reward: 7.258064516129032, Steps: 159, Epsilon: 0.97
Episode: 31/1000000, Total Reward: 8.0, Moving AVG. Reward: 7.28125, Steps: 146, Epsilon: 0.97
Episode: 32/1000000, Total Reward: 10.0, Moving AVG. Reward: 7.363636363636363, Steps: 124, Epsilon: 0.97
Replay


100%|██████████| 64/64 [00:08<00:00,  7.45it/s]


Episode: 33/1000000, Total Reward: 8.0, Moving AVG. Reward: 7.382352941176471, Steps: 138, Epsilon: 0.96
Episode: 34/1000000, Total Reward: 7.0, Moving AVG. Reward: 7.371428571428571, Steps: 121, Epsilon: 0.96
Episode: 35/1000000, Total Reward: 9.0, Moving AVG. Reward: 7.416666666666667, Steps: 130, Epsilon: 0.96
Episode: 36/1000000, Total Reward: 6.0, Moving AVG. Reward: 7.378378378378378, Steps: 149, Epsilon: 0.96
Replay


100%|██████████| 64/64 [00:08<00:00,  7.72it/s]


Episode: 37/1000000, Total Reward: 6.0, Moving AVG. Reward: 7.342105263157895, Steps: 138, Epsilon: 0.96
Episode: 38/1000000, Total Reward: 8.0, Moving AVG. Reward: 7.358974358974359, Steps: 140, Epsilon: 0.96
Episode: 39/1000000, Total Reward: 8.0, Moving AVG. Reward: 7.375, Steps: 145, Epsilon: 0.96
Episode: 40/1000000, Total Reward: 10.0, Moving AVG. Reward: 7.439024390243903, Steps: 132, Epsilon: 0.96
Replay


100%|██████████| 64/64 [00:08<00:00,  7.64it/s]


Episode: 41/1000000, Total Reward: 9.0, Moving AVG. Reward: 7.476190476190476, Steps: 180, Epsilon: 0.95
Episode: 42/1000000, Total Reward: 7.0, Moving AVG. Reward: 7.465116279069767, Steps: 139, Epsilon: 0.95
Episode: 43/1000000, Total Reward: 6.0, Moving AVG. Reward: 7.431818181818182, Steps: 146, Epsilon: 0.95
Episode: 44/1000000, Total Reward: 8.0, Moving AVG. Reward: 7.444444444444445, Steps: 137, Epsilon: 0.95
Replay


100%|██████████| 64/64 [00:08<00:00,  7.56it/s]


Episode: 45/1000000, Total Reward: 8.0, Moving AVG. Reward: 7.456521739130435, Steps: 193, Epsilon: 0.95
Episode: 46/1000000, Total Reward: 6.0, Moving AVG. Reward: 7.425531914893617, Steps: 153, Epsilon: 0.95
Episode: 47/1000000, Total Reward: 6.0, Moving AVG. Reward: 7.395833333333333, Steps: 130, Epsilon: 0.95
Episode: 48/1000000, Total Reward: 7.0, Moving AVG. Reward: 7.387755102040816, Steps: 140, Epsilon: 0.95
Replay


100%|██████████| 64/64 [00:08<00:00,  7.37it/s]


Episode: 49/1000000, Total Reward: 5.0, Moving AVG. Reward: 7.34, Steps: 145, Epsilon: 0.94
Episode: 50/1000000, Total Reward: 6.0, Moving AVG. Reward: 7.36, Steps: 127, Epsilon: 0.94
Episode: 51/1000000, Total Reward: 7.0, Moving AVG. Reward: 7.42, Steps: 139, Epsilon: 0.94
Episode: 52/1000000, Total Reward: 7.0, Moving AVG. Reward: 7.36, Steps: 176, Epsilon: 0.94
Replay


100%|██████████| 64/64 [00:08<00:00,  7.12it/s]


Episode: 53/1000000, Total Reward: 4.0, Moving AVG. Reward: 7.32, Steps: 145, Epsilon: 0.94
Episode: 54/1000000, Total Reward: 9.0, Moving AVG. Reward: 7.32, Steps: 115, Epsilon: 0.94
Episode: 55/1000000, Total Reward: 5.0, Moving AVG. Reward: 7.26, Steps: 125, Epsilon: 0.94
Episode: 56/1000000, Total Reward: 7.0, Moving AVG. Reward: 7.26, Steps: 141, Epsilon: 0.94
Replay


 20%|██        | 13/64 [00:01<00:06,  7.43it/s]