# **Import Libraries**

In [None]:
import gymnasium as gym
import numpy as np
import tensorflow as tf
import time
import random
from collections import deque
from tqdm import tqdm
import matplotlib.pyplot as plt
import numpy as np
from IPython.display import clear_output

from modules.agents import DQNAgent
from modules.logger import EpisodeLogger
import modules.rewards as rewards
import modules.processing as processing
import modules.figures as figures
import modules.networks as networks

In [None]:
# Ask TensorFlow which GPU devices it can see and report the result.
physical_devices = tf.config.list_physical_devices('GPU')

if not physical_devices:
    # An empty device list means TensorFlow found no GPU.
    print("GPU is not available")
else:
    print("GPU is available")
    print(physical_devices)

# **Freeway Environment**

In [None]:
# Build a Freeway environment with gymnasium to inspect its spaces and outputs.
# The observation can be the console RAM, a grayscale frame or an RGB frame.
env = gym.make(
    "ALE/Freeway-v5",         # environment variant
    obs_type="ram",           # "ram", "grayscale" or "rgb"
    render_mode="rgb_array",  # "rgb_array" or "human"
    difficulty=0,             # one of [0, 1]
    mode=0,                   # one of [0]
)

# Start an episode, take a single step with action 0 and show the raw result
# tuple together with the observation/action space descriptions.
env.reset()
first_step_result = env.step(0)
print(first_step_result)
print(f"State Frame Size: {env.observation_space}")
print(f"Number Of Actions: {env.action_space.n}")

# One-hot row per action; both names are bound to the same array.
possible_actions = np.identity(env.action_space.n, dtype=int)
actions_space = possible_actions
print(f"Possible Actions: \n {actions_space}")

# Restart and step once with action 1 to look at a single observation.
env.reset()
observation = env.step(1)
first_observation = observation[0]
print(f"Obervation: \n {first_observation}")
print(first_observation.shape)

# **Hyperparameters**

In [None]:
# GAME PARAMETERS
EVIROMENT_VARIANT = "ALE/Freeway-v5"
DIFFICULTY = 0
MODE = 0

# Environment used in this cell only to read the state and action sizes.
# It is built from the constants above (instead of repeating literal zeros) so
# it cannot drift from the environments created in the later cells.
env = gym.make(EVIROMENT_VARIANT, obs_type="ram", difficulty=DIFFICULTY, mode=MODE)

# MODEL Hyperparameters
STATE_SIZE = env.observation_space.shape[0]
ACTIONS_SIZE = env.action_space.n
ACTIONS = list(range(ACTIONS_SIZE))
LEARNING_RATE = 0.001  # Learning rate (alpha)

# AGENT Hyperparameters (epsilon-greedy strategy)
EPSILON = 0.99  # Initial exploration rate
EPSILON_MIN = 0.01  # Epsilon value at which exploration stops decreasing
# The closer to 1, the longer exploration lasts. The formula given for it,
# EPSILON * DECAY_RATE, is a multiplicative (geometric) decay, not a linear one.
# NOTE(review): the decay itself is applied inside DQNAgent - confirm there.
EPSILON_DECAY_RATE = 0.9995
# Discount factor: values closer to 1 weight future rewards more heavily
# (more long-term), lower values make the agent more short-sighted.
GAMMA = 0.99

# TRAINING Hyperparameters
RENDER_INTERVAL = 100  # Every x-th episode the game is rendered in "human" mode
TOTAL_EPISODES = 1000
# Intended: replay (retrain the model) every x steps.
# NOTE(review): not referenced by the training loop in this notebook - confirm
# whether it should gate agent.replay().
REPLAY_INTERVAL = 4
MINI_BATCHES_REPLAY = 16  # Passed to DQNAgent as its second argument
REPLAY_BUFFER_MEMORY = 200000  # Passed to DQNAgent as its first argument
MINIMUM_REPLAY_HISTORY = 10000  # Transitions collected with random actions before training
AVERAGE_WINDOW = 10  # Number of most recent episodes used for the moving-average reward
UPDATE_TARGET_MODEL_FREQUENCY = 5  # Interval (in episodes) at which the target model is updated
MODEL_SAVE_INTERVALL = 10  # Interval passed to networks.save_model for saving the target model

# **Init Agent and Logger**

In [None]:
# Build the DQN agent from the hyperparameters defined above. The arguments are
# positional, so their order must match the DQNAgent constructor.
agent = DQNAgent(
    REPLAY_BUFFER_MEMORY,
    MINI_BATCHES_REPLAY,
    EPSILON,
    ACTIONS_SIZE,
    GAMMA,
    EPSILON_MIN,
    EPSILON_DECAY_RATE,
    LEARNING_RATE,
)

# Logger that receives one entry per finished training episode.
episode_log_path = "logs/episode_logs.json"
logger = EpisodeLogger(log_files_dir=episode_log_path)

# **Fill Replay Buffer**

In [None]:
# Pre-fill the agent's replay memory with transitions generated by uniformly
# random actions until it holds MINIMUM_REPLAY_HISTORY entries.
env = gym.make(EVIROMENT_VARIANT, obs_type="ram", difficulty=DIFFICULTY, mode=MODE)
# Reach the emulator through the unwrapped env: forwarding attributes through
# wrappers (env.ale) is deprecated in Gymnasium 0.29 and removed in 1.0.
ale = env.unwrapped.ale

state = env.reset()[0]
preprocessed_state = processing.preprocess_ram(state)
total_reward = 0
# Per-episode statistics; they are appended to by the training loop.
average_rewards = []
rewards_per_episode = []
distance_per_episode = []

try:
    with tqdm(total=MINIMUM_REPLAY_HISTORY) as pbar:
        while len(agent.memory.buffer) < MINIMUM_REPLAY_HISTORY:
            # take a random action
            action = random.randint(0, ACTIONS_SIZE - 1)
            # Gymnasium returns (obs, reward, terminated, truncated, info).
            next_state, game_reward, game_done, game_truncated, game_info = env.step(action)

            # Read the RAM once per step and derive both values from it.
            ram = ale.getRAM()
            y_pos = ram[14] // 3
            crashed = 1 if ram[16] != 255 else 0  # RAM(16) =:= Collision Lane

            next_state_preprocessed = processing.preprocess_ram(next_state)

            # update reward
            total_reward, gained_reward = rewards.action_based_reward(total_reward, crashed, action, y_pos, game_reward)

            # store the transition in memory
            agent.remember(preprocessed_state, next_state_preprocessed, action, gained_reward, crashed)

            if game_done or game_truncated:
                # The game is over: stepping a finished environment is
                # undefined, so start a fresh episode before continuing.
                state = env.reset()[0]
                preprocessed_state = processing.preprocess_ram(state)
                total_reward = 0
            else:
                # the next state becomes the current state
                preprocessed_state = next_state_preprocessed

            # advance the progress bar to the current buffer size
            pbar.update(len(agent.memory.buffer) - pbar.n)
finally:
    # Release the emulator; the training loop creates its own environments.
    env.close()

# **Training**

In [None]:
%matplotlib qt

# Train the agent for TOTAL_EPISODES episodes. An episode ends when the chicken
# collides, when a crossing is rewarded, or when the game itself is over.
for episode in range(TOTAL_EPISODES):
    print(f"EPISODE: {episode}")
    # Every RENDER_INTERVAL-th episode (except the first) is run in "human"
    # render mode so the game and the chosen actions can be watched.
    env_kwargs = {"obs_type": "ram", "difficulty": DIFFICULTY, "mode": MODE}
    if episode % RENDER_INTERVAL == 0 and episode != 0:
        env_kwargs["render_mode"] = "human"
    env = gym.make(EVIROMENT_VARIANT, **env_kwargs)

    # A fresh environment is created per episode, so it must be closed again;
    # try/finally also covers interrupts and errors during the episode.
    try:
        # Reach the emulator through the unwrapped env: forwarding attributes
        # through wrappers (env.ale) is deprecated in Gymnasium 0.29 and
        # removed in 1.0.
        ale = env.unwrapped.ale

        state = env.reset()[0]
        preprocessed_state = processing.preprocess_ram(state)

        # reset episode variables
        step_count = 0
        distance_before = 0
        total_reward = 0

        while True:
            # update step_count
            # NOTE(review): step_count is not read in this cell and
            # REPLAY_INTERVAL is not applied here; replay runs on every step.
            # Confirm whether replay should be gated by REPLAY_INTERVAL.
            step_count += 1

            # Predict action
            action = agent.predict_action(preprocessed_state)
            # Gymnasium returns (obs, reward, terminated, truncated, info).
            next_state, game_reward, game_done, game_truncated, game_info = env.step(action)

            # Read the RAM once per step and derive both values from it.
            ram = ale.getRAM()
            y_pos = ram[14] // 3
            crashed = 1 if ram[16] != 255 else 0  # RAM(16) =:= Collision Lane

            next_state_preprocessed = processing.preprocess_ram(next_state)

            # update reward
            total_reward, gained_reward = rewards.action_based_reward(total_reward, crashed, action, y_pos, game_reward)

            # store the transition in memory
            agent.remember(preprocessed_state, next_state_preprocessed, action, gained_reward, crashed)

            # the next state becomes the current state
            preprocessed_state = next_state_preprocessed

            # Do Replay
            agent.replay()

            # END EPISODE if the chicken collides or a crossing is rewarded.
            # Also end it when the game itself is over; otherwise the loop
            # would keep stepping a finished environment forever.
            if crashed == 1 or game_reward == 1 or game_done or game_truncated:
                # clear output of cell for every new episode
                clear_output(wait=True)

                rewards_per_episode.append(total_reward)
                distance_per_episode.append(y_pos)

                # Slicing with a negative start already handles histories
                # shorter than the window, but keep the branches explicit.
                if len(rewards_per_episode) >= AVERAGE_WINDOW:
                    moving_average = np.mean(rewards_per_episode[-AVERAGE_WINDOW:])
                else:
                    moving_average = np.mean(rewards_per_episode)
                average_rewards.append(moving_average)

                step_text = f"Episode: {episode}/{TOTAL_EPISODES}, Total Reward: {total_reward}, Moving AVG. Reward: {moving_average}, Distance: {y_pos}, Epsilon: {agent.EPSILON:.2}"

                figures.reward_plot(rewards_per_episode, average_rewards)
                figures.distance_plot(distance_per_episode)

                time.sleep(0.1)

                print(step_text)
                # Sync the target network only every
                # UPDATE_TARGET_MODEL_FREQUENCY episodes. An additional
                # unconditional update per episode would make this setting
                # meaningless.
                if episode % UPDATE_TARGET_MODEL_FREQUENCY == 0:
                    agent.update_target_model()

                logger.log_episode(total_reward, moving_average, agent.EPSILON, y_pos)
                networks.save_model(agent.target_model, episode, MODEL_SAVE_INTERVALL, dir="models/")
                break

            # NOTE(review): neither name is read in this cell - presumably
            # left over from a disabled frame-stacking step.
            is_new_episode = False
            distance_before = y_pos
    finally:
        env.close()
