In [1]:
from doubleAgent2 import DoubleAgent2
from policy import Policy
from memory import Memory
from qNetwork import QNetwork
from transition import Transition
from state import State

import os
import torch
import gymnasium as gym
from torch import nn
from datetime import datetime
import numpy as np

In [2]:
# ---------------------------------------------------------------------------
# Console colours: ANSI escape sequences wrapped around the per-episode log.
# ---------------------------------------------------------------------------
GREEN = '\033[0;37;42m'
RED = '\033[0;37;41m'
BLACK = '\033[0m'  # resets the terminal attributes after a coloured line

# ---------------------------------------------------------------------------
# Run bookkeeping, updated by the training loop.
# ---------------------------------------------------------------------------
# Index of the most recent episode, best return so far and the episode that
# produced it. All three start at zero.
last_episode = max_returns = max_returns_episode = 0
# One total return per finished episode, in order.
last_steps = []

# ---------------------------------------------------------------------------
# Hyperparameters.
# ---------------------------------------------------------------------------
# Name of the model.
# NOTE(review): not referenced anywhere else in this cell.
name = "06-21_13-17"

# DOUBLEQ = True # use double q learning?

# Upper bounds on the number of episodes and on the steps within one episode.
episodes = 2_000
max_steps = 1_000

# Replay memory capacity and the number of samples drawn per training step.
memory_size = 100_000
sample_size = 64

# Learning rate.
# NOTE(review): not passed to anything in this cell (optimizer is None below);
# confirm where, if anywhere, it is consumed.
lr = 0.001
# Discount factor handed to agent.train as `gamma`.
discount = 0.99
# Exploration: probability-like value for taking random moves, and its decay.
epsilon = 0.1
decay = 0.996
# Averaging rate handed to the double-Q agent.
averaging_rate = 0.1

# Number of recent episodes the score is measured over, and the average score
# at which training finishes prematurely.
last_steps_n = 20
stop_score = 200

# ---------------------------------------------------------------------------
# Agent and environment construction.
# ---------------------------------------------------------------------------
# Both policies receive the same settings; each gets its own fresh QNetwork.
_policy_settings = dict(optimizer=None, loss_fn=None, epsilon=epsilon, decay=decay)
policy = Policy(network=QNetwork(), **_policy_settings)
target_policy = Policy(network=QNetwork(), **_policy_settings)
memory = Memory(max_size=memory_size)

agent = DoubleAgent2(policy, memory, target_policy, averaging_rate)

# render_mode="human" opens a window and renders every step.
env = gym.make("LunarLander-v2", render_mode="human")

# Training loop: run up to `episodes` episodes, log timing and returns for each
# one, and stop early once the mean return over the last `last_steps_n`
# episodes exceeds `stop_score`.
#
# The loop is wrapped in try/finally so the environment is always closed, even
# when the run is interrupted (e.g. a KeyboardInterrupt from stopping the cell).
try:
    for episode in range(episodes):
        last_episode = episode
        returns = 0
        state, _ = env.reset()
        state = State(*state)
        t1 = datetime.now()

        trainingtime = 0
        steps_taken = 0  # number of env.step calls made in this episode

        for i in range(max_steps):
            action = agent.select_action(state)

            next_state, reward, terminated, truncated, _ = env.step(action.value)
            next_state = State(*next_state)
            # `i` is zero-based, so the count of steps taken is one more.
            steps_taken = i + 1

            returns += reward
            # Only `terminated` is stored as the transition's final field;
            # truncation ends the episode below but is not recorded here.
            agent.memory.store(
                Transition(
                    state,
                    action,
                    reward,
                    next_state,
                    terminated
                )
            )

            # The episode ends before training on its final step.
            if terminated or truncated:
                break

            state = next_state

            # Time only the training call so it can be separated from the
            # environment/rendering time in the log line below.
            t3 = datetime.now()
            agent.train(gamma=discount, memory_batch_size=sample_size)
            t4 = datetime.now()
            trainingtime += (t4 - t3).total_seconds()
            agent.decay()

        t2 = datetime.now()
        totaltime = (t2 - t1).total_seconds()
        runtime = totaltime - trainingtime
        print(f"stappen gezet: {steps_taken}, traintijd: {trainingtime:.2f}, run tijd: {runtime:.2f}, totale tijd: {totaltime:.2f}")

        last_steps.append(returns)
        # The first episode always seeds the best-return record, whatever its sign.
        if returns > max_returns or episode == 0:
            max_returns = returns
            max_returns_episode = episode

        # Use the configured window so the value matches the "last N mean"
        # label printed below and the early-stop criterion.
        mean_last = np.mean(last_steps[-last_steps_n:])
        time_delta = t2 - t1
        # Green when this episode beat the rolling mean, red otherwise.
        colour = (GREEN if returns > mean_last else RED)

        print(
            colour +
            "Finished episode",
            " " * (3 -len(str(episode))),
            episode,
            "in",
            # Format the elapsed time numerically; slicing the unpadded
            # microsecond string would print e.g. 4.005s as "4.50s".
            f"{time_delta.total_seconds():.2f}s\t",
            f"Current last {last_steps_n} mean:",
            np.round(mean_last, 1),
            "\tLast reward:", np.round(reward, 1),
            " Returns:",str(np.round(returns, 2))
            + BLACK
        )

        if mean_last > stop_score:
            break
finally:
    env.close()

Using [32mcpu[0m device

Using [32mcpu[0m device

stappen gezet: 130, traintijd: 2.86, run tijd: 1.48, totale tijd: 4.35
[0;37;41mFinished episode    0 in 4.34s	 Current last 20 mean: -260.0 	Last reward: -100  Returns: -260.04[0m
stappen gezet: 106, traintijd: 4.82, run tijd: 0.31, totale tijd: 5.13
[0;37;42mFinished episode    1 in 5.12s	 Current last 20 mean: -259.8 	Last reward: -100  Returns: -259.59[0m
stappen gezet: 170, traintijd: 7.16, run tijd: 0.47, totale tijd: 7.63
[0;37;42mFinished episode    2 in 7.63s	 Current last 20 mean: -226.3 	Last reward: -100  Returns: -159.41[0m
stappen gezet: 165, traintijd: 7.66, run tijd: 0.49, totale tijd: 8.15
[0;37;41mFinished episode    3 in 8.15s	 Current last 20 mean: -237.8 	Last reward: -100  Returns: -272.15[0m


KeyboardInterrupt: 