In [None]:
from pathlib import Path
import sys

import tensorflow

import tensorforce
from tensorforce.agents import Agent
from tensorforce.environments import Environment
from tensorforce.execution import Runner

from bad_seeds.simple.bad_seeds_04_bollux import Bollux


In [None]:
# Show the installed TensorFlow version string as this cell's output.
tensorflow.__version__

In [None]:
# Show the installed Tensorforce version string as this cell's output.
tensorforce.__version__

In [None]:
def evaluate_agent(agent_, bad_seeds_env_, num_episodes=100):
    """Run evaluation episodes and report the mean reward per episode.

    For every episode the environment is reset, fresh internals are requested
    from the agent, and the agent is queried with ``independent=True`` and
    ``deterministic=True`` until the environment reports a terminal state.
    All rewards are summed and divided by the number of episodes.

    Parameters
    ----------
    agent_ : object
        Must provide ``initial_internals()`` and
        ``act(states=..., internals=..., independent=..., deterministic=...)``
        returning an ``(actions, internals)`` pair.
    bad_seeds_env_ : object
        Must provide ``reset()`` returning the initial states and
        ``execute(actions=...)`` returning ``(states, terminal, reward)``.
    num_episodes : int, optional
        Number of episodes to run. Defaults to 100.

    Returns
    -------
    float
        Total reward over all episodes divided by ``num_episodes``.

    Raises
    ------
    ValueError
        If ``num_episodes`` is smaller than 1 (the mean would be undefined).
    """
    if num_episodes < 1:
        raise ValueError(f"num_episodes must be at least 1, got {num_episodes}")

    sum_rewards = 0.0
    for _ in range(num_episodes):
        states = bad_seeds_env_.reset()
        internals = agent_.initial_internals()
        terminal = False
        while not terminal:
            # The agent returns updated internals, which are fed back in on
            # the next step of the same episode.
            actions, internals = agent_.act(
                states=states,
                internals=internals,
                independent=True,
                deterministic=True
            )
            states, terminal, reward = bad_seeds_env_.execute(actions=actions)
            sum_rewards += reward

    average_evaluation_reward = sum_rewards / num_episodes
    print(f'Mean episode reward: {average_evaluation_reward}')

    return average_evaluation_reward

In [None]:
# Keyword settings passed to Environment.create together with the Bollux
# environment class to build the training environment.
bollux_training_settings = dict(
    seed_count=10,
    bad_seed_count=3,
    reward_probability=2/3,
    max_episode_length=100,
    max_episode_timesteps=100,
)

bad_seeds_environment = Environment.create(
    environment=Bollux, **bollux_training_settings
)

In [None]:
# Hyperparameters for the PPO agent. batch_size and l2_regularization are
# also embedded in the summary directory name below.
batch_size = 10
variable_noise = 0.1
l2_regularization = 0.01

agent = Agent.create(
    agent="ppo",
    environment=bad_seeds_environment,

    batch_size=batch_size,
    # Pass the variable defined above rather than repeating the literal, so
    # that changing `variable_noise` actually changes the agent.
    variable_noise=variable_noise,

    l2_regularization=l2_regularization,

    summarizer=dict(
        directory=f"training_data_saveload/agent_ppo_01_env_bollux_bs{batch_size}_l2r{l2_regularization}/summaries",
        labels=["graph", "entropy", "kl-divergence", "losses", "rewards"],  # tensorforce 0.5.5
        #summaries=["entropy", "kl-divergence", "loss", "reward"],  # removed "graph"  # tensorforce 0.6.0
        flush=True,
        # frequency=100  not necessary?
    ),
)

# Train, evaluate and save inside try/finally so the agent and environment are
# closed even if one of the steps raises or the cell is interrupted.
try:
    # NOTE(review): presumably toggles debug output - confirm the agent
    # actually reads this attribute.
    agent.debug = False

    print("begin training")
    runner = Runner(agent=agent, environment=bad_seeds_environment)
    runner.run(num_episodes=1000)
    print("done training")

    agent.debug = True

    print("begin evaluation")
    runner = Runner(agent=agent, environment=bad_seeds_environment)
    runner.run(num_episodes=1000, evaluation=True)
    #avg_reward = evaluate_agent(agent_=agent, bad_seeds_env_=bad_seeds_environment)
    #print(f"average reward: {avg_reward}")
    print("done evaluating")

    # Persist the trained agent to the "saved_models" directory in the
    # "numpy" format.
    agent.save(directory="saved_models", format="numpy")
finally:
    agent.close()
    bad_seeds_environment.close()

In [None]:
# Fresh environment for evaluating the agent restored from disk.
# NOTE(review): reward_probability is 1.0 here while the training environment
# was created with 2/3 - confirm this difference is intentional.
new_bad_seeds_environment = Environment.create(
    environment=Bollux,
    seed_count=10,
    bad_seed_count=3,
    reward_probability=1.0,
    max_episode_length=100,
    max_episode_timesteps=100,
)

# Load and evaluate inside try/finally so the agent (if it was loaded) and the
# environment are always closed; previously the environment was never closed.
loaded_agent = None
try:
    loaded_agent = Agent.load(
        directory="saved_models",
        format="numpy",
        environment=new_bad_seeds_environment,
        #summarizer=dict(
        #    directory=f"training_data_saveload/agent_ppo_01_env_bollux_bs{batch_size}_l2r{l2_regularization}/summaries",
        #    labels=["graph", "entropy", "kl-divergence", "losses", "rewards"],  # tensorforce 0.5.5
        #    #summaries=["entropy", "kl-divergence", "loss", "reward"],  # removed "graph"  # tensorforce 0.6.0
        #    flush=True,
        #    # frequency=100  not necessary?
        #),
    )

    # NOTE(review): presumably toggles debug output - confirm the agent
    # actually reads this attribute.
    loaded_agent.debug = True

    loaded_agent_runner = Runner(
        agent=loaded_agent,
        environment=new_bad_seeds_environment
    )

    loaded_agent_runner.run(
        num_episodes=1000,
        evaluation=True
    )
    #loaded_agent_runner.close()

    #avg_reward = evaluate_agent(loaded_agent, new_bad_seeds_environment)
    #print(f"average reward: {avg_reward}")
finally:
    if loaded_agent is not None:
        loaded_agent.close()
    new_bad_seeds_environment.close()