In [3]:
from AppleGameEnv import AppleGameEnv
from stable_baselines3 import PPO
from stable_baselines3.common import env_checker
from stable_baselines3.common.callbacks import EvalCallback
from stable_baselines3.common.logger import configure

In [2]:
# Build the environment used for the sanity checks below.
# m / n are presumably the board dimensions and max_steps the per-episode
# step cap -- confirm against AppleGameEnv.
env = AppleGameEnv(
    m=36,
    n=36,
    max_steps=1000,
)

# Let Stable-Baselines3 verify that the environment follows the Gym API
# before any agent is run on it.
env_checker.check_env(env)

## Test with random agent

In [3]:
# Seed the environment and its action space with the same value before
# sampling random actions.
seed = 0
env.reset(seed=seed)
env.action_space.seed(seed)

# Play a single episode with uniformly sampled actions, printing the info
# dict returned by every step.
while True:
    action = env.action_space.sample()
    obs, reward, terminated, truncated, info = env.step(action)
    print(info)
    # The episode is over once the env reports termination or truncation.
    done = terminated or truncated
    if done:
        break

# Show the final state, then release the environment's resources.
env.render()
env.close()

{'score': 0, 'steps': 1, 'reward': 0}
{'score': 0, 'steps': 2, 'reward': 0}
{'score': 0, 'steps': 3, 'reward': 0}
{'score': 0, 'steps': 4, 'reward': 0}
{'score': 0, 'steps': 5, 'reward': 0}
{'score': 0, 'steps': 6, 'reward': 0}
{'score': 0, 'steps': 7, 'reward': 0}
{'score': 0, 'steps': 8, 'reward': 0}
{'score': 0, 'steps': 9, 'reward': 0}
{'score': 0, 'steps': 10, 'reward': 0}
{'score': 0, 'steps': 11, 'reward': 0}
{'score': 0, 'steps': 12, 'reward': 0}
{'score': 0, 'steps': 13, 'reward': 0}
{'score': 0, 'steps': 14, 'reward': 0}
{'score': 0, 'steps': 15, 'reward': 0}
{'score': 0, 'steps': 16, 'reward': 0}
{'score': 0, 'steps': 17, 'reward': 0}
{'score': 0, 'steps': 18, 'reward': 0}
{'score': 0, 'steps': 19, 'reward': 0}
{'score': 0, 'steps': 20, 'reward': 0}
{'score': 0, 'steps': 21, 'reward': 0}
{'score': 0, 'steps': 22, 'reward': 0}
{'score': 0, 'steps': 23, 'reward': 0}
{'score': 0, 'steps': 24, 'reward': 0}
{'score': 0, 'steps': 25, 'reward': 0}
{'score': 0, 'steps': 26, 'reward'

## Train the agent

In [4]:
# Dedicated environment instance for training (same settings as the one
# checked above).
train_env = AppleGameEnv(
    m=36,
    n=36,
    max_steps=1000,
)

# PPO agent with the CNN policy; verbose=1 prints setup and training info.
model = PPO(policy="CnnPolicy", env=train_env, verbose=1)

Using cpu device
Wrapping the env with a `Monitor` wrapper
Wrapping the env in a DummyVecEnv.


In [None]:
# Everything this run produces -- stdout/CSV/TensorBoard logs, evaluation
# results and the best-model checkpoint -- goes under this directory.
# It must be the same directory that the "Test the trained model" cell loads
# best_model.zip from and that the TensorBoard command points at.
# NOTE: re-running this cell overwrites the previous run stored there.
log_dir = "../tmp/logs/1"

# Replace the model's logger so training metrics are written as CSV and
# TensorBoard event files in addition to being printed to stdout.
new_logger = configure(log_dir, ["stdout", "csv", "tensorboard"])
model.set_logger(new_logger)

# Separate environment instance used only for periodic evaluation.
eval_env = AppleGameEnv(m=36, n=36, max_steps=1000)

# Every 10,000 training steps, run 5 evaluation episodes and keep the
# best-scoring model in log_dir. Evaluation uses stochastic actions
# (deterministic=False), i.e. actions are sampled from the policy
# distribution, the same way the test cell rolls out the trained model.
eval_callback = EvalCallback(eval_env, best_model_save_path=log_dir,
                             log_path=log_dir, eval_freq=10_000, n_eval_episodes=5,
                             deterministic=False, render=False)

model.learn(total_timesteps=1_000_000, callback=eval_callback)

Logging to ../tmp/logs/1
---------------------------------
| rollout/           |          |
|    ep_len_mean     | 1e+03    |
|    ep_rew_mean     | 5.5      |
| time/              |          |
|    fps             | 4995     |
|    iterations      | 1        |
|    time_elapsed    | 0        |
|    total_timesteps | 2048     |
---------------------------------
-------------------------------------------
| rollout/                |               |
|    ep_len_mean          | 1e+03         |
|    ep_rew_mean          | 2.75          |
| time/                   |               |
|    fps                  | 898           |
|    iterations           | 2             |
|    time_elapsed         | 4             |
|    total_timesteps      | 4096          |
| train/                  |               |
|    approx_kl            | 0.00856844    |
|    clip_fraction        | 0.0468        |
|    clip_range           | 0.2           |
|    entropy_loss         | -5.68         |
|    explained_vari



Eval num_timesteps=10000, episode_reward=0.00 +/- 0.00
Episode length: 1000.00 +/- 0.00
-----------------------------------------
| eval/                   |             |
|    mean_ep_length       | 1e+03       |
|    mean_reward          | 0           |
| time/                   |             |
|    total_timesteps      | 10000       |
| train/                  |             |
|    approx_kl            | 0.004568476 |
|    clip_fraction        | 0.0419      |
|    clip_range           | 0.2         |
|    entropy_loss         | -5.64       |
|    explained_variance   | 0.016076446 |
|    learning_rate        | 0.0003      |
|    loss                 | -0.0183     |
|    n_updates            | 40          |
|    policy_gradient_loss | -0.00137    |
|    std                  | 0.989       |
|    value_loss           | 0.0165      |
-----------------------------------------
New best mean reward!
---------------------------------
| rollout/           |          |
|    ep_len_mean     | 1

<stable_baselines3.ppo.ppo.PPO at 0x103fe6090>

tensorboard로 훈련 과정 확인

> 루트 디렉토리에서 실행
```bash
tensorboard --logdir ./tmp/logs/1
```

## Test the trained model

In [4]:
# Directory holding the checkpoint written by the evaluation callback.
log_dir = "../tmp/logs/1"

# Restore the best model saved during training.
model = PPO.load(f"{log_dir}/best_model.zip")

In [9]:
# Fresh environment for rolling out the loaded policy.
test_env = AppleGameEnv(m=36, n=36, max_steps=1000)
obs, _ = test_env.reset()

# Play a single episode, printing the info dict returned by every step.
while True:
    # predict() returns (action, state); only the action is needed here.
    # deterministic=False samples the action from the policy distribution.
    action = model.predict(obs, deterministic=False)[0]
    obs, reward, terminated, truncated, info = test_env.step(action)
    print(info)
    # The episode is over once the env reports termination or truncation.
    done = terminated or truncated
    if done:
        break

# Show the final state, then release the environment's resources.
test_env.render()
test_env.close()

{'score': 0, 'steps': 1, 'reward': 0}
{'score': 0, 'steps': 2, 'reward': 0}
{'score': 0, 'steps': 3, 'reward': 0}
{'score': 0, 'steps': 4, 'reward': 0}
{'score': 0, 'steps': 5, 'reward': 0}
{'score': 0, 'steps': 6, 'reward': 0}
{'score': 0, 'steps': 7, 'reward': 0}
{'score': 0, 'steps': 8, 'reward': 0}
{'score': 0, 'steps': 9, 'reward': 0}
{'score': 0, 'steps': 10, 'reward': 0}
{'score': 0, 'steps': 11, 'reward': 0}
{'score': 0, 'steps': 12, 'reward': 0}
{'score': 0, 'steps': 13, 'reward': 0}
{'score': 0, 'steps': 14, 'reward': 0}
{'score': 0, 'steps': 15, 'reward': 0}
{'score': 0, 'steps': 16, 'reward': 0}
{'score': 0, 'steps': 17, 'reward': 0}
{'score': 0, 'steps': 18, 'reward': 0}
{'score': 0, 'steps': 19, 'reward': 0}
{'score': 0, 'steps': 20, 'reward': 0}
{'score': 0, 'steps': 21, 'reward': 0}
{'score': 0, 'steps': 22, 'reward': 0}
{'score': 0, 'steps': 23, 'reward': 0}
{'score': 0, 'steps': 24, 'reward': 0}
{'score': 0, 'steps': 25, 'reward': 0}
{'score': 0, 'steps': 26, 'reward'