# PPO Baseline Training Notebook

This notebook contains the baseline model training for a PPO agent using stable_baselines3.

## Imports

In [1]:
from gym_vectorvelocity import VectorVelocityEnv
from stable_baselines3.common.env_checker import check_env
from stable_baselines3.common.env_util import make_vec_env
from stable_baselines3.common.evaluation import evaluate_policy
from gymnasium import make 
from stable_baselines3 import PPO
import pandas as pd 

import tensorboardX

pygame-ce 2.4.1 (SDL 2.28.5, Python 3.11.5)


## Settings

In [2]:
# General parameters -- change as needed for your experiments.

MODEL_NAME = "baseline_ppo"  # name of the log folder and the model zip file
USE_MULTIPROCESSING = True  # use a vectorized env (NUMBER_OF_ENVS copies) or a single gym environment
USE_VERBOSE = True

# Environment / rollout parameters
NUMBER_OF_STEPS = 2048  # meant as the rollout length per environment per update (PPO's `n_steps`)
NUMBER_OF_ENVS = 10  # only used if USE_MULTIPROCESSING is True
# Timesteps for training (per environment when USE_MULTIPROCESSING is True, see below).
# Written as an int rather than 1e+5 because a step count is an integer quantity.
TOTAL_TIMESTEPS = 100_000

# PPO optimisation parameters
NUMBER_OF_EPOCHS = 10
BATCH_SIZE = 64
LEARNING_RATE = 0.0001

LOG_DIR = "logs"  # name of the folder where the logs will be stored

# Environment modifications, if needed
GAMEOVER_PENALTY = 75
MISSED_COIN_PENALTY = 10

DODGED_OBSTACLE_REWARD = 1
COLLECTED_COIN_REWARD = 15


# With the vectorized setup the total number of timesteps becomes
# NUMBER_OF_ENVS * TOTAL_TIMESTEPS. Comment this out if you do not want that
# scaling; TOTAL_TIMESTEPS then keeps the value set above.
if USE_MULTIPROCESSING:
    TOTAL_TIMESTEPS = NUMBER_OF_ENVS * TOTAL_TIMESTEPS


## Setting up the Environment

### Initialize and check

In [3]:
def create_env():
    """Build a VectorVelocityEnv configured with the notebook's penalty/reward settings.

    Returns:
        A new VectorVelocityEnv instance whose `coin_missed_penalty`,
        `game_over_penalty`, `dodged_obstacle_reward` and `coin_reward`
        attributes are set from the constants in the Settings cell.
    """
    vv_env = VectorVelocityEnv()
    # Attribute name on the environment -> value taken from the Settings cell.
    overrides = {
        "coin_missed_penalty": MISSED_COIN_PENALTY,
        "game_over_penalty": GAMEOVER_PENALTY,
        "dodged_obstacle_reward": DODGED_OBSTACLE_REWARD,
        "coin_reward": COLLECTED_COIN_REWARD,
    }
    for attribute, value in overrides.items():
        setattr(vv_env, attribute, value)
    return vv_env


# Run stable_baselines3's environment checker on a single instance.
env = create_env()
check_env(env)

In [4]:
# Display the observation space of the environment that was just checked.
env.observation_space

Dict('coin_dists': Box(-1.0, 1.0, (40,), float32), 'coins': Box(-1.0, 1.0, (40,), float32), 'collected_coins': Discrete(20001), 'lane_coins': MultiDiscrete([4 4 4 4 4 4 4 4 4 4 4 4 4 4 4 4 4 4 4 4]), 'lane_obstacles': MultiDiscrete([4 4 4 4 4 4 4 4 4]), 'obstacle_dists': Box(-1.0, 1.0, (18,), float32), 'obstacles': Box(-1.0, 1.0, (18,), float32), 'player_pos': Box(0.0, 1.0, (1,), float32), 'score': Discrete(120001), 'speed': Discrete(20))

### Env Setup

In [5]:
# Build the training environment.
if USE_MULTIPROCESSING:
    # make_vec_env calls create_env NUMBER_OF_ENVS times and wraps the copies in
    # one vectorized env. NOTE: unless a vec_env_cls is given, SB3 uses
    # DummyVecEnv, which steps the copies sequentially in this process.
    env = make_vec_env(create_env, n_envs=NUMBER_OF_ENVS)
else:
    # gymnasium.make() only accepts a registered environment id (or EnvSpec),
    # not a factory function, so the configured env is built directly instead.
    env = create_env()

## Model & Training

### Model Setup

In [6]:
# Create the PPO agent. "MultiInputPolicy" is used because the environment's
# observation space is a Dict. Every tunable comes from the Settings cell;
# n_steps is passed explicitly so that changing NUMBER_OF_STEPS actually takes
# effect instead of silently falling back to the library default.
model = PPO(
    "MultiInputPolicy",
    env,
    verbose=int(USE_VERBOSE),  # SB3 expects an int verbosity level (0 = silent, 1 = info)
    seed=42,
    learning_rate=LEARNING_RATE,
    n_steps=NUMBER_OF_STEPS,
    n_epochs=NUMBER_OF_EPOCHS,
    batch_size=BATCH_SIZE,
    tensorboard_log=LOG_DIR,
)

Using cuda device


In [7]:
# Display the policy network that was built for the model.
model.policy

MultiInputActorCriticPolicy(
  (features_extractor): CombinedExtractor(
    (extractors): ModuleDict(
      (coin_dists): Flatten(start_dim=1, end_dim=-1)
      (coins): Flatten(start_dim=1, end_dim=-1)
      (collected_coins): Flatten(start_dim=1, end_dim=-1)
      (lane_coins): Flatten(start_dim=1, end_dim=-1)
      (lane_obstacles): Flatten(start_dim=1, end_dim=-1)
      (obstacle_dists): Flatten(start_dim=1, end_dim=-1)
      (obstacles): Flatten(start_dim=1, end_dim=-1)
      (player_pos): Flatten(start_dim=1, end_dim=-1)
      (score): Flatten(start_dim=1, end_dim=-1)
      (speed): Flatten(start_dim=1, end_dim=-1)
    )
  )
  (pi_features_extractor): CombinedExtractor(
    (extractors): ModuleDict(
      (coin_dists): Flatten(start_dim=1, end_dim=-1)
      (coins): Flatten(start_dim=1, end_dim=-1)
      (collected_coins): Flatten(start_dim=1, end_dim=-1)
      (lane_coins): Flatten(start_dim=1, end_dim=-1)
      (lane_obstacles): Flatten(start_dim=1, end_dim=-1)
      (obstacle_

### Training

In [8]:
# Train the agent. The budget is cast to int because `learn` documents
# total_timesteps as an integer while the Settings cell may yield a float.
# The trailing semicolon keeps the returned model's repr out of the cell output.
model.learn(
    total_timesteps=int(TOTAL_TIMESTEPS),
    tb_log_name=MODEL_NAME,
);

Logging to logs\baseline_ppo_1
---------------------------------
| rollout/           |          |
|    ep_len_mean     | 397      |
|    ep_rew_mean     | -59.8    |
| time/              |          |
|    fps             | 594      |
|    iterations      | 1        |
|    time_elapsed    | 34       |
|    total_timesteps | 20480    |
---------------------------------
-----------------------------------------
| rollout/                |             |
|    ep_len_mean          | 406         |
|    ep_rew_mean          | -43.3       |
| time/                   |             |
|    fps                  | 380         |
|    iterations           | 2           |
|    time_elapsed         | 107         |
|    total_timesteps      | 40960       |
| train/                  |             |
|    approx_kl            | 0.007939317 |
|    clip_fraction        | 0.00727     |
|    clip_range           | 0.2         |
|    entropy_loss         | -1.09       |
|    explained_variance   | 0.000171    |

<stable_baselines3.ppo.ppo.PPO at 0x216f005a310>

### Evaluation

In [9]:
# Evaluate on a seed the agent has not trained with: training used seed 42,
# so the evaluation env is seeded with TOTAL_TIMESTEPS + 42.
eval_seed = int(TOTAL_TIMESTEPS + 42)
eval_env = make_vec_env(create_env, n_envs=1, seed=eval_seed)

# NOTE(review): the name `eval` shadows Python's builtin; it is kept because the
# following cell reads the result under this name.
eval = evaluate_policy(
    model,
    eval_env,
    n_eval_episodes=10,
)

In [10]:
# Report the two values produced by the evaluation cell: mean and std of the episode reward.
mean_reward, std_reward = eval
print("mean reward: ", mean_reward, "std reward: ", std_reward)

mean reward:  94.02935740000001 std reward:  153.13391394034295


### Save the Model

In [11]:
# Save the trained model under MODEL_NAME (per the Settings cell, the name of the model zip file).
model.save(MODEL_NAME)