# Training the Grow-R Environment with Recurrent PPO

## Importing Required Libraries

In [1]:
import os
import sys
import gymnasium as gym

# Make the directory above the notebook's working directory importable,
# so that packages living next to this folder can be imported below.
notebook_dir = os.getcwd()
parent_dir = os.path.abspath(os.path.join(notebook_dir, os.pardir))
sys.path.insert(0, parent_dir)

from sb3_contrib import RecurrentPPO
from stable_baselines3.common.env_util import make_vec_env
from stable_baselines3.common.callbacks import EvalCallback, StopTrainingOnRewardThreshold
from stable_baselines3.common.evaluation import evaluate_policy
from stable_baselines3.common.vec_env import VecNormalize
from stable_baselines3.common import logger
from collections import OrderedDict
from training_utils import SaveOnIntervalCallback, visualise_training_logs
from plantos_env import PlantOSEnv

  from pkg_resources import resource_stream, resource_exists


## Define paths for saving models and logs

In [2]:
# Label of this training run; used as the sub-directory name for its
# models and logs, and in the saved model's file name.
training_run = "10M"

# Root directory for logs (kept for optional TensorBoard logging).
TENSORBOARD_LOG_DIR = "PPO_Training/logs"

# Per-run output locations.
MODEL_DIR = os.path.join("PPO_Training/models", training_run)
LOG_DIR = os.path.join(TENSORBOARD_LOG_DIR, training_run)

# Create the output directories up front; existing ones are reused.
for output_dir in (MODEL_DIR, LOG_DIR):
    os.makedirs(output_dir, exist_ok=True)

## Setting PPO Hyperparameters

In [3]:
# PPO hyperparameters and run settings.
#
# The optimiser settings (batch_size ... n_steps), 'policy' and
# 'policy_kwargs' are passed to the RecurrentPPO constructor.
# 'normalize' decides whether the vectorised environment is wrapped in
# VecNormalize, and 'normalize_kwargs' supplies that wrapper's
# norm_obs / norm_reward flags.
config = OrderedDict([
    ('batch_size', 64),
    ('clip_range', 0.18),
    ('ent_coef', 0.0),
    ('gae_lambda', 0.95),
    ('gamma', 0.999),
    ('learning_rate', 0.0003),
    ('n_epochs', 10),
    ('n_steps', 2048),
    # Total number of environment steps to train for. Stored as an int
    # (a step count is not a float) and set to 10 million so that it
    # agrees with the "10M" run label; it was previously 1000000.0.
    ('n_timesteps', 10_000_000),
    ('normalize', True),
    ('policy', 'MlpLstmPolicy'),
    ('policy_kwargs', dict(net_arch=[256, 256])),
    ('normalize_kwargs', {'norm_obs': True, 'norm_reward': False}),
])

## Initialising the Environment

In [4]:
# Number of environment copies run side by side in the vectorised env.
n_envs = 4

# Keyword arguments handed to every environment instance on creation.
env_kwargs = dict(
    grid_size=21,
    num_plants=20,
    num_obstacles=12,
    lidar_range=6,
    lidar_channels=32,
    observation_mode='grid',
    thirsty_plant_prob=0.5,
)

# Build the vectorised environment from the registered environment ID.
env = make_vec_env('PlantOS-v0', n_envs=n_envs, env_kwargs=env_kwargs)

# Optionally wrap it in VecNormalize, using the flags from the config and
# clipping normalised observations to +/- 10.
if config['normalize']:
    normalize_kwargs = config['normalize_kwargs']
    env = VecNormalize(
        env,
        norm_obs=normalize_kwargs['norm_obs'],
        norm_reward=normalize_kwargs['norm_reward'],
        clip_obs=10.0,
    )

  logger.warn(


## Initialising the Model

In [5]:
# Collect the constructor arguments first so the hyperparameters taken
# from `config` are easy to inspect in one place.
# TensorBoard logging is currently disabled; add
# tensorboard_log=TENSORBOARD_LOG_DIR to enable it.
ppo_kwargs = dict(
    device='cpu',
    verbose=1,
    learning_rate=float(config['learning_rate']),
    batch_size=int(config['batch_size']),
    gamma=float(config['gamma']),
    clip_range=config['clip_range'],
    ent_coef=config['ent_coef'],
    gae_lambda=config['gae_lambda'],
    n_epochs=config['n_epochs'],
    n_steps=config['n_steps'],
    policy_kwargs=config['policy_kwargs'],
)

# Recurrent PPO agent acting on the (possibly normalised) vectorised env.
model = RecurrentPPO(config['policy'], env, **ppo_kwargs)

# Send training logs to stdout and to CSV files under LOG_DIR.
new_logger = logger.configure(LOG_DIR, ["stdout", "csv"])
model.set_logger(new_logger)

Using cpu device
Logging to PPO_Training/logs\10M


## Setting up Callbacks

In [6]:
# Interval passed to the periodic-save callback, which writes into MODEL_DIR.
save_interval = 100_000
save_callback = SaveOnIntervalCallback(save_interval, MODEL_DIR)

# Callback list handed to model.learn(); further callbacks can be appended.
combined_callbacks = []
combined_callbacks.append(save_callback)

## Training the Model

In [7]:
# Train the agent, evaluate it, and save the results.
#
# The whole run is wrapped in try/finally so that the environment is
# always closed (and its normalisation statistics saved) even when
# training is interrupted or fails part-way through.
print("Starting PPO training with Stable Baselines3...")
try:
    model.learn(
        # The step budget comes from the config; cast to int because
        # learn() expects an integer number of timesteps.
        total_timesteps=int(config['n_timesteps']),
        callback=combined_callbacks
    )
    print("PPO Training Finished.")
    print(f"Total timesteps trained: {model.num_timesteps}")

    # Freeze the running normalisation statistics and report raw rewards
    # during evaluation; otherwise evaluating on the training env would
    # keep updating the statistics.
    if isinstance(env, VecNormalize):
        env.training = False
        env.norm_reward = False

    # Evaluate the trained model
    mean_reward, std_reward = evaluate_policy(model, env, n_eval_episodes=100)
    print(f"This is the avg reward: {mean_reward:.2f} +/- {std_reward:.2f}")

    # Save the final model
    model.save(os.path.join(MODEL_DIR, f"ppo_plantos_final_model-{training_run}"))
finally:
    try:
        # A saved model is only usable with the observation statistics it
        # was trained with, so persist them next to the model files.
        if isinstance(env, VecNormalize):
            env.save(os.path.join(MODEL_DIR, f"vecnormalize-{training_run}.pkl"))
    finally:
        env.close()

Starting PPO training with Stable Baselines3...
---------------------------------
| rollout/           |          |
|    ep_len_mean     | 40.1     |
|    ep_rew_mean     | 149      |
| time/              |          |
|    fps             | 2149     |
|    iterations      | 1        |
|    time_elapsed    | 3        |
|    total_timesteps | 8192     |
---------------------------------
-----------------------------------------
| rollout/                |             |
|    ep_len_mean          | 61          |
|    ep_rew_mean          | 228         |
| time/                   |             |
|    fps                  | 146         |
|    iterations           | 2           |
|    time_elapsed         | 111         |
|    total_timesteps      | 16384       |
| train/                  |             |
|    approx_kl            | 0.010650488 |
|    clip_fraction        | 0.171       |
|    clip_range           | 0.18        |
|    entropy_loss         | -1.6        |
|    explained_variance 

KeyboardInterrupt: 