In [None]:
import os
import math
import time
import random
import numpy as np
import matplotlib.pyplot as plt

import gymnasium as gym
from gymnasium import spaces

from stable_baselines3 import PPO
from stable_baselines3.common.callbacks import BaseCallback

# Only needed if visualize=True
try:
    import pygame
except ImportError:
    pygame = None

###############################################################################
# ENVIRONMENT CONSTANTS
###############################################################################
# World / window size in pixels, as (width, height).
FULL_VIEW_SIZE = (1200, 800)
# Per-axis scale of the view relative to a 600-unit reference extent.
SCALING_FACTOR_X = FULL_VIEW_SIZE[0] / 600.0
SCALING_FACTOR_Y = FULL_VIEW_SIZE[1] / 600.0
# Single isotropic scale: the mean of the two per-axis factors.
SCALING_FACTOR   = (SCALING_FACTOR_X + SCALING_FACTOR_Y) / 2

# Sizes below are scaled and then truncated toward zero by int(), so the
# resulting integers can be one less than the rounded product.
DOT_RADIUS       = int(15 * SCALING_FACTOR)
TARGET_RADIUS    = int(10 * SCALING_FACTOR)
OBSTACLE_RADIUS  = int(10 * SCALING_FACTOR)
# Extra clearance added to OBSTACLE_RADIUS by line_collision().
COLLISION_BUFFER = int(5  * SCALING_FACTOR)
# Length of the displacement applied to the dot on every environment step.
MAX_SPEED        = 3 * SCALING_FACTOR
# Standard deviation of the Gaussian noise added to the potential-field direction.
NOISE_MAGNITUDE  = 0.5
# Frame-rate cap used by the pygame clock and advertised in the env metadata.
RENDER_FPS       = 30

# The start position is fixed in the middle.
# (Integer division is applied before the float32 conversion.)
START_POS = np.array([FULL_VIEW_SIZE[0]//2, FULL_VIEW_SIZE[1]//2], dtype=np.float32)

# RGB colour tuples used when drawing with pygame.
WHITE = (255,255,255)
GRAY  = (128,128,128)
YELLOW= (255,255,0)
BLACK = (0,0,0)

###############################################################################
# UTILITY FUNCTIONS
###############################################################################
def distance(a, b):
    """Return the Euclidean distance between 2-D points `a` and `b`.

    Only the first two components of each argument are read, so tuples,
    lists and NumPy arrays are all accepted.
    """
    delta_x = a[0] - b[0]
    delta_y = a[1] - b[1]
    return math.hypot(delta_x, delta_y)

def check_line_collision(start, end, center, radius):
    """Tell whether the segment `start`-`end` passes within `radius` of `center`.

    The closest point of the segment to `center` is found by projecting
    `center` onto the segment and clamping the projection parameter to
    [0, 1]; a (near) zero-length segment is treated as the single point
    `start`.
    """
    seg_x = end[0] - start[0]
    seg_y = end[1] - start[1]
    seg_len_sq = seg_x*seg_x + seg_y*seg_y

    # Degenerate segment: fall back to a plain point-in-circle test.
    if seg_len_sq < 1e-9:
        return distance(start, center) <= radius

    rel_x = center[0] - start[0]
    rel_y = center[1] - start[1]
    projection = (rel_x*seg_x + rel_y*seg_y) / seg_len_sq
    t = max(0, min(1, projection))

    closest = (start[0] + t*seg_x, start[1] + t*seg_y)
    return distance(closest, center) <= radius

def line_collision(pos, new_pos, obstacles):
    """Return True if moving from `pos` to `new_pos` grazes any obstacle.

    Each obstacle is treated as a circle of radius
    OBSTACLE_RADIUS + COLLISION_BUFFER centred on its position.
    """
    return any(
        check_line_collision(pos, new_pos, obstacle, OBSTACLE_RADIUS + COLLISION_BUFFER)
        for obstacle in obstacles
    )

def inside_obstacle(pos, obstacles):
    """Return True if a dot centred at `pos` overlaps any obstacle.

    Overlap means the centre-to-centre distance is at most
    OBSTACLE_RADIUS + DOT_RADIUS.
    """
    return any(
        distance(pos, obstacle) <= (OBSTACLE_RADIUS + DOT_RADIUS)
        for obstacle in obstacles
    )

def potential_field_dir(pos, goal, obstacles):
    """
    Return the unit steering direction produced by a simple potential field.

    The field is the sum of a unit-length pull toward `goal` and, for every
    obstacle closer than the repulsion radius, an inverse-square push away
    from it. A zero vector is returned when `pos` coincides with `goal` or
    when pull and push cancel out. The result is a float32 array of shape (2,).
    """
    to_goal_x = goal[0] - pos[0]
    to_goal_y = goal[1] - pos[1]
    goal_dist = math.hypot(to_goal_x, to_goal_y)
    if goal_dist < 1e-6:
        # Already at the goal: there is no meaningful direction.
        return np.zeros(2, dtype=np.float32)
    attraction = np.array([to_goal_x/goal_dist, to_goal_y/goal_dist], dtype=np.float32)

    repulsion_radius = 150.0 * SCALING_FACTOR
    repulsion_gain   = 15000.0
    push_x = 0.0
    push_y = 0.0

    for obstacle in obstacles:
        away_x = pos[0] - obstacle[0]
        away_y = pos[1] - obstacle[1]
        gap = math.hypot(away_x, away_y)
        # Ignore obstacles we sit exactly on (no direction) or that are out of range.
        if not (1e-9 <= gap < repulsion_radius):
            continue
        strength = repulsion_gain/(gap**2)
        push_x += (away_x/gap)*strength
        push_y += (away_y/gap)*strength

    total_x = attraction[0] + push_x
    total_y = attraction[1] + push_y
    magnitude = math.hypot(total_x, total_y)
    if magnitude < 1e-9:
        return np.zeros(2, dtype=np.float32)
    return np.array([total_x/magnitude, total_y/magnitude], dtype=np.float32)

###############################################################################
# CALLBACK WITH MOVING AVERAGE METRICS
###############################################################################
class MetricsCallback(BaseCallback):
    """
    Stable-Baselines3 callback that records training statistics and plots them.

    Collected while training:
      * per-episode total reward and mean arbitration gamma,
      * number of finished episodes and how many ended in a collision,
      * the `train/*` loss values found in the model's logger at the end of
        every rollout.

    Only the first sub-environment (index 0) of the vectorised env is tracked.
    Call `save_metrics()` after training to write PNG plots and a text summary.
    """

    def __init__(self, verbose=0):
        super().__init__(verbose)
        # One entry per finished episode.
        self.episode_rewards = []
        self.episode_gammas  = []
        # Accumulators for the episode currently in progress.
        self.current_episode_gammas = []
        self.total_reward = 0.0

        # Training metrics, one entry per rollout in which they were available.
        self.training_steps = []
        self.losses = []
        self.value_losses = []
        self.policy_losses = []
        self.entropy_losses = []

        self.n_collisions = 0
        self.n_episodes   = 0
        # Action bounds, read lazily from the model on the first step.
        self.action_low  = None
        self.action_high = None
        self.n_updates = 0

    def _on_training_start(self):
        """Reset the episode and collision counters at the start of `learn()`."""
        self.n_collisions = 0
        self.n_episodes = 0

    def _on_step(self) -> bool:
        """
        Called at each environment step (i.e., for each action taken).
        We collect reward and gamma stats here.

        Returns True so that training always continues.
        """
        if self.action_low is None:
            # Just read from the model's action space once. min/max reduce the
            # bounds to scalars whatever the shape of the Box is.
            action_space = self.model.action_space
            self.action_low  = float(np.min(action_space.low))
            self.action_high = float(np.max(action_space.high))

        actions = self.locals['actions']
        rewards = self.locals['rewards']
        done    = self.locals['dones'][0]
        infos   = self.locals['infos']

        # Same action -> gamma mapping as the environment: clip, then map
        # [-1, 1] onto [0, 1]. reshape(-1)[0] accepts 0-d and 1-element actions.
        raw_action = float(np.asarray(actions[0], dtype=np.float64).reshape(-1)[0])
        raw_action = max(self.action_low, min(self.action_high, raw_action))
        gamma = 0.5 * (raw_action + 1.0)
        # Record gamma for every step, including the terminal one, so the
        # episode average covers all actions that were actually taken.
        self.current_episode_gammas.append(gamma)

        self.total_reward += float(rewards[0])

        if done:
            # Episode is finished, store stats
            self.episode_rewards.append(self.total_reward)
            avg_g = np.mean(self.current_episode_gammas) if self.current_episode_gammas else 0.0
            self.episode_gammas.append(avg_g)
            self.total_reward = 0.0
            self.current_episode_gammas.clear()
            self.n_episodes += 1

            if infos[0].get("terminal_reason") == "collision":
                self.n_collisions += 1

        return True

    def _on_rollout_end(self):
        """
        Called when a rollout has been collected.

        Whatever `train/*` values the model's logger currently holds are
        stored. NOTE(review): this hook appears to run before the policy
        update for the rollout that just ended, so the values read here would
        come from the previous update and none exist after the first rollout -
        confirm against the Stable-Baselines3 training loop.
        """
        # Grab the training logs from stable-baselines3
        logs = self.model.logger.name_to_value

        # These keys only exist once at least one update has been logged:
        # train/loss, train/policy_gradient_loss, train/value_loss, train/entropy_loss
        if "train/loss" in logs:
            self.losses.append(logs["train/loss"])
            self.n_updates += 1
            self.training_steps.append(self.n_updates)

        if "train/policy_gradient_loss" in logs:
            self.policy_losses.append(logs["train/policy_gradient_loss"])
        if "train/value_loss" in logs:
            self.value_losses.append(logs["train/value_loss"])
        if "train/entropy_loss" in logs:
            self.entropy_losses.append(logs["train/entropy_loss"])

    def _moving_average(self, data, window=10):
        """
        Return the simple moving average of `data` over `window` samples.

        The result has `len(data) - window + 1` entries. If there are fewer
        than `window` samples, the data is returned unchanged as an array.
        """
        if len(data) < window:
            return np.array(data)
        return np.convolve(data, np.ones(window)/window, mode='valid')

    def _plot_series(self, save_dir, filename, values, x_values, *, label,
                     xlabel, ylabel, title, color=None, ma_color=None, window=10):
        """Plot `values` against `x_values` plus a moving average; save as PNG."""
        raw_style = {"label": label, "alpha": 0.6}
        ma_style = {"label": f"Moving Average (window={window})", "linewidth": 2}
        # Only pass a colour when one was requested so the default colour
        # cycle is used otherwise.
        if color is not None:
            raw_style["color"] = color
        if ma_color is not None:
            ma_style["color"] = ma_color

        plt.figure(figsize=(10, 6))
        plt.plot(x_values, values, **raw_style)
        # Only draw the overlay when a full window exists; with fewer samples
        # _moving_average returns the raw data, which must not be drawn as an
        # average.
        if len(values) >= window:
            smoothed = self._moving_average(values, window=window)
            # The moving average array is shorter, so align it with the last
            # sample of each window.
            plt.plot(x_values[window - 1:], smoothed, **ma_style)
        plt.xlabel(xlabel)
        plt.ylabel(ylabel)
        plt.title(title)
        plt.grid(True)
        plt.legend()
        plt.savefig(os.path.join(save_dir, filename))
        plt.close()

    def save_metrics(self, save_dir="training_metrics"):
        """
        Write all available plots and `training_summary.txt` into `save_dir`.

        The directory is created if needed. A plot is skipped when its data
        is empty; the value, policy and entropy loss plots are also skipped
        when their length differs from the total-loss series.
        """
        os.makedirs(save_dir, exist_ok=True)

        # Plot Episode Reward with Moving Average
        if self.episode_rewards:
            self._plot_series(
                save_dir, "episode_reward.png",
                self.episode_rewards, list(range(len(self.episode_rewards))),
                label="Episode Reward", xlabel="Episode", ylabel="Reward",
                title="Episode Reward Over Time",
            )

        # Plot Average Gamma with Moving Average
        if self.episode_gammas:
            self._plot_series(
                save_dir, "average_gamma.png",
                self.episode_gammas, list(range(len(self.episode_gammas))),
                label="Average Gamma", xlabel="Episode", ylabel="Gamma",
                title="Average Gamma per Episode",
            )

        # Plot Loss with Moving Average (if we have loss data)
        if self.losses:
            self._plot_series(
                save_dir, "training_loss.png",
                self.losses, self.training_steps,
                label="Training Loss", xlabel="Training Updates", ylabel="Loss",
                title="Total Loss During Training",
            )

        # The remaining loss series share the x-axis of the total loss, so
        # they are only plotted when they have the same number of samples.
        n_losses = len(self.losses)

        # Plot value loss if available
        if self.value_losses and len(self.value_losses) == n_losses:
            self._plot_series(
                save_dir, "value_loss.png",
                self.value_losses, self.training_steps,
                label="Value Loss", xlabel="Training Updates", ylabel="Value Loss",
                title="Value Loss During Training",
                color='red', ma_color='darkred',
            )

        # Plot policy loss if available
        if self.policy_losses and len(self.policy_losses) == n_losses:
            self._plot_series(
                save_dir, "policy_loss.png",
                self.policy_losses, self.training_steps,
                label="Policy Loss", xlabel="Training Updates", ylabel="Policy Loss",
                title="Policy Gradient Loss During Training",
                color='green', ma_color='darkgreen',
            )

        # Plot entropy loss if available (not strictly asked, but often useful)
        if self.entropy_losses and len(self.entropy_losses) == n_losses:
            self._plot_series(
                save_dir, "entropy_loss.png",
                self.entropy_losses, self.training_steps,
                label="Entropy Loss", xlabel="Training Updates", ylabel="Entropy Loss",
                title="Entropy Loss During Training",
                color='purple', ma_color='darkmagenta',
            )

        # Write out summary text file
        with open(os.path.join(save_dir, "training_summary.txt"), 'w') as f:
            f.write(f"Total Episodes: {len(self.episode_rewards)}\n")
            if self.episode_rewards:
                f.write(f"Mean Reward: {np.mean(self.episode_rewards):.3f}\n")
                f.write(f"Mean Gamma: {np.mean(self.episode_gammas):.3f}\n")
            if self.losses:
                f.write(f"Final Loss: {self.losses[-1]:.6f}\n")
                f.write(f"Mean Loss: {np.mean(self.losses):.6f}\n")
            if self.value_losses:
                f.write(f"Mean Value Loss: {np.mean(self.value_losses):.6f}\n")
            if self.policy_losses:
                f.write(f"Mean Policy Loss: {np.mean(self.policy_losses):.6f}\n")
            if self.entropy_losses:
                f.write(f"Mean Entropy Loss: {np.mean(self.entropy_losses):.6f}\n")
            f.write(f"Collisions Count: {self.n_collisions}\n")

###############################################################################
# ENVIRONMENT: DemoArbitrationEnv with Alternating Fixed Scenarios and Reward Shaping
###############################################################################
class DemoArbitrationEnv(gym.Env):
    """
    Shared-control arbitration task: a dot is steered from the screen centre
    toward one of several goals while avoiding obstacles.

    Action: a single scalar in [-1, 1], mapped linearly to gamma in [0, 1].
    Gamma blends the obstacle-aware potential-field direction (weight gamma)
    with a noisy copy of it (weight 1 - gamma).

    Observation (9 float32 values):
        [dot_x, dot_y, noisy_dir_x, noisy_dir_y, goal_x, goal_y,
         field_dir_x, field_dir_y, distance_to_goal / screen_diagonal]

    Reward per step:
      * +alpha * gamma if the dot is within `goal_threshold` of the goal or
        within `obs_threshold` of any obstacle (a "risky" region),
        otherwise -beta * gamma;
      * an additional -2 and termination when the dot overlaps an obstacle.
    Episodes are truncated after `max_steps` steps; reaching the goal does not
    end the episode.

    Layouts: with `fixed_scenario_seed` the same layout is rebuilt on every
    reset. Otherwise, with `scenario_mode=True`, every 10th reset switches to
    the next seed of [0, 1, 2, 58, 487]; with `scenario_mode=False`, every
    10th reset draws a fresh unseeded layout.

    NOTE: building a seeded layout reseeds the global `random` and
    `np.random` generators, which also drive the steering noise.
    """
    metadata = {"render_modes": ["human"], "render_fps": RENDER_FPS}

    def __init__(self, visualize=False, scenario_mode=True, fixed_scenario_seed=None):
        """
        Args:
            visualize: open a pygame window and draw every step (ignored if
                pygame is not installed).
            scenario_mode: cycle through the fixed scenario seeds.
            fixed_scenario_seed: if not None, always use this layout seed.
        """
        super().__init__()
        self.visualize = visualize

        low  = np.array([0,0, -1,-1, 0,0, -1,-1, 0], dtype=np.float32)
        high = np.array([
            FULL_VIEW_SIZE[0], FULL_VIEW_SIZE[1],
            1,1,
            FULL_VIEW_SIZE[0], FULL_VIEW_SIZE[1],
            1,1,
            1
        ], dtype=np.float32)
        self.observation_space = spaces.Box(low=low, high=high, shape=(9,), dtype=np.float32)
        self.action_space = spaces.Box(low=-1.0, high=1.0, shape=(), dtype=np.float32)

        self.dot_pos = None
        self.goal_pos = None
        self.step_count = 0
        self.max_steps = 300
        self.episode_reward = 0.0
        # Screen diagonal, used to normalise the distance-to-goal feature.
        self.max_dist = math.hypot(FULL_VIEW_SIZE[0], FULL_VIEW_SIZE[1])
        self.close_threshold = 10.0
        # Shaping weights: alpha rewards gamma in risky regions, beta
        # penalises it elsewhere.
        self.alpha = 3.0
        self.beta  = 3.0

        # Additional thresholds for reward shaping.
        self.goal_threshold = 100.0   # if within 100 units of the goal, high gamma is good.
        self.obs_threshold  = 100.0   # if within 100 units of any obstacle, high gamma is good.

        # For reproducible scenarios:
        self.scenario_mode = scenario_mode
        self.fixed_scenario_seed = fixed_scenario_seed
        self.scenario_seed = fixed_scenario_seed  # if fixed, use it always
        self.last_scenario_seed = None
        # Fixed seeds to alternate between (matching demo): 0, 1, 2, 58, 487.
        self.SCENARIO_SEEDS = [0, 1, 2, 58, 487]
        self.scenario_index = 0

        # Initially randomize the environment.
        self.randomize_env()

        if self.visualize and pygame is not None:
            pygame.init()
            self.window = pygame.display.set_mode(FULL_VIEW_SIZE)
            pygame.display.set_caption("Demo Arbitration Environment")
            self.clock = pygame.time.Clock()
        else:
            self.window = None
            self.clock = None

        self.episode_counter = 0

    def _pick_goal(self):
        """Return a copy of a randomly chosen goal, or a random point if none exist."""
        if self.goals:
            idx = random.randint(0, len(self.goals) - 1)
            return self.goals[idx].copy()
        # No goal could be placed: fall back to a point in the central region.
        return np.array([
            random.uniform(0.2*FULL_VIEW_SIZE[0], 0.8*FULL_VIEW_SIZE[0]),
            random.uniform(0.2*FULL_VIEW_SIZE[1], 0.8*FULL_VIEW_SIZE[1])
        ], dtype=np.float32)

    def randomize_env(self):
        """
        Generate goals and obstacles exactly like in the demo code.

        Sets `self.goals`, `self.obstacles` and `self.goal_pos`. When
        `self.scenario_seed` is not None the global `random` and `np.random`
        generators are reseeded first so the layout is reproducible.
        """
        if self.scenario_seed is not None:
            random.seed(self.scenario_seed)
            np.random.seed(self.scenario_seed)

        N_GOALS = 8
        N_OBSTACLES = 5
        margin = 50 * SCALING_FACTOR
        min_goal_distance = 300 * SCALING_FACTOR

        # Rejection-sample goals that are far from the start and do not
        # overlap each other; give up after 1000 attempts.
        new_goals = []
        attempts = 0
        while len(new_goals) < N_GOALS and attempts < 1000:
            x = random.uniform(margin, FULL_VIEW_SIZE[0] - margin)
            y = random.uniform(margin, FULL_VIEW_SIZE[1] - margin)
            candidate = np.array([x, y], dtype=np.float32)
            if distance(candidate, START_POS) >= min_goal_distance:
                if all(distance(candidate, g) >= TARGET_RADIUS * 2 for g in new_goals):
                    new_goals.append(candidate)
            attempts += 1
        self.goals = new_goals

        new_obstacles = []
        # For obstacles, we choose a subset of goals to attach obstacles.
        if len(new_goals) > 1:
            obstacle_goals = random.sample(new_goals, k=min(min(N_GOALS-1, N_OBSTACLES), len(new_goals)-1))
        else:
            obstacle_goals = new_goals
        for goal in obstacle_goals:
            vec = goal - START_POS
            # Place obstacle between start and goal: choose t in [0.6, 0.8]
            t = random.uniform(0.6, 0.8)
            base_point = START_POS + t * vec
            # Compute the unit vector perpendicular to the start->goal line.
            if np.linalg.norm(vec) < 1e-6:
                perp = np.array([0, 0], dtype=np.float32)
            else:
                perp = np.array([-vec[1], vec[0]], dtype=np.float32)
                perp /= np.linalg.norm(perp)
            # Apply offset: push obstacle away from the direct line.
            offset_mag = random.uniform(20 * SCALING_FACTOR, 40 * SCALING_FACTOR)
            offset = perp * offset_mag * random.choice([-1,1])
            candidate = base_point + offset
            candidate[0] = np.clip(candidate[0], margin, FULL_VIEW_SIZE[0] - margin)
            candidate[1] = np.clip(candidate[1], margin, FULL_VIEW_SIZE[1] - margin)

            # Reject candidates too close to the start, to their goal (to keep
            # the goal visible) or to an already accepted obstacle.
            too_close_to_start = distance(candidate, START_POS) < (DOT_RADIUS + OBSTACLE_RADIUS + 10)
            too_close_to_goal = distance(candidate, goal) < (TARGET_RADIUS + OBSTACLE_RADIUS + 20)
            overlaps_obstacle = any(
                distance(candidate, obs) < (2 * OBSTACLE_RADIUS + 10)
                for obs in new_obstacles
            )
            if not (too_close_to_start or too_close_to_goal or overlaps_obstacle):
                new_obstacles.append(candidate)
        self.obstacles = new_obstacles

        # Randomly choose a goal for the episode. _pick_goal avoids the
        # randint(0, -1) error when no goal could be placed.
        self.goal_pos = self._pick_goal()

    def reset(self, seed=None, options=None):
        """
        Start a new episode and return `(observation, info)`.

        Depending on the scenario settings the layout may be rebuilt (see the
        class docstring); the dot returns to START_POS and a goal is drawn
        at random from the current layout.
        """
        super().reset(seed=seed)
        self.episode_counter += 1
        # Alternate scenarios: if fixed seed is provided, always use it.
        if self.fixed_scenario_seed is not None:
            self.scenario_seed = self.fixed_scenario_seed
            self.last_scenario_seed = self.scenario_seed
            self.randomize_env()
        elif self.scenario_mode:
            if self.episode_counter % 10 == 0:
                self.scenario_seed = self.SCENARIO_SEEDS[self.scenario_index]
                self.last_scenario_seed = self.scenario_seed
                self.scenario_index = (self.scenario_index + 1) % len(self.SCENARIO_SEEDS)
                self.randomize_env()
        else:
            if self.episode_counter % 10 == 0:
                self.randomize_env()

        self.step_count = 0
        self.episode_reward = 0.0
        self.dot_pos = START_POS.copy()
        self.goal_pos = self._pick_goal()
        return self._get_obs(), {}

    def step(self, action):
        """
        Advance the simulation by one step.

        Args:
            action: scalar (or one-element array) in [-1, 1]; values outside
                that range are clipped.

        Returns:
            `(observation, reward, terminated, truncated, info)` where
            `info["terminal_reason"]` is "collision", "timeout" or None.
        """
        # reshape(-1)[0] accepts Python floats, 0-d arrays and shape-(1,) arrays.
        raw_a = float(np.asarray(action, dtype=np.float64).reshape(-1)[0])
        raw_a = np.clip(raw_a, -1.0, 1.0)
        gamma_val = 0.5*(raw_a+1.0)  # maps action [-1,1] to gamma in [0,1]
        self.step_count += 1

        # Compute the perfect direction.
        w_dir = potential_field_dir(self.dot_pos, self.goal_pos, self.obstacles)
        noise = np.random.normal(0, NOISE_MAGNITUDE, size=2)
        h_dir = w_dir + noise
        hm = np.hypot(h_dir[0], h_dir[1])
        if hm > 1e-6:
            h_dir /= hm

        # Arbitration between perfect and noisy directions.
        c_dir = gamma_val * w_dir + (1-gamma_val)*h_dir
        cm = np.hypot(c_dir[0], c_dir[1])
        if cm > 1e-6:
            c_dir /= cm

        move_vec = c_dir * MAX_SPEED
        new_pos = self.dot_pos + move_vec
        # The move is dropped entirely if its path grazes an obstacle.
        if not line_collision(self.dot_pos, new_pos, self.obstacles):
            new_pos[0] = np.clip(new_pos[0], 0, FULL_VIEW_SIZE[0])
            new_pos[1] = np.clip(new_pos[1], 0, FULL_VIEW_SIZE[1])
            self.dot_pos = new_pos

        collided = inside_obstacle(self.dot_pos, self.obstacles)
        info = {}
        if collided:
            original_reward = -2.0
            done = True
            info["terminal_reason"] = "collision"
        else:
            original_reward = 0.0
            done = False
            info["terminal_reason"] = None

        truncated = (self.step_count >= self.max_steps)
        if truncated and not done:
            info["terminal_reason"] = "timeout"

        # Reward shaping: if the dot is near the goal OR near any obstacle, reward high gamma.
        d_goal = distance(self.dot_pos, self.goal_pos)
        d_obs = min((distance(self.dot_pos, obs) for obs in self.obstacles), default=1e6)
        if d_goal < self.goal_threshold or d_obs < self.obs_threshold:
            shaping_reward = self.alpha * gamma_val
        else:
            shaping_reward = -self.beta * gamma_val

        reward = original_reward + shaping_reward
        self.episode_reward += reward

        obs = self._get_obs()
        # Draw the new state when a window was requested; without this the
        # window opened in __init__ is never updated.
        if self.visualize:
            self._render()
        return obs, float(reward), done, truncated, info

    def _get_obs(self):
        """Build the 9-element float32 observation described in the class docstring."""
        to_g = self.goal_pos - self.dot_pos
        d = math.hypot(to_g[0], to_g[1])
        dist_ratio = d / self.max_dist if self.max_dist > 1e-6 else 0.0

        w_dir = potential_field_dir(self.dot_pos, self.goal_pos, self.obstacles)
        # NOTE(review): this noise sample is drawn independently of the one
        # used inside step(), so the observed noisy direction is not the one
        # applied to the next move - confirm this is intended.
        noise = np.random.normal(0, NOISE_MAGNITUDE, size=2)
        h_dir = w_dir + noise
        hm = math.hypot(h_dir[0], h_dir[1])
        if hm > 1e-6:
            h_dir /= hm

        obs = np.concatenate([
            self.dot_pos,
            h_dir,
            self.goal_pos,
            w_dir,
            [dist_ratio]
        ]).astype(np.float32)
        return obs

    def render(self):
        """Draw the current state to the pygame window, if one is open."""
        self._render()

    def _render(self):
        """Redraw obstacles, goals, the active goal outline and the dot."""
        if not self.window or not pygame:
            return

        # Let pygame process OS window events so the window stays responsive.
        pygame.event.pump()

        self.window.fill(WHITE)
        for obs in self.obstacles:
            pygame.draw.circle(self.window, GRAY, (int(obs[0]), int(obs[1])), OBSTACLE_RADIUS)
        for gpos in self.goals:
            pygame.draw.circle(self.window, YELLOW, (int(gpos[0]), int(gpos[1])), TARGET_RADIUS)
        pygame.draw.circle(self.window, BLACK, (int(self.goal_pos[0]), int(self.goal_pos[1])), TARGET_RADIUS+2, width=2)
        pygame.draw.circle(self.window, BLACK, (int(self.dot_pos[0]), int(self.dot_pos[1])), DOT_RADIUS, width=2)
        pygame.display.flip()
        self.clock.tick(RENDER_FPS)

    def close(self):
        """Shut pygame down (if it was started) and release the window."""
        if self.visualize and pygame is not None:
            pygame.quit()
        # Drop the handles so a late render call becomes a no-op.
        self.window = None
        self.clock = None
        super().close()

###############################################################################
# TRAINING FUNCTION
###############################################################################
def train(visualize=False, total_timesteps=300_000):
    """
    Train a PPO agent on DemoArbitrationEnv and save the model and metrics.

    Args:
        visualize: passed to the environment to enable the pygame window.
        total_timesteps: number of environment steps to train for.

    Pressing Ctrl-C during training stops it early; the partially trained
    model and the metrics gathered so far are still saved. The model is
    written to `trained_models/extreme_close_gamma_ppo2.zip` and the plots and
    summary to `training_metrics/`. The environment is closed even if
    training or saving raises.
    """
    from stable_baselines3.common.callbacks import CallbackList

    model_path = "trained_models/extreme_close_gamma_ppo2"

    # Enable scenario_mode so that the environment alternates through our fixed seeds.
    env = DemoArbitrationEnv(visualize=visualize, scenario_mode=True)
    try:
        metrics_callback = MetricsCallback()
        callback = CallbackList([metrics_callback])

        model = PPO(
            policy="MlpPolicy",
            env=env,
            learning_rate=3e-4,
            n_steps=1024,
            batch_size=1024,
            n_epochs=4,
            gamma=0.99,
            gae_lambda=0.95,
            clip_range=0.2,
            verbose=1,
            tensorboard_log="./ppo_tensorboard/"  # Add tensorboard logging
        )

        try:
            print(f"Starting PPO training (visualize={visualize}) ...")
            model.learn(total_timesteps=total_timesteps, callback=callback, log_interval=1)
        except KeyboardInterrupt:
            # Fall through so that whatever was learned so far is still saved.
            print("Training interrupted; saving partial model...")

        os.makedirs("trained_models", exist_ok=True)
        model.save(model_path)
        print(f"Model saved to {model_path}.zip")

        metrics_callback.save_metrics("training_metrics")
        print("Metrics saved in training_metrics/")
    finally:
        # Always release the environment (and its pygame window, if any).
        env.close()

###############################################################################
# MAIN
###############################################################################
if __name__ == "__main__":
    # Script entry point: pass "visualize" (any letter case) as the first
    # command-line argument to train with the pygame window enabled.
    import sys

    cli_args = sys.argv[1:]
    vis = bool(cli_args) and cli_args[0].lower() == "visualize"
    train(visualize=vis, total_timesteps=3_000_000)


Using cuda device
Wrapping the env with a `Monitor` wrapper
Wrapping the env in a DummyVecEnv.
Starting PPO training (visualize=False) ...
Logging to ./ppo_tensorboard/PPO_6
---------------------------------
| rollout/           |          |
|    ep_len_mean     | 300      |
|    ep_rew_mean     | 124      |
| time/              |          |
|    fps             | 682      |
|    iterations      | 1        |
|    time_elapsed    | 1        |
|    total_timesteps | 1024     |
---------------------------------
-------------------------------------------
| rollout/                |               |
|    ep_len_mean          | 300           |
|    ep_rew_mean          | 128           |
| time/                   |               |
|    fps                  | 690           |
|    iterations           | 2             |
|    time_elapsed         | 2             |
|    total_timesteps      | 2048          |
| train/                  |               |
|    approx_kl            | 0.00034638552 |
| 