# 1. Import Dependencies

In [1]:
import os

import gymnasium
import torch
from stable_baselines3 import PPO
from stable_baselines3.common.vec_env import DummyVecEnv
from stable_baselines3.common.evaluation import evaluate_policy

# 2. Load Environment

In [2]:
# Gymnasium environment id reused by the later cells that build environments.
environment_name = 'CartPole-v1'
# render_mode="human" is requested so this instance can draw to the screen
# while it is being explored interactively.
env = gymnasium.make(environment_name, render_mode="human")

In [3]:
# episodes = 5
# for episode in range(1, episodes + 1):
#     state, info = env.reset()
#     done = False
#     score = 0
# 
#     while not done:
#         action = env.action_space.sample()
#         n_state, reward, terminated, truncated, info = env.step(action)
#         done = terminated or truncated
#         score += reward
# 
#     print(f'Episode:{episode}, Score:{score}')
# env.close()

# 3. Understanding the Environment

In [4]:
# Inspect the set of actions the agent may take (the recorded output is Discrete(2)).
env.action_space

Discrete(2)

In [5]:
# Draw one random action from the action space.
env.action_space.sample()

0

In [6]:
# Draw one random observation; the recorded output is a 4-element float32 array.
env.observation_space.sample()

array([ 4.42379   , -1.4380158 , -0.23170045, -1.0911617 ], dtype=float32)

# 4. Train Model

In [7]:
# Directory passed to PPO as `tensorboard_log` when the model is created.
log_path = os.path.join('Training', 'Logs')


In [8]:
# Choose the compute device for training: the MPS backend when this PyTorch
# build reports it as available, otherwise the CPU.
# `torch` is imported together with the other dependencies in the first cell,
# so this cell no longer carries its own import.
if torch.backends.mps.is_available():
    device = torch.device("mps")
else:
    device = torch.device("cpu")
    print("MPS is not available, using CPU.")



In [9]:
# Build the vectorised training environment (no render mode, so nothing is drawn).
# The factory creates its own environment instead of closing over the
# notebook-level name `env`, which this very statement rebinds; the previous
# form only worked because the factory happened to be called before the
# rebinding took effect.
env = DummyVecEnv([lambda: gymnasium.make(environment_name)])
# PPO with an MLP policy; progress is printed (verbose=1) and TensorBoard logs
# are written under `log_path`.
model = PPO('MlpPolicy', env, verbose=1, tensorboard_log=log_path, device=device)

Using mps device




In [10]:
# Train for 20,000 environment timesteps. The call's return value is echoed as
# the cell output (the PPO object repr recorded below the training tables).
model.learn(total_timesteps=20000)

Logging to Training/Logs/PPO_1
-----------------------------
| time/              |      |
|    fps             | 366  |
|    iterations      | 1    |
|    time_elapsed    | 5    |
|    total_timesteps | 2048 |
-----------------------------
------------------------------------------
| time/                   |              |
|    fps                  | 303          |
|    iterations           | 2            |
|    time_elapsed         | 13           |
|    total_timesteps      | 4096         |
| train/                  |              |
|    approx_kl            | 0.0069779074 |
|    clip_fraction        | 0.0713       |
|    clip_range           | 0.2          |
|    entropy_loss         | -0.687       |
|    explained_variance   | 0.000318     |
|    learning_rate        | 0.0003       |
|    loss                 | 11.1         |
|    n_updates            | 10           |
|    policy_gradient_loss | -0.0109      |
|    value_loss           | 58.4         |
----------------------------

<stable_baselines3.ppo.ppo.PPO at 0x314fab7d0>

# 5. Save Model & Load Model

In [11]:
# Location the trained model is saved to and later loaded back from.
PPO_path = os.path.join('Training', 'Saved Models', 'PPO_model_CartPole')

In [12]:
# Persist the trained model at PPO_path.
model.save(PPO_path)

In [13]:
# Drop the in-memory model so that the next cell has to restore it from disk.
del model

In [14]:
# Restore the saved model and attach the vectorised training environment to it.
model = PPO.load(PPO_path, env=env)

# 6. Evaluate

In [15]:
# Step 1: Recreate the environment with on-screen rendering for evaluation
# (this rebinds `env`, replacing the vectorised training environment)
env = gymnasium.make("CartPole-v1", render_mode="human")

# Step 2: Define the path to the saved model (same location the model was saved to above)
PPO_path = os.path.join("Training", "Saved Models", "PPO_model_CartPole")

# Step 3: Load the saved model and attach the new environment to it
model = PPO.load(PPO_path, env=env)

# Step 4: Custom evaluation function
def evaluate_policy_with_rendering(model, env, n_eval_episodes=10, render=True):
    """
    Run a loaded model in an environment and summarise the episode returns.

    Every episode is played to completion with deterministic actions from
    ``model.predict``; the total reward of each episode is printed as soon as
    the episode finishes.

    Args:
        model: Object exposing ``predict(observation, deterministic=True)``
            that returns an ``(action, state)`` pair.
        env: Environment exposing ``reset()``, ``step(action)`` and
            ``render()``. ``reset()`` may return the observation itself or a
            tuple whose first item is the observation. ``step()`` may return
            the 5-tuple ``(obs, reward, terminated, truncated, info)`` or the
            legacy 4-tuple ``(obs, reward, done, info)``.
        n_eval_episodes: Number of episodes to play; must be at least 1.
        render: When true, ``env.render()`` is called before every step.

    Returns:
        Tuple ``(mean_reward, std_reward)``: the mean and the population
        standard deviation of the per-episode total rewards.

    Raises:
        ValueError: If ``n_eval_episodes`` is smaller than 1 (there would be
            no episodes to average over).
    """
    if n_eval_episodes < 1:
        raise ValueError("n_eval_episodes must be at least 1")

    episode_rewards = []

    for episode in range(n_eval_episodes):
        reset_output = env.reset()
        # reset() may hand back (observation, info) or just the observation.
        state = reset_output[0] if isinstance(reset_output, tuple) else reset_output
        done = False
        total_reward = 0

        while not done:
            if render:
                env.render()  # Render each frame

            # Get the action from the model (the recurrent state is not used)
            action, _ = model.predict(state, deterministic=True)

            # Adjust for environments returning 4 or 5 values
            step_output = env.step(action)
            if len(step_output) == 5:
                state, reward, terminated, truncated, _info = step_output
            else:
                # Legacy 4-tuple: the third item is the combined done flag and
                # the fourth is the info object. Reading the info object as a
                # truncation flag would end the episode after a single step
                # whenever it is non-empty, so it is discarded here.
                state, reward, terminated, _info = step_output
                truncated = False

            # Convert reward to scalar to avoid warnings
            total_reward += reward.item() if hasattr(reward, 'item') else float(reward)

            # Combine termination flags
            done = bool(terminated) or bool(truncated)

        episode_rewards.append(total_reward)
        print(f"Episode {episode + 1}: Total Reward = {total_reward}")

    # Calculate mean and (population) standard deviation of rewards
    mean_reward = sum(episode_rewards) / n_eval_episodes
    std_reward = (sum([(x - mean_reward) ** 2 for x in episode_rewards]) / n_eval_episodes) ** 0.5

    return mean_reward, std_reward

# Step 5: Evaluate the loaded model
# try/finally ensures the environment is closed even if an episode raises
# part-way through the evaluation.
try:
    mean_reward, std_reward = evaluate_policy_with_rendering(model, env, n_eval_episodes=10, render=True)
    print(f"Mean reward: {mean_reward}, Std reward: {std_reward}")
finally:
    # Close the environment
    env.close()

Wrapping the env with a `Monitor` wrapper
Wrapping the env in a DummyVecEnv.


2024-12-20 12:00:09.214 Python[5432:179914] +[IMKClient subclass]: chose IMKClient_Modern
2024-12-20 12:00:09.215 Python[5432:179914] +[IMKInputSession subclass]: chose IMKInputSession_Modern


Episode 1: Total Reward = 500.0
Episode 2: Total Reward = 500.0
Episode 3: Total Reward = 500.0
Episode 4: Total Reward = 500.0
Episode 5: Total Reward = 500.0
Episode 6: Total Reward = 500.0
Episode 7: Total Reward = 500.0
Episode 8: Total Reward = 500.0
Episode 9: Total Reward = 500.0
Episode 10: Total Reward = 500.0
Mean reward: 500.0, Std reward: 0.0
