# 1.Import Dependencies

In [None]:
import os
from cmath import pi

import gymnasium
from stable_baselines3 import PPO
from stable_baselines3.common.vec_env import DummyVecEnv
from stable_baselines3.common.evaluation import evaluate_policy

# 2. Load Environment

In [None]:
environment_name = 'CartPole-v1'
env = gymnasium.make(environment_name, render_mode="human")

In [None]:
# episodes = 5
# for episode in range(1, episodes + 1):
#     state = env.reset()
#     done = False
#     score = 0
# 
#     while not done:
#         action = env.action_space.sample()
#         n_state, reward, terminated, truncated, info = env.step(action)
#         done = terminated or truncated
#         score += reward
# 
#     print(f'Episode:{episode}, Score:{score}')
# env.close()

# 3.Understanding the Environment

In [None]:
env.action_space

In [None]:
env.action_space.sample()

In [None]:
env.observation_space.sample()

# 4. Train Model

In [None]:
log_path = os.path.join('Training', 'Logs')


In [None]:
import torch

# Check if MPS is available
if torch.backends.mps.is_available():
    device = torch.device("mps")
else:
    device = torch.device("cpu")
    print("MPS is not available, using CPU.")



In [None]:
env = gymnasium.make(environment_name)
env = DummyVecEnv([lambda: env])
model = PPO('MlpPolicy', env, verbose=1, tensorboard_log=log_path, device=device)

In [None]:
model.learn(total_timesteps=20000)

# 5. Save Model & Load Model

In [None]:
PPO_path = os.path.join('Training', 'Saved Models', 'PPO_model_CartPole')

In [None]:
model.save(PPO_path)

In [None]:
del model

In [None]:
model = PPO.load(PPO_path, env=env)

# 6. Evaluate

In [None]:
# Step 1: Recreate the environment
env = gymnasium.make("CartPole-v1", render_mode="human")

# Step 2: Define the path to the saved model
PPO_path = os.path.join("Training", "Saved Models", "PPO_model_CartPole")

# Step 3: Load the saved model
model = PPO.load(PPO_path, env=env)

# Step 4: Custom evaluation function
def evaluate_policy_with_rendering(model, env, n_eval_episodes=10, render=True):
    """
    Evaluate the policy of a loaded model with optional rendering.
    """
    episode_rewards = []

    for episode in range(n_eval_episodes):
        reset_output = env.reset()
        state = reset_output[0] if isinstance(reset_output, tuple) else reset_output
        done = False
        total_reward = 0

        while not done:
            if render:
                env.render()  # Render each frame

            # Get the action from the model
            action, _states = model.predict(state, deterministic=True)
            
            # Adjust for environments returning 4 or 5 values
            step_output = env.step(action)
            if len(step_output) == 5:
                state, reward, terminated, truncated, info = step_output
            else:
                state, reward, terminated, truncated = step_output

            # Convert reward to scalar to avoid warnings
            total_reward += reward.item() if hasattr(reward, 'item') else float(reward)

            # Combine termination flags
            done = terminated or truncated

        episode_rewards.append(total_reward)
        print(f"Episode {episode + 1}: Total Reward = {total_reward}")

    # Calculate mean and standard deviation of rewards
    mean_reward = sum(episode_rewards) / n_eval_episodes
    std_reward = (sum([(x - mean_reward) ** 2 for x in episode_rewards]) / n_eval_episodes) ** 0.5

    return mean_reward, std_reward

# Step 5: Evaluate the loaded model
mean_reward, std_reward = evaluate_policy_with_rendering(model, env, n_eval_episodes=10, render=True)
print(f"Mean reward: {mean_reward}, Std reward: {std_reward}")

# Close the environment
env.close()

# 7. Test Model

In [None]:
import os
import gymnasium as gym
from stable_baselines3 import PPO

# Create the environment
env = gym.make("CartPole-v1", render_mode="human")

# Define the path to the saved model
PPO_path = os.path.join("Training", "Saved Models", "PPO_model_CartPole")

# Load the saved model
model = PPO.load(PPO_path, env=env)

# Number of episodes
episodes = 10

for episode in range(1, episodes + 1):
    # Extract the observation from the reset output
    obs, _ = env.reset()
    done = False
    score = 0

    while not done:
        action, _states = model.predict(obs)
        obs, reward, terminated, truncated, info = env.step(action)
        done = terminated or truncated
        score += reward

    print(f'Episode: {episode}, Score: {score}')

# Close the environment
env.close()

# 8. View Logs in Tensorboard

In [None]:
import os
log_path = os.path.join('Training', 'Logs')
training_log_path = os.path.join(log_path, 'PPO_2')


In [None]:
!tensorboard --logdir={training_log_path}

# 9. Adding a callback to the training stage

In [None]:
import os
import gymnasium as gym
from stable_baselines3 import PPO
from stable_baselines3.common.callbacks import EvalCallback, StopTrainingOnRewardThreshold
from stable_baselines3.common.monitor import Monitor

# Create the environment
env = gym.make("CartPole-v1", render_mode="human")

# Wrap the training environment with Monitor for proper episode length and reward tracking
train_env = Monitor(env)

# The path to save logs and models
log_path = os.path.join("Training", "Logs")
best_model_save_path = os.path.join("Training", "Saved_Models")

# Create the evaluation environment and wrap it with Monitor
eval_env = gym.make("CartPole-v1")
eval_env = Monitor(eval_env)

# Create the callback
stop_callback = StopTrainingOnRewardThreshold(reward_threshold=200, verbose=1)
eval_callback = EvalCallback(eval_env,
                             callback_on_new_best=stop_callback,
                             eval_freq=10000,
                             best_model_save_path=best_model_save_path,
                             verbose=1)

# Create the PPO model
model = PPO('MlpPolicy', train_env, verbose=1, tensorboard_log=log_path)

# Train the model
model.learn(total_timesteps=20000, callback=eval_callback)

# Save the trained model
model_save_path = os.path.join("Training", "Saved_Models", "ppo_cartpole")
model.save(model_save_path)

print("Training complete! Model saved at:", model_save_path)

# Test the trained model
test_env = gym.make("CartPole-v1", render_mode="human")
obs = test_env.reset()
done = False

print("Testing the trained model...")
while not done:
    action, _ = model.predict(obs)
    obs, reward, done, _ = test_env.step(action)
    if done:
        print("Episode finished!")
        break

# Close the environment
test_env.close()

# 10. Change policy

In [1]:
import os
import gymnasium as gym
from stable_baselines3 import PPO
from stable_baselines3.common.callbacks import EvalCallback, StopTrainingOnRewardThreshold
from stable_baselines3.common.monitor import Monitor
import torch

# Check if MPS is available
device = torch.device("mps" if torch.has_mps else "cpu")

# Create the environment
env = gym.make("CartPole-v1", render_mode="human")

# Wrap the training environment with Monitor for proper episode length and reward tracking
train_env = Monitor(env)

# The path to save logs and models
log_path = os.path.join("Training", "Logs")
best_model_save_path = os.path.join("Training", "Saved_Models")

# Create the evaluation environment and wrap it with Monitor
eval_env = gym.make("CartPole-v1")
eval_env = Monitor(eval_env)

# Create the callback
stop_callback = StopTrainingOnRewardThreshold(reward_threshold=200, verbose=1)
eval_callback = EvalCallback(eval_env,
                             callback_on_new_best=stop_callback,
                             eval_freq=10000,
                             best_model_save_path=best_model_save_path,
                             verbose=1)

# Define the new policy architecture
new_arch = dict(pi=[128, 128, 128, 128], vf=[128, 128, 128, 128])

# Create the PPO model with the new policy architecture
model = PPO('MlpPolicy', train_env, policy_kwargs={"net_arch": new_arch}, verbose=1, tensorboard_log=log_path, device=device)

# Train the model
model.learn(total_timesteps=20000, callback=eval_callback)

# Save the trained model
model_save_path = os.path.join("Training", "Saved_Models", "ppo_cartpole")
model.save(model_save_path)

print("Training complete! Model saved at:", model_save_path)

# Test the trained model
test_env = gym.make("CartPole-v1", render_mode="human")
obs = test_env.reset()
done = False

print("Testing the trained model...")
while not done:
    action, _ = model.predict(obs)
    obs, reward, done, _ = test_env.step(action)
    if done:
        print("Episode finished!")
        break

# Close the environment
test_env.close()

  device = torch.device("mps" if torch.has_mps else "cpu")


Using mps device
Wrapping the env in a DummyVecEnv.




Logging to Training/Logs/PPO_7


2024-12-20 13:16:46.772 Python[6409:263760] +[IMKClient subclass]: chose IMKClient_Modern
2024-12-20 13:16:46.772 Python[6409:263760] +[IMKInputSession subclass]: chose IMKInputSession_Modern


---------------------------------
| rollout/           |          |
|    ep_len_mean     | 21.5     |
|    ep_rew_mean     | 21.5     |
| time/              |          |
|    fps             | 44       |
|    iterations      | 1        |
|    time_elapsed    | 45       |
|    total_timesteps | 2048     |
---------------------------------
-----------------------------------------
| rollout/                |             |
|    ep_len_mean          | 28          |
|    ep_rew_mean          | 28          |
| time/                   |             |
|    fps                  | 43          |
|    iterations           | 2           |
|    time_elapsed         | 93          |
|    total_timesteps      | 4096        |
| train/                  |             |
|    approx_kl            | 0.015097221 |
|    clip_fraction        | 0.236       |
|    clip_range           | 0.2         |
|    entropy_loss         | -0.681      |
|    explained_variance   | 0.00021     |
|    learning_rate        | 0.

ValueError: You have passed a tuple to the predict() function instead of a Numpy array or a Dict. You are probably mixing Gym API with SB3 VecEnv API: `obs, info = env.reset()` (Gym) vs `obs = vec_env.reset()` (SB3 VecEnv). See related issue https://github.com/DLR-RM/stable-baselines3/issues/1694 and documentation for more information: https://stable-baselines3.readthedocs.io/en/master/guide/vec_envs.html#vecenv-api-vs-gym-api

# 11. Utilising an Alternate Algorithim

In [None]:
import os
import gymnasium as gym
from stable_baselines3 import DQN
from stable_baselines3.common.monitor import Monitor
from stable_baselines3.common.vec_env import DummyVecEnv, VecNormalize

# Create the environment
env = gym.make("CartPole-v1")
env = Monitor(env)
env = DummyVecEnv([lambda: env])
env = VecNormalize(env, norm_reward=True)

# Log and model paths
log_path = os.path.join("Training", "Logs")
model_save_path = os.path.join("Training", "Saved_Models", "dqn_cartpole")

# Create the DQN model
model = DQN(
    'MlpPolicy',
    env,
    verbose=1,
    tensorboard_log=log_path,
    exploration_fraction=0.3,  # Slower exploration decay
    exploration_final_eps=0.05,  # Ensure some exploration remains
    buffer_size=100000,  # Larger replay buffer
)

# Train the model
model.learn(total_timesteps=100000)

# Save the trained model
model.save(model_save_path)
print("Training complete! Model saved at:", model_save_path)

# Test the trained model
test_env = gym.make("CartPole-v1", render_mode="human")
obs, _ = test_env.reset()  # Correctly handle the tuple returned by reset()
done = False

print("Testing the trained model...")
while not done:
    action, _ = model.predict(obs)  # Only pass `obs` to predict()
    obs, reward, done, truncated, info = test_env.step(action)  # Updated to handle 5 values
    done = done or truncated  # Check if the episode is over (done or truncated)
    if done:
        print("Episode finished!")
        break

# Close the environment
test_env.close()

Using cpu device
Logging to Training/Logs/DQN_11
----------------------------------
| rollout/            |          |
|    ep_len_mean      | 21.8     |
|    ep_rew_mean      | 21.8     |
|    exploration_rate | 0.997    |
| time/               |          |
|    episodes         | 4        |
|    fps              | 8596     |
|    time_elapsed     | 0        |
|    total_timesteps  | 87       |
----------------------------------
----------------------------------
| rollout/            |          |
|    ep_len_mean      | 22.2     |
|    ep_rew_mean      | 22.2     |
|    exploration_rate | 0.994    |
| time/               |          |
|    episodes         | 8        |
|    fps              | 4442     |
|    time_elapsed     | 0        |
|    total_timesteps  | 178      |
| train/              |          |
|    learning_rate    | 0.0001   |
|    loss             | 0.0154   |
|    n_updates        | 19       |
----------------------------------
----------------------------------
| roll