# Import Required Libraries
Import the necessary libraries, including SB3 Contrib.

In [1]:
# Importing necessary libraries
import gymnasium
from stable_baselines3 import PPO
from stable_baselines3.common.envs import BitFlippingEnv
from stable_baselines3.common.vec_env import DummyVecEnv
from stable_baselines3.common.evaluation import evaluate_policy
from sb3_contrib import MaskablePPO
from knapsack_env import BoundedKnapsackEnv

# Define the BoundedKnapsack Environment
Define the BoundedKnapsack environment and initialize it.

In [5]:
# Define the BoundedKnapsack environment
env = BoundedKnapsackEnv()

# Enable Action Masking
Enable action masking in the environment using the get_mask() method.

In [7]:
# Define the MaskablePPO model
model = MaskablePPO("MlpPolicy", env, verbose=1)

# Enable action masking
env.action_mask = env.get_mask()

# Train the model
model.learn(total_timesteps=10000)

Using cuda device


AttributeError: 'DummyVecEnv' object has no attribute 'get_mask'

# Train a MaskablePPO Agent
Train a MaskablePPO agent from SB3 Contrib on the BoundedKnapsack environment.

In [None]:
# Evaluate the trained model
mean_reward, std_reward = evaluate_policy(model, env, n_eval_episodes=100)

# Print the mean reward
print(f"Mean reward: {mean_reward} +/- {std_reward}")

# Compare the results with the best agent from Part 1
# Assuming the best reward from Part 1 is stored in a variable `best_reward_part1`
if mean_reward > best_reward_part1:
    print("The MaskablePPO agent performed better than the best agent from Part 1.")
else:
    print("The best agent from Part 1 performed better than the MaskablePPO agent.")

# Experiment with Different Neural Network Architectures
Experiment with different neural network architectures for the MaskablePPO agent.

In [None]:
# Importing necessary libraries for neural network
from stable_baselines3.ppo.policies import MlpPolicy, CnnPolicy

# List of different neural network architectures to experiment with
policies = [MlpPolicy, CnnPolicy]

# Dictionary to store the mean rewards for each policy
rewards = {}

# Loop through each policy
for policy in policies:
    # Define the MaskablePPO model with the current policy
    model = MaskablePPO(policy, env, verbose=1)

    # Train the model
    model.learn(total_timesteps=10000)

    # Evaluate the trained model
    mean_reward, std_reward = evaluate_policy(model, env, n_eval_episodes=100)

    # Store the mean reward in the dictionary
    rewards[policy.__name__] = mean_reward

    # Print the mean reward for the current policy
    print(f"Mean reward for {policy.__name__}: {mean_reward} +/- {std_reward}")

# Print the policy with the highest mean reward
best_policy = max(rewards, key=rewards.get)
print(f"The best policy is {best_policy} with a mean reward of {rewards[best_policy]}")

# Tune the Algorithm Hyperparameters
Manually tune the algorithm hyperparameters to improve the agent's performance.

In [None]:
# Define the hyperparameters to tune
hyperparams = {
    'n_steps': [128, 256, 512],
    'gamma': [0.9, 0.99, 0.999],
    'learning_rate': [0.0001, 0.001, 0.01],
}

# Dictionary to store the mean rewards for each set of hyperparameters
rewards = {}

# Loop through each set of hyperparameters
for n_steps in hyperparams['n_steps']:
    for gamma in hyperparams['gamma']:
        for learning_rate in hyperparams['learning_rate']:
            # Define the MaskablePPO model with the current set of hyperparameters
            model = MaskablePPO(best_policy, env, verbose=1, n_steps=n_steps, gamma=gamma, learning_rate=learning_rate)

            # Train the model
            model.learn(total_timesteps=10000)

            # Evaluate the trained model
            mean_reward, std_reward = evaluate_policy(model, env, n_eval_episodes=100)

            # Store the mean reward in the dictionary
            rewards[(n_steps, gamma, learning_rate)] = mean_reward

            # Print the mean reward for the current set of hyperparameters
            print(f"Mean reward for n_steps={n_steps}, gamma={gamma}, learning_rate={learning_rate}: {mean_reward} +/- {std_reward}")

# Print the set of hyperparameters with the highest mean reward
best_hyperparams = max(rewards, key=rewards.get)
print(f"The best hyperparameters are n_steps={best_hyperparams[0]}, gamma={best_hyperparams[1]}, learning_rate={best_hyperparams[2]} with a mean reward of {rewards[best_hyperparams]}")

# Evaluate the Agent
Evaluate the trained agent's performance over 100 episodes and calculate the average reward.

In [None]:
# Evaluate the trained model with the best policy and hyperparameters
model = MaskablePPO(best_policy, env, verbose=1, n_steps=best_hyperparams[0], gamma=best_hyperparams[1], learning_rate=best_hyperparams[2])
model.learn(total_timesteps=10000)
mean_reward, std_reward = evaluate_policy(model, env, n_eval_episodes=100)

# Print the mean reward
print(f"Mean reward: {mean_reward} +/- {std_reward}")

# Compare the results with the best agent from Part 1
if mean_reward > best_reward_part1:
    print("The MaskablePPO agent performed better than the best agent from Part 1.")
else:
    print("The best agent from Part 1 performed better than the MaskablePPO agent.")

# Compare Results with Part 1
Compare the results of the MaskablePPO agent with the best agent from Part 1.

In [None]:
# Compare the results with the best agent from Part 1
best_reward_part2 = mean_reward

if best_reward_part2 > best_reward_part1:
    print("The MaskablePPO agent performed better than the best agent from Part 1.")
else:
    print("The best agent from Part 1 performed better than the MaskablePPO agent.")