In [1]:
import os
import gym
from environments.energy_management_env import EnergyManagementEnv
from environments.env_registration import register_env
from rl_monitoring_utils.vectorized_env_wrapper import VectorizedEnvWrapper
from policies.categorical_policy import CategoricalPolicy
from learning_utils.value_estimator import ValueEstimator
from agents.a2c import A2C
from agents.ppo import PPO
import numpy as np
import matplotlib.pyplot as plt

# Environment settings, registered under the gym id 'EnergyManagement-v0'.
# NOTE(review): the SOC_* / initial_SOC values look like battery state-of-charge
# fractions and 'E' like a capacity -- confirm against EnergyManagementEnv.
env_params = dict(
    SOC_min=0.2,
    SOC_max=0.8,
    E=1000,
    lambda_val=0.1,
    data_path='data/Data_input.csv',  # relative path: resolved against the current working directory
    initial_SOC=0.5,
)

# The registration kwargs carry the environment class plus every setting above.
register_env(
    'EnergyManagement-v0',
    'environments.env_registration:environment_creator',
    {'environment_class': EnergyManagementEnv, **env_params},
)

# Function to run experiments
def run_experiment(env, policy_class, agent_class, hidden_sizes, epochs, gamma, T, num_runs,
                   *, policy_lr=1e-2, value_lr=1e-2, value_estimator_class=None):
    """Train an agent from scratch ``num_runs`` times and collect its reward histories.

    Every run builds a fresh policy and a fresh value estimator, so the runs
    share nothing except ``env`` itself.

    Args:
        env: Environment handed to the policy, the value estimator and the agent.
        policy_class: Callable invoked as ``policy_class(env, lr=..., hidden_sizes=...)``.
        agent_class: Training entry point invoked as
            ``agent_class(env, policy, value_estimator, epochs=..., gamma=..., T=...)``;
            it must return a 2-tuple whose second element is the reward history.
        hidden_sizes: Forwarded unchanged to ``policy_class``.
        epochs: Forwarded unchanged to ``agent_class``.
        gamma: Forwarded unchanged to ``agent_class``.
        T: Forwarded unchanged to ``agent_class``.
        num_runs: Number of independent repetitions.
        policy_lr: Learning rate passed to ``policy_class`` (keyword-only).
        value_lr: Learning rate passed to the value estimator (keyword-only).
        value_estimator_class: Callable invoked as ``value_estimator_class(env, lr=...)``.
            ``None`` (the default) selects ``ValueEstimator``.

    Returns:
        A list with one entry per run, each being the reward history returned
        by ``agent_class`` for that run. Empty when ``num_runs`` is 0.
    """
    # Resolve the default lazily so callers can inject an alternative estimator.
    if value_estimator_class is None:
        value_estimator_class = ValueEstimator

    totals = []
    for _ in range(num_runs):
        policy = policy_class(env, lr=policy_lr, hidden_sizes=hidden_sizes)
        value_estimator = value_estimator_class(env, lr=value_lr)
        # Only the reward history is kept; the first returned element is discarded.
        _, total_rewards = agent_class(env, policy, value_estimator, epochs=epochs, gamma=gamma, T=T)
        totals.append(total_rewards)
    return totals

# Set experiment configurations
num_envs_list = [1, 2, 4, 8, 16]  # values passed as num_envs to VectorizedEnvWrapper
hidden_sizes = [16]  # fixed hidden size
num_runs = 5  # independent repetitions per (agent, num_envs) pair
epochs = 2000
gamma = 1  # forwarded to the agents as `gamma` (presumably the discount factor -- confirm)
T = 720  # forwarded to the agents as `T` (presumably the rollout horizon -- confirm)

# Maps "<agent name>_<num_envs>" to the list of per-run reward histories.
results = {}

# Run experiments for both A2C and PPO with varying num_envs
for num_envs in num_envs_list:
    # One wrapped environment per num_envs value, shared by both agents and all runs.
    energy_management = VectorizedEnvWrapper(gym.make("EnergyManagement-v0"), num_envs=num_envs)
    for agent_class, agent_name in [(A2C, 'A2C'), (PPO, 'PPO')]:
        label = f'{agent_name}_{num_envs}'
        print(f"Running {label}")
        results[label] = run_experiment(energy_management, CategoricalPolicy, agent_class, hidden_sizes, epochs, gamma, T, num_runs)
        # The training progress line appears to end with a carriage return and
        # no newline, so without this the next "Running ..." label is written
        # on top of it (e.g. "Running PPO_1.7599..."). Terminate it explicitly.
        print()

# Plot results: one mean curve per configuration, shaded by +/- one standard
# deviation taken across that configuration's runs.
fig, ax = plt.subplots()
for label, data in results.items():
    runs = np.asarray(data)  # the runs are stacked along axis 0
    means = runs.mean(axis=0)
    stddev = runs.std(axis=0)
    epochs_range = range(len(means))
    lower_band = means - stddev
    upper_band = means + stddev
    ax.plot(epochs_range, means, label=label)
    ax.fill_between(epochs_range, lower_band, upper_band, alpha=0.1)
ax.set(
    title='Performance with Different Number of Environments',
    xlabel='Epoch',
    ylabel='Total Reward',
)
ax.legend()
plt.show()


Running A2C_1
Running PPO_1.759916063888785
Running A2C_2.762324863968786
Running PPO_2.374028863728785
Running A2C_4.395872863848787
Running PPO_4.376131463568786
Running A2C_8.403081264548785
Running PPO_8.378470364018785
Running A2C_16432223563428785
Running PPO_16374534963563786
1165/2000:-12.384437664548784