In [7]:
from stable_baselines3 import DQN 
import numpy as np
import sys
from collections import OrderedDict
sys.path.insert(0,'boptestGymService')
from boptestGymEnv import BoptestGymEnv
from boptestGymEnv import BoptestGymEnvRewardWeightCost, NormalizedActionWrapper, NormalizedObservationWrapper, SaveAndTestCallback,DiscretizedActionWrapper
from stable_baselines3 import SAC
from stable_baselines3.common.monitor import Monitor
from stable_baselines3.common.logger import configure
from testing import utilities
import random
import os
from stable_baselines3 import SAC,PPO
from stable_baselines3.common.callbacks import CheckpointCallback
from stable_baselines3.common.logger import configure
from stable_baselines3.common.monitor import Monitor
from stable_baselines3.common.env_util import make_vec_env
from wandb.integration.sb3 import WandbCallback
import torch
import wandb
import requests
# Base URL of the BOPTEST service used by the environments defined below.
# The default points at a server listening on the local loopback interface.
url = 'http://127.0.0.1:5000'
# Alternative endpoint; uncomment to target a remote BOPTEST server instead.
# url="https://api.boptest.net" 


In [8]:
import numpy as np
import requests

class BoptestGymEnvCustomReward(BoptestGymEnv):
    """BOPTEST Gym environment with a reward based on a weighted sum of KPIs.

    The reward of a step is the negative increment of the objective computed
    by :meth:`calculate_objective` since the previous call to
    :meth:`get_reward`, so a growing objective is penalised.

    Attributes
    ----------
    kpi_request_timeout : float
        Seconds to wait for the KPI endpoint before giving up on the request.
    verbose_reward : bool
        When True, print the previous objective, the current objective and
        the resulting reward on every call to :meth:`get_reward`.
    """

    # Without a timeout a stalled server would block training forever.
    kpi_request_timeout = 60
    # Per-step diagnostics are opt-in so they do not flood the notebook output.
    verbose_reward = False

    def calculate_objective(self, kpis):
        """
        Calculate the objective based on the given KPI values.

        Parameters
        ----------
        kpis : dict or None
            Mapping from KPI name to value. Missing keys and ``None`` values
            are treated as 0; ``None`` for the whole mapping yields 0.

        Returns
        -------
        float
            ``cost_tot + 4.25 * (pdih_tot + pele_tot) + 0.005 * tdis_tot
            + 0.0001 * idis_tot``.
        """
        kpis = kpis or {}
        cost_tot = kpis.get('cost_tot', 0) or 0
        pdih_tot = kpis.get('pdih_tot', 0) or 0
        pele_tot = kpis.get('pele_tot', 0) or 0
        tdis_tot = kpis.get('tdis_tot', 0) or 0
        idis_tot = kpis.get('idis_tot', 0) or 0

        objective = (
            cost_tot +
            4.25 * (pdih_tot + pele_tot) +
            0.005 * tdis_tot +
            0.0001 * idis_tot
        )

        return objective

    def get_reward(self):
        """
        Fetch the current KPIs and return the reward for the last step.

        Returns
        -------
        float
            ``-(current_objective - self.objective_integrand)``. If the KPIs
            cannot be retrieved or are malformed, 0 is returned and
            ``self.objective_integrand`` is left untouched so the next
            successful call is not distorted.

        Notes
        -----
        Assumes ``self.objective_integrand`` has been initialised elsewhere
        (presumably by the base class) before the first call -- TODO confirm.
        """
        try:
            # Route used by a locally running server. Against the hosted
            # service the route is '{url}/kpi/{testid}' instead.
            response = requests.get(f'{self.url}/kpi',
                                    timeout=self.kpi_request_timeout)
            # Surface HTTP errors here instead of failing later on a body
            # that carries no usable payload.
            response.raise_for_status()
            kpis = response.json()['payload']
        except (requests.exceptions.RequestException,
                ValueError, KeyError, TypeError) as e:
            print(f"Error fetching KPIs: {e}")
            return 0  # In case of error, return zero reward

        if not isinstance(kpis, dict):
            # An empty payload would otherwise be read as "objective == 0"
            # and produce a spurious positive reward.
            print(f"Error fetching KPIs: unexpected payload {kpis!r}")
            return 0

        current_objective = self.calculate_objective(kpis)

        # The reward is the negative change of the objective since last step.
        reward = -(current_objective - self.objective_integrand)
        if self.verbose_reward:
            print("prev", self.objective_integrand)
            print("curr", current_objective)
            print("reward", reward)
        self.objective_integrand = current_objective

        return reward

# With Weights & Biases

In [9]:
import os
import torch
from stable_baselines3 import PPO
from stable_baselines3.common.callbacks import CheckpointCallback
from stable_baselines3.common.monitor import Monitor
from stable_baselines3.common.logger import configure


def train_PPO_with_callback(model_path=None,
                            log_dir=os.path.join('results', 'PPO_AD1', 'Model6'),
                            tensorboard_log=os.path.join('results', 'PPO_AD1', 'Model6')):
    """
    Train a PPO agent on the custom-reward BOPTEST environment, logging to
    Weights & Biases and saving the model periodically through its callback.

    Parameters
    ----------
    model_path : str, optional
        Path to a pre-trained model. If it points to an existing file, the
        model is loaded and trained further; otherwise a new model is created.
    log_dir : str
        Directory for the monitor file, the SB3 logger output and the W&B
        run files. It is created if it does not exist.
    tensorboard_log : str
        Directory passed to the environment as ``log_dir`` and to PPO as
        ``tensorboard_log``.

    Returns
    -------
    tuple
        ``(env, model)``: the ``Monitor``-wrapped environment and the trained
        PPO model.

    Raises
    ------
    ValueError
        If the checkpoint at ``model_path`` cannot be loaded with the current
        environment, e.g. because observation or action spaces differ.
    """
    if torch.cuda.is_available():
        device = torch.device("cuda")
        torch.cuda.empty_cache()  # Clear GPU cache
        print("CUDA is available. Using GPU.")
    else:
        device = torch.device("cpu")
        print("CUDA is not available. Using CPU.")

    n_action_bins = 20
    total_timesteps = 1000000

    excluding_periods = [(173*24*3600, 266*24*3600)]  # Summer period

    # Single source of truth for the environment settings: the same dict
    # builds the environment and is logged to W&B, so the two cannot drift.
    env_config = {
        'url': url,
        'actions': ['ahu_oveFanSup_u', 'oveValCoi_u', 'oveValRad_u'],
        'observations': {
            'time': (0, 31536000),
            'reaTZon_y': (200., 400.),
            'reaCO2Zon_y': (200., 2000.),
            'weaSta_reaWeaTDryBul_y': (250., 350.),
            'PriceElectricPowerHighlyDynamic': (-0.4, 0.4),
            'LowerSetp[1]': (280., 310.),
            'UpperSetp[1]': (280., 310.),
            'UpperCO2[1]': (0, 10000),
        },
        'predictive_period': 5*3600,
        'scenario': {'electricity_price': 'highly_dynamic'},
        'random_start_time': True,
        'max_episode_length': 5*24*3600,
        'step_period': 3600,
        'log_dir': tensorboard_log,
        'excluding_periods': excluding_periods,
    }

    # Hyper-parameters for a freshly created PPO model. They are also logged
    # to W&B; a model loaded from a checkpoint keeps the values stored in it.
    ppo_kwargs = {
        'verbose': 1,
        'gamma': 0.99,
        'learning_rate': 3e-4,
        'n_steps': 512,
        'batch_size': 64,
        'n_epochs': 10,
        'clip_range': 0.2,
        'gae_lambda': 0.95,
        'ent_coef': 0.01,
    }

    env = BoptestGymEnvCustomReward(**env_config)
    env = DiscretizedActionWrapper(env, n_bins_act=n_action_bins)
    os.makedirs(log_dir, exist_ok=True)

    # Fixed id together with resume="allow": repeated executions continue the
    # same W&B run instead of creating a new one.
    run = wandb.init(
        project="ppo-training",
        config={
            'env': env_config,
            **ppo_kwargs,
            'device': str(device),  # log a plain string, not a torch.device
            'action_bins': n_action_bins,
        },
        dir=log_dir,
        id="ggm8beyo",
        name="discrete_action_20",
        resume="allow",
        sync_tensorboard=True,
    )
    print(run.id)

    # Everything after wandb.init runs under try/finally so the run is closed
    # even if loading the checkpoint or training raises.
    try:
        env = Monitor(env=env, filename=os.path.join(log_dir, 'monitor.csv'))

        # Set up logger with TensorBoard logging continuation
        new_logger = configure(log_dir, ['stdout', 'csv', 'tensorboard'])

        # Load existing model if model_path is given, else create a new one
        resuming = bool(model_path) and os.path.isfile(model_path)
        if resuming:
            try:
                model = PPO.load(model_path, env=env,
                                 tensorboard_log=tensorboard_log, device=device)
            except ValueError as err:
                raise ValueError(
                    f"Could not load '{model_path}' with the current environment: {err}\n"
                    "If the observation or action spaces do not match, the checkpoint "
                    "was trained with different environment settings. Check "
                    "'observations', 'predictive_period', 'step_period' and the number "
                    "of action bins, or start from scratch with model_path=None."
                ) from err
            print(f"Loaded pre-trained model from {model_path}")
            model.set_logger(new_logger)  # Reconfigure the logger to continue logging
        else:
            model = PPO(
                'MlpPolicy',
                env,
                tensorboard_log=tensorboard_log,
                device=device,
                **ppo_kwargs,
            )
            model.set_logger(new_logger)
            print("Starting training from scratch.")

        # Verify that the model is on the correct device
        print(f"Model is on device: {next(model.policy.parameters()).device}")

        # Train the agent; the W&B callback saves the model periodically.
        # When resuming, keep the timestep counter of the checkpoint so the
        # logged steps continue instead of restarting at zero.
        model.learn(
            total_timesteps=total_timesteps,
            callback=WandbCallback(verbose=2,
                                   model_save_freq=1000,
                                   model_save_path=f"Model6/ppo_{run.id}",
                                   gradient_save_freq=100),
            reset_num_timesteps=not resuming,
        )
    finally:
        # Finish W&B logging
        run.finish()
    return env, model

if __name__ == "__main__":
    # Checkpoint to resume from; if the file does not exist, training starts
    # from scratch. Update this with the correct path if needed.
    model_path = "Model6/ppo_ggm8beyo/model.zip"
    # Passed explicitly so the message printed below reports the directory
    # that is actually used for logging.
    log_dir = os.path.join('results', 'PPO_AD1', 'Model6')
    env, model = train_PPO_with_callback(model_path=model_path,
                                         log_dir=log_dir,
                                         tensorboard_log=log_dir)
    model.save(os.path.join('results', 'PPO', 'final_model_ppo'))
    print("Training completed. Model saved in results/PPO/")
    print(f"TensorBoard logs saved in {log_dir}")

CUDA is available. Using GPU.


0,1
global_step,1536.0
rollout/ep_len_mean,120.0
rollout/ep_rew_mean,-2.13094
time/fps,2.0
train/approx_kl,0.06884
train/clip_fraction,0.73848
train/clip_range,0.2
train/entropy_loss,-9.12562
train/explained_variance,0.0
train/learning_rate,0.0003


ggm8beyo
Logging to results/PPO_AD1/Model6
Wrapping the env in a DummyVecEnv.


ValueError: Observation spaces do not match: Box([  0.  200.  200.  250.   -0.4  -0.4  -0.4  -0.4  -0.4  -0.4 280.  280.
 280.  280.  280.  280.  280.  280.  280.  280.  280.  280.    0.    0.
   0.    0.    0.    0. ], [3.1536e+07 4.0000e+02 2.0000e+03 3.5000e+02 4.0000e-01 4.0000e-01
 4.0000e-01 4.0000e-01 4.0000e-01 4.0000e-01 3.1000e+02 3.1000e+02
 3.1000e+02 3.1000e+02 3.1000e+02 3.1000e+02 3.1000e+02 3.1000e+02
 3.1000e+02 3.1000e+02 3.1000e+02 3.1000e+02 1.0000e+04 1.0000e+04
 1.0000e+04 1.0000e+04 1.0000e+04 1.0000e+04], (28,), float32) != Box([  0.  200.  200.  250.   -0.4  -0.4  -0.4  -0.4  -0.4  -0.4  -0.4  -0.4
  -0.4  -0.4  -0.4  -0.4  -0.4  -0.4  -0.4  -0.4  -0.4  -0.4  -0.4  -0.4
  -0.4 280.  280.  280.  280.  280.  280.  280.  280.  280.  280.  280.
 280.  280.  280.  280.  280.  280.  280.  280.  280.  280.  280.  280.
 280.  280.  280.  280.  280.  280.  280.  280.  280.  280.  280.  280.
 280.  280.  280.  280.  280.  280.  280.    0.    0.    0.    0.    0.
   0.    0.    0.    0.    0.    0.    0.    0.    0.    0.    0.    0.
   0.    0.    0.    0. ], [3.1536e+07 4.0000e+02 2.0000e+03 3.5000e+02 4.0000e-01 4.0000e-01
 4.0000e-01 4.0000e-01 4.0000e-01 4.0000e-01 4.0000e-01 4.0000e-01
 4.0000e-01 4.0000e-01 4.0000e-01 4.0000e-01 4.0000e-01 4.0000e-01
 4.0000e-01 4.0000e-01 4.0000e-01 4.0000e-01 4.0000e-01 4.0000e-01
 4.0000e-01 3.1000e+02 3.1000e+02 3.1000e+02 3.1000e+02 3.1000e+02
 3.1000e+02 3.1000e+02 3.1000e+02 3.1000e+02 3.1000e+02 3.1000e+02
 3.1000e+02 3.1000e+02 3.1000e+02 3.1000e+02 3.1000e+02 3.1000e+02
 3.1000e+02 3.1000e+02 3.1000e+02 3.1000e+02 3.1000e+02 3.1000e+02
 3.1000e+02 3.1000e+02 3.1000e+02 3.1000e+02 3.1000e+02 3.1000e+02
 3.1000e+02 3.1000e+02 3.1000e+02 3.1000e+02 3.1000e+02 3.1000e+02
 3.1000e+02 3.1000e+02 3.1000e+02 3.1000e+02 3.1000e+02 3.1000e+02
 3.1000e+02 1.0000e+04 1.0000e+04 1.0000e+04 1.0000e+04 1.0000e+04
 1.0000e+04 1.0000e+04 1.0000e+04 1.0000e+04 1.0000e+04 1.0000e+04
 1.0000e+04 1.0000e+04 1.0000e+04 1.0000e+04 1.0000e+04 1.0000e+04
 1.0000e+04 1.0000e+04 1.0000e+04 1.0000e+04], (88,), float32)