# Import dependencies

In [35]:
import gymnasium as gym
from gymnasium import Env
from gymnasium.spaces import Discrete, Box

import numpy as np
import random
import os

from stable_baselines3 import DQN,A2C,PPO,SAC
from stable_baselines3.common.vec_env import DummyVecEnv
from stable_baselines3.common.evaluation import evaluate_policy

# Objectives:

The objective of the project is to design, train and evaluate a framework for an agent in a low-energy environment. It seeks to efficiently manage the heat and air in the room to reduce energy consumption and improve occupant comfort. 

Leveraging reinforcement learning, the goal is to train agents to dynamically adjust controls based on factors such as occupancy, solar radiation, energy availability, and so on. 

Ultimately, the project aims to help achieve optimal energy without compromising comfort, as likewise sustainable building design.

# The Environment

This environment is designed to be able to simulate the house of a consumer/user and train an agent specific to all the rooms in the house to utilize energy as efficiently as possible.

#### The Version with the Discrete action space

In [36]:
import numpy as np
import random
import gym
import gym.spaces as spaces

class EnergySavingEnv_Disc(gym.Env):
    def __init__(self, num_bedrooms=3, num_people_per_bedroom=[1,2,1], num_people_in_living_room=2):
        self.num_bedrooms = num_bedrooms
        self.num_people_per_bedroom = num_people_per_bedroom if num_people_per_bedroom is not None else [2] * num_bedrooms
        self.num_people_in_living_room = num_people_in_living_room

        self.action_space = spaces.Discrete(28)  # Total number of actions: 7 temperature adjustments * 4 airflow adjustments
        
        num_rooms = num_bedrooms + 1  
        self.observation_space = spaces.Box(low=-1, high=1, shape=(num_rooms+1,1), dtype=np.float32)

        # Define the initial temperature (random between 24 and 26 degrees Celsius) for each room
        self.initial_temperatures = np.random.uniform(24, 26, size=(num_rooms,))

        # Set the current temperature to the initial temperature
        self.temperatures = self.initial_temperatures.copy()

        self.energy_level = np.array([120])

        self.ideal_temperatures = [22] * num_rooms  # Assuming ideal temperature of 22 for all rooms

        # Define the resource manager properties
        self.energy_capacity = 250
        self.energy_threshold = 50  
        self.energy_usage_factor = 2  
        self.base_airflow = 0  
        self.max_airflow = 3  
        self.airflow_energy_consumption = 0.5  

        self.time_steps = 100
        self.current_step = 0

        self.cleaning_energy_consumption = 1 

        self.cleaning_frequency = 1

        # Solar energy parameters
        self.solar_panel_efficiency = 0.2  
        self.solar_panel_area = 100  # Area covered by solar panels in square meters

        self.base_electricity_cost = 1  # Base cost per unit of electricity

        self.solar_radiation_variation = 100  # Maximum variation in solar radiation
        self.solar_radiation_mean = 700  # Mean solar radiation during the day
        self.randomness_factor = 0.1  # Factor to control randomness

        self.max_people_per_room = 3
        self.num_people_in_rooms = self.num_people_per_bedroom.copy() + [0] * (num_bedrooms + 1)  
        self.num_people_in_rooms[-1] = num_people_in_living_room  

        self.add_remove_frequency = 25

        # Exploration parameters
        self.epsilon = 0.1 # Exploration rate
        self.min_epsilon = 0.01  # Minimum exploration rate
        self.epsilon_decay = 0.99  # Exploration decay rate

    def get_solar_radiation(self, time_of_day):

        # Defining the solar radiation ranges for different times of the day
        if time_of_day < 6:  # Night
            solar_radiation = np.random.uniform(0, 100)
        elif time_of_day < 9:  # Early morning
            solar_radiation = np.random.uniform(500, 700)
        elif time_of_day < 15:  # Afternoon
            solar_radiation = np.random.uniform(700, 1000)
        elif time_of_day < 19:  # Evening
            solar_radiation = np.random.uniform(500, 700)
        else:  # Night
            solar_radiation = 0

        # Introducing randomness with occasional complete randomness
        if random.random() < self.randomness_factor:
            solar_radiation = np.random.uniform(self.solar_radiation_mean - self.solar_radiation_variation,
                                                 self.solar_radiation_mean + self.solar_radiation_variation)

        return solar_radiation

    def step(self, action):
        temperature_action = action // 4 + 1  # Temperature adjustment
        airflow_action = action % 4  # Airflow adjustment

        temp_changes = np.zeros_like(self.temperatures)
        temp_change = 0
        if temperature_action == 1:  # Decrease temperature by 1 degree
            temp_change = -1
        elif temperature_action == 2:  # Keep temperature (no change)
            temp_change = 0
        elif temperature_action == 3:  # Increase temperature by 1 degree
            temp_change = 1
        elif temperature_action == 4:  # Decrease temperature by 2 degrees
            temp_change = -2
        elif temperature_action == 5:  # Increase temperature by 2 degrees
            temp_change = 2
        elif temperature_action == 6:  # Decrease temperature by 3 degrees
            temp_change = -3
        elif temperature_action == 7:  # Increase temperature by 3 degrees
            temp_change = 3

        # Apply the temperature change to all rooms
        temp_changes[:self.num_bedrooms] = temp_change

        # Clip temperature within the valid range
        self.temperatures += temp_changes
        self.temperatures = np.clip(self.temperatures, 16, 33)

        # Calculate energy produced by solar panels
        solar_radiation = self.get_solar_radiation(self.current_step / self.time_steps * 24)
        solar_energy = self.solar_panel_area * self.solar_panel_efficiency * solar_radiation

        # Calculate energy usage for temperature adjustment
        energy_usage = np.abs(temp_changes) * self.energy_usage_factor

        # Adjust airflow based on the action and number of people in the room
        airflow = self.base_airflow + airflow_action * (self.max_airflow / (self.action_space.n // 4 - 1))
        airflow_energy_usage = airflow * self.airflow_energy_consumption  # Initialize airflow energy usage
        
        # Adjust airflow energy consumption based on the number of people in each room
        for room_idx, num_people in enumerate(self.num_people_in_rooms):
            if num_people > 0:
                airflow_energy_usage += self.airflow_energy_consumption * num_people

        self.energy_level = min(self.energy_level + solar_energy, self.energy_capacity)
        self.energy_level -= (np.sum(energy_usage) + airflow_energy_usage)
        self.energy_level = np.clip(self.energy_level, 0, self.energy_capacity)

        eps = 1e-6 
        temperature_rewards = -np.abs(self.temperatures - self.ideal_temperatures) * np.log(np.abs(self.temperatures - self.ideal_temperatures + eps))
        temperature_rewards_sum = 0.25*np.sum(temperature_rewards)

        energy_reward = 1 if self.energy_level >= self.energy_threshold else -1
        rewards = temperature_rewards_sum + energy_reward
        done = self.current_step >= self.time_steps
        self.current_step += 1

        if self.current_step % self.add_remove_frequency == 0:
            for i in range(self.num_bedrooms):
                if self.num_people_in_rooms[i] < self.max_people_per_room:
                    self.num_people_in_rooms[i] += 1
                    break

        self.epsilon = max(self.min_epsilon, self.epsilon * self.epsilon_decay)

        if random.random() < self.epsilon:
            action = self.action_space.sample()

        info = {}

        normalized_observation = np.hstack(((self.temperatures - 16) / 17, (self.energy_level - 0) / self.energy_capacity))
        normalized_observation = normalized_observation.reshape((-1, 1))

        return normalized_observation, rewards, done, info

    def reset(self):
        self.current_step = 0
        
        if self.current_step % self.cleaning_frequency == 0:
            self.energy_level -= self.cleaning_energy_consumption
            self.energy_level = max(0, min(self.energy_capacity, self.energy_level))

        normalized_observation = np.hstack(((self.temperatures - 16) / 17, (self.energy_level - 0) / self.energy_capacity))
        normalized_observation = normalized_observation.reshape((-1, 1))
        return normalized_observation

#### The Version with the continuos action space

In [37]:
class EnergySavingEnv_Cont(gym.Env):
    def __init__(self, num_bedrooms=3, num_people_per_bedroom=[1,2,1], num_people_in_living_room=2):
        self.num_bedrooms = num_bedrooms
        self.num_people_per_bedroom = num_people_per_bedroom if num_people_per_bedroom is not None else [2] * num_bedrooms
        self.num_people_in_living_room = num_people_in_living_room

        num_rooms = num_bedrooms + 1  
        self.observation_space = spaces.Box(low=-1, high=1, shape=(num_rooms+1,1), dtype=np.float32)

        # Define the initial temperature (random between 24 and 26 degrees Celsius) for each room
        self.initial_temperatures = np.random.uniform(24, 26, size=(num_rooms,))

        # Set the current temperature to the initial temperature
        self.temperatures = self.initial_temperatures.copy()

        # Initialize energy level for the whole house
        self.energy_level = np.array([120])

        # Define the ideal temperature for each room
        self.ideal_temperatures = [22] * num_rooms  # Assuming ideal temperature of 22 for all rooms

        # Define the resource manager properties
        self.energy_capacity = 250
        self.energy_threshold = 50  
        self.energy_usage_factor = 2 
        self.base_airflow = 0  
        self.max_airflow = 3  
        self.airflow_energy_consumption = 0.5 

        self.time_steps = 100

        self.current_step = 0

        self.cleaning_energy_consumption = 1

        # The frequency of home cleaning (once a day/episode)
        self.cleaning_frequency = 1

        # The solar energy parameters
        self.solar_panel_efficiency = 0.2  # Efficiency of solar panels
        self.solar_panel_area = 100  # Area covered by solar panels in square meters

        self.base_electricity_cost = 1  # Base cost per unit of electricity

        self.solar_radiation_variation = 100  # Maximum variation in solar radiation
        self.solar_radiation_mean = 700  # Mean solar radiation during the day
        self.randomness_factor = 0.1  # Factor to control randomness

        self.max_people_per_room = 3
        self.num_people_in_rooms = self.num_people_per_bedroom.copy() + [0] * (num_bedrooms + 1)  
        self.num_people_in_rooms[-1] = num_people_in_living_room  


        self.add_remove_frequency = 25

        self.epsilon = 0.1 # Exploration rate
        self.min_epsilon = 0.01  # Minimum exploration rate
        self.epsilon_decay = 0.99  # Exploration decay rate

        self.action_space = spaces.Box(low=np.array([0, 0]), high=np.array([6, 3]), dtype=np.float32) 

    def get_solar_radiation(self, time_of_day):

        # Defining solar radiation ranges for different times of the day
        if time_of_day < 6:  # Night
            solar_radiation = np.random.uniform(0, 100)
        elif time_of_day < 9:  # Early morning
            solar_radiation = np.random.uniform(500, 700)
        elif time_of_day < 15:  # Afternoon
            solar_radiation = np.random.uniform(700, 1000)
        elif time_of_day < 19:  # Evening
            solar_radiation = np.random.uniform(500, 700)
        else:  # Night
            solar_radiation = 0

        # Introducing randomness with occasional complete randomness
        if random.random() < self.randomness_factor:
            solar_radiation = np.random.uniform(self.solar_radiation_mean - self.solar_radiation_variation,
                                                 self.solar_radiation_mean + self.solar_radiation_variation)

        return solar_radiation

    def step(self, action):
        temperature_action = int(action[0])  # Temperature adjustment
        airflow_action = int(action[1])  # Airflow adjustment

        # Determine the temperature change based on the action
        temp_changes = np.zeros_like(self.temperatures)
        temp_change = 0
        if temperature_action == 0:  # Decrease temperature by 1 degree
            temp_change = -1
        elif temperature_action == 1:  # Keep temperature (no change)
            temp_change = 0
        elif temperature_action == 2:  # Increase temperature by 1 degree
            temp_change = 1
        elif temperature_action == 3:  # Decrease temperature by 2 degrees
            temp_change = -2
        elif temperature_action == 4:  # Increase temperature by 2 degrees
            temp_change = 2
        elif temperature_action == 5:  # Decrease temperature by 3 degrees
            temp_change = -3
        elif temperature_action == 6:  # Increase temperature by 3 degrees
            temp_change = 3

        # Apply the temperature change to all rooms
        temp_changes[:self.num_bedrooms] = temp_change

        # Clip temperature within the valid range
        self.temperatures += temp_changes
        self.temperatures = np.clip(self.temperatures, 16, 33)

        # Calculate energy produced by solar panels
        solar_radiation = self.get_solar_radiation(self.current_step / self.time_steps * 24)
        solar_energy = self.solar_panel_area * self.solar_panel_efficiency * solar_radiation

        # Calculate energy usage for temperature adjustment
        energy_usage = np.abs(temp_changes) * self.energy_usage_factor

        # Adjust airflow based on the action and number of people in the room
        airflow = self.base_airflow + airflow_action * (self.max_airflow / (self.action_space.high[1] - self.action_space.low[1]))
        airflow_energy_usage = airflow * self.airflow_energy_consumption  # Initialize airflow energy usage
        
        # Adjust airflow energy consumption based on the number of people in each room
        for room_idx, num_people in enumerate(self.num_people_in_rooms):
            if num_people > 0:
                airflow_energy_usage += self.airflow_energy_consumption * num_people

        self.energy_level = min(self.energy_level + solar_energy, self.energy_capacity)
        self.energy_level -= (np.sum(energy_usage) + airflow_energy_usage)

        # Clip energy level within the valid range
        self.energy_level = np.clip(self.energy_level, 0, self.energy_capacity)

        eps = 1e-6 
        temperature_rewards = -np.abs(self.temperatures - self.ideal_temperatures) * np.log(np.abs(self.temperatures - self.ideal_temperatures + eps))
        temperature_rewards_sum = 0.25*np.sum(temperature_rewards)

        energy_reward = 1 if self.energy_level >= self.energy_threshold else -1

        rewards = temperature_rewards_sum + energy_reward

        done = self.current_step >= self.time_steps

        self.current_step += 1

        if self.current_step % self.add_remove_frequency == 0:
            for i in range(self.num_bedrooms):
                if self.num_people_in_rooms[i] < self.max_people_per_room:
                    self.num_people_in_rooms[i] += 1
                    break

        self.epsilon = max(self.min_epsilon, self.epsilon * self.epsilon_decay)

        if random.random() < self.epsilon:
            action = self.action_space.sample()

        info = {}

        normalized_observation = np.hstack(((self.temperatures - 16) / 17, (self.energy_level - 0) / self.energy_capacity))
        normalized_observation = normalized_observation.reshape((-1, 1))

        return normalized_observation, rewards, done, info

    def reset(self):
        self.current_step = 0
        
        if self.current_step % self.cleaning_frequency == 0:
            self.energy_level -= self.cleaning_energy_consumption
            self.energy_level = max(0, min(self.energy_capacity, self.energy_level))

        normalized_observation = np.hstack(((self.temperatures - 16) / 17, (self.energy_level - 0) / self.energy_capacity))
        normalized_observation = normalized_observation.reshape((-1, 1))

        return normalized_observation


# Test the Environment

In [38]:
env_energy_saving = EnergySavingEnv_Disc()
env_energy_saving.observation_space.sample()

array([[ 0.04654082],
       [ 0.08470397],
       [-0.4847795 ],
       [-0.48447603],
       [-0.5138494 ]], dtype=float32)

In [39]:
env_energy_saving.reset()

array([[0.54432044],
       [0.49699098],
       [0.56276739],
       [0.49263875],
       [0.476     ]])

In [40]:
episodes = 2
for episode in range(1, episodes+1):
  obs = env_energy_saving.reset()
  done = False
  score = 0

  while not done:
    # env_energy_saving.render()
    action = env_energy_saving.action_space.sample()
    obs, reward, done, info = env_energy_saving.step(action)
    score += reward

  print('Episode: {} Score {}'.format(episode, score))

Episode: 1 Score -890.6701186122424
Episode: 2 Score -782.8049709508339


# Train model

We are going to train the model with different agents/models and compare the results obtained.

In [41]:
log_path = os.path.join('Training', 'House_Logs') 

### SAC

In [42]:
env_energy_saving = EnergySavingEnv_Cont()

  logger.warn(f"Box bound precision lowered by casting to {self.dtype}")


In [43]:
energy_saving_model = SAC('MlpPolicy', env_energy_saving, verbose=1, tensorboard_log=log_path)

Using cpu device
Wrapping the env with a `Monitor` wrapper
Wrapping the env in a DummyVecEnv.


In [44]:
energy_saving_model.learn(total_timesteps=250000)

Logging to Training/House_Logs/SAC_1
---------------------------------
| rollout/           |          |
|    ep_len_mean     | 101      |
|    ep_rew_mean     | -495     |
| time/              |          |
|    episodes        | 4        |
|    fps             | 84       |
|    time_elapsed    | 4        |
|    total_timesteps | 404      |
| train/             |          |
|    actor_loss      | 6.32     |
|    critic_loss     | 10.5     |
|    ent_coef        | 0.915    |
|    ent_coef_loss   | -0.271   |
|    learning_rate   | 0.0003   |
|    n_updates       | 303      |
---------------------------------
---------------------------------
| rollout/           |          |
|    ep_len_mean     | 101      |
|    ep_rew_mean     | -381     |
| time/              |          |
|    episodes        | 8        |
|    fps             | 82       |
|    time_elapsed    | 9        |
|    total_timesteps | 808      |
| train/             |          |
|    actor_loss      | 9.54     |
|    critic

In [None]:
path = os.path.join('Training', 'Saved Models', f'SAC_250k_env_house')
energy_saving_model.save(path)
del energy_saving_model

In [None]:
energy_model = SAC.load(path, env_energy_saving)

### Evaluation:

In [None]:
mean_reward, std_reward = evaluate_policy(energy_model, env_energy_saving, n_eval_episodes=10, render=True)

print(f"Mean reward:{mean_reward:.2f} +/- {std_reward:.2f}")

In [None]:
episodes = 5

for episode in range(1, episodes+1):
    obs = env_energy_saving.reset() 
    done = False
    score = 0

    while not done:
        # obs = obs.reshape((1, -1))
        action, _ = energy_model.predict(obs)  
        obs, reward, done, info = env_energy_saving.step(action)
        score += reward

    print('Episode: {} Score {}'.format(episode, score))

In [None]:
training_log_path = os.path.join(log_path, 'SAC_1')

In [None]:
!tensorboard --logdir={training_log_path}

### Action-Critic Model

In [18]:
env_energy_saving = EnergySavingEnv_Disc()

In [19]:
energy_saving_model = A2C('MlpPolicy', env_energy_saving, verbose=1, tensorboard_log=log_path)

Using cpu device
Wrapping the env with a `Monitor` wrapper
Wrapping the env in a DummyVecEnv.




In [20]:
energy_saving_model.learn(total_timesteps=250000)

Logging to Training/House_Logs/A2C_1
------------------------------------
| rollout/              |          |
|    ep_len_mean        | 101      |
|    ep_rew_mean        | -379     |
| time/                 |          |
|    fps                | 737      |
|    iterations         | 100      |
|    time_elapsed       | 0        |
|    total_timesteps    | 500      |
| train/                |          |
|    entropy_loss       | -3.25    |
|    explained_variance | 0.00778  |
|    learning_rate      | 0.0007   |
|    n_updates          | 99       |
|    policy_loss        | -50.1    |
|    value_loss         | 484      |
------------------------------------
------------------------------------
| rollout/              |          |
|    ep_len_mean        | 101      |
|    ep_rew_mean        | -489     |
| time/                 |          |
|    fps                | 734      |
|    iterations         | 200      |
|    time_elapsed       | 1        |
|    total_timesteps    | 1000     |
|

KeyboardInterrupt: 

In [None]:
path = os.path.join('Training', 'Saved Models', f'A2C_250k_env_house')
energy_saving_model.save(path)

del energy_saving_model



In [None]:
energy_model = A2C.load(path, env_energy_saving)

Wrapping the env with a `Monitor` wrapper
Wrapping the env in a DummyVecEnv.


#### Evalutaion

In [None]:
mean_reward, std_reward = evaluate_policy(energy_model, env_energy_saving, n_eval_episodes=10, render=True)

print(f"Mean reward:{mean_reward:.2f} +/- {std_reward:.2f}")

Mean reward:18.82 +/- 0.00


In [None]:
episodes = 5

for episode in range(1, episodes+1):
    obs = env_energy_saving.reset() 
    done = False
    score = 0

    while not done:
        # obs = obs.reshape((1, -1))
        action, _ = energy_model.predict(obs)  
        obs, reward, done, info = env_energy_saving.step(action)
        score += reward

    print('Episode: {} Score {}'.format(episode, score))

Episode: 1 Score 18.82484262256623
Episode: 2 Score 18.82484262256623
Episode: 3 Score 18.82484262256623
Episode: 4 Score 18.82484262256623
Episode: 5 Score 18.82484262256623


In [None]:
training_log_path = os.path.join(log_path, 'A2C_1')

In [None]:
!tensorboard --logdir={training_log_path}

2024-05-01 22:48:33.169288: I tensorflow/core/platform/cpu_feature_guard.cc:210] This TensorFlow binary is optimized to use available CPU instructions in performance-critical operations.
To enable the following instructions: AVX2 FMA, in other operations, rebuild TensorFlow with the appropriate compiler flags.

NOTE: Using experimental fast data loading logic. To disable, pass
    "--load_fast=false" and report issues on GitHub. More details:
    https://github.com/tensorflow/tensorboard/issues/4784

Serving TensorBoard on localhost; to expose to the network, use a proxy or pass --bind_all
TensorBoard 2.16.2 at http://localhost:6006/ (Press CTRL+C to quit)
^C


### PPO

In [21]:
energy_saving_model = PPO('MlpPolicy', env_energy_saving, verbose=1, tensorboard_log=log_path)

Using cpu device
Wrapping the env with a `Monitor` wrapper
Wrapping the env in a DummyVecEnv.




In [22]:
energy_saving_model.learn(total_timesteps=250000)

Logging to Training/House_Logs/PPO_1
---------------------------------
| rollout/           |          |
|    ep_len_mean     | 101      |
|    ep_rew_mean     | -595     |
| time/              |          |
|    fps             | 1520     |
|    iterations      | 1        |
|    time_elapsed    | 1        |
|    total_timesteps | 2048     |
---------------------------------
-----------------------------------------
| rollout/                |             |
|    ep_len_mean          | 101         |
|    ep_rew_mean          | -533        |
| time/                   |             |
|    fps                  | 1118        |
|    iterations           | 2           |
|    time_elapsed         | 3           |
|    total_timesteps      | 4096        |
| train/                  |             |
|    approx_kl            | 0.009818192 |
|    clip_fraction        | 0.0416      |
|    clip_range           | 0.2         |
|    entropy_loss         | -3.33       |
|    explained_variance   | -0.0027

<stable_baselines3.ppo.ppo.PPO at 0x7d64f353fa00>

In [23]:
path = os.path.join('Training', 'Saved Models', f'PPO_250k_env_house')
energy_saving_model.save(path)

del energy_saving_model



In [24]:
energy_model = PPO.load(path, env_energy_saving)

Wrapping the env with a `Monitor` wrapper
Wrapping the env in a DummyVecEnv.


#### Evalutation:

In [25]:
mean_reward, std_reward = evaluate_policy(energy_model, env_energy_saving, n_eval_episodes=10, render=True)

print(f"Mean reward:{mean_reward:.2f} +/- {std_reward:.2f}")



Mean reward:1.75 +/- 0.00


In [26]:
episodes = 5

for episode in range(1, episodes+1):
    obs = env_energy_saving.reset() 
    done = False
    score = 0

    while not done:
        # obs = obs.reshape((1, -1))
        action, _ = energy_model.predict(obs)  
        obs, reward, done, info = env_energy_saving.step(action)
        score += reward

    print('Episode: {} Score {}'.format(episode, score))

Episode: 1 Score 1.7485755700566161
Episode: 2 Score 1.7485695700558666
Episode: 3 Score 1.7485710700573665
Episode: 4 Score 1.7485710700551165
Episode: 5 Score 1.7485763200584918


In [27]:
training_log_path = os.path.join(log_path, 'PPO_1')

In [28]:
!tensorboard --logdir={training_log_path}

2024-05-01 23:04:22.567428: I tensorflow/core/platform/cpu_feature_guard.cc:210] This TensorFlow binary is optimized to use available CPU instructions in performance-critical operations.
To enable the following instructions: AVX2 FMA, in other operations, rebuild TensorFlow with the appropriate compiler flags.

NOTE: Using experimental fast data loading logic. To disable, pass
    "--load_fast=false" and report issues on GitHub. More details:
    https://github.com/tensorflow/tensorboard/issues/4784

Serving TensorBoard on localhost; to expose to the network, use a proxy or pass --bind_all
TensorBoard 2.16.2 at http://localhost:6006/ (Press CTRL+C to quit)
^C


### DQN

In [29]:
energy_saving_model = DQN('MlpPolicy', env_energy_saving, verbose=1, tensorboard_log=log_path)
energy_saving_model.learn(total_timesteps=250000)

Using cpu device
Wrapping the env with a `Monitor` wrapper
Wrapping the env in a DummyVecEnv.
Logging to Training/House_Logs/DQN_1
----------------------------------
| rollout/            |          |
|    ep_len_mean      | 101      |
|    ep_rew_mean      | -704     |
|    exploration_rate | 0.985    |
| time/               |          |
|    episodes         | 4        |
|    fps              | 5465     |
|    time_elapsed     | 0        |
|    total_timesteps  | 404      |
----------------------------------
----------------------------------
| rollout/            |          |
|    ep_len_mean      | 101      |
|    ep_rew_mean      | -733     |
|    exploration_rate | 0.969    |
| time/               |          |
|    episodes         | 8        |
|    fps              | 5647     |
|    time_elapsed     | 0        |
|    total_timesteps  | 808      |
----------------------------------
----------------------------------
| rollout/            |          |
|    ep_len_mean      | 101  

<stable_baselines3.dqn.dqn.DQN at 0x7d64f35844c0>

In [30]:
path = os.path.join('Training', 'Saved Models', f'DQN_250k_env_house')
energy_saving_model.save(path)
del energy_saving_model

#### Evaluation:

In [31]:
mean_reward, std_reward = evaluate_policy(energy_model, env_energy_saving, n_eval_episodes=10, render=True)
print(f"Mean reward:{mean_reward:.2f} +/- {std_reward:.2f}")

Mean reward:1.75 +/- 0.00


In [32]:

episodes = 5

for episode in range(1, episodes+1):
    obs = env_energy_saving.reset() 
    done = False
    score = 0

    while not done:
        # obs = obs.reshape((1, -1))
        action, _ = energy_model.predict(obs)  
        obs, reward, done, info = env_energy_saving.step(action)
        score += reward

    print('Episode: {} Score {}'.format(episode, score))

Episode: 1 Score 1.748575570056617
Episode: 2 Score 1.7485733200592417
Episode: 3 Score 1.748575570058117
Episode: 4 Score 1.7485740700566168
Episode: 5 Score 1.748572570055117


In [33]:
training_log_path = os.path.join(log_path, 'DQN_1')

In [34]:
!tensorboard --logdir={training_log_path}

2024-05-01 23:10:51.992422: I tensorflow/core/platform/cpu_feature_guard.cc:210] This TensorFlow binary is optimized to use available CPU instructions in performance-critical operations.
To enable the following instructions: AVX2 FMA, in other operations, rebuild TensorFlow with the appropriate compiler flags.

NOTE: Using experimental fast data loading logic. To disable, pass
    "--load_fast=false" and report issues on GitHub. More details:
    https://github.com/tensorflow/tensorboard/issues/4784

Serving TensorBoard on localhost; to expose to the network, use a proxy or pass --bind_all
TensorBoard 2.16.2 at http://localhost:6006/ (Press CTRL+C to quit)
^C
