# Import dependencies

In [2]:
import gymnasium as gym
from gymnasium import Env
from gymnasium.spaces import Discrete, Box

import numpy as np
import random
import os

from stable_baselines3 import DQN,A2C,PPO,SAC

from stable_baselines3.common.vec_env import DummyVecEnv
from stable_baselines3.common.evaluation import evaluate_policy

# Objectives:

The objective of the project is to design, train and evaluate a framework for an agent in a low-energy environment. It seeks to efficiently manage the heat and air in the room to reduce energy consumption and improve occupant comfort. 

Leveraging reinforcement learning, the goal is to train agents to dynamically adjust controls based on factors such as occupancy, solar radiation, energy availability, and so on. 

Ultimately, the project aims to help achieve optimal energy without compromising comfort, as likewise sustainable building design.

# The Environment

#### The Version with the discrete action space

In [3]:
import numpy as np
import random
import gym
import gym.spaces as spaces

class EnergySavingEnv_Disc(gym.Env):
    def __init__(self):
        self.action_space = spaces.Discrete(28)  # Total number of actions: 7 temperature adjustments * 4 airflow adjustments
        
        self.observation_space = spaces.Box(low=np.array([-1, -1]), high=np.array([1, 1]), dtype=np.float32)  # Normalized observation space: temperature and energy level

        # Define the initial temperature (random between 24 and 26 degrees Celsius)
        self.initial_temperature = np.random.uniform(24, 26)

        # Set the current temperature to the initial temperature
        self.temperature = self.initial_temperature

        # Initialize energy level
        self.energy_level = 60

        # Define the ideal temperature
        self.ideal_temperature = 22

        self.energy_capacity = 100
        self.energy_threshold = 20  
        self.energy_usage_factor = 2  
        self.base_airflow = 0 
        self.max_airflow = 3 
        self.airflow_energy_consumption = 5  # Energy consumption per unit increase in airflow

        self.time_steps = 100

        self.current_step = 0

        self.cleaning_energy_consumption = 5  # Energy consumed during home cleaning

        # Define the frequency of home cleaning (once a day/episode)
        self.cleaning_frequency = 1

        # The solar energy parameters
        self.solar_panel_efficiency = 0.18  # Efficiency of solar panels
        self.solar_panel_area = 25  # Area covered by solar panels in square meters

        self.base_electricity_cost = 1  # Base cost per unit of electricity

        self.solar_radiation_variation = 100  # Maximum variation in solar radiation
        self.solar_radiation_mean = 700  # Mean solar radiation during the day
        self.randomness_factor = 0.1  # Factor to control randomness

        self.max_people = 5

        self.num_people = 0

        self.add_remove_frequency = 25

        self.epsilon = 0.1  # Exploration rate
        self.min_epsilon = 0.01  # Minimum exploration rate
        self.epsilon_decay = 0.99  # Exploration decay rate

    def get_solar_radiation(self, time_of_day):
        # Defining solar radiation ranges for different times of the day
        if time_of_day < 6:  # Night
            solar_radiation = np.random.uniform(0, 100)
        elif time_of_day < 9:  # Early morning
            solar_radiation = np.random.uniform(500, 700)
        elif time_of_day < 15:  # Afternoon
            solar_radiation = np.random.uniform(700, 1000)
        elif time_of_day < 19:  # Evening
            solar_radiation = np.random.uniform(500, 700)
        else:  # Night
            solar_radiation = 0

        # Introduce randomness with occasional complete randomness
        if random.random() < self.randomness_factor:
            solar_radiation = np.random.uniform(self.solar_radiation_mean - self.solar_radiation_variation,
                                                 self.solar_radiation_mean + self.solar_radiation_variation)

        return solar_radiation

    def step(self, action):
        temperature_action = action // 4 + 1  # Temperature adjustment
        airflow_action = action % 4  # Airflow adjustment

        # Determine the temperature change based on the action
        temp_change = 0
        if temperature_action == 1:  # Decrease temperature by 1 degree
            temp_change = -1
        elif temperature_action == 2:  # Keep temperature (no change)
            temp_change = 0
        elif temperature_action == 3:  # Increase temperature by 1 degree
            temp_change = 1
        elif temperature_action == 4:  # Decrease temperature by 2 degrees
            temp_change = -2
        elif temperature_action == 5:  # Increase temperature by 2 degrees
            temp_change = 2
        elif temperature_action == 6:  # Decrease temperature by 3 degrees
            temp_change = -3
        elif temperature_action == 7:  # Increase temperature by 3 degrees
            temp_change = 3

        # Apply the temperature change
        self.temperature += temp_change

        # Clip temperature within the valid range
        self.temperature = max(16, min(33, self.temperature))

        # Calculate energy produced by solar panels
        solar_radiation = self.get_solar_radiation(self.current_step / self.time_steps * 24)
        solar_energy = self.solar_panel_area * self.solar_panel_efficiency * solar_radiation

        # Calculate energy usage for temperature adjustment
        energy_usage = (abs(temp_change)) * self.energy_usage_factor

        # Adjust airflow based on the action and number of people in the room
        airflow = self.base_airflow + airflow_action * (self.max_airflow / (self.action_space.n // 4 - 1))
        airflow_energy_usage = 0  # Initialize airflow energy usage
        
        # Adjust airflow energy consumption based on the number of people in the room
        if self.num_people > 0:
            airflow_energy_usage = self.airflow_energy_consumption * self.num_people

        self.energy_level = min(self.energy_level + solar_energy, self.energy_capacity)
        self.energy_level -= (energy_usage + airflow_energy_usage)

        # Clip energy level within the valid range
        self.energy_level = max(0, min(self.energy_capacity, self.energy_level))

        temperature_reward = -0.5 * (abs(self.temperature - self.ideal_temperature) ** 2)

        if self.energy_level >= self.energy_threshold:
            energy_reward = 1
        else:
            energy_reward = -1

        reward = temperature_reward + energy_reward

        done = self.current_step >= self.time_steps

        self.current_step += 1

        if self.current_step % self.add_remove_frequency == 0:
            if random.random() < 0.5: 
                if self.num_people < self.max_people:
                    self.num_people += 1
            else:
                if self.num_people > 0:
                    self.num_people -= 1

        self.epsilon = max(self.min_epsilon, self.epsilon * self.epsilon_decay)

        if random.random() < self.epsilon:
            action = self.action_space.sample()

        info = {}

        normalized_observation = np.array([(self.temperature - 16) / 17, self.energy_level / self.energy_capacity])

        return normalized_observation, reward, done, info

    def reset(self):
        self.current_step = 0
        self.num_people = random.randint(0, self.max_people)
        if self.current_step % self.cleaning_frequency == 0:
            self.energy_level -= self.cleaning_energy_consumption
            self.energy_level = max(0, min(self.energy_capacity, self.energy_level))

        normalized_observation = np.array([(self.temperature - 16) / 17, self.energy_level / self.energy_capacity])

        return normalized_observation

#### The Version with the continuos action space

In [4]:
class EnergySavingEnv_Cont(gym.Env):
    def __init__(self):
        self.observation_space = spaces.Box(low=np.array([-1, -1]), high=np.array([1, 1]), dtype=np.float32)  # Normalized observation space: temperature and energy level

        # Define the initial temperature (random between 24 and 26 degrees Celsius)
        self.initial_temperature = np.random.uniform(24, 26)

        # Set the current temperature to the initial temperature
        self.temperature = self.initial_temperature

        # Initialize energy level
        self.energy_level = 60

        # Define the ideal temperature
        self.ideal_temperature = 22

        # Define the resource manager properties
        self.energy_capacity = 100
        self.energy_threshold = 30  
        self.energy_usage_factor = 2  
        self.base_airflow = 0 
        self.max_airflow = 3 
        self.airflow_energy_consumption = 5  # Energy consumption per unit increase in airflow

        # Define the number of time steps for the resource manager
        self.time_steps = 100

        # Initialize the current time step
        self.current_step = 0

        self.cleaning_energy_consumption = 5  # Energy consumed during home cleaning

        # Define the frequency of home cleaning (once a day/episode)
        self.cleaning_frequency = 1

        # Adding solar energy parameters
        self.solar_panel_efficiency = 0.18  # Efficiency of solar panels
        self.solar_panel_area = 25  # Area covered by solar panels in square meters

        self.base_electricity_cost = 1  # Base cost per unit of electricity

        self.solar_radiation_variation = 100  # Maximum variation in solar radiation
        self.solar_radiation_mean = 700  # Mean solar radiation during the day
        self.randomness_factor = 0.1  # Factor to control randomness

        self.max_people = 5

        self.num_people = 0

        self.add_remove_frequency = 25

        self.epsilon = 0.1  # Exploration rate
        self.min_epsilon = 0.01  # Minimum exploration rate
        self.epsilon_decay = 0.99  # Exploration decay rate

        self.action_space = spaces.Box(low=np.array([-1.0, -1.0]), high=np.array([1.0, 1.0]), dtype=np.float32)

    def get_solar_radiation(self, time_of_day):
        # Define solar radiation ranges for different times of the day
        if time_of_day < 6:  # Night
            solar_radiation = np.random.uniform(0, 100)
        elif time_of_day < 9:  # Early morning
            solar_radiation = np.random.uniform(500, 700)
        elif time_of_day < 15:  # Afternoon
            solar_radiation = np.random.uniform(700, 1000)
        elif time_of_day < 19:  # Evening
            solar_radiation = np.random.uniform(500, 700)
        else:  # Night
            solar_radiation = 0

        if random.random() < self.randomness_factor:
            solar_radiation = np.random.uniform(self.solar_radiation_mean - self.solar_radiation_variation,
                                                 self.solar_radiation_mean + self.solar_radiation_variation)

        return solar_radiation

    def step(self, action):
        temperature_action = action[0]  # Temperature adjustment
        airflow_action = action[1]  # Airflow adjustment

        # Determine the temperature change based on the action
        temp_change = temperature_action

        # Apply the temperature change
        self.temperature += temp_change

        # Clip temperature within the valid range
        self.temperature = max(16, min(33, self.temperature))

        # Calculate energy produced by solar panels
        solar_radiation = self.get_solar_radiation(self.current_step / self.time_steps * 24)
        solar_energy = self.solar_panel_area * self.solar_panel_efficiency * solar_radiation

        # Calculate energy usage for temperature adjustment
        energy_usage = (abs(temp_change)) * self.energy_usage_factor

        # Adjust airflow based on the action and number of people in the room
        airflow = self.base_airflow + airflow_action * self.max_airflow
        airflow_energy_usage = 0  # Initialize airflow energy usage

        # Adjust airflow energy consumption based on the number of people in the room
        if self.num_people > 0:
            airflow_energy_usage = self.airflow_energy_consumption * self.num_people

        # Calculate net energy (energy usage - solar energy)
        self.energy_level = min(self.energy_level + solar_energy, self.energy_capacity)
        self.energy_level -= (energy_usage + airflow_energy_usage)

        # Clip energy level within the valid range
        self.energy_level = max(0, min(self.energy_capacity, self.energy_level))

        # Calculate reward based on energy-saving state and temperature
        temperature_reward = -0.5 * (abs(self.temperature - self.ideal_temperature) ** 2)

        if self.energy_level >= self.energy_threshold:
            energy_reward = 1
        else:
            energy_reward = -1

        reward = temperature_reward + energy_reward

        done = self.current_step >= self.time_steps
        self.current_step += 1

        if self.current_step % self.add_remove_frequency == 0:
            if random.random() < 0.5: 
                if self.num_people < self.max_people:
                    self.num_people += 1
            else:
                if self.num_people > 0:
                    self.num_people -= 1

        # self.epsilon = max(self.min_epsilon, self.epsilon * self.epsilon_decay)
        # explore_threshold = np.random.uniform(0, 1)
        # if explore_threshold < self.epsilon:
        #     action = self.action_space.sample()

        info = {}
        normalized_observation = np.array([(self.temperature - 16) / 17, self.energy_level / self.energy_capacity])

        return normalized_observation, reward, done, info

    def reset(self):
        self.current_step = 0
        self.num_people = random.randint(0, self.max_people)
        if self.current_step % self.cleaning_frequency == 0:
            # Consume energy for home cleaning
            self.energy_level -= self.cleaning_energy_consumption
            self.energy_level = max(0, min(self.energy_capacity, self.energy_level))

        normalized_observation = np.array([(self.temperature - 16) / 17, self.energy_level / self.energy_capacity])
        return normalized_observation

# Test the Environment

In [5]:
env_energy_saving = EnergySavingEnv_Cont()
env_energy_saving.observation_space.sample()

  logger.warn(f"Box bound precision lowered by casting to {self.dtype}")


array([-0.2650033, -0.2998469], dtype=float32)

In [6]:
env_energy_saving.reset()

array([0.53369334, 0.55      ])

In [7]:
episodes = 2
for episode in range(1, episodes+1):
  obs = env_energy_saving.reset()
  done = False
  score = 0

  while not done:
    # env_energy_saving.render()
    action = env_energy_saving.action_space.sample()
    obs, reward, done, info = env_energy_saving.step(action)
    score += reward

  print('Episode: {} Score {}'.format(episode, score))

Episode: 1 Score -946.1001825326873
Episode: 2 Score -1116.4597682041344


# Train model

We are going to train the model with different agents/models and compare the results obtained.

In [8]:
log_path = os.path.join('Training', 'Room_Logs')

### SAC

In [9]:
# learning_rate = 0.001  # Example learning rate, you can change it to your desired value

# energy_saving_model = SAC('MlpPolicy', env_energy_saving, learning_rate=learning_rate, verbose=1, tensorboard_log=log_path)
# # energy_saving_model = SAC('MlpPolicy', env_energy_saving, verbose=1, tensorboard_log=log_path)

In [10]:
# energy_saving_model.learn(total_timesteps=250000)

In [11]:
# path = os.path.join('Training', 'Saved Models', f'SAC_250k_env_room')
# energy_saving_model.save(path)
# del energy_saving_model

In [12]:
# energy_model = SAC.load(path, env_energy_saving)

#### Evaluations:

In [13]:
# mean_reward, std_reward = evaluate_policy(energy_model, env_energy_saving, n_eval_episodes=10, render=True)

# print(f"Mean reward:{mean_reward:.2f} +/- {std_reward:.2f}")

In [14]:
# episodes = 5

# for episode in range(1, episodes+1):
#     obs = env_energy_saving.reset() 
#     done = False
#     score = 0

#     while not done:
#         # obs = obs.reshape((1, -1))
#         action, _ = energy_model.predict(obs)  
#         obs, reward, done, info = env_energy_saving.step(action)
#         score += reward

#     print('Episode: {} Score {}'.format(episode, score))

In [15]:
# training_log_path = os.path.join(log_path, 'SAC_1')

In [16]:
# !tensorboard --logdir={training_log_path}

### Action-Critic Model

In [17]:
env_energy_saving = EnergySavingEnv_Disc()

In [18]:
energy_saving_model = A2C('MlpPolicy', env_energy_saving, verbose=1, tensorboard_log=log_path)



Using cpu device
Wrapping the env with a `Monitor` wrapper
Wrapping the env in a DummyVecEnv.


In [19]:
energy_saving_model.learn(total_timesteps=250000)

Logging to Training/Room_Logs/A2C_1
------------------------------------
| rollout/              |          |
|    ep_len_mean        | 101      |
|    ep_rew_mean        | -2.3e+03 |
| time/                 |          |
|    fps                | 589      |
|    iterations         | 100      |
|    time_elapsed       | 0        |
|    total_timesteps    | 500      |
| train/                |          |
|    entropy_loss       | -3.3     |
|    explained_variance | 0.475    |
|    learning_rate      | 0.0007   |
|    n_updates          | 99       |
|    policy_loss        | -67.9    |
|    value_loss         | 422      |
------------------------------------
------------------------------------
| rollout/              |          |
|    ep_len_mean        | 101      |
|    ep_rew_mean        | -2.4e+03 |
| time/                 |          |
|    fps                | 675      |
|    iterations         | 200      |
|    time_elapsed       | 1        |
|    total_timesteps    | 1000     |
| 

<stable_baselines3.a2c.a2c.A2C at 0x702c34784370>

In [20]:
path = os.path.join('Training', 'Saved Models', f'A2C_250k_env_room')
energy_saving_model.save(path)

del energy_saving_model

In [21]:
energy_model = A2C.load(path, env_energy_saving)

Wrapping the env with a `Monitor` wrapper
Wrapping the env in a DummyVecEnv.




#### Evalutaion

In [22]:
mean_reward, std_reward = evaluate_policy(energy_model, env_energy_saving, n_eval_episodes=10, render=True)

print(f"Mean reward:{mean_reward:.2f} +/- {std_reward:.2f}")



Mean reward:91.40 +/- 10.15


In [23]:
episodes = 5

for episode in range(1, episodes+1):
    obs = env_energy_saving.reset() 
    done = False
    score = 0

    while not done:
        # obs = obs.reshape((1, -1))
        action, _ = energy_model.predict(obs)  
        obs, reward, done, info = env_energy_saving.step(action)
        score += reward

    print('Episode: {} Score {}'.format(episode, score))

Episode: 1 Score 89.0
Episode: 2 Score 91.0
Episode: 3 Score 69.0
Episode: 4 Score 69.0
Episode: 5 Score 75.0


In [24]:
training_log_path = os.path.join(log_path, 'A2C_1')

In [26]:
!tensorboard --logdir={training_log_path}

2024-05-02 17:46:00.671412: I tensorflow/core/platform/cpu_feature_guard.cc:210] This TensorFlow binary is optimized to use available CPU instructions in performance-critical operations.
To enable the following instructions: AVX2 FMA, in other operations, rebuild TensorFlow with the appropriate compiler flags.

NOTE: Using experimental fast data loading logic. To disable, pass
    "--load_fast=false" and report issues on GitHub. More details:
    https://github.com/tensorflow/tensorboard/issues/4784

Serving TensorBoard on localhost; to expose to the network, use a proxy or pass --bind_all
TensorBoard 2.16.2 at http://localhost:6006/ (Press CTRL+C to quit)
^C


### PPO

In [27]:
energy_saving_model = PPO('MlpPolicy', env_energy_saving, verbose=1, tensorboard_log=log_path)

Using cpu device
Wrapping the env with a `Monitor` wrapper
Wrapping the env in a DummyVecEnv.


In [28]:
energy_saving_model.learn(total_timesteps=250000)

Logging to Training/Room_Logs/PPO_1
----------------------------------
| rollout/           |           |
|    ep_len_mean     | 101       |
|    ep_rew_mean     | -2.13e+03 |
| time/              |           |
|    fps             | 2023      |
|    iterations      | 1         |
|    time_elapsed    | 1         |
|    total_timesteps | 2048      |
----------------------------------
------------------------------------------
| rollout/                |              |
|    ep_len_mean          | 101          |
|    ep_rew_mean          | -1.79e+03    |
| time/                   |              |
|    fps                  | 1208         |
|    iterations           | 2            |
|    time_elapsed         | 3            |
|    total_timesteps      | 4096         |
| train/                  |              |
|    approx_kl            | 0.0025639737 |
|    clip_fraction        | 0            |
|    clip_range           | 0.2          |
|    entropy_loss         | -3.33        |
|    explain

<stable_baselines3.ppo.ppo.PPO at 0x702c2a01f1f0>

In [29]:
path = os.path.join('Training', 'Saved Models', f'PPO_250k_env_room')
energy_saving_model.save(path)

del energy_saving_model

In [30]:
energy_model = PPO.load(path, env_energy_saving)

Wrapping the env with a `Monitor` wrapper
Wrapping the env in a DummyVecEnv.


#### Evalutation:

In [31]:
mean_reward, std_reward = evaluate_policy(energy_model, env_energy_saving, n_eval_episodes=10, render=True)

print(f"Mean reward:{mean_reward:.2f} +/- {std_reward:.2f}")

Mean reward:83.00 +/- 10.84


In [32]:
episodes = 5

for episode in range(1, episodes+1):
    obs = env_energy_saving.reset() 
    done = False
    score = 0

    while not done:
        # obs = obs.reshape((1, -1))
        action, _ = energy_model.predict(obs)  
        obs, reward, done, info = env_energy_saving.step(action)
        score += reward

    print('Episode: {} Score {}'.format(episode, score))

Episode: 1 Score 62.0
Episode: 2 Score 86.5
Episode: 3 Score 73.0
Episode: 4 Score 69.5
Episode: 5 Score 97.5


In [33]:
training_log_path = os.path.join(log_path, 'PPO_1')

In [35]:
!tensorboard --logdir={training_log_path}

2024-05-02 18:02:43.465357: I tensorflow/core/platform/cpu_feature_guard.cc:210] This TensorFlow binary is optimized to use available CPU instructions in performance-critical operations.
To enable the following instructions: AVX2 FMA, in other operations, rebuild TensorFlow with the appropriate compiler flags.
^C
Traceback (most recent call last):
  File "/home/manish/.local/lib/python3.10/site-packages/tensorboard/compat/__init__.py", line 42, in tf
    from tensorboard.compat import notf  # noqa: F401
ImportError: cannot import name 'notf' from 'tensorboard.compat' (/home/manish/.local/lib/python3.10/site-packages/tensorboard/compat/__init__.py)

During handling of the above exception, another exception occurred:

Traceback (most recent call last):
  File "/home/manish/.local/bin/tensorboard", line 8, in <module>
    sys.exit(run_main())
  File "/home/manish/.local/lib/python3.10/site-packages/tensorboard/main.py", line 38, in run_main
    main_lib.global_init()
  File "/home/manish/

### DQN

In [42]:
env_energy_saving = EnergySavingEnv_Disc()

energy_saving_model = DQN('MlpPolicy', env_energy_saving, verbose=1, tensorboard_log=log_path)

Using cpu device
Wrapping the env with a `Monitor` wrapper
Wrapping the env in a DummyVecEnv.


In [43]:
energy_saving_model.learn(total_timesteps=250000)

Logging to Training/Room_Logs/DQN_1
-----------------------------------
| rollout/            |           |
|    ep_len_mean      | 101       |
|    ep_rew_mean      | -2.53e+03 |
|    exploration_rate | 0.985     |
| time/               |           |
|    episodes         | 4         |
|    fps              | 10551     |
|    time_elapsed     | 0         |
|    total_timesteps  | 404       |
-----------------------------------
-----------------------------------
| rollout/            |           |
|    ep_len_mean      | 101       |
|    ep_rew_mean      | -1.86e+03 |
|    exploration_rate | 0.969     |
| time/               |           |
|    episodes         | 8         |
|    fps              | 10113     |
|    time_elapsed     | 0         |
|    total_timesteps  | 808       |
-----------------------------------
-----------------------------------
| rollout/            |           |
|    ep_len_mean      | 101       |
|    ep_rew_mean      | -1.94e+03 |
|    exploration_rate | 0.95

<stable_baselines3.dqn.dqn.DQN at 0x702c2a0e9090>

In [44]:
path = os.path.join('Training', 'Saved Models', f'DQN_250k_env_room')
energy_saving_model.save(path)
del energy_saving_model

#### Evaluation

In [45]:
mean_reward, std_reward = evaluate_policy(energy_model, env_energy_saving, n_eval_episodes=10, render=True)

print(f"Mean reward:{mean_reward:.2f} +/- {std_reward:.2f}")

Mean reward:83.20 +/- 10.25


In [46]:
episodes = 5

for episode in range(1, episodes+1):
    obs = env_energy_saving.reset() 
    done = False
    score = 0

    while not done:
        # obs = obs.reshape((1, -1))
        action, _ = energy_model.predict(obs)  
        obs, reward, done, info = env_energy_saving.step(action)
        score += reward

    print('Episode: {} Score {}'.format(episode, score))

Episode: 1 Score 70.5
Episode: 2 Score 78.5
Episode: 3 Score 57.5
Episode: 4 Score 68.5
Episode: 5 Score 100.5


In [47]:
training_log_path = os.path.join(log_path, 'DQN_1')

In [48]:
!tensorboard --logdir={training_log_path}

2024-05-02 18:05:57.553575: I tensorflow/core/platform/cpu_feature_guard.cc:210] This TensorFlow binary is optimized to use available CPU instructions in performance-critical operations.
To enable the following instructions: AVX2 FMA, in other operations, rebuild TensorFlow with the appropriate compiler flags.

NOTE: Using experimental fast data loading logic. To disable, pass
    "--load_fast=false" and report issues on GitHub. More details:
    https://github.com/tensorflow/tensorboard/issues/4784

Serving TensorBoard on localhost; to expose to the network, use a proxy or pass --bind_all
TensorBoard 2.16.2 at http://localhost:6006/ (Press CTRL+C to quit)
^C
