In [None]:
import numpy as np
import torch
from citylearn.citylearn import CityLearnEnv
from citylearn.wrappers import NormalizedObservationWrapper, StableBaselines3Wrapper
from collections import defaultdict

def preprocess_observation(observation):
    """Replace NaN values in observation with a mean value."""
    observation = np.array(observation)
    mean_value = np.nanmean(observation)  # `np.nanmean` ignores NaNs when calculating the mean

    if np.isnan(observation).any():
        observation = np.full_like(observation, default_value)  # Replace NaN with default value
    return observation
    
def preprocess_reward(reward, default_value=0.0):
    """Replace NaN values in reward with a default value."""
    if np.isnan(reward):
        print(f"Found reward NAN: {reward}")
        print("Warning: NaN detected in reward, replacing with default value.")
        reward = default_value  # Replace NaN with default reward value
    print(f"Updating reward with reward: {reward}")
    return reward
    
def preprocess_action(action, action_space, default_value=0.0):
    """Ensure that action is within the action space bounds and not NaN."""
    if np.isnan(action).any():
        print("Warning: NaN detected in action, replacing with default value.")
        action = np.full_like(action, default_value)  # Replace NaN with default value
    
    # Ensure action is within the bounds of the action space
    action = np.clip(action, action_space.low, action_space.high)
    return action

# Simple rule-based policy that returns actions for multiple buildings
def rbc_policy(observation, action_space, num_buildings=9):
    """
    Simple rule based policy based on day or night time
    """
    # Action for each building (initialize to zero)
    actions = np.zeros(num_buildings)
    
    hour = observation[2]  # Hour index is 2 for all observations
    
    for i in range(num_buildings):
        if 9 <= hour <= 21:
            # Daytime: release stored energy for each building
            actions[i] = -0.08
        elif (1 <= hour <= 8) or (22 <= hour <= 24):
            # Early nightime: store DHW and/or cooling energy for each building
            actions[i] = 0.091

        # Ensure the action is within the bounds of action_space (as action_space is continuous)
        actions[i] = np.clip(actions[i], action_space.low[i], action_space.high[i])  # Clip to action space bounds
    
    return actions


class MetaRLAgent:
    """
    Meta-Reinforcement Learning Agent using Rule-Based Policy and Meta-Learning Adaptation
    """

    def __init__(self, action_space, observation_space, num_buildings=9):
        self.action_space = action_space
        self.observation_space = observation_space
        self.num_buildings = num_buildings  # Set the number of buildings
        self.models = defaultdict(lambda: None)  # For task-specific models (e.g., one per building)
        self.meta_model = None  # Placeholder for meta-learner, like PEARL or MAML

    def register_reset(self, observation, action_space, agent_id):
        """Initialize the agent, adapt the model to the task and return action"""
        self.action_space = action_space  # Use the action space directly
        observation = preprocess_observation(observation)  # Preprocess observation to handle NaN

        return self.compute_action(observation, agent_id)

    def compute_action(self, observation, agent_id):
        """Compute action using rule-based policy or learned adaptation"""
        # Rule-based policy
        observation = preprocess_observation(observation)  # Handle NaN values in observation

        actions = rbc_policy(observation, self.action_space, num_buildings=self.num_buildings)
        
        # If you had a meta-model, you could compute the action based on adaptation to the task
        if self.meta_model:
            pass  # Can implement meta-learning adaptation here, using the meta-model

        actions = preprocess_action(actions, self.action_space)  # Preprocess action to handle NaN
        return actions

    def adapt_to_task(self, task_data):
        """Adapt the agent to a new task using the provided task data."""
        task_name = task_data["task_name"]
        observations = task_data["observations"]
        
        # Example: Adaptation could involve training or fine-tuning on task-specific data
        if self.models[task_name] is None:
            self.models[task_name] = "Model"  # Initialize a new model for the task
        
        # Fine-tune or adapt the model using task data (could be gradient-based)
        
        return self.models[task_name]

    def update(self, observation, reward, next_observation):
        """Update the model using new experience (observation, reward, next_observation)"""
        # Preprocess the data
        observation = preprocess_observation(observation)
        reward = preprocess_reward(reward)
        next_observation = preprocess_observation(next_observation)
        
        # Meta-learning update
        # This could involve updating the agent's model based on new experience or fine-tuning it.
        # For example, if you are using gradient-based meta-learning:
        
        # Example: Here, we're just printing the values, in a real setting, you'd perform backpropagation
        print(f"Updating model with observation: {observation}, reward: {reward}, next_observation: {next_observation}")


    def meta_train(self, env, num_iterations=1000, episodes=2):
        """Meta-training loop similar to model.learn"""
        for iteration in range(num_iterations):
            # Loop over episodes and adapt to new task at the start of each episode
            for episode in range(episodes):
                observation, _ = env.reset()  # Get initial observation
                done = False
                
                while not done:
                    # Compute the action using the agent
                    action = self.compute_action(observation, agent_id=0)
                    
                    # Take a step in the environment
                    observation, reward, done, info, _ = env.step(action)  # Unpack all 5 values from step
                    
                     # Preprocess the reward to handle NaN
                    reward = preprocess_reward(reward)
                    
                    # Adapt to task based on observations and rewards
                    self.update(observation, reward, observation)
                    
                # After each episode, evaluate and adapt the model to the new task
                # (e.g., use meta-learning algorithms to adapt after training on this task)

    def evaluate(self, env):
        """Evaluate the agent's performance in the environment"""
        observation, _ = env.reset()  # Get initial observation
        done = False
        total_reward = 0
        
        while not done:
            action = self.compute_action(observation, agent_id=0)  # Use trained policy
            observation, reward, done, info, _ = env.step(action)  # Unpack all 5 values from step
            reward = preprocess_reward(reward)

            total_reward += reward
        
        return total_reward


# Initialize environment (similar to your setup)
env = CityLearnEnv('citylearn_challenge_2023_phase_2_local_evaluation', central_agent=True)
env = NormalizedObservationWrapper(env)  # Normalize the observations
env = StableBaselines3Wrapper(env)  # Wrap for Stable-Baselines3 compatibility

# Initialize Meta-RL Agent
action_space = env.action_space
observation_space = env.observation_space
meta_agent = MetaRLAgent(action_space, observation_space)

# Meta-train the agent
print("Starting meta-training...")
meta_agent.meta_train(env, num_iterations=50, episodes=2)

# Test the agent
print("Testing the agent...")
observations, _ = env.reset()

while not env.unwrapped.terminated:
    actions = meta_agent.compute_action(observations, agent_id=0)  # Compute action
    observations, _, _, _, _ = env.step(actions)  # Take step in environment

# Evaluate the agent's performance using KPIs
print("Evaluating agent's performance...")

kpis = env.unwrapped.evaluate()
kpis = kpis.pivot(index='cost_function', columns='name', values='value').round(3)
kpis = kpis.dropna(how='all')

Starting meta-training...
Updating model with observation: [0.35689586 0.         0.9330127  0.75       0.14202748 0.29853252
 0.83569413 0.13146444 0.         0.16991922 0.16080728 0.
 0.         0.28434265 0.73034215 0.         0.2064052  0.47085425
 0.00356402 0.         0.         0.19998    0.2240845  0.
 0.00739747 0.00739747 0.         0.06926669 0.01745084 1.
 0.         0.30769223 0.51487887 0.00199095 0.         0.
 0.19998    0.18012352 0.         0.         1.         0.
 0.71428585 0.5199968  0.00273007 0.         0.         0.19968037
 0.13422704 0.01794523 0.         1.         0.         1.        ], reward: -0.17977142333984375, next_observation: [0.35689586 0.         0.9330127  0.75       0.14202748 0.29853252
 0.83569413 0.13146444 0.         0.16991922 0.16080728 0.
 0.         0.28434265 0.73034215 0.         0.2064052  0.47085425
 0.00356402 0.         0.         0.19998    0.2240845  0.
 0.00739747 0.00739747 0.         0.06926669 0.01745084 1.
 0.         0.307

In [16]:
print(kpis)

NameError: name 'kpis' is not defined

In [None]:
import ace_tools as tools; tools.display_dataframe_to_user(name="KPIs", dataframe=kpis)  # Display KPIs

In [21]:
import numpy as np
import torch
from citylearn.citylearn import CityLearnEnv
from citylearn.wrappers import NormalizedObservationWrapper, StableBaselines3Wrapper
import torch.nn as nn
import torch.optim as optim

class MetaModel(nn.Module):
    def __init__(self, input_size, output_size):
        super(MetaModel, self).__init__()
        self.fc1 = nn.Linear(input_size, 64)
        self.fc2 = nn.Linear(64, 64)
        self.fc3 = nn.Linear(64, output_size)

    def forward(self, x):
        # Ensure no in-place operations are used
        x = torch.relu(self.fc1(x))
        x = torch.relu(self.fc2(x))
        return self.fc3(x)

def preprocess_observation(observation, default_value=0.0):
    """Replace NaN values in observation with a default value."""
    if np.isnan(observation).any():
        print("Warning: NaN detected in observation, replacing with default value.")
        observation = np.full_like(observation, default_value)  # Replace NaN with default value
    return observation

def preprocess_reward(reward, default_value=0.0):
    """Replace NaN values in reward with a default value."""
    if np.isnan(reward):
        print("Warning: NaN detected in reward, replacing with default value.")
        reward = default_value  # Replace NaN with default reward value
    return reward

def preprocess_action(action, action_space, default_value=0.0):
    """Ensure that action is within the action space bounds and not NaN."""
    if np.isnan(action).any():
        print("Warning: NaN detected in action, replacing with default value.")
        action = np.full_like(action, default_value)  # Replace NaN with default value
    
    # Ensure action is within the bounds of the action space
    action = np.clip(action, action_space.low, action_space.high)
    return action

# Simple rule-based policy that returns actions for multiple buildings
def rbc_policy(observation, action_space, num_buildings=9):
    """
    Simple rule-based policy based on day or night time
    """
    actions = np.zeros(num_buildings)
    hour = observation[2]  # Hour index is 2 for all observations
    
    for i in range(num_buildings):
        if 9 <= hour <= 21:
            actions[i] = -0.08
        elif (1 <= hour <= 8) or (22 <= hour <= 24):
            actions[i] = 0.091
        actions[i] = np.clip(actions[i], action_space.low[i], action_space.high[i])  # Clip to action space bounds
    
    return actions

class MetaRLAgent:
    """
    Meta-Reinforcement Learning Agent using Rule-Based Policy and Meta-Learning Adaptation
    """

    def __init__(self, action_space, observation_space, num_buildings=9):
        self.action_space = action_space
        self.observation_space = observation_space
        self.num_buildings = num_buildings
        self.meta_model = MetaModel(input_size=self.observation_space.shape[0], output_size=self.action_space.shape[0])
        # Optimizer for meta-model
        self.optimizer = optim.Adam(self.meta_model.parameters(), lr=0.001)
        self.criterion = nn.MSELoss()

    def register_reset(self, observation, action_space, agent_id):
        """Initialize the agent, adapt the model to the task and return action"""
        self.action_space = action_space
        observation = preprocess_observation(observation)
        return self.compute_action(observation, agent_id)

    def compute_action(self, observation, agent_id):
        """Compute action using rule-based policy or learned adaptation"""
        observation = preprocess_observation(observation)  # Handle NaN values in observation

        # Convert observation to tensor and ensure correct shape (batch_size, input_size)
        observation_tensor = torch.tensor(observation, dtype=torch.float32)

        # If observation is a single sample (1D), add a batch dimension
        if observation_tensor.dim() == 1:
            observation_tensor = observation_tensor.unsqueeze(0)  # Add a batch dimension
    
        # Pass observation through the meta model to get predicted action
        predicted_action = self.meta_model(observation_tensor)

        # Ensure the action is within the valid action space and has 9 values
        action = predicted_action.detach().numpy().flatten()  # Flatten it to ensure it's a 1D array with 9 values
        action = preprocess_action(action, self.action_space)  # Ensure action is valid
        return action



    def update(self, observation, reward, next_observation):
        """Update the model using new experience (observation, reward, next_observation)"""
        # Preprocess data
        observation = preprocess_observation(observation)
        reward = preprocess_reward(reward)
        next_observation = preprocess_observation(next_observation)
    
        # Forward pass: get predicted action from the meta-model
        observation_tensor = torch.tensor(observation, dtype=torch.float32)
        if observation_tensor.dim() == 1:
            observation_tensor = observation_tensor.unsqueeze(0)  # Add batch dimension
    
        predicted_action = self.meta_model(observation_tensor)
    
        # Calculate the target action using rule-based policy
        target_action = preprocess_action(rbc_policy(observation, self.action_space), self.action_space)
    
        # Ensure both predicted_action and target_action are the same shape
        predicted_action = predicted_action.view(-1)  # Flatten predicted_action to 1D tensor
        target_action = torch.tensor(target_action, dtype=torch.float32).view(-1)  # Flatten target_action to 1D tensor
    
        # Compute the loss (Mean Squared Error)
        loss = self.criterion(predicted_action, target_action)
    
        # Perform backward pass and update the model
        self.optimizer.zero_grad()
        loss.backward()  # Perform the backpropagation
        self.optimizer.step()  # Update the model's parameters
    
        # Return the loss for monitoring purposes
        return loss


    def meta_train(self, env, num_iterations=100, episodes=2):
        """Meta-training loop (Outer Loop of MAML)"""
        for iteration in range(num_iterations):
            meta_loss = 0.0
            
            for episode in range(episodes):
                observation, _ = env.reset()  # Get initial observation
                done = False
                while not done:
                    action = self.compute_action(observation, agent_id=0)
                    observation, reward, done, info, _ = env.step(action)
                    loss = self.update(observation, reward, observation)
                    meta_loss += loss

            # Meta-gradient update
            self.optimizer.zero_grad()
            meta_loss.backward()
            self.optimizer.step()
            
            print(f"Iteration {iteration + 1}/{num_iterations} complete.")

            total_reward = self.evaluate(env)  # Evaluate using the current (meta) model
            print(f"Evaluation reward after iteration {iteration + 1}: {total_reward}")

    def evaluate(self, env):
        """Evaluate the agent's performance in the environment"""
        observation, _ = env.reset()  # Get initial observation
        done = False
        total_reward = 0
        
        while not done:
            action = self.compute_action(observation, agent_id=0)  # Use trained policy
            observation, reward, done, info, _ = env.step(action)
            reward = preprocess_reward(reward)
            total_reward += reward
        
        return total_reward


In [22]:
env = CityLearnEnv('citylearn_challenge_2023_phase_2_local_evaluation', central_agent=True)
env = NormalizedObservationWrapper(env)  # Normalize the observations
env = StableBaselines3Wrapper(env)  # Wrap for Stable-Baselines3 compatibility

# Initialize Meta-RL Agent
action_space = env.action_space
observation_space = env.observation_space
meta_agent = MetaRLAgent(action_space, observation_space)

# Meta-train the agent
print("Starting meta-training...")
meta_agent.meta_train(env, num_iterations=100, episodes=2)


Starting meta-training...


  File "<frozen runpy>", line 198, in _run_module_as_main
  File "<frozen runpy>", line 88, in _run_code
  File "/opt/anaconda3/lib/python3.11/site-packages/ipykernel_launcher.py", line 17, in <module>
    app.launch_new_instance()
  File "/opt/anaconda3/lib/python3.11/site-packages/traitlets/config/application.py", line 992, in launch_instance
    app.start()
  File "/opt/anaconda3/lib/python3.11/site-packages/ipykernel/kernelapp.py", line 701, in start
    self.io_loop.start()
  File "/opt/anaconda3/lib/python3.11/site-packages/tornado/platform/asyncio.py", line 195, in start
    self.asyncio_loop.run_forever()
  File "/opt/anaconda3/lib/python3.11/asyncio/base_events.py", line 607, in run_forever
    self._run_once()
  File "/opt/anaconda3/lib/python3.11/asyncio/base_events.py", line 1922, in _run_once
    handle._run()
  File "/opt/anaconda3/lib/python3.11/asyncio/events.py", line 80, in _run
    self._context.run(self._callback, *self._args)
  File "/opt/anaconda3/lib/python3.11/s

RuntimeError: Trying to backward through the graph a second time (or directly access saved tensors after they have already been freed). Saved intermediate values of the graph are freed when you call .backward() or autograd.grad(). Specify retain_graph=True if you need to backward through the graph a second time or if you need to access saved tensors after calling backward.

In [None]:
# Test the agent
print("Testing the agent...")
observations, _ = env.reset()

while not env.unwrapped.terminated:
    actions = meta_agent.compute_action(observations, agent_id=0)  # Compute action
    observations, _, _, _, _ = env.step(actions)  # Take step in environment


In [None]:
# Evaluate the agent's performance using KPIs
print("Evaluating agent's performance...")

kpis = env.unwrapped.evaluate()
kpis = kpis.pivot(index='cost_function', columns='name', values='value').round(3)
kpis = kpis.dropna(how='all')

In [24]:
import numpy as np
import torch
from citylearn.citylearn import CityLearnEnv
from citylearn.wrappers import NormalizedObservationWrapper, StableBaselines3Wrapper
from collections import defaultdict


# Meta-model definition
class MetaModel(nn.Module):
    def __init__(self, input_size, output_size):
        super(MetaModel, self).__init__()
        self.fc1 = nn.Linear(input_size, 64)
        self.fc2 = nn.Linear(64, 64)
        self.fc3 = nn.Linear(64, output_size)

    def forward(self, x):
        x = torch.relu(self.fc1(x))
        x = torch.relu(self.fc2(x))
        return self.fc3(x)

# Preprocessing functions
def preprocess_observation(observation, default_value=0.0):
    """Replace NaN values in observation with a default value."""
    if np.isnan(observation).any():
        print("Warning: NaN detected in observation, replacing with default value.")
        observation = np.full_like(observation, default_value)
    return observation

def preprocess_reward(reward, default_value=0.0):
    """Replace NaN values in reward with a default value."""
    if np.isnan(reward):
        print("Warning: NaN detected in reward, replacing with default value.")
        reward = default_value  # Replace NaN with default reward value
    return reward

def preprocess_action(action, action_space, default_value=0.0):
    """Ensure that action is within the action space bounds and not NaN."""
    if np.isnan(action).any():
        print("Warning: NaN detected in action, replacing with default value.")
        action = np.full_like(action, default_value)  # Replace NaN with default value
    
    # Ensure action is within the bounds of the action space
    action = np.clip(action, action_space.low, action_space.high)
    return action

# Simple rule-based policy that returns actions for multiple buildings
def rbc_policy(observation, action_space, num_buildings=9):
    """
    Simple rule based policy based on day or night time
    """
    # Action for each building (initialize to zero)
    actions = np.zeros(num_buildings)
    
    hour = observation[2]  # Hour index is 2 for all observations
    
    for i in range(num_buildings):
        if 9 <= hour <= 21:
            # Daytime: release stored energy for each building
            actions[i] = -0.08
        elif (1 <= hour <= 8) or (22 <= hour <= 24):
            # Early nightime: store DHW and/or cooling energy for each building
            actions[i] = 0.091

        # Ensure the action is within the bounds of action_space (as action_space is continuous)
        actions[i] = np.clip(actions[i], action_space.low[i], action_space.high[i])  # Clip to action space bounds
    
    return actions


class MetaRLAgent:
    """
    Meta-Reinforcement Learning Agent using Rule-Based Policy and Meta-Learning Adaptation
    """

    def __init__(self, action_space, observation_space, num_buildings=9):
        self.action_space = action_space
        self.observation_space = observation_space
        self.num_buildings = num_buildings  # Set the number of buildings
        self.models = defaultdict(lambda: None)  # For task-specific models (e.g., one per building)
        self.meta_model = None  # Placeholder for meta-learner, like PEARL or MAML

    def register_reset(self, observation, action_space, agent_id):
        """Initialize the agent, adapt the model to the task and return action"""
        self.action_space = action_space  # Use the action space directly
        return self.compute_action(observation, agent_id)

    def compute_action(self, observation, agent_id):
        """Compute action using rule-based policy or learned adaptation"""
        # Rule-based policy
        actions = rbc_policy(observation, self.action_space, num_buildings=self.num_buildings)
        
        # If you had a meta-model, you could compute the action based on adaptation to the task
        if self.meta_model:
            pass  # Can implement meta-learning adaptation here, using the meta-model
        
        return actions

    def adapt_to_task(self, task_data):
        """Adapt the agent to a new task using the provided task data."""
        task_name = task_data["task_name"]
        observations = task_data["observations"]
        
        # Example: Adaptation could involve training or fine-tuning on task-specific data
        if self.models[task_name] is None:
            self.models[task_name] = "Model"  # Initialize a new model for the task
        
        # Fine-tune or adapt the model using task data (could be gradient-based)
        
        return self.models[task_name]

    def meta_train(self, env, num_iterations=1000, episodes=2):
        """Meta-training loop similar to model.learn"""
        for iteration in range(num_iterations):
            # Loop over episodes and adapt to new task at the start of each episode
            for episode in range(episodes):
                observation, _ = env.reset()  # Get initial observation
                done = False
                
                while not done:
                    # Compute the action using the agent
                    action = self.compute_action(observation, agent_id=0)
                    
                    # Take a step in the environment
                    observation, reward, done, info, _ = env.step(action)  # Unpack all 5 values from step
                   
                # After each episode, evaluate and adapt the model to the new task
                # (e.g., use meta-learning algorithms to adapt after training on this task)

    def evaluate(self, env):
        """Evaluate the agent's performance in the environment"""
        observation, _ = env.reset()  # Get initial observation
        done = False
        total_reward = 0
        
        while not done:
            action = self.compute_action(observation, agent_id=0)  # Use trained policy
            observation, reward, done, info, _ = env.step(action)  # Unpack all 5 values from step
            total_reward += reward
        
        return total_reward





# Meta RL Agent with a simple Meta-model and Rule-based Policy
class MetaRLAgent:
    def __init__(self, action_space, observation_space, num_buildings=9):
        self.action_space = action_space
        self.observation_space = observation_space
        self.num_buildings = num_buildings
        self.meta_model = MetaModel(input_size=self.observation_space.shape[0], output_size=self.action_space.shape[0])
        self.optimizer = optim.Adam(self.meta_model.parameters(), lr=0.001)
        self.criterion = nn.MSELoss()  # Mean Squared Error for loss calculation

    def compute_action(self, observation):
        """Compute action using rule-based policy or learned adaptation"""
        observation = preprocess_observation(observation)  # Handle NaN values in observation

        # Convert observation to tensor and ensure correct shape (batch_size, input_size)
        observation_tensor = torch.tensor(observation, dtype=torch.float32)

        # If observation is a single sample (1D), add a batch dimension
        if observation_tensor.dim() == 1:
            observation_tensor = observation_tensor.unsqueeze(0)  # Add batch dimension
        
        # Forward pass through the meta model to get predicted action
        predicted_action = self.meta_model(observation_tensor)
        action = predicted_action.detach().numpy()  # Convert to numpy for the environment

        # Ensure the action is within the action space bounds
        action = preprocess_action(action, self.action_space)
        return action

    def update(self, observation, reward, next_observation):
        """Update the model using new experience (observation, reward, next_observation)"""
        # Preprocess data
        observation = preprocess_observation(observation)
        reward = preprocess_reward(reward)
        next_observation = preprocess_observation(next_observation)
        
        # Convert to tensor and ensure correct shape (batch_size, input_size)
        observation_tensor = torch.tensor(observation, dtype=torch.float32)
        
        if observation_tensor.dim() == 1:
            observation_tensor = observation_tensor.unsqueeze(0)  # Add batch dimension
        
        # Forward pass through the model to get predicted action
        predicted_action = self.meta_model(observation_tensor)
        
        # Calculate the target action using rule-based policy (rbc_policy)
        target_action = preprocess_action(rbc_policy(observation, self.action_space), self.action_space)
        
        # Ensure both predicted_action and target_action are the same shape
        predicted_action = predicted_action.view(-1)  # Flatten predicted_action to 1D tensor
        target_action = torch.tensor(target_action, dtype=torch.float32).view(-1)  # Flatten target_action to 1D tensor
        
        # Compute the loss (Mean Squared Error)
        loss = self.criterion(predicted_action, target_action)
        
        # Perform backward pass and update the model
        self.optimizer.zero_grad()
        loss.backward()
        self.optimizer.step()
        
        return loss

    def meta_train(self, env, num_iterations=100, episodes=2):
        """Meta-training loop"""
        for iteration in range(num_iterations):
            meta_loss = 0.0
            
            for episode in range(episodes):
                observation, _ = env.reset()  # Get initial observation
                done = False
                while not done:
                    action = self.compute_action(observation)
                    observation, reward, done, info, _ = env.step(action)
                    loss = self.update(observation, reward, observation)
                    meta_loss += loss

            # Meta-gradient update
            self.optimizer.zero_grad()
            meta_loss.backward()
            self.optimizer.step()
            
            print(f"Iteration {iteration + 1}/{num_iterations} complete.")
            total_reward = self.evaluate(env)
            print(f"Evaluation reward after iteration {iteration + 1}: {total_reward}")

    def evaluate(self, env):
        """Evaluate the agent's performance in the environment"""
        observation, _ = env.reset()  # Get initial observation
        done = False
        total_reward = 0
        
        while not done:
            action = self.compute_action(observation)  # Use trained policy
            observation, reward, done, info, _ = env.step(action)
            total_reward += reward
        
        return total_reward

AttributeError: 'list' object has no attribute 'shape'

In [None]:
# Initialize environment (similar to your setup)
env = CityLearnEnv('citylearn_challenge_2023_phase_2_local_evaluation', central_agent=True)
env = NormalizedObservationWrapper(env)  # Normalize the observations
env = StableBaselines3Wrapper(env)  # Wrap for Stable-Baselines3 compatibility

# Initialize Meta-RL Agent
action_space = env.action_space
observation_space = env.observation_space
meta_agent = MetaRLAgent(action_space, observation_space)

# Meta-train the agent
meta_agent.meta_train(env, num_iterations=1000, episodes=2)

# Test the agent
observations, _ = env.reset()

while not env.unwrapped.terminated:
    actions = meta_agent.compute_action(observations, agent_id=0)  # Compute action
    observations, _, _, _, _ = env.step(actions)  # Take step in environment

# Evaluate the agent's performance using KPIs
kpis = env.unwrapped.evaluate()
kpis = kpis.pivot(index='cost_function', columns='name', values='value').round(3)
kpis = kpis.dropna(how='all')


In [None]:
import ace_tools as tools; tools.display_dataframe_to_user(n ame="KPIs", dataframe=kpis)  # Display KPIs

In [1]:
import torch
import torch.nn as nn
import torch.optim as optim
import numpy as np
from citylearn.citylearn import CityLearnEnv
from citylearn.agents.sac import SAC
import pandas as pd
from collections import defaultdict

# Meta-Learning Agent using SAC
class MetaRLAgent:
    def __init__(self, action_space, observation_space, num_tasks=3):
        self.num_tasks = num_tasks
        self.meta_model = SAC(observation_space, action_space)
        self.optimizer = optim.Adam(self.meta_model.parameters(), lr=0.001)

    def forward(self, task_data):
        """ Forward pass for meta-RL, adapting to a new task """
        return self.meta_model(task_data)
    
    def meta_train(self, envs, num_iterations=100, episodes=2):
        """ Meta-training loop """
        for iteration in range(num_iterations):
            meta_loss = 0
            for episode in range(episodes):
                for task_env in envs:
                    task_data = task_env.reset()  
                    loss = self.update(task_data)  # Perform task-specific update (inner loop)
                    meta_loss += loss
            
            # Meta-gradient update (outer loop)
            self.optimizer.zero_grad()
            meta_loss.backward()
            self.optimizer.step()
            print(f"Iteration {iteration + 1}/{num_iterations} complete.")
    
    def update(self, task_data):
        """ Inner loop update (task-specific adaptation) """
        task_loss = self.meta_model.learn(episodes=task_data) 
        return task_loss

# initialize environments (tasks)
def initialize_env(building_ids):
    return CityLearnEnv(schema='citylearn_challenge_2023_phase_3_1',
                        building_ids=building_ids, central_agent=False)

# Training the model on multiple buildings
def train_agent_on_buildings(building_scenarios, episodes=2):
    models = []
    for building_set in building_scenarios:
        # Initialize the building env
        env = initialize_env(building_set)
        obs_shape = env.observation_space[0].shape[0]
        print(f"Training on Buildings: {building_set}, Observation Size: {obs_shape}")

        # Initialize the SAC agent
        model = SAC(env)
        
        # Update: Use learn function correctly. You can pass steps per episode if required
        model.learn(episodes=episodes)  # Use episodes instead of total_timesteps, as per CityLearn's interface

        # Store the trained model
        models.append(model)

    return models

# Train the Meta-RL agent on multiple buildings
building_scenarios = [
    [0],  # Train on Building 0
    [1],  # Train on Building 1
    [2],  # Train on Building 2
    [3],  # Train on Building 3
    [4]  # Train on Building 4
]

models2 = train_agent_on_buildings(building_scenarios)

print("Meta-RL training completed across multiple buildings!")

# Fine-tune and evaluate the model on unseen building (e.g., Building 6)
def fine_tune_and_evaluate(models, episodes=2):
    # Load the test environment for the unseen building
    test_env = initialize_env([5])

    meta_model = models[0]  # Using the model trained on Building 0
    meta_model.env = test_env 

    # Fine-tune the model on the unseen building
    print(f"\n Fine-tuning on unseen Building {test_building}...")
    meta_model.learn(episodes=episodes)

    # --- Test
    obs, _ = test_env.reset()
    done = False
    total_reward = 0
    while not done:
        actions = meta_model.predict(obs, deterministic=True)
        obs, reward, done, _, _ = test_env.step(actions)
        total_reward += reward

    # --- Evaluate KPIs for the unseen building ---
    kpis = test_env.evaluate()
    kpis = kpis.pivot(index="cost_function", columns="name", values="value").round(3)
    kpis = kpis.dropna(how="all")

    # --- Display KPIs ---
    print(f"\nKPIs for Unseen Building 6 (after fine-tuning):")
    display(kpis)
    print(f"Total Reward from Evaluation: {total_reward}")
    return kpis

# Fine-tuning and evaluating on Building 6 (unseen building)
kpis_unseen_building = fine_tune_and_evaluate(models2)


Couldn't import dot_parser, loading of dot files will not be possible.
Training on Buildings: [0], Observation Size: 30


  o = tensor(o).to(self.device)


Training on Buildings: [1], Observation Size: 30
Training on Buildings: [2], Observation Size: 30
Training on Buildings: [3], Observation Size: 30
Training on Buildings: [4], Observation Size: 30
Meta-RL training completed across multiple buildings!


NameError: name 'test_building' is not defined

In [5]:
def fine_tune_and_evaluate(models, episodes=2):
    # Load the test environment for the unseen building
    test_env = initialize_env([5])

    # Use the trained model (example: model trained on Building 0)
    meta_model = models[0]  # Using the model trained on Building 0
    meta_model.env = test_env  # Attach the test environment

    # Fine-tune the model on the unseen building
    print(f"\n Fine-tuning on unseen Building {6}...")
    meta_model.learn(episodes=episodes)

    # --- Test the adapted model ---
    obs, _ = test_env.reset()
    done = False
    total_reward = 0
    while not done:
        actions = meta_model.predict(obs, deterministic=True)
        obs, reward, done, _, _ = test_env.step(actions)
        #total_reward = total_reward + reward

    # --- Evaluate KPIs for the unseen building ---
    kpis = test_env.evaluate()
    kpis = kpis.pivot(index="cost_function", columns="name", values="value").round(3)
    kpis = kpis.dropna(how="all")

    # --- Display KPIs ---
    print(f"\nKPIs for Unseen Building {6} (after fine-tuning):")
    display(kpis)
    #print(f"Total Reward from Evaluation: {total_reward}")
    return kpis

# Fine-tuning and evaluating on Building 6 (unseen building)
kpis_unseen_building = fine_tune_and_evaluate(models2)
print(kpis_unseen_building)


 Fine-tuning on unseen Building 6...

KPIs for Unseen Building 6 (after fine-tuning):


name,Building_1,Building_2,Building_3,Building_4,Building_5,Building_6,District
cost_function,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1
all_time_peak_average,,,,,,,0.894
annual_normalized_unserved_energy_total,0.023,0.022,0.017,0.018,0.021,0.019,0.02
carbon_emissions_total,0.825,1.061,0.937,0.815,0.989,0.842,0.911
cost_total,0.805,1.031,0.916,0.793,0.971,0.822,0.89
daily_one_minus_load_factor_average,,,,,,,0.997
daily_peak_average,,,,,,,0.908
discomfort_cold_delta_average,0.088,0.061,0.031,0.101,0.062,0.13,0.079
discomfort_cold_delta_maximum,3.849,2.826,2.72,2.882,2.23,3.57,3.013
discomfort_cold_delta_minimum,0.0,0.0,0.0,0.0,0.0,0.0,0.0
discomfort_cold_proportion,0.004,0.003,0.001,0.001,0.001,0.014,0.004


name                                           Building_1  Building_2  \
cost_function                                                           
all_time_peak_average                                 NaN         NaN   
annual_normalized_unserved_energy_total             0.023       0.022   
carbon_emissions_total                              0.825       1.061   
cost_total                                          0.805       1.031   
daily_one_minus_load_factor_average                   NaN         NaN   
daily_peak_average                                    NaN         NaN   
discomfort_cold_delta_average                       0.088       0.061   
discomfort_cold_delta_maximum                       3.849       2.826   
discomfort_cold_delta_minimum                       0.000       0.000   
discomfort_cold_proportion                          0.004       0.003   
discomfort_hot_delta_average                        0.727       0.285   
discomfort_hot_delta_maximum                       

In [None]:
print(kpis)