In [None]:
import numpy as np
import gymnasium as gym
from gymnasium import spaces
import torch
import torch.nn as nn
import torch.optim as optim
import matplotlib.pyplot as plt

import environments.house_temp as ht
from useful import trees
from agents import dqn_agent
from agents import ddqn_agent
from agents import pddqn_agent

In [None]:
# Use the GPU when CUDA reports one as available, otherwise run on the CPU.
if torch.cuda.is_available():
    device = torch.device("cuda")
else:
    device = torch.device("cpu")
print(device)

# Models

In [None]:
class house_temp_model_1(nn.Module):
    """Fully connected network: n_observations -> 32 (ReLU) -> n_actions."""

    def __init__(self, n_observations, n_actions):
        super().__init__()
        hidden_units = 32
        layers = [
            nn.Linear(n_observations, hidden_units),
            nn.ReLU(),
            nn.Linear(hidden_units, n_actions),
        ]
        # Stored under `model` so the state_dict keys (model.0.*, model.2.*) stay the same.
        self.model = nn.Sequential(*layers)

    def forward(self, x):
        """Pass x through the layer stack and return one output value per action."""
        network = self.model
        return network(x)

In [None]:
class house_temp_model_2(nn.Module):
    """Fully connected network: n_observations -> 128 (ReLU) -> n_actions."""

    def __init__(self, n_observations, n_actions):
        super().__init__()
        hidden_units = 128
        layers = [
            nn.Linear(n_observations, hidden_units),
            nn.ReLU(),
            nn.Linear(hidden_units, n_actions),
        ]
        # Stored under `model` so the state_dict keys (model.0.*, model.2.*) stay the same.
        self.model = nn.Sequential(*layers)

    def forward(self, x):
        """Pass x through the layer stack and return one output value per action."""
        network = self.model
        return network(x)

In [None]:
class house_temp_model_3(nn.Module):
    """Fully connected network: n_observations -> 128 (ReLU) -> 128 (ReLU) -> n_actions."""

    def __init__(self, n_observations, n_actions):
        super().__init__()
        hidden_units = 128
        layers = [
            nn.Linear(n_observations, hidden_units),
            nn.ReLU(),
            nn.Linear(hidden_units, hidden_units),
            nn.ReLU(),
            nn.Linear(hidden_units, n_actions),
        ]
        # Stored under `model` so the state_dict keys (model.0.*, model.2.*, model.4.*) stay the same.
        self.model = nn.Sequential(*layers)

    def forward(self, x):
        """Pass x through the layer stack and return one output value per action."""
        network = self.model
        return network(x)

# Functions

In [None]:
def testing(env, agent, n_episodes, file_name, global_episode, render, options):
    """
    Testing agents for the house temperature environment 
    -----
    Input
    -----
    env: gym environment
    agent: training agent
    n_episodes: number of episodes to run
    file_name: a string for the file name for saving the model 
    global_episode: an integer for the current episode in training
    render: boolean if the episode should be printed
    options: a dict of things to change in the environment
    ------
    Output
    ------
    reward_list: mean reward for all validation episodes
    action_dict: a dict of the total actions taken per action 
    """
    agent.main_model.load_state_dict(torch.load(f"model_weights/house_temp/{file_name}.pth"))
    agent.main_model.eval()
    reward_list, action_dict, render_count = [], {}, 0

    for episode in range(n_episodes):
        state, _ = env.reset(options = options)
        total_reward = 0
        
        for steps in range(env.max_steps):            
            action = agent.act(state)
            if action not in action_dict:
                action_dict[action] = 1
            else:
                action_dict[action] += 1

            next_state, reward, termination, truncation, _ = env.step(action)
            state = next_state
            total_reward += reward

            if (global_episode + 1) % 100 == 0 and render and render_count == 0:
                print(f"Episode: {global_episode + 1} | Reward: {reward:.2f}")
                env.render()
            
            if termination or truncation:
                break

        render_count = 1
        reward_list.append(total_reward)

    return np.mean(reward_list), action_dict

In [None]:
def training(env, agent, v_agent, n_episodes, file_name, target_update_steps, render = False, optimal = False, optimal_value = 0, options = None):
    """
    Trains an agent on the house temp environment. The model weights are saved after every episode 
    and a validation round (100 episodes through testing) is run after every 50 training episodes. 
    -----
    Input
    -----
    env: gym environment
    agent: training agent
    v_agent: validation agent (handed to testing, which loads the saved weights into it)
    n_episodes: number of episodes to run
    file_name: file name for saving the model 
    target_update_steps: how many environment steps between calls to agent.update_target 
    render: boolean forwarded to testing that controls if the state should be shown
    optimal: boolean to check if the training should stop at a certain reward
    optimal_value: validation reward that must be exceeded to stop early when optimal is True
    options: a dict of things to change in the environment
    ------
    Output
    ------
    reward_list: a list of the total reward per training episode
    validation_rewards: a list of mean rewards, one per validation round
    actions_list: a list of action-count dicts, one per validation round
    """
    import os  # local import: keeps this cell runnable without touching the notebook's import cell

    reward_list, global_steps = [], 0
    validation_rewards, actions_list = [], []

    # torch.save does not create missing folders, so make sure the target directory exists
    # before any training time is spent.
    weights_path = f"model_weights/house_temp/{file_name}.pth"
    os.makedirs(os.path.dirname(weights_path), exist_ok = True)

    for episode in range(n_episodes):
        state, _ = env.reset(options = options)
        total_reward = 0
        
        for steps in range(env.max_steps):
            global_steps += 1
            if global_steps % target_update_steps == 0:
                agent.update_target()
            
            action = agent.act(state)
            next_state, reward, termination, truncation, _ = env.step(action)
            # Only `termination` is stored; a truncated episode is not recorded as terminal
            agent.update_memory(state, action, reward, next_state, termination)
        
            agent.train_step()
            agent.decay_epsilon()
            state = next_state
            total_reward += reward
            
            if termination or truncation:
                break
    
        reward_list.append(total_reward)
        # Saved every episode because testing reloads the weights from this file
        torch.save(agent.main_model.state_dict(), weights_path)
        
        if (episode + 1) % 50 == 0:
            validation_reward, actions = testing(env, v_agent, 100, file_name, episode, render, options)
            print(f"Episode {episode - 48} - {episode + 1} | Average Reward: {validation_reward:.2f}")
            validation_rewards.append(validation_reward)
            actions_list.append(actions)
            # Early stop: only checked right after a validation round
            if validation_reward > optimal_value and optimal:
                return reward_list, validation_rewards, actions_list
                
    return reward_list, validation_rewards, actions_list

In [None]:
def plot(rewards, v_rewards, actions, version, episodes, action_space, action_lists):
    """
    Plots the rewards, validation rewards, and actions 
    -----
    Input
    -----
    rewards: a list of rewards, one per training episode
    v_rewards: a list of validation rewards
    actions: a list of dicts, each mapping an action to how often it was taken
    version: a string for the version, shown in the title
    episodes: an integer for the total episodes; kept for compatibility, the x axis 
              is sized from rewards so an early-stopped run still plots
    action_space: a dict of action meaning
    action_lists: a dict of lists for each action; it is filled in place, so pass 
                  fresh empty lists on every call

    """
    # One count per action per validation round; an action never taken in a round counts as 0.
    # setdefault keeps a key missing from action_lists from raising a KeyError.
    for counts in actions:
        for key in action_space:
            action_lists.setdefault(key, []).append(counts.get(key, 0))

    # Sized from the data itself: scatter needs x and y to have the same length
    x = list(range(len(rewards)))

    fig, axes = plt.subplots(3, 1, figsize = (20, 10))

    axes[0].scatter(x, rewards, s = 10)
    axes[0].set_title(f"Learning Progress | {version}")
    axes[0].set_ylabel("Total Reward")
    axes[0].set_xlabel("Training Episodes")

    axes[1].plot(v_rewards)
    axes[1].set_ylabel("Validation Reward")

    for key, value in action_lists.items():
        # Fall back to the raw key when no label was supplied for it
        axes[2].plot(value, label = action_space.get(key, key))
    axes[2].set_ylabel("Count")
    axes[2].set_xlabel("Validation Episodes (100 Validation for 50 Training)")
    axes[2].legend(loc = "upper left")

    for i in range(3):
        axes[i].grid()

    plt.tight_layout()
    plt.show()

# House Temperature Version 1.0

In [None]:
# Environment for the V1.0 experiment.
env = ht.house_temp_v1_0()

# Validation agent: gamma, lr, epsilon, buffer and batch size are all zero.
# NOTE(review): presumably so it only acts on the loaded weights without exploring - confirm in ddqn_agent.
v_agent = ddqn_agent.ddqn_agent_mse(
    model = house_temp_model_1, state_dim = 4, action_dim = env.action_space.n,
    gamma = 0, lr = 0, epsilon = 0, epsilon_min = 0, decay_steps = 1,
    buffer_size = 0, batch_size = 0, device = device,
)

# Training agent.
agent = ddqn_agent.ddqn_agent_mse(
    model = house_temp_model_1, state_dim = 4, action_dim = env.action_space.n,
    gamma = 0.99, lr = 0.001, epsilon = 1.0, epsilon_min = 0.01, decay_steps = 2500,
    buffer_size = 2500, batch_size = 128, device = device,
)

# Train for 1000 episodes and keep the three result lists for plotting.
ddqn_mse_rewards_v1_0, ddqn_mse_v_rewards_v1_0, ddqn_mse_actions_v1_0 = training(
    env = env, agent = agent, v_agent = v_agent, n_episodes = 1000,
    file_name = "ddqn_mse_htv1_0", target_update_steps = 250,
)

In [None]:
# Labels for the V1.0 actions, and one empty count series per action for plot to fill.
action_space = {0: "Do Nothing", 1: "Window On", 2: "Heater On"}
action_lists = {key: [] for key in action_space}

plot(
    rewards = ddqn_mse_rewards_v1_0,
    v_rewards = ddqn_mse_v_rewards_v1_0,
    actions = ddqn_mse_actions_v1_0,
    version = "DDQN | MSE | V1.0",
    episodes = 1000,
    action_space = action_space,
    action_lists = action_lists,
)

# House Temperature Version 2.0

In [None]:
# Environment for the V2.0 experiment.
env = ht.house_temp_v2_0()

# Validation agent: gamma, lr, epsilon, buffer and batch size are all zero.
# NOTE(review): presumably so it only acts on the loaded weights without exploring - confirm in ddqn_agent.
v_agent = ddqn_agent.ddqn_agent_mse(
    model = house_temp_model_2, state_dim = 7, action_dim = env.action_space.n,
    gamma = 0, lr = 0, epsilon = 0, epsilon_min = 0, decay_steps = 1,
    buffer_size = 0, batch_size = 0, device = device,
)

# Training agent.
agent = ddqn_agent.ddqn_agent_mse(
    model = house_temp_model_2, state_dim = 7, action_dim = env.action_space.n,
    gamma = 0.99, lr = 0.00075, epsilon = 1.0, epsilon_min = 0.01, decay_steps = 28800,
    buffer_size = 28800, batch_size = 128, device = device,
)

# Train for 1000 episodes and keep the three result lists for plotting.
ddqn_mse_rewards_v2_0, ddqn_mse_v_rewards_v2_0, ddqn_mse_actions_v2_0 = training(
    env = env, agent = agent, v_agent = v_agent, n_episodes = 1000,
    file_name = "ddqn_mse_htv2_0", target_update_steps = 1440,
)

In [None]:
# Labels for the V2.0 actions, and one empty count series per action for plot to fill.
action_space = {0: "Do Nothing", 1: "Heater On", 2: "Cooler On"}
action_lists = {key: [] for key in action_space}

plot(
    rewards = ddqn_mse_rewards_v2_0,
    v_rewards = ddqn_mse_v_rewards_v2_0,
    actions = ddqn_mse_actions_v2_0,
    version = "DDQN | MSE | V2.0",
    episodes = 1000,
    action_space = action_space,
    action_lists = action_lists,
)

# House Temperature Version 2.1

In [None]:
# Environment for the V2.1 experiment.
env = ht.house_temp_v2_1()

# Validation agent: gamma, lr, epsilon, buffer and batch size are all zero.
# NOTE(review): presumably so it only acts on the loaded weights without exploring - confirm in ddqn_agent.
v_agent = ddqn_agent.ddqn_agent_mse(
    model = house_temp_model_2, state_dim = 7, action_dim = env.action_space.n,
    gamma = 0, lr = 0, epsilon = 0, epsilon_min = 0, decay_steps = 1,
    buffer_size = 0, batch_size = 0, device = device,
)

# Training agent.
agent = ddqn_agent.ddqn_agent_mse(
    model = house_temp_model_2, state_dim = 7, action_dim = env.action_space.n,
    gamma = 0.99, lr = 0.00075, epsilon = 1.0, epsilon_min = 0.01, decay_steps = 28800,
    buffer_size = 28800, batch_size = 128, device = device,
)

# Train for 1000 episodes with rendering enabled during validation.
ddqn_mse_rewards_v2_1, ddqn_mse_v_rewards_v2_1, ddqn_mse_actions_v2_1 = training(
    env = env, agent = agent, v_agent = v_agent, n_episodes = 1000,
    file_name = "ddqn_mse_htv2_1", target_update_steps = 1440,
    render = True,
)

In [None]:
# Labels for the V2.1 actions, and one empty count series per action for plot to fill.
action_space = {0: "Do Nothing", 1: "Heater On", 2: "Cooler On"}
action_lists = {key: [] for key in action_space}

plot(
    rewards = ddqn_mse_rewards_v2_1,
    v_rewards = ddqn_mse_v_rewards_v2_1,
    actions = ddqn_mse_actions_v2_1,
    version = "DDQN | MSE | V2.1",
    episodes = 1000,
    action_space = action_space,
    action_lists = action_lists,
)

# House Temperature Version 3.0

In [None]:
# Environment for the V3.0 experiment.
env = ht.house_temp_v3_0()

# Validation agent: gamma, lr, epsilon, buffer and batch size are all zero.
# NOTE(review): presumably so it only acts on the loaded weights without exploring - confirm in ddqn_agent.
v_agent = ddqn_agent.ddqn_agent_mse(
    model = house_temp_model_3, state_dim = 8, action_dim = env.action_space.n,
    gamma = 0, lr = 0, epsilon = 0, epsilon_min = 0, decay_steps = 1,
    buffer_size = 0, batch_size = 0, device = device,
)

# Training agent.
agent = ddqn_agent.ddqn_agent_mse(
    model = house_temp_model_3, state_dim = 8, action_dim = env.action_space.n,
    gamma = 0.99, lr = 0.0001, epsilon = 1.0, epsilon_min = 0.01, decay_steps = 57600,
    buffer_size = 28800, batch_size = 128, device = device,
)

# Train for 1000 episodes and keep the three result lists for plotting.
ddqn_mse_rewards_v3_0, ddqn_mse_v_rewards_v3_0, ddqn_mse_actions_v3_0 = training(
    env = env, agent = agent, v_agent = v_agent, n_episodes = 1000,
    file_name = "ddqn_mse_htv3_0", target_update_steps = 1440,
)

In [None]:
# Labels for the eight V3.0 actions, and one empty count series per action for plot to fill.
action_space = {
    0: "All Off",
    1: "Heater On",
    2: "Cooler On",
    3: "Window On",
    4: "Heater/Cooler On",
    5: "Heater/Window On",
    6: "Cooler/Window On",
    7: "All On",
}
action_lists = {key: [] for key in action_space}

plot(
    rewards = ddqn_mse_rewards_v3_0,
    v_rewards = ddqn_mse_v_rewards_v3_0,
    actions = ddqn_mse_actions_v3_0,
    version = "DDQN | MSE | V3.0",
    episodes = 1000,
    action_space = action_space,
    action_lists = action_lists,
)

# House Temperature Version 3.1

In [None]:
# Environment for the V3.1 experiment.
env = ht.house_temp_v3_1()

# Validation agent: gamma, lr, epsilon, buffer and batch size are all zero.
# NOTE(review): presumably so it only acts on the loaded weights without exploring - confirm in ddqn_agent.
v_agent = ddqn_agent.ddqn_agent_mse(
    model = house_temp_model_3, state_dim = 8, action_dim = env.action_space.n,
    gamma = 0, lr = 0, epsilon = 0, epsilon_min = 0, decay_steps = 1,
    buffer_size = 0, batch_size = 0, device = device,
)

# Training agent.
agent = ddqn_agent.ddqn_agent_mse(
    model = house_temp_model_3, state_dim = 8, action_dim = env.action_space.n,
    gamma = 0.99, lr = 0.0001, epsilon = 1.0, epsilon_min = 0.01, decay_steps = 57600,
    buffer_size = 28800, batch_size = 128, device = device,
)

# Train for 800 episodes with rendering enabled during validation.
ddqn_mse_rewards_v3_1, ddqn_mse_v_rewards_v3_1, ddqn_mse_actions_v3_1 = training(
    env = env, agent = agent, v_agent = v_agent, n_episodes = 800,
    file_name = "ddqn_mse_htv3_1", target_update_steps = 1440,
    render = True,
)

In [None]:
# Labels for the eight V3.1 actions, and one empty count series per action for plot to fill.
action_space = {
    0: "All Off",
    1: "Heater On",
    2: "Cooler On",
    3: "Window On",
    4: "Heater/Cooler On",
    5: "Heater/Window On",
    6: "Cooler/Window On",
    7: "All On",
}
action_lists = {key: [] for key in action_space}

plot(
    rewards = ddqn_mse_rewards_v3_1,
    v_rewards = ddqn_mse_v_rewards_v3_1,
    actions = ddqn_mse_actions_v3_1,
    version = "DDQN | MSE | V3.1",
    episodes = 800,
    action_space = action_space,
    action_lists = action_lists,
)

# House Temperature Version 4.0

In [None]:
# Environment for the V4.0 experiment.
env = ht.house_temp_v4_0()

# Validation agent: gamma, lr, epsilon, buffer and batch size are all zero.
# NOTE(review): presumably so it only acts on the loaded weights without exploring - confirm in ddqn_agent.
v_agent = ddqn_agent.ddqn_agent_mse(
    model = house_temp_model_3, state_dim = 9, action_dim = env.action_space.n,
    gamma = 0, lr = 0, epsilon = 0, epsilon_min = 0, decay_steps = 1,
    buffer_size = 0, batch_size = 0, device = device,
)

# Training agent.
agent = ddqn_agent.ddqn_agent_mse(
    model = house_temp_model_3, state_dim = 9, action_dim = env.action_space.n,
    gamma = 0.99, lr = 0.00025, epsilon = 1.0, epsilon_min = 0.01, decay_steps = 28800,
    buffer_size = 57600, batch_size = 256, device = device,
)

# Train for up to 1500 episodes; training returns early once a validation reward exceeds 10.
ddqn_mse_rewards_v4_0, ddqn_mse_v_rewards_v4_0, ddqn_mse_actions_v4_0 = training(
    env = env, agent = agent, v_agent = v_agent, n_episodes = 1500,
    file_name = "ddqn_mse_htv4_0", target_update_steps = 14400,
    optimal = True, optimal_value = 10,
)

In [None]:
# Labels for the eight V4.0 actions, and one empty count series per action for plot to fill.
action_space = {0: "All Off", 1: "Heater On", 2: "Cooler On", 3: "Window On", 
                4: "Heater/Cooler On", 5: "Heater/Window On", 6: "Cooler/Window On", 7: "All On"}
action_lists = {key: [] for key in action_space}

# The V4.0 run uses optimal = True and may stop before 1500 episodes, so the episode
# count is taken from the rewards actually returned instead of being hard-coded.
plot(rewards = ddqn_mse_rewards_v4_0, v_rewards = ddqn_mse_v_rewards_v4_0, actions = ddqn_mse_actions_v4_0, 
     version = "DDQN | MSE | V4.0", episodes = len(ddqn_mse_rewards_v4_0), action_space = action_space, 
     action_lists = action_lists)

# House Temperature Version 4.1

In [None]:
# Environment for the V4.1 experiment.
env = ht.house_temp_v4_1()

# Validation agent: gamma, lr, epsilon, buffer and batch size are all zero.
# NOTE(review): presumably so it only acts on the loaded weights without exploring - confirm in ddqn_agent.
v_agent = ddqn_agent.ddqn_agent_mse(
    model = house_temp_model_3, state_dim = 9, action_dim = env.action_space.n,
    gamma = 0, lr = 0, epsilon = 0, epsilon_min = 0, decay_steps = 1,
    buffer_size = 0, batch_size = 0, device = device,
)

# Training agent.
agent = ddqn_agent.ddqn_agent_mse(
    model = house_temp_model_3, state_dim = 9, action_dim = env.action_space.n,
    gamma = 0.99, lr = 0.00025, epsilon = 1.0, epsilon_min = 0.01, decay_steps = 28800,
    buffer_size = 57600, batch_size = 256, device = device,
)

# Train for up to 2500 episodes; training returns early once a validation reward exceeds -10.0.
ddqn_mse_rewards_v4_1, ddqn_mse_v_rewards_v4_1, ddqn_mse_actions_v4_1 = training(
    env = env, agent = agent, v_agent = v_agent, n_episodes = 2500,
    file_name = "ddqn_mse_htv4_1", target_update_steps = 14400,
    optimal = True, optimal_value = -10.0,
)

In [None]:
# Labels for the eight V4.1 actions, and one empty count series per action for plot to fill.
action_space = {0: "All Off", 1: "Heater On", 2: "Cooler On", 3: "Window On", 
                4: "Heater/Cooler On", 5: "Heater/Window On", 6: "Cooler/Window On", 7: "All On"}
action_lists = {key: [] for key in action_space}

# The V4.1 run uses optimal = True, so the number of episodes actually trained varies
# from run to run. It is read from the returned rewards rather than hard-coded (was 1150).
plot(rewards = ddqn_mse_rewards_v4_1, v_rewards = ddqn_mse_v_rewards_v4_1, actions = ddqn_mse_actions_v4_1, 
     version = "DDQN | MSE | V4.1", episodes = len(ddqn_mse_rewards_v4_1), action_space = action_space, 
     action_lists = action_lists)

# House Temperature Version 4.2

In [None]:
# Environment for the V4.2 experiment.
env = ht.house_temp_v4_2()

# Validation agent: gamma, lr, epsilon, buffer and batch size are all zero.
# NOTE(review): presumably so it only acts on the loaded weights without exploring - confirm in ddqn_agent.
v_agent = ddqn_agent.ddqn_agent_mse(
    model = house_temp_model_3, state_dim = 9, action_dim = env.action_space.n,
    gamma = 0, lr = 0, epsilon = 0, epsilon_min = 0, decay_steps = 1,
    buffer_size = 0, batch_size = 0, device = device,
)

# Training agent.
agent = ddqn_agent.ddqn_agent_mse(
    model = house_temp_model_3, state_dim = 9, action_dim = env.action_space.n,
    gamma = 0.99, lr = 0.0001, epsilon = 1.0, epsilon_min = 0.01, decay_steps = 28800,
    buffer_size = 57600, batch_size = 256, device = device,
)

# Train for 1500 episodes and keep the three result lists for plotting.
ddqn_mse_rewards_v4_2, ddqn_mse_v_rewards_v4_2, ddqn_mse_actions_v4_2 = training(
    env = env, agent = agent, v_agent = v_agent, n_episodes = 1500,
    file_name = "ddqn_mse_htv4_2", target_update_steps = 14400,
)

In [None]:
# Labels for the eight V4.2 actions, and one empty count series per action for plot to fill.
action_space = {
    0: "All Off",
    1: "Heater On",
    2: "Cooler On",
    3: "Window On",
    4: "Heater/Cooler On",
    5: "Heater/Window On",
    6: "Cooler/Window On",
    7: "All On",
}
action_lists = {key: [] for key in action_space}

plot(
    rewards = ddqn_mse_rewards_v4_2,
    v_rewards = ddqn_mse_v_rewards_v4_2,
    actions = ddqn_mse_actions_v4_2,
    version = "DDQN | MSE | V4.2",
    episodes = 1500,
    action_space = action_space,
    action_lists = action_lists,
)