In [1]:
import os
import math
import torch
import bsuite
import random
import numpy as np
import seaborn as sns
import gymnasium as gym
import matplotlib.pyplot as plt
import torch.nn.functional as F

from torch import nn
from tqdm import tqdm
from time import sleep
from scipy import stats
from scipy.stats import norm 
from collections import Counter
from collections import defaultdict
from IPython.display import clear_output
from torch.utils.data import DataLoader, TensorDataset

  fn()


In [2]:
class QLearningAgent:
    def __init__(self, alpha, epsilon, discount, get_legal_actions, item = True):
        """
        Tabular Q-Learning agent.
        Based on https://inst.eecs.berkeley.edu/~cs188/sp19/projects.html

        Parameters
          - alpha: learning rate used in the Q-value update
          - epsilon: probability of taking a random action in get_action
          - discount: discount rate (gamma)
          - get_legal_actions: callable, state (hashable) -> sequence of
            hashable actions that are legal in that state
          - item: when True, values exposing `.item()` (NumPy / torch scalars)
            are converted to plain Python numbers before being stored

        Note: avoid touching self._qvalues directly; go through
        get_qvalue / set_qvalue instead.
        """

        self.get_legal_actions = get_legal_actions
        # Two-level table: state -> action -> Q, with unseen entries defaulting to 0.
        self._qvalues = defaultdict(lambda: defaultdict(lambda: 0))
        self.alpha = alpha
        self.epsilon = epsilon
        self.discount = discount
        self.item = item

    def get_qvalue(self, state, action):
        """ Returns Q(state,action) """
        return self._qvalues[state][action]

    def set_qvalue(self, state, action, value):
        """ Sets the Qvalue for [state,action] to the given value """
        self._qvalues[state][action] = value

    def get_value(self, state):
        """
        Return the agent's estimate of V(s) = max_a Q(s, a) over legal actions.
        Q-values may be negative, so the maximum is taken over the actual
        values; 0.0 is returned only when there are no legal actions.
        """
        possible_actions = self.get_legal_actions(state)

        if len(possible_actions) == 0:
            return 0.0

        return max([self.get_qvalue(state, a) for a in possible_actions])

    def update(self, state, action, reward, next_state, done):
        """
        Apply the Q-learning update
           Q(s,a) := (1 - alpha) * Q(s,a) + alpha * (r + gamma * V(s'))
        where the bootstrap term is dropped when `done` is truthy.
        """
        target = reward + self.discount * (1 - done) * self.get_value(next_state)
        q = (1 - self.alpha) * self.get_qvalue(state, action) + self.alpha * target

        # Only unwrap when the value actually has `.item()`; a plain Python
        # float (e.g. float reward with default-zero Q-values) has no such
        # method and used to raise AttributeError here.
        if self.item and hasattr(q, "item"):
            q = q.item()
        self.set_qvalue(state, action, q)

    def get_best_action(self, state):
        """
        Return the legal action with the highest current Q-value
        (first one on ties), or None when there are no legal actions.
        """
        possible_actions = self.get_legal_actions(state)

        if len(possible_actions) == 0:
            return None

        idx = np.argmax([self.get_qvalue(state, a) for a in possible_actions])

        return possible_actions[idx]

    def get_action(self, state):
        """
        Epsilon-greedy action selection.
        With probability self.epsilon a uniformly random legal action is
        returned; otherwise the greedy action from get_best_action.
        Returns None when there are no legal actions.
        """
        possible_actions = self.get_legal_actions(state)

        if len(possible_actions) == 0:
            return None

        if np.random.rand() < self.epsilon:
            return np.random.choice(possible_actions)

        return self.get_best_action(state)

In [3]:
class ReplayBuffer(object):
    def __init__(self, size):
        """Create Replay buffer.
        Parameters
        ----------
        size: int
            Max number of transitions to store in the buffer. When the buffer
            overflows the old memories are dropped.
        """
        self._storage = []
        self._maxsize = size
        self._next_idx = 0

    def __len__(self):
        """Number of transitions currently stored."""
        return len(self._storage)

    def add(self, obs_t, action, reward, obs_tp1, done):
        """Store one transition, overwriting the oldest slot once the buffer is full."""
        data = (obs_t, action, reward, obs_tp1, done)

        if self._next_idx >= len(self._storage):
            self._storage.append(data)
        else:
            self._storage[self._next_idx] = data
        self._next_idx = (self._next_idx + 1) % self._maxsize

    def _encode_sample(self, idxes):
        """Gather the transitions at `idxes` into five stacked NumPy arrays."""
        obses_t, actions, rewards, obses_tp1, dones = [], [], [], [], []

        for i in idxes:
            obs_t, action, reward, obs_tp1, done = self._storage[i]
            # np.asarray avoids a copy when possible; np.array(..., copy=False)
            # raises under NumPy 2 whenever a copy turns out to be required.
            obses_t.append(np.asarray(obs_t))
            actions.append(np.asarray(action))
            # Unwrap NumPy / torch scalars. The previous code appended the
            # bound method `reward.item` itself rather than calling it.
            rewards.append(reward.item() if hasattr(reward, "item") else reward)
            obses_tp1.append(np.asarray(obs_tp1))
            dones.append(done)
        return (
            np.array(obses_t),
            np.array(actions),
            np.array(rewards),
            np.array(obses_tp1),
            np.array(dones)
        )

    def sample(self, batch_size):
        """Sample a batch of experiences (uniformly, with replacement).
        Parameters
        ----------
        batch_size: int
            How many transitions to sample.
        Returns
        -------
        obs_batch: np.array
            batch of observations
        act_batch: np.array
            batch of actions executed given obs_batch
        rew_batch: np.array
            rewards received as results of executing act_batch
        next_obs_batch: np.array
            next set of observations seen after executing act_batch
        done_mask: np.array
            done_mask[i] = 1 if executing act_batch[i] resulted in
            the end of an episode and 0 otherwise.
        """
        idxes = [random.randint(0, len(self._storage) - 1)
                 for _ in range(batch_size)]
        return self._encode_sample(idxes)


In [4]:
def get_state_number(s):
    """Return the flat index of the largest entry of observation array `s`."""
    flat_observation = s.flatten()
    return np.argmax(flat_observation)

def test_agent(agent, greedy=True, delay=.5):
    """Roll out one episode in the global `env`, drawing each step.

    Every step shows the current observation next to the agent's value grid
    (computed once, before the rollout). Actions come from
    `agent.get_best_action` when `greedy` is truthy, otherwise from
    `agent.get_action`. The loop sleeps `delay` seconds after each step.
    """
    value_grid = get_all_states_value(agent)
    observation, _ = env.reset()
    finished = False
    while not finished:
        _fig, (state_ax, value_ax) = plt.subplots(ncols=2)
        state_ax.imshow(observation)
        state_ax.set_title('State')
        heatmap = value_ax.imshow(value_grid)
        plt.colorbar(heatmap)
        value_ax.set_title('Value function')
        clear_output(True)
        plt.show()

        state_index = get_state_number(observation)
        choose_action = agent.get_best_action if greedy else agent.get_action
        chosen = choose_action(state_index)

        observation, _reward, terminated, truncated, _ = env.step(chosen)
        finished = terminated or truncated
        sleep(delay)

def get_all_states_value(agent):
    """Evaluate `agent.get_value` for every flat state index of the global `env`.

    Returns an array shaped like `env.observation_space.shape`, where the
    entry at flat position i holds `agent.get_value(i)`.
    """
    grid_shape = env.observation_space.shape
    n_states = np.prod(grid_shape)

    values = np.zeros(n_states)
    for state_index in range(n_states):
        values[state_index] = agent.get_value(state_index)

    return values.reshape(grid_shape)

def to_one_hot(x, ndims):
    """Helper: turn a vector of integer class ids into a (len(x), ndims) one-hot matrix."""
    column_ids = x.long().view(-1, 1)
    n_rows = column_ids.size()[0]
    encoded = torch.zeros(n_rows, ndims)
    # Write a 1 into the column named by each row's class id.
    encoded.scatter_(1, column_ids, 1)
    return encoded

def freeze_test_agent(agent, test_episodes):
    """Evaluate the greedy policy of `agent` on the global `env`.

    Runs `test_episodes` episodes using `agent.get_best_action` only (no
    exploration, and `agent.update` is never called here) and returns a list
    with the summed environment reward of each episode.
    """
    all_rewards = []
    for _ in range(test_episodes):
        # The previous version also rebuilt the full value table here on every
        # episode and never used it; that wasted work has been dropped.
        eps_rewards = 0
        s, _ = env.reset()
        done = False

        while not done:
            state_index = get_state_number(s)
            a = agent.get_best_action(state_index)
            s, r, terminated, truncated, _ = env.step(a)
            eps_rewards += r
            done = terminated or truncated
        all_rewards.append(eps_rewards)

    return all_rewards

def average_first_visits(episode):
    """Return, per flat state index, the timestep at which it was first seen.

    `episode` is a sequence of observations; each one is mapped to a state
    index with `np.argmax`. The result is a float array with one entry per
    state of the global `env`. States never seen in the episode get the
    sentinel value 1000. Despite the name, no averaging happens here; callers
    average these vectors across episodes.
    """
    UNVISITED_STEP = 1000  # sentinel used for states the episode never reached

    n_states = int(np.prod(env.observation_space.shape))
    first_visit_steps = np.full(n_states, float(UNVISITED_STEP))

    seen = set()
    for timestep, observation in enumerate(episode):
        state = int(np.argmax(observation))
        if state not in seen:
            seen.add(state)
            first_visit_steps[state] = timestep

    return first_visit_steps

def visualize_single_run(episode, agent_states, show=False):
    """Average the first-visit timesteps over the first `episode` episodes.

    Parameters
      - episode: number of leading episodes of `agent_states` to include
        (must be >= 1)
      - agent_states: sequence of episodes, each a sequence of observations
      - show: when True, also draw the averaged map as a heat map. Defaults to
        False because the plotting code used to sit after the `return` and was
        never executed, so existing callers only ever received the array.

    Returns a 1-D array with one averaged first-visit timestep per state.
    """
    if episode < 1:
        raise ValueError("episode must be at least 1")

    s_shape = env.observation_space.shape

    # Stack into a 2-D (episodes, states) matrix even for one episode, so the
    # mean over axis 0 stays a per-state vector rather than a single scalar.
    first_visit_rows = [average_first_visits(agent_states[k]) for k in range(episode)]
    average_visits = np.mean(np.vstack(first_visit_rows), axis=0)

    if show:
        plt.imshow(average_visits.reshape(s_shape), cmap='Spectral')
        plt.title(f'Average First-visit visualizations\n{episode} Episodes')
        plt.colorbar()
        plt.show()

    return average_visits

    
def visualize_multiple_runs(all_episodes, epsilon, agent_states, learning_agent, num_episodes):
    """Plot averaged first-visit maps for several episode-count checkpoints.

    Parameters
      - all_episodes: sequence of episode counts; one panel is drawn per entry,
        each averaging the first `count` episodes of `agent_states`
      - epsilon: value shown in the y-label of the first panel
      - agent_states: sequence of episodes, each a sequence of observations
      - learning_agent, num_episodes: only used in the printed header line
    """
    s_shape = env.observation_space.shape

    # Compute each episode's first-visit vector once and reuse it for every
    # checkpoint, instead of recomputing the whole prefix per checkpoint.
    max_needed = max(all_episodes, default=0)
    first_visit_rows = [average_first_visits(agent_states[k]) for k in range(max_needed)]

    # np.vstack keeps the data 2-D even for a one-episode checkpoint, so the
    # mean over axis 0 is always a per-state vector.
    all_average_visits = [
        np.mean(np.vstack(first_visit_rows[:episode]), axis=0)
        for episode in all_episodes
    ]

    print(f'average_first_visits {learning_agent} Agent for {num_episodes} Episodes')
    print('-----------------------------------------')

    # squeeze=False always yields a 2-D axes array, so indexing also works
    # when only one checkpoint is requested.
    _, axes_grid = plt.subplots(1, len(all_episodes), figsize = (15,15), squeeze=False)
    axarr1 = axes_grid[0]
    axarr1[0].set_ylabel(f'Epsilon: {epsilon}')

    last_index = len(all_average_visits) - 1
    for i, average_visits in enumerate(all_average_visits):
        grid = average_visits.reshape(s_shape)
        axarr1[i].imshow(grid, cmap='Spectral')

        axarr1[i].set_xticks([])
        axarr1[i].set_yticks([])
        axarr1[i].set_title(f'{all_episodes[i]} Episodes')

        if i == last_index:
            # Kept from the original: the last panel is redrawn with 'RdYlBu'
            # and carries the only colorbar.
            # NOTE(review): the other panels use 'Spectral' - confirm whether
            # the different colormap is intentional.
            last_subplot = axarr1[-1]
            cax = last_subplot.imshow(grid, cmap='RdYlBu')
            plt.colorbar(cax, ax=last_subplot, fraction=0.05)

    plt.show()
    
def average_test_rewards(rewards):
    """Average test rewards element-wise across runs.

    `rewards` is a sequence of runs; each run is a sequence of rows, and each
    row a sequence of numbers. The result is a list of rows in which every
    entry is the mean of the corresponding entries across all runs.

    Any number of runs is accepted. The previous version indexed runs 0, 1
    and 2 explicitly, so it only worked for exactly three runs.
    """
    all_avgs = []

    # zip(*rewards) walks the runs in lock-step, yielding the j-th row of each.
    for rows_across_runs in zip(*rewards):
        # zip(*rows) then lines up the k-th entry of that row from every run.
        all_avgs.append([np.mean(values) for values in zip(*rows_across_runs)])

    return all_avgs

def flatten_rewards(test_rewards):
    """Concatenate the sub-sequences of `test_rewards` into one flat list."""
    return [reward for sublist in test_rewards for reward in sublist]
    
    
def state_visitation_percentage(env, episodes, states, learning_agent):
    """Print and return the percentage of reachable states that were visited.

    The percentage itself comes from `get_state_visitation_percentage`, which
    this function used to duplicate line for line. `episodes` and
    `learning_agent` are only used in the printed summary line.
    """
    percentage_visited = get_state_visitation_percentage(env, states)

    print(f"% of total states visited/ {episodes} Eps: {percentage_visited:.2f}%  {learning_agent} ")

    return percentage_visited

def get_state_visitation_percentage(env, states):
    """Return the percentage of reachable states that appear in `states`.

    `states` is a sequence of episodes, each a sequence of observations; every
    observation is reduced to a state index with `np.argmax`. Index 0 is left
    out of the visited set. The denominator is half the total state count plus
    half the first grid dimension (N*(N+1)/2 for an N x N grid).
    """
    grid_shape = env.observation_space.shape
    available_states = np.prod(grid_shape)/2 + grid_shape[0]/2

    # Distinct non-zero state indices seen across all episodes.
    visited_states = {
        state
        for episode in states
        for state in (np.argmax(observation) for observation in episode)
        if state != 0
    }

    return (len(visited_states) / available_states) * 100


def plot_ftv_histograms(states):
    """Compute mean first-visit timesteps for the lower-triangular grid states.

    Despite the name, nothing is plotted here; the function returns the values
    that `plot_histograms` draws.

    `states` is a sequence of runs. Each run is a sequence of sub-sequences
    that are concatenated into one long list; the position in that list is
    the "timestep". Elements are used directly as dict keys and matched
    against the integer keys 0..n_states-1, so they are presumably flat state
    indices rather than raw observations - TODO confirm against the caller.

    Returns a dict-values view with one entry per state on or below the grid
    diagonal.
    """
    s_shape = env.observation_space.shape
    s_shape_flatten = np.prod(s_shape)
    # Default first-visit value (0) for every state index of the global env.
    available_states = {i: 0 for i in range(s_shape_flatten)}

    # Flatten each run's nested lists into one list per run.
    all_flat_states = []
    for agent_states in states:
        flat_states = []
        for sublist in agent_states:
            flat_states.extend(sublist)
        all_flat_states.append(flat_states)

    merged_dicts = []
    for run in range(len(all_flat_states)):

        # Position of the first occurrence of each state within this run.
        first_visits = {}
        for timestep, state in enumerate(all_flat_states[run]):
            if state not in first_visits:
                first_visits[state] = timestep

        # Every known state index gets an entry; states missing from this run keep the default 0.
        merged_dict = {key: first_visits[key] if key in first_visits else available_states[key] for key in available_states}
        merged_dicts.append(merged_dict)

    # Rows are runs, columns are state indices (in available_states key order).
    arr = np.array([list(d.values()) for d in merged_dicts])

    mean_values = np.mean(arr, axis=0)

    # int() truncates the per-state mean across runs.
    mean_dict = {key: int(mean_values[i]) for i, key in enumerate(merged_dicts[0].keys())}

    values = list(mean_dict.values())

    grid_size = env.observation_space.shape[0]
    # 1-based state numbers laid out row by row on a grid_size x grid_size grid.
    state_values = [[i + j*grid_size + 1 for i in range(grid_size)] for j in range(grid_size)]
    lower_diagonal_values = []

    # Keep the 0-based indices of cells on or below the main diagonal.
    for row in range(grid_size):
        for col in range(grid_size):
            if row >= col:
                lower_diagonal_values.append(state_values[row][col] - 1)  # Subtract 1 here

    updated_dict = {state: mean_dict[state] for state in lower_diagonal_values}

    # A mean of 0 is replaced by 50000. Because 0 is also the default for
    # never-visited states, this affects both unvisited states and states
    # whose truncated mean first-visit time is 0.
    for key, val in updated_dict.items():
        if val == 0:
            updated_dict[key] = 50000

    return updated_dict.values()


def plot_histograms(all_is, label, color):
    """Draw a histogram of first-visit times with a fitted normal curve.

    Parameters
      - all_is: per-run state sequences, passed straight to `plot_ftv_histograms`
      - label: legend label for the fitted normal curve
      - color: colour used for both the histogram and the fitted curve
    """
    data = list(plot_ftv_histograms(all_is))
    mu, std = norm.fit(data)
    sns.histplot(data, bins=20, kde=True, color=color)

    # Evaluate the fitted normal pdf across the current x-axis range.
    xmin, xmax = plt.xlim()
    x = np.linspace(xmin, xmax, 100)
    p = norm.pdf(x, mu, std)

    # The old call passed both the 'k' format string and color=...; matplotlib
    # warns about the redundant colour and uses the keyword, so only the
    # keyword is kept.
    plt.plot(x, p, linewidth=2, label=label, color=color)


# Epsilon-Annealing Q

In [None]:
def epsilon_annealing(env, agent, seed, num_episodes, test_reward_period, initial_epsilon, final_epsilon):
    """Train `agent` with a linearly annealed exploration rate.

    Epsilon decays linearly from `initial_epsilon` over `num_episodes` and is
    floored at `final_epsilon`. Every `test_reward_period` episodes (starting
    with episode 0) the greedy policy is evaluated over 10 episodes and the
    state-visitation percentage so far is recorded.

    Returns
      - all_states: one list of raw observations per training episode
      - all_test_rewards: mean greedy test reward per evaluation
      - all_percentages: state-visitation percentage per evaluation
    """
    all_states = []
    all_test_rewards = []
    all_percentages = []

    for i in tqdm(range(num_episodes)):
        s, _ = env.reset(seed=seed)
        done = False
        episode_states = []
        epsilon = max(final_epsilon, initial_epsilon * (1 - i / num_episodes))

        while not done:
            episode_states.append(s)
            i_s = get_state_number(s)

            if np.random.rand() < epsilon:
                a = env.action_space.sample()  # Exploration
            else:
                # NOTE(review): agent.get_action is itself epsilon-greedy with
                # agent.epsilon, so this branch still explores. Kept as-is to
                # preserve existing results; confirm whether
                # agent.get_best_action was intended.
                a = agent.get_action(i_s)

            s_next, r, terminated, truncated, _ = env.step(a)
            done = terminated or truncated
            i_s_next = get_state_number(s_next)
            # Bootstrapping is cut only on true termination, not on truncation.
            agent.update(i_s, a, r, i_s_next, terminated)
            s = s_next

        all_states.append(episode_states)

        if i % test_reward_period == 0:
            tests = freeze_test_agent(agent, 10)
            all_test_rewards.append(np.mean(tests))

            percentages = get_state_visitation_percentage(env, all_states)
            all_percentages.append(percentages)

    return all_states, all_test_rewards, all_percentages


In [None]:
# Experiment: epsilon-annealed tabular Q-learning on bsuite DeepSea (size 24),
# repeated for `num_agents` independently seeded agents.
num_agents = 3
learning_agent = 'qlearning'

# One entry per agent.
anneal_all_agent_percentages = []
anneal_all_states = []

for i in range(num_agents):
    # Seed every RNG in use with the agent index.
    seed = i
    random.seed(seed)
    np.random.seed(seed)
    os.environ['PYTHONHASHSEED'] = str(seed)
    torch.manual_seed(seed)
    torch.cuda.manual_seed(seed)
    torch.backends.cudnn.deterministic = True

    # The environment itself is always created with seed=42, independent of the agent seed.
    env = gym.make("bsuite/deep_sea-v0", size=24, seed=42)

    initial_epsilon = 1.0
    final_epsilon = 0.1
    epsilon = 0.1
    num_episodes = 5000       
    test_reward_period = 100

    # The lambda reads the global `env` at call time, not at definition time.
    agent = QLearningAgent(alpha    = 0.5, 
                           epsilon  = epsilon,
                           discount = 0.9,
                           get_legal_actions=lambda s: range(env.action_space.n),
                           item = False)

    anneal_states, anneal_test_rew, anneal_percentages, = epsilon_annealing(env,
                                                                             agent,
                                                                             seed, 
                                                                             num_episodes,
                                                                             test_reward_period,
                                                                             initial_epsilon,
                                                                             final_epsilon)
    
    anneal_all_agent_percentages.append(anneal_percentages)
    anneal_all_states.append(anneal_states)
    
    
# Persist results; the file name includes the grid size taken from the last `env` created above.
save_dir = f'saved_deepsea_results/'
precentages_path = f"{save_dir}q_learning_percentages_{env.observation_space.shape[0]}.npy"
os.makedirs(save_dir, exist_ok=True)
np.save(precentages_path, anneal_all_agent_percentages)

# Only the first agent's visited states are saved.
save_dir = f'saved_deepsea_results/'
state_visits_path = f"{save_dir}q_learning_states_{env.observation_space.shape[0]}.npy"
os.makedirs(save_dir, exist_ok=True)
np.save(state_visits_path, anneal_all_states[0])

# Intrinsic Rewards

In [None]:
class BaseIntrinsicRewardModule(nn.Module):
    """Null intrinsic-reward module: zero bonus and no trainable loss."""

    def __init__(self):
        super().__init__()

    def get_intrinsic_reward(self, state, action, next_state):
        """Return the exploration bonus for one transition (always 0.0 here)."""
        return 0.0

    def get_loss(self, state_batch, action_batch, next_state_batch):
        """Return the training loss for a batch; the base module has none."""
        return None

In [None]:
class MLP(nn.Module):
    """Two-layer perceptron: Linear -> ReLU -> Linear, Xavier-uniform weights."""

    def __init__(self, input_size, hidden_size, output_size):
        super().__init__()
        stack = [
            nn.Linear(input_size, hidden_size),
            nn.ReLU(),
            nn.Linear(hidden_size, output_size),
        ]
        self.layers = nn.Sequential(*stack)
        # Re-initialise only the weight matrices; biases keep their defaults.
        self.layers.apply(self._xavier_init)

    @staticmethod
    def _xavier_init(module):
        """Apply Xavier-uniform init to the weight of every nn.Linear."""
        if isinstance(module, nn.Linear):
            nn.init.xavier_uniform_(module.weight)

    def forward(self, x):
        """Run `x` through the layer stack."""
        return self.layers(x)

In [None]:
def train_with_reward(env, agent, seed, reward_module, intrinsic_weight, n_episodes, test_reward_period,
                      update_reward_period, batch_size, n_iter, std_dev, annealing = True):
    """Train `agent` with extrinsic reward plus a weighted intrinsic bonus.

    Parameters
      - env: environment, reset with `seed` at the start of every episode
      - agent: provides get_action(state_index) and update(...)
      - reward_module: provides get_intrinsic_reward / get_loss; if it has
        parameters, they are trained with Adam
      - intrinsic_weight: multiplier on the intrinsic reward; may be decayed
        during training when `annealing` is True
      - n_episodes: number of training episodes
      - test_reward_period: greedy evaluation every this many episodes
      - update_reward_period: reward-module training every this many episodes
      - batch_size, n_iter: replay batch size and number of optimizer steps
        per reward-module update
      - std_dev: not used by this function
      - annealing: enable the intrinsic-weight decay rule

    Returns
      (all_rewards, all_states, all_test_rewards, all_intrinsic_rewards,
       all_percentages). `all_rewards` holds per-episode extrinsic returns.
       The collected reward-module losses are not returned.
    """
    
    buffer = ReplayBuffer(size=int(1e6))
    
    # Modules without parameters (e.g. count-based ones) get no optimizer.
    if list(reward_module.parameters()):
        optimizer = torch.optim.Adam(reward_module.parameters())
    else:
        optimizer = None
                
    losses = []
    
    previous_mean = 0
    rolling_mean_window = 50
    rolling_means = [] 
    test_n = 0
    target_value = 0.99
    
    all_test_rewards = []    
    all_intrinsic_rewards = []
    all_rewards  = []
    all_states = []
    all_percentages = []
    
    for i in tqdm(range(n_episodes)):
        s, _   = env.reset(seed=seed)

        done = False
        episode_rewards = 0
        episode_states  = []
        episodes_int_rews = []
        episode_is = []
        steps = 0
        
        while not done:
            steps+=1
            
            episode_states.append(s)
            i_s = get_state_number(s)
            episode_is.append(i_s)
            
            a = agent.get_action(i_s)
            s_next, r, terminated, truncated, _  = env.step(a)
            # Only the extrinsic reward counts towards the episode return.
            episode_rewards += r
            
            done = terminated or truncated
            i_s_next = get_state_number(s_next)
            # Flatten observations and the action into (1, -1) float tensors for the reward module.
            state_t  = torch.tensor(s).float().view(1, -1)        
            action_t = torch.tensor(a).float().view(1, -1)
            next_state_t = torch.tensor(s_next).float().view(1, -1)
            
            r_intr = intrinsic_weight * reward_module.get_intrinsic_reward(state_t, action_t, next_state_t)
            episodes_int_rews.append(r_intr)
            # From here on `r` is extrinsic + weighted intrinsic reward.
            r += r_intr

            agent.update(i_s, a, r, i_s_next, terminated)
            buffer.add(state_t, a, r, next_state_t, terminated)
            
            s = s_next
            
            # Redundant with the loop condition: `done` already includes `truncated`.
            if done or truncated:
                break
                
        all_rewards.append(episode_rewards)
        all_states.append(episode_states)
        all_intrinsic_rewards.append(np.mean(episodes_int_rews))
        
        """Anneal Intrinsic Reward"""
        # Every 50 episodes: if the mean extrinsic return over the last
        # `rolling_mean_window` episodes improved on the previous check,
        # shrink the intrinsic weight by 10%.
        if annealing:
            if (i +1) % 50 == 0:
                current_mean  = np.mean(all_rewards[-rolling_mean_window:])
                if current_mean > previous_mean:
                    intrinsic_weight *= 0.9
                previous_mean = current_mean
        
        """Freeze & Test Policy"""
        # Greedy evaluation over 10 episodes plus visitation statistics so far.
        if (i+1) % test_reward_period == 0:
            test_n+=1
            test_rewards = freeze_test_agent(agent,10)
            all_test_rewards.append(np.mean(test_rewards))
            
            percentages = get_state_visitation_percentage(env, all_states)
            all_percentages.append(percentages)


        """Update Intrinsic Module"""
        # Train the reward module on replayed transitions; skipped when it has no parameters.
        if (i + 1) % update_reward_period == 0 and optimizer is not None:
        
            for _ in range(n_iter):
                optimizer.zero_grad()
                state_batch, action_batch, _, next_state_batch, _ = buffer.sample(batch_size)
                # Stored states were shaped (1, -1); flatten(1, 2) merges that leading 1 away per sample.
                state_tensor  = torch.tensor(state_batch).float().flatten(1, 2)

                action_tensor = torch.tensor(action_batch).float().view(-1, 1)
                next_state_tensor = torch.tensor(next_state_batch).float().flatten(1, 2)
                                
                loss = reward_module.get_loss(state_tensor, action_tensor, next_state_tensor)
                loss.backward()
                optimizer.step()
                losses.append(loss.item())
                    
    return all_rewards, all_states, all_test_rewards, all_intrinsic_rewards, all_percentages

# RND

In [None]:
class RandomNetworkDistilationModule(BaseIntrinsicRewardModule):
    """Random Network Distillation bonus.

    A predictor MLP is trained to match the output of a randomly initialised
    target MLP on next states; the prediction error is the intrinsic reward.
    """

    def __init__(self, states_size, embedding_size, hidden_size):
        super().__init__()
        self.target_network = MLP(states_size, hidden_size, embedding_size)
        self.predictor_network = MLP(states_size, hidden_size, embedding_size)
        self.target_network.eval()

    def get_intrinsic_reward(self, state, action, next_state):
        """Summed squared target/predictor error on `next_state` (no gradients)."""
        with torch.no_grad():
            expected = self.target_network(next_state)
            predicted = self.predictor_network(next_state)
            squared_gap = (expected - predicted) ** 2
            return squared_gap.sum()

    def get_loss(self, state_batch, action_batch, next_state_batch):
        """Half the mean squared error between predictor and (gradient-free) target."""
        # Target outputs are computed without gradients.
        with torch.no_grad():
            expected = self.target_network(next_state_batch)

        predicted = self.predictor_network(next_state_batch)
        squared_gap = (expected - predicted) ** 2
        return 0.5*squared_gap.mean()

In [None]:
# Experiment: Q-learning with an RND intrinsic bonus on bsuite DeepSea (size 24),
# repeated for `num_agents` independently seeded agents.
num_agents = 3
rnd_all_agent_percentages = []
rnd_all_states = []

for i in range(num_agents):
    # Seed every RNG in use with the agent index.
    seed = i
    random.seed(seed)
    np.random.seed(seed)
    os.environ['PYTHONHASHSEED'] = str(seed)
    torch.manual_seed(seed)
    torch.cuda.manual_seed(seed)
    torch.backends.cudnn.deterministic = True
    # The environment itself is always created with seed=42.
    env = gym.make("bsuite/deep_sea-v0", size=24, seed=42)

    time_num_episodes = 5000      
    test_reward_period = 100

    # The lambda reads the global `env` at call time, not at definition time.
    agent = QLearningAgent(alpha    = 0.5, 
                           epsilon  = 0.1,
                           discount = 0.9,
                           get_legal_actions=lambda s: range(env.action_space.n),
                           item = False)

    # Positional arguments are (states_size, embedding_size, hidden_size):
    # the embedding is as wide as the flattened state and the hidden layer has 16 units.
    rnd = RandomNetworkDistilationModule(np.prod(env.observation_space.shape), 
                                         np.prod(env.observation_space.shape), 
                                         16)

    rnd_rewards, rnd_states, rnd_test_rewards, rnd_ints,  rnd_percentages = train_with_reward(env,
                                                                                agent,
                                                                                seed,
                                                                                rnd,
                                                                                intrinsic_weight = 1,
                                                                                n_episodes=time_num_episodes,
                                                                                test_reward_period = test_reward_period,
                                                                                update_reward_period = 100,
                                                                                batch_size =  150,
                                                                                n_iter =  50,
                                                                                std_dev = 0.0, 
                                                                                annealing = True)
                    
    rnd_all_agent_percentages.append(rnd_percentages)
    rnd_all_states.append(rnd_states)
    
# Persist results; the file name includes the grid size taken from the last `env` created above.
save_dir = f'saved_deepsea_results/'
precentages_path = f"{save_dir}rnd_percentages_{env.observation_space.shape[0]}.npy"
os.makedirs(save_dir, exist_ok=True)
np.save(precentages_path, rnd_all_agent_percentages)

# Only the first agent's visited states are saved.
save_dir = f'saved_deepsea_results/'
state_visits_path = f"{save_dir}rnd_states_{env.observation_space.shape[0]}.npy"
os.makedirs(save_dir, exist_ok=True)
np.save(state_visits_path, rnd_all_states[0])

# ICM

In [None]:
class Embedder(nn.Module):
    """State encoder: an MLP mapping flattened states to an embedding vector."""

    def __init__(self, states_size, embedding_size, hidden_size):
        super().__init__()
        self.module = MLP(states_size, hidden_size, embedding_size)

    def forward(self, s):
        """Embed the state batch `s`."""
        embedding = self.module(s)
        return embedding
    
class ForwardDynamics(BaseIntrinsicRewardModule):
    """Residual forward model: predicts the next state from (state, action)."""

    def __init__(self, states_size, actions_size, hidden_size, alpha=.1):
        super().__init__()
        input_width = actions_size + states_size
        self.module = MLP(input_width, hidden_size, states_size)
        self.alpha = alpha
        self.mean_reward = 0

    def forward(self, s, a):
        """Return `s` plus the MLP's predicted change for action `a`."""
        state_action = torch.cat([s, a], dim=-1)
        predicted_delta = self.module(state_action)
        return s + predicted_delta

    def get_loss(self, state_batch, action_batch, next_state_batch):
        """Mean squared error between predicted and actual next states."""
        prediction = self.forward(state_batch, action_batch)
        return F.mse_loss(prediction, next_state_batch)
    
class InverseDynamics(BaseIntrinsicRewardModule):
    """Inverse model: predicts which action led from state `s` to `s_next`."""

    def __init__(self, states_size, actions_size, hidden_size, alpha=0.1):
        super().__init__()
        self.module = MLP(2 * states_size,
                          hidden_size,
                          actions_size)

        self.alpha = alpha
        self.mean_reward = 0
        self.actions_size = actions_size

    def _logits(self, s, s_next):
        """Unnormalised action scores for the concatenated (s, s_next) pair."""
        return self.module(torch.cat((s, s_next), dim=-1))

    def forward(self, s, s_next):
        """Return action probabilities (softmax over the last dimension)."""
        return torch.softmax(self._logits(s, s_next), dim=-1)

    def get_loss(self, state_batch, action_batch, next_state_batch):
        """Mean cross-entropy between predicted and taken actions.

        Same quantity as -(log(softmax) * one_hot).sum(-1).mean(), but computed
        from the logits. The old form took log() of probabilities that can
        underflow to 0, giving -inf * 0 = NaN in the masked product.
        """
        logits = self._logits(state_batch, next_state_batch)
        targets = action_batch.long().view(-1)
        return F.cross_entropy(logits, targets)

    
class ICMModule(BaseIntrinsicRewardModule):
    """Intrinsic Curiosity Module: embedder + forward model + inverse model."""

    def __init__(self, states_size, actions_size, hidden_size, embedding_size):
        super().__init__()

        # The forward model takes the action as one scalar feature
        # (actions_size = 1); the inverse model predicts over `actions_size` actions.
        self.forward_model = ForwardDynamics(embedding_size, actions_size = 1, hidden_size = hidden_size)
        self.inverse_model = InverseDynamics(embedding_size, actions_size, hidden_size, alpha=0.1)
        self.embedder = Embedder(states_size, embedding_size, hidden_size)
        self.n = 1
        self.mean_reward = 0
        self.actions_size = actions_size

    def get_intrinsic_reward(self, state, action, next_state):
        """0.3 x mean squared forward-model error in embedding space (no gradients)."""
        with torch.no_grad():
            current_embedding = self.embedder.forward(state)
            next_embedding = self.embedder.forward(next_state)
            predicted_next = self.forward_model.forward(current_embedding, action)
            prediction_error = (predicted_next - next_embedding).pow(2).mean()
            return 0.3*prediction_error

    def get_loss(self, state_batch, action_batch, next_state_batch):
        """Sum of forward and inverse losses.

        The forward loss sees detached embeddings, so only the inverse loss
        sends gradients into the embedder.
        """
        current_embedding = self.embedder(state_batch)
        next_embedding = self.embedder(next_state_batch)

        forward_loss = self.forward_model.get_loss(
            current_embedding.detach(), action_batch, next_embedding.detach())
        inverse_loss = self.inverse_model.get_loss(
            current_embedding, action_batch, next_embedding)

        return forward_loss + inverse_loss
    

In [None]:
# Experiment: Q-learning with an ICM intrinsic bonus on bsuite DeepSea (size 24),
# repeated for `num_agents` independently seeded agents.
num_agents = 3
icm_all_agent_percentages = []
icm_all_states = []

for i in range(num_agents):
    # Seed every RNG in use with the agent index.
    seed = i
    random.seed(seed)
    np.random.seed(seed)
    os.environ['PYTHONHASHSEED'] = str(seed)
    torch.manual_seed(seed)
    torch.cuda.manual_seed(seed)
    torch.backends.cudnn.deterministic = True
    # The environment itself is always created with seed=42.
    env = gym.make("bsuite/deep_sea-v0", size=24, seed=42)

    time_num_episodes = 5000       
    test_reward_period = 100

    # The lambda reads the global `env` at call time, not at definition time.
    agent = QLearningAgent(alpha    = 0.5, 
                           epsilon  = 0.1,
                           discount = 0.9,
                           get_legal_actions=lambda s: range(env.action_space.n),
                           item = False)

    icm = ICMModule(states_size = np.prod(env.observation_space.shape), 
                    actions_size   = env.action_space.n, 
                    hidden_size = 16,
                    embedding_size = 32)

    # The per-episode extrinsic returns (first return value) are discarded here.
    _, icm_states, icm_test_rewards, icm_ints, icm_percentages = train_with_reward(env,
                                                                        agent,
                                                                        seed, 
                                                                        icm,
                                                                        intrinsic_weight = 1,
                                                                        n_episodes=time_num_episodes,
                                                                        test_reward_period = test_reward_period,
                                                                        update_reward_period = 100,
                                                                        batch_size =  150,
                                                                        n_iter =  50,
                                                                        std_dev = 0.0, 
                                                                        annealing = True)
    icm_all_agent_percentages.append(icm_percentages)
    icm_all_states.append(icm_states)
    
# Persist results; the file name includes the grid size taken from the last `env` created above.
save_dir = f'saved_deepsea_results/'
precentages_path = f"{save_dir}icm_percentages_{env.observation_space.shape[0]}.npy"
os.makedirs(save_dir, exist_ok=True)
np.save(precentages_path, icm_all_agent_percentages)

# Only the first agent's visited states are saved.
save_dir = f'saved_deepsea_results/'
state_visits_path = f"{save_dir}icm_states_{env.observation_space.shape[0]}.npy"
os.makedirs(save_dir, exist_ok=True)
np.save(state_visits_path, icm_all_states[0])

# Count

In [None]:
class CountBasedModule(BaseIntrinsicRewardModule):
    """Count-based exploration bonus over hashed states.

    Each state is projected with a fixed random Gaussian matrix and the sign
    pattern of the projection is used as a discrete hash key (SimHash-style).
    The reward for a visit is ``beta / sqrt(n)``, where ``n`` is the number of
    times that key has been seen so far, including the current visit.

    Args:
        hash_functions: number of random projections (length of the sign key).
        states_size: dimensionality of a flattened state.
        beta: scale of the exploration bonus.
    """

    def __init__(self, hash_functions, states_size, beta):
        super().__init__()
        # Visit counter keyed by the string form of the sign pattern.
        self.hash = {}
        # Fixed projection matrix; drawn from NumPy's global RNG.
        self.A = np.random.normal(0, 1, (hash_functions, states_size))
        self.beta = beta

    def get_intrinsic_reward(self, state, action, next_state):
        """Register a visit to ``state`` and return its count-based bonus.

        Only ``state`` is used; ``action`` and ``next_state`` are accepted for
        interface compatibility with the other reward modules.

        Args:
            state: tensor whose first row (``state[0]``) is the state vector.
            action: unused.
            next_state: unused.

        Returns:
            A 0-dim tensor holding ``self.beta / sqrt(visit_count)``.
        """
        state = state.cpu().numpy()[0]
        key = str(np.sign(np.dot(self.A, state)))

        visits = self.hash.get(key, 0) + 1
        self.hash[key] = visits

        # Use the instance's beta: the previous code read a global named
        # `beta`, which silently ignored the constructor argument.
        intrinsic_reward = self.beta / np.sqrt(visits)

        return torch.tensor(intrinsic_reward)

In [None]:
# Train `num_agents` independently seeded Q-learning agents on DeepSea with the
# count-based intrinsic reward, then save the collected statistics.
num_agents = 3
count_all_agent_percentages = []
count_all_states = []

for i in range(num_agents):
    # Seed every RNG source with the agent index.
    seed = i
    random.seed(seed)
    np.random.seed(seed)
    os.environ['PYTHONHASHSEED'] = str(seed)
    torch.manual_seed(seed)
    torch.cuda.manual_seed(seed)
    torch.backends.cudnn.deterministic = True
    env = gym.make("bsuite/deep_sea-v0", size=24, seed=42)

    # Hyperparameters for this run.
    epsilon = 0.1
    time_num_episodes = 5000
    test_reward_period = 100
    hash_functions = 32
    beta = 1
    states_size = np.prod(env.observation_space.shape)

    agent = QLearningAgent(
        alpha=0.5,
        epsilon=epsilon,
        discount=0.9,
        get_legal_actions=lambda s: range(env.action_space.n),
        item=True,
    )

    count = CountBasedModule(hash_functions, states_size, beta)

    (count_rewards,
     count_states,
     count_test_rewards,
     count_ints,
     count_percentages) = train_with_reward(
        env,
        agent,
        seed,
        count,
        intrinsic_weight=1,
        n_episodes=time_num_episodes,
        test_reward_period=test_reward_period,
        update_reward_period=100,
        batch_size=150,
        n_iter=50,
        std_dev=0.0,
        annealing=True,
    )
    count_all_agent_percentages.append(count_percentages)
    count_all_states.append(count_states)

# Persist results: percentages for all agents, visited states of the first agent.
save_dir = 'saved_deepsea_results/'
os.makedirs(save_dir, exist_ok=True)

precentages_path = f"{save_dir}count_percentages_{env.observation_space.shape[0]}.npy"
np.save(precentages_path, count_all_agent_percentages)

state_visits_path = f"{save_dir}count_states_{env.observation_space.shape[0]}.npy"
np.save(state_visits_path, count_all_states[0])


# RIDE

In [None]:
class Embedder(nn.Module):
    """State encoder: wraps an ``MLP`` that maps raw states to embeddings.

    ``MLP`` is defined elsewhere in the notebook; it is called here as
    ``MLP(states_size, hidden_size, embedding_size)`` — presumably
    (input, hidden, output) widths, TODO confirm against its definition.
    """

    def __init__(self, states_size, embedding_size, hidden_size):
        super().__init__()
        self.module = MLP(states_size, hidden_size, embedding_size)

    def forward(self, s):
        """Return the wrapped network's output for the state tensor ``s``."""
        embedding = self.module(s)
        return embedding
    
class ForwardDynamics(BaseIntrinsicRewardModule):
    """Residual forward model: predicts the next state from (state, action).

    The wrapped ``MLP`` (defined elsewhere in the notebook) receives the
    concatenated state/action vector and its output is added to the input
    state to form the prediction.
    """

    def __init__(self, states_size, actions_size, hidden_size, alpha=.1):
        super().__init__()
        input_width = actions_size + states_size
        self.module = MLP(input_width, hidden_size, states_size)
        self.alpha = alpha
        self.mean_reward = 0

    def forward(self, s, a):
        """Return ``s + MLP([s, a])``; ``s`` and ``a`` are joined on the last dim."""
        residual = self.module(torch.cat([s, a], dim=-1))
        return s + residual

    def get_loss(self, state_batch, action_batch, next_state_batch):
        """Mean-squared error between predicted and actual next states."""
        prediction = self.forward(state_batch, action_batch)
        return F.mse_loss(prediction, next_state_batch)
    
class InverseDynamics(BaseIntrinsicRewardModule):
    """Inverse model: predicts which action led from ``s`` to ``s_next``.

    Args:
        states_size: width of a single state (or state embedding) vector.
        actions_size: number of discrete actions (width of the output).
        hidden_size: hidden width passed to the wrapped ``MLP``.
        alpha: stored on the instance; not used inside this class.
    """

    def __init__(self, states_size, actions_size, hidden_size, alpha=0.1):
        super().__init__()
        self.module = MLP(2 * states_size,
                          hidden_size,
                          actions_size)

        self.alpha = alpha
        self.mean_reward = 0
        self.actions_size = actions_size

    def _logits(self, s, s_next):
        """Unnormalised action scores for the concatenated state pair."""
        return self.module(torch.cat((s, s_next), dim=-1))

    def forward(self, s, s_next):
        """Return action probabilities (softmax over the last dimension)."""
        return torch.softmax(self._logits(s, s_next), dim=-1)

    def get_loss(self, state_batch, action_batch, next_state_batch):
        """Cross-entropy between predicted and taken actions, averaged over the batch.

        Log-probabilities come from ``log_softmax`` on the logits rather than
        ``log(softmax(...))``: the latter underflows to ``-inf`` for very
        unlikely actions, and ``-inf * 0`` in the one-hot product yields NaN.
        """
        log_probs = F.log_softmax(self._logits(state_batch, next_state_batch), dim=-1)
        a_one_hot = to_one_hot(action_batch, self.actions_size)
        return -(log_probs * a_one_hot).sum(dim=-1).mean()

    
class RIDEModule(BaseIntrinsicRewardModule):
    """Intrinsic reward combining embedding change with a count-based bonus.

    The reward is the mean squared difference between the embeddings of the
    next and current state, multiplied by the bonus returned by an internal
    ``CountBasedModule``. The embedder is trained through forward- and
    inverse-dynamics losses (see ``get_loss``).
    """

    def __init__(self, states_size, actions_size, hidden_size, embedding_size, hash_functions, beta):
        super().__init__()

        # The forward model is built for an action input of width 1.
        self.forward_model = ForwardDynamics(embedding_size, actions_size=1, hidden_size=hidden_size)
        self.inverse_model = InverseDynamics(embedding_size, actions_size, hidden_size, alpha=0.1)
        self.embedder = Embedder(states_size, embedding_size, hidden_size)
        # Counts are kept over raw states, not embeddings.
        self.count_model = CountBasedModule(hash_functions, states_size, beta)

        self.n = 1
        self.mean_reward = 0
        self.actions_size = actions_size

    def get_intrinsic_reward(self, state, action, next_state):
        """Return ``count_bonus * mean((phi(next_state) - phi(state))**2)``.

        Calling this also registers a visit in the internal count model.
        """
        visit_bonus = self.count_model.get_intrinsic_reward(state, action, next_state)

        with torch.no_grad():
            phi_s = self.embedder.forward(state)
            phi_next_s = self.embedder.forward(next_state)
            embedding_change = (phi_next_s - phi_s).pow(2).mean()
            return visit_bonus * embedding_change

    def get_loss(self, state_batch, action_batch, next_state_batch):
        """Sum of the forward-model and inverse-model losses on embedded states.

        The forward model sees detached embeddings, so only the inverse loss
        propagates gradients into the embedder.
        """
        phi_s = self.embedder(state_batch)
        phi_next_s = self.embedder(next_state_batch)

        forward_loss = self.forward_model.get_loss(phi_s.detach(), action_batch, phi_next_s.detach())
        inverse_loss = self.inverse_model.get_loss(phi_s, action_batch, phi_next_s)
        return forward_loss + inverse_loss
    

In [None]:
# Train `num_agents` independently seeded Q-learning agents on DeepSea with the
# RIDE intrinsic reward, then save the collected statistics.
num_agents = 3
ride_all_agent_percentages = []
ride_all_states = []

for i in range(num_agents):
    # Seed every RNG source with the agent index.
    seed = i
    random.seed(seed)
    np.random.seed(seed)
    os.environ['PYTHONHASHSEED'] = str(seed)
    torch.manual_seed(seed)
    torch.cuda.manual_seed(seed)
    torch.backends.cudnn.deterministic = True
    env = gym.make("bsuite/deep_sea-v0", size=24, seed=42)

    # Hyperparameters for this run.
    epsilon = 0.1
    time_num_episodes = 5000
    test_reward_period = 100
    hash_functions = 32
    beta = 1
    states_size = np.prod(env.observation_space.shape)

    agent = QLearningAgent(
        alpha=0.5,
        epsilon=epsilon,
        discount=0.9,
        get_legal_actions=lambda s: range(env.action_space.n),
        item=True,
    )

    ride = RIDEModule(
        states_size=states_size,
        actions_size=env.action_space.n,
        hidden_size=16,
        embedding_size=32,
        hash_functions=hash_functions,
        beta=beta,
    )

    (_,
     ride_states,
     ride_test_rewards,
     ride_ints,
     ride_percentages) = train_with_reward(
        env,
        agent,
        seed,
        ride,
        intrinsic_weight=1,
        n_episodes=time_num_episodes,
        test_reward_period=test_reward_period,
        update_reward_period=100,
        batch_size=150,
        n_iter=50,
        std_dev=0.0,
        annealing=True,
    )
    ride_all_agent_percentages.append(ride_percentages)
    ride_all_states.append(ride_states)

# Persist results: percentages for all agents, visited states of the first agent.
save_dir = 'saved_deepsea_results/'
os.makedirs(save_dir, exist_ok=True)

precentages_path = f"{save_dir}ride_percentages_{env.observation_space.shape[0]}.npy"
np.save(precentages_path, ride_all_agent_percentages)

state_visits_path = f"{save_dir}ride_states_{env.observation_space.shape[0]}.npy"
np.save(state_visits_path, ride_all_states[0])

# LBS

In [None]:
class VariationalLinear(nn.Module):
    """Pair of linear heads producing the mean and scale of a Gaussian.

    Args:
        num_inputs: width of the input features.
        num_outputs: width of both the mean and the scale outputs.
        reparam_noise: constant added to the scale so it stays strictly positive.
    """

    def __init__(self, num_inputs, num_outputs, reparam_noise=1e-4):
        super().__init__()
        self.mu = nn.Linear(num_inputs, num_outputs)
        self.sigma = nn.Linear(num_inputs, num_outputs)
        self.reparam_noise = reparam_noise

    def forward(self, x):
        """Return ``(mean, scale)`` where ``scale = softplus(sigma(x)) + reparam_noise``."""
        mean = self.mu(x)
        scale = F.softplus(self.sigma(x)) + self.reparam_noise
        return mean, scale

class LBS_Reward(BaseIntrinsicRewardModule):
    """Intrinsic reward given by the KL divergence between a posterior and a prior.

    The prior is predicted from (state, action); the posterior additionally
    sees the next state. Both are diagonal Gaussians over a vector of size
    ``state_size``. ``D`` (``torch.distributions``) is looked up at call time,
    so it must be imported before any method of this class is invoked.
    """

    def __init__(self,action_size, state_size, hidden_size):
        super().__init__()

        # Deterministic features of (state, action).
        self.trans_det = nn.Sequential(
            nn.Linear(state_size + action_size, hidden_size),
            nn.ReLU(),
            nn.Linear(hidden_size, hidden_size),
            nn.ReLU(),
        )

        # Prior head: deterministic features -> (mu, sigma).
        self.trans_stoc = VariationalLinear(hidden_size, state_size)

        # Posterior network: [deterministic features, next state] -> (mu, sigma).
        self.repr_model = nn.Sequential(
            nn.Linear(hidden_size + state_size, hidden_size),
            nn.ReLU(),
            nn.Linear(hidden_size, hidden_size),
            nn.ReLU(),
            VariationalLinear(hidden_size, state_size),
        )

        self.beta = 1
        # Projects posterior samples back to the deterministic feature space.
        self.head = nn.Linear(state_size, hidden_size)

    @staticmethod
    def _diag_normal(mu, sigma):
        """Diagonal Gaussian whose last dimension is treated as the event dimension."""
        return D.independent.Independent(D.Normal(mu, sigma), 1)

    def forward(self, state, action, next_state):
        """Return ``(deterministic_features, prior_distribution, posterior_distribution)``."""
        # Prior: conditioned on the current state and action only.
        det_features = self.trans_det(torch.cat([state, action], dim=-1).float())
        prior_mu, prior_sigma = self.trans_stoc(det_features)
        prior = self._diag_normal(prior_mu, prior_sigma)

        # Posterior: additionally conditioned on the observed next state.
        post_mu, post_sigma = self.repr_model(torch.cat([det_features, next_state], dim=-1))
        posterior = self._diag_normal(post_mu, post_sigma)

        return det_features, prior, posterior

    def get_intrinsic_reward(self, state, action, next_state):
        """Return KL(posterior || prior) for one transition as a NumPy value.

        States are flattened to 1-D and the leading dimension of the action is
        squeezed before the forward pass.
        """
        flat_state = torch.flatten(state, start_dim=0)
        action_tensor = torch.tensor(action).squeeze(0)
        flat_next_state = torch.flatten(next_state, start_dim=0)

        _, prior, posterior = self.forward(flat_state, action_tensor, flat_next_state)

        return D.kl.kl_divergence(posterior, prior).detach().numpy()

    def get_loss(self, state_batch, action_batch, next_state_batch):
        """Batch mean of ``beta * KL(posterior || prior) - log p(head(z) | target)``.

        ``z`` is a reparameterised posterior sample; the target is a
        unit-variance Gaussian centred on the deterministic features.
        """
        det_features, prior, posterior = self.forward(state_batch, action_batch, next_state_batch)
        posterior_sample = posterior.rsample()

        target = self._diag_normal(det_features, torch.ones_like(det_features))
        log_likelihood = target.log_prob(self.head(posterior_sample))

        kl_post_prior = D.kl.kl_divergence(posterior, prior)

        return (self.beta * kl_post_prior - log_likelihood).mean()

In [None]:
import torch.distributions as D

# Train `num_agents` independently seeded Q-learning agents on DeepSea with the
# LBS intrinsic reward, then save the collected statistics.
num_agents = 3
lbs_all_agent_percentages = []
lbs_all_states = []

for i in range(num_agents):
    # Seed every RNG source with the agent index.
    seed = i
    random.seed(seed)
    np.random.seed(seed)
    os.environ['PYTHONHASHSEED'] = str(seed)
    torch.manual_seed(seed)
    torch.cuda.manual_seed(seed)
    torch.backends.cudnn.deterministic = True
    env = gym.make("bsuite/deep_sea-v0", size=24, seed=42)

    # Hyperparameters for this run.
    epsilon = 0.1
    time_num_episodes = 5000
    test_reward_period = 100
    states_size = np.prod(env.observation_space.shape)

    agent = QLearningAgent(
        alpha=0.5,
        epsilon=epsilon,
        discount=0.9,
        get_legal_actions=lambda s: range(env.action_space.n),
        item=True,
    )

    # The action enters the model as a single scalar feature.
    lbs = LBS_Reward(action_size=1, state_size=states_size, hidden_size=16)

    (lbs_rewards,
     lbs_states,
     lbs_test_rewards,
     lbs_ints,
     lbs_percentages) = train_with_reward(
        env,
        agent,
        seed,
        lbs,
        intrinsic_weight=1,
        n_episodes=time_num_episodes,
        test_reward_period=test_reward_period,
        update_reward_period=100,
        batch_size=150,
        n_iter=50,
        std_dev=0.0,
        annealing=True,
    )
    lbs_all_agent_percentages.append(lbs_percentages)
    lbs_all_states.append(lbs_states)

# Persist results: percentages for all agents, visited states of the first agent.
save_dir = 'saved_deepsea_results/'
os.makedirs(save_dir, exist_ok=True)

precentages_path = f"{save_dir}lbs_percentages_{env.observation_space.shape[0]}.npy"
np.save(precentages_path, lbs_all_agent_percentages)

state_visits_path = f"{save_dir}lbs_states_{env.observation_space.shape[0]}.npy"
np.save(state_visits_path, lbs_all_states[0])

# Surprise

In [None]:
class Embedder(nn.Module):
    """State encoder used by the surprisal modules: a thin wrapper around ``MLP``.

    NOTE(review): this redefines the ``Embedder`` class declared earlier in
    the notebook (RIDE section) and passes the sizes to ``MLP`` in a different
    order — ``(states_size, embedding_size, hidden_size)`` here. After this
    cell runs, the earlier definition is shadowed.
    """

    def __init__(self, states_size, embedding_size, hidden_size):
        super().__init__()
        self.module = MLP(states_size, embedding_size, hidden_size)

    def forward(self, s):
        """Return the wrapped network's output for the state tensor ``s``."""
        encoded = self.module(s)
        return encoded

class GaussianForwardDynamics(nn.Module):
    """Forward model that outputs the parameters of a diagonal Gaussian.

    Args:
        encoding_dim: width of the latent state input (and of the hidden layer).
        action_size: width of the action input.
        latent_dim: width of the predicted mean and log-variance.
    """

    def __init__(self, encoding_dim, action_size, latent_dim):
        super().__init__()
        self.fc = nn.Linear(encoding_dim + action_size, encoding_dim)
        self.fc_mu = nn.Linear(encoding_dim, latent_dim)
        self.fc_log_var = nn.Linear(encoding_dim, latent_dim)

    def forward(self, latent_state, action):
        """Return ``(mu, log_var)`` predicted from the concatenated inputs."""
        joint_input = torch.cat([latent_state, action], dim=-1)
        hidden = F.gelu(self.fc(joint_input))
        return self.fc_mu(hidden), self.fc_log_var(hidden)


class SurprisalModule(BaseIntrinsicRewardModule):
    """Intrinsic reward equal to the negative log-likelihood of the next state.

    States are embedded with ``Embedder``; a ``GaussianForwardDynamics`` model
    predicts a diagonal Gaussian over the next embedding, and the surprisal is
    ``-log p(phi(next_state))`` under that Gaussian.

    Args:
        input_size: width of a flattened state.
        action_size: accepted for interface compatibility; the forward model
            is built for an action input of width 1 regardless of this value.
        linear_size: second size argument forwarded to ``Embedder``.
        hidden_size: third size argument forwarded to ``Embedder``; also the
            input and output width of the forward model.
    """

    def __init__(self, input_size, action_size, linear_size, hidden_size, *args, **kwargs):
        super().__init__()

        self.embedder = Embedder(input_size,
                                 linear_size,
                                 hidden_size)

        self.forward_model = GaussianForwardDynamics(hidden_size,
                                                     1,
                                                     hidden_size)

        self.n = 1
        self.eta0 = 0

    def get_loss_vec(self, state_batch, action_batch, next_state_batch):
        """Return the per-sample negative log-likelihood of the next embedding."""
        phi_s_batch = self.embedder(state_batch)

        # The target embedding is treated as a constant (no gradient).
        with torch.no_grad():
            phi_next_s_batch = self.embedder(next_state_batch)

        mu, log_var = self.forward_model.forward(phi_s_batch, action_batch)
        dist = torch.distributions.MultivariateNormal(mu, torch.diag_embed(torch.exp(log_var)))
        loss = -dist.log_prob(phi_next_s_batch)
        return loss

    def get_intrinsic_reward(self, state, action, next_state):
        """Return the (normalised) surprisal of the first transition in the batch."""
        with torch.no_grad():
            loss_vec = self.get_loss_vec(state, action, next_state)
            intrinsic_reward = self.normalise_reward(loss_vec)
        return intrinsic_reward[0]

    def get_loss(self, state_batch, action_batch, next_state_batch, *args, **kwargs):
        """Return the batch-mean negative log-likelihood (training loss)."""
        loss_vec = self.get_loss_vec(state_batch, action_batch, next_state_batch)
        loss = torch.mean(loss_vec)
        return loss

    def normalise_reward(self, rewards_batch):
        """Scale a batch of rewards by ``max(1, |mean|)``.

        Implements ``(r - min(0, m)) / max(1, m)`` with ``m = |mean(r)|``.
        A batch with a single element is returned unchanged.

        The previous implementation called ``torch.min`` / ``torch.max`` on
        Python lists and reshaped the scalar mean to ``(batch, 1)``, both of
        which raise for any batch larger than one.
        """
        if rewards_batch.shape[0] == 1:
            return rewards_batch

        mean_rewards = torch.abs(torch.mean(rewards_batch))
        offset = torch.clamp(mean_rewards, max=0)  # min(0, m)
        scale = torch.clamp(mean_rewards, min=1)   # max(1, m)
        norm_rewards = (rewards_batch - offset) / scale
        return norm_rewards

In [None]:
num_agents = 3
surp_all_agent_percentages = []
surp_all_states = []

for i in range(num_agents):
    seed = i
    random.seed(seed)
    np.random.seed(seed)
    os.environ['PYTHONHASHSEED'] = str(seed)
    torch.manual_seed(seed)
    torch.cuda.manual_seed(seed)
    torch.backends.cudnn.deterministic = True
    env = gym.make("bsuite/deep_sea-v0", size=24, seed=42)

    epsilon = 0.1
    time_num_episodes = 5000        
    test_reward_period = 100
    states_size = np.prod(env.observation_space.shape)
    agent = QLearningAgent(alpha    = 0.5, 
                           epsilon  = epsilon,
                           discount = 0.9,
                           get_legal_actions=lambda s: range(env.action_space.n),
                           item = True)
    
    surprisal =  SurprisalModule(input_size = np.prod(env.observation_space.shape),  
                                 action_size = env.action_space.n, 
                                 linear_size = 32, 
                                 hidden_size = 16)
        
    _, surp_states, surp_test_rewards, surp_ints, surp_percentages = train_with_reward(env,
                                                                        agent,
                                                                        seed,
                                                                        surprisal,
                                                                        intrinsic_weight = 1,
                                                                        n_episodes=time_num_episodes,
                                                                        test_reward_period = test_reward_period,
                                                                        update_reward_period = 100,
                                                                        batch_size =  150,
                                                                        n_iter =  50,
                                                                        std_dev = 0.0,
                                                                        annealing = True)
    surp_all_agent_percentages.append(surp_percentages)
    surp_all_states.append(surp_states)
        
save_dir = f'saved_deepsea_results/'
precentages_path = f"{save_dir}surp_percentages_{env.observation_space.shape[0]}.npy"
os.makedirs(save_dir, exist_ok=True)
np.save(precentages_path, surp_all_agent_percentages)

save_dir = f'saved_deepsea_results/'
state_visits_path = f"{save_dir}surp_states_{env.observation_space.shape[0]}.npy"
os.makedirs(save_dir, exist_ok=True)
np.save(state_visits_path, surp_all_states[0])

# LP Surprisal

In [None]:
from copy import deepcopy
from collections import deque
class LPSurprisalModule(SurprisalModule):
    """Learning-progress variant of ``SurprisalModule``.

    Keeps up to ``learning_progress_n`` frozen snapshots of the embedder and
    forward model. The intrinsic reward is the normalised difference between
    the surprisal under the current models and under the oldest stored
    snapshot.

    Args:
        input_size, action_size, linear_size, hidden_size: forwarded to
            ``SurprisalModule``.
        learning_progress_n: maximum number of snapshots to keep.
    """

    def __init__(self, input_size, action_size, linear_size, hidden_size, learning_progress_n=1, *args, **kwargs):
        super().__init__(input_size, action_size, linear_size, hidden_size, *args, **kwargs)
        # Bounded queues: appending beyond maxlen drops the oldest snapshot.
        self._old_forward_models = deque(maxlen=learning_progress_n)
        self._old_embedders = deque(maxlen=learning_progress_n)

    def get_loss_vec(self, state_batch, action_batch, next_state_batch, old=False):
        """Return the per-sample negative log-likelihood of the next embedding.

        Args:
            old: when True, evaluate with the oldest stored snapshot instead of
                the current models. Raises ``IndexError`` if no snapshot has
                been stored yet; ``get_intrinsic_reward`` guards against that.
        """
        if old:
            embedder = self._old_embedders[0]
            model = self._old_forward_models[0]
        else:
            embedder = self.embedder
            model = self.forward_model

        phi_s_batch = embedder(state_batch)
        # `torch` is used here; the previous `T` alias is not among the
        # notebook's visible imports.
        with torch.no_grad():
            phi_next_s_batch = embedder(next_state_batch)

        mu, log_var = model.forward(phi_s_batch, action_batch)
        dist = torch.distributions.MultivariateNormal(mu, torch.diag_embed(torch.exp(log_var)))
        loss_vec = -dist.log_prob(phi_next_s_batch)

        return loss_vec

    @torch.no_grad()
    def update_old_models(self):
        """Store frozen (eval-mode) deep copies of the current embedder and forward model."""
        old_model = deepcopy(self.forward_model)
        old_model.eval()

        old_embedder = deepcopy(self.embedder)
        old_embedder.eval()
        self._old_embedders.append(old_embedder)
        self._old_forward_models.append(old_model)

    @torch.no_grad()
    def get_intrinsic_reward(self, state, action, next_state):
        """Return ``normalise(new_loss - old_loss)``, or zeros before any snapshot exists."""
        # Until a snapshot has been stored there is nothing to compare against.
        if len(self._old_forward_models) == 0:
            intrinsic_reward = torch.zeros(state.shape[0],)
            return intrinsic_reward.squeeze()
        else:
            old_loss = self.get_loss_vec(state, action, next_state, old=True)
            new_loss = self.get_loss_vec(state, action, next_state, old=False)
            intrinsic_reward = self.normalise_reward(new_loss - old_loss)
            return intrinsic_reward.squeeze()

In [None]:
# Train `num_agents` independently seeded Q-learning agents on DeepSea with the
# learning-progress surprisal reward, then save the collected statistics.
num_agents = 3
lp_surp_all_agent_percentages = []
lp_surp_all_states = []

for i in range(num_agents):
    # Seed every RNG source with the agent index.
    seed = i
    random.seed(seed)
    np.random.seed(seed)
    os.environ['PYTHONHASHSEED'] = str(seed)
    torch.manual_seed(seed)
    torch.cuda.manual_seed(seed)
    torch.backends.cudnn.deterministic = True
    env = gym.make("bsuite/deep_sea-v0", size=24, seed=42)

    # Hyperparameters for this run.
    epsilon = 0.1
    time_num_episodes = 5000
    test_reward_period = 100
    states_size = np.prod(env.observation_space.shape)
    agent = QLearningAgent(alpha    = 0.5,
                           epsilon  = epsilon,
                           discount = 0.9,
                           get_legal_actions=lambda s: range(env.action_space.n),
                           item = True)

    lp_surprisal = LPSurprisalModule(input_size = np.prod(env.observation_space.shape),
                                     action_size = env.action_space.n,
                                     linear_size = 32,
                                     hidden_size = 16,
                                     learning_progress_n=10)

    # Pass the module built above. The previous code passed `surprisal`, the
    # already-trained module left over from the plain Surprise section, so the
    # learning-progress module was never used.
    _, lp_surp_states, lp_surp_test_rewards, lp_surp_ints, lp_surp_percentages = train_with_reward(
        env,
        agent,
        seed,
        lp_surprisal,
        intrinsic_weight = 1,
        n_episodes = time_num_episodes,
        test_reward_period = test_reward_period,
        update_reward_period = 100,
        batch_size = 150,
        n_iter = 50,
        std_dev = 0.0,
        annealing = True)
    lp_surp_all_agent_percentages.append(lp_surp_percentages)
    lp_surp_all_states.append(lp_surp_states)

# Persist results: percentages for all agents, visited states of the first agent.
save_dir = f'saved_deepsea_results/'
os.makedirs(save_dir, exist_ok=True)

precentages_path = f"{save_dir}lp_surp_percentages_{env.observation_space.shape[0]}.npy"
np.save(precentages_path, lp_surp_all_agent_percentages)

state_visits_path = f"{save_dir}lp_surp_states_{env.observation_space.shape[0]}.npy"
np.save(state_visits_path, lp_surp_all_states[0])

# VAE

In [None]:
from prior_utils import *

In [None]:
# Environment and tabular agent used while collecting random states for the VAE.
env = gym.make("bsuite/deep_sea-v0", size=24, seed=42)

agent = QLearningAgent(
    alpha=0.5,
    epsilon=0.5,
    discount=1,
    get_legal_actions=lambda s: range(env.action_space.n),
    item=False,
)

num_episodes = 1

def get_random_states(env, num_episodes):
    """Roll out a uniformly random policy and collect the observations seen.

    Actions are drawn uniformly from ``(0, 1)``. Every transition is also fed
    to the module-level ``agent`` via ``agent.update``. The observation that
    ends an episode is not recorded, because observations are stored before
    each step is taken.

    Args:
        env: environment exposing ``reset()`` and ``step(action)``.
        num_episodes: number of episodes to roll out.

    Returns:
        A list with one entry per episode, each a list of observations.
    """
    episodes = []
    for _episode in tqdm(range(num_episodes)):
        obs, _ = env.reset()
        visited = []
        finished = False

        while not finished:
            visited.append(obs)
            state_id = get_state_number(obs)
            action = np.random.choice((0,1))
            next_obs, reward, terminated, truncated, _ = env.step(action)
            finished = terminated or truncated
            next_state_id = get_state_number(next_obs)
            agent.update(state_id, action, reward, next_state_id, terminated)
            obs = next_obs

        episodes.append(visited)
    return episodes

# Collect observations from 100 random-policy episodes.
random_states = get_random_states(env, 100)

# Flatten the per-episode lists into a single list of observations.
flat_random_states = [state for episode in random_states for state in episode]

# Stack into one ndarray before converting: building a tensor directly from a
# list of ndarrays is very slow. `[:, None]` inserts a channel axis after the
# batch axis. Assumes all observations share one shape — TODO confirm.
random_states_tensor = torch.tensor(np.array(flat_random_states), dtype=torch.float32)[:, None]

# Report the number of states. The previous code printed len(random_states),
# which is the number of episodes.
print('Total Random States for Training:', len(flat_random_states))
print('Total Random States Tensor Shape:',  random_states_tensor.shape)

trainloader = DataLoader(TensorDataset(random_states_tensor), batch_size = 32, shuffle = True)

In [None]:
class Encoder(nn.Module):
    """Convolutional encoder producing the mean and log-variance of the latent code.

    The width of the two output heads comes from the module-level name
    ``latent_dim``, which must be defined before an instance is created.

    Args:
        image_channels: number of channels of the input images.
        input_shape: stored on the instance; not used inside this class.
    """

    def __init__(self, image_channels, input_shape):
        super().__init__()

        self.ratio = 1.5
        self.input_shape = input_shape

        # Flattened feature size after the two stride-2 convolutions:
        # 32 * 6 * 6 = 1152 for 24x24 inputs (it would be 4608 for 48x48).
        encoding_dim = 1152

        self.enc1 = nn.Conv2d(image_channels, 16, kernel_size=4, stride=2, padding=1)
        self.enc2 = nn.Conv2d(16, 32, kernel_size=4, stride=2, padding=1)

        self.fc_mu = nn.Linear(encoding_dim, latent_dim)
        self.fc_log_var = nn.Linear(encoding_dim, latent_dim)

    def forward(self, x):
        """Return ``(mu, log_var, x_shapes)``.

        ``x_shapes`` is a one-element list holding the shape of the last
        convolutional feature map, so the decoder can undo the flattening.
        """
        features = F.relu(self.enc2(F.relu(self.enc1(x))))
        x_shapes = [features.shape]

        _, channels, height, width = features.shape
        flat = features.view(-1, channels * height * width)

        return self.fc_mu(flat), self.fc_log_var(flat), x_shapes

class Decoder(nn.Module):
    """Transposed-convolution decoder mapping latent codes back to images.

    The input width of the first linear layer comes from the module-level name
    ``latent_dim``, which must be defined before an instance is created.

    Args:
        image_channels: number of channels of the reconstructed images.
        input_shape: stored on the instance; not used inside this class.
    """

    def __init__(self, image_channels, input_shape):
        super().__init__()

        self.ratio = 1.5
        self.input_shape = input_shape

        # Size of the flattened feature map fed to the first transposed conv:
        # 32 * 6 * 6 = 1152 for 24x24 images (it would be 4608 for 48x48).
        decoding_dim = 1152

        self.fc2 = nn.Linear(latent_dim, decoding_dim)
        self.dec1 = nn.ConvTranspose2d(32, 16, kernel_size=4, stride=2, padding=1)
        self.dec2 = nn.ConvTranspose2d(16, image_channels, kernel_size=4, stride=2, padding=1)

    def forward(self, z, x_shapes):
        """Decode ``z`` into an image with values in [0, 1].

        Args:
            z: latent codes.
            x_shapes: list whose first entry is the feature-map shape to
                restore before the transposed convolutions.
        """
        feature_map = self.fc2(z).view(x_shapes[0])
        hidden = F.relu(self.dec1(feature_map))
        return torch.sigmoid(self.dec2(hidden))
    

class ConvVAE(nn.Module):
    """Variational auto-encoder assembled from an encoder, a decoder and a prior.

    Args:
        encoder: module returning ``(mu, log_var, x_shapes)`` for an input batch.
        decoder: module called as ``decoder(z, x_shapes)`` returning a reconstruction.
        prior: object exposing ``log_prob(z)``.
        latent_dim: dimensionality of the latent code (stored on the instance).
    """

    def __init__(self, encoder, decoder, prior, latent_dim):
        super(ConvVAE, self).__init__()

        self.encoder = encoder
        self.decoder = decoder
        self.prior = prior
        self.latent_dim = latent_dim
        # Element-wise binary cross-entropy; reduced explicitly in `forward`.
        self.criterion = nn.BCELoss(reduction='none')

        # Extra learnable parameters; not used by any method of this class.
        self.learned_mu = nn.Parameter(torch.randn(1,2))
        self.learned_log_var = nn.Parameter(torch.randn(1,2))

    def log_prob_encoder(self, x, z):
        """Return the log-density of ``z`` under the encoder's Gaussian for ``x``.

        Delegates to ``log_normal_diag`` (expected to come from ``prior_utils``).
        """
        mu, log_var, _ = self.encoder(x)
        return log_normal_diag(z, mu, log_var, None, 1)

    def reparameterize(self, mu, log_var):
        """Draw ``z = mu + eps * std`` with ``eps ~ N(0, I)``."""
        std = torch.exp(0.5*log_var)
        eps = torch.randn_like(std)
        sample = mu + (eps * std)
        return sample

    def reparameterize_reward(self, mu, log_var):
        """Noise-free counterpart of ``reparameterize`` (``eps = 0``).

        ``eps`` is created with ``zeros_like`` so it matches the shape, dtype
        and device of the inputs. The previous code built a CPU tensor sized
        from a global ``latent_dim``, which fails for GPU inputs and ignores
        the dimension this instance was constructed with.
        """
        std = torch.exp(0.5*log_var)
        eps = torch.zeros_like(std)
        sample = mu + (eps * std)
        return sample

    def forward(self, x):
        """Return the training loss: summed reconstruction BCE plus the KL estimate.

        The KL term is a single-sample estimate
        ``log q(z|x) - log p(z)``, summed over the batch and latent dimensions.
        """
        mu, log_var, x_shapes = self.encoder(x)

        z = self.reparameterize(mu, log_var)
        recon = self.decoder(z, x_shapes)
        recon_loss  = self.criterion(recon, x).sum()
        log_probs_p = self.log_prob_encoder(x, z)
        log_probs_q = self.prior.log_prob(z)
        kld_loss    = (log_probs_p.sum(axis = 0) - log_probs_q.sum(axis = 0)).sum(-1)
        loss = (recon_loss + kld_loss).mean()
        return loss
    
#     def forward(self, x):
#         mu, log_var, x_shapes = self.encoder(x)
#         z = self.reparameterize(mu, log_var)
#         recon = self.decoder(z, x_shapes)
#         recon_loss = self.criterion(recon, x).sum([1,2,3]) 
#         print('recon', recon_loss.shape)

#         log_probs_p = self.log_prob_encoder(x, z) 
#         log_probs_q = self.prior.log_prob(z)
#         kld_loss = (log_probs_p.sum(axis = 0) - log_probs_q.sum(axis = 0)).sum(-1)
#         print('kld', kld_loss.shape)
#         loss = (recon_loss + kld_loss).mean()
#         return loss
    
def plot_loss(all_losses):
    """Plot the per-epoch loss curve on the current matplotlib axes.

    Args:
        all_losses: sequence of loss values, one per epoch.
    """
    ax = plt.gca()
    ax.plot(all_losses)
    ax.set_title('Average Loss')
    ax.set_xlabel('Epoch')
    ax.set_ylabel('Loss')
    ax.grid()

In [None]:
class KLDivergenceReward(BaseIntrinsicRewardModule):
    """Intrinsic reward built from a VAE's encoder-vs-prior log-density gap.

    The wrapped ``vae`` must expose ``encoder``, ``decoder``, ``criterion``,
    ``prior.log_prob``, ``log_prob_encoder``, ``reparameterize`` and
    ``reparameterize_reward``.
    """

    def __init__(self, vae):
        super().__init__()

        self.vae = vae

    def get_intrinsic_reward(self, state, action, next_state):
        """Return the KL-style score of ``next_state`` (no gradients tracked).

        ``state`` and ``action`` are accepted for interface compatibility and
        are not used. ``next_state`` is a flat ``(N, side*side)`` tensor that is
        reshaped to ``(N, 1, side, side)`` before encoding.
        """
        observation_shape = int(np.sqrt(next_state.shape[1]))

        next_state = next_state.view(-1, 1, observation_shape, observation_shape)

        with torch.no_grad():
            mu, log_var, _ = self.vae.encoder(next_state)
            # Noise-free latent so the reward is deterministic for a given state.
            z = self.vae.reparameterize_reward(mu, log_var)

            log_probs_p = self.vae.log_prob_encoder(next_state, z)
            log_probs_q = self.vae.prior.log_prob(z)

            KL = (log_probs_p.sum(axis = 0) - log_probs_q.sum(axis = 0)).sum(-1)

        return KL

    def get_standard_prob(self, range_lim):
        """Evaluate the standard normal pdf on 50 points in ``[-range_lim, range_lim]``.

        Returns a ``(25, 2)`` tensor. The previous expression used ``* 2``
        where the Gaussian density needs ``** 2`` and therefore returned a
        scaled ``exp(-x)`` instead of a normal density.
        """
        sigma = 1
        x = np.linspace(-range_lim, range_lim, num=50)
        density = (1 / np.sqrt(2 * np.pi * sigma ** 2)) * np.exp(-(x ** 2) / (2 * sigma ** 2))
        return torch.tensor(density).view(x.size // 2, 2)

    def get_loss(self, state_batch, action_batch, next_state_batch):
        """Compute the VAE training loss on ``next_state_batch``.

        ``state_batch`` and ``action_batch`` are unused. The VAE is switched to
        train mode for the forward pass and is left in eval mode on return.
        """
        self.vae.train()
        observation_shape = int(np.sqrt(next_state_batch.shape[1]))
        next_state_batch  = next_state_batch.view(-1, 1, observation_shape, observation_shape)

        mu, log_var, x_shapes = self.vae.encoder(next_state_batch)

        z = self.vae.reparameterize(mu, log_var)

        recon = self.vae.decoder(z, x_shapes)

        recon_loss = self.vae.criterion(recon, next_state_batch).sum()

        log_probs_p = self.vae.log_prob_encoder(next_state_batch, z)
        log_probs_q = self.vae.prior.log_prob(z)

        kld_loss = (log_probs_p.sum(axis = 0) - log_probs_q.sum(axis = 0)).sum(-1)
        loss = (recon_loss + kld_loss).mean()

        self.vae.eval()

        return loss

In [None]:
# Seed every RNG this cell relies on. Torch is seeded too because the network
# initialisation and the VAE's sampling both draw from the torch generator.
seed = 0
random.seed(seed)
np.random.seed(seed)
torch.manual_seed(seed)

# Prefer the second GPU when there is one, otherwise fall back to the first GPU
# (a hard-coded 'cuda:1' fails on single-GPU machines) and finally to the CPU.
if torch.cuda.is_available():
    device = torch.device('cuda:1' if torch.cuda.device_count() > 1 else 'cuda:0')
else:
    device = torch.device('cpu')

# NOTE: `env` must already exist from an earlier cell.
rows = env.observation_space.shape[0]
cols = env.observation_space.shape[1]

# Hyperparameters of the VAE and of its prior.
lr = 1e-3
M  = 256
D  = (1,rows, cols)
num_vals   = 1
latent_dim = 2
batch_size = 32
image_channels = 1
input_shape  = (3,rows, cols)

prior_name = 'vampprior'

encoder = Encoder(image_channels, input_shape)
decoder = Decoder(image_channels, input_shape)
prior = choose_prior(prior_name, latent_dim, M, D, num_vals, encoder, image_channels, device)
vae   = ConvVAE(encoder, decoder, prior, latent_dim).to(device)
optimizer = torch.optim.Adam(vae.parameters(), lr=lr)
# Halve the learning rate every 10 scheduler steps.
scheduler = torch.optim.lr_scheduler.StepLR(optimizer, step_size=10, gamma=0.5)

def VAE_train(x, model):
    """Run one optimisation step of ``model`` on batch ``x`` and return the loss.

    Uses the notebook-level ``device`` and ``optimizer``; ``model(x)`` is
    expected to return the scalar loss directly.
    """
    batch = x.to(device)

    model.zero_grad()
    batch_loss = model(batch)
    batch_loss.backward()
    optimizer.step()

    return batch_loss.data.item()

epochs = 20
vae.train()
all_losses = []

for epoch in tqdm(range(1, epochs+1)):
    VAE_losses = []
    for batch in trainloader:
        # Each batch is a tuple; its first element holds the inputs.
        VAE_losses.append(VAE_train(batch[0], vae))

    scheduler.step()
    # Compute the epoch mean once and keep it as a plain float for plotting.
    epoch_loss = torch.mean(torch.FloatTensor(VAE_losses)).item()
    print('[%d/%d]: loss_VAE: %.3f' % ((epoch), epochs, epoch_loss))
    all_losses.append(epoch_loss)
plot_loss(all_losses)

# torch.save does not create missing directories, so make sure it exists first.
os.makedirs('deepsea_vae_models', exist_ok=True)
torch.save(vae.state_dict(), f'deepsea_vae_models/pre_trained_vae_{prior_name}.pth')

In [None]:
num_agents = 3
prior_name = 'vampprior'

# Results are collected in notebook globals named after the prior, e.g.
# `vampprior_all_agent_percentages`, so other cells can refer to them by name.
globals()[f"{prior_name}_all_agent_percentages"] = []
globals()[f"{prior_name}_all_states"] = []

for i in range(num_agents):
    # One independent run per seed; seed every RNG in use.
    seed = i
    random.seed(seed)
    np.random.seed(seed)
    os.environ['PYTHONHASHSEED'] = str(seed)
    torch.manual_seed(seed)
    torch.cuda.manual_seed(seed)
    torch.backends.cudnn.deterministic = True

    env = gym.make("bsuite/deep_sea-v0", size=24, seed=42)

    epsilon = 0.1
    time_num_episodes = 5000
    test_reward_period = 100
    states_size = np.prod(env.observation_space.shape)
    agent = QLearningAgent(alpha    = 0.5,
                           epsilon  = epsilon,
                           discount = 0.9,
                           get_legal_actions=lambda s: range(env.action_space.n),
                           item = True)

    # Rebuild the VAE and restore the pre-trained weights for every agent.
    encoder = Encoder(image_channels, input_shape)
    decoder = Decoder(image_channels, input_shape)
    prior   = choose_prior(prior_name, latent_dim, M, D, num_vals, encoder, image_channels, device)
    vae     = ConvVAE(encoder, decoder, prior, latent_dim).to(device)
    # map_location lets a checkpoint saved on a GPU load on whatever `device` is now.
    vae.load_state_dict(torch.load(f'deepsea_vae_models/pre_trained_vae_{prior_name}.pth',
                                   map_location=device))

    kl_divergence = KLDivergenceReward(vae)
    _, states, _, _,  percentages  = train_with_reward(env,
                                                        agent,
                                                        seed,
                                                        kl_divergence,
                                                        intrinsic_weight = 1,
                                                        n_episodes=time_num_episodes,
                                                        test_reward_period = test_reward_period,
                                                        update_reward_period = 100,
                                                        batch_size =  150,
                                                        n_iter =  50,
                                                        std_dev = 0.0,
                                                        annealing = True)

    globals()[f"{prior_name}_all_agent_percentages"].append(percentages)
    globals()[f"{prior_name}_all_states"].append(states)

save_dir = f'saved_deepsea_results/'
os.makedirs(save_dir, exist_ok=True)

precentages_path = f"{save_dir}{prior_name}_percentages_{env.observation_space.shape[0]}.npy"
np.save(precentages_path,  globals()[f"{prior_name}_all_agent_percentages"])

# Only the first agent's state visits are persisted.
state_visits_path = f"{save_dir}{prior_name}_states_{env.observation_space.shape[0]}.npy"
np.save(state_visits_path, globals()[f"{prior_name}_all_states"][0])


# Coin Flip

In [None]:
def layer_init(layer):
    """Initialise a layer in place; intended for ``module.apply(layer_init)``.

    * ``nn.Linear``: orthogonal weights with gain ``sqrt(2)``.
    * ``nn.Conv2d``: Xavier-uniform weights.
    * Any other module is left untouched.

    Biases are zeroed when present; layers created with ``bias=False`` are
    handled without error.
    """
    if isinstance(layer, nn.Linear):
        nn.init.orthogonal_(layer.weight, np.sqrt(2))
    elif isinstance(layer, nn.Conv2d):
        nn.init.xavier_uniform_(layer.weight)
    else:
        return

    if layer.bias is not None:
        nn.init.constant_(layer.bias, 0)
        
class CoinFlipNetwork(nn.Module):
    """Trainable MLP ``net`` paired with an identically shaped ``prior_net``.

    ``forward`` returns the output of ``net`` and the output of ``prior_net``
    rescaled with running statistics that are accumulated across training
    batches. ``prior_net`` is kept in eval mode at all times.
    """

    def __init__(self, input_size:int, fc1:int, fc2:int, d:int=20):
        super().__init__()
        self.net = nn.Sequential(nn.Linear(input_size, fc1),
                                 nn.ReLU(),
                                 nn.Linear(fc1, fc2),
                                 nn.ReLU(),
                                 nn.Linear(fc2, d))

        self.net.apply(layer_init)
        self.prior_net = nn.Sequential(nn.Linear(input_size, fc1),
                                       nn.ReLU(),
                                       nn.Linear(fc1, fc2),
                                       nn.ReLU(),
                                       nn.Linear(fc2, d))

        self.prior_net.apply(layer_init)
        self.prior_net.eval()
        self.d = d
        # Running statistics of the prior outputs; prior_n counts samples seen.
        self.prior_mean = torch.zeros((1, self.d)).float()
        self.prior_var = torch.ones((1, self.d)).float()
        self.prior_n = 0

    @torch.no_grad()
    def update_prior_mean_var(self, prior):
        """Merge the mean/variance of batch ``prior`` into the running statistics."""
        batch_size = prior.shape[0]
        new_data_mean = prior.mean(axis=0)

        diff = new_data_mean - self.prior_mean
        total_count = (self.prior_n + batch_size)

        self.prior_mean = self.prior_mean + diff * batch_size / total_count

        # Combine the two groups' variances, plus a term for the shift in means.
        new_var = (self.prior_var * self.prior_n) + torch.var(prior, axis=0) * batch_size
        new_var += torch.pow(diff, 2) * self.prior_n * batch_size / total_count
        new_var /= total_count

        self.prior_n = total_count
        self.prior_var = new_var

    def forward(self, x, update_prior_stats=True):
        """Return ``(net(x), scaled prior_net(x))``.

        The prior output is returned unscaled until at least one batch of
        statistics has been recorded. Statistics are updated only when the
        module is in training mode and ``update_prior_stats`` is true; the
        update happens after scaling, so the current batch is scaled with the
        statistics of earlier batches.
        """
        f_x = self.net(x)
        prior_x = self.prior_net(x)

        if self.prior_n > 0:
            # NOTE(review): this divides by the running variance, not its
            # square root -- confirm that is the intended scaling.
            prior_x_scaled = (prior_x - self.prior_mean) / self.prior_var
        else:
            prior_x_scaled = prior_x

        if self.training and update_prior_stats:
            self.update_prior_mean_var(prior_x)

        return f_x, prior_x_scaled

    def train(self, mode: bool = True):
        """Set the training mode of ``net``; ``prior_net`` always stays in eval mode.

        Follows the ``nn.Module.train`` contract: accepts ``mode``, updates
        ``self.training`` (which ``forward`` checks) and returns ``self``, so
        the module also works as a child of another module.
        """
        self.training = mode
        self.net.train(mode)
        self.prior_net.eval()
        return self

    def eval(self):
        """Switch to evaluation mode and return ``self``."""
        return self.train(False)
        
class CoinFlipBuffer:
    """Fixed-size ring buffer of observations and their coin-flip vectors.

    Sampling is either proportional to per-slot priorities (``prioritise=True``)
    or uniform over the slots written so far.
    """

    def __init__(self, size:int, obs_shape, d:int=20, alpha:float = 0.5, prioritise:bool=True):
        self.obs_shape = obs_shape
        self.d = d
        self.size=size
        self.alpha=alpha
        self._init_buffers()
        self.prioritise=prioritise

    def _init_buffers(self):
        """Allocate empty storage and reset the write cursor."""
        self.obs_buffer = np.zeros((self.size, *self.obs_shape), dtype=np.float32)
        self.coin_vec_buffer = np.zeros((self.size, self.d), dtype=np.float32)
        self.n_updates = np.zeros(self.size, dtype=np.int32)
        self.priorities = np.zeros(self.size, dtype=np.float32)
        # Index of the most recently written slot; -1 means nothing stored yet.
        self._next_idx = -1
        self.is_full=False

    def add(self, obs_t, coin_vector, *args, **kwargs):
        """Store one observation and its coin vector, overwriting the oldest slot when full.

        Extra positional and keyword arguments are accepted and ignored.
        """
        self._next_idx += 1
        if self._next_idx == self.size-1:
            self.is_full = True
        elif self._next_idx == self.size:
            self._next_idx = 0

        # np.asarray avoids a copy when possible but, unlike
        # np.array(..., copy=False) on NumPy 2, does not raise when one is needed.
        obs = np.atleast_2d(np.asarray(obs_t))
        c = np.atleast_2d(np.array(coin_vector, dtype=np.int8))
        self.obs_buffer[self._next_idx] = obs
        self.coin_vec_buffer[self._next_idx] = c
        self.n_updates[self._next_idx] = 1
        self.priorities[self._next_idx] = 1.

    def sample(self, batch_size=128):
        """Draw ``batch_size`` stored entries (with replacement).

        Returns ``(observations, coin_vectors)``. The sampled indices are
        remembered for the next ``update_priorities`` call.

        Raises:
            ValueError: if nothing has been stored yet.
        """
        # Slots 0.._next_idx inclusive hold data until the buffer wraps around.
        n_stored = self.size if self.is_full else self._next_idx + 1
        if n_stored <= 0:
            raise ValueError("cannot sample from an empty CoinFlipBuffer")

        if self.prioritise:
            # Unwritten slots have priority 0 and are therefore never drawn.
            p = self.priorities / np.sum(self.priorities)
            batch_idcs = np.random.choice(self.size, size=batch_size, p=p)
        else:
            batch_idcs = np.random.choice(n_stored, size=batch_size)

        self._last_batch_idcs = batch_idcs

        batch_obs = self.obs_buffer[batch_idcs]
        batch_c = self.coin_vec_buffer[batch_idcs]

        return batch_obs, batch_c

    def update_priorities(self, f_batch):
        """Refresh the priorities of the most recently sampled entries.

        ``f_batch`` is the ``(batch, d)`` network output for that batch. The
        new priority mixes ``alpha / (1 + n_updates)`` with
        ``(1 - alpha) * mean(f ** 2)``.
        """
        assert f_batch.shape[1] == self.d
        # Keep the batch axis even for a batch of one (no squeeze).
        f_sq = f_batch.detach().cpu().numpy()**2
        inverse_counts = f_sq.sum(axis=1) / self.d
        self.n_updates[self._last_batch_idcs] = self.n_updates[self._last_batch_idcs] + 1

        self.priorities[self._last_batch_idcs] = self.alpha/(1+self.n_updates[self._last_batch_idcs]) + (1-self.alpha)*inverse_counts
        
        
class CoinFlipReward(BaseIntrinsicRewardModule):
    """Intrinsic-reward module backed by a ``CoinFlipNetwork`` and a replay buffer."""

    def __init__(self, cfn: CoinFlipNetwork, buffer:CoinFlipBuffer, batch_size:int, d:int=20, bonus_exponent:float=0.5):
        super().__init__()
        self.cfn = cfn
        self.buffer = buffer
        self.batch_size = batch_size
        self.d = d
        self.bonus_exponent = bonus_exponent

    def flip_coins(self):
        """Return ``d`` independent fair coin flips encoded as -1 / +1."""
        heads = np.random.binomial(1, 0.5, size=self.d)
        return 2*heads - 1

    @torch.no_grad()
    def get_intrinsic_reward(self, state, action, next_state):
        """Return the bonus for the first row of ``next_state``.

        The bonus is ``mean((f + prior) ** 2)`` over the output dimension,
        raised to ``bonus_exponent``. ``state`` and ``action`` are unused.
        """
        # Prior statistics are not updated while computing rewards.
        net_out, prior_out = self.cfn(next_state, False) # (batch_size, d)

        combined = net_out + prior_out
        mean_square = torch.mean(torch.pow(combined, 2), axis=-1)

        bonus = mean_square ** self.bonus_exponent
        return bonus[0]

    def store(self, state, *args, **kwargs):
        """Add ``state`` to the buffer together with a fresh coin-flip vector."""
        self.buffer.add(state, self.flip_coins())

    def get_loss(self, *args, **kwargs):
        """Sample a batch from the buffer and return the MSE regression loss.

        The network output for the batch is also used to refresh the buffer's
        priorities. All arguments are ignored.
        """
        sampled_states, sampled_coins = self.buffer.sample(self.batch_size)
        flat_states = torch.flatten(torch.tensor(sampled_states).float(), 1)
        f_x, _ = self.cfn(flat_states, True)

        with torch.no_grad():
            targets = torch.tensor(sampled_coins).float()
            self.buffer.update_priorities(f_x)

        assert f_x.shape == targets.shape
        return torch.pow(f_x - targets, 2).mean()

In [None]:
num_agents = 3
coin_flip_all_agent_percentages = []
coin_flip_all_states = []

for i in range(num_agents):
    # One independent run per seed; torch is seeded too because the network
    # initialisation draws from the torch generator.
    seed = i
    random.seed(seed)
    np.random.seed(seed)
    torch.manual_seed(seed)

    env = gym.make("bsuite/deep_sea-v0", size=24, seed=42)

    epsilon = 0.1
    time_num_episodes = 5000
    test_reward_period = 100
    hash_functions = 32
    beta = 1
    states_size = np.prod(env.observation_space.shape)
    agent = QLearningAgent(alpha    = 0.5,
                           epsilon  = epsilon,
                           discount = 0.9,
                           get_legal_actions=lambda s: range(env.action_space.n),
                           item = True)

    batch_size = 128
    d = 20
    bonus_exponent = 0.5
    # TODO(review): buffer capacity was never specified in the original cell
    # (the class itself was passed instead of an instance); tune as needed.
    buffer_size = 10_000

    cfn    = CoinFlipNetwork(np.prod(env.observation_space.shape), 288, 288, d)
    # The reward module calls buffer.add/sample, so it needs an instance.
    # Assumes stored states have the environment's observation shape -- verify
    # against what train_with_reward passes to `store`.
    buffer = CoinFlipBuffer(size=buffer_size, obs_shape=env.observation_space.shape, d=d)

    coin_flip = CoinFlipReward(cfn, buffer, batch_size, d, bonus_exponent)

    # NOTE(review): this call omits the `seed` argument and unpacks six values,
    # whereas the VAE experiment cell passes `seed` and unpacks five -- confirm
    # against the current train_with_reward signature.
    _, coin_flip_states, coin_flip_test_rewards, coin_flip_ints, coin_flip_percentages, _ = train_with_reward(env,
                                                                        agent,
                                                                        coin_flip,
                                                                        intrinsic_weight = 1,
                                                                        n_episodes=time_num_episodes,
                                                                        test_reward_period = test_reward_period,
                                                                        update_reward_period = 100,
                                                                        batch_size =  150,
                                                                        n_iter =  50,
                                                                        std_dev = 0.0,
                                                                        annealing = True)

    coin_flip_all_agent_percentages.append(coin_flip_percentages)
    # Previously the percentages were appended here as well, so the saved
    # "states" file never contained state visits.
    coin_flip_all_states.append(coin_flip_states)

save_dir = f'saved_deepsea_results/'
os.makedirs(save_dir, exist_ok=True)

precentages_path = f"{save_dir}coin_flip_count_percentages_{env.observation_space.shape[0]}.npy"
np.save(precentages_path, coin_flip_all_agent_percentages)

# Only the first agent's state visits are persisted.
state_visits_path = f"{save_dir}coin_flip_count_states_{env.observation_space.shape[0]}.npy"
np.save(state_visits_path, coin_flip_all_states[0])


In [None]:
# Smoke test: build the coin-flip reward module and run it on one transition.
batch_size = 128
d = 20
bonus_exponent = 0.5

cfn = CoinFlipNetwork(np.prod(env.observation_space.shape), 288, 288, d)
# The reward module needs a buffer instance, not the CoinFlipBuffer class.
buffer = CoinFlipBuffer(size=1_000, obs_shape=env.observation_space.shape, d=d)

coin_flip = CoinFlipReward(cfn, buffer, batch_size, d, bonus_exponent)

state,_ = env.reset()
action = env.action_space.sample()
# gymnasium's step returns (obs, reward, terminated, truncated, info).
next_state, reward, terminated, truncated, _ = env.step(action)

state_t  = torch.tensor(state).float().view(1, -1)
action_t = torch.tensor(action).float().view(1, -1)
next_state_t = torch.tensor(next_state).float().view(1, -1)

# The buffer must hold at least one entry before get_loss can sample from it.
coin_flip.store(state)

int_rew = coin_flip.get_intrinsic_reward( state_t, action_t, next_state_t)
int_loss = coin_flip.get_loss(state)

# BYOL

In [None]:
class EMA():
    """Exponential moving average helper with decay ``beta``."""

    def __init__(self, beta):
        super().__init__()
        self.beta = beta

    def update_average(self, old, new):
        """Blend ``old`` and ``new`` as ``old * beta + (1 - beta) * new``.

        When there is no previous value (``old is None``) ``new`` is returned
        unchanged.
        """
        if old is not None:
            return old * self.beta + (1 - self.beta) * new
        return new

def update_moving_average(ema_updater, ma_model, current_model):
    """Move every parameter of ``ma_model`` towards ``current_model`` in place.

    Parameters are paired positionally and each averaged parameter's data is
    replaced by ``ema_updater.update_average(averaged, current)``.
    """
    param_pairs = zip(ma_model.parameters(), current_model.parameters())
    for averaged, online in param_pairs:
        averaged.data = ema_updater.update_average(averaged.data, online.data)

In [None]:
class CloseLoopRNNCell(nn.Module):
    """Closed-loop recurrent step built on a single-layer ``nn.RNN``."""

    def __init__(self, input_size, hidden_size):
        super(CloseLoopRNNCell, self).__init__()
        self.hidden_size = hidden_size
        self.rnn_cell = nn.RNN(input_size, hidden_size, batch_first= True)

    def forward(self, x, prev_hidden):
        """Advance the RNN on ``x`` from ``prev_hidden`` and return the new hidden state.

        ``nn.RNN`` returns ``(output, h_n)``; only the final hidden state
        ``h_n`` is returned, so the result is a tensor with the same layout as
        ``prev_hidden`` and can be fed straight back in as the next hidden
        state.
        """
        _, new_hidden = self.rnn_cell(x, prev_hidden)
        return new_hidden

class OpenLoopRNNCell(nn.Module):
    """Open-loop recurrent step built on a single-layer ``nn.RNN``."""

    def __init__(self, input_size, hidden_size):
        super(OpenLoopRNNCell, self).__init__()
        self.hidden_size = hidden_size
        self.rnn_cell = nn.RNN(input_size, hidden_size, batch_first= True)

    def forward(self, x, prev_hidden):
        """Advance the RNN on ``x`` from ``prev_hidden`` and return the new hidden state.

        ``nn.RNN`` returns ``(output, h_n)``; only the final hidden state
        ``h_n`` is returned, so the result is a tensor that can be passed back
        in as ``prev_hidden`` on the next step.
        """
        _, new_hidden = self.rnn_cell(x, prev_hidden)
        return new_hidden

class Predictor(nn.Module):
    """Single linear layer mapping ``input_size`` features to ``output_size``."""

    def __init__(self, input_size, output_size):
        super().__init__()
        self.linear = nn.Linear(input_size, output_size)

    def forward(self, x):
        """Apply the linear projection to ``x``."""
        projected = self.linear(x)
        return projected

class OnlineNetwork(nn.Module):
    """Online network: encoder, closed-loop RNN, open-loop RNN and predictor.

    ``forward`` encodes one observation, runs a closed-loop recurrent step on
    the encoding concatenated with the previous action, then unrolls the
    open-loop RNN for ``open_loop_horizon`` steps and returns one prediction
    per step.
    """

    def __init__(self, state_size, embedding_size, history_size, open_loop_horizon, action_size):
        super(OnlineNetwork, self).__init__()

        self.embedding_size = embedding_size
        self.history_size = history_size
        self.open_loop_horizon = open_loop_horizon

        # Encoder f_theta
        self.encoder = nn.Linear(state_size, embedding_size)

        # Close-loop RNN cell h(c)_theta
        self.close_loop_rnn_cell = CloseLoopRNNCell(embedding_size + action_size, history_size)

        # Open-loop RNN cell h(o)_theta
        # NOTE(review): declared with input size embedding_size + action_size,
        # but forward() feeds it the closed-loop hidden state (history_size
        # wide), so the two sizes must be equal for forward() to run.
        self.open_loop_rnn_cell = OpenLoopRNNCell(embedding_size + action_size, history_size)

        # Predictor g_theta
        self.predictor = Predictor(history_size, embedding_size)

    def forward(self, observation, prev_action):
        """Return a list of ``open_loop_horizon`` predictions.

        Initial hidden states are created with the dtype and device of the
        encoded observation so the module also works after ``.to(device)`` or
        ``.double()``; they were previously always CPU float32.
        """
        # Encoder f_theta
        observation_representation = self.encoder(observation)
        state_kwargs = dict(dtype=observation_representation.dtype,
                            device=observation_representation.device)

        # Close-loop RNN cell
        close_loop_input  = torch.cat((observation_representation, prev_action), dim=1)
        beta = torch.zeros(1, self.history_size, **state_kwargs)  # initial closed-loop hidden state
        close_loop_hidden = self.close_loop_rnn_cell(close_loop_input, beta)

        # Open-loop RNN cell
        open_loop_hidden = torch.zeros(1, self.history_size, **state_kwargs)  # initial open-loop hidden state
        open_loop_predictions = []

        # The same closed-loop hidden state is the input at every open-loop step.
        for _ in range(self.open_loop_horizon):
            open_loop_hidden     = self.open_loop_rnn_cell(close_loop_hidden, open_loop_hidden)
            open_loop_prediction = self.predictor(open_loop_hidden)
            open_loop_predictions.append(open_loop_prediction)
        return open_loop_predictions

In [None]:
# Design sketch only -- this cell is pseudo-code and is not meant to be run:
# `state_1 ... state_T`, `a_1 ... a_T` and `mse` are never defined.
# NOTE(review): executing it would also rebind `F`, which the import cell uses
# as the alias for `torch.nn.functional`.
K = 5 #temporal_window

# Placeholder for the observation encoder.
class F(): #encoder
    ...
    
# Placeholder for the model that predicts future encodings from the first
# encoding and the action sequence.
class G(): 
    ...
    
states = [state_1, state_2, ..., state_T]
actions = [a_1,..., a_T]
f = F(states)

b = G(f[0], actions) # approximation of [F(state_2, state_3,...,state_T)]

# The prediction error between `b` and the later encodings is the intrinsic reward.
L = mse(b, f[1:]) #intrinsic reward



In [None]:
env = gym.make("bsuite/deep_sea-v0", size=12, seed=42)

# hyperparameters
action_size = 1
state_size  = np.prod(env.observation_space.shape)
hidden_size = 16
# Chosen equal to embedding_size + action_size: OnlineNetwork feeds its
# history-sized closed-loop state into an RNN declared with that input size.
history_size = 33
embedding_size = 32
open_loop_horizon = 5

# agent steps
state,_ = env.reset()
action  = env.action_space.sample()
s_next, r, terminated, truncated, _  = env.step(action)
state_t  = torch.tensor(state).float().view(1, -1)
action_t = torch.tensor(action).unsqueeze(0).unsqueeze(0)

online_network = OnlineNetwork(state_size, embedding_size, history_size, open_loop_horizon, action_size)
# NOTE(review): TargetNetwork is not defined in the visible part of the
# notebook -- make sure its definition is executed before this cell.
target_network = TargetNetwork(online_network, alpha=0.01)

# Update target network parameters
target_network.update_target_network()
# Use the tensors prepared above; `observation` and `prev_action` were never
# defined at this point in the notebook.
target_predictions = target_network.forward(state_t, action_t)

In [None]:
import copy
env = gym.make("bsuite/deep_sea-v0", size=12, seed=42)

# hyperparameters
action_size = 1
state_size  = np.prod(env.observation_space.shape)
hidden_size = 16
# Chosen equal to embedding_size + action_size: OnlineNetwork feeds its
# history-sized closed-loop state into an RNN declared with that input size.
history_size = 33
embedding_size = 32
open_loop_horizon = 3

# agent steps
state,_ = env.reset()
action  = env.action_space.sample()
s_next, r, terminated, truncated, _  = env.step(action)
state_t  = torch.tensor(state).float().view(1, -1)
action_t = torch.tensor(action).unsqueeze(0).unsqueeze(0)

online_network = OnlineNetwork(state_size, embedding_size, history_size, open_loop_horizon, action_size)
online_predictions = online_network(state_t, action_t)
online_predictions = torch.cat(online_predictions)

# L2-normalise each prediction to unit length. The previous expression,
# x / ((x**2)*0.5), evaluates element-wise to 2/x and is not a normalisation.
# The fully qualified name is used because the alias `F` may have been rebound
# by the pseudo-code cell.
norm_online_predictions = torch.nn.functional.normalize(online_predictions, p=2, dim=-1)

# The target network starts as a frozen copy of the online network.
target_network = copy.deepcopy(online_network)
target_projections = target_network.encoder(state_t)
norm_target_projections = torch.nn.functional.normalize(target_projections, p=2, dim=-1)

# Element-wise squared error; the single target row broadcasts across all
# open-loop predictions and no gradient flows into the target.
loss = (norm_online_predictions - norm_target_projections.detach())**2


In [None]:

# Example usage
# OnlineNetwork takes state_size as its first argument; it was missing here.
online_network = OnlineNetwork(state_size, embedding_size, history_size, open_loop_horizon, action_size)
# NOTE(review): TargetNetwork is not defined in the visible part of the
# notebook -- make sure its definition is executed before this cell.
target_network = TargetNetwork(online_network, alpha=0.01)

# Update target network parameters
target_network.update_target_network()

# Use target network for inference on a random observation and a zero action.
# The call mirrors the earlier cell, which passes (observation, prev_action).
observation = torch.randn(1, state_size)
prev_action = torch.zeros(1, action_size)
target_predictions = target_network(observation, prev_action)
print(target_predictions)





In [None]:
# Scratch check: encode a single DeepSea observation and print the latent shape.
env = gym.make("bsuite/deep_sea-v0", size=12, seed=42)

# Run-length settings.
time_num_episodes = 5000
test_reward_period = 100

agent = QLearningAgent(
    alpha=0.5,
    epsilon=0.1,
    discount=0.9,
    get_legal_actions=lambda s: range(env.action_space.n),
    item=False,
)

# Sizes taken from the environment, followed by the encoder's own widths.
states_size = np.prod(env.observation_space.shape)
action_size = env.action_space.n
hidden_size = 16
embedding_size = 32

byol = ObeservationEncoder(states_size, action_size, embedding_size, hidden_size)

# Take one random step; only the initial observation is encoded below.
state, _ = env.reset()
action = env.action_space.sample()
s_next, r, terminated, truncated, _ = env.step(action)

state_t = torch.tensor(state).float().view(1, -1)

latent_state = byol.forward(state_t)
print(latent_state.shape)