In [None]:
import numpy as np
import torch 
import torch.nn as nn
import torch.optim as optim
from torch.distributions import Categorical
import torch.nn.functional as F
import gymnasium as gym
import matplotlib.pyplot as plt
import random
from minigrid.wrappers import RGBImgObsWrapper
from tqdm import tqdm
import math
import pygame
from minigrid.core.constants import COLOR_NAMES
from minigrid.core.grid import Grid
from minigrid.core.mission import MissionSpace
from minigrid.core.world_object import Goal, Wall
from minigrid.manual_control import ManualControl
from minigrid.minigrid_env import MiniGridEnv
import random
from minigrid.wrappers import RGBImgObsWrapper
import pickle
from env import FourRoomsEnv

In [30]:
# Run on the first CUDA device when one is available, otherwise on the CPU.
device = torch.device("cuda:0" if torch.cuda.is_available() else "cpu")

# Factor applied to the desired horizon when BF.action / BF.greedy_action build a command.
horizon_scale = 0.02
# Factor applied to the desired return when BF.action / BF.greedy_action build a command.
return_scale = 0.02
# Maximum number of episodes kept by the ReplayBuffer created below.
replay_size = 100000
# Number of gradient updates run_upside_down performs per outer iteration.
n_updates_per_iter = 200
# Number of exploratory episodes run_upside_down generates per outer iteration.
n_episodes_per_iter = 20
# Default number of best episodes sampling_exploration bases new commands on.
last_few = 50
# Number of episodes sampled from the buffer for each gradient update.
batch_size = 32


In [31]:
class BF(nn.Module):
    """Behavior function for upside-down RL: maps (image, command) to action logits.

    The image goes through two 3x3 convolutions and a 2x2 max-pool, the
    two-element command (desired return, desired horizon) through one linear
    layer; both embeddings are concatenated and passed through an MLP head.

    Args:
        obs_space: Observation shape as (height, width, channels). The first
            convolution is built for 3 input channels.
        action_space: Number of discrete actions (size of the output layer).
        hidden_size: Width of every hidden fully-connected layer.
        seed: Value passed to ``torch.manual_seed`` before the layers are created.
    """

    def __init__(self, obs_space, action_space, hidden_size, seed):
        super().__init__()
        torch.manual_seed(seed)
        # CNN
        self.conv1 = nn.Conv2d(3, 16, kernel_size=3, stride=1, padding=1)
        self.conv2 = nn.Conv2d(16, 32, kernel_size=3, stride=1, padding=1)
        self.pool = nn.MaxPool2d(2, 2)
        self.relu = nn.ReLU()

        conv_output_size = self._get_conv_output_size(obs_space)

        # fc_img
        self.fc1 = nn.Linear(conv_output_size, hidden_size)

        # fc_command
        self.commands = nn.Linear(2, hidden_size)

        # fc_combined
        self.fc_comb = nn.Linear(hidden_size * 2, hidden_size)
        self.fc2 = nn.Linear(hidden_size, hidden_size)
        self.fc3 = nn.Linear(hidden_size, action_space)

    def _conv_features(self, images):
        """Run the convolutional stack and flatten each sample to one vector."""
        features = self.relu(self.conv1(images))
        features = self.relu(self.conv2(features))
        features = self.pool(features)
        return features.view(features.size(0), -1)

    def _get_conv_output_size(self, obs_space):
        """Return the flattened feature size the conv stack yields for ``obs_space``."""
        with torch.no_grad():
            # obs_space is (H, W, C); the convolutions expect (N, C, H, W).
            dummy = torch.zeros(1, obs_space[2], obs_space[0], obs_space[1])
            return self._conv_features(dummy).size(1)

    def _build_command(self, desire, horizon):
        """Scale desire/horizon by the notebook-level factors and add a batch dimension."""
        return torch.cat((desire * return_scale, horizon * horizon_scale), dim=-1).unsqueeze(0)

    def forward(self, state, command):
        """Compute unnormalised action scores (logits).

        Args:
            state: Image batch in (N, C, H, W) layout.
            command: Tensor of shape (N, 2) holding the already scaled
                (desired return, desired horizon) pairs.

        Returns:
            Tensor of shape (N, action_space) with one logit per action.
        """
        state_out = self.relu(self.fc1(self._conv_features(state)))
        command_out = self.relu(self.commands(command))

        combined = torch.cat((state_out, command_out), dim=1)
        combined = self.relu(self.fc_comb(combined))
        combined = self.relu(self.fc2(combined))
        return self.fc3(combined)

    def action(self, state, desire, horizon):
        """Sample an action from the policy for a single state.

        Args:
            state: Image tensor with a leading batch dimension of 1.
            desire: One-element tensor with the unscaled desired return.
            horizon: One-element tensor with the unscaled desired horizon.

        Returns:
            Tensor holding the sampled action index.
        """
        # Acting never needs gradients; skip building the autograd graph.
        with torch.no_grad():
            logits = self.forward(state, self._build_command(desire, horizon))
            probs = torch.softmax(logits, dim=-1)
            return Categorical(probs).sample()

    def greedy_action(self, state, desire, horizon):
        """Return the most probable action (as a Python int) for a single state.

        Takes the same arguments as ``action``.
        """
        with torch.no_grad():
            logits = self.forward(state, self._build_command(desire, horizon))
            probs = torch.softmax(logits, dim=-1)
            return torch.argmax(probs).item()

In [32]:
class ReplayBuffer():
    """Bounded episode store that keeps the highest-return episodes.

    Every entry is a dict with the keys "states", "actions", "rewards" and
    "summed_rewards" (the sum of the episode's rewards).
    """

    def __init__(self, max_size):
        self.max_size = max_size
        self.buffer = []

    @staticmethod
    def _episode_return(entry):
        """Sort key: total reward of a stored episode."""
        return entry["summed_rewards"]

    def _sort_best_first(self):
        """Order the stored episodes in place from highest to lowest return."""
        self.buffer.sort(key=self._episode_return, reverse=True)

    def add_sample(self, states, actions, rewards):
        """Store one episode; when over capacity, keep only the best ``max_size`` episodes."""
        self.buffer.append({
            "states": states,
            "actions": actions,
            "rewards": rewards,
            "summed_rewards": sum(rewards),
        })
        if len(self.buffer) <= self.max_size:
            return
        self._sort_best_first()
        self.buffer = self.buffer[:self.max_size]

    def get_random_samples(self, batch_size):
        """Return ``batch_size`` distinct episodes drawn with ``random.sample``.

        Raises:
            ValueError: If fewer than ``batch_size`` episodes are stored.
        """
        if len(self.buffer) < batch_size:
            raise ValueError("Batch size larger than buffer size.")
        return random.sample(self.buffer, batch_size)

    def get_nbest(self, n):
        """Return the ``n`` highest-return episodes (the buffer is left sorted best-first).

        Raises:
            ValueError: If fewer than ``n`` episodes are stored.
        """
        if len(self.buffer) < n:
            raise ValueError("n is larger than buffer size.")
        self._sort_best_first()
        return self.buffer[:n]

    def __len__(self):
        return len(self.buffer)

    def clear(self):
        """Drop every stored episode and report it on stdout."""
        self.buffer = []
        print("Buffer has been cleared.")

In [33]:
def load_data_to_replay_buffer(buffer, filename=" "):
    """Fill ``buffer`` with the episodes stored in a pickle file.

    Args:
        buffer: Object exposing ``add_sample(states, actions, rewards)``.
        filename: Path of a pickle file containing an iterable of
            ``(states, actions, rewards)`` triples.

    Returns:
        A fixed status string once every episode has been added.
    """
    # NOTE: unpickling can execute arbitrary code - only load recordings from a trusted source.
    with open(filename, 'rb') as handle:
        episodes = pickle.load(handle)

    for states, actions, rewards in episodes:
        buffer.add_sample(states, actions, rewards)

    return 'Successfully Loading to Replay buffer'

In [34]:
# Build the environment with on-screen rendering and wrap it with RGBImgObsWrapper;
# the code below reads the observation's 'image' entry.
env = FourRoomsEnv(render_mode="human")
env = RGBImgObsWrapper(env)
# Number of discrete actions the policy chooses between (hard-coded here,
# not read from the environment - TODO confirm it matches the env's action set).
action_space = 3
# Shape of the image observation; BF indexes it as (height, width, channels).
obs_space = env.observation_space['image'].shape

# Replay buffer, behavior function (hidden size 64, seed 1) and its optimizer.
buffer = ReplayBuffer(replay_size)
bf = BF(obs_space, action_space, 64, 1).to(device)
# Pre-fill the buffer with previously recorded episodes from the pickle file.
load_data_to_replay_buffer(buffer, filename="fourRoomRecord.pkl")
optimizer = optim.Adam(params=bf.parameters(), lr=1e-3)

# Report what was set up.
print(f"Loaded {len(buffer)} episodes into the ReplayBuffer.")
print(f"Observation space: {obs_space}")
print(f"Action space: {action_space}")
print(f"Device: {device}")


Loaded 220 episodes into the ReplayBuffer.
Observation space: (104, 104, 3)
Action space: 3
Device: cuda:0


In [35]:
## OBSERVE THE WEIGHTS before training
# Prints every parameter tensor of the behavior function in full, so the values
# can be compared with the dump made after training further down.
for p in bf.parameters():
    print(p)

Parameter containing:
tensor([[[[ 0.0992, -0.0849, -0.0373],
          [ 0.0903, -0.1812,  0.1154],
          [-0.0396,  0.0979,  0.0268]],

         [[-0.0236,  0.0534,  0.0095],
          [ 0.0703, -0.0750, -0.0140],
          [-0.0173,  0.0279, -0.0008]],

         [[ 0.1682,  0.0599, -0.0717],
          [-0.1162, -0.0323, -0.0830],
          [-0.0617,  0.0092,  0.1147]]],


        [[[ 0.1046, -0.1881,  0.1193],
          [ 0.0538,  0.1825,  0.1270],
          [-0.1753, -0.1830, -0.0928]],

         [[ 0.1690, -0.0321,  0.0824],
          [-0.0894,  0.1888, -0.0814],
          [ 0.1443,  0.0023, -0.1014]],

         [[ 0.0989, -0.1022,  0.0566],
          [-0.0556, -0.0211, -0.1850],
          [-0.0918,  0.1044, -0.0468]]],


        [[[ 0.1917,  0.1543, -0.0090],
          [-0.1285,  0.1172,  0.0597],
          [-0.1244,  0.1250,  0.1168]],

         [[ 0.1707, -0.1079, -0.0317],
          [-0.0037,  0.0281, -0.1461],
          [-0.1366,  0.1047, -0.0451]],

         [[ 0.0940,  0

In [36]:
# FUNCTIONS FOR Sampling exploration commands
def sampling_exploration(top_X_eps=last_few):
    """
    Draw an exploratory command (desired reward, desired horizon) from the replay buffer.

    The ``top_X_eps`` highest-return episodes in the global ``buffer`` are used:
    - the desired horizon is the mean number of states of those episodes;
    - the desired reward is sampled uniformly from [mean, mean + std] of their returns.

    The default for ``top_X_eps`` is the value ``last_few`` had when this function was defined.

    Returns:
        Two one-element float tensors: (desired reward, desired horizon).

    Raises:
        ValueError: If the buffer holds fewer than ``top_X_eps`` episodes.
    """
    if len(buffer) < top_X_eps:
        raise ValueError("Not enough episodes in the buffer to sample from.")

    best_episodes = buffer.get_nbest(top_X_eps)
    if len(best_episodes) == 0:
        # Nothing to base a command on: fall back to a zero command.
        return torch.FloatTensor([0]), torch.FloatTensor([0])

    # Collect length and cumulative return of every selected episode.
    episode_lengths = []
    episode_returns = []
    for episode in best_episodes:
        episode_lengths.append(len(episode["states"]))
        episode_returns.append(episode["summed_rewards"])

    horizon_command = np.mean(episode_lengths)
    return_mean = np.mean(episode_returns)
    return_std = np.std(episode_returns)
    # Aim at or slightly above the average of the best returns seen so far.
    reward_command = np.random.uniform(return_mean, return_mean + return_std)

    return torch.FloatTensor([reward_command]), torch.FloatTensor([horizon_command])

In [37]:
# FUNCTIONS FOR TRAINING
def select_time_steps(saved_episode):
    """
    Sample two time steps t1 < t2 inside a saved episode.

    Args:
        saved_episode: Episode dict from the replay buffer; only the length of
            its "states" entry is used.

    Returns:
        Tuple (t1, t2, T) where T is the number of states in the episode.
        For T >= 2, t1 is drawn from [0, T - 2] and t2 from [t1 + 1, T - 1].
        For a one-step episode (T == 1) there is nothing to sample, so
        (0, 1, 1) is returned: t2 then equals T and works as an exclusive end index.

    Raises:
        ValueError: If the episode contains no states.
    """
    T = len(saved_episode["states"])  # episode max horizon
    if T < 1:
        raise ValueError("Cannot sample time steps from an empty episode.")
    if T == 1:
        # np.random.randint(0, 0) would raise an opaque "low >= high" error here.
        return 0, 1, T

    t1 = np.random.randint(0, T - 1)  # leaves at least one step after t1
    t2 = np.random.randint(t1 + 1, T)  # strictly after t1

    return t1, t2, T

def create_training_input(episode, t1, t2):
    """
    Build one supervised training example from a stored episode.

    ``episode`` is a dict with the keys "states", "actions" and "rewards".

    Returns a 4-tuple:
    1. the state at t1
    2. the desired reward: the sum of the rewards from t1 up to (not including) t2
    3. the time horizon: t2 - t1
    4. the action that was taken at t1
    """
    return (
        episode["states"][t1],
        sum(episode["rewards"][t1:t2]),
        t2 - t1,
        episode["actions"][t1],
    )

def create_training_examples(batch_size):
    """
    Assemble ``batch_size`` training examples from randomly chosen buffer episodes.
    ============================================================
    1. ``batch_size`` episodes are drawn at random from the global ``buffer``.
    2. A start step t1 is sampled for each of them; the segment always runs
       to the end of the episode (t2 = T).
    3. The training values for that segment are collected as tensors.
    ______________________________________________________________
    Returns four lists of length batch_size:
    states, rewards, horizons, and actions.
    """
    states, rewards, horizons, actions = [], [], [], []

    for episode in buffer.get_random_samples(batch_size):
        # The sampled t2 is discarded on purpose: the command always covers the rest of the episode.
        start, _, episode_end = select_time_steps(episode)
        state, segment_return, steps_left, action = create_training_input(episode, start, episode_end)

        states.append(torch.FloatTensor(state))
        rewards.append(torch.FloatTensor([segment_return]))
        horizons.append(torch.FloatTensor([steps_left]))
        actions.append(torch.tensor(action, dtype=torch.long))

    return states, rewards, horizons, actions

def train_behavior_function(batch_size):
    """
    Run one gradient step on the behavior function.

    A batch of (state, desired reward, horizon, action) examples is built from
    the replay buffer; the network's logits for (state, command) are trained
    with a cross-entropy loss against the actions that were actually taken.

    Args:
        batch_size: Number of episodes to sample for this update.

    Returns:
        The loss of this batch as a Python float.
    """
    states, rewards, horizons, actions = create_training_examples(batch_size)

    # Stack the per-example tensors into batches on the training device.
    state_tensors = torch.stack(states).to(device)
    reward_tensors = torch.stack(rewards).to(device)      # (batch, 1)
    horizon_tensors = torch.stack(horizons).to(device)    # (batch, 1)
    action_tensors = torch.stack(actions).to(device)

    # Apply the same scaling BF.action / BF.greedy_action use at acting time;
    # otherwise the network is trained on raw commands but queried with scaled ones.
    command = torch.cat(
        (reward_tensors * return_scale, horizon_tensors * horizon_scale), dim=1
    )

    outputs = bf(state_tensors, command).float()
    loss = F.cross_entropy(outputs, action_tensors)

    # Optimization step
    optimizer.zero_grad()
    loss.backward()
    optimizer.step()

    return loss.item()

In [38]:
def evaluate(desired_return, desired_time_horizon):
    """
    Run one episode in the global ``env`` with the behavior function and return its total reward.

    Actions are sampled with ``bf.action``. After every step the remaining
    desired return is reduced by the received reward and the remaining horizon
    by one (never below 1). The episode ends when the environment reports
    termination or truncation.

    Args:
        desired_return: One-element tensor with the return to ask for.
        desired_time_horizon: One-element tensor with the horizon to ask for.
            Neither tensor is modified; the running command is tracked locally.

    Returns:
        The sum of the rewards collected during the episode.
    """
    obs, _ = env.reset()
    state = obs['image']
    total_rewards = 0
    while True:
        # HWC image -> float CHW tensor with a batch dimension.
        state_tensor = torch.from_numpy(state).float().permute(2, 0, 1).unsqueeze(0).to(device)
        action = bf.action(state_tensor, desired_return.to(device), desired_time_horizon.to(device)).item()
        next_obs, reward, done, truncated, _ = env.step(action)

        total_rewards += reward
        state = next_obs['image']
        # Out-of-place updates: rebinding the local names leaves the caller's tensors untouched.
        desired_return = desired_return - reward
        desired_time_horizon = torch.FloatTensor([max(desired_time_horizon.item() - 1, 1)])

        if done or truncated:
            break
    return total_rewards

In [39]:
# Initial command; used as the default arguments of generate_episode below.
init_desired_reward = 13
init_time_horizon = 50

In [40]:
# Algorithm 2 - Generates an episode using the behavior function:
def generate_episode(desired_return = torch.FloatTensor([init_desired_reward]), desired_time_horizon = torch.FloatTensor([init_time_horizon])):    
    """
    Roll out the behavior function in the global ``env`` to collect one episode for the replay buffer.

    Actions are sampled with ``bf.action``. After every step the remaining
    desired return is reduced by the received reward and the remaining horizon
    by one (never below 1). The rollout stops when the environment reports
    termination or truncation. Episodes with fewer than two steps are
    discarded and the rollout is repeated with the original command.

    Args:
        desired_return: One-element tensor with the return to ask for.
        desired_time_horizon: One-element tensor with the horizon to ask for.
            Neither argument is modified, so the shared default tensors keep
            their initial values across calls.

    Returns:
        ``[states, actions, rewards]``: states are float arrays in (C, H, W)
        layout, actions are ints, rewards are whatever the environment returned.
    """
    while True:
        # Per-rollout copies of the command; only these local names are rebound below.
        remaining_return = desired_return
        remaining_horizon = desired_time_horizon

        obs, _ = env.reset()
        state = obs['image']
        states = []
        actions = []
        rewards = []
        while True:
            # HWC image -> float CHW tensor.
            state_tensor = torch.from_numpy(state).float().permute(2, 0, 1).to(device)
            action = bf.action(state_tensor.unsqueeze(0), remaining_return.to(device), remaining_horizon.to(device)).item()
            next_obs, reward, done, truncated, _ = env.step(action)

            states.append(state_tensor.cpu().numpy())
            actions.append(action)
            rewards.append(reward)

            state = next_obs['image']
            # Out-of-place updates so neither the caller's nor the default tensors are mutated.
            remaining_return = remaining_return - reward
            remaining_horizon = torch.FloatTensor([max(remaining_horizon.item() - 1, 1)])

            # Stop on truncation too; otherwise the rollout continues past the env's step limit.
            if done or truncated:
                break

        # Training needs at least two steps per episode (see select_time_steps).
        if len(states) >= 2:
            return [states, actions, rewards]


# Algorithm 1 - Upside - Down Reinforcement Learning 
def run_upside_down(max_episodes):
    """
    Main upside-down RL loop.

    Each of the ``max_episodes`` iterations
    1. runs ``n_updates_per_iter`` gradient updates on the behavior function,
    2. generates ``n_episodes_per_iter`` exploratory episodes and stores them in the buffer,
    3. samples one more command, records it, and evaluates the policy for one episode.

    Returns:
        Tuple of five lists, one entry per iteration: evaluation rewards,
        running mean of the last 100 evaluation rewards, desired rewards,
        desired horizons, and mean training losses.
    """
    all_rewards, losses, average_100_reward = [], [], []
    desired_rewards_history, horizon_history = [], []
    status_template = "\rEpisode: {} | Rewards: {:.2f} | Mean_100_Rewards: {:.2f} | Loss: {:.2f}"

    for ep in tqdm(range(1, max_episodes + 1), desc="Training Progress"):

        # Improve the behavior function on the current buffer contents.
        bf_loss = np.mean([train_behavior_function(batch_size) for _ in range(n_updates_per_iter)])
        losses.append(bf_loss)

        # Collect fresh episodes, each with its own exploratory command.
        for _ in range(n_episodes_per_iter):
            explore_reward, explore_horizon = sampling_exploration()
            ep_states, ep_actions, ep_rewards_list = generate_episode(explore_reward, explore_horizon)
            buffer.add_sample(ep_states, ep_actions, ep_rewards_list)

        # Command used for this iteration's evaluation; logged for monitoring.
        new_desired_reward, new_desired_horizon = sampling_exploration()
        desired_rewards_history.append(new_desired_reward.item())
        horizon_history.append(new_desired_horizon.item())

        ep_rewards = evaluate(new_desired_reward, new_desired_horizon)
        all_rewards.append(ep_rewards)
        mean_last_100 = np.mean(all_rewards[-100:])
        average_100_reward.append(mean_last_100)

        status = status_template.format(ep, ep_rewards, mean_last_100, bf_loss)
        print(status, end="", flush=True)
        if ep % 100 == 0:
            # Every 100th iteration the status line is printed again, newline-terminated.
            print(status)

    return all_rewards, average_100_reward, desired_rewards_history, horizon_history, losses

In [41]:
# Rebind the global env used by generate_episode/evaluate to a fresh environment
# constructed without the render_mode="human" argument used in the setup cell.
# NOTE(review): the previous env object is not closed before being replaced - confirm that is intended.
env = FourRoomsEnv()
env = RGBImgObsWrapper(env)

In [42]:
%%time
# Run 10 outer iterations of upside-down RL, then plot the returned histories in a 2x2 grid.
rewards, average, d, h, loss = run_upside_down(max_episodes=10)
plt.figure(figsize=(15,8))
# Top-left: per-iteration evaluation reward and its running mean over the last 100 iterations.
plt.subplot(2,2,1)
plt.title("Rewards")
plt.plot(rewards, label="rewards")
plt.plot(average, label="average100")
plt.legend()
# Top-right: mean training loss per iteration.
plt.subplot(2,2,2)
plt.title("Loss")
plt.plot(loss)
# Bottom-left: desired reward sampled for each evaluation.
plt.subplot(2,2,3)
plt.title("desired Rewards")
plt.plot(d)
# Bottom-right: desired horizon sampled for each evaluation.
plt.subplot(2,2,4)
plt.title("desired Horizon")
plt.plot(h)
plt.show()

Training Progress:   0%|          | 0/10 [00:00<?, ?it/s]

Total Reward: -41.00
Total Reward: -41.10
Total Reward: -41.20
Total Reward: -41.30
Total Reward: -41.40
Total Reward: -41.50
Total Reward: -41.60
Total Reward: -41.70
Total Reward: -41.80
Total Reward: -41.90
Total Reward: -42.00
Total Reward: -42.10
Total Reward: -41.20
Total Reward: -41.30
Total Reward: -41.40
Total Reward: -41.50
Total Reward: -41.60
Total Reward: -41.70
Total Reward: -41.80
Total Reward: -41.90
Total Reward: -42.00
Total Reward: -42.10
Total Reward: -42.20
Total Reward: -42.30
Total Reward: -32.30
Total Reward: -33.70
Total Reward: -49.00
Total Reward: -49.10
Total Reward: -49.20
Total Reward: -48.30
Total Reward: -48.40
Total Reward: -47.50
Total Reward: -47.60
Total Reward: -48.70
Total Reward: -48.80
Total Reward: -48.90
Total Reward: -48.00
Total Reward: -49.10
Total Reward: -50.20
Total Reward: -50.30
Total Reward: -50.40
Total Reward: -50.50
Total Reward: -50.60
Total Reward: -50.70
Total Reward: -49.80
Total Reward: -48.90
Total Reward: -48.00
Total Reward:

Training Progress:  10%|█         | 1/10 [00:38<05:45, 38.41s/it]

Total Reward: -11.20
Total Reward: -35.70
Total Reward: 4.10
Total Reward: 4.90
Total Reward: 11.60
Total Reward: 3.20
Total Reward: 12.60
Total Reward: -49.00
Total Reward: -49.10
Total Reward: -49.20
Total Reward: -49.30
Total Reward: -49.40
Total Reward: -49.50
Total Reward: -49.60
Total Reward: -49.70
Total Reward: -49.80
Total Reward: -49.90
Total Reward: -50.00
Total Reward: -50.10
Total Reward: -50.20
Total Reward: -50.30
Total Reward: -50.40
Total Reward: -50.50
Total Reward: -50.60
Total Reward: -50.70
Total Reward: -50.80
Total Reward: -50.90
Total Reward: -51.00
Total Reward: -51.10
Total Reward: -51.20
Total Reward: -51.30
Total Reward: -51.40
Total Reward: -51.50
Total Reward: -51.60
Total Reward: -51.70
Total Reward: -52.80
Total Reward: -52.90
Total Reward: -53.00
Total Reward: -53.10
Total Reward: -54.20
Total Reward: -55.30
Total Reward: -55.40
Total Reward: -55.50
Total Reward: -55.60
Total Reward: -55.70
Total Reward: -55.80
Total Reward: -56.90
Total Reward: -57.00


Training Progress:  10%|█         | 1/10 [01:35<14:19, 95.54s/it]

Total Reward: -128.40
Total Reward: -128.50
Total Reward: -129.60
Total Reward: -130.70
Total Reward: -130.80
Total Reward: -131.90
Total Reward: -133.00
Total Reward: -133.10
Total Reward: -133.20
Total Reward: -133.30
Total Reward: -132.40
Total Reward: -131.50
Total Reward: -131.60
Total Reward: -131.70
Total Reward: -132.80
Total Reward: -133.90





KeyboardInterrupt: 

In [43]:
# SAVE MODEL
# Only the parameters (state_dict) are written, to a file in the current working directory.
name = "model_FourRoom_Mannul.pth"
torch.save(bf.state_dict(), name)

In [44]:
## OBSERVE THE WEIGHTS after training
# Prints every parameter tensor in full again, for comparison with the dump made before training.
for p in bf.parameters():
    print(p)

Parameter containing:
tensor([[[[ 0.0921, -0.0920, -0.0440],
          [ 0.0816, -0.1901,  0.1075],
          [-0.0472,  0.0909,  0.0198]],

         [[-0.0322,  0.0461,  0.0024],
          [ 0.0601, -0.0844, -0.0231],
          [-0.0269,  0.0189, -0.0095]],

         [[ 0.1599,  0.0527, -0.0787],
          [-0.1262, -0.0416, -0.0921],
          [-0.0710,  0.0005,  0.1064]]],


        [[[ 0.1149, -0.1744,  0.1452],
          [ 0.0602,  0.1879,  0.1404],
          [-0.1706, -0.1818, -0.0859]],

         [[ 0.1836, -0.0176,  0.0999],
          [-0.0712,  0.2066, -0.0609],
          [ 0.1640,  0.0228, -0.0786]],

         [[ 0.1099, -0.0887,  0.0752],
          [-0.0430, -0.0071, -0.1664],
          [-0.0779,  0.1188, -0.0285]]],


        [[[ 0.1738,  0.1437, -0.0113],
          [-0.1356,  0.1194,  0.0791],
          [-0.1345,  0.1261,  0.1443]],

         [[ 0.1491, -0.1307, -0.0477],
          [-0.0240,  0.0054, -0.1584],
          [-0.1570,  0.0814, -0.0534]],

         [[ 0.0772, -0