In [1]:
!pip install gym[box2d]~=0.18.0

Looking in indexes: https://pypi.org/simple, https://us-python.pkg.dev/colab-wheels/public/simple/
Collecting gym[box2d]~=0.18.0
  Downloading gym-0.18.3.tar.gz (1.6 MB)
[K     |████████████████████████████████| 1.6 MB 5.2 MB/s 
Collecting pyglet<=1.5.15,>=1.4.0
  Downloading pyglet-1.5.15-py3-none-any.whl (1.1 MB)
[K     |████████████████████████████████| 1.1 MB 35.5 MB/s 
Collecting box2d-py~=2.3.5
  Downloading box2d_py-2.3.8-cp37-cp37m-manylinux1_x86_64.whl (448 kB)
[K     |████████████████████████████████| 448 kB 54.3 MB/s 
[?25hBuilding wheels for collected packages: gym
  Building wheel for gym (setup.py) ... [?25l[?25hdone
  Created wheel for gym: filename=gym-0.18.3-py3-none-any.whl size=1657528 sha256=0097f528236239dd3c77e67ffeade7e8e3424931ab5beb3b6a6c4d913cb66f3f
  Stored in directory: /root/.cache/pip/wheels/1a/ec/6d/705d53925f481ab70fd48ec7728558745eeae14dfda3b49c99
Successfully built gym
Installing collected packages: pyglet, gym, box2d-py
  Attempting uninstall: g

In [2]:
!pip install pygame 

import os
os.environ['SDL_VIDEODRIVER']='dummy'
import pygame
pygame.display.set_mode((640,480))

Looking in indexes: https://pypi.org/simple, https://us-python.pkg.dev/colab-wheels/public/simple/
Collecting pygame
  Downloading pygame-2.1.2-cp37-cp37m-manylinux_2_17_x86_64.manylinux2014_x86_64.whl (21.8 MB)
[K     |████████████████████████████████| 21.8 MB 1.4 MB/s 
[?25hInstalling collected packages: pygame
Successfully installed pygame-2.1.2
pygame 2.1.2 (SDL 2.0.16, Python 3.7.15)
Hello from the pygame community. https://www.pygame.org/contribute.html


<Surface(640x480x32 SW)>

In [74]:
def get_expected_rewards(rewards, gamma=0.9):
    G = np.zeros_like(rewards, dtype=float)
    G[-1] = rewards[-1]
    for idx in range(-2, -len(rewards)-1, -1):
        G[idx] = rewards[idx] + gamma * G[idx+1]
    return G


def to_one_hot(y_tensor, ndims):
    y_tensor = y_tensor.type(torch.LongTensor).view(-1, 1)
    y_one_hot = torch.zeros(
        y_tensor.size()[0], ndims).scatter_(1, y_tensor, 1)
    return y_one_hot

In [75]:
import torch.nn as nn

class CostNN(nn.Module):
    def __init__(
        self, 
        state_dim,
        hidden_dim1 = 128, 
        out_features = 1, 
    ):
        super(CostNN, self).__init__()
        self.net = nn.Sequential(
            nn.Linear(state_dim, hidden_dim1),
            nn.ReLU(),
            nn.Linear(hidden_dim1, out_features),
        )
    def forward(self, x):
        return self.net(x)    

In [76]:
import numpy as np
import torch
import torch.nn.functional as F

class PG(nn.Module):
    def __init__(self, state_shape, n_actions):
        super().__init__()
        self.state_shape = state_shape
        self.n_actions = n_actions
        self.model = nn.Sequential(
            nn.Linear(in_features = state_shape[0], out_features = 128),
            nn.ReLU(),
            nn.Linear(in_features = 128 , out_features = 64),
            nn.ReLU(),
            nn.Linear(in_features = 64 , out_features = self.n_actions)
        )
        self.optimizer = torch.optim.Adam(self.model.parameters(), 1e-3)
    def forward(self, x):
        logits = self.model(x)
        return logits
    
    def predict_probs(self, states):
        states = torch.FloatTensor(states)
        logits = self.model(states).detach()
        probs = F.softmax(logits, dim = -1).numpy()
        return probs[0]
    
    def generate_session(self, env, t_max=1000):
        states, actions, traj_probs, rewards = [], [], [], []
        s = env.reset()
        q_t = 1.0
        for t in range(t_max):
            action_probs = self.predict_probs(np.array([s]))
            a = np.random.choice(self.n_actions,  p = action_probs)
            new_s, r, done, info = env.step(a)
            
            q_t *= action_probs[a]

            states.append(s)
            actions.append(a)   
            traj_probs.append(q_t)
            rewards.append(r)

            s = new_s
            if done:
                break

        return states, actions, traj_probs, rewards

    def _get_cumulative_rewards(self, rewards, gamma=0.99):
        G = np.zeros_like(rewards, dtype = float)
        G[-1] = rewards[-1]
        for idx in range(-2, -len(rewards)-1, -1):
            G[idx] = rewards[idx] + gamma * G[idx+1]
        return G

    def _to_one_hot(self, y_tensor, ndims):
        y_tensor = y_tensor.type(torch.LongTensor).view(-1, 1)
        y_one_hot = torch.zeros(
            y_tensor.size()[0], ndims).scatter_(1, y_tensor, 1)
        return y_one_hot

    def train_on_env(self, env, gamma=0.99, entropy_coef=1e-2):
        states, actions, rewards = self.generate_session(env)
        states = torch.tensor(states, dtype=torch.float32)
        actions = torch.tensor(actions, dtype=torch.int32)
        cumulative_returns = np.array(self._get_cumulative_rewards(rewards, gamma))
        cumulative_returns = torch.tensor(cumulative_returns, dtype=torch.float32)

        logits = self.model(states)
        probs = nn.functional.softmax(logits, -1)
        log_probs = nn.functional.log_softmax(logits, -1)

        log_probs_for_actions = torch.sum(
            log_probs * self._to_one_hot(actions, env.action_space.n), dim=1)
    
        entropy = -torch.mean(torch.sum(probs*log_probs), dim = -1 )
        loss = -torch.mean(log_probs_for_actions*cumulative_returns -entropy*entropy_coef)

        loss.backward()
        self.optimizer.step()
        self.optimizer.zero_grad()
        return np.sum(rewards)

In [None]:
import gym
import random
import numpy as np
import matplotlib.pyplot as plt
import torch
from torch import nn


from torch.optim.lr_scheduler import StepLR

# SEEDS
seed = 18095048
random.seed(seed)
np.random.seed(seed)
torch.manual_seed(seed)

# ENV SETUP
env_name = 'LunarLander-v2'
env = gym.make(env_name).unwrapped
if seed is not None:
    env.seed(seed)
n_actions = env.action_space.n
state_shape = env.observation_space.shape
state = env.reset()

# LOADING EXPERT/DEMO SAMPLES
demo_trajs = np.load('dataset.npy', allow_pickle=True)

# INITILIZING POLICY AND REWARD FUNCTION
policy = PG(state_shape, n_actions)
cost_f = CostNN(state_shape[0] + 1) # states (8) + action, trajectory
policy_optimizer = torch.optim.Adam(policy.parameters(), 1e-2)
cost_optimizer = torch.optim.Adam(cost_f.parameters(), 1e-2, weight_decay=1e-4)

mean_rewards = []
mean_costs = []
mean_loss_rew = []
EPISODES_TO_PLAY = 1
REWARD_FUNCTION_UPDATE = 20
DEMO_BATCH = 20
sample_trajs = []

D_demo, D_samp = np.array([]), np.array([])

# CONVERTS TRAJ LIST TO STEP LIST
def preprocess_traj(traj_list, step_list, is_Demo = False):
    step_list = step_list.tolist()
    for traj in traj_list:
        states = np.array(traj[0])
        if is_Demo:
            probs = np.ones((states.shape[0], 1))
        else:
            probs = np.array(traj[2]).reshape(-1, 1)
        actions = np.array(traj[1]).reshape(-1, 1)
        x = np.concatenate((states, actions, probs), axis=1)
        step_list.extend(x)
    return np.array(step_list)

D_demo = preprocess_traj(demo_trajs, D_demo, is_Demo=True)
return_list, sum_of_cost_list = [], []
for i in range(10000):
    trajs = [policy.generate_session(env) for _ in range(EPISODES_TO_PLAY)]
    sample_trajs = trajs + sample_trajs # todelete
    D_samp = preprocess_traj(trajs, D_samp)

    # UPDATING REWARD FUNCTION (TAKES IN D_samp, D_demo)
    loss_rew = []

    #REWARD_FUNCTION_UPDATE = round(REWARD_FUNCTION_UPDATE * 1/((i+1)))+900
    print('REWARD FUNCTION UPDATE: ', REWARD_FUNCTION_UPDATE)
    for _ in range(REWARD_FUNCTION_UPDATE):

        selected_samp = np.random.choice(len(D_samp), DEMO_BATCH)
        selected_demo = np.random.choice(len(D_demo), DEMO_BATCH)

        D_s_samp = D_samp[selected_samp]
        D_s_demo = D_demo[selected_demo]

        #D̂ samp ← D̂ demo ∪ D̂ samp
        D_s_samp = np.concatenate((D_s_demo, D_s_samp), axis = 0)
        #D_s_samp = np.add(D_s_demo, D_s_samp) 

        states, actions, probs = D_s_samp[:,:-2], D_s_samp[:,-2], D_s_samp[:,-1]
        states_expert, actions_expert = D_s_demo[:,:-2], D_s_demo[:,-2]

        # Reducing from float64 to float32 for making computaton faster
        states = torch.tensor(states, dtype=torch.float32)
        probs = torch.tensor(probs, dtype=torch.float32)
        actions = torch.tensor(actions, dtype=torch.float32)
        states_expert = torch.tensor(states_expert, dtype=torch.float32)
        actions_expert = torch.tensor(actions_expert, dtype=torch.float32)
        costs_samp = cost_f(torch.cat((states, actions.reshape(-1, 1)), dim=-1))
        costs_demo = cost_f(torch.cat((states_expert, actions_expert.reshape(-1, 1)), dim=-1))

        # LOSS CALCULATION FOR IOC (COST FUNCTION)
        loss_IOC = torch.mean(costs_demo) + \
                torch.log(torch.mean(torch.exp(-costs_samp)/(probs+1e-7))) #should be the objective function, Finn has one defintion CS285 another
        # UPDATING THE COST FUNCTION
        cost_optimizer.zero_grad()
        loss_IOC.backward()
        cost_optimizer.step()

        loss_rew.append(loss_IOC.detach())

    for traj in trajs:
        states, actions, probs, rewards = traj
        
        states = torch.tensor(states, dtype=torch.float32)
        actions = torch.tensor(actions, dtype=torch.float32)

        logits = policy(states) # forward pass
        probs = nn.functional.softmax(logits, -1) # get estimated actions for states
        log_probs = nn.functional.log_softmax(logits, -1)
            
        costs = cost_f(torch.cat((states, actions.reshape(-1, 1)), dim=-1)).detach().numpy() # current cost function
        cumulative_returns_np = np.array(get_expected_rewards(-costs, 0.9)) # recursively get expected discounted rewards (rewards given by the current cost function)
        cumulative_returns = torch.tensor(cumulative_returns_np, dtype=torch.float32)

        log_probs_for_actions = torch.sum(
            log_probs * to_one_hot(actions, env.action_space.n), dim=1)
    
        entropy = -torch.mean(torch.sum(probs*log_probs), dim = -1 )
        # loss = -torch.mean(log_probs_for_actions*cumulative_returns - entropy*1e-2) # loss for the policy (isnt it the cost function output?)
        loss = -log_probs_for_actions*cumulative_returns
        # UPDATING THE POLICY NETWORK
        policy_optimizer.zero_grad()
        loss.sum().backward()
        policy_optimizer.step()

        returns = sum(rewards)
        sum_of_cost = np.sum(costs)
        return_list.append(returns)
        sum_of_cost_list.append(sum_of_cost)

    mean_rewards.append(np.mean(return_list))
    mean_costs.append(np.mean(sum_of_cost_list))
    mean_loss_rew.append(np.mean(loss_rew))

    # PLOTTING PERFORMANCE
    if i % 10 == 0:
        # clear_output(True)
        print(f"mean reward:{np.mean(return_list)} loss: {loss_IOC}")

        plt.figure(figsize=[16, 12])
        plt.subplot(2, 2, 1)
        plt.title(f"Mean reward per {EPISODES_TO_PLAY} games")
        plt.plot(mean_rewards)
        plt.grid()

        plt.subplot(2, 2, 2)
        plt.title(f"Mean cost per {EPISODES_TO_PLAY} games")
        plt.plot(mean_costs)
        plt.grid()

        plt.subplot(2, 2, 3)
        plt.title(f"Mean loss per {REWARD_FUNCTION_UPDATE} batches")
        plt.plot(mean_loss_rew)
        plt.grid()

        # plt.show()
        plt.savefig('GCL_learning_curve.png')
        plt.close()

    if np.mean(return_list) > 500:
        break

REWARD FUNCTION UPDATE:  20
mean reward:-299.0389781966891 loss: 14.728302955627441
REWARD FUNCTION UPDATE:  20
REWARD FUNCTION UPDATE:  20
REWARD FUNCTION UPDATE:  20
REWARD FUNCTION UPDATE:  20
REWARD FUNCTION UPDATE:  20
REWARD FUNCTION UPDATE:  20
REWARD FUNCTION UPDATE:  20
REWARD FUNCTION UPDATE:  20
REWARD FUNCTION UPDATE:  20
REWARD FUNCTION UPDATE:  20
mean reward:-450.7742209022214 loss: 14.629894256591797
REWARD FUNCTION UPDATE:  20
REWARD FUNCTION UPDATE:  20
REWARD FUNCTION UPDATE:  20
REWARD FUNCTION UPDATE:  20
REWARD FUNCTION UPDATE:  20
REWARD FUNCTION UPDATE:  20
REWARD FUNCTION UPDATE:  20
REWARD FUNCTION UPDATE:  20
REWARD FUNCTION UPDATE:  20
REWARD FUNCTION UPDATE:  20
mean reward:-591.1496831344762 loss: 14.562402725219727
REWARD FUNCTION UPDATE:  20
REWARD FUNCTION UPDATE:  20
REWARD FUNCTION UPDATE:  20
REWARD FUNCTION UPDATE:  20
REWARD FUNCTION UPDATE:  20
REWARD FUNCTION UPDATE:  20
REWARD FUNCTION UPDATE:  20
REWARD FUNCTION UPDATE:  20
REWARD FUNCTION UPDA