In [1]:
import pybullet_envs
import gym
import random
import numpy as np
import matplotlib.pyplot as plt
import torch
from torch import nn
from torch.distributions.normal import Normal

from cost import CostNN
from sac_torch import Agent


# ENV SETUP
env_name = 'InvertedPendulumBulletEnv-v0'
env = gym.make(env_name)
state = env.reset()

def get_cumulative_rewards(rewards, gamma=0.99):
    G = np.zeros_like(rewards, dtype=float)
    G[-1] = rewards[-1]
    for idx in range(-2, -len(rewards)-1, -1):
        G[idx] = rewards[idx] + gamma * G[idx+1]
    return G




In [2]:
# INITILIZING POLICY AND REWARD FUNCTION
policy = Agent(input_dims=env.observation_space.shape[0], env=env, 
                n_actions=env.action_space.shape[0])
cost_f = CostNN(env.observation_space.shape[0] + env.action_space.shape[0])
policy_optimizer = torch.optim.Adam(policy.actor.parameters(), 3e-4)
cost_optimizer = torch.optim.Adam(cost_f.parameters(), 1e-2, weight_decay=1e-4)

mean_rewards = []
mean_costs = []
mean_loss_rew = []
mean_policy_loss = []
EPISODES_TO_PLAY = 1
REWARD_FUNCTION_UPDATE = 10
DEMO_BATCH = 100

D_demo_states = np.load('expert_samples/sac_inverted_pendulum_states.npy',allow_pickle=True)
D_demo_actions = np.load('expert_samples/sac_inverted_pendulum_actions.npy',allow_pickle=True)
D_demo_probs = np.load('expert_samples/sac_inverted_pendulum_probs.npy',allow_pickle=True)



In [3]:
cost_f = cost_f.cuda()
policy.actor.train()
cost_f. train()
return_list, sum_of_cost_list, policy_loss = [], [], []
D_sample_states = np.array([])
D_sample_actions = np.array([])
D_sample_probs = np.array([])
for i in range(1000):
    #trajs = [policy.generate_session(env) for _ in range(EPISODES_TO_PLAY)]
    #sample_trajs = trajs + sample_trajs
    #D_samp = preprocess_traj(trajs, D_samp)
    state, prob, action, rewards = policy.generate_session(env)
    D_sample_states = np.concatenate((D_sample_states, state.reshape(-1)))
    D_sample_actions = np.concatenate((D_sample_actions, action))
    D_sample_probs = np.concatenate((D_sample_probs, prob))
    # UPDATING REWARD FUNCTION (TAKES IN D_samp, D_demo)
    loss_rew = []
    for _ in range(REWARD_FUNCTION_UPDATE):
        D_samp = D_sample_states.reshape(-1,5)
        selected_samp = np.random.choice(len(D_samp), DEMO_BATCH)
        selected_demo = np.random.choice(len(D_demo_states), DEMO_BATCH)

        D_s_samp_states  = D_samp[selected_samp]
        D_s_samp_actions = D_sample_actions[selected_samp]
        D_s_samp_probs   = D_sample_probs[selected_samp]
        D_s_demo_states  = D_demo_states[selected_demo]
        D_s_demo_actions = D_demo_actions[selected_demo]
        D_s_demo_probs   = D_demo_probs[selected_demo]

        #D̂ samp ← D̂ demo ∪ D̂ samp
        states  = np.concatenate((D_s_demo_states, D_s_samp_states), axis = 0)
        actions = np.concatenate((D_s_demo_actions, D_s_samp_actions), axis = 0)
        probs   = np.concatenate((D_s_demo_probs, D_s_samp_probs), axis = 0)

#         states, probs, actions = D_s_samp[:,:-2], D_s_samp[:,-2], D_s_samp[:,-1]
#         states_expert, actions_expert = D_s_demo[:,:-2], D_s_demo[:,-1]
        states_expert = D_s_demo_states
        actions_expert = D_s_demo_actions

        # Reducing from float64 to float32 for making computaton faster
        states = torch.tensor(states, dtype=torch.float32).cuda()
        probs = torch.tensor(probs, dtype=torch.float32).cuda()
        actions = torch.tensor(actions, dtype=torch.float32).cuda()
        states_expert = torch.tensor(states_expert, dtype=torch.float32).cuda()
        actions_expert = torch.tensor(actions_expert, dtype=torch.float32).cuda()

        costs_samp = cost_f(torch.cat((states, actions.reshape(-1, 1)), dim=-1))
        costs_demo = cost_f(torch.cat((states_expert, actions_expert.reshape(-1, 1)), dim=-1))

        probs = torch.exp(probs)
        # LOSS CALCULATION FOR IOC (COST FUNCTION)
        loss_IOC = torch.mean(costs_demo) + \
                torch.log(torch.mean(torch.exp(-costs_samp)/(probs+1e-7)))
        # UPDATING THE COST FUNCTION
        cost_optimizer.zero_grad()
        loss_IOC.backward()
        cost_optimizer.step()

        loss_rew.append(loss_IOC.detach().cpu().numpy())

    for _ in range(EPISODES_TO_PLAY):
        
        states = torch.tensor(state, dtype=torch.float32).cuda()
        actions = torch.tensor(action, dtype=torch.float32).cuda()
            
        costs = cost_f(torch.cat((states, actions.reshape(-1, 1)), dim=-1)).detach().cpu().numpy()
        cumulative_returns = np.array(get_cumulative_rewards(costs, 0.99))
        cumulative_returns = torch.tensor(cumulative_returns, dtype=torch.float32).cuda()

        mu, sigma = policy.actor(states)
        distribution = Normal(mu, sigma)
#         logits = policy(states)
#         probs = nn.functional.softmax(logits, -1)
#         log_probs = nn.functional.log_softmax(logits, -1)

#         log_probs_for_actions = torch.sum(
#             log_probs * to_one_hot(actions, env.action_space.n), dim=1)
        log_probs = distribution.log_prob(actions)
        log_probs -= torch.log(1-actions.pow(2)+policy.actor.reparam_noise)
        log_probs = log_probs.sum(1, keepdim=True)
        
        probs = torch.exp(log_probs)
    
        entropy = torch.mean(probs*log_probs)
        loss = -torch.mean(log_probs*cumulative_returns -entropy*1e-2) 

        # UPDATING THE POLICY NETWORK
        policy_optimizer.zero_grad()
        loss.backward()
        policy_optimizer.step()
        policy_loss.append(loss.detach().cpu().numpy())

    returns = np.sum(rewards)
    sum_of_cost = np.sum(costs)
    return_list.append(returns)
    sum_of_cost_list.append(sum_of_cost)

    mean_rewards.append(np.mean(return_list))
    mean_costs.append(np.mean(sum_of_cost_list))
    mean_loss_rew.append(np.mean(loss_rew))
    mean_policy_loss.append(np.mean(policy_loss))

    # PLOTTING PERFORMANCE
    if i % 10 == 0:
        # clear_output(True)
        print(f"mean reward:{np.mean(return_list)} loss: {loss_IOC}")

        plt.figure(figsize=[16, 12])
        plt.subplot(2, 2, 1)
        plt.title(f"Mean reward per {EPISODES_TO_PLAY} games")
        plt.plot(mean_rewards)
        plt.grid()

        plt.subplot(2, 2, 2)
        plt.title(f"Mean cost per {EPISODES_TO_PLAY} games")
        plt.plot(mean_costs)
        plt.grid()

        plt.subplot(2, 2, 3)
        plt.title(f"Mean loss per {REWARD_FUNCTION_UPDATE} batches")
        plt.plot(mean_loss_rew)
        plt.grid()
        
        plt.subplot(2, 2, 4)
        plt.title(f"Mean policy loss per {EPISODES_TO_PLAY} games")
        plt.plot(mean_policy_loss)
        plt.grid()

        # plt.show()
        plt.savefig('plots/GCL_learning_curve.png')
        plt.close()

    if np.mean(return_list) > 500:
        break

mean reward:27.0 loss: 15.1333646774292
mean reward:22.454545454545453 loss: 14.933072090148926
mean reward:26.571428571428573 loss: 14.810850143432617
mean reward:25.903225806451612 loss: 14.832557678222656
mean reward:25.365853658536587 loss: 14.829389572143555
mean reward:24.372549019607842 loss: 14.81758975982666
mean reward:23.934426229508198 loss: 14.7814302444458
mean reward:24.464788732394368 loss: 14.804716110229492
mean reward:25.11111111111111 loss: 14.860937118530273
mean reward:24.87912087912088 loss: 14.84914779663086
mean reward:25.633663366336634 loss: 14.832154273986816
mean reward:25.00900900900901 loss: 14.782355308532715
mean reward:25.05785123966942 loss: 14.831594467163086
mean reward:25.633587786259543 loss: 14.78981876373291
mean reward:25.638297872340427 loss: 14.841951370239258
mean reward:25.920529801324502 loss: 14.787618637084961
mean reward:25.48447204968944 loss: 14.811004638671875
mean reward:25.935672514619885 loss: 14.796098709106445
mean reward:25.734