<a href="https://colab.research.google.com/github/IanWangg/SF-For-Transfer/blob/master/SAC.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [2]:
# Mount Google Drive at /content/gdrive so files kept there are reachable.
from google.colab import drive

drive.mount('/content/gdrive')

# IPython magic (not plain Python): change the working directory to the
# workspace folder on the mounted Drive. The relative paths below depend on it.
%cd gdrive/My Drive/Workplace

# !pip install -e ./pybullet-gym

# Put the local checkouts on the import path; the pip install above is
# commented out, so these packages are imported straight from source.
import sys
sys.path.append('./pybullet-gym')
sys.path.append('./pybullet')
sys.path.append('./OSRPG')
# NOTE(review): presumably imported for its side effect of registering the
# pybullet gym environments -- confirm against the pybulletgym package.
import pybulletgym

Mounted at /content/gdrive
/content/gdrive/My Drive/Workplace


In [3]:
import rlkit.torch.pytorch_util as ptu
from rlkit.data_management.env_replay_buffer import EnvReplayBuffer
from rlkit.torch.sac.policies import TanhGaussianPolicy, MakeDeterministic
from rlkit.torch.networks import ConcatMlp

In [16]:
import torch
import torch.nn as nn
import torch.nn.functional as F
import numpy as np
import copy

device = torch.device("cuda" if torch.cuda.is_available() else "cpu")

class ReplayBuffer(object):
    """Fixed-capacity circular store of transitions, sampled uniformly.

    Transitions are kept in pre-allocated NumPy arrays; once ``max_size``
    entries have been written, new entries overwrite the oldest ones.
    """

    def __init__(self, state_dim, action_dim, max_size=int(1e6)):
        """Allocate zero-filled storage.

        Args:
            state_dim: Width of a state / next-state row.
            action_dim: Width of an action row.
            max_size: Number of transitions the buffer can hold.
        """
        self.max_size = max_size
        # Next row to write, and how many rows currently hold data.
        self.ptr, self.size = 0, 0

        # Bookkeeping for the start-state storage below; no method of this
        # class updates these.
        self.start_ptr, self.start_size = 0, 0

        def _rows(width):
            # One (max_size, width) array of zeros per stored field.
            return np.zeros((max_size, width))

        self.state = _rows(state_dim)
        self.action = _rows(action_dim)
        self.next_state = _rows(state_dim)
        self.reward = _rows(1)
        self.not_done = _rows(1)

        self.start_state = _rows(state_dim)

        # Sampled batches are moved to this device.
        self.device = torch.device("cuda" if torch.cuda.is_available() else "cpu")

    def add(self, state, action, next_state, reward, done):
        """Write one transition at the cursor and advance it, wrapping around.

        ``done`` is stored inverted (``1 - done``) in ``not_done``.
        """
        slot = self.ptr
        self.state[slot] = state
        self.action[slot] = action
        self.next_state[slot] = next_state
        self.reward[slot] = reward
        self.not_done[slot] = 1. - done

        self.ptr = (slot + 1) % self.max_size
        if self.size < self.max_size:
            self.size += 1

    def sample(self, batch_size):
        """Draw ``batch_size`` stored transitions uniformly, with replacement.

        Returns:
            Tuple ``(state, action, next_state, reward, not_done)`` of float
            tensors on ``self.device``.
        """
        ind = np.random.randint(self.size, size=batch_size)
        columns = (self.state, self.action, self.next_state, self.reward, self.not_done)
        return tuple(
            torch.FloatTensor(column[ind]).to(self.device) for column in columns
        )


class SAC(object):
    """Soft Actor-Critic agent with twin Q-critics and a learned temperature.

    The actor is an rlkit ``TanhGaussianPolicy``; each critic is an rlkit
    ``ConcatMlp`` over the concatenated (state, action) input and has a
    slowly-tracking target copy.
    """

    def __init__(
        self,
        state_dim,
        action_dim,
        max_action,
        env_name,
        buffer_size=int(1e6),
        discount=0.99,
        tau=5e-3,
        delay=1,
        preserve_SR=False,
    ):
        """Build the networks, optimizers and temperature parameter.

        Args:
            state_dim: Size of an observation vector.
            action_dim: Size of an action vector.
            max_action: Stored on the instance; not used by this class itself.
            env_name: Gym environment id, or an already-constructed
                environment. Only its ``action_space.shape`` is read here,
                to derive the entropy target.
            buffer_size: Capacity of the agent-owned ``self.replay`` buffer.
            discount: Discount factor applied to the bootstrapped target.
            tau: Interpolation weight used by ``soft_update``.
            delay: Target networks are soft-updated on training iterations
                where ``total_it % delay == 0``.
            preserve_SR: Stored on the instance; not read by this class.
        """
        device = torch.device("cuda" if torch.cuda.is_available() else "cpu")

        # Actor: stochastic policy plus a wrapper used for evaluation.
        self.actor = TanhGaussianPolicy(
            obs_dim=state_dim,
            action_dim=action_dim,
            hidden_sizes=[256, 256],
        ).to(device)
        self.actor_optimizer = torch.optim.Adam(self.actor.parameters(), lr=3e-4)

        # NOTE(review): presumably makes the policy act deterministically --
        # confirm against rlkit's MakeDeterministic.
        self.eval_actor = MakeDeterministic(self.actor)

        # Two independent critics, each with its own target copy and optimizer.
        self.critic1 = ConcatMlp(
            input_size=state_dim + action_dim,
            output_size=1,
            hidden_sizes=[256, 256],
        ).to(device)
        self.critic2 = ConcatMlp(
            input_size=state_dim + action_dim,
            output_size=1,
            hidden_sizes=[256, 256],
        ).to(device)
        self.critic_target1 = copy.deepcopy(self.critic1)
        self.critic_target2 = copy.deepcopy(self.critic2)
        self.critic_optimizer1 = torch.optim.Adam(self.critic1.parameters(), lr=3e-4)
        self.critic_optimizer2 = torch.optim.Adam(self.critic2.parameters(), lr=3e-4)

        self.discount = discount
        self.tau = tau

        # Number of calls to train() so far.
        self.total_it = 0

        self.max_action = max_action

        self.replay = ReplayBuffer(state_dim, action_dim, buffer_size)

        # Target-network update period, in training iterations.
        self.delay = delay

        # Preserve SR or not.
        self.preserve_SR = preserve_SR

        # psi_loss check
        self.psi_loss = []

        # Entropy term. Accept either an environment id or an environment
        # object, mirroring what train_agent / eval_policy accept.
        self.env_name = env_name
        if isinstance(env_name, str):
            # Imported here so constructing the agent does not depend on a
            # later notebook cell having imported gym first.
            import gym
            self.env = gym.make(env_name)
        else:
            self.env = env_name
        self.entropy_target = -np.prod(self.env.action_space.shape).item()
        # Create the parameter directly on `device`. Building it elsewhere and
        # calling .to(device) yields a non-leaf tensor whenever the two
        # devices differ, and torch optimizers reject non-leaf tensors.
        self.log_alpha = torch.zeros(1, requires_grad=True, device=device)
        self.alpha_optimizer = torch.optim.Adam(
            [self.log_alpha],
            lr=1e-4,
        )

    def soft_update(self):
        """Move each target critic towards its online critic by a factor ``tau``."""
        pairs = (
            (self.critic1, self.critic_target1),
            (self.critic2, self.critic_target2),
        )
        for online, target in pairs:
            for param, target_param in zip(online.parameters(), target.parameters()):
                target_param.data.copy_(
                    self.tau * param.data + (1 - self.tau) * target_param.data
                )

    def train_all(self, replay_buffer, batch_size=256):
        """Run one gradient step on the temperature, the actor and both critics.

        Args:
            replay_buffer: Object whose ``sample(batch_size)`` returns
                ``(state, action, next_state, reward, not_done)`` tensors.
            batch_size: Number of transitions to sample.
        """
        state, action, next_state, reward, not_done = replay_buffer.sample(batch_size)

        # ---- Temperature and actor losses ----
        dist = self.actor(state)  # action distribution for the sampled states
        new_obs_actions, log_pi = dist.rsample_and_logprob()
        log_pi = log_pi.unsqueeze(-1)
        alpha_loss = -(self.log_alpha * (log_pi + self.entropy_target).detach()).mean()
        alpha = self.log_alpha.exp()

        q_new_actions = torch.min(
            self.critic1(state, new_obs_actions),
            self.critic2(state, new_obs_actions),
        )
        actor_loss = (alpha * log_pi - q_new_actions).mean()

        # ---- Critic losses ----
        q1_pred = self.critic1(state, action)
        q2_pred = self.critic2(state, action)
        # The target is a constant for the critic regression, so no autograd
        # graph is needed for it.
        with torch.no_grad():
            next_dist = self.actor(next_state)
            new_next_actions, new_log_pi = next_dist.rsample_and_logprob()
            new_log_pi = new_log_pi.unsqueeze(-1)
            target_q_values = torch.min(
                self.critic_target1(next_state, new_next_actions),
                self.critic_target2(next_state, new_next_actions),
            ) - alpha * new_log_pi
            q_target = reward + not_done * self.discount * target_q_values
        qf1_loss = F.mse_loss(q1_pred, q_target)
        qf2_loss = F.mse_loss(q2_pred, q_target)

        # ---- Optimizer steps ----
        self.alpha_optimizer.zero_grad()
        alpha_loss.backward()
        self.alpha_optimizer.step()

        self.actor_optimizer.zero_grad()
        actor_loss.backward()
        self.actor_optimizer.step()

        # actor_loss.backward() also left gradients on the critics; each
        # zero_grad() below clears them before the critic's own backward pass.
        self.critic_optimizer1.zero_grad()
        qf1_loss.backward()
        self.critic_optimizer1.step()

        self.critic_optimizer2.zero_grad()
        qf2_loss.backward()
        self.critic_optimizer2.step()

        if self.total_it % self.delay == 0:
            self.soft_update()

    def select_action(self, state):
        """Return the first element of ``self.actor.get_action(state)``."""
        return self.actor.get_action(state)[0]

    def select_eval_action(self, state):
        """Return the first element of ``self.eval_actor.get_action(state)``."""
        return self.eval_actor.get_action(state)[0]

    def train(self, replay_buffer, batch_size=256):
        """Count one training iteration and run ``train_all`` on a sampled batch.

        Args:
            replay_buffer: Buffer to sample the batch from.
            batch_size: Number of transitions to sample; forwarded to
                ``train_all`` so the caller's choice takes effect.
        """
        self.total_it += 1
        self.train_all(replay_buffer, batch_size)

In [19]:
from tqdm import trange
import gym
import pybulletgym

def eval_policy(policy, env, seed, eval_episodes=10):
    """Run ``policy`` for several episodes and return the average episode reward.

    Args:
        policy: Object exposing ``select_eval_action(state)``.
        env: Gym environment id, or an environment object. An id gets a
            fresh environment that is closed before returning. An object is
            used as-is -- it is seeded, reset and stepped here, so the caller
            must not assume its previous episode state survives.
        seed: Base seed; the environment is seeded with ``seed + 100``.
        eval_episodes: Number of episodes to run.

    Returns:
        Sum of all rewards collected, divided by ``eval_episodes``.
    """
    owns_env = isinstance(env, str)
    eval_env = gym.make(env) if owns_env else env
    try:
        eval_env.seed(seed + 100)

        total_reward = 0.
        for _ in range(eval_episodes):
            state, done = eval_env.reset(), False
            while not done:
                action = policy.select_eval_action(np.array(state))
                state, reward, done, _ = eval_env.step(action)
                total_reward += reward
    finally:
        # This function is called repeatedly during training; without closing,
        # every call would leave another environment instance alive.
        if owns_env:
            eval_env.close()

    avg_reward = total_reward / eval_episodes

    print("---------------------------------------")
    print(f"Evaluation over {eval_episodes} episodes: {avg_reward:.3f}")
    print("---------------------------------------")
    return avg_reward

def train_agent(
    agent_func,
    env_name,
    max_steps=int(8e5),
    max_step_before_learning=10000,
    preserve_SR=False,
    batch_size=128,
    eval_freq=1000,
):
    """Collect experience in an environment and train an agent on it.

    Args:
        agent_func: Either a callable invoked as
            ``agent_func(state_dim, action_dim, max_action, env_name)`` to
            build the agent, or an existing agent to keep training.
        env_name: Gym environment id, or an environment object.
        max_steps: Total number of environment steps to run.
        max_step_before_learning: Number of initial steps that use random
            actions and perform no training.
        preserve_SR: Accepted for interface compatibility; not used here.
        batch_size: Batch size passed to ``policy.train`` on each step.
        eval_freq: An evaluation is run every ``eval_freq`` environment steps.

    Returns:
        ``(evaluations, policy)``: the list of average rewards returned by
        ``eval_policy`` (one before training, then one per evaluation), and
        the agent.
    """
    shares_env = not isinstance(env_name, str)
    env = env_name if shares_env else gym.make(env_name)
    state_dim = env.observation_space.shape[0]
    action_dim = env.action_space.shape[0]
    max_action = float(env.action_space.high[0])

    replay_buffer = ReplayBuffer(state_dim, action_dim)

    # If agent_func is a factory, build the agent; otherwise use it as the agent.
    if callable(agent_func):
        policy = agent_func(state_dim, action_dim, max_action, env_name)
    else:
        policy = agent_func

    evaluations = [eval_policy(policy, env_name, 0)]  # 0 is the seed

    state, done = env.reset(), False
    episode_reward = 0
    episode_timesteps = 0
    episode_num = 0

    for t in range(int(max_steps)):

        episode_timesteps += 1

        # Select action randomly or according to policy
        if t < max_step_before_learning:
            action = env.action_space.sample()
        else:
            action = policy.select_action(np.array(state)).clip(-max_action, max_action)

        # Perform action
        next_state, reward, done, _ = env.step(action)
        # An episode that ends only because it hit the step limit is stored
        # as not-done, so the critic target still bootstraps from next_state.
        done_bool = float(done) if episode_timesteps < env._max_episode_steps else 0

        # Store data in replay buffer
        replay_buffer.add(state, action, next_state, reward, done_bool)

        state = next_state
        episode_reward += reward

        # Train agent after collecting sufficient data
        if t >= max_step_before_learning:
            policy.train(replay_buffer, batch_size)

        if done:
            # +1 to account for 0 indexing. +0 on ep_timesteps since it will increment +1 even if done=True
            print(f"Total T: {t+1} Episode Num: {episode_num+1} Episode T: {episode_timesteps} Reward: {episode_reward:.3f}")
            # Reset environment
            state, done = env.reset(), False
            episode_reward = 0
            episode_timesteps = 0
            episode_num += 1

        # Evaluate episode
        if (t + 1) % eval_freq == 0:
            evaluations.append(eval_policy(policy, env_name, 0))
            if shares_env:
                # Evaluation just ran on the training environment itself, so
                # `state` no longer matches it and its last episode is over.
                # Start a fresh training episode instead of stepping from a
                # stale state.
                state, done = env.reset(), False
                episode_reward = 0
                episode_timesteps = 0

    return evaluations, policy

In [None]:
# Enable rlkit's GPU mode only when CUDA is actually present, matching the
# availability check used for `device` above; forcing it on would break on a
# CPU-only runtime.
ptu.set_gpu_mode(torch.cuda.is_available())
# NOTE(review): `eval` shadows the Python builtin for the rest of the session;
# the name is kept because other cells may refer to it.
eval, policy = train_agent(SAC, 'HopperMuJoCoEnv-v0')



WalkerBase::__init__
WalkerBase::__init__
WalkerBase::__init__
---------------------------------------
Evaluation over 10 episodes: 14.178
---------------------------------------
Total T: 7 Episode Num: 1 Episode T: 7 Reward: 3.720
Total T: 15 Episode Num: 2 Episode T: 8 Reward: 5.525
Total T: 18 Episode Num: 3 Episode T: 3 Reward: 2.448
Total T: 21 Episode Num: 4 Episode T: 3 Reward: 1.184
Total T: 24 Episode Num: 5 Episode T: 3 Reward: 1.901
Total T: 30 Episode Num: 6 Episode T: 6 Reward: 4.678
Total T: 41 Episode Num: 7 Episode T: 11 Reward: 9.348
Total T: 46 Episode Num: 8 Episode T: 5 Reward: 3.232
Total T: 48 Episode Num: 9 Episode T: 2 Reward: -0.793
Total T: 54 Episode Num: 10 Episode T: 6 Reward: 2.858
Total T: 56 Episode Num: 11 Episode T: 2 Reward: 1.821
Total T: 63 Episode Num: 12 Episode T: 7 Reward: 4.256
Total T: 66 Episode Num: 13 Episode T: 3 Reward: 3.683
Total T: 68 Episode Num: 14 Episode T: 2 Reward: 1.841
Total T: 73 Episode Num: 15 Episode T: 5 Reward: 3.571
Tota