<a href="https://colab.research.google.com/github/IanWangg/DSFPG/blob/master/Walker_Walk_Backward.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [None]:
# Colab-only setup: mount Google Drive at /content/gdrive.
from google.colab import drive

drive.mount('/content/gdrive')

# Relative path, so it is resolved against the kernel's current working directory.
%cd gdrive/My Drive/Workplace

# Clone the pybullet-gym sources into the directory entered above.
!git clone https://github.com/benelot/pybullet-gym.git

%cd pybullet-gym/

# Editable install of the checkout that was just cloned.
!pip install -e .

Mounted at /content/gdrive
/content/gdrive/My Drive/Workplace
Cloning into 'pybullet-gym'...
remote: Enumerating objects: 804, done.[K
remote: Counting objects: 100% (54/54), done.[K
remote: Compressing objects: 100% (48/48), done.[K
remote: Total 804 (delta 21), reused 28 (delta 6), pack-reused 750[K
Receiving objects: 100% (804/804), 19.31 MiB | 13.52 MiB/s, done.
Resolving deltas: 100% (437/437), done.
Checking out files: 100% (252/252), done.
/content/gdrive/My Drive/Workplace/pybullet-gym
Obtaining file:///content/gdrive/My%20Drive/Workplace/pybullet-gym
Collecting pybullet>=1.7.8
[?25l  Downloading https://files.pythonhosted.org/packages/73/6d/60b97ffc579db665bdd87f2cb47fe1215ae770fbbc1add84ebf36ddca63b/pybullet-3.1.7.tar.gz (79.0MB)
[K     |████████████████████████████████| 79.0MB 37kB/s 
[?25hBuilding wheels for collected packages: pybullet


In [3]:
import gym
import pybulletgym
# Smoke test: build the Walker2D task, reset it once, and show the shape of
# the observation returned by reset() as the cell's output.
env = gym.make('Walker2DMuJoCoEnv-v0')
first_observation = env.reset()
first_observation.shape

WalkerBase::__init__




(17,)

# Define the agent

In [None]:
#@title
import torch
import torch.nn as nn
import torch.nn.functional as F
import numpy as np
import copy

# Module-level default device: CUDA when torch reports it as available, otherwise CPU.
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")

class ReplayBuffer(object):
    """Fixed-capacity ring buffer of transitions backed by NumPy arrays.

    Rows are written at ``ptr``, which wraps around at ``max_size`` so the
    oldest rows get overwritten once the buffer is full. ``size`` counts the
    rows that currently hold data and is what ``sample`` draws indices from.
    """

    def __init__(self, state_dim, action_dim, max_size=int(1e6)):
        self.max_size = max_size
        # Write cursor and number of valid rows.
        self.ptr, self.size = 0, 0

        # Allocated next to the main storage; no method of this class reads
        # or updates these three `start_*` attributes.
        self.start_ptr, self.start_size = 0, 0

        rows = max_size
        self.state = np.zeros((rows, state_dim))
        self.action = np.zeros((rows, action_dim))
        self.next_state = np.zeros((rows, state_dim))
        self.reward = np.zeros((rows, 1))
        self.not_done = np.zeros((rows, 1))

        self.start_state = np.zeros((rows, state_dim))

        self.device = torch.device("cuda" if torch.cuda.is_available() else "cpu")

    def add(self, state, action, next_state, reward, done):
        """Write one transition at the cursor, then advance the cursor."""
        slot = self.ptr
        self.state[slot] = state
        self.action[slot] = action
        self.next_state[slot] = next_state
        self.reward[slot] = reward
        # Stored as a continuation mask: 1.0 while the episode goes on, 0.0 at its end.
        self.not_done[slot] = 1. - done

        self.ptr = (slot + 1) % self.max_size
        self.size = min(self.size + 1, self.max_size)

    def sample(self, batch_size):
        """Draw ``batch_size`` row indices uniformly (with replacement).

        Returns a 5-tuple of float tensors on ``self.device``:
        ``(state, action, next_state, reward, not_done)``.
        """
        ind = np.random.randint(self.size, size=batch_size)
        columns = (self.state, self.action, self.next_state, self.reward, self.not_done)
        return tuple(torch.FloatTensor(column[ind]).to(self.device) for column in columns)


class Encoder_Decoder(nn.Module):
    """Encodes a (state, action) pair into a 256-wide latent and decodes it.

    Three heads read the shared latent: a bias-free linear reward head
    (``r1``), an action reconstruction head (``a1``/``a2``) and a state-sized
    head (``d1``/``d2``).
    """

    def __init__(self, state_dim, action_dim):
        super().__init__()
        width = 256

        # Encoder trunk.
        self.e1 = nn.Linear(state_dim + action_dim, width)
        self.e2 = nn.Linear(width, width)

        # Reward head: a single linear map of the latent, without bias.
        self.r1 = nn.Linear(width, 1, bias=False)

        # Action head.
        self.a1 = nn.Linear(width, width)
        self.a2 = nn.Linear(width, action_dim)

        # State-sized head.
        self.d1 = nn.Linear(width, width)
        self.d2 = nn.Linear(width, state_dim)

    def latent(self, state, action):
        """Return only the latent code (two ReLU layers over cat[state, action])."""
        joint = torch.cat([state, action], 1)
        return F.relu(self.e2(F.relu(self.e1(joint))))

    def forward(self, state, action):
        """Return ``(state_head, reward_head, action_head, latent)``."""
        code = self.latent(state, action)

        reward_out = self.r1(code)
        state_out = self.d2(F.relu(self.d1(code)))
        action_out = self.a2(F.relu(self.a1(code)))

        return state_out, reward_out, action_out, code


class Critic(nn.Module):
    """MLP over cat[state, action] whose output is a 256-wide vector.

    The output width equals the hidden width (``l3`` is 256 -> 256), so the
    network emits one value per feature rather than a single scalar.
    """

    def __init__(self, state_dim, action_dim):
        super().__init__()

        self.l1 = nn.Linear(state_dim + action_dim, 256)
        self.l2 = nn.Linear(256, 256)
        self.l3 = nn.Linear(256, 256)

    def forward(self, state, action):
        """Two ReLU hidden layers followed by a linear output layer."""
        hidden = torch.cat([state, action], 1)
        for layer in (self.l1, self.l2):
            hidden = F.relu(layer(hidden))
        return self.l3(hidden)
  

class Actor(nn.Module):
    """Deterministic policy: state -> action in ``[-max_action, max_action]``.

    Args:
        state_dim: width of the state vector.
        action_dim: width of the action vector.
        max_action: scale applied to the tanh output.
    """

    def __init__(self, state_dim, action_dim, max_action):
        super(Actor, self).__init__()

        self.l1 = nn.Linear(state_dim, 256)
        self.l2 = nn.Linear(256, 256)
        self.l3 = nn.Linear(256, action_dim)
        
        self.max_action = max_action

    def forward(self, state):
        """Compute the action for ``state``.

        ``state`` may be a tensor, a NumPy array or a nested sequence. It is
        converted to the dtype and device of this module's own weights, so
        float64 observations coming straight from an environment work, and
        the module does not depend on a global ``device`` variable.
        ``torch.as_tensor`` returns a tensor input unchanged when dtype and
        device already match, which keeps it attached to the autograd graph
        and avoids the copy (and warning) that ``torch.tensor(tensor)`` makes.
        """
        weight = self.l1.weight
        state = torch.as_tensor(state, dtype=weight.dtype, device=weight.device)
        a = F.relu(self.l1(state))
        a = F.relu(self.l2(a))
        return self.max_action * torch.tanh(self.l3(a))


class DSFPG(object):
    """Actor-critic agent built on learned latent features.

    Components:
      * ``encoder_decoder`` maps (state, action) to a 256-wide latent and is
        trained to reconstruct next state, reward and action from it.
      * ``critic`` outputs a 256-wide vector regressed toward
        ``latent + discount * not_done * critic_target(next_state, next_action)``.
      * ``W`` is a ``(1, 256)`` weight vector fitted so that
        ``sum(latent * W)`` matches the observed reward.
      * ``actor`` is trained to maximise ``sum(critic(state, actor(state)) * W)``.

    Args:
        state_dim: width of the state vector.
        action_dim: width of the action vector.
        max_action: bound used for the actor output and for clamping.
        max_step_before_learning: updates start once the replay buffer holds
            more than this many transitions.
        buffer_size: capacity of the replay buffer.
        discount: discount factor used in the critic target.
        tau: soft-update rate for the target networks.
    """

    def __init__(
        self,
        state_dim,
        action_dim,
        max_action,
        max_step_before_learning,
        buffer_size=int(1e6),
        discount=0.99,
        tau=0.005
    ):
        device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
        self.device = device

        self.encoder_decoder = Encoder_Decoder(state_dim, action_dim).to(device)
        self.ed_optimizer = torch.optim.Adam(self.encoder_decoder.parameters(), lr=3e-4)

        self.actor = Actor(state_dim, action_dim, max_action).to(device)
        self.actor_target = copy.deepcopy(self.actor)
        self.actor_optimizer = torch.optim.Adam(self.actor.parameters(), lr=3e-4)

        self.critic = Critic(state_dim, action_dim).to(device)
        self.critic_target = copy.deepcopy(self.critic)
        self.critic_optimizer = torch.optim.Adam(self.critic.parameters(), lr=3e-4)

        self.W = torch.ones(1, 256, requires_grad=True, device=device)
        self.W_optimizer = torch.optim.Adam([self.W], lr=3e-4)

        self.discount = discount
        self.tau = tau
        self.max_step_before_learning = max_step_before_learning

        # Number of train() calls made so far.
        self.total_it = 0

        self.max_action = max_action

        self.replay = ReplayBuffer(state_dim, action_dim, buffer_size)

    # ------------------------------------------------------------------ helpers

    def _ready(self):
        """True once the buffer holds more than ``max_step_before_learning`` rows."""
        return self.replay.size > self.max_step_before_learning

    def _soft_update(self, net, target_net):
        """Polyak-average ``net``'s parameters into ``target_net`` with rate ``tau``."""
        for param, target_param in zip(net.parameters(), target_net.parameters()):
            target_param.data.copy_(self.tau * param.data + (1 - self.tau) * target_param.data)

    def _update_encoder_decoder(self, batch_size):
        """One gradient step on the reconstruction loss of the encoder-decoder."""
        state, action, next_state, reward, not_done = self.replay.sample(batch_size)

        recons_next, recons_reward, recons_action, lat = self.encoder_decoder(state, action)
        ed_loss = (
            F.mse_loss(recons_next, next_state)
            + 0.1 * F.mse_loss(recons_reward, reward)
            + F.mse_loss(recons_action, action)
        )

        self.ed_optimizer.zero_grad()
        ed_loss.backward()
        self.ed_optimizer.step()

    def _update_SR(self, batch_size):
        """One TD step on the critic, followed by a soft update of its target."""
        state, action, next_state, reward, not_done = self.replay.sample(batch_size)

        with torch.no_grad():
            next_action = self.actor_target(next_state)
            # Add noise to the next action; remove this if the results are not ideal.
            next_action = (
                next_action + torch.randn_like(next_action) * self.max_action * 0.1
            ).clamp(-self.max_action, self.max_action)

            latent = self.encoder_decoder.latent(state, action)
            target_Q = latent + self.discount * not_done * self.critic_target(next_state, next_action)

        current_Q = self.critic(state, action)
        critic_loss = F.mse_loss(current_Q, target_Q)

        self.critic_optimizer.zero_grad()
        critic_loss.backward()
        self.critic_optimizer.step()

        self._soft_update(self.critic, self.critic_target)

    def _update_w(self, batch_size):
        """One gradient step fitting ``W`` so that ``sum(latent * W)`` predicts reward."""
        state, action, next_state, reward, not_done = self.replay.sample(batch_size)

        with torch.no_grad():
            # Only the latent is needed; calling the module itself returns a
            # 4-tuple, which cannot be multiplied by W.
            latent = self.encoder_decoder.latent(state, action)

        # Reduce over the feature axis so the estimate has the reward's
        # (batch, 1) shape instead of being broadcast against it.
        reward_estimate = (latent * self.W).sum(dim=1, keepdim=True)
        W_loss = F.mse_loss(reward_estimate, reward)

        self.W_optimizer.zero_grad()
        W_loss.backward()
        self.W_optimizer.step()

    def _update_actor(self, batch_size):
        """One policy-gradient step, followed by a soft update of ``actor_target``."""
        state, action, next_state, reward, not_done = self.replay.sample(batch_size)

        features = self.critic(state, self.actor(state))
        # W is detached: this step trains the actor only.
        q_value = (features * self.W.detach()).sum(dim=1, keepdim=True)
        actor_loss = -q_value.mean()

        self.actor_optimizer.zero_grad()
        actor_loss.backward()
        self.actor_optimizer.step()

        # Without this the target actor used by the critic update would stay
        # frozen at its initial weights.
        self._soft_update(self.actor, self.actor_target)

    # --------------------------------------------------------------- public API

    def train_encoder_decoder(self, state, action, next_state, reward, done, batch_size=256):
        """Store the transition, then update the encoder-decoder if ready."""
        self.replay.add(state, action, next_state, reward, done)
        if self._ready():
            self._update_encoder_decoder(batch_size)

    def train_SR(self, state, action, next_state, reward, done, batch_size=256):
        """Store the transition, then update the critic if ready."""
        self.replay.add(state, action, next_state, reward, done)
        if self._ready():
            self._update_SR(batch_size)

    def train_w(self, state, action, next_state, reward, done, batch_size=256):
        """Store the transition, then update ``W`` if ready."""
        self.replay.add(state, action, next_state, reward, done)
        if self._ready():
            self._update_w(batch_size)

    def train_actor(self, state, action, next_state, reward, done, batch_size=256):
        """Store the transition, then update the actor if ready."""
        self.replay.add(state, action, next_state, reward, done)
        if self._ready():
            self._update_actor(batch_size)

    def select_action(self, state):
        """Return the actor's action for ``state`` as a CPU NumPy array.

        The state is cast to float32 on the agent's device first. The result
        is moved back to NumPy so it can be handed to an environment and
        written into the NumPy-backed replay buffer.
        """
        state = torch.as_tensor(state, dtype=torch.float32, device=self.device)
        with torch.no_grad():
            action = self.actor(state)
        return action.cpu().numpy()

    def train(self, state, action, next_state, reward, done, batch_size=256):
        """Store the transition once and run all four updates if ready.

        The transition is added here a single time; the individual
        ``train_*`` methods each add it themselves, so chaining them would
        store every transition four times.
        """
        self.replay.add(state, action, next_state, reward, done)
        self.total_it += 1

        if not self._ready():
            return

        self._update_encoder_decoder(batch_size)
        self._update_SR(batch_size)
        self._update_w(batch_size)
        self._update_actor(batch_size)

    def state_dict(self):
        """Return the learnable state of every component as a plain dict."""
        return {
            "encoder_decoder": self.encoder_decoder.state_dict(),
            "actor": self.actor.state_dict(),
            "actor_target": self.actor_target.state_dict(),
            "critic": self.critic.state_dict(),
            "critic_target": self.critic_target.state_dict(),
            "W": self.W.detach().clone(),
        }

    def load_state_dict(self, state):
        """Restore a dict produced by :meth:`state_dict`."""
        self.encoder_decoder.load_state_dict(state["encoder_decoder"])
        self.actor.load_state_dict(state["actor"])
        self.actor_target.load_state_dict(state["actor_target"])
        self.critic.load_state_dict(state["critic"])
        self.critic_target.load_state_dict(state["critic_target"])
        self.W.data.copy_(state["W"])

# Use PyBullet Built-in Locomotion Environment

In [None]:
from tqdm import trange
import gym
import pybulletgym

def _collect_agent_state(agent):
    """Return an object ``torch.save`` can serialise for ``agent``.

    Uses ``agent.state_dict()`` when the agent provides one. Otherwise it
    gathers the ``state_dict`` of every ``nn.Module`` attribute and a detached
    copy of every tensor attribute found directly on the agent.
    """
    state_dict_fn = getattr(agent, 'state_dict', None)
    if callable(state_dict_fn):
        return state_dict_fn()

    snapshot = {}
    for name, value in vars(agent).items():
        if isinstance(value, torch.nn.Module):
            snapshot[name] = value.state_dict()
        elif torch.is_tensor(value):
            snapshot[name] = value.detach().clone()
    return snapshot


def train_agent(agent_func, 
                env_name, # environment id string, passed to gym.make
                runs=1,
                max_steps=int(1e6),
                max_step_before_learning=int(3e4)
                ):
    """Train ``runs`` independent agents on ``env_name`` and save each one.

    Args:
        agent_func: agent class/factory called with ``state_dim``,
            ``action_dim``, ``max_action``, ``max_step_before_learning`` and
            ``buffer_size`` keyword arguments. The agent must provide
            ``select_action(state)`` and
            ``train(state, action, next_state, reward, done)``.
        env_name: gym environment id.
        runs: number of independent runs; run ``i`` uses seed ``i + 100``.
        max_steps: environment steps per run (also used as buffer size).
        max_step_before_learning: forwarded to the agent.

    Returns:
        ``(agents, returns_timing, returns_value)``. One entry per run:
        the trained agent, an array of the step counts at which an episode
        was reset, and an array of the return accumulated before that reset.
        The first pair of every run is ``(0, 0)`` from the initial reset; an
        episode still in progress when ``max_steps`` is reached is not
        recorded.
    """
    import os  # local import: only needed for creating the output directory

    returns_timing = []
    returns_value = []
    agents = []
    env = gym.make(env_name)
    state_dim = env.observation_space.shape[0]
    action_dim = env.action_space.shape[0]
    max_action = float(env.action_space.high[0])

    save_dir = './state_dict'
    os.makedirs(save_dir, exist_ok=True)
    
    for run in trange(runs, desc='runs'):
        seed = run + 100
        env.seed(seed)
        # Seed the global RNGs too, so weight init and replay sampling repeat.
        np.random.seed(seed)
        torch.manual_seed(seed)

        total_steps = 0
        done = True
        # Each element of `rewards` is [total_steps, episodic_return], appended
        # whenever the environment is reset.
        rewards = []
        episodic_return = 0
        agent = agent_func(state_dim=state_dim,
                            action_dim=action_dim,
                            max_action=max_action,
                            max_step_before_learning=max_step_before_learning,
                            buffer_size=max_steps)

        while total_steps < max_steps:
            if done:
                state = env.reset()
                rewards.append([total_steps, episodic_return])
                episodic_return = 0

            # float32 matches the default dtype of torch network weights.
            action = agent.select_action(np.asarray(state, dtype=np.float32))
            if torch.is_tensor(action):
                # The environment and a NumPy replay buffer need a host array.
                action = action.detach().cpu().numpy()

            next_state, reward, done, info = env.step(action)
            agent.train(state, action, next_state, reward, done)
            episodic_return += reward

            # Advance the rollout; without these the loop never terminates and
            # the agent keeps acting on the reset observation.
            state = next_state
            total_steps += 1
        
        # reshape keeps the column indexing valid even when nothing was recorded.
        rewards = np.asarray(rewards, dtype=np.float64).reshape(-1, 2)
        returns_timing.append(rewards[:, 0])
        returns_value.append(rewards[:, 1])
        agents.append(agent)

        filename = f'{save_dir}/{agent_func.__name__}-{env_name}-{seed}.pt'
        torch.save(_collect_agent_state(agent), filename)
    
    return agents, returns_timing, returns_value

In [None]:
# Train DSFPG on Walker2D using train_agent's defaults for runs, max_steps and
# max_step_before_learning. `t` and `r` receive train_agent's second and third
# return values (returns_timing and returns_value).
agents, t, r = train_agent(agent_func=DSFPG,
                           env_name='Walker2DMuJoCoEnv-v0')




runs:   0%|          | 0/1 [00:00<?, ?it/s][A[A[A

WalkerBase::__init__




NameError: ignored

In [None]:
def plot_rewards(rewards, plot_seperate=True , clip=int(1e6), title='unnamed', smooth=5000):
    """Plot episodic returns against timesteps for one or more agents.

    Relies on ``plt`` (used as matplotlib.pyplot), ``np`` and a
    ``convolve(series, window)`` helper being available in the notebook
    namespace; none of them is defined in this cell.
    NOTE(review): ``convolve`` must return ``clip - smooth + 1`` samples,
    because that is the width of the row its result is assigned into.

    Args:
        rewards: mapping ``label -> (timings, values)`` where ``timings`` and
            ``values`` are equally long sequences holding one array per run.
        plot_seperate: if True, draw every run as its own raw curve; if
            False, interpolate each run onto ``range(clip)``, smooth it and
            draw the mean with a one-standard-deviation band per label.
        clip: number of timesteps the runs are interpolated onto.
        title: figure title.
        smooth: window passed to ``convolve`` (aggregated mode only).

    Raises:
        ValueError: in aggregated mode, if ``smooth`` is not in ``[1, clip]``.
    """
    colors = ['red', 'blue', 'green', 'm', 'k', 'y', '#999999']
    
    plt.figure(figsize=(16,6), dpi=200)
    if plot_seperate:
        for j, (k, v) in enumerate(rewards.items()):
            # Wrap around so more labels than colours does not raise IndexError.
            color = colors[j % len(colors)]
            for i, (t, r) in enumerate(zip(v[0], v[1])):
                # Label only the first run so the legend has one entry per label.
                plt.plot(t, r, color=color, label=k if i == 0 else '_nolegend_')
        plt.xlabel('timesteps')
        plt.ylabel('episodic returns')
        plt.title(title)
        plt.legend()
        plt.show()
        return

    if not 1 <= smooth <= clip:
        raise ValueError(f'smooth must be between 1 and clip ({clip}), got {smooth}')

    smoothed_len = clip - smooth + 1
    timeline = np.arange(clip)
    for j, (k, v) in enumerate(rewards.items()):
        r_vec = np.zeros((len(v[0]), smoothed_len))
        for i, (t, r) in enumerate(zip(v[0], v[1])):
            r_vec[i,:] = convolve(np.interp(timeline, t, r), smooth)
    
        mean = np.mean(r_vec, axis=0)
        std = np.std(r_vec, axis=0)
        color = colors[j % len(colors)]
        plt.plot(mean, label=k, color=color)
        plt.fill_between(np.arange(0, len(mean)), mean+std, mean-std, facecolor=color, alpha=0.3)
    
    plt.xlabel('timesteps')
    plt.ylabel('episodic returns')
    plt.title(title)
    plt.legend(loc='lower right')
    plt.show()