<a href="https://colab.research.google.com/github/IanWangg/DSFPG/blob/master/TD3_walk_backward.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [None]:
# Colab-only setup: mount Google Drive under /content/gdrive.
from google.colab import drive

drive.mount('/content/gdrive')

# IPython magic (not plain Python): change the working directory.
# The path is relative, so the result depends on the current directory
# at the time the cell runs.
%cd gdrive/My Drive/Workplace

# !pip install -e ./pybullet-gym

# Extend the import path with the local checkouts before importing
# pybulletgym below.
import sys
sys.path.append('./pybullet-gym')
sys.path.append('./pybullet')
# NOTE(review): presumably imported for its side effect of registering the
# PyBullet environments with gym -- confirm against the pybulletgym package.
import pybulletgym

Mounted at /content/gdrive
/content/gdrive/My Drive/Workplace


In [None]:
import copy
import numpy as np
import torch
import torch.nn as nn
import torch.nn.functional as F


# Module-wide torch device: CUDA when torch reports it available, otherwise CPU.
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")

# Implementation of Twin Delayed Deep Deterministic Policy Gradients (TD3)
# Paper: https://arxiv.org/abs/1802.09477

class ReplayBuffer(object):
    """Fixed-capacity ring buffer of transitions backed by NumPy arrays.

    Transitions are written at ``ptr``; once ``max_size`` entries have been
    added the write position wraps to 0 and the oldest rows are overwritten.
    ``size`` is the number of valid rows and never exceeds ``max_size``.
    """

    def __init__(self, state_dim, action_dim, max_size=int(1e6)):
        def zeros(width):
            # One row per stored transition.
            return np.zeros((max_size, width))

        self.max_size = max_size
        self.ptr = self.size = 0
        self.start_ptr = self.start_size = 0

        self.state = zeros(state_dim)
        self.action = zeros(action_dim)
        self.next_state = zeros(state_dim)
        self.reward = zeros(1)
        self.not_done = zeros(1)

        self.start_state = zeros(state_dim)

        device_name = "cuda" if torch.cuda.is_available() else "cpu"
        self.device = torch.device(device_name)

    def add(self, state, action, next_state, reward, done):
        """Write one transition at the current position and advance it."""
        slot = self.ptr
        self.state[slot] = state
        self.action[slot] = action
        self.next_state[slot] = next_state
        self.reward[slot] = reward
        # Stored inverted so it can multiply the bootstrap term directly.
        self.not_done[slot] = 1. - done

        self.ptr = (slot + 1) % self.max_size
        if self.size < self.max_size:
            self.size += 1

    def sample(self, batch_size):
        """Draw ``batch_size`` rows uniformly at random, with replacement.

        Returns a tuple ``(state, action, next_state, reward, not_done)`` of
        float tensors placed on ``self.device``.
        """
        ind = np.random.randint(self.size, size=batch_size)
        columns = (
            self.state,
            self.action,
            self.next_state,
            self.reward,
            self.not_done,
        )
        return tuple(
            torch.FloatTensor(column[ind]).to(self.device) for column in columns
        )


class Actor(nn.Module):
    """Policy network mapping a batch of states to bounded actions.

    Two ReLU hidden layers of width 256 feed a tanh output layer whose
    result is multiplied by ``max_action``.
    """

    def __init__(self, state_dim, action_dim, max_action):
        super().__init__()
        width = 256

        # Attribute names l1..l3 are the state_dict keys; keep them stable.
        self.l1 = nn.Linear(state_dim, width)
        self.l2 = nn.Linear(width, width)
        self.l3 = nn.Linear(width, action_dim)

        self.max_action = max_action

    def forward(self, state):
        """Return ``max_action * tanh(l3(relu(l2(relu(l1(state))))))``."""
        hidden = state
        for layer in (self.l1, self.l2):
            hidden = F.relu(layer(hidden))
        squashed = torch.tanh(self.l3(hidden))
        return self.max_action * squashed


class Critic(nn.Module):
    """Pair of independent Q-networks over concatenated (state, action).

    Layers ``l1``-``l3`` form the first estimator and ``l4``-``l6`` the
    second; each has two ReLU hidden layers of width 256 and a scalar output.
    """

    def __init__(self, state_dim, action_dim):
        super().__init__()
        in_features = state_dim + action_dim

        # First estimator (Q1). Attribute names are the state_dict keys.
        self.l1 = nn.Linear(in_features, 256)
        self.l2 = nn.Linear(256, 256)
        self.l3 = nn.Linear(256, 1)

        # Second estimator (Q2).
        self.l4 = nn.Linear(in_features, 256)
        self.l5 = nn.Linear(256, 256)
        self.l6 = nn.Linear(256, 1)

    @staticmethod
    def _head(sa, first, second, out):
        """Run one three-layer estimator on the joined state-action input."""
        return out(F.relu(second(F.relu(first(sa)))))

    def forward(self, state, action):
        """Return both estimates ``(q1, q2)`` for the given batch."""
        sa = torch.cat((state, action), dim=1)
        q1 = self._head(sa, self.l1, self.l2, self.l3)
        q2 = self._head(sa, self.l4, self.l5, self.l6)
        return q1, q2

    def Q1(self, state, action):
        """Return only the first estimator's value (skips the Q2 layers)."""
        sa = torch.cat((state, action), dim=1)
        return self._head(sa, self.l1, self.l2, self.l3)


class TD3(object):
    """Twin Delayed Deep Deterministic Policy Gradient agent.

    Paper: https://arxiv.org/abs/1802.09477

    Holds an actor, a twin critic, a target copy of each, and one Adam
    optimizer (lr 3e-4) per online network. All networks live on the
    module-level ``device``.

    Args:
        state_dim: Size of the observation vector.
        action_dim: Size of the action vector.
        max_action: Bound used to scale the actor output and to clamp the
            smoothed target action.
        discount: Discount factor applied to the bootstrapped target.
        tau: Interpolation weight of the online parameters in the soft
            target update.
        policy_noise: Standard deviation of the noise added to the target
            action.
        noise_clip: Absolute clamp applied to that noise.
        policy_freq: The actor and the targets are updated once every
            ``policy_freq`` calls to :meth:`train`.
    """

    def __init__(
        self,
        state_dim,
        action_dim,
        max_action,
        discount=0.99,
        tau=0.005,
        policy_noise=0.2,
        noise_clip=0.5,
        policy_freq=2
    ):
        self.actor = Actor(state_dim, action_dim, max_action).to(device)
        self.actor_target = copy.deepcopy(self.actor)
        self.actor_optimizer = torch.optim.Adam(self.actor.parameters(), lr=3e-4)

        self.critic = Critic(state_dim, action_dim).to(device)
        self.critic_target = copy.deepcopy(self.critic)
        self.critic_optimizer = torch.optim.Adam(self.critic.parameters(), lr=3e-4)

        self.max_action = max_action
        self.discount = discount
        self.tau = tau
        self.policy_noise = policy_noise
        self.noise_clip = noise_clip
        self.policy_freq = policy_freq

        # Number of train() calls so far; drives the delayed policy update.
        self.total_it = 0

    def select_action(self, state):
        """Return the actor's action for one state as a flat NumPy array.

        ``state`` may be any array-like; it is converted to a float32 row
        vector. No exploration noise is added here.
        """
        batch = np.asarray(state, dtype=np.float32).reshape(1, -1)
        # Inference only: skip building an autograd graph.
        with torch.no_grad():
            action = self.actor(torch.as_tensor(batch, device=device))
        return action.cpu().numpy().flatten()

    def train(self, replay_buffer, batch_size=256):
        """Run one update step from a batch sampled out of ``replay_buffer``.

        The critic is updated on every call. The actor and both target
        networks are updated only when the call count is a multiple of
        ``policy_freq``.
        """
        self.total_it += 1

        # Sample replay buffer
        state, action, next_state, reward, not_done = replay_buffer.sample(batch_size)

        with torch.no_grad():
            # Select action according to the target policy and add clipped noise
            noise = (
                torch.randn_like(action) * self.policy_noise
            ).clamp(-self.noise_clip, self.noise_clip)

            next_action = (
                self.actor_target(next_state) + noise
            ).clamp(-self.max_action, self.max_action)

            # Target uses the smaller of the two target-critic estimates
            target_Q1, target_Q2 = self.critic_target(next_state, next_action)
            target_Q = torch.min(target_Q1, target_Q2)
            # not_done zeroes the bootstrap term on terminal transitions
            target_Q = reward + not_done * self.discount * target_Q

        # Get current Q estimates
        current_Q1, current_Q2 = self.critic(state, action)

        # Both estimators regress onto the same target
        critic_loss = F.mse_loss(current_Q1, target_Q) + F.mse_loss(current_Q2, target_Q)

        # Optimize the critic
        self.critic_optimizer.zero_grad()
        critic_loss.backward()
        self.critic_optimizer.step()

        # Delayed policy updates
        if self.total_it % self.policy_freq == 0:

            # Compute actor loss: maximize Q1 of the actor's own action
            actor_loss = -self.critic.Q1(state, self.actor(state)).mean()

            # Optimize the actor
            self.actor_optimizer.zero_grad()
            actor_loss.backward()
            self.actor_optimizer.step()

            # Update the frozen target models
            self._soft_update(self.critic, self.critic_target)
            self._soft_update(self.actor, self.actor_target)

    def _soft_update(self, online, target):
        """Move ``target`` parameters toward ``online`` by a factor ``tau``."""
        with torch.no_grad():
            for param, target_param in zip(online.parameters(), target.parameters()):
                target_param.copy_(self.tau * param + (1 - self.tau) * target_param)

    def save(self, filename):
        """Write actor/critic weights and optimizer states.

        Four files are created, named ``filename`` plus the suffixes
        ``_critic``, ``_critic_optimizer``, ``_actor`` and
        ``_actor_optimizer``. Target networks are not saved.
        """
        torch.save(self.critic.state_dict(), filename + "_critic")
        torch.save(self.critic_optimizer.state_dict(), filename + "_critic_optimizer")

        torch.save(self.actor.state_dict(), filename + "_actor")
        torch.save(self.actor_optimizer.state_dict(), filename + "_actor_optimizer")

    def load(self, filename):
        """Restore the files written by :meth:`save`.

        Tensors are remapped onto the current ``device`` so a checkpoint
        saved on GPU can be loaded on a CPU-only machine. Both target
        networks are reset to copies of the loaded online networks.
        """
        self.critic.load_state_dict(
            torch.load(filename + "_critic", map_location=device))
        self.critic_optimizer.load_state_dict(
            torch.load(filename + "_critic_optimizer", map_location=device))
        self.critic_target = copy.deepcopy(self.critic)

        self.actor.load_state_dict(
            torch.load(filename + "_actor", map_location=device))
        self.actor_optimizer.load_state_dict(
            torch.load(filename + "_actor_optimizer", map_location=device))
        self.actor_target = copy.deepcopy(self.actor)
		

# Train / Evaluate the agent

In [None]:
from tqdm import trange
import gym
import pybulletgym

def eval_policy(policy, env_name, seed, eval_episodes=10):
    """Roll out ``policy`` without exploration noise and report its mean return.

    A fresh environment is created for the evaluation, seeded with
    ``seed + 100``, and closed again before returning so that repeated
    evaluations do not accumulate open environments.

    Args:
        policy: Object exposing ``select_action(state)``.
        env_name: Environment id passed to ``gym.make``.
        seed: Base seed; the environment is seeded with ``seed + 100``.
        eval_episodes: Number of full episodes to run.

    Returns:
        The sum of all rewards divided by ``eval_episodes``.
    """
    eval_env = gym.make(env_name)
    try:
        eval_env.seed(seed + 100)

        total_reward = 0.
        for _ in range(eval_episodes):
            state, done = eval_env.reset(), False
            while not done:
                action = policy.select_action(np.array(state))
                state, reward, done, _ = eval_env.step(action)
                total_reward += reward
    finally:
        # Release the environment even if a rollout raises.
        eval_env.close()

    avg_reward = total_reward / eval_episodes

    print("---------------------------------------")
    print(f"Evaluation over {eval_episodes} episodes: {avg_reward:.3f}")
    print("---------------------------------------")
    return avg_reward

def train_agent(
    agent_func,
    env_name,
    max_steps=int(1e6),
    max_step_before_learning=10000,
    eval_freq=1000,
    batch_size=256,
    seed=0,
    expl_noise=0.1,
):
    """Train an off-policy agent on ``env_name`` with periodic evaluation.

    For the first ``max_step_before_learning`` steps actions are sampled
    uniformly from the action space and no learning happens. Afterwards the
    policy's action plus Gaussian noise is used and the agent is trained
    once per environment step.

    Args:
        agent_func: Callable ``(state_dim, action_dim, max_action)`` that
            returns an agent exposing ``select_action(state)`` and
            ``train(replay_buffer, batch_size)``.
        env_name: Environment id passed to ``gym.make``.
        max_steps: Total number of environment steps.
        max_step_before_learning: Number of initial random-action steps.
        eval_freq: Evaluate the policy every ``eval_freq`` steps.
        batch_size: Batch size passed to ``train``.
        seed: Seed forwarded to ``eval_policy``.
        expl_noise: Standard deviation of the exploration noise, as a
            fraction of ``max_action``.

    Returns:
        List of evaluation scores: one taken before training and one after
        every ``eval_freq`` steps.
    """
    env = gym.make(env_name)
    try:
        state_dim = env.observation_space.shape[0]
        action_dim = env.action_space.shape[0]
        max_action = float(env.action_space.high[0])

        # Episode step limit, looked up once. Without one, every `done`
        # is treated as a genuine terminal state.
        max_episode_steps = getattr(env, "_max_episode_steps", None)
        if max_episode_steps is None:
            max_episode_steps = float("inf")

        replay_buffer = ReplayBuffer(state_dim, action_dim)
        policy = agent_func(state_dim, action_dim, max_action)
        evaluations = [eval_policy(policy, env_name, seed)]

        state, done = env.reset(), False
        episode_reward = 0
        episode_timesteps = 0
        episode_num = 0

        for t in range(int(max_steps)):

            episode_timesteps += 1

            # Select action randomly or according to policy
            if t < max_step_before_learning:
                action = env.action_space.sample()
            else:
                action = (
                    policy.select_action(np.array(state))
                    + np.random.normal(0, max_action * expl_noise, size=action_dim)
                ).clip(-max_action, max_action)

            # Perform action
            next_state, reward, done, _ = env.step(action)
            # Hitting the step limit is not a terminal state: store done=0
            # so the critic still bootstraps from next_state.
            done_bool = float(done) if episode_timesteps < max_episode_steps else 0

            # Store data in replay buffer
            replay_buffer.add(state, action, next_state, reward, done_bool)

            state = next_state
            episode_reward += reward

            # Train agent after collecting sufficient data
            if t >= max_step_before_learning:
                policy.train(replay_buffer, batch_size)

            if done:
                # t is 0-indexed, hence t+1; episode_timesteps was already
                # incremented for this step.
                print(f"Total T: {t+1} Episode Num: {episode_num+1} Episode T: {episode_timesteps} Reward: {episode_reward:.3f}")
                # Reset environment
                state, done = env.reset(), False
                episode_reward = 0
                episode_timesteps = 0
                episode_num += 1

            # Evaluate episode
            if (t + 1) % eval_freq == 0:
                evaluations.append(eval_policy(policy, env_name, seed))
    finally:
        # Release the training environment even if interrupted.
        env.close()

    return evaluations



In [None]:
# Train TD3 on HopperMuJoCoEnv-v0 with train_agent's default settings and
# keep the list of evaluation scores it returns.
# NOTE(review): the name `eval` rebinds the Python builtin of the same name
# for the rest of the session; consider renaming if nothing depends on it.
eval = train_agent(TD3, 'HopperMuJoCoEnv-v0')



WalkerBase::__init__
WalkerBase::__init__
---------------------------------------
Evaluation over 10 episodes: 4.733
---------------------------------------
Total T: 3 Episode Num: 1 Episode T: 3 Reward: 1.912
Total T: 6 Episode Num: 2 Episode T: 3 Reward: 2.012
Total T: 9 Episode Num: 3 Episode T: 3 Reward: 1.373
Total T: 15 Episode Num: 4 Episode T: 6 Reward: 5.352
Total T: 19 Episode Num: 5 Episode T: 4 Reward: 1.781
Total T: 22 Episode Num: 6 Episode T: 3 Reward: 2.056
Total T: 26 Episode Num: 7 Episode T: 4 Reward: 2.712
Total T: 30 Episode Num: 8 Episode T: 4 Reward: 3.715
Total T: 33 Episode Num: 9 Episode T: 3 Reward: 0.163
Total T: 36 Episode Num: 10 Episode T: 3 Reward: 1.548
Total T: 40 Episode Num: 11 Episode T: 4 Reward: 2.646
Total T: 42 Episode Num: 12 Episode T: 2 Reward: 1.343
Total T: 48 Episode Num: 13 Episode T: 6 Reward: 3.488
Total T: 54 Episode Num: 14 Episode T: 6 Reward: 3.184
Total T: 61 Episode Num: 15 Episode T: 7 Reward: 4.350
Total T: 66 Episode Num: 16 Ep

KeyboardInterrupt: ignored

# Training Function

In [None]:
from tqdm import trange
import gym
import pybulletgym

def mini_batch_rl(agent_func,
                env_name, # environment id string, passed to gym.make
                runs=1,
                max_steps=int(1e6),
                max_path_length=1000,
                max_step_before_learning=10000
                ):
    """Train one fresh agent per run and collect its episodic returns.

    Each run seeds the environment with ``run + 100``, builds a new agent
    and replay buffer, and steps the environment ``max_steps`` times. For
    the first ``max_step_before_learning`` steps actions are sampled
    uniformly and no learning happens; afterwards the agent's action plus
    Gaussian noise (std ``0.1 * max_action``) is used and the agent is
    trained once per step. An episode ends when the environment reports
    ``done`` or after ``max_path_length`` steps.

    After each run the agent is saved through ``agent.save`` under
    ``./state_dict/`` (created if missing).

    Args:
        agent_func: Callable taking ``state_dim``, ``action_dim`` and
            ``max_action`` keyword arguments and returning an agent that
            exposes ``select_action(state)``, ``train(replay_buffer)`` and
            ``save(filename)``.
        env_name: Environment id passed to ``gym.make``.
        runs: Number of independent runs.
        max_steps: Environment steps per run.
        max_path_length: Maximum number of steps per episode.
        max_step_before_learning: Number of initial random-action steps.

    Returns:
        ``(agents, returns_timing, returns_value)``. ``agents`` holds one
        agent per run. ``returns_timing[i]`` and ``returns_value[i]`` are
        1-D arrays for run ``i`` with, per finished episode, the step count
        at which it ended and its return. An episode still in progress when
        the run stops is not recorded.
    """
    import os

    returns_timing = []
    returns_value = []
    agents = []
    env = gym.make(env_name)
    try:
        state_dim = env.observation_space.shape[0]
        action_dim = env.action_space.shape[0]
        max_action = float(env.action_space.high[0])

        for run in trange(runs, desc='runs'):
            seed = run + 100
            env.seed(seed)

            agent = agent_func(state_dim=state_dim,
                               action_dim=action_dim,
                               max_action=max_action)
            replay_buffer = ReplayBuffer(state_dim, action_dim)

            # One [steps_at_episode_end, episodic_return] row per episode.
            episode_log = []
            state = env.reset()
            episodic_return = 0.
            path_length = 0

            for total_steps in range(1, int(max_steps) + 1):
                path_length += 1

                # Select action randomly or according to policy
                if total_steps <= max_step_before_learning:
                    action = env.action_space.sample()
                else:
                    action = (
                        agent.select_action(np.array(state))
                        + np.random.normal(0, max_action * 0.1, size=action_dim)
                    ).clip(-max_action, max_action)

                next_state, reward, done, info = env.step(action)
                timed_out = path_length >= max_path_length
                # Reaching the step limit is not a terminal state: store
                # done=0 so the critic still bootstraps from next_state.
                done_bool = float(done) if not timed_out else 0.
                replay_buffer.add(state, action, next_state, reward, done_bool)

                state = next_state
                episodic_return += reward

                # Train agent after collecting sufficient data
                if total_steps > max_step_before_learning:
                    agent.train(replay_buffer)

                if done or timed_out:
                    print(f'Steps : {total_steps}, Episodic_return : {episodic_return}')
                    episode_log.append([total_steps, episodic_return])
                    state = env.reset()
                    episodic_return = 0.
                    path_length = 0

            # reshape keeps the two-column layout when no episode finished.
            log = np.asarray(episode_log, dtype=float).reshape(-1, 2)
            returns_timing.append(log[:, 0])
            returns_value.append(log[:, 1])
            agents.append(agent)

            os.makedirs('./state_dict', exist_ok=True)
            # agent.save appends its own suffixes, so this is a file prefix.
            filename = f'./state_dict/{agent_func.__name__}-{env_name}-{seed}'
            agent.save(filename)
    finally:
        # Release the environment even if a run is interrupted.
        env.close()

    return agents, returns_timing, returns_value

In [None]:
# Run mini_batch_rl with its defaults (a single run) for TD3 on
# HopperMuJoCoEnv-v0 and unpack its three return values.
agents, t, r = mini_batch_rl(agent_func=TD3,
                           env_name='HopperMuJoCoEnv-v0')

runs:   0%|          | 0/1 [00:00<?, ?it/s]

WalkerBase::__init__





TypeError: ignored