In [1]:
import sys
import gym
import numpy as np
import random
import matplotlib.pyplot as plt
import matplotlib.animation as animation
from matplotlib import animation, rc
from IPython.display import Math, HTML
import os
import torch as T
import torch.nn as nn
import torch.nn.functional as F
import torch.optim as optim
from torch.distributions.normal import Normal
import numpy as np
if not os.path.exists('.mujoco_setup_complete'):
  # Get the prereqs
  !apt-get -qq update
  !apt-get -qq install -y libosmesa6-dev libgl1-mesa-glx libglfw3 libgl1-mesa-dev libglew-dev patchelf
  # Get Mujoco
  !mkdir ~/.mujoco
  !wget -q https://mujoco.org/download/mujoco210-linux-x86_64.tar.gz -O mujoco.tar.gz
  !tar -zxf mujoco.tar.gz -C "$HOME/.mujoco"
  !rm mujoco.tar.gz
  # Add it to the actively loaded path and the bashrc path (these only do so much)
  !echo 'export LD_LIBRARY_PATH=$LD_LIBRARY_PATH:$HOME/.mujoco/mujoco210/bin' >> ~/.bashrc 
  !echo 'export LD_PRELOAD=$LD_PRELOAD:/usr/lib/x86_64-linux-gnu/libGLEW.so' >> ~/.bashrc 
  # THE ANNOYING ONE, FORCE IT INTO LDCONFIG SO WE ACTUALLY GET ACCESS TO IT THIS SESSION
  !echo "/root/.mujoco/mujoco210/bin" > /etc/ld.so.conf.d/mujoco_ld_lib_path.conf
  !ldconfig
  # Install Mujoco-py
  !pip3 install -U 'mujoco-py<2.2,>=2.1'
  # run once
  !touch .mujoco_setup_complete

try:
  if _mujoco_run_once:
    pass
except NameError:
  _mujoco_run_once = False
if not _mujoco_run_once:
  # Add it to the actively loaded path and the bashrc path (these only do so much)
  try:
    os.environ['LD_LIBRARY_PATH']=os.environ['LD_LIBRARY_PATH'] + ':/root/.mujoco/mujoco210/bin'
  except KeyError:
    os.environ['LD_LIBRARY_PATH']='/root/.mujoco/mujoco210/bin'
  try:
    os.environ['LD_PRELOAD']=os.environ['LD_PRELOAD'] + ':/usr/lib/x86_64-linux-gnu/libGLEW.so'
  except KeyError:
    os.environ['LD_PRELOAD']='/usr/lib/x86_64-linux-gnu/libGLEW.so'
  # presetup so we don't see output on first env initialization
  import mujoco_py
  _mujoco_run_once = True
#source of this code block : https://gist.github.com/BuildingAtom/3119ac9c595324c8001a7454f23bf8c8

Extracting templates from packages: 100%
(Reading database ... 106398 files and directories currently installed.)
Preparing to unpack .../00-libx11-6_2%3a1.6.9-2ubuntu1.5_amd64.deb ...
Unpacking libx11-6:amd64 (2:1.6.9-2ubuntu1.5) over (2:1.6.9-2ubuntu1.2) ...
Selecting previously unselected package libwayland-server0:amd64.
Preparing to unpack .../01-libwayland-server0_1.18.0-1ubuntu0.1_amd64.deb ...
Unpacking libwayland-server0:amd64 (1.18.0-1ubuntu0.1) ...
Selecting previously unselected package libgbm1:amd64.
Preparing to unpack .../02-libgbm1_21.2.6-0ubuntu0.1~20.04.2_amd64.deb ...
Unpacking libgbm1:amd64 (21.2.6-0ubuntu0.1~20.04.2) ...
Selecting previously unselected package libegl-mesa0:amd64.
Preparing to unpack .../03-libegl-mesa0_21.2.6-0ubuntu0.1~20.04.2_amd64.deb ...
Unpacking libegl-mesa0:amd64 (21.2.6-0ubuntu0.1~20.04.2) ...
Selecting previously unselected package libegl1:amd64.
Preparing to unpack .../04-libegl1_1.3.2-1~ubuntu0.20.04.2_amd64.deb ...
Unpack

In [2]:
import torch
import pdb
import copy
!pip3 install box2d-py
!pip3 install gym[Box_2D]

Collecting box2d-py
  Downloading box2d_py-2.3.8-cp37-cp37m-manylinux1_x86_64.whl (448 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m448.6/448.6 kB[0m [31m9.0 MB/s[0m eta [36m0:00:00[0m
[?25hInstalling collected packages: box2d-py
Successfully installed box2d-py-2.3.8
[0m

In [3]:
def soft_update(target, source, tau):
	for target_param, param in zip(target.parameters(), source.parameters()):
		target_param.data.copy_(target_param.data * (1.0 - tau) + param.data * tau)

def hard_update(target, source):
	for target_param, param in zip(target.parameters(), source.parameters()):
		target_param.data.copy_(param.data)

In [4]:
class ReplayBuffer():
    def __init__(self,input_shape, n_actions,max_size=int(1e6)):
        self.memory_size = max_size
        self.memory_counter = 0
        self.state = np.zeros((self.memory_size, input_shape))
        self.state_ = np.zeros((self.memory_size, input_shape))
        self.action = np.zeros((self.memory_size, n_actions))
        self.reward = np.zeros(self.memory_size)
        self.done = np.zeros(self.memory_size)

    def add(self, state, action, reward, state_, done):
        index = self.memory_counter % self.memory_size
        self.state[index] = state
        self.state_[index] = state_
        self.action[index] = action
        self.reward[index] = reward
        self.done[index] = done
        self.memory_counter += 1

    def sample(self, batch_size):
        max_memory = min(self.memory_counter, self.memory_size)

        batch = np.random.choice(max_memory, batch_size)

        state = self.state[batch]
        action= self.action[batch]
        reward = self.reward[batch]
        state_ = self.state_[batch]
        done = self.done[batch]
        return state, action, reward, state_, done

In [5]:
import copy
import numpy as np
import torch
import torch.nn as nn
import torch.nn.functional as F


device = torch.device("cuda" if torch.cuda.is_available() else "cpu")

# Implementation of Twin Delayed Deep Deterministic Policy Gradients (TD3)
# Paper: https://arxiv.org/abs/1802.09477


class Actor(nn.Module):
	def __init__(self, state_dim, action_dim, max_action):
		super(Actor, self).__init__()

		self.l1 = nn.Linear(state_dim, 256)
		self.l2 = nn.Linear(256, 256)
		self.l3 = nn.Linear(256, action_dim)

		self.max_action = max_action


	def forward(self, state):
		a = F.relu(self.l1(state))
		a = F.relu(self.l2(a))
		return self.max_action * torch.tanh(self.l3(a))


class Critic(nn.Module):
	def __init__(self, state_dim, action_dim):
		super(Critic, self).__init__()

		# Q1 architecture
		self.l1 = nn.Linear(state_dim + action_dim, 256)
		self.l2 = nn.Linear(256, 256)
		self.l3 = nn.Linear(256, 1)

		# Q2 architecture
		self.l4 = nn.Linear(state_dim + action_dim, 256)
		self.l5 = nn.Linear(256, 256)
		self.l6 = nn.Linear(256, 1)


	def forward(self, state, action):
		sa = torch.cat([state, action], 1)

		q1 = F.relu(self.l1(sa))
		q1 = F.relu(self.l2(q1))
		q1 = self.l3(q1)

		q2 = F.relu(self.l4(sa))
		q2 = F.relu(self.l5(q2))
		q2 = self.l6(q2)
		return q1, q2


	def Q1(self, state, action):
		sa = torch.cat([state, action], 1)

		q1 = F.relu(self.l1(sa))
		q1 = F.relu(self.l2(q1))
		q1 = self.l3(q1)
		return q1




class TD3(object):
	def __init__(
		self,
		state_dim,
		action_dim,
		max_action,
		discount=0.99,
		tau=0.005,
		policy_noise=0.2,
		noise_clip=0.5,
		policy_freq=2
	):

		self.actor = Actor(state_dim, action_dim, max_action).to(device)
		self.actor_target = copy.deepcopy(self.actor)
		self.actor_optimizer = torch.optim.Adam(self.actor.parameters(), lr=3e-4)

		self.critic = Critic(state_dim, action_dim).to(device)
		self.critic_target = copy.deepcopy(self.critic)
		self.critic_optimizer = torch.optim.Adam(self.critic.parameters(), lr=3e-4)

		self.max_action = max_action
		self.discount = discount
		self.tau = tau
		self.policy_noise = policy_noise
		self.noise_clip = noise_clip
		self.policy_freq = policy_freq

		self.total_it = 0


	def select_action(self, state):
		state = torch.FloatTensor(state.reshape(1, -1)).to(device)
		return self.actor(state).cpu().data.numpy().flatten()


	def train(self, replay_buffer, batch_size=100):
		if replay_buffer.memory_size<batch_size:
			return
		self.total_it += 1
		# Sample replay buffer
		state, action, reward,next_state, not_done = replay_buffer.sample(batch_size)
		reward = T.tensor(reward, dtype=T.float)
		not_done = T.tensor(not_done,dtype=T.float)
		next_state = T.tensor(next_state, dtype=T.float)
		state = T.tensor(state, dtype=T.float)
		action = T.tensor(action, dtype=T.float)
		with torch.no_grad():
			# Select action according to policy and add clipped noise
			noise = (
				torch.randn_like(action) * self.policy_noise
			).clamp(-self.noise_clip, self.noise_clip)

			next_action = (
				self.actor_target(next_state) + noise
			).clamp(-self.max_action, self.max_action)

			# Compute the target Q value
			target_Q1, target_Q2 = self.critic_target(next_state, next_action)
			target_Q = torch.squeeze(torch.min(target_Q1, target_Q2))
			target_Q = reward + not_done * self.discount * target_Q
		# Get current Q estimates
		current_Q1, current_Q2 = self.critic(state, action)

		# Compute critic loss
		critic_loss = F.mse_loss(torch.squeeze(current_Q1), target_Q) + F.mse_loss(torch.squeeze(current_Q2), target_Q)

		# Optimize the critic
		self.critic_optimizer.zero_grad()
		critic_loss.backward()
		self.critic_optimizer.step()

		# Delayed policy updates
		if self.total_it % self.policy_freq == 0:

			# Compute actor losse
			actor_loss = -self.critic.Q1(state, self.actor(state)).mean()

			# Optimize the actor
			self.actor_optimizer.zero_grad()
			actor_loss.backward()
			self.actor_optimizer.step()

			# Update the frozen target models
			for param, target_param in zip(self.critic.parameters(), self.critic_target.parameters()):
				target_param.data.copy_(self.tau * param.data + (1 - self.tau) * target_param.data)

			for param, target_param in zip(self.actor.parameters(), self.actor_target.parameters()):
				target_param.data.copy_(self.tau * param.data + (1 - self.tau) * target_param.data)


	def save(self, filename):
		torch.save(self.critic.state_dict(), filename + "_critic")
		torch.save(self.critic_optimizer.state_dict(), filename + "_critic_optimizer")

		torch.save(self.actor.state_dict(), filename + "_actor")
		torch.save(self.actor_optimizer.state_dict(), filename + "_actor_optimizer")


	def load(self, filename):
		self.critic.load_state_dict(torch.load(filename + "_critic"))
		self.critic_optimizer.load_state_dict(torch.load(filename + "_critic_optimizer"))
		self.critic_target = copy.deepcopy(self.critic)

		self.actor.load_state_dict(torch.load(filename + "_actor"))
		self.actor_optimizer.load_state_dict(torch.load(filename + "_actor_optimizer"))
		self.actor_target = copy.deepcopy(self.actor)

In [6]:
import numpy as np
import torch
import gym
import argparse
import os
import copy
import pandas as pd
import json,os

alg="TD3"
seed=3
envName='Ant-v3'
StepLimit=1000000
start_timesteps=10000
evaluationStep=5000
batch_size=100

def eval_policy(policy, env_name,eval_episodes=10):
	eval_env = gym.make(env_name)
	avg_reward = 0.
	for _ in range(eval_episodes):
		state, _ = eval_env.reset()
		done= False
		truncuated= False
		while (not done) and (not truncuated):
			action = policy.select_action(np.array(state))
			state, reward, done,truncuated, _ = eval_env.step(action)
			avg_reward += reward
	avg_reward /= eval_episodes
	return avg_reward


env = gym.make(envName)
env.action_space.seed(seed)
torch.manual_seed(seed)
np.random.seed(seed)

state_dim = env.observation_space.shape[0]
state_max = env.observation_space.shape
action_dim = env.action_space.shape[0]
max_action = float(env.action_space.high[0])

kwargs = {"state_dim": state_dim,"action_dim": action_dim,"max_action": max_action,"discount": 0.99,"tau": 0.005,}
kwargs["policy_noise"] = 0.2 * max_action
kwargs["noise_clip"] = 0.5 * max_action
kwargs["policy_freq"] = 2
policy = TD3(**kwargs)

replay_buffer = ReplayBuffer(state_dim, action_dim)
evaluations = [eval_policy(policy, envName)]
time_step=0
while (time_step<StepLimit):
    state, _ = env.reset(seed=seed)
    done = False
    truncuated = False
    while (not done) and (not truncuated):
        if time_step < start_timesteps:
            action = env.action_space.sample()
        else:
            action = (policy.select_action(np.array(state))
                + np.random.normal(0, max_action * 0.1, size=action_dim)).clip(-max_action, max_action)
        next_state, reward, done,truncuated, _ = env.step(action)
        replay_buffer.add(state, action, reward,next_state, int(not(done)))
        if time_step > start_timesteps:
            policy.train(replay_buffer)
        state = next_state
        time_step+=1
        if (time_step % evaluationStep) == 0:
            avg_reward=eval_policy(policy,envName)
            evaluations.append(avg_reward)
            print(f"Evaluation over {10} episodes: {avg_reward:.3f}  time step{time_step}")
variant = dict(algorithm=alg,env=envName,)
if not os.path.exists(f"./data/{envName}/{alg}/seed{seed}"):
    os.makedirs(f'./data/{envName}/{alg}/seed{seed}')
with open(f'./data/{envName}/{alg}/seed{seed}/variant.json', 'w') as outfile:
    json.dump(variant,outfile)
data = np.array(evaluations)
df = pd.DataFrame(data=data,columns=["Average Return"]).reset_index()
df['Timesteps'] = df['index'] * evaluationStep
df['env'] = envName
df['algorithm_name'] = alg
df.to_csv(f'./data/{envName}/{alg}/seed{seed}/progress.csv', index = False)        

  f"The environment {id} is out of date. You should consider "
  "This version of the mujoco environments depends "


Evaluation over 10 episodes: 997.094  time step5000
Evaluation over 10 episodes: 995.649  time step10000
Evaluation over 10 episodes: 901.256  time step15000
Evaluation over 10 episodes: 765.532  time step20000
Evaluation over 10 episodes: 291.046  time step25000
Evaluation over 10 episodes: 647.599  time step30000
Evaluation over 10 episodes: 768.775  time step35000
Evaluation over 10 episodes: 833.591  time step40000
Evaluation over 10 episodes: 858.405  time step45000
Evaluation over 10 episodes: 875.778  time step50000
Evaluation over 10 episodes: 846.627  time step55000
Evaluation over 10 episodes: 867.015  time step60000
Evaluation over 10 episodes: 890.383  time step65000
Evaluation over 10 episodes: 900.282  time step70000
Evaluation over 10 episodes: 893.936  time step75000
Evaluation over 10 episodes: 896.025  time step80000
Evaluation over 10 episodes: 879.941  time step85000
Evaluation over 10 episodes: 781.528  time step90000
Evaluation over 10 episodes: 489.535  time step