In [1]:
import math
import os
import random
import sys

import gym
import matplotlib.animation as animation
import matplotlib.pyplot as plt
import numpy as np
import torch
import torch as T
import torch.nn as nn
import torch.nn.functional as F
import torch.optim as optim
from IPython.display import Math, HTML
from matplotlib import animation, rc
from torch.distributions.normal import Normal
# One-time system setup for MuJoCo 2.1.0 + mujoco-py. The marker file
# `.mujoco_setup_complete` (created at the end of this branch) makes the
# download/install steps run only when the marker is absent from the
# current working directory.
if not os.path.exists('.mujoco_setup_complete'):
  # Get the prereqs
  !apt-get -qq update
  !apt-get -qq install -y libosmesa6-dev libgl1-mesa-glx libglfw3 libgl1-mesa-dev libglew-dev patchelf
  # Get Mujoco
  !mkdir ~/.mujoco
  !wget -q https://mujoco.org/download/mujoco210-linux-x86_64.tar.gz -O mujoco.tar.gz
  !tar -zxf mujoco.tar.gz -C "$HOME/.mujoco"
  !rm mujoco.tar.gz
  # Add it to the actively loaded path and the bashrc path (these only do so much)
  !echo 'export LD_LIBRARY_PATH=$LD_LIBRARY_PATH:$HOME/.mujoco/mujoco210/bin' >> ~/.bashrc 
  !echo 'export LD_PRELOAD=$LD_PRELOAD:/usr/lib/x86_64-linux-gnu/libGLEW.so' >> ~/.bashrc 
  # THE ANNOYING ONE, FORCE IT INTO LDCONFIG SO WE ACTUALLY GET ACCESS TO IT THIS SESSION
  # NOTE(review): this path hard-codes /root while the lines above use $HOME;
  # the two only agree when the notebook runs as root -- confirm for your host.
  !echo "/root/.mujoco/mujoco210/bin" > /etc/ld.so.conf.d/mujoco_ld_lib_path.conf
  !ldconfig
  # Install Mujoco-py
  !pip3 install -U 'mujoco-py<2.2,>=2.1'
  # run once
  !touch .mujoco_setup_complete

# Per-kernel guard: `_mujoco_run_once` does not exist in a fresh kernel, so the
# NameError branch initialises it to False and the block below runs exactly
# once per kernel session, even if this cell is re-executed.
try:
  if _mujoco_run_once:
    pass
except NameError:
  _mujoco_run_once = False
if not _mujoco_run_once:
  # Add it to the actively loaded path and the bashrc path (these only do so much)
  # Append to the existing variable when it is set; otherwise create it.
  try:
    os.environ['LD_LIBRARY_PATH']=os.environ['LD_LIBRARY_PATH'] + ':/root/.mujoco/mujoco210/bin'
  except KeyError:
    os.environ['LD_LIBRARY_PATH']='/root/.mujoco/mujoco210/bin'
  try:
    os.environ['LD_PRELOAD']=os.environ['LD_PRELOAD'] + ':/usr/lib/x86_64-linux-gnu/libGLEW.so'
  except KeyError:
    os.environ['LD_PRELOAD']='/usr/lib/x86_64-linux-gnu/libGLEW.so'
  # presetup so we don't see output on first env initialization
  import mujoco_py
  _mujoco_run_once = True
#source of this code block : https://gist.github.com/BuildingAtom/3119ac9c595324c8001a7454f23bf8c8
# Box2D packages are installed on every run of this cell (they are outside
# both guards above).
!pip3 install box2d-py
!pip3 install gym[Box_2D]

Extracting templates from packages: 100%
(Reading database ... 106398 files and directories currently installed.)
Preparing to unpack .../00-libx11-6_2%3a1.6.9-2ubuntu1.5_amd64.deb ...
Unpacking libx11-6:amd64 (2:1.6.9-2ubuntu1.5) over (2:1.6.9-2ubuntu1.2) ...
Selecting previously unselected package libwayland-server0:amd64.
Preparing to unpack .../01-libwayland-server0_1.18.0-1ubuntu0.1_amd64.deb ...
Unpacking libwayland-server0:amd64 (1.18.0-1ubuntu0.1) ...
Selecting previously unselected package libgbm1:amd64.
Preparing to unpack .../02-libgbm1_21.2.6-0ubuntu0.1~20.04.2_amd64.deb ...
Unpacking libgbm1:amd64 (21.2.6-0ubuntu0.1~20.04.2) ...
Selecting previously unselected package libegl-mesa0:amd64.
Preparing to unpack .../03-libegl-mesa0_21.2.6-0ubuntu0.1~20.04.2_amd64.deb ...
Unpacking libegl-mesa0:amd64 (21.2.6-0ubuntu0.1~20.04.2) ...
Selecting previously unselected package libegl1:amd64.
Preparing to unpack .../04-libegl1_1.3.2-1~ubuntu0.20.04.2_amd64.deb ...
Unpack

In [2]:
import os
import torch as T
import torch.nn as nn
import torch.nn.functional as F
import torch.optim as optim
from torch.distributions.normal import Normal

In [3]:
class ReplayMemory():
    """Fixed-capacity ring buffer of transitions backed by preallocated NumPy arrays.

    Once ``max_size`` transitions have been pushed, each new transition
    overwrites the oldest slot.

    Args:
        max_size: capacity of the buffer (number of transitions).
        input_shape: shape tuple of a single observation.
        n_actions: length of a single action vector.
    """

    def __init__(self, max_size, input_shape, n_actions):
        self.mem_size = max_size
        # Total number of pushes ever made; the write slot is mem_cntr % mem_size.
        self.mem_cntr = 0
        self.state_memory = np.zeros((self.mem_size, *input_shape))
        self.new_state_memory = np.zeros((self.mem_size, *input_shape))
        self.action_memory = np.zeros((self.mem_size, n_actions))
        self.reward_memory = np.zeros(self.mem_size)
        self.terminal_memory = np.zeros(self.mem_size, dtype=bool)

    def push(self, state, action, reward, state_, done):
        """Store one transition, overwriting the oldest slot when the buffer is full.

        ``done`` is stored verbatim in ``terminal_memory``; the caller decides
        whether it carries a terminal flag or a continuation mask.
        """
        index = self.mem_cntr % self.mem_size
        self.state_memory[index] = state
        self.action_memory[index] = action
        self.reward_memory[index] = reward
        self.new_state_memory[index] = state_
        self.terminal_memory[index] = done

        self.mem_cntr += 1

    def sample(self, batch_size):
        """Draw ``batch_size`` stored transitions uniformly at random, with replacement.

        Returns:
            Tuple ``(states, actions, rewards, next_states, dones)`` of NumPy
            arrays whose first dimension is ``batch_size``.
        """
        # Only the filled part of the arrays is eligible for sampling.
        max_mem = min(self.mem_cntr, self.mem_size)

        batch = np.random.choice(max_mem, batch_size)

        states = self.state_memory[batch]
        actions = self.action_memory[batch]
        rewards = self.reward_memory[batch]
        states_ = self.new_state_memory[batch]
        dones = self.terminal_memory[batch]

        return states, actions, rewards, states_, dones

    def __len__(self):
        """Number of transitions currently stored (never more than the capacity)."""
        # mem_cntr keeps growing after the buffer wraps, so cap it at capacity
        # to report what is actually available to sample().
        return min(self.mem_cntr, self.mem_size)

def create_log_gaussian(mean, log_std, t):
    """Log-density of ``t`` under a diagonal Gaussian, summed over the last dim.

    Args:
        mean: tensor of Gaussian means.
        log_std: tensor of log standard deviations, broadcastable with ``mean``.
        t: tensor of points at which to evaluate the density.

    Returns:
        Tensor with the last dimension reduced:
        ``sum_i [-0.5 * ((t_i - mean_i) / std_i)^2 - log_std_i] - 0.5 * d * log(2*pi)``
        where ``d = mean.shape[-1]``.
    """
    # The 0.5 factor belongs outside the square; squaring it (as the previous
    # version did) yields -0.25 * z^2, which is not a Gaussian log-density.
    quadratic = -0.5 * ((t - mean) / log_std.exp()).pow(2)
    # Normalisation constant of a d-dimensional diagonal Gaussian.
    log_norm = log_std.sum(dim=-1) + 0.5 * mean.shape[-1] * math.log(2 * math.pi)
    return quadratic.sum(dim=-1) - log_norm

def logsumexp(inputs, dim=None, keepdim=False):
    """Numerically stabilised ``log(sum(exp(inputs)))`` along ``dim``.

    When ``dim`` is None the input is flattened with ``view(-1)`` and reduced
    over all of its elements. The reduced dimension is kept only when
    ``keepdim`` is true.
    """
    if dim is None:
        inputs, dim = inputs.view(-1), 0
    # Subtract the per-slice maximum before exponentiating to avoid overflow.
    peak = torch.max(inputs, dim=dim, keepdim=True)[0]
    shifted_sum = (inputs - peak).exp().sum(dim=dim, keepdim=True)
    result = peak + shifted_sum.log()
    return result if keepdim else result.squeeze(dim)

def soft_update(target, source, tau):
    """Polyak-average ``source`` parameters into ``target`` in place.

    Each target parameter becomes ``(1 - tau) * target + tau * source``.
    Parameters are paired in the order returned by ``parameters()``.
    """
    keep = 1.0 - tau
    for dst, src in zip(target.parameters(), source.parameters()):
        blended = dst.data * keep + src.data * tau
        dst.data.copy_(blended)

def hard_update(target, source):
    """Copy every parameter of ``source`` into ``target`` in place.

    Parameters are paired in the order returned by ``parameters()``.
    """
    paired = zip(target.parameters(), source.parameters())
    for dst, src in paired:
        dst.data.copy_(src.data)

In [4]:
# Clamp range applied to the policy's predicted log standard deviation
# (see GaussianPolicy.forward).
LOG_SIG_MAX = 2
LOG_SIG_MIN = -20
# Small constant added inside the log of the tanh correction term in
# GaussianPolicy.sample so the argument never reaches zero.
epsilon = 1e-6
def weights_init_(m):
    """Initialise ``nn.Linear`` modules in place; leave every other module untouched.

    Weights get Xavier-uniform initialisation (gain 1) and biases are set to 0.
    Intended to be passed to ``nn.Module.apply``.
    """
    if not isinstance(m, nn.Linear):
        return
    nn.init.xavier_uniform_(m.weight, gain=1)
    nn.init.constant_(m.bias, 0)

class ValueNetwork(nn.Module):
    """Three-layer MLP mapping a state to a single scalar value.

    Args:
        num_inputs: size of the state vector.
        hidden_dim: width of both hidden layers.
    """

    def __init__(self, num_inputs, hidden_dim):
        super().__init__()

        self.linear1 = nn.Linear(num_inputs, hidden_dim)
        self.linear2 = nn.Linear(hidden_dim, hidden_dim)
        self.linear3 = nn.Linear(hidden_dim, 1)

        # Re-initialise every Linear layer created above.
        self.apply(weights_init_)

    def forward(self, state):
        """Return the value estimate; the last dimension of the output has size 1."""
        hidden = F.relu(self.linear1(state))
        hidden = F.relu(self.linear2(hidden))
        return self.linear3(hidden)


class QNetwork(nn.Module):
    """Twin Q-functions: two independent three-layer MLPs over (state, action).

    Args:
        num_inputs: size of the state vector.
        num_actions: size of the action vector.
        hidden_dim: width of the hidden layers in both heads.
    """

    def __init__(self, num_inputs, num_actions, hidden_dim):
        super().__init__()
        joint_dim = num_inputs + num_actions

        # First Q head (linear1..linear3)
        self.linear1 = nn.Linear(joint_dim, hidden_dim)
        self.linear2 = nn.Linear(hidden_dim, hidden_dim)
        self.linear3 = nn.Linear(hidden_dim, 1)

        # Second Q head (linear4..linear6)
        self.linear4 = nn.Linear(joint_dim, hidden_dim)
        self.linear5 = nn.Linear(hidden_dim, hidden_dim)
        self.linear6 = nn.Linear(hidden_dim, 1)

        self.apply(weights_init_)

    def forward(self, state, action):
        """Return ``(q1, q2)``, one estimate per head, for the batch of pairs.

        ``state`` and ``action`` are concatenated along dimension 1.
        """
        joint = torch.cat([state, action], 1)

        q1 = F.relu(self.linear1(joint))
        q1 = self.linear3(F.relu(self.linear2(q1)))

        q2 = F.relu(self.linear4(joint))
        q2 = self.linear6(F.relu(self.linear5(q2)))

        return q1, q2


class GaussianPolicy(nn.Module):
    """Tanh-squashed diagonal Gaussian policy.

    Args:
        num_inputs: size of the state vector.
        num_actions: size of the action vector.
        hidden_dim: width of the two hidden layers.
        action_space: optional object exposing ``high`` and ``low`` arrays; when
            given, squashed actions are rescaled to that range, otherwise the
            scale is 1 and the bias 0.
    """

    def __init__(self, num_inputs, num_actions, hidden_dim, action_space=None):
        super().__init__()

        self.linear1 = nn.Linear(num_inputs, hidden_dim)
        self.linear2 = nn.Linear(hidden_dim, hidden_dim)

        self.mean_linear = nn.Linear(hidden_dim, num_actions)
        self.log_std_linear = nn.Linear(hidden_dim, num_actions)

        self.apply(weights_init_)

        # Affine map from tanh output in (-1, 1) to the action range.
        if action_space is not None:
            half_range = (action_space.high - action_space.low) / 2.
            midpoint = (action_space.high + action_space.low) / 2.
            self.action_scale = torch.FloatTensor(half_range)
            self.action_bias = torch.FloatTensor(midpoint)
        else:
            self.action_scale = torch.tensor(1.)
            self.action_bias = torch.tensor(0.)

    def forward(self, state):
        """Return ``(mean, log_std)`` with ``log_std`` clamped to [LOG_SIG_MIN, LOG_SIG_MAX]."""
        hidden = F.relu(self.linear1(state))
        hidden = F.relu(self.linear2(hidden))
        mean = self.mean_linear(hidden)
        log_std = torch.clamp(self.log_std_linear(hidden), min=LOG_SIG_MIN, max=LOG_SIG_MAX)
        return mean, log_std

    def sample(self, state):
        """Draw a reparameterised action for each state in the batch.

        Returns:
            ``(action, log_prob, mean_action)`` where ``log_prob`` is summed over
            dimension 1 (kept as size 1) and ``mean_action`` is the squashed,
            rescaled distribution mean.
        """
        mu, log_sigma = self.forward(state)
        gaussian = Normal(mu, log_sigma.exp())
        # rsample keeps the draw differentiable (mean + std * N(0, 1)).
        pre_squash = gaussian.rsample()
        squashed = torch.tanh(pre_squash)
        action = squashed * self.action_scale + self.action_bias
        # Change-of-variables correction for the tanh squashing and rescaling.
        correction = torch.log(self.action_scale * (1 - squashed.pow(2)) + epsilon)
        log_prob = (gaussian.log_prob(pre_squash) - correction).sum(1, keepdim=True)
        mean_action = torch.tanh(mu) * self.action_scale + self.action_bias
        return action, log_prob, mean_action

    def to(self, device):
        """Move the module and the plain-tensor scale/bias attributes to ``device``."""
        # action_scale / action_bias are ordinary attributes, not parameters or
        # buffers, so nn.Module.to would not move them on its own.
        self.action_scale = self.action_scale.to(device)
        self.action_bias = self.action_bias.to(device)
        return super().to(device)

In [5]:


class SAC(object):
    """Soft Actor-Critic agent: twin Q critic, target critic, Gaussian policy
    and an automatically tuned entropy temperature ``alpha``.

    Args:
        num_inputs: size of the observation vector.
        action_space: object exposing ``shape``, ``high`` and ``low``; used to
            size the networks, rescale actions and set the target entropy.
    """

    def __init__(self, num_inputs, action_space):

        self.gamma = 0.99   # discount factor used in the Bellman target
        self.tau = 0.005    # interpolation factor passed to soft_update
        self.alpha = 0.2    # initial temperature; replaced after the first update

        self.policy_type = "Gaussian"
        self.target_update_interval = 1

        self.device = torch.device('cuda:0' if torch.cuda.is_available() else 'cpu')

        # Target entropy is minus the number of action dimensions.
        self.target_entropy = -torch.prod(torch.Tensor(action_space.shape)).item()
        # log_alpha must live on the same device as the policy outputs it is
        # multiplied with in update_parameters; a CPU tensor of shape (1,)
        # cannot be combined with CUDA tensors.
        self.log_alpha = torch.zeros(1, requires_grad=True, device=self.device)
        self.alpha_optim = optim.Adam([self.log_alpha], lr=3e-4)

        self.critic = QNetwork(num_inputs, action_space.shape[0], 256).to(device=self.device)
        self.critic_optim = optim.Adam(self.critic.parameters(), lr=3e-4)
        self.critic_target = QNetwork(num_inputs, action_space.shape[0], 256).to(self.device)

        # Start the target critic as an exact copy of the critic.
        hard_update(self.critic_target, self.critic)

        self.policy = GaussianPolicy(num_inputs, action_space.shape[0], 256, action_space).to(self.device)
        self.policy_optim = optim.Adam(self.policy.parameters(), lr=3e-4)

    def select_action(self, state, evaluate=False):
        """Return one action as a NumPy array for a single (unbatched) state.

        With ``evaluate`` False the action is sampled from the policy; otherwise
        the policy's squashed mean action is returned.
        """
        state = torch.FloatTensor(state).to(self.device).unsqueeze(0)
        # No gradients are needed for acting, so skip building the graph.
        with torch.no_grad():
            if evaluate is False:
                action, _, _ = self.policy.sample(state)
            else:
                _, _, action = self.policy.sample(state)
        return action.detach().cpu().numpy()[0]

    def update_parameters(self, memory, batch_size, updates):
        """Run one gradient step on the critic, the policy and the temperature.

        Args:
            memory: replay buffer whose ``sample(batch_size=...)`` returns
                ``(states, actions, rewards, next_states, masks)``; the mask is
                multiplied into the bootstrap term of the target.
            batch_size: number of transitions to sample.
            updates: update counter, used to decide when to soft-update the
                target critic.

        Returns:
            ``(qf1_loss, qf2_loss, policy_loss, alpha_loss)`` as Python floats.
        """
        # Sample a batch from memory
        state_batch, action_batch, reward_batch, next_state_batch, mask_batch = memory.sample(batch_size=batch_size)

        state_batch = torch.FloatTensor(state_batch).to(self.device)
        next_state_batch = torch.FloatTensor(next_state_batch).to(self.device)
        action_batch = torch.FloatTensor(action_batch).to(self.device)
        reward_batch = torch.FloatTensor(reward_batch).to(self.device).unsqueeze(1)
        mask_batch = torch.FloatTensor(mask_batch).to(self.device).unsqueeze(1)

        # Soft Bellman target: r + mask * gamma * (min target-Q - alpha * log pi).
        with torch.no_grad():
            next_state_action, next_state_log_pi, _ = self.policy.sample(next_state_batch)
            qf1_next_target, qf2_next_target = self.critic_target(next_state_batch, next_state_action)
            min_qf_next_target = torch.min(qf1_next_target, qf2_next_target) - self.alpha * next_state_log_pi
            next_q_value = reward_batch + mask_batch * self.gamma * (min_qf_next_target)
        qf1, qf2 = self.critic(state_batch, action_batch)  # Two Q-functions to mitigate positive bias in the policy improvement step
        qf1_loss = F.mse_loss(qf1, next_q_value)
        qf2_loss = F.mse_loss(qf2, next_q_value)
        qf_loss = qf1_loss + qf2_loss

        self.critic_optim.zero_grad()
        qf_loss.backward()
        self.critic_optim.step()

        # Policy step: minimise alpha * log pi - min Q over freshly sampled actions.
        pi, log_pi, _ = self.policy.sample(state_batch)

        qf1_pi, qf2_pi = self.critic(state_batch, pi)
        min_qf_pi = torch.min(qf1_pi, qf2_pi)

        policy_loss = ((self.alpha * log_pi) - min_qf_pi).mean()

        self.policy_optim.zero_grad()
        policy_loss.backward()
        self.policy_optim.step()

        # Temperature step: only log_alpha receives gradients here.
        alpha_loss = -(self.log_alpha * (log_pi + self.target_entropy).detach()).mean()
        self.alpha_optim.zero_grad()
        alpha_loss.backward()
        self.alpha_optim.step()
        # Detach so the next policy/critic step treats alpha as a constant
        # instead of back-propagating into log_alpha through a stale graph.
        self.alpha = self.log_alpha.exp().detach()

        if updates % self.target_update_interval == 0:
            soft_update(self.critic_target, self.critic, self.tau)

        return qf1_loss.item(), qf2_loss.item(), policy_loss.item(), alpha_loss.item()



In [6]:
pip install gym[box2d]

Collecting pygame==2.1.0
  Downloading pygame-2.1.0-cp37-cp37m-manylinux_2_17_x86_64.manylinux2014_x86_64.whl (18.3 MB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m18.3/18.3 MB[0m [31m43.3 MB/s[0m eta [36m0:00:00[0m
[?25hCollecting swig==4.*
  Downloading swig-4.1.1-py2.py3-none-manylinux_2_5_x86_64.manylinux1_x86_64.whl (1.8 MB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m1.8/1.8 MB[0m [31m49.2 MB/s[0m eta [36m0:00:00[0m
[?25hCollecting box2d-py==2.3.5
  Downloading box2d_py-2.3.5-cp37-cp37m-manylinux1_x86_64.whl (2.1 MB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m2.1/2.1 MB[0m [31m55.8 MB/s[0m eta [36m0:00:00[0m
Installing collected packages: swig, box2d-py, pygame
  Attempting uninstall: box2d-py
    Found existing installation: box2d-py 2.3.8
    Uninstalling box2d-py-2.3.8:
      Successfully uninstalled box2d-py-2.3.8
Successfully installed box2d-py-2.3.5 pygame-2.1.0 swig-4.1.1
[0mNote:

In [7]:
import torch
import copy
import pandas as pd
import json,os

# Run configuration for the training cell below.
alg="SAC"               # algorithm label written to the output path and CSV
seed=3                  # seed for the env action space, torch and numpy
envName='Ant-v3'        # Gym environment id
num_steps=1000000       # training stops after the episode in which this many env steps is exceeded
evaluationStep=5000     # evaluate the policy every this many env steps
printStep=100           # print an episode summary every this many episodes
start_timesteps=10000   # env steps taken with random actions before using the policy

def eval_policy(policy, env_name, eval_episodes):
    """Average undiscounted return of ``policy`` over ``eval_episodes`` episodes.

    A fresh environment is created for the evaluation and closed afterwards.

    Args:
        policy: object exposing ``select_action(state, evaluate=True)``.
        env_name: Gym environment id to evaluate on.
        eval_episodes: number of episodes to run.

    Returns:
        Sum of all rewards collected divided by ``eval_episodes``.
    """
    # Build the env from the argument; the previous version ignored
    # ``env_name`` and always read the module-level ``envName``.
    eval_env = gym.make(env_name)
    total_reward = 0.
    try:
        for _ in range(eval_episodes):
            state, _ = eval_env.reset()
            done = truncated = False
            while not (done or truncated):
                action = policy.select_action(np.array(state), evaluate=True)
                state, reward, done, truncated, _ = eval_env.step(action)
                total_reward += reward
    finally:
        # Release the environment even if an episode raises.
        eval_env.close()
    return total_reward / eval_episodes

# --- Environment, seeding, agent and replay buffer ---------------------------
env = gym.make(envName)
env.action_space.seed(seed)
T.manual_seed(seed)
np.random.seed(seed)


agent = SAC(env.observation_space.shape[0], env.action_space,)

memory = ReplayMemory(1000000,env.observation_space.shape,env.action_space.shape[0])

# Score of the untrained policy; becomes row 0 of progress.csv.
evaluations = [eval_policy(agent, envName,10)]

total_numsteps = 0
updates = 0
agent.update_sys = 0
base_weight = 0.6
ep_reward_list = []

variant = dict(algorithm=alg,env=envName)
stepss=0

# --- Training loop: one iteration per episode --------------------------------
for i in range(1,500000000):
    episode_reward = 0
    episode_steps = 0
    done = False
    # Seed the environment on the first reset only. Passing the seed on every
    # reset re-initialises the env RNG each time, so every episode would start
    # from the same initial state.
    if i == 1:
        state, _ = env.reset(seed=seed)
    else:
        state, _ = env.reset()
    truncated=False
    while (not done) and (not truncated):
        # Warm-up: act randomly until start_timesteps env steps were taken.
        if stepss < start_timesteps:
            action = env.action_space.sample()
        else:
            action = agent.select_action(state)
        next_state, reward, done,truncated,_ = env.step(action) # Step
        episode_steps += 1
        total_numsteps += 1
        episode_reward += reward
        stepss+=1
        # Store the continuation mask (True while the episode has not
        # terminated); truncation does not zero the bootstrap term.
        memory.push(state, action, reward, next_state, not(done))
        state = next_state
        if len(memory) > 10000:
            agent.update_parameters(memory, 100, updates)
            updates += 1
        if(total_numsteps%evaluationStep)==0:
            avg_reward=eval_policy(agent,envName,10)
            evaluations.append(avg_reward)
            print(f"Evaluation over {10} episodes: {avg_reward:.3f}  step{stepss}")
    # The step budget is checked between episodes, so the final episode always
    # runs to completion before training stops.
    if(stepss>num_steps):
            break
    ep_reward_list.append(episode_reward)
    if (i%printStep)==0:
        # Early episodes report the running mean only; later ones also report
        # the mean over the last 100 episodes.
        if i<100:
            print(f"episode: {i}   reward: {episode_reward}  avg so far:{sum(ep_reward_list)/len(ep_reward_list)} steps so far:{total_numsteps}")
        else:
            print(f"episode: {i}   reward: {episode_reward}  m :{sum(ep_reward_list[-100:])/len(ep_reward_list[-100:])} t {sum(ep_reward_list)/len(ep_reward_list)}    steps so far:{total_numsteps}")

# --- Persist the run configuration and the evaluation curve ------------------
out_dir = f'./data/{envName}/{alg}/seed{seed}'
# exist_ok avoids the check-then-create race and makes re-runs idempotent.
os.makedirs(out_dir, exist_ok=True)
with open(f'{out_dir}/variant.json', 'w') as outfile:
    json.dump(variant,outfile)
data = np.array(evaluations)
df = pd.DataFrame(data=data,columns=["Average Return"]).reset_index()
# Row k was recorded after k * evaluationStep environment steps.
df['Timesteps'] = df['index'] * evaluationStep
df['env'] = envName
df['algorithm_name'] = alg
df.to_csv(f'{out_dir}/progress.csv', index = False)

  f"The environment {id} is out of date. You should consider "
  "This version of the mujoco environments depends "


Evaluation over 10 episodes: 949.487  step5000
Evaluation over 10 episodes: 951.622  step10000
Evaluation over 10 episodes: 588.258  step15000
Evaluation over 10 episodes: 32.416  step20000
Evaluation over 10 episodes: 706.363  step25000
episode: 100   reward: 613.6889813653929  m :-81.0266048503942 t -81.0266048503942    steps so far:26452
Evaluation over 10 episodes: 758.792  step30000
Evaluation over 10 episodes: 731.832  step35000
Evaluation over 10 episodes: 837.498  step40000
Evaluation over 10 episodes: 821.404  step45000
Evaluation over 10 episodes: 680.982  step50000
Evaluation over 10 episodes: 576.656  step55000
Evaluation over 10 episodes: 681.165  step60000
Evaluation over 10 episodes: 778.034  step65000
Evaluation over 10 episodes: 836.481  step70000
Evaluation over 10 episodes: 857.871  step75000
Evaluation over 10 episodes: 903.160  step80000
Evaluation over 10 episodes: 928.894  step85000
Evaluation over 10 episodes: 881.895  step90000
Evaluation over 10 episodes: 907.