In [None]:
# !conda install swig
# !pip install gym[all]

# !pip install pyvirtualdisplay
# !pip install tqdm
# !pip install neptune-client
# !conda install pytorch torchvision torchaudio cudatoolkit=11.3 -c pytorch

In [None]:
# import sys
# import os
# # sys.path.append('set PATH=C:/Users/c3296143/.mujoco/mujoco200/bin;%PATH%')
# # sys.path.append('set PATH=C://Users//c3296143//.mujoco//mujoco200//bin;%PATH%')
# # sys.path.append('C:/Users/c3296143/.mujoco/mujoco200/bin')
# # sys.path.append('C://Users//c3296143//.mujoco//mujoco200//bin')
# # os.environ['LD_LIBRARY_PATH']=os.environ['LD_LIBRARY_PATH'] + 'C:/Users/c3296143/root/.mujoco/mujoco200/bin'
# # old = os.environ.get("LD_LIBRARY_PATH")
# old = os.environ.get("PATH")
# # os.environ["PATH"] = 'C:\> set PATH=%PATH%;C:/Users/c3296143/.mujoco/mujoco200/bin'
# # os.environ["PATH"] = 'set PATH=C:/Users/c3296143/.mujoco/mujoco200/bin;%PATH%'

# if old:
#     os.environ["PATH"] = old + ";" +'C:\\Users\\c3296143\\.mujoco\\mujoco200\\bin'
# #     os.environ["LD_LIBRARY_PATH"] = old + ":" + 'C:/Users/c3296143/.mujoco/mujoco200/bin'
# # else:
# #     os.environ["LD_LIBRARY_PATH"] = 'C:/Users/c3296143/.mujoco/mujoco200/bin'
# else:
#     os.environ["PATH"] = 'C:\\Users\\c3296143\\.mujoco\\mujoco200\\bin'

# print(os.environ["PATH"])

In [None]:
import math
import os
import random
import time
from datetime import datetime

import glfw

In [None]:
from copy import deepcopy
from collections import namedtuple, deque
import gym
import torch
import torch.nn as nn
from torch.optim import Adam
from torch.optim import RMSprop

In [None]:
from typing import Dict, List, Optional, Tuple
from tqdm import tqdm
import numpy as np
import neptune.new as neptune
from PIL import Image
import imageio
from pyvirtualdisplay import Display
# Display().start()

In [None]:
# Start the Neptune run used for experiment tracking in the cells below.
#
# SECURITY: the API token is a credential and must not live in the notebook.
# It is read from the NEPTUNE_API_TOKEN environment variable; set it in the
# shell (or kernel environment) before starting Jupyter. Any token that was
# previously committed with this notebook should be revoked and regenerated.
nep_log = neptune.init(
    project="xhnfirst/RA-DDPG-IP-test",
    api_token=os.environ.get("NEPTUNE_API_TOKEN"),
)

In [None]:
# Environment shared by training, evaluation and video recording.
# NOTE(review): 'InvertedPendulum-v5-down' looks like a custom registration
# rather than a stock gym id -- confirm it is registered before this cell runs.
env = gym.make('InvertedPendulum-v5-down')

# NOTE(review): `frame` is not read by any code visible in this notebook.
frame = list()

# Prefer the GPU when one is available; all tensors below are created on it.
if torch.cuda.is_available():
    device = torch.device("cuda")
else:
    device = torch.device("cpu")
print('device = ', device)

In [None]:
def mlp(sizes, activation, output_activation=nn.Identity):
    """Build a fully connected network as an ``nn.Sequential``.

    Args:
        sizes: Layer widths, input width first and output width last. Each
            consecutive pair becomes one ``nn.Linear``.
        activation: Module class (instantiated with no arguments) placed
            after every linear layer except the last one.
        output_activation: Module class placed after the last linear layer.

    Returns:
        An ``nn.Sequential`` alternating linear layers and activations; it is
        empty when ``sizes`` has fewer than two entries.
    """
    final_idx = len(sizes) - 2  # index of the last Linear layer
    modules = []
    for idx, (n_in, n_out) in enumerate(zip(sizes[:-1], sizes[1:])):
        modules.append(nn.Linear(n_in, n_out))
        modules.append(output_activation() if idx == final_idx else activation())
    return nn.Sequential(*modules)


class MLPActor(nn.Module):
    """Deterministic policy network producing actions bounded by ``act_limit``.

    The underlying MLP ends in ``tanh``, so its raw output lies in [-1, 1];
    ``forward`` multiplies that output by ``act_limit``.
    """

    def __init__(self, obs_dim, act_dim, hidden_sizes, activation, act_limit):
        """Create the policy MLP.

        Args:
            obs_dim: Width of the observation input.
            act_dim: Width of the action output.
            hidden_sizes: Iterable of hidden layer widths.
            activation: Module class used between hidden layers.
            act_limit: Scalar the tanh output is multiplied by.
        """
        super().__init__()
        self.act_limit = act_limit
        layer_sizes = [obs_dim, *hidden_sizes, act_dim]
        self.pi = mlp(layer_sizes, activation, nn.Tanh)

    def forward(self, obs):
        """Return the policy output for ``obs`` scaled by ``act_limit``."""
        squashed = self.pi(obs)
        return self.act_limit * squashed

class MLPQFunction(nn.Module):
    """Q-function network scoring (observation, action) pairs."""

    def __init__(self, obs_dim, act_dim, hidden_sizes, activation):
        """Create the Q MLP.

        Args:
            obs_dim: Width of the observation input.
            act_dim: Width of the action input.
            hidden_sizes: Iterable of hidden layer widths.
            activation: Module class used between hidden layers.
        """
        super().__init__()
        layer_sizes = [obs_dim + act_dim, *hidden_sizes, 1]
        self.q = mlp(layer_sizes, activation)

    def forward(self, obs, act):
        """Return Q(obs, act) with the trailing size-1 dimension removed."""
        joint = torch.cat((obs, act), dim=-1)
        # Dropping the last axis keeps Q the same shape as rewards/done flags
        # so the Bellman backup does not broadcast unexpectedly.
        return self.q(joint).squeeze(-1)

class MLPActorCritic(nn.Module):
    """Container holding the DDPG policy (``pi``) and Q-function (``q``)."""

    def __init__(self, hidden_sizes=(256,256),
                 activation=nn.ReLU, device=torch.device("cuda" if torch.cuda.is_available() else "cpu"),
                 obs_dim=4, act_dim=1, act_limit=1):
        """Build both networks and move them to ``device``.

        Args:
            hidden_sizes: Hidden layer widths shared by both networks.
            activation: Module class used between hidden layers.
            device: Device the networks are moved to. The default is
                evaluated once, when the class is defined.
            obs_dim: Observation width (defaults to the previously
                hard-coded value 4).
            act_dim: Action width (defaults to the previously hard-coded
                value 1).
            act_limit: Bound applied to the policy output (defaults to the
                previously hard-coded value 1).
        """
        super().__init__()

        # build policy and value functions
        self.pi = MLPActor(obs_dim, act_dim, hidden_sizes, activation, act_limit).to(device)
        self.q = MLPQFunction(obs_dim, act_dim, hidden_sizes, activation).to(device)

    def act(self, obs):
        """Return the policy's action for ``obs`` without tracking gradients."""
        with torch.no_grad():
            return self.pi(obs)

In [None]:
# One stored environment step: (obs, act, rew, next_obs, done).
Transition = namedtuple(
    'Transition',
    ('obs', 'act', 'rew', 'next_obs', 'done'),
)


class ReplayMemory:
    """Fixed-capacity FIFO buffer of ``Transition`` records."""

    def __init__(self, capacity):
        # A bounded deque silently evicts the oldest entry once full.
        self.memory = deque(maxlen=capacity)

    def push(self, *args):
        """Store one transition; ``args`` are the ``Transition`` fields in order."""
        record = Transition(*args)
        self.memory.append(record)

    def sample(self, batch_size):
        """Return ``batch_size`` distinct stored transitions chosen at random.

        Raises:
            ValueError: If ``batch_size`` exceeds the number of stored items.
        """
        return random.sample(self.memory, batch_size)

    def __len__(self):
        """Number of transitions currently stored."""
        return len(self.memory)

In [None]:
# Hyperparameters for the run. The whole dict is also uploaded to Neptune
# under "parameters" by a later cell.
params = {
    "dropout": 0.2,  # NOTE(review): not read by any code visible in this notebook
    "learning_rate": 0.001,  # NOTE(review): not read; the optimizers use pi_lr / q_lr
    # "optimizer": "SGD",
    "optimizer": "Adam",  # NOTE(review): not read; the optimizer cell hard-codes RMSprop
    "hid": 64,  # width of each hidden layer (see ac_kwargs below)
    "l": 3,  # number of hidden layers (see ac_kwargs below)
    "seed": 0,  # seed passed to torch.manual_seed and np.random.seed
    "steps_per_epoch": 500,  # env steps between train-reward logging / evaluation
    "steps_video": 5000,  # only referenced by the commented-out video block
    "epochs": 1000,  # total env steps = steps_per_epoch * epochs
    "replay_size": int(1e8),  # capacity handed to ReplayMemory
    "gamma": 0.98,  # discount factor in the Bellman backup
    "polyak": 0.995,  # weight kept from the old target params at each target update
    "pi_lr": 1e-4,  # learning rate of the policy optimizer
    "q_lr": 1e-4,  # learning rate of the Q optimizer
    "batch_size": 1000,  # transitions sampled per gradient step
    "start_steps": 3000,  # steps using uniform random actions before the policy acts
    "update_after": 1500,  # no gradient updates before this step index
    "update_every": 300,  # update cadence; also the number of updates per round
    "act_noise": 0.01,  # scale of the Gaussian noise added to training actions
    "num_test_episodes": 5,  # evaluation episodes per epoch
    "max_ep_len": 100,  # fixed episode length for training and evaluation
    "max_video_len": 100,  # number of env steps recorded by video_agent
    "save_model_len": 20000,  # env steps between checkpoints
    # The three values below are printed and used by get_action; the networks
    # themselves are constructed from ac_kwargs only, which does not include them.
    "obs_dim": 4,
    "act_dim": 1,
    "act_limit": 1
}

# Keyword arguments for MLPActorCritic: "l" hidden layers of "hid" units each.
ac_kwargs=dict(hidden_sizes=[params["hid"]]*params["l"])

In [None]:
# Record the hyperparameters with the Neptune run.
nep_log["parameters"] = params

# Seed every RNG this notebook draws from. Python's `random` must be seeded
# too: ReplayMemory.sample uses random.sample, so without it the minibatches
# (and therefore training) differ from run to run even with torch/numpy seeded.
random.seed(params["seed"])
torch.manual_seed(params["seed"])
np.random.seed(params["seed"])

# Action limit for clamping: critically, assumes all dimensions share the same bound!
print('obs_dim = ', params["obs_dim"] )
print('act_dim = ', params["act_dim"])
print('act_limit = ', params["act_limit"])

# Create actor-critic module and target networks
ac = MLPActorCritic(**ac_kwargs)
ac_targ = deepcopy(ac)

# Freeze target networks with respect to optimizers (only update via polyak averaging)
for p in ac_targ.parameters():
    p.requires_grad = False

# Replay buffer shared by the training loop and the update step.
memory = ReplayMemory(params["replay_size"])

In [None]:
# DDPG critic loss
def compute_loss_q(data):
    """Mean-squared Bellman error of the Q-network on one minibatch.

    Args:
        data: A ``Transition`` whose fields are each a sequence of tensors
            (one per sampled step); every field is concatenated along dim 0
            and cast to float32.

    Returns:
        Scalar tensor: mean of ``(Q(o, a) - backup) ** 2``.
    """
    obs, act, rew, next_obs, done = (
        torch.cat(field).float()
        for field in (data.obs, data.act, data.rew, data.next_obs, data.done)
    )
    q_pred = ac.q(obs, act)

    # The target uses only the frozen target networks, so no gradient is needed.
    with torch.no_grad():
        next_q = ac_targ.q(next_obs, ac_targ.pi(next_obs))
        # (1 - done) removes the bootstrap term for terminal transitions.
        target = rew + params["gamma"] * (1 - done) * next_q

    return ((q_pred - target) ** 2).mean()

# DDPG actor loss
def compute_loss_pi(data):
    """Policy loss: negated mean Q-value of the policy's own actions.

    Args:
        data: A ``Transition`` whose ``obs`` field is a sequence of tensors;
            only that field is used.

    Returns:
        Scalar tensor ``-mean(Q(o, pi(o)))``; minimising it raises the
        Q-value of the actions the policy selects.
    """
    obs = torch.cat(data.obs).float()
    policy_action = ac.pi(obs)
    return torch.neg(ac.q(obs, policy_action).mean())

In [None]:
# Separate optimizers for the policy and the Q-function, each with its own
# learning rate from `params`.
# NOTE(review): params["optimizer"] is "Adam" but is not consulted here;
# RMSprop is what actually trains both networks.
pi_optimizer = RMSprop(ac.pi.parameters(), lr=params["pi_lr"])
q_optimizer = RMSprop(ac.q.parameters(), lr=params["q_lr"])

def update(data):
    """Perform one DDPG update from a minibatch.

    Steps, in order: one optimizer step on the Q-network, one optimizer step
    on the policy (with the Q-network's parameters temporarily excluded from
    autograd), then a polyak update of the target networks.

    Args:
        data: A ``Transition`` of batched fields, as consumed by
            ``compute_loss_q`` and ``compute_loss_pi``.
    """
    # --- critic step ---
    q_optimizer.zero_grad()
    critic_loss = compute_loss_q(data)
    critic_loss.backward()
    q_optimizer.step()

    # --- actor step ---
    # The policy loss backpropagates through Q; turning off requires_grad on
    # Q's parameters avoids computing gradients that would be thrown away.
    for weight in ac.q.parameters():
        weight.requires_grad_(False)

    pi_optimizer.zero_grad()
    actor_loss = compute_loss_pi(data)
    actor_loss.backward()
    pi_optimizer.step()

    # Re-enable gradients so the next critic step can train Q again.
    for weight in ac.q.parameters():
        weight.requires_grad_(True)

    # --- target update: targ <- polyak * targ + (1 - polyak) * online ---
    tau = params["polyak"]
    with torch.no_grad():
        for online, target in zip(ac.parameters(), ac_targ.parameters()):
            # In-place ops keep the target tensors themselves, rather than
            # rebinding them to freshly allocated ones.
            target.data.mul_(tau)
            target.data.add_((1 - tau) * online.data)

In [None]:
def get_action(o, noise_scale):
    """Return the policy's action for ``o`` with optional Gaussian noise.

    Args:
        o: Observation; converted to a float32 tensor before the forward pass.
        noise_scale: Multiplier on standard-normal noise of size
            ``params["act_dim"]``; ``0`` gives the noise-free policy action.

    Returns:
        Action tensor clipped to ``[-act_limit, act_limit]``.
    """
    obs_tensor = torch.as_tensor(o, dtype=torch.float32)
    action = ac.act(obs_tensor)
    noise = torch.randn(params["act_dim"]).to(device)
    action += noise_scale * noise
    bound = params["act_limit"]
    return torch.clip(action, -bound, bound)

def test_agent(epoch):
    """Evaluate the noise-free policy and report the mean per-step reward.

    Runs ``params["num_test_episodes"]`` episodes of exactly
    ``params["max_ep_len"]`` steps each (the environment's done flag is not
    consulted), averages the reward per step within each episode, then
    averages across episodes. The result is printed and logged to Neptune
    under ``test/reward``.

    Args:
        epoch: Accepted for the caller's convenience; not used here.
    """
    per_episode_means = []
    for _ in range(params["num_test_episodes"]):
        o = torch.tensor([env.reset()], dtype=torch.float32, device=device)
        test_ep_ret, test_ep_len = 0, 0
        while test_ep_len != params["max_ep_len"]:
            action_np = get_action(o, 0).cpu().data.numpy()
            obs, r, d, _ = env.step(action_np[0])
            o = torch.tensor([obs], dtype=torch.float32, device=device)
            test_ep_ret += r
            test_ep_len += 1
        per_episode_means.append(test_ep_ret / test_ep_len)

    mean_reward = sum(per_episode_means) / len(per_episode_means)
    print('test_rew_main = ', float(mean_reward))
    nep_log["test/reward"].log(mean_reward)



def video_agent(epoch):
    """Record one noise-free rollout and save it as an animated GIF.

    The environment is reset first, so the first frame shows the initial
    state of the recorded episode; one further frame is captured after each
    of ``params["max_video_len"]`` steps. The GIF is written to
    ``images/inverted-pendulum-v5-<timestamp><epoch>.gif``; the ``images``
    directory is created if it does not exist.

    Args:
        epoch: Integer appended to the timestamp in the output file name.
    """
    # Reset BEFORE rendering, otherwise the first frame shows whatever state
    # the environment was left in by the previous rollout.
    obs = env.reset()
    frames = [Image.fromarray(env.render(mode='rgb_array'))]
    o = torch.tensor([obs], dtype=torch.float32, device=device)

    for _ in range(params["max_video_len"]):
        a_cpu = get_action(o, 0).cpu().data.numpy()
        obs, r, d, _ = env.step(a_cpu[0])
        frames.append(Image.fromarray(env.render(mode='rgb_array')))
        o = torch.tensor([obs], dtype=torch.float32, device=device)

    # Saving fails if the target directory is missing.
    os.makedirs('images', exist_ok=True)

    # ':' is not allowed in file names on Windows, hence the replacement.
    current_time = datetime.now().isoformat().replace(":", "-")
    image_file = 'images/inverted-pendulum-v5-%s%d.gif' % (current_time, epoch)

    # The first frame is the base image and the rest are appended, so every
    # frame appears exactly once.
    frames[0].save(image_file, save_all=True, append_images=frames[1:], loop=0, duration=1)

In [None]:

# Start the first training episode and zero the per-episode accumulators.
obs, ep_ret, ep_len = env.reset(), 0, 0
o = obs

# NOTE(review): `neutral` is not read by any code visible in this notebook;
# kept in case another cell relies on it.
neutral = np.zeros(7)

# Prepare for interaction with environment
total_steps = params["steps_per_epoch"] * params["epochs"]
start_time = time.time()

# Batch of one observation. dtype is given explicitly so this tensor matches
# the float32 next-observation tensors created in the training loop; without
# it a float64 numpy observation produces a float64 tensor and the replay
# buffer ends up holding mixed dtypes.
o = torch.tensor([o], dtype=torch.float32, device=device)


start_time_rec = datetime.now()
r_true = 0
total_main = 0      # sum of all training rewards so far
ep_rew_main = 0     # sum of per-episode mean rewards within the current epoch
reward_dict={}

In [None]:
# Print the name and shape of every tensor in both networks' state_dicts.
for heading, model in (
    ("Model_q's state_dict:", ac.q),
    ("Model_pi's state_dict:", ac.pi),
):
    print(heading)
    for param_tensor, tensor in model.state_dict().items():
        print(param_tensor, "\t", tensor.size())

In [None]:
# Main loop: collect experience in env and update/log each epoch
low = -1
high = 1

# Begin a fresh episode and KEEP its observation. Resetting the environment
# without updating `o` would pair a stale observation with the first action
# and store a transition that never happened.
obs, ep_ret, ep_len = env.reset(), 0, 0
o = torch.tensor([obs], dtype=torch.float32, device=device)

# Checkpoints are written below; create the directory up front so a missing
# folder cannot abort training at the first save.
os.makedirs("model_nn", exist_ok=True)

for t in tqdm(range(total_steps)):

    # Until start_steps have elapsed, randomly sample actions
    # from a uniform distribution for better exploration. Afterwards,
    # use the learned policy (with some noise, via act_noise).
    if t > params["start_steps"]:
        a = get_action(o, params["act_noise"])      # Tensor
        a_out = a.cpu().data.numpy()
        a_cpu = a_out[0]
    else:
        a = torch.tensor([np.random.uniform(low, high)], dtype=torch.float32, device=device)
        a_cpu = a.cpu().data.numpy()

    # Step the env
    obs2, r, d, _= env.step(a_cpu)

    # Accumulate the raw (Python/numpy) reward before it is wrapped in a tensor.
    ep_len += 1
    total_main += r
    ep_ret += r

    # Wrap everything as single-row tensors so torch.cat can batch them later.
    a_s = torch.tensor([a_cpu], dtype=torch.float32, device=device)
    o2 = torch.tensor([obs2], dtype=torch.float32, device=device)
    r = torch.tensor([r], dtype=torch.float32, device=device)
    d = torch.tensor([d], dtype=torch.float32, device=device)

    # Store experience to replay buffer
    memory.push(o, a_s, r, o2, d)
    nep_log["train/o"].log(o)
    nep_log["train/a"].log(a)
    nep_log["train/r"].log(r)
    nep_log["train/o2"].log(o2)
    nep_log["train/d"].log(d)

    # Super critical, easy to overlook step: make sure to update
    # most recent observation!
    o=o2

    # End of trajectory handling: episodes have a fixed length; the env's
    # done flag is stored in the buffer but does not end the episode here.
    if (ep_len == params["max_ep_len"]):
        ep_rew = ep_ret/params["max_ep_len"]
        ep_rew_main += ep_rew
        obs, ep_ret, ep_len = env.reset(), 0, 0
        # float32 to match every other observation stored in the buffer.
        o = torch.tensor([obs], dtype=torch.float32, device=device)

    # Update handling: every update_every steps, run update_every gradient
    # steps so the update-to-data ratio stays at one.
    if t >= params["update_after"] and t % params["update_every"] == 0:
        for i in range(params["update_every"]):

            transitions = memory.sample(params["batch_size"])
            # Transpose the batch (see https://stackoverflow.com/a/19343/3343043 for
            # detailed explanation). This converts batch-array of Transitions
            # to Transition of batch-arrays.
            batch = Transition(*zip(*transitions))
            update(data=batch)

    # End of epoch handling
    if (t+1) % params["steps_per_epoch"] == 0:
        epoch = (t+1) // params["steps_per_epoch"]
        # Mean of the per-episode mean rewards collected during this epoch.
        train_reward = ep_rew_main/(params["steps_per_epoch"]/params["max_ep_len"])
        nep_log["train/reward"].log(train_reward)
        nep_log["train/total_main"].log(total_main)
        print('train_rew_main = ', train_reward)
        ep_rew_main = 0
        # Test the performance of the deterministic version of the agent.
        test_agent(epoch)

    # Video recording is currently disabled:
    # if (t+1) % params["steps_video"] == 0:
    #     epoch = (t+1) // params["steps_per_epoch"]
    #     video_agent(epoch)

    # Periodic checkpoint of both networks and both optimizers.
    if (t+1) % params["save_model_len"] == 0:
        epoch = (t+1) // params["steps_per_epoch"]
        now = datetime.now()
        current_time = str(now.isoformat())
        torch.save({
                    'model of ac.q': ac.q.state_dict(),
                    'model of ac.pi': ac.pi.state_dict(),
                    'q_optimizer_state_dict': q_optimizer.state_dict(),
                    'pi_optimizer_state_dict': pi_optimizer.state_dict(),

                    }, "model_nn/model_nn_%s%d.pt" % (current_time.replace(":","-"), epoch))


In [None]:
# Print every top-level entry of both optimizers' state_dicts.
for heading, optimizer in (
    ("pi_optimizer's state_dict:", pi_optimizer),
    ("q_optimizer's state_dict:", q_optimizer),
):
    print(heading)
    for var_name, value in optimizer.state_dict().items():
        print(var_name, "\t", value)

In [None]:
# Final checkpoint of both networks and both optimizers.
# `epoch` is the value left behind by the training loop, so this cell must
# run after at least one epoch has completed.
now = datetime.now()

# ':' is not allowed in file names on Windows; it is replaced below.
current_time = str(now.isoformat())

# torch.save does not create missing directories.
os.makedirs("model_nn", exist_ok=True)

torch.save({
            'model of ac.q': ac.q.state_dict(),
            'model of ac.pi': ac.pi.state_dict(),
            'q_optimizer_state_dict': q_optimizer.state_dict(),
            'pi_optimizer_state_dict': pi_optimizer.state_dict(),

            }, "model_nn/model_nn_%s%d.pt" % (current_time.replace(":","-"), epoch))

In [None]:
# Stop the Neptune run created at the top of the notebook once training and
# checkpointing are done.
nep_log.stop()