# Lunar Landing Continuous using PPO

## Imports

In [1]:
import torch
import torch.nn as nn
import torch.nn.functional as F
from torch.distributions import Normal
from torch.optim import AdamW
import numpy as np
import copy
from collections import deque
import gym
from gym.spaces import Discrete, Box
from itertools import count
import random
from gym.wrappers import NormalizeObservation, NormalizeReward
import torch.optim as optim

  from .autonotebook import tqdm as notebook_tqdm


## Methodology 

In [2]:
T = lambda x: torch.as_tensor(x, dtype=torch.float32)
Ti = lambda x: torch.as_tensor(x, dtype=torch.int64)

In [3]:
# Actor Network
class ActorNet(nn.Module):
    def __init__(self, input_size, hidden_units=64, output_size=2):
        super(ActorNet, self).__init__()
        self.model = nn.Sequential(
            nn.Linear(input_size, hidden_units),
            nn.Tanh(),
            nn.Linear(hidden_units, int(hidden_units/2)),
            nn.Tanh()
        )
        self.mu_head = nn.Linear(int(hidden_units/2),  output_size)
        self.logstd_head = nn.Linear(int(hidden_units/2),  output_size)

    def forward(self, x):
        x = self.model(x)
        loc = torch.tanh(self.mu_head(x)) * 2
        scale = torch.exp(self.logstd_head(x))
        return loc, scale

    def __call__(self, x):
        out = self.forward(x)
        return out

In [4]:
class CriticNet(nn.Module):
    def __init__(self, input_size, hidden_units=64, output_size=2):
        super(CriticNet, self).__init__()
        self.model = nn.Sequential(
            nn.Linear(input_size, hidden_units),
            nn.Tanh(),
            nn.Linear(hidden_units, int(hidden_units/2)),
            nn.Tanh()
        )
        self.value_head = nn.Linear(int(hidden_units/2), 1)

    def forward(self, x):
        x = self.model(x)
        value = self.value_head(x)
        return value

    def __call__(self, x):
        out = self.forward(x)
        return out

In [5]:
class RunningMem():
    def __init__(self):
        self.reset()

    def store(self, obs, action, logprob, reward, done, obs_, values, values_):
        self.obs.append(obs)
        self.actions.append(action.unsqueeze(-1))
        self.logprobs.append(logprob)
        self.rewards.append(reward.unsqueeze(-1))
        self.dones.append(done.unsqueeze(-1))
        self.obs_.append(obs_)
        self.values.append(values)
        self.values_.append(values_)


    def batches(self, batchsize, nenvs, memsteps, gamma, lmbda):
        size = nenvs*memsteps
        idx = list(range(size))
        random.shuffle(idx)

        b_obs = torch.stack(self.obs)
        b_actions = torch.stack(self.actions)
        b_logprobs = torch.stack(self.logprobs)
        b_rewards = torch.stack(self.rewards)
        b_dones = torch.stack(self.dones)
        b_obs_ = torch.stack(self.obs_)
        b_values = torch.stack(self.values)
        b_values_ = torch.stack(self.values_)

        gaes = []
        gae = T(np.zeros(nenvs)).view(nenvs,-1)
        for i in range(len(b_obs)-1,-1,-1):
            delta = b_rewards[i] + gamma * b_values_[i] * (1-b_dones[i]) - b_values[i]
            gae = delta + gamma * lmbda * (1-b_dones[i]) * gae
            gaes.insert(0, gae)
        
        b_obs = b_obs.view(size, -1)
        b_actions = b_actions.view(size, -1)
        b_logprobs = b_logprobs.view(size, -1)
        b_rewards = b_rewards.view(size, -1)
        b_dones = b_dones.view(size, -1)
        b_obs_ = b_obs_.view(size, -1)
        b_values = b_values.view(size, -1)
        b_values_ = b_values_.view(size, -1)
        b_gae = torch.stack(gaes).view(size, -1)

        for batchn in range(0, len(idx), batchsize):
            batchidx = idx[batchn:batchn+batchsize]
            batchidx = Ti(batchidx)
            mb_obs = torch.index_select(b_obs, 0, batchidx)
            mb_actions = torch.index_select(b_actions, 0, batchidx)
            mb_logprobs = torch.index_select(b_logprobs, 0, batchidx)
            mb_rewards = torch.index_select(b_rewards, 0, batchidx)
            mb_dones = torch.index_select(b_dones, 0, batchidx)
            mb_obs_ = torch.index_select(b_obs_, 0, batchidx)
            mb_values = torch.index_select(b_values, 0, batchidx)
            mb_values_ = torch.index_select(b_values_, 0, batchidx)
            mb_gae = torch.index_select(b_gae, 0, batchidx)
            yield mb_obs, mb_actions, mb_logprobs, mb_rewards, mb_dones, mb_obs_, mb_values, mb_values_, mb_gae


    def reset(self):
        self.obs = []
        self.actions = []
        self.logprobs = []
        self.rewards = []
        self.dones = []    
        self.obs_ = []
        self.values = []
        self.values_ = []
        self.gae = []

In [9]:
class PpoContinues():
    def __init__(self, trials = None, num_envs = 5):
        if trials:
            self.optimization(trials)
        else:
            self.num_envs = num_envs

        self.env_name = 'LunarLanderContinuous-v2'
        self.env = gym.vector.make(self.env_name, num_envs=self.num_envs, asynchronous=False) #, new_step_api=True
        self.obs_dim = self.env.single_observation_space.shape[0]
        self.n_acts = self.env.single_action_space.shape[0]

        self.init_actor()
        self.init_critic()

    def initialize_weights(self, m):
        if isinstance(m, nn.Conv2d):
            nn.init.xavier_normal_(m.weight.data, nonlinearity='relu')
            if m.bias is not None:
                nn.init.constant_(m.bias.data, 0)
        elif isinstance(m, nn.BatchNorm2d):
            nn.init.constant_(m.weight.data, 1)
            nn.init.constant_(m.bias.data, 0)
        elif isinstance(m, nn.Linear):
            nn.init.xavier_normal_(m.weight.data)
            nn.init.constant_(m.bias.data, 0)
    
    def optimization(self, trials):
        self.num_envs = trials.suggest_int("num_envs", 1, 10)
    
    def init_actor(self):
        self.actor_hidden_size = 128
        self.ActorNetwork = ActorNet(self.obs_dim, self.actor_hidden_size, self.n_acts)
        self.ActorNetwork.apply(self.initialize_weights)
        self.actor_optim = AdamW(self.ActorNetwork.parameters(), lr=0.0003)
    
    def init_critic(self):
        self.critic_hidden_size = 128
        self.CriticNetwork = CriticNet(self.obs_dim, self.actor_hidden_size, self.n_acts)
        self.CriticNetwork.apply(self.initialize_weights)
        self.critic_optim = AdamW(self.CriticNetwork.parameters(), lr=0.0003)
    
    def sim_action(self, obs):
        loc, std = self.ActorNetwork(T(obs))
        dist = Normal(loc=loc, scale=std+1e-6)
        action = dist.sample()
        action_log_prob = torch.sum(dist.log_prob(action), dim=-1, keepdim=True)
        return action, action_log_prob
    
    def train(self, mem, gamma=0.99, batchsize=10, epoch_repeat=20, epsilon=0.2, lmbda=0.95, memsteps = 500):
        for epochrep in range(epoch_repeat):
            for batch in mem.batches(batchsize, self.num_envs, memsteps, gamma, lmbda):
                obs, actions, logprobs, rewards, dones, obs_, values, values_, gae = batch
                gae = (gae - torch.mean(gae)) / (torch.std(gae) + 1e-6)
                target = gae + values
                state_values = self.CriticNetwork(obs)
                critic_loss = F.smooth_l1_loss(state_values, target).mean()
                
                new_loc, new_scale = self.ActorNetwork(obs)
                dist = Normal(loc=new_loc, scale=new_scale+1e-6)
                new_logprobs = torch.sum(dist.log_prob(actions), dim=-1, keepdim=True)
                rho = torch.exp(new_logprobs - logprobs)
                surrgt1 = rho * gae
                surrgt2 = rho.clamp(1-epsilon, 1+epsilon) * gae
                policy_loss = -torch.minimum(surrgt1, surrgt2).mean()

                loss = policy_loss + 0.5*critic_loss
                self.actor_optim.zero_grad()
                self.critic_optim.zero_grad()
                loss.backward()
                self.actor_optim.step()
                self.critic_optim.step()


In [10]:
model = PpoContinues()
results = deque(maxlen=50)

memsteps = 500
gamma=0.9
lmbda=0.
batchsize=80
mem = RunningMem()

epsilon = 0.18
epoch_repeat=15
totreward = np.zeros(model.num_envs)
stepcount = 0
epoc = 0
obs = model.env.reset()

In [11]:
while True:
    stepcount += 1
    action, action_log_prob = model.sim_action(obs)
    next_obs, reward, done, _ = model.env.step(action.numpy())
    with torch.no_grad():
        values = model.CriticNetwork(T(obs))
        values_ = model.CriticNetwork(T(next_obs))
    mem.store(T(obs), action, action_log_prob, T(reward), Ti(done), T(next_obs), values, values_)
    obs = next_obs
    totreward = reward
    doneidx = np.where(done==True)

    for k in doneidx[0]:
        results.append(totreward[k])
        totreward[k] = 0
            
    if stepcount>1 and stepcount % memsteps == 0:
        epoc+=1
        model.train(mem, gamma=gamma, batchsize=batchsize, epoch_repeat=epoch_repeat, epsilon=epsilon, lmbda=lmbda, memsteps=memsteps)   
        mem.reset()
        if epoc % 10 == 0:
            print(f'Epoc: {epoc} Avg Result: {np.mean(results)}')
            
    if len(results)>0 and np.mean(results) > 195:
        print(f'Solved!  Epoc: {epoc} Avg Result: {np.mean(results)}')
        break


RuntimeError: one of the variables needed for gradient computation has been modified by an inplace operation: [torch.FloatTensor [64, 2]], which is output 0 of AsStridedBackward0, is at version 3; expected version 1 instead. Hint: enable anomaly detection to find the operation that failed to compute its gradient, with torch.autograd.set_detect_anomaly(True).