# Lunar Landing Using Policy Gradient

## Imports

In [29]:
import torch
import torch.nn as nn
import torch.nn.functional as F
from torch.distributions.categorical import Categorical
from torch.optim import AdamW
import torch.optim as optim
import numpy as np
import random
from collections import deque
from torch.utils.data import IterableDataset
from torch.utils.data import DataLoader
import gym
from gym.spaces import Discrete, Box

import optuna
from optuna.integration.tensorboard import TensorBoardCallback

import warnings
warnings.filterwarnings("ignore")




## Methodology

In [5]:
class Data(IterableDataset):
    """On-policy rollout dataset for policy-gradient training.

    Every pass over the dataset runs ``policy`` in the vectorized ``env`` for
    ``steps`` steps, turns the collected rewards into discounted, normalized
    returns and yields the ``(observation, action, return)`` samples in
    random order.

    Args:
        env: Vectorized environment exposing ``num_envs``, ``reset()`` and a
            ``step(actions)`` method returning ``(obs, reward, done, info)``.
        policy: Callable mapping a float32 observation tensor to per-action
            probabilities (one row per environment).
        steps: Number of environment steps collected per pass.
        gamma: Discount factor applied to future rewards.
    """

    def __init__(self, env, policy, steps, gamma):
        self.env = env
        self.policy = policy
        self.steps = steps
        self.gamma = gamma
        # The current observation is kept between passes so every rollout
        # continues from where the previous one stopped.
        self.obs = env.reset()

    def _collect_rollout(self):
        """Step the environment ``self.steps`` times and stack the results."""
        transitions = []
        for _ in range(self.steps):
            with torch.no_grad():
                probs = self.policy(torch.as_tensor(self.obs, dtype=torch.float32))
                # One sampled action per environment, shape (num_envs, 1).
                action = probs.multinomial(1).cpu().numpy()
            next_obs, reward, done, _info = self.env.step(action.flatten())
            transitions.append((self.obs, action, reward, done))
            self.obs = next_obs
        return tuple(map(np.stack, zip(*transitions)))

    def _discounted_returns(self, reward_b, done_b):
        """Return the per-step discounted return for every environment."""
        # Explicit float mask: 0 where an episode ended, so the running
        # return does not leak across episode boundaries.
        not_done = 1.0 - np.asarray(done_b, dtype=np.float32)
        # float32 keeps the training weights in the same dtype as the
        # observations instead of inheriting float64 from the rewards.
        return_b = np.zeros(reward_b.shape, dtype=np.float32)
        running_return = np.zeros(self.env.num_envs, dtype=np.float32)
        for row in reversed(range(self.steps)):
            running_return = reward_b[row] + not_done[row] * self.gamma * running_return
            return_b[row] = running_return
        return return_b

    def __iter__(self):
        if self.steps <= 0:
            # Nothing to collect; yield no samples instead of failing on
            # the empty unpack below.
            return

        obs_b, action_b, reward_b, done_b = self._collect_rollout()
        return_b = self._discounted_returns(reward_b, done_b)

        # Flatten (steps, num_envs, ...) into one row per transition.
        num_samples = self.env.num_envs * self.steps
        obs_b = obs_b.reshape(num_samples, -1)
        action_b = action_b.reshape(num_samples, -1)
        return_b = return_b.reshape(num_samples, -1)

        # The epsilon must be added to the standard deviation (not to the
        # data) so that a rollout with constant returns does not divide by 0.
        return_b = (return_b - return_b.mean()) / (return_b.std() + 1e-06)

        idx = list(range(num_samples))
        random.shuffle(idx)

        for i in idx:
            yield obs_b[i], action_b[i], return_b[i]

In [6]:
class PolicyNet(nn.Module):
    """Two-hidden-layer MLP that maps observations to action probabilities.

    Args:
        input_size: Number of features in one observation.
        hidden_units: Width of each of the two hidden layers.
        output_size: Number of discrete actions.
    """

    def __init__(self, input_size, hidden_units=64, output_size=2):
        super().__init__()
        self.model = nn.Sequential(
            nn.Linear(input_size, hidden_units),
            nn.ReLU(),
            nn.Linear(hidden_units, hidden_units),
            nn.ReLU(),
            nn.Linear(hidden_units, output_size),
            # Softmax over the last dimension so both single observations
            # and batches produce a probability vector per observation.
            nn.Softmax(dim=-1),
        )

    # NOTE: ``__call__`` is intentionally not overridden. ``nn.Module``
    # already dispatches ``net(x)`` to ``forward`` and additionally runs any
    # registered hooks, which a custom ``__call__`` would silently skip.
    def forward(self, x):
        """Return the action probabilities for observation tensor ``x``."""
        return self.model(x)

In [26]:
class PolicyGradient():
    """Policy-gradient agent for ``LunarLander-v2``.

    Builds a vectorized training environment, a ``PolicyNet`` with its
    optimizer, and a ``DataLoader`` over on-policy rollouts (``Data``).

    Args:
        trials: Optional Optuna trial. When given, the hyperparameters are
            sampled from it instead of using the defaults below.
    """

    def __init__(self, trials = None):
        # Default hyperparameters (overridden by ``optimization``).
        self.num_envs = 5
        self.hidden_size = 24
        self.gamma = 0.99
        self.steps = 1500
        self.batch_size = 64
        self.epochs = 1000
        # Explicit ``None`` check: the decision must not depend on the
        # truthiness of whatever trial object is passed in.
        if trials is not None:
            self.optimization(trials)

        self.env_name = "LunarLander-v2"
        self.env = gym.vector.make(self.env_name, num_envs=self.num_envs, asynchronous=False)
        self.obs_dim = self.env.single_observation_space.shape[0]
        self.n_acts = self.env.single_action_space.n
        self.createPolicyNet()

        self.data = Data(self.env, self.logits_net, self.steps, self.gamma) # env, policy, steps, gamma
        self.loader = DataLoader(self.data, batch_size=self.batch_size)

    def optimization(self, trials):
        """Sample the tunable hyperparameters from the Optuna trial."""
        self.num_envs = trials.suggest_int("num_envs", 1, 10)
        self.hidden_size = trials.suggest_int("hidden_size", 16, 32)
        self.gamma = trials.suggest_float("gamma", 0.5, 1)
        self.steps = trials.suggest_int("steps", 1000, 2000)
        self.batch_size = trials.suggest_int("batch_size", 32, 64)

    def createPolicyNet(self):
        """Create the policy network, initialize it and attach an AdamW optimizer."""
        self.logits_net = PolicyNet(self.obs_dim, self.hidden_size, self.n_acts)
        self.logits_net.apply(self.initialize_weights)
        self.optimizer = AdamW(self.logits_net.parameters(), lr=0.0003)

    def initialize_weights(self, m):
        """Initialize module ``m`` in place; meant for ``nn.Module.apply``."""
        if isinstance(m, nn.Conv2d):
            nn.init.kaiming_uniform_(m.weight.data, nonlinearity='relu')
            if m.bias is not None:
                nn.init.constant_(m.bias.data, 0)
        elif isinstance(m, nn.BatchNorm2d):
            nn.init.constant_(m.weight.data, 1)
            nn.init.constant_(m.bias.data, 0)
        elif isinstance(m, nn.Linear):
            # Same gain as the library default (leaky_relu with slope 0),
            # spelled out to match the Conv2d branch.
            nn.init.kaiming_uniform_(m.weight.data, nonlinearity='relu')
            nn.init.constant_(m.bias.data, 0)

    def get_policy(self, obs):
        """Return the categorical action distribution for ``obs``."""
        probs = self.logits_net(obs)
        return Categorical(probs=probs)

    def get_action(self, obs):
        """Sample one action for a single observation and return it as an int."""
        # Sampling is only used for acting, so no autograd graph is needed.
        with torch.no_grad():
            return self.get_policy(obs).sample().item()

    def compute_loss(self, obs, act, weights):
        """Return the policy-gradient surrogate loss for one batch.

        Args:
            obs: Batch of observations, one row per sample.
            act: Action index taken for every sample; ``(B,)`` or ``(B, 1)``.
            weights: Return weighting every sample; ``(B,)`` or ``(B, 1)``.

        Returns:
            Scalar tensor ``-mean(log pi(act | obs) * weights)``.
        """
        probs = self.logits_net(obs)
        # Small epsilon guards against log(0) for vanishing probabilities.
        log_probs = torch.log(probs + 1e-6)
        # Normalize both inputs to column vectors: ``gather`` needs a 2-D
        # int64 index, and a (B,) weight against a (B, 1) log-prob would
        # silently broadcast to (B, B).
        act = torch.as_tensor(act).reshape(-1, 1).long()
        weights = torch.as_tensor(weights).reshape(-1, 1).to(log_probs.dtype)
        action_log_prob = log_probs.gather(1, act)
        return -(action_log_prob * weights).mean()

    def train_one_epoch(self):
        """Collect one rollout and take a gradient step per mini-batch.

        Iterating ``self.loader`` triggers a fresh rollout with the current
        policy. Always returns 0 (kept for backward compatibility).
        """
        for batch_obs, batch_acts, batch_weights in self.loader:
            # take a single policy gradient update step
            self.optimizer.zero_grad()
            batch_loss = self.compute_loss(obs=batch_obs, act=batch_acts, weights=batch_weights)
            batch_loss.backward()
            self.optimizer.step()
        return 0

    def run_test(self, trajectories):
        """Run ``trajectories`` evaluation episodes and return the mean return."""
        env2 = gym.make(self.env_name)
        scores = []
        try:
            for _ in range(trajectories):
                trajectory_return = 0
                obs = env2.reset()
                done = False
                while not done:
                    action = self.get_action(torch.as_tensor(obs, dtype=torch.float32))
                    obs, reward, done, _ = env2.step(action)
                    trajectory_return += reward
                scores.append(trajectory_return)
        finally:
            # Dropping the reference does not release the environment's
            # resources; close it explicitly, even if an episode fails.
            env2.close()
        return np.mean(scores)


## Hyperparameter Optimization

In [27]:
def objective(trial):
    """Optuna objective: train a ``PolicyGradient`` agent and return its score.

    The agent's hyperparameters and the optimizer class are sampled from
    ``trial``. After every epoch the mean return over 5 test episodes is
    reported so that unpromising trials can be pruned.

    Args:
        trial: The Optuna trial supplying the hyperparameter suggestions.

    Returns:
        The test score of the last completed epoch (``-inf`` when the model
        is configured with zero epochs).

    Raises:
        optuna.exceptions.TrialPruned: When the pruner stops the trial.
    """
    model = PolicyGradient(trial)

    optimizer_name = trial.suggest_categorical('optimizer', ['Adam', 'RMSprop', "SGD"])
    optimizer_cls = getattr(optim, optimizer_name)
    # Actually install the suggested optimizer; otherwise the suggestion has
    # no effect and every trial trains with the model's default optimizer.
    # The learning rate is taken over from the optimizer being replaced.
    learning_rate = model.optimizer.param_groups[0]["lr"]
    model.optimizer = optimizer_cls(model.logits_net.parameters(), lr=learning_rate)

    # Defined up front so the function still returns a value when the
    # training loop does not run at all.
    score = float("-inf")
    for epoch in range(model.epochs):
        model.train_one_epoch()
        score = model.run_test(5)

        trial.report(score, epoch)

        if trial.should_prune():
            raise optuna.exceptions.TrialPruned()

    return score

In [None]:
# Log every finished trial's "score" to TensorBoard under logs/.
tensorboard_callback = TensorBoardCallback("logs/", metric_name="score")

# Search for the hyperparameters that maximize the test score. The search
# is configured with at most 10 trials and a 600 second timeout.
study = optuna.create_study(direction="maximize")
study.optimize(
    objective,
    n_trials=10,
    timeout=600,
    callbacks=[tensorboard_callback],
)

# Report the best trial found by the study.
trial = study.best_trial
print(f"Score: {trial.value}")
print(f"Best Hyperparameters: {trial.params}")

In [None]:
# Train an agent with the default hyperparameters and stop early once the
# test score reaches the target. ``train_one_epoch`` and ``run_test`` are
# methods of ``PolicyGradient``, so they have to be called on an instance.
model = PolicyGradient()
# NOTE(review): 195 looks like the CartPole threshold; LunarLander-v2 is
# usually considered solved at 200 -- confirm the intended target.
solved_score = 195
for epoch in range(model.epochs):
    model.train_one_epoch()
    score = model.run_test(5)
    # Progress is printed every epoch except the first two.
    if epoch > 1:
        print(f'Epoch: {epoch}  Score: {score}' )
    if score >= solved_score:
        print(f'Solved! Epoch: {epoch}  Score: {score}')
        break