# Import Package

In [1]:
%matplotlib inline
import matplotlib.pyplot as plt

from IPython import display

import numpy as np
import torch
import torch.nn as nn
import torch.optim as optim
import torch.nn.functional as F
from torchvision import models
from torch.distributions import Categorical
from tqdm.notebook import tqdm
print(1)

1


# Lunar Lander environment

In [2]:
# Imports come first so every name `fix` relies on is visibly in scope
# before the function is defined.
import random

import gymnasium as gym

seed = 543


def fix(env, seed):
    """Seed the random number generators used by this notebook.

    Seeds, with the same ``seed``: the environment's action space, PyTorch,
    NumPy's global generator and Python's ``random`` module.

    Args:
        env: environment exposing ``action_space.seed``.
        seed: integer seed applied to every generator listed above.
    """
    env.action_space.seed(seed)
    torch.manual_seed(seed)
    np.random.seed(seed)
    random.seed(seed)


env = gym.make('LunarLander-v2' ,render_mode='rgb_array')
fix(env, seed) # fix the environment Do not revise this !!!

In [3]:
# Seed the environment's own random generator on the first reset. Seeding the
# action space alone does not make the sampled initial states reproducible;
# later resets without a seed continue from this seeded generator.
# reset() returns an (observation, info) tuple.
initial_state = env.reset(seed=seed)
print(initial_state)

(array([ 0.00342045,  1.4145806 ,  0.34643883,  0.16267964, -0.00395666,
       -0.07847356,  0.        ,  0.        ], dtype=float32), {})


# PPO Agent

In [4]:
# Run on the GPU when CUDA is available, otherwise fall back to the CPU.
device = torch.device("cuda") if torch.cuda.is_available() else torch.device("cpu")
print(device)

# Run identifier interpolated into the checkpoint, result and GIF file names.
N = 3

cuda


### Class: Memory

Stores the actions, states, rewards, is_terminal flags, and log probabilities collected while interacting with the environment.

In [5]:
class Memory:
    """Rollout buffer: one Python list per recorded quantity."""

    def __init__(self):
        # Every buffer starts out empty and is appended to by its users.
        self.actions, self.states, self.logprobs = [], [], []
        self.rewards, self.is_terminals = [], []

    def clear_memory(self):
        """Empty every buffer in place (the list objects themselves are kept)."""
        for buffer in (self.actions, self.states, self.logprobs,
                       self.rewards, self.is_terminals):
            buffer.clear()

## Class: ActorCritic

### 2 fully connected networks
1. actor/action layer: input the states, follow the policy and yields an action
2. critic/value layer: input the states, estimate the value of the current state (the expected discounted return from that state)

### act function

Generate a probability distribution over the actions by feeding the state to the action layer, sample one action, and store the state, the action and its log probability in the memory.

### evaluate function

Take a state as input and generate action probability distribution, calculate the entropy of this distribution and evaluate the log probability of the given action. Use the critic to evaluate the expected rewards under current state. Return all these values.

In [6]:
class ActorCritic(nn.Module):
    """Separate actor and critic MLPs for a discrete action space.

    Args:
        state_dim: size of the state vector fed to both networks.
        action_dim: number of discrete actions (width of the actor output).
        n_latent_var: width of the second hidden layer of both networks
            (the first hidden layer is fixed at 128 units).
    """

    def __init__(self, state_dim, action_dim, n_latent_var):
        super().__init__()

        # actor: state -> probability of each action (softmax over last dim)
        self.action_layer = nn.Sequential(
                nn.Linear(state_dim, 128),
                nn.ReLU(),
                nn.Linear(128, n_latent_var),
                nn.ReLU(),
                nn.Linear(n_latent_var, action_dim),
                nn.Softmax(dim=-1)
                )

        # critic: state -> single scalar value estimate
        self.value_layer = nn.Sequential(
               nn.Linear(state_dim, 128),
               nn.ReLU(),
               nn.Linear(128, n_latent_var),
               nn.ReLU(),
               nn.Linear(n_latent_var, 1)
               )

    def act(self, state, memory):
        """Sample an action for ``state`` and record the transition.

        Args:
            state: NumPy array holding a single state.
            memory: object with ``states``, ``actions`` and ``logprobs`` lists;
                the state tensor, the sampled action and its log probability
                are appended to them.

        Returns:
            The sampled action as a Python int.
        """
        # Use the device the network itself lives on instead of relying on a
        # notebook-level global.
        net_device = next(self.parameters()).device

        # Sampling needs no gradients: the stored log probabilities are only
        # ever used as constants, so do not build an autograd graph for them.
        with torch.no_grad():
            state = torch.from_numpy(state).float().to(net_device)
            action_probs = self.action_layer(state)
            dist = Categorical(action_probs)
            action = dist.sample()
            action_logprob = dist.log_prob(action)

        memory.states.append(state)
        memory.actions.append(action)
        memory.logprobs.append(action_logprob)

        return action.item()

    def evaluate(self, state, action):
        """Score ``action`` under the current actor and value ``state``.

        Args:
            state: tensor of states (batched or a single state).
            action: tensor of the actions taken in those states.

        Returns:
            Tuple ``(action_logprobs, state_values, dist_entropy)``. The
            trailing size-1 dimension of the critic output is removed, so a
            batch of B states yields B values (also when B == 1).
        """
        action_probs = self.action_layer(state)
        dist = Categorical(action_probs)
        action_logprobs = dist.log_prob(action)
        dist_entropy = dist.entropy()

        state_value = self.value_layer(state)

        # squeeze(-1) drops only the critic's output dimension; a bare
        # squeeze() would also collapse a batch of size 1 to a scalar.
        return action_logprobs, state_value.squeeze(-1), dist_entropy

## PPO agent

### gamma

if close to 1, future rewards are weighted more heavily
if close to 0, immediate rewards dominate

### state_values

the output of the critic, which evaluates each state and gives its expected future (discounted) reward

### rewards

the fixed list of discounted returns, computed once from the trajectory that was collected under the old policy

### advantages = rewards - state_values.detach()

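how much better the observed discounted return was than the critic's estimate for that state. If positive, the action taken did better than expected and its probability should be increased; if negative, the action did worse than expected and its probability should be decreased.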

In [7]:
class PPOAgent:
    """PPO agent with a clipped surrogate objective.

    Actions are sampled from ``policy_old``; ``policy`` is the network being
    optimized. After every update the new weights are copied into
    ``policy_old``.

    Args:
        state_dim, action_dim, n_latent_var: forwarded to ``ActorCritic``.
        update_timestep: number of ``step`` calls between two updates.
        lr, betas: Adam learning rate and betas.
        gamma: discount factor of the Monte Carlo returns.
        K_epochs: number of gradient steps per update.
        eps_clip: the probability ratio is clamped to
            ``[1 - eps_clip, 1 + eps_clip]``.
        c1: weight of the value (MSE) loss.
        c2: weight of the entropy term. The term is *added* to a loss that is
            minimized, so a negative ``c2`` rewards higher entropy.
    """

    def __init__(self, state_dim, action_dim, n_latent_var,update_timestep, lr, betas, gamma, K_epochs, eps_clip, c1, c2):
        self.lr = lr
        self.betas = betas
        self.gamma = gamma
        self.eps_clip = eps_clip
        self.K_epochs = K_epochs
        self.update_timestep = update_timestep
        self.timestep = 0
        self.memory = Memory()

        self.policy = ActorCritic(state_dim, action_dim, n_latent_var).to(device)
        self.optimizer = torch.optim.Adam(self.policy.parameters(), lr=lr, betas=betas)
        # policy_old starts as an exact copy of policy.
        self.policy_old = ActorCritic(state_dim, action_dim, n_latent_var).to(device)
        self.policy_old.load_state_dict(self.policy.state_dict())

        self.MseLoss = nn.MSELoss()
        self.update_cnt = 0

        self.c1 = c1
        self.c2 = c2

    def update(self):
        """Run ``K_epochs`` gradient steps on the buffered data, then clear it."""
        # Nothing has been recorded yet: there is nothing to learn from.
        if not self.memory.rewards:
            return

        # Monte Carlo estimate of state rewards: walk the buffer backwards,
        # restarting the running return at every terminal step.
        rewards = []
        discounted_reward = 0
        for reward, is_terminal in zip(reversed(self.memory.rewards), reversed(self.memory.is_terminals)):
            if is_terminal:
                discounted_reward = 0
            discounted_reward = reward + (self.gamma * discounted_reward)
            rewards.append(discounted_reward)
        # Appending and reversing once avoids the quadratic cost of
        # inserting at the front of the list on every step.
        rewards.reverse()

        # Normalizing the rewards:
        rewards = torch.tensor(rewards, dtype=torch.float32).to(device)
        # std() of a single value is NaN, which would poison the weights,
        # so a one-element buffer is left unnormalized.
        if rewards.numel() > 1:
            rewards = (rewards - rewards.mean()) / (rewards.std() + 1e-5)

        # convert list to tensor (detached: these are constants for the update)
        old_states = torch.stack(self.memory.states).detach().to(device)
        old_actions = torch.stack(self.memory.actions).detach().to(device)
        old_logprobs = torch.stack(self.memory.logprobs).detach().to(device)

        # Optimize policy for K epochs:
        for _ in range(self.K_epochs):
            # Evaluating old actions and values
            logprobs, state_values, dist_entropy = self.policy.evaluate(old_states, old_actions)

            # Finding the ratio (pi_theta / pi_theta__old)
            ratios = torch.exp(logprobs - old_logprobs)

            # Finding Surrogate Loss
            advantages = rewards - state_values.detach()

            surr1 = ratios * advantages
            # clip to avoid large difference between the new and the old policy
            surr2 = torch.clamp(ratios, 1-self.eps_clip, 1+self.eps_clip) * advantages
            loss = -torch.min(surr1, surr2)  + self.c1*self.MseLoss(state_values, rewards) + self.c2*dist_entropy

            # take gradient step
            self.optimizer.zero_grad()
            loss.mean().backward()
            self.optimizer.step()

        # Copy new weights into old policy:
        self.policy_old.load_state_dict(self.policy.state_dict())
        self.memory.clear_memory()

    def step(self, reward, done):
        """Record the outcome of the last action and update when due.

        Args:
            reward: reward received for the most recent action.
            done: whether that action ended the episode.
        """
        self.timestep += 1
        # Saving reward and is_terminal:
        self.memory.rewards.append(reward)
        self.memory.is_terminals.append(done)

        # update the policy per "update_timestep"; update() itself empties
        # the memory, so no second clear is needed here.
        if self.timestep % self.update_timestep == 0:
            self.update()
            self.timestep = 0
            self.update_cnt += 1

    def act(self, state):
        """Sample an action from ``policy_old`` and record it in the memory."""
        return self.policy_old.act(state, self.memory)

    def train(self):
        """Put both networks into training mode."""
        self.policy.train()
        self.policy_old.train()

    def eval(self):
        """Put both networks into evaluation mode."""
        self.policy.eval()
        self.policy_old.eval()

In [8]:
def moving_average(total_rewards, window=100):
    """Mean of the most recent ``window`` entries of ``total_rewards``.

    Args:
        total_rewards: sequence of per-episode rewards.
        window: number of trailing entries to average (default 100). When
            fewer entries exist, all of them are averaged.

    Returns:
        0 for an empty sequence, otherwise the mean of the trailing entries.

    Raises:
        ValueError: if ``window`` is not positive.
    """
    if window <= 0:
        raise ValueError("window must be a positive integer")
    if len(total_rewards) == 0:
        return 0
    # Slicing past the start of a short sequence simply yields the whole
    # sequence, so no separate "not enough data yet" branch is needed.
    return np.mean(total_rewards[-window:])

# Bayesian Optimization

## Training Function for Bayesian Optimization

In [9]:
def TrainOnce(c2, eps_clip, agent, max_episodes=None):
    """Train ``agent`` until the moving average of episode rewards reaches 200.

    Args:
        c2: entropy coefficient the agent was built with (only reported).
        eps_clip: clip range the agent was built with (only reported).
        agent: agent exposing ``act``, ``step`` and ``update_cnt``.
        max_episodes: optional cap on the number of episodes; ``None``
            (default) trains until the reward target is met.

    Returns:
        The negated number of episodes played, so that a maximizer prefers
        configurations that reach the target sooner.
    """
    print(f"Training with c2: {c2}, eps_clip: {eps_clip}")
    total_rewards = []
    num_episodes = 0
    while moving_average(total_rewards) < 200:
        if max_episodes is not None and num_episodes >= max_episodes:
            break
        num_episodes += 1
        state = env.reset()[0]
        total_reward = 0
        update_cnt = agent.update_cnt
        # collect trajectory
        while True:
            action = agent.act(state)
            next_state, reward, terminated, truncated, _ = env.step(action)
            # An episode is over when it terminates OR when the environment
            # truncates it (e.g. a time limit); ignoring truncation would
            # keep stepping an environment that asked to be reset.
            done = terminated or truncated
            state = next_state
            total_reward += reward
            agent.step(reward, done)
            if done:
                total_rewards.append(total_reward)
                break
        # Report only after episodes during which the agent updated.
        if update_cnt != agent.update_cnt:
            print(f"Update Count: {agent.update_cnt}, average rewards: {moving_average(total_rewards)}, episode: {num_episodes}")

    print(f"Training finished with {num_episodes} episodes")
    return -num_episodes

## Use BayesianOptimization to find the best pair of c2 and eps_clip

In [None]:
from bayes_opt import BayesianOptimization

# Fixed hyperparameters shared by every agent built in this notebook.
state_dim = 8
action_dim = 4
n_latent_var = 64
update_timestep = 1000
c1 = 0.5
lr = 0.003
betas = (0.9, 0.999)
gamma = 0.99
K_epochs = 4

# Set to False to skip the search and reuse the previously found values below.
RUN_BAYES_OPT = True

if RUN_BAYES_OPT:
    # Search ranges; c2 is negative because it is added to a minimized loss.
    p_bounds = {'c2': (-0.01, -0.005), 'eps_clip': (0.1,0.3)}

    def PPO_fn(c2, eps_clip):
        """Objective: train a fresh agent and return the negated episode count."""
        agent = PPOAgent(state_dim,action_dim,n_latent_var,update_timestep,lr,betas,gamma,K_epochs,eps_clip,c1,c2)
        agent.train()
        return TrainOnce(c2, eps_clip, agent)

    HyperParamOptimizer = BayesianOptimization(
        f=PPO_fn,
        pbounds=p_bounds,
        random_state=1
    )
    # Name the probed point explicitly instead of relying on the positional
    # ordering of the parameters.
    HyperParamOptimizer.probe(
        params={'c2': -0.006, 'eps_clip': 0.22},
        lazy=True
    )
    HyperParamOptimizer.maximize(
        init_points=2,
        n_iter=10
    )
    print(HyperParamOptimizer.max)
    c2 = HyperParamOptimizer.max['params']['c2']
    eps_clip = HyperParamOptimizer.max['params']['eps_clip']
else:
    # Values found by an earlier search.
    c2 = -0.005861493516941971
    eps_clip = 0.22143098426065316

|   iter    |  target   |    c2     | eps_clip  |
-------------------------------------------------
Training with c2: -0.006, eps_clip: 0.22
Update Count: 1, average rewards: -192.57007750642316, episode: 11
Update Count: 2, average rewards: -160.50157114113594, episode: 24
Update Count: 3, average rewards: -191.68583283190193, episode: 36
Update Count: 4, average rewards: -180.9306206687213, episode: 48
Update Count: 5, average rewards: -168.16701282141707, episode: 61
Update Count: 6, average rewards: -155.7310982192528, episode: 74
Update Count: 7, average rewards: -152.17751001914118, episode: 88
Update Count: 8, average rewards: -152.4488462431514, episode: 101
Update Count: 9, average rewards: -147.9803831150762, episode: 114
Update Count: 10, average rewards: -142.99807628151132, episode: 127
Update Count: 11, average rewards: -132.37885492714898, episode: 140
Update Count: 12, average rewards: -137.17446962947943, episode: 152
Update Count: 13, average rewards: -140.19239642599

# Train an excellent model

## Keep training until landing success rate over previous 300 episodes reaches 95%

In [None]:
agent = PPOAgent(state_dim ,action_dim,n_latent_var,update_timestep, lr,betas,gamma,K_epochs,eps_clip,c1,c2)
total_rewards = []           # total reward of every episode
final_rewards = []           # reward of the last step of every episode
moving_average_rewards = []  # moving average after every episode
num_episodes = 0
test_number = 300            # number of trailing episodes the success rate is measured over
success_rate = 0
agent.train()
while True:
    num_episodes += 1
    state = env.reset()[0]
    total_reward = 0
    update_cnt = agent.update_cnt
    while True:
        action = agent.act(state)
        next_state, reward, terminated, truncated, _ = env.step(action)
        # The episode ends on termination OR truncation (e.g. a time limit);
        # stepping on after truncation would ignore the environment's
        # request to be reset.
        done = terminated or truncated
        state = next_state
        total_reward += reward
        agent.step(reward, done)
        if done:
            final_rewards.append(reward)
            total_rewards.append(total_reward)
            break
    moving_average_rewards.append(moving_average(total_rewards))
    # An episode counts as a success when its final step reward is exactly 100.
    # Dividing by test_number (not by the number of episodes played so far)
    # keeps the rate low until test_number episodes exist.
    success_rate = final_rewards[-test_number:].count(100) / test_number

    if update_cnt != agent.update_cnt:
        print(f"Update Count: {agent.update_cnt}, average rewards: {moving_average(total_rewards)}, episodes: {num_episodes}, success rate: {success_rate}")

    # Stop criterion: success rate only. NOTE: the loop is otherwise unbounded.
    if success_rate > 0.95:
        break

print(f"Training finished with {num_episodes} episodes")

## Save the model

In [None]:
# Persist the weights of policy_old; the file name carries the run identifier N.
torch.save(agent.policy_old.state_dict(), f'PPO_LunarLander_{N}.pth')

# Load and evaluation

## load the model

In [None]:
# map_location lets a checkpoint saved on a GPU be opened on a CPU-only session.
state_dict = torch.load(f'PPO_LunarLander_{N}.pth', map_location=device)
# PPOAgent takes update_timestep as its fourth argument; leaving it out
# raises a TypeError because every constructor parameter is required.
model = PPOAgent(state_dim ,action_dim,n_latent_var,update_timestep,lr,betas,gamma,K_epochs,eps_clip,c1,c2)
# Both networks get the saved weights (act() samples from policy_old).
model.policy.load_state_dict(state_dict)
model.policy_old.load_state_dict(state_dict)

## Evaluate for 500 episodes

In [None]:
model.eval()
n_eval_episodes = 500     # single source for the loop length and the rate denominator
test_total_rewards = []   # total reward of every evaluation episode
test_final_rewards = []   # reward of the last step of every evaluation episode
step_used_list = []       # number of steps every evaluation episode took
success_rate = 0
prg_bar = tqdm(range(n_eval_episodes))
for i in prg_bar:
    actions = []
    state = env.reset()[0]
    total_reward = 0
    done = False
    step = 0
    while not done:
        step += 1
        action = model.act(state)
        actions.append(action)
        state, reward, terminated, truncated, _ = env.step(action)
        # End the episode on termination OR truncation (e.g. a time limit).
        done = terminated or truncated
        total_reward += reward
    # act() records every state/action/log-prob in the agent's memory; no
    # update happens during evaluation, so drop them to keep memory bounded.
    model.memory.clear_memory()
    test_total_rewards.append(total_reward)
    test_final_rewards.append(reward)
    step_used_list.append(step)
# An episode counts as a success when its final step reward is exactly 100.
success_rate = test_final_rewards.count(100) / n_eval_episodes
print(f"Success rate: {success_rate}")

## Store the evaluation result and training result

In [None]:
import pickle

# Bundle the training histories and the evaluation results under fixed keys.
data = dict(
    total_rewards=total_rewards,
    final_rewards=final_rewards,
    moving_average_rewards=moving_average_rewards,
    test_total_rewards=test_total_rewards,
    test_final_rewards=test_final_rewards,
    step_used_list=step_used_list,
)

# One result file per run, distinguished by the run identifier N.
with open(f'PPO_Result_{N}.pkl', 'wb') as f:
    pickle.dump(data, f)

## DEMO
Save 10 GIFs, one per demo episode.

In [None]:
import os

import matplotlib.animation as animation
import matplotlib.pyplot as plt

fix(env, seed)
fig = plt.figure()
# NOTE(review): this rebinds `total_rewards`, which until now held the
# training history; re-running the result-saving cell after this one would
# store the demo rewards instead.
total_rewards = []
animations = []

# Make sure the output directory exists before the first GIF is written.
os.makedirs(f'./demo_gifs/trial{N}', exist_ok=True)

model.eval()
for i in range(10):
    ims = []
    actions = []
    state = env.reset()[0]
    total_reward = 0
    done = False
    step = 0
    while not done:
        step += 1
        action = model.act(state)
        actions.append(action)
        state, reward, terminated, truncated, _ = env.step(action)
        # End the episode on termination OR truncation (e.g. a time limit).
        done = terminated or truncated
        total_reward += reward
        # One image artist per rendered frame; each frame of the animation
        # is a one-element list of artists.
        im = plt.imshow(env.render(), animated=True)
        ims.append([im])
    total_rewards.append(total_reward)
    ani = animation.ArtistAnimation(fig, ims, interval=25, blit=True, repeat_delay=1000)
    ani.save(f'./demo_gifs/trial{N}/demo_{N}_{i}.gif')
    # Drop this episode's frames from the figure so they do not pile up
    # (and stay in memory) across episodes.
    fig.clf()
    # act() records into the agent's memory; nothing is learned here.
    model.memory.clear_memory()
