# Lab15 Flappy bird using Deep RL - PPO x GAE

## Lab15_109021115 Chia-Chun Wu

In [7]:
import numpy as np
import matplotlib.pyplot as plt
import moviepy.editor as mpy
import skimage.transform
from IPython.display import Image, display

import tensorflow as tf
import tensorflow_probability as tfp
import tensorflow.keras.losses as kls

In [5]:
gpus = tf.config.list_physical_devices("GPU") 
if gpus:
    try:
        # Restrict TensorFlow to only use the fourth GPU
        tf.config.set_visible_devices(gpus[0], 'GPU')

        # Currently, memory growth needs to be the same across GPUs
        for gpu in gpus:
            tf.config.experimental.set_memory_growth(gpu, True)
        
        logical_gpus = tf.config.list_logical_devices('GPU')
        print(len(gpus), "Physical GPUs,", len(logical_gpus), "Logical GPUs")
    except RuntimeError as e:
        # Memory growth must be set before GPUs have been initialized
        print(e)

1 Physical GPUs, 1 Logical GPUs


In [1]:
import os
os.environ["SDL_VIDEODRIVER"] = "dummy"  # this line make pop-out window not appear
from ple.games.flappybird import FlappyBird
from ple import PLE

game = FlappyBird()
env = PLE(game, fps=30, display_screen=False)  # environment interface to game
env.reset_game()

test_game = FlappyBird()
test_env = PLE(test_game, fps=30, display_screen=False)
test_env.reset_game()

pygame 2.6.1 (SDL 2.28.4, Python 3.9.20)
Hello from the pygame community. https://www.pygame.org/contribute.html
couldn't import doomish
Couldn't import doom


In [2]:
path = './movie_f' 
if not os.path.exists(path):
    os.makedirs(path)

In [3]:
hparas = {
    'image_size': 84,
    'num_stack': 4,
    'action_dim': len(env.getActionSet()),
    'hidden_size': 256,
    'lr': 0.0001,
    'gamma': 0.99,
    'lambda': 0.95,
    'clip_val': 0.2,
    'ppo_epochs': 8,
    'test_epochs': 1,
    'num_steps': 512,
    'mini_batch_size': 64,
    'target_reward': 200,
    'max_episode': 30000,
}

In [4]:
# Please do not modify this method
def make_anim(images, fps=60, true_image=False):
    duration = len(images) / fps

    def make_frame(t):
        try:
            x = images[int(len(images) / duration * t)]
        except:
            x = images[-1]

        if true_image:
            return x.astype(np.uint8)
        else:
            return ((x + 1) / 2 * 255).astype(np.uint8)

    clip = mpy.VideoClip(make_frame, duration=duration)
    clip.fps = fps
    
    return clip

In [5]:
def preprocess_screen(screen):
    screen = skimage.transform.rotate(screen, -90, resize=True)
    screen = screen[:400, :]
    screen = skimage.transform.resize(screen, [hparas['image_size'], hparas['image_size'], 1])
    return screen.astype(np.float32)

def frames_to_state(input_frames):  # 1: 1111, 2: 1122, 3: 1233, 4~: 1234 (last 4)
    if(len(input_frames) == 1):
        state = np.concatenate(input_frames*4, axis=-1)
    elif(len(input_frames) == 2):
        state = np.concatenate(input_frames[0:1]*2 + input_frames[1:]*2, axis=-1)
    elif(len(input_frames) == 3):
        state = np.concatenate(input_frames + input_frames[2:], axis=-1)
    else:
        state = np.concatenate(input_frames[-4:], axis=-1)

    return state

In [8]:
class ActorCriticNetwork(tf.keras.Model):
    def __init__(self, hparas):
        super().__init__()

        self.feature_extractor = tf.keras.Sequential([
          # Convolutional Layers
          tf.keras.layers.Conv2D(filters=32, kernel_size=8, strides=4),
          tf.keras.layers.ReLU(),
          tf.keras.layers.Conv2D(filters=64, kernel_size=4, strides=2),
          tf.keras.layers.ReLU(),
          tf.keras.layers.Conv2D(filters=64, kernel_size=3, strides=1),
          tf.keras.layers.ReLU(),
          # Embedding Layers
          tf.keras.layers.Flatten(),
          tf.keras.layers.Dense(hparas['hidden_size']),
          tf.keras.layers.ReLU(),
        ])

        # Actor Network
        self.actor = tf.keras.layers.Dense(hparas['action_dim'], activation='softmax')
        # Critic Network
        self.critic = tf.keras.layers.Dense(1, activation = None)

    def call(self, input):
        x = self.feature_extractor(input)
        action_logits = self.actor(x)
        value = self.critic(x)
        return action_logits, value

In [9]:
class Agent():
    def __init__(self, hparas):
        self.gamma = hparas['gamma']
        self.optimizer = tf.keras.optimizers.Adam(learning_rate=hparas['lr'])
        self.actor_critic = ActorCriticNetwork(hparas)
        self.clip_pram = hparas['clip_val']
    
    def ppo_iter(self, mini_batch_size, states, actions, log_probs, returns, advantage):
        batch_size = states.shape[0]
        for _ in range(batch_size // mini_batch_size):
            rand_ids = tf.convert_to_tensor(np.random.randint(0, batch_size, mini_batch_size), dtype=tf.int32)
            yield tf.gather(states, rand_ids), tf.gather(actions, rand_ids), tf.gather(log_probs, rand_ids), \
             tf.gather(returns, rand_ids), tf.gather(advantage, rand_ids)
    
    def ppo_update(self, ppo_epochs, mini_batch_size, states, actions, log_probs, discount_rewards, advantages):       
        total_actor_loss = 0
        total_critic_loss = 0
        for _ in range(ppo_epochs):
            for state, action, old_log_probs, reward, advantage in self.ppo_iter(mini_batch_size, states, actions, log_probs, discount_rewards, advantages):
                reward = tf.expand_dims(reward, axis=-1)

                with tf.GradientTape() as tape:
                    prob, value = self.actor_critic(state, training=True)
                    dist = tfp.distributions.Categorical(probs=prob, dtype=tf.float32)
                    entropy = tf.math.reduce_mean(dist.entropy())
                    new_log_probs = dist.log_prob(action)

                    # PPO ratio
                    ratio = tf.math.exp(new_log_probs - old_log_probs)
                    surr1 = ratio * advantage
                    surr2 = tf.clip_by_value(ratio, 1.0 - self.clip_pram, 1.0 + self.clip_pram) * advantage

                    actor_loss = tf.math.negative(tf.math.reduce_mean(tf.math.minimum(surr1, surr2))) - 0.1 * entropy
                    critic_loss = 0.5 * tf.math.reduce_mean(kls.mean_squared_error(reward, value))

                    total_loss = actor_loss + critic_loss
            
                # single optimizer
                grads = tape.gradient(total_loss, self.actor_critic.trainable_variables)
                self.optimizer.apply_gradients(zip(grads, self.actor_critic.trainable_variables))
      
                total_actor_loss += actor_loss
                total_critic_loss += critic_loss
        return total_actor_loss, total_critic_loss

In [10]:
# https://arxiv.org/pdf/1506.02438.pdf
# Equation 16
def compute_gae(rewards, masks, values, gamma, LAMBDA):
    gae = 0
    returns = []
    for i in reversed(range(len(rewards))):
        delta = rewards[i] + gamma * values[i + 1] * masks[i] - values[i]
        gae = delta + gamma * LAMBDA * masks[i] * gae
        returns.append(gae + values[i])

    returns.reverse()
    return returns

## Testing Environment

In [11]:
def test_reward(test_env, agent):
    total_reward = 0
    # Reset the environment
    test_env.reset_game()
    input_frames = [preprocess_screen(test_env.getScreenGrayscale())]

    while not test_env.game_over():

        state = frames_to_state(input_frames)
        state = tf.expand_dims(state, axis=0)
        prob, value = agent.actor_critic(state)

        action = np.argmax(prob[0].numpy())
        reward = test_env.act(test_env.getActionSet()[action])
        total_reward += reward

        input_frames.append(preprocess_screen(test_env.getScreenGrayscale()))

    return total_reward

## Training

In [12]:
agent = Agent(hparas)
max_episode = hparas['max_episode']
test_per_n_episode = 10
force_save_per_n_episode = 1000
early_stop_reward = 10

start_s = 0
best_reward = -5.0

checkpoint = tf.train.Checkpoint(
    actor_critic = agent.actor_critic,
    optimizer = agent.optimizer,
)

# Load from old checkpoint
# checkpoint.restore('ckpt_dir/ckpt-?')

In [13]:
ep_reward = []
total_avgr = []
early_stop = False
avg_rewards_list = []

env.reset_game()

for s in range(0, max_episode):
    if early_stop == True:
        break

    rewards = []
    states = []
    actions = []
    log_probs = []
    masks = []
    values = []

    display_frames = [env.getScreenRGB()]
    input_frames = [preprocess_screen(env.getScreenGrayscale())]

    for step in range(hparas['num_steps']):

        state = frames_to_state(input_frames)
        state = tf.expand_dims(state, axis=0)
        prob, value = agent.actor_critic(state)

        dist = tfp.distributions.Categorical(probs=prob[0], dtype=tf.float32)
        action = dist.sample(1)
        log_prob = dist.log_prob(action)

        reward = env.act(env.getActionSet()[int(action.numpy())])

        done = env.game_over()

        states.append(state)
        actions.append(action)
        values.append(value[0])
        log_probs.append(log_prob)
        rewards.append(tf.convert_to_tensor(reward, dtype=tf.float32))
        masks.append(tf.convert_to_tensor(1-int(done), dtype=tf.float32))

        display_frames.append(env.getScreenRGB())
        input_frames.append(preprocess_screen(env.getScreenGrayscale()))

        if done:
            env.reset_game()
            input_frames = [preprocess_screen(env.getScreenGrayscale())]
  
    _, next_value = agent.actor_critic(state)
    values.append(next_value[0])

    returns = compute_gae(rewards, masks, values, hparas['gamma'], hparas['lambda'])

    returns = tf.concat(returns, axis=0)
    log_probs = tf.concat(log_probs, axis=0)
    values = tf.concat(values, axis=0)
    states = tf.concat(states, axis=0)
    actions = tf.concat(actions, axis=0)
    advantage = returns - values[:-1]

    a_loss, c_loss = agent.ppo_update(hparas['ppo_epochs'], hparas['mini_batch_size'], states, actions, log_probs, returns, advantage)
    print('[Episode %d]  Actor loss: %.5f, Critic loss: %.5f' % (s, a_loss, c_loss))

    if s % test_per_n_episode == 0:
        # test agent hparas['test_epochs'] times to get the average reward
        avg_reward = np.mean([test_reward(test_env, agent) for _ in range(hparas['test_epochs'])])
        print("Test average reward is %.1f, Current best average reward is %.1f\n" % (avg_reward, best_reward))
        avg_rewards_list.append(avg_reward)

        if avg_reward > best_reward:
            best_reward = avg_reward
            agent.actor_critic.save('./save/Actor/model_actor_{}_{}'.format(s, avg_reward), save_format="tf")
            checkpoint.save(file_prefix = './save/checkpoints/ckpt')

    if s % force_save_per_n_episode == 0:
        agent.actor_critic.save('./save/Actor/model_actor_{}_{}'.format(s, avg_reward), save_format="tf")
        checkpoint.save(file_prefix = './save/checkpoints/ckpt')
        clip = make_anim(display_frames, fps=60, true_image=True).rotate(-90)
        clip.write_videofile("movie_f/{}_demo-{}.webm".format('Lab15', s), fps=60)
        display(clip.ipython_display(fps=60, autoplay=1, loop=1, maxduration=120))

    if best_reward >= early_stop_reward:
        early_stop = True

[Episode 0]  Actor loss: 95.77036, Critic loss: 66.21090
Test average reward is -5.0, Current best average reward is -5.0





INFO:tensorflow:Assets written to: ./save/Actor/model_actor_0_-5.0\assets


INFO:tensorflow:Assets written to: ./save/Actor/model_actor_0_-5.0\assets


Moviepy - Building video movie_f/Lab15_demo-0.webm.
Moviepy - Writing video movie_f/Lab15_demo-0.webm



                                                               

Moviepy - Done !
Moviepy - video ready movie_f/Lab15_demo-0.webm
Moviepy - Building video __temp__.mp4.
Moviepy - Writing video __temp__.mp4



                                                               

Moviepy - Done !
Moviepy - video ready __temp__.mp4


  reward = env.act(env.getActionSet()[int(action.numpy())])



[Episode 1]  Actor loss: 42.46453, Critic loss: 29.80165
[Episode 2]  Actor loss: 25.55128, Critic loss: 21.09541
[Episode 3]  Actor loss: 14.49814, Critic loss: 12.05807
[Episode 4]  Actor loss: 11.39384, Critic loss: 6.29236
[Episode 5]  Actor loss: 13.52427, Critic loss: 8.26909
[Episode 6]  Actor loss: 9.28922, Critic loss: 5.41029
[Episode 7]  Actor loss: 9.34751, Critic loss: 5.63035
[Episode 8]  Actor loss: -1.68780, Critic loss: 3.83960
[Episode 9]  Actor loss: 3.70514, Critic loss: 3.59580
[Episode 10]  Actor loss: -0.01939, Critic loss: 2.69737
Test average reward is -5.0, Current best average reward is -5.0

[Episode 11]  Actor loss: 1.45098, Critic loss: 3.48116
[Episode 12]  Actor loss: 2.31742, Critic loss: 3.40674
[Episode 13]  Actor loss: -8.93342, Critic loss: 3.59459
[Episode 14]  Actor loss: -8.59711, Critic loss: 1.90234
[Episode 15]  Actor loss: -5.10448, Critic loss: 2.10117
[Episode 16]  Actor loss: -5.46974, Critic loss: 2.42328
[Episode 17]  Actor loss: -8.9144



INFO:tensorflow:Assets written to: ./save/Actor/model_actor_110_-4.0\assets


INFO:tensorflow:Assets written to: ./save/Actor/model_actor_110_-4.0\assets
  reward = env.act(env.getActionSet()[int(action.numpy())])



[Episode 111]  Actor loss: -5.51002, Critic loss: 0.75824
[Episode 112]  Actor loss: -6.28099, Critic loss: 0.44567
[Episode 113]  Actor loss: -4.36630, Critic loss: 0.54792
[Episode 114]  Actor loss: -11.56307, Critic loss: 0.40778
[Episode 115]  Actor loss: -7.67972, Critic loss: 0.39647
[Episode 116]  Actor loss: -5.36857, Critic loss: 0.45704
[Episode 117]  Actor loss: -4.08699, Critic loss: 0.46768
[Episode 118]  Actor loss: -2.99168, Critic loss: 0.58902
[Episode 119]  Actor loss: -3.20147, Critic loss: 0.60915
[Episode 120]  Actor loss: -0.44965, Critic loss: 0.43212
Test average reward is -5.0, Current best average reward is -4.0

[Episode 121]  Actor loss: -5.25910, Critic loss: 0.33972
[Episode 122]  Actor loss: -7.19448, Critic loss: 0.21179
[Episode 123]  Actor loss: -3.45953, Critic loss: 0.36992
[Episode 124]  Actor loss: -6.40689, Critic loss: 0.49288
[Episode 125]  Actor loss: -5.80698, Critic loss: 0.63598
[Episode 126]  Actor loss: -6.63951, Critic loss: 0.41718
[Epis



INFO:tensorflow:Assets written to: ./save/Actor/model_actor_800_-3.0\assets


INFO:tensorflow:Assets written to: ./save/Actor/model_actor_800_-3.0\assets
  reward = env.act(env.getActionSet()[int(action.numpy())])



[Episode 801]  Actor loss: -2.82052, Critic loss: 0.29820
[Episode 802]  Actor loss: -4.51398, Critic loss: 0.50699
[Episode 803]  Actor loss: -10.58025, Critic loss: 1.32067
[Episode 804]  Actor loss: -2.18695, Critic loss: 0.58878
[Episode 805]  Actor loss: -3.17735, Critic loss: 0.39506
[Episode 806]  Actor loss: -2.27302, Critic loss: 0.34386
[Episode 807]  Actor loss: -2.28246, Critic loss: 0.50256
[Episode 808]  Actor loss: -5.20149, Critic loss: 0.38858
[Episode 809]  Actor loss: -6.91489, Critic loss: 0.44948
[Episode 810]  Actor loss: -6.94311, Critic loss: 0.73333
Test average reward is -5.0, Current best average reward is -3.0

[Episode 811]  Actor loss: -5.15033, Critic loss: 0.55354
[Episode 812]  Actor loss: -5.31402, Critic loss: 1.14681
[Episode 813]  Actor loss: -12.63601, Critic loss: 1.06745
[Episode 814]  Actor loss: -2.94326, Critic loss: 0.93245
[Episode 815]  Actor loss: -5.02994, Critic loss: 0.41446
[Episode 816]  Actor loss: -16.31423, Critic loss: 1.02426
[Ep



INFO:tensorflow:Assets written to: ./save/Actor/model_actor_1000_-5.0\assets


INFO:tensorflow:Assets written to: ./save/Actor/model_actor_1000_-5.0\assets


Moviepy - Building video movie_f/Lab15_demo-1000.webm.
Moviepy - Writing video movie_f/Lab15_demo-1000.webm



                                                               

Moviepy - Done !
Moviepy - video ready movie_f/Lab15_demo-1000.webm
Moviepy - Building video __temp__.mp4.
Moviepy - Writing video __temp__.mp4



                                                               

Moviepy - Done !
Moviepy - video ready __temp__.mp4


  reward = env.act(env.getActionSet()[int(action.numpy())])



[Episode 1001]  Actor loss: -9.45737, Critic loss: 1.61506
[Episode 1002]  Actor loss: -11.46352, Critic loss: 0.73694
[Episode 1003]  Actor loss: -6.61490, Critic loss: 2.67365
[Episode 1004]  Actor loss: -8.20386, Critic loss: 1.43185
[Episode 1005]  Actor loss: -17.05931, Critic loss: 3.23460
[Episode 1006]  Actor loss: -0.54192, Critic loss: 1.18283
[Episode 1007]  Actor loss: -7.42268, Critic loss: 1.20137
[Episode 1008]  Actor loss: -10.13559, Critic loss: 2.26494
[Episode 1009]  Actor loss: -2.11936, Critic loss: 1.94148
[Episode 1010]  Actor loss: -4.73791, Critic loss: 1.16036
Test average reward is -5.0, Current best average reward is -3.0

[Episode 1011]  Actor loss: -0.47714, Critic loss: 4.38454
[Episode 1012]  Actor loss: -9.34131, Critic loss: 1.57416
[Episode 1013]  Actor loss: -3.91846, Critic loss: 0.90677
[Episode 1014]  Actor loss: -6.60533, Critic loss: 0.96768
[Episode 1015]  Actor loss: -6.76336, Critic loss: 2.16811
[Episode 1016]  Actor loss: -12.57544, Critic 



INFO:tensorflow:Assets written to: ./save/Actor/model_actor_1330_-2.0\assets


INFO:tensorflow:Assets written to: ./save/Actor/model_actor_1330_-2.0\assets
  reward = env.act(env.getActionSet()[int(action.numpy())])



[Episode 1331]  Actor loss: -0.66449, Critic loss: 1.23670
[Episode 1332]  Actor loss: -1.07535, Critic loss: 0.82829
[Episode 1333]  Actor loss: -1.77455, Critic loss: 1.35277
[Episode 1334]  Actor loss: -4.10238, Critic loss: 2.90685
[Episode 1335]  Actor loss: -1.87338, Critic loss: 1.62253
[Episode 1336]  Actor loss: -1.83808, Critic loss: 1.20978
[Episode 1337]  Actor loss: 4.51844, Critic loss: 1.41296
[Episode 1338]  Actor loss: 0.70709, Critic loss: 0.84732
[Episode 1339]  Actor loss: -9.27059, Critic loss: 1.59597
[Episode 1340]  Actor loss: -5.34548, Critic loss: 1.45826
Test average reward is -5.0, Current best average reward is -2.0

[Episode 1341]  Actor loss: -7.10478, Critic loss: 0.76102
[Episode 1342]  Actor loss: -16.47107, Critic loss: 1.01662
[Episode 1343]  Actor loss: -12.26340, Critic loss: 1.04212
[Episode 1344]  Actor loss: -15.04874, Critic loss: 1.16363
[Episode 1345]  Actor loss: -1.61735, Critic loss: 2.06860
[Episode 1346]  Actor loss: 1.49271, Critic loss



INFO:tensorflow:Assets written to: ./save/Actor/model_actor_1910_-1.0\assets


INFO:tensorflow:Assets written to: ./save/Actor/model_actor_1910_-1.0\assets
  reward = env.act(env.getActionSet()[int(action.numpy())])



[Episode 1911]  Actor loss: -1.11335, Critic loss: 2.03874
[Episode 1912]  Actor loss: -5.20411, Critic loss: 1.89870
[Episode 1913]  Actor loss: -3.28444, Critic loss: 2.00878
[Episode 1914]  Actor loss: 1.27378, Critic loss: 1.55696
[Episode 1915]  Actor loss: -11.17220, Critic loss: 1.71710
[Episode 1916]  Actor loss: -2.41574, Critic loss: 1.50926
[Episode 1917]  Actor loss: -10.25743, Critic loss: 2.03140
[Episode 1918]  Actor loss: -16.59186, Critic loss: 2.82447
[Episode 1919]  Actor loss: -11.12583, Critic loss: 2.03128
[Episode 1920]  Actor loss: -13.27043, Critic loss: 2.39769
Test average reward is -5.0, Current best average reward is -1.0

[Episode 1921]  Actor loss: -15.80641, Critic loss: 2.28103
[Episode 1922]  Actor loss: -26.81746, Critic loss: 3.13883
[Episode 1923]  Actor loss: -8.13385, Critic loss: 2.54225
[Episode 1924]  Actor loss: 8.46910, Critic loss: 9.08658
[Episode 1925]  Actor loss: -2.95300, Critic loss: 3.95257
[Episode 1926]  Actor loss: 9.49138, Critic 



INFO:tensorflow:Assets written to: ./save/Actor/model_actor_2000_-3.0\assets


INFO:tensorflow:Assets written to: ./save/Actor/model_actor_2000_-3.0\assets


Moviepy - Building video movie_f/Lab15_demo-2000.webm.
Moviepy - Writing video movie_f/Lab15_demo-2000.webm



                                                               

Moviepy - Done !
Moviepy - video ready movie_f/Lab15_demo-2000.webm
Moviepy - Building video __temp__.mp4.
Moviepy - Writing video __temp__.mp4



                                                               

Moviepy - Done !
Moviepy - video ready __temp__.mp4


  reward = env.act(env.getActionSet()[int(action.numpy())])



[Episode 2001]  Actor loss: -13.08610, Critic loss: 4.77412
[Episode 2002]  Actor loss: -8.34508, Critic loss: 4.13744
[Episode 2003]  Actor loss: 1.07308, Critic loss: 2.94477
[Episode 2004]  Actor loss: 7.90027, Critic loss: 2.41804
[Episode 2005]  Actor loss: 6.02581, Critic loss: 4.54065
[Episode 2006]  Actor loss: -1.90485, Critic loss: 2.77097
[Episode 2007]  Actor loss: 6.81762, Critic loss: 2.96025
[Episode 2008]  Actor loss: -0.38676, Critic loss: 3.23603
[Episode 2009]  Actor loss: -18.53974, Critic loss: 2.45356
[Episode 2010]  Actor loss: -12.37646, Critic loss: 3.52900
Test average reward is -5.0, Current best average reward is -1.0

[Episode 2011]  Actor loss: -24.54525, Critic loss: 4.09185
[Episode 2012]  Actor loss: 1.29748, Critic loss: 6.21954
[Episode 2013]  Actor loss: 2.59133, Critic loss: 3.23825
[Episode 2014]  Actor loss: 0.10441, Critic loss: 3.49266
[Episode 2015]  Actor loss: -6.79073, Critic loss: 3.79592
[Episode 2016]  Actor loss: -2.12023, Critic loss: 1



INFO:tensorflow:Assets written to: ./save/Actor/model_actor_2070_1.0\assets


INFO:tensorflow:Assets written to: ./save/Actor/model_actor_2070_1.0\assets
  reward = env.act(env.getActionSet()[int(action.numpy())])



[Episode 2071]  Actor loss: -2.98617, Critic loss: 4.48442
[Episode 2072]  Actor loss: -28.94959, Critic loss: 4.13419
[Episode 2073]  Actor loss: -8.23817, Critic loss: 6.11244
[Episode 2074]  Actor loss: -21.01105, Critic loss: 5.67496
[Episode 2075]  Actor loss: -23.49200, Critic loss: 3.53456
[Episode 2076]  Actor loss: -22.65236, Critic loss: 4.02701
[Episode 2077]  Actor loss: 9.85499, Critic loss: 7.09917
[Episode 2078]  Actor loss: 7.32836, Critic loss: 4.76283
[Episode 2079]  Actor loss: -2.78039, Critic loss: 6.08544
[Episode 2080]  Actor loss: 7.74619, Critic loss: 2.84713
Test average reward is -5.0, Current best average reward is 1.0

[Episode 2081]  Actor loss: 19.83716, Critic loss: 3.66871
[Episode 2082]  Actor loss: -0.96908, Critic loss: 1.06267
[Episode 2083]  Actor loss: 3.31367, Critic loss: 1.67696
[Episode 2084]  Actor loss: -7.13065, Critic loss: 1.91946
[Episode 2085]  Actor loss: -9.77501, Critic loss: 1.10077
[Episode 2086]  Actor loss: -20.12813, Critic loss



INFO:tensorflow:Assets written to: ./save/Actor/model_actor_2320_4.0\assets


INFO:tensorflow:Assets written to: ./save/Actor/model_actor_2320_4.0\assets
  reward = env.act(env.getActionSet()[int(action.numpy())])



[Episode 2321]  Actor loss: 7.33283, Critic loss: 7.34643
[Episode 2322]  Actor loss: 1.35224, Critic loss: 3.80649
[Episode 2323]  Actor loss: 1.86097, Critic loss: 4.10653
[Episode 2324]  Actor loss: -13.61712, Critic loss: 4.23372
[Episode 2325]  Actor loss: -1.99655, Critic loss: 3.84457
[Episode 2326]  Actor loss: 5.92649, Critic loss: 3.91007
[Episode 2327]  Actor loss: 6.93934, Critic loss: 2.94370
[Episode 2328]  Actor loss: -1.36205, Critic loss: 2.60139
[Episode 2329]  Actor loss: -7.71097, Critic loss: 2.79667
[Episode 2330]  Actor loss: 1.33343, Critic loss: 1.90963
Test average reward is -4.0, Current best average reward is 4.0

[Episode 2331]  Actor loss: -12.75954, Critic loss: 3.11631
[Episode 2332]  Actor loss: -11.85751, Critic loss: 1.40916
[Episode 2333]  Actor loss: -29.98627, Critic loss: 4.45024
[Episode 2334]  Actor loss: -18.24068, Critic loss: 5.21579
[Episode 2335]  Actor loss: -9.89185, Critic loss: 3.68873
[Episode 2336]  Actor loss: -11.62977, Critic loss:



INFO:tensorflow:Assets written to: ./save/Actor/model_actor_2400_10.0\assets


INFO:tensorflow:Assets written to: ./save/Actor/model_actor_2400_10.0\assets


## Report

* Since this Lab's topic is mainly on GAE and PPO, I focus on the comprehension of these two parts.
    * **GAE (Generalized Advantage Estimation):**
        * GAE is a method used to estimate the "Advantage Function" to balance bias and variance
        * In policy gradient methods, we need to estimate the "relative value" of performing a specific action in a certain state, which is the advantage function A(s, a)
        * GAE reduces the high variance problem when calculating long-term returns while being more accurate than simple TD methods
        * In **compute_gae** function, it calculates the advantage values for each step in reverse order, which are used for updating the policy and value function
    * **PPO (Proximal Policy Optimization):**
        * PPO is an improved policy gradient method designed to make updates more stable and avoid drastic changes to the policy
        * PPO maximizes the objective function L(θ), while limiting policy updates to prevent instability
        * PPO's clipping ensures updates remain within a safe range, improving stability
        ```
        ratio = tf.math.exp(new_log_probs - old_log_probs)
        surr1 = ratio * advantage
        surr2 = tf.clip_by_value(ratio, 1.0 - self.clip_pram, 1.0 + self.clip_pram) * advantage

        actor_loss = tf.math.negative(tf.math.reduce_mean(tf.math.minimum(surr1, surr2))) - 0.1 * entropy
        ```
        * This part is the key step in PPO, calculating the Actor's loss for stable policy updates
            * ratio: Computes the ratio between new and old policies
            * clip_by_value: Limits the ratio within [1−ϵ,1+ϵ]
            * entropy: Encourages exploration by adding a regularization term to avoid premature convergence
* **Discovery**
    * It takes **2400** episodes to train the policy to make the bird get more than 10 rewards (pass 10 tunnels) in one run.
    * It takes a lot more time to train Flappy Bird to achieve basic performance with GAExPPO technique than SARSA or Q-learning method from last Lab:
        * For SARSA, it takes about 4 hours;
        * For Q-learning, it takes about 7 hours;
        * For GAExPPO, it takes about 8 hours.
    * Nevertheless, the performance:
        * **SARSA = Q-learning >> GAExPPO**
        * I think the reason is that the SARSA and Q-learning perform well on easy task such as Flappy Bird because of the **discrete action space**; yet the GAExPPO technique performs well on more complicated task with **continuous action space**.