In [3]:
# import package needed
%matplotlib inline
import matplotlib.pyplot as plt
import os
os.environ["SDL_VIDEODRIVER"] = "dummy"  # this line make pop-out window not appear
os.environ["CUDA_VISIBLE_DEVICES"] = "0"
import tensorflow as tf
import numpy as np
import skimage.color
import skimage.transform
from ple.games.flappybird import FlappyBird
from ple import PLE
game = FlappyBird()
env = PLE(game, fps=30, display_screen=False)  # environment interface to game

couldn't import doomish
Couldn't import doom


In [4]:
# define input size
screen_width = 80
screen_height = 80
num_stack = 4

In [5]:
def preprocess(screen):
    #screen = skimage.color.rgb2gray(screen)
    screen = skimage.transform.resize(screen, [screen_width, screen_height])
    return screen

In [6]:
import math
import copy
from collections import defaultdict
MIN_EXPLORING_RATE = 10e-4


class Agent:
    def __init__(self, name, num_action, t=0, discount_factor=0.99):
        self.exploring_rate = 0.1
        self.discount_factor = discount_factor
        self.num_action = num_action
        self.name = name
        with tf.variable_scope(name):
            self.build_model()

    def build_model(self):
        # input: current screen, selected action and reward
        self.input_screen = tf.placeholder(
            tf.float32, shape=[None, screen_width, screen_height, num_stack])
        self.action = tf.placeholder(tf.int32, [None])
        self.reward = tf.placeholder(tf.float32, [None])
        self.is_training = tf.placeholder(tf.bool, shape=[])

        def net(screen, reuse=False):
            with tf.variable_scope(
                    "layers",
                    reuse=reuse,
                    initializer=tf.truncated_normal_initializer(stddev=1e-2)):
                conv1 = tf.layers.conv2d(
                    inputs=screen,
                    filters=32,
                    kernel_size=[8, 8],
                    strides=[4, 4],
                    padding='SAME',
                    activation=tf.nn.relu)
                pool1 = tf.layers.max_pooling2d(
                    conv1, pool_size=[2, 2], strides=[2, 2], padding='SAME')

                conv2 = tf.layers.conv2d(
                    inputs=pool1,
                    filters=64,
                    kernel_size=[4, 4],
                    strides=[2, 2],
                    padding='SAME',
                    activation=tf.nn.relu)
                conv3 = tf.layers.conv2d(
                    inputs=conv2,
                    filters=64,
                    kernel_size=[3, 3],
                    strides=[1, 1],
                    padding='SAME',
                    activation=tf.nn.relu)
                flat = tf.contrib.layers.flatten(conv3)
                dense = tf.layers.dense(
                    inputs=flat, units=512, activation=tf.nn.relu)
                Q = tf.layers.dense(
                    inputs=dense, units=self.num_action, activation=None)

                return Q

        # optimize
        self.output = net(
            self.input_screen
        )  # Q(s,a,theta) for all a, shape (batch_size, num_action)
        index = tf.stack(
            [tf.range(tf.shape(self.action)[0]), self.action], axis=1)
        self.esti_Q = tf.gather_nd(
            self.output,
            index)  # Q(s,a,theta) for selected action, shape (batch_size, 1)

        self.max_Q = tf.reduce_max(
            self.output, axis=1)  # max(Q(s',a',theta')), shape (batch_size, 1)
        self.tar_Q = tf.placeholder(tf.float32, [None])

        # loss = E[r+max(Q(s',a',theta'))-Q(s,a,theta)]
        self.loss = tf.reduce_mean(
            tf.square(self.reward + self.discount_factor * self.tar_Q -
                      self.esti_Q))

        optimizer = tf.train.AdamOptimizer(learning_rate=1e-5)
        self.g_gvs = optimizer.compute_gradients(
            self.loss,
            var_list=[v for v in tf.global_variables() if self.name in v.name])
        self.train_op = optimizer.apply_gradients(self.g_gvs)
        self.pred = tf.argmax(
            self.output, axis=1
        )  # select action with highest action-value, only used in inference

    def select_action(self, input_screen, sess):
        # epsilon-greedy
        if np.random.rand() < self.exploring_rate:
            action = np.random.choice(num_action)  # Select a random action
        else:
            input_screen = np.array(input_screen).transpose([1, 2, 0])
            feed_dict = {
                self.input_screen: input_screen[None, :],
                self.is_training: False,
            }
            action = sess.run(
                self.pred,
                feed_dict=feed_dict)[0]  # Select the action with the highest q
        return action

    def update_policy(self, input_screens, actions, rewards,
                      input_screens_plum, terminal, target_netwrok):
        # use max_Q estimate from target one to update online one
        feed_dict = {
            target_netwrok.input_screen:
            np.array(input_screens_plum).transpose([0, 2, 3, 1]),
            target_netwrok.is_training:
            True,
        }
        max_Q = sess.run(target_netwrok.max_Q, feed_dict=feed_dict)
        max_Q *= ~np.array(terminal)
        feed_dict = {
            self.input_screen: np.array(input_screens).transpose([0, 2, 3, 1]),
            self.tar_Q: max_Q,
            self.action: actions,
            self.reward: rewards,
            self.is_training: True,
        }
        loss, _ = sess.run([self.loss, self.train_op], feed_dict=feed_dict)
        return loss

    def update_parameters(self, episode):
        if self.exploring_rate > MIN_EXPLORING_RATE:
            self.exploring_rate -= (0.1 - MIN_EXPLORING_RATE) / 3000000

    def shutdown_explore(self):
        # make action selection greedy
        self.exploring_rate = 0


In [7]:
from myUtil import get_nice_session

a = tf.Variable([[1,2,3,4,5], [6,7,8,9,10], [11,12,13,14,15]])
y = tf.Variable([[0,1]])
p = tf.gather_nd(a, y) 

with get_nice_session(0.05) as sess:
    sess.run(tf.global_variables_initializer())
    print(sess.run(a))
    print(sess.run(p))

[[ 1  2  3  4  5]
 [ 6  7  8  9 10]
 [11 12 13 14 15]]
[2]


In [8]:
def get_update_ops():
    # return operations assign weight to target network
    src_vars = [v for v in tf.global_variables() if 'online' in v.name]
    tar_vars = [v for v in tf.global_variables() if 'target' in v.name]
    update_ops = []
    for src_var, tar_var in zip(src_vars, tar_vars):
        update_ops.append(tar_var.assign(src_var))
    return update_ops


def update_target(update_ops, sess):
    sess.run(update_ops)

In [9]:
# init agent
tf.reset_default_graph()
num_action = len(env.getActionSet())

# agent for frequently updating
online_agent = Agent('online', num_action)

# agent for slow updating
target_agent = Agent('target', num_action)
update_ops = get_update_ops()

In [10]:
class Replay_buffer():
    def __init__(self, buffer_size=50000):
        self.experiences = []
        self.buffer_size = buffer_size

    def add(self, experience):
        if len(self.experiences) >= self.buffer_size:
            self.experiences.pop(0)
        self.experiences.append(experience)

    def sample(self, size):
        """
        sameple experience from buffer
        """
        if size > len(self.experiences):
            experiences_idx = np.random.choice(
                len(self.experiences), size=size)
        else:
            experiences_idx = np.random.choice(
                len(self.experiences), size=size, replace=False)
        # from all sampled experiences, extract a tuple of (s,a,r,s')
        screens = []
        actions = []
        rewards = []
        screens_plum = []
        terminal = []
        for i in range(size):
            screens.append(self.experiences[experiences_idx[i]][0])
            actions.append(self.experiences[experiences_idx[i]][1])
            rewards.append(self.experiences[experiences_idx[i]][2])
            screens_plum.append(self.experiences[experiences_idx[i]][3])
            terminal.append(self.experiences[experiences_idx[i]][4])
        return screens, actions, rewards, screens_plum, terminal


In [11]:
# init all
config = tf.ConfigProto()
config.gpu_options.allow_growth = True
sess = tf.InteractiveSession(config=config)
sess.run(tf.global_variables_initializer())

In [17]:
from IPython.display import Image, display

update_every_t_step = 3
print_every_episode = 10
save_video_every_episode = 100
NUM_EPISODE = 1000
NUM_EXPLORE = 20

# we can redefine origin reward function
reward_values = {
    "positive": 1,  # reward pass a pipe
    "tick": 0.1,  # reward per timestamp
    "loss": -1,  # reward of gameover
}

# init buffer
buffer = Replay_buffer()

for episode in range(0, NUM_EPISODE + 1):
    # Reset the environment
    game = FlappyBird()
    # for demo purpose, the following code is trained in the same scene,
    env = PLE(
        game,
        fps=30,
        display_screen=False,
        reward_values=reward_values,
        rng=np.random.RandomState(1))
    env.reset_game()
    env.act(0)  # dummy input to make sure input screen is correct

    # record frame
    if episode % save_video_every_episode == 0:
        frames = [env.getScreenRGB()]

    # for every 500 episodes, shutdown exploration to see performance of greedy action
    if episode % print_every_episode == 0:
        online_agent.shutdown_explore()

    # grayscale input screen for this episode
    input_screens = [preprocess(env.getScreenGrayscale())] * 4

    # experience for this episode, store all (s,a,r,s') tuple
    experience = []

    # cumulate reward for this episode
    cum_reward = 0

    t = 0
    while not env.game_over():

        # feed four previous screen, select an action
        action = online_agent.select_action(input_screens[-4:], sess)

        # execute the action and get reward
        reward = env.act(env.getActionSet()[action])

        # record frame
        if episode % save_video_every_episode == 0:
            frames.append(env.getScreenRGB())

        # cumulate reward
        cum_reward += reward

        # append grayscale screen for this episode
        input_screens.append(preprocess(env.getScreenGrayscale()))

        # append experience for this episode
        buffer.add((input_screens[-5:-1], action, reward, input_screens[-4:],
                    env.game_over()))
        t += 1

        # update agent
    if episode > NUM_EXPLORE:
        train_screens, train_actions,\
        train_rewards, train_screens_plum, terminal = buffer.sample(32)
        loss = online_agent.update_policy(train_screens, train_actions,
                                          train_rewards, train_screens_plum,
                                          terminal, target_agent)
    if t % update_every_t_step == 0 and episode > NUM_EXPLORE:
        update_target(update_ops, sess)

    # update explore rating and learning rate
    online_agent.update_parameters(episode)
    target_agent.update_parameters(episode)

    if episode % print_every_episode == 0 and episode > NUM_EXPLORE:
        print(
            "[{}] time live:{}, cumulated reward: {}, exploring rate: {}, loss: {}".
            format(episode, t, cum_reward, target_agent.exploring_rate, loss))

    if episode % save_video_every_episode == 0:  # for every 100 episode, record an animation
        clip = make_anim(frames, fps=60, true_image=True).rotate(-90)
        clip.write_videofile("movie/DQN-{}.webm".format(episode), fps=60)

print(type(input_screens))

[MoviePy] >>>> Building video movie/DQN-0.webm
[MoviePy] Writing video movie/DQN-0.webm


100%|██████████| 63/63 [00:01<00:00, 43.28it/s]

[MoviePy] Done.
[MoviePy] >>>> Video ready: movie/DQN-0.webm 






[30] time live:61, cumulated reward: 5.099999999999994, exploring rate: 0.09999227800000082, loss: 0.030505405738949776
[40] time live:61, cumulated reward: 5.099999999999994, exploring rate: 0.09999194800000086, loss: 0.0006794719374738634
[50] time live:61, cumulated reward: 5.099999999999994, exploring rate: 0.09999161800000089, loss: 0.03050941787660122
[60] time live:61, cumulated reward: 5.099999999999994, exploring rate: 0.09999128800000093, loss: 0.06140551716089249
[70] time live:61, cumulated reward: 5.099999999999994, exploring rate: 0.09999095800000096, loss: 0.0004684058949351311
[80] time live:61, cumulated reward: 5.099999999999994, exploring rate: 0.099990628000001, loss: 0.03038960136473179
[90] time live:61, cumulated reward: 5.099999999999994, exploring rate: 0.09999029800000103, loss: 0.000461193616501987
[100] time live:61, cumulated reward: 5.099999999999994, exploring rate: 0.09998996800000107, loss: 0.00021198575268499553
[MoviePy] >>>> Building video movie/DQN-

100%|██████████| 63/63 [00:01<00:00, 48.20it/s]

[MoviePy] Done.
[MoviePy] >>>> Video ready: movie/DQN-100.webm 






[110] time live:61, cumulated reward: 5.099999999999994, exploring rate: 0.0999896380000011, loss: 0.06195664033293724
[120] time live:61, cumulated reward: 5.099999999999994, exploring rate: 0.09998930800000114, loss: 4.2418931116117164e-05
[130] time live:61, cumulated reward: 5.099999999999994, exploring rate: 0.09998897800000117, loss: 5.0428425311110914e-05
[140] time live:61, cumulated reward: 5.099999999999994, exploring rate: 0.0999886480000012, loss: 0.0001882569631561637
[150] time live:61, cumulated reward: 5.099999999999994, exploring rate: 0.09998831800000124, loss: 0.0004954629694111645
[160] time live:61, cumulated reward: 5.099999999999994, exploring rate: 0.09998798800000128, loss: 0.06048739701509476
[170] time live:61, cumulated reward: 5.099999999999994, exploring rate: 0.09998765800000131, loss: 0.0004766923375427723
[180] time live:61, cumulated reward: 5.099999999999994, exploring rate: 0.09998732800000135, loss: 0.0005250619724392891
[190] time live:61, cumulate

100%|██████████| 63/63 [00:01<00:00, 48.56it/s]

[MoviePy] Done.
[MoviePy] >>>> Video ready: movie/DQN-200.webm 






[210] time live:61, cumulated reward: 5.099999999999994, exploring rate: 0.09998633800000145, loss: 0.00017205906624440104
[220] time live:61, cumulated reward: 5.099999999999994, exploring rate: 0.09998600800000149, loss: 0.00012447820336092263
[230] time live:61, cumulated reward: 5.099999999999994, exploring rate: 0.09998567800000152, loss: 5.560517456615344e-05
[240] time live:61, cumulated reward: 5.099999999999994, exploring rate: 0.09998534800000156, loss: 8.484514546580613e-05
[250] time live:61, cumulated reward: 5.099999999999994, exploring rate: 0.09998501800000159, loss: 0.00014122258289717138
[260] time live:61, cumulated reward: 5.099999999999994, exploring rate: 0.09998468800000163, loss: 0.00019265152513980865
[270] time live:61, cumulated reward: 5.099999999999994, exploring rate: 0.09998435800000166, loss: 0.030738985165953636
[280] time live:61, cumulated reward: 5.099999999999994, exploring rate: 0.0999840280000017, loss: 0.03065565600991249
[290] time live:61, cumu

100%|██████████| 63/63 [00:01<00:00, 43.63it/s]

[MoviePy] Done.
[MoviePy] >>>> Video ready: movie/DQN-300.webm 






[310] time live:61, cumulated reward: 5.099999999999994, exploring rate: 0.0999830380000018, loss: 0.03092280589044094
[320] time live:61, cumulated reward: 5.099999999999994, exploring rate: 0.09998270800000184, loss: 0.00020142178982496262
[330] time live:61, cumulated reward: 5.099999999999994, exploring rate: 0.09998237800000187, loss: 0.0002301264030393213
[340] time live:61, cumulated reward: 5.099999999999994, exploring rate: 0.0999820480000019, loss: 0.000500628084409982
[350] time live:61, cumulated reward: 5.099999999999994, exploring rate: 0.09998171800000194, loss: 0.00035959726665169
[360] time live:61, cumulated reward: 5.099999999999994, exploring rate: 0.09998138800000198, loss: 0.00027496315306052566
[370] time live:61, cumulated reward: 5.099999999999994, exploring rate: 0.09998105800000201, loss: 0.03050842322409153
[380] time live:61, cumulated reward: 5.099999999999994, exploring rate: 0.09998072800000204, loss: 0.06085682660341263
[390] time live:61, cumulated rew

100%|██████████| 63/63 [00:01<00:00, 42.49it/s]

[MoviePy] Done.
[MoviePy] >>>> Video ready: movie/DQN-400.webm 






[410] time live:61, cumulated reward: 5.099999999999994, exploring rate: 0.09997973800000215, loss: 0.030805883929133415
[420] time live:61, cumulated reward: 5.099999999999994, exploring rate: 0.09997940800000218, loss: 0.0002122895821230486
[430] time live:61, cumulated reward: 5.099999999999994, exploring rate: 0.09997907800000222, loss: 0.0001553200709167868
[440] time live:61, cumulated reward: 5.099999999999994, exploring rate: 0.09997874800000225, loss: 0.00010027435200754553
[450] time live:61, cumulated reward: 5.099999999999994, exploring rate: 0.09997841800000229, loss: 0.030663495883345604
[460] time live:61, cumulated reward: 5.099999999999994, exploring rate: 0.09997808800000232, loss: 0.00028207831201143563
[470] time live:61, cumulated reward: 5.099999999999994, exploring rate: 0.09997775800000236, loss: 0.0002648380759637803
[480] time live:61, cumulated reward: 5.099999999999994, exploring rate: 0.0999774280000024, loss: 0.0613083615899086
[490] time live:61, cumulate

100%|██████████| 63/63 [00:01<00:00, 45.53it/s]

[MoviePy] Done.
[MoviePy] >>>> Video ready: movie/DQN-500.webm 






[510] time live:61, cumulated reward: 5.099999999999994, exploring rate: 0.0999764380000025, loss: 0.030617745593190193
[520] time live:61, cumulated reward: 5.099999999999994, exploring rate: 0.09997610800000253, loss: 0.00013523291272576898
[530] time live:61, cumulated reward: 5.099999999999994, exploring rate: 0.09997577800000257, loss: 0.00024418008979409933
[540] time live:61, cumulated reward: 5.099999999999994, exploring rate: 0.0999754480000026, loss: 0.030611801892518997
[550] time live:61, cumulated reward: 5.099999999999994, exploring rate: 0.09997511800000264, loss: 0.00026435795007273555
[560] time live:61, cumulated reward: 5.099999999999994, exploring rate: 0.09997478800000267, loss: 0.03050723671913147
[570] time live:61, cumulated reward: 5.099999999999994, exploring rate: 0.09997445800000271, loss: 0.030456319451332092
[580] time live:61, cumulated reward: 5.099999999999994, exploring rate: 0.09997412800000274, loss: 0.0004437807947397232
[590] time live:61, cumulate

100%|██████████| 63/63 [00:01<00:00, 47.85it/s]

[MoviePy] Done.
[MoviePy] >>>> Video ready: movie/DQN-600.webm 






[610] time live:61, cumulated reward: 5.099999999999994, exploring rate: 0.09997313800000285, loss: 0.030364936217665672
[620] time live:61, cumulated reward: 5.099999999999994, exploring rate: 0.09997280800000288, loss: 0.059824101626873016
[630] time live:61, cumulated reward: 5.099999999999994, exploring rate: 0.09997247800000292, loss: 0.0009602685458958149
[640] time live:61, cumulated reward: 5.099999999999994, exploring rate: 0.09997214800000295, loss: 0.03045773319900036
[650] time live:61, cumulated reward: 5.099999999999994, exploring rate: 0.09997181800000299, loss: 0.00019931238784920424
[660] time live:61, cumulated reward: 5.099999999999994, exploring rate: 0.09997148800000302, loss: 0.06131594628095627
[670] time live:61, cumulated reward: 5.099999999999994, exploring rate: 0.09997115800000306, loss: 0.00018304203695151955
[680] time live:61, cumulated reward: 5.099999999999994, exploring rate: 0.09997082800000309, loss: 0.06131381541490555
[690] time live:61, cumulated 

100%|██████████| 63/63 [00:01<00:00, 44.26it/s]

[MoviePy] Done.
[MoviePy] >>>> Video ready: movie/DQN-700.webm 






[710] time live:61, cumulated reward: 5.099999999999994, exploring rate: 0.0999698380000032, loss: 0.0005352196749299765


KeyboardInterrupt: 

In [18]:
from moviepy.editor import *
clip = VideoFileClip("movie/DQN-700.webm")
display(clip.ipython_display(fps=60, autoplay=1, loop=1))

 98%|█████████▊| 63/64 [00:00<00:00, 237.53it/s]


In [19]:
import math
import copy
from collections import defaultdict
MIN_EXPLORING_RATE = 0.01
MIN_LEARNING_RATE = 0.1


class Policy_Gradiebt_Agent:
    def __init__(self, name, num_action, t=0, discount_factor=0.99):
        self.discount_factor = discount_factor
        self.num_action = num_action
        self.name = name
        with tf.variable_scope(name):
            self.build_model()

    def build_model(self):

        # input: current screen, selected action and reward
        self.input_screen = tf.placeholder(
            tf.float32, shape=[None, screen_width, screen_height, num_stack])
        self.action = tf.placeholder(tf.int32, [None])
        self.reward = tf.placeholder(tf.float32, [None])
        self.is_training = tf.placeholder(tf.bool, shape=[])

        def net(screen, reuse=False):
            with tf.variable_scope("layers", reuse=reuse):
                conv1 = tf.layers.conv2d(
                    inputs=screen,
                    filters=32,
                    kernel_size=[8, 8],
                    strides=[4, 4],
                    padding='SAME',
                    activation=tf.nn.relu)
                pool1 = tf.layers.max_pooling2d(
                    conv1, pool_size=[2, 2], strides=[2, 2], padding='SAME')

                conv2 = tf.layers.conv2d(
                    inputs=pool1,
                    filters=64,
                    kernel_size=[4, 4],
                    strides=[2, 2],
                    padding='SAME',
                    activation=tf.nn.relu)
                conv3 = tf.layers.conv2d(
                    inputs=conv2,
                    filters=64,
                    kernel_size=[3, 3],
                    strides=[1, 1],
                    padding='SAME',
                    activation=tf.nn.relu)
                self.flat = tf.contrib.layers.flatten(conv3)

                self.dense1 = tf.layers.dense(
                    inputs=self.flat, units=512, activation=tf.nn.relu)
                self.dense2 = tf.layers.dense(
                    inputs=self.dense1, units=self.num_action, activation=None)
                return self.dense2

        # optimize
        self.output_logit = net(
            self.input_screen
        )  # logit of probility(P(s,a,theta)) for all a, shape (batch_size, num_action)
        index = tf.stack(
            [tf.range(tf.shape(self.action)[0]), self.action], axis=1)
        self.prob = tf.gather_nd(
            tf.nn.softmax(self.output_logit),
            index)  # P(s,a,theta) for selected action, shape (batch_size, 1)

        # loss = E[log(p(s,a))*r]
        # because we want to maximize objective, add negative sign before loss
        self.loss = -tf.reduce_mean(
            tf.log(self.prob + 0.00000001) * self.reward)
        optimizer = tf.train.AdamOptimizer(learning_rate=1e-6)
        g_gvs = optimizer.compute_gradients(
            self.loss,
            var_list=[v for v in tf.global_variables() if self.name in v.name])
        self.train_op = optimizer.apply_gradients(g_gvs)

        self.pred = tf.multinomial(self.output_logit,
                                   1)  # sample action from distribution

    def select_action(self, input_screen, sess):
        input_screen = np.array(input_screen).transpose([1, 2, 0])
        feed_dict = {
            self.input_screen: input_screen[None, :],
            self.is_training: False,
        }
        action = sess.run(
            self.pred,
            feed_dict=feed_dict)[0][0]  # sameple action from distribution
        return action

    def update_policy(self, input_screens, actions, rewards,
                      input_screens_plum):
        feed_dict = {
            self.input_screen: np.array(input_screens).transpose([0, 2, 3, 1]),
            self.action: actions,
            self.reward: rewards,
            self.is_training: True,
        }
        loss, _ = sess.run([self.loss, self.train_op], feed_dict=feed_dict)
        return loss


In [20]:
# init agent
tf.reset_default_graph()
# agent for frequently updating
pg_agent = Policy_Gradiebt_Agent('PG_Agent', num_action)
# init all
config = tf.ConfigProto()
config.gpu_options.allow_growth = True
sess = tf.InteractiveSession(config=config)
sess.run(tf.global_variables_initializer())

In [21]:
[v for v in tf.global_variables() if pg_agent.name in v.name]

[<tf.Variable 'PG_Agent/layers/conv2d/kernel:0' shape=(8, 8, 4, 32) dtype=float32_ref>,
 <tf.Variable 'PG_Agent/layers/conv2d/bias:0' shape=(32,) dtype=float32_ref>,
 <tf.Variable 'PG_Agent/layers/conv2d_1/kernel:0' shape=(4, 4, 32, 64) dtype=float32_ref>,
 <tf.Variable 'PG_Agent/layers/conv2d_1/bias:0' shape=(64,) dtype=float32_ref>,
 <tf.Variable 'PG_Agent/layers/conv2d_2/kernel:0' shape=(3, 3, 64, 64) dtype=float32_ref>,
 <tf.Variable 'PG_Agent/layers/conv2d_2/bias:0' shape=(64,) dtype=float32_ref>,
 <tf.Variable 'PG_Agent/layers/dense/kernel:0' shape=(1600, 512) dtype=float32_ref>,
 <tf.Variable 'PG_Agent/layers/dense/bias:0' shape=(512,) dtype=float32_ref>,
 <tf.Variable 'PG_Agent/layers/dense_1/kernel:0' shape=(512, 2) dtype=float32_ref>,
 <tf.Variable 'PG_Agent/layers/dense_1/bias:0' shape=(2,) dtype=float32_ref>,
 <tf.Variable 'PG_Agent/beta1_power:0' shape=() dtype=float32_ref>,
 <tf.Variable 'PG_Agent/beta2_power:0' shape=() dtype=float32_ref>,
 <tf.Variable 'PG_Agent/PG_Agen

In [22]:
from IPython.display import Image, display

update_every_episode = 1
print_every_episode = 10
save_video_every_episode = 100
NUM_EPISODE = 100
NUM_EXPLORE = 10
NUM_PASS = 20
reward_values = {
    "positive": 1,
    "tick": 0.1,  # reward per timestamp
    "loss": -1,
}
for episode in range(0, NUM_EPISODE + 1):

    # Reset the environment
    game = FlappyBird()
    env = PLE(
        game,
        fps=30,
        display_screen=False,
        reward_values=reward_values,
        rng=np.random.RandomState(1))
    env.reset_game()
    env.act(0)  # dummy input to make sure input screen is correct

    # record frame
    if episode % save_video_every_episode == 0:
        frames = [env.getScreenRGB()]

    # grayscale input screen for this episode
    input_screens = [preprocess(env.getScreenGrayscale())] * 4

    # cumulate reward for this episode
    cum_reward = 0

    experiences = []
    t = 0
    while not env.game_over():
        # feed four previous screen, select an action
        action = pg_agent.select_action(input_screens[-4:], sess)

        # execute the action and get reward
        reward = env.act(env.getActionSet()[action])

        # record frame
        if episode % save_video_every_episode == 0:
            frames.append(env.getScreenRGB())

        # cumulate reward
        cum_reward += reward

        # append grayscale screen for this episode
        input_screens.append(preprocess(env.getScreenGrayscale()))

        # append experience for this episode
        experiences.append(
            [input_screens[-5:-1], action, reward, input_screens[-4:]])

        t += 1

    def discount_reward(x, discount_rate):
        discounted_r = np.zeros(len(x))
        num_r = len(x)
        for i in range(num_r):
            discounted_r[i] = x[i] * math.pow(discount_rate, i)
        discounted_r = np.cumsum(discounted_r[::-1])
        return discounted_r[::-1]

    rewards = [e[2] for e in experiences]
    discounted_reward = discount_reward(rewards, pg_agent.discount_factor)

    # normalize
    discounted_reward -= np.mean(discounted_reward)
    discounted_reward /= np.std(discounted_reward)
    train_screens = []
    train_actions = []
    train_rewards = []
    train_input_screens_plum = []
    for i in range(len(experiences)):
        experiences[i][2] = discounted_reward[i]
        train_screens.append(experiences[i][0])
        train_actions.append(experiences[i][1])
        train_rewards.append(experiences[i][2])
        train_input_screens_plum.append(experiences[i][3])
    loss = pg_agent.update_policy(train_screens, train_actions, train_rewards,
                                  train_input_screens_plum)

    if episode % print_every_episode == 0 and episode > NUM_EXPLORE:
        print("[{}] time live:{}, cumulated reward: {}, loss: {}".format(
            episode, t, cum_reward, loss))

    if episode % save_video_every_episode == 0 and episode > NUM_EXPLORE:  # for every 5000 episode, record an animation
        clip = make_anim(frames, fps=60, true_image=True).rotate(-90)
        clip.write_videofile("movie/pg_{}.webm".format(episode), fps=60)
        #display(clip.ipython_display(fps=60, autoplay=1, loop=1))


[20] time live:53, cumulated reward: 4.299999999999997, loss: -0.010702799074351788
[30] time live:61, cumulated reward: 5.099999999999994, loss: 0.0015100885648280382
[40] time live:54, cumulated reward: 4.399999999999997, loss: -0.004711168818175793
[50] time live:42, cumulated reward: 3.200000000000001, loss: 0.010399137623608112
[60] time live:61, cumulated reward: 5.099999999999994, loss: -0.024041535332798958
[70] time live:57, cumulated reward: 4.699999999999996, loss: 0.0027483219746500254
[80] time live:49, cumulated reward: 3.8999999999999986, loss: -0.014750772155821323
[90] time live:58, cumulated reward: 4.799999999999995, loss: -0.0070993490517139435
[100] time live:43, cumulated reward: 3.3000000000000007, loss: 0.0008063205168582499
[MoviePy] >>>> Building video movie/pg_100.webm
[MoviePy] Writing video movie/pg_100.webm


 98%|█████████▊| 44/45 [00:01<00:00, 41.65it/s]


[MoviePy] Done.
[MoviePy] >>>> Video ready: movie/pg_100.webm 



In [23]:
from moviepy.editor import *
clip = VideoFileClip("movie/pg_100.webm")
display(clip.ipython_display(fps=60, autoplay=1, loop=1))


100%|██████████| 44/44 [00:00<00:00, 181.16it/s]


In [27]:
class Actor_critic:
    def __init__(self, name, num_action, discount_factor=0.99):
        self.exploring_rate = 0.1
        self.discount_factor = discount_factor
        self.num_action = num_action
        self.name = name
        with tf.variable_scope(name):
            self.build_model()

    def build_model(self):
        # input: current screen, selected action and reward
        self.input_screen = tf.placeholder(
            tf.float32, shape=[None, screen_width, screen_height, num_stack])
        self.action = tf.placeholder(tf.int32, [None])
        self.reward = tf.placeholder(tf.float32, [None])
        self.is_training = tf.placeholder(tf.bool, shape=[])

        def value_net(screen, reuse=False):
            with tf.variable_scope(
                    "value_net",
                    reuse=reuse,
                    initializer=tf.truncated_normal_initializer(stddev=1e-2)):
                conv1 = tf.layers.conv2d(
                    inputs=screen,
                    filters=32,
                    kernel_size=[8, 8],
                    strides=[4, 4],
                    padding='SAME',
                    activation=tf.nn.relu)
                pool1 = tf.layers.max_pooling2d(
                    conv1, pool_size=[2, 2], strides=[2, 2], padding='SAME')

                conv2 = tf.layers.conv2d(
                    inputs=pool1,
                    filters=64,
                    kernel_size=[4, 4],
                    strides=[2, 2],
                    padding='SAME',
                    activation=tf.nn.relu)
                conv3 = tf.layers.conv2d(
                    inputs=conv2,
                    filters=64,
                    kernel_size=[3, 3],
                    strides=[1, 1],
                    padding='SAME',
                    activation=tf.nn.relu)
                flat = tf.contrib.layers.flatten(conv3)
                dense = tf.layers.dense(
                    inputs=flat, units=512, activation=tf.nn.relu)
                V = tf.layers.dense(inputs=dense, units=1, activation=None)
                return V

        def policy_net(screen, reuse=False):
            with tf.variable_scope("policy_net", reuse=reuse):
                conv1 = tf.layers.conv2d(
                    inputs=screen,
                    filters=32,
                    kernel_size=[8, 8],
                    strides=[4, 4],
                    padding='SAME',
                    activation=tf.nn.relu)
                pool1 = tf.layers.max_pooling2d(
                    conv1, pool_size=[2, 2], strides=[2, 2], padding='SAME')

                conv2 = tf.layers.conv2d(
                    inputs=pool1,
                    filters=64,
                    kernel_size=[4, 4],
                    strides=[2, 2],
                    padding='SAME',
                    activation=tf.nn.relu)
                conv3 = tf.layers.conv2d(
                    inputs=conv2,
                    filters=64,
                    kernel_size=[3, 3],
                    strides=[1, 1],
                    padding='SAME',
                    activation=tf.nn.relu)
                self.flat = tf.contrib.layers.flatten(conv3)

                self.dense1 = tf.layers.dense(
                    inputs=self.flat, units=512, activation=tf.nn.relu)
                self.dense2 = tf.layers.dense(
                    inputs=self.dense1, units=self.num_action, activation=None)
                return self.dense2

        # value
        self.v_output = value_net(
            self.input_screen
        )  # Q(s,a,theta) for all a, shape (batch_size, num_action)
        self.tar_V = tf.placeholder(tf.float32, [None])
        self.V_loss = tf.reduce_mean(
            tf.square(self.reward + self.discount_factor * self.tar_V -
                      self.v_output))
        optimizer = tf.train.AdamOptimizer(learning_rate=1e-6)
        g_gvs = optimizer.compute_gradients(
            self.V_loss,
            var_list=[
                v for v in tf.global_variables() if 'value_net' in v.name
            ])
        self.V_train_op = optimizer.apply_gradients(g_gvs)

        # policy
        self.policy_logit = policy_net(
            self.input_screen
        )  # logit of probility(P(s,a,theta)) for all a, shape (batch_size, num_action)
        index = tf.stack(
            [tf.range(tf.shape(self.action)[0]), self.action], axis=1)
        self.prob = tf.gather_nd(
            tf.nn.softmax(self.policy_logit),
            index)  # P(s,a,theta) for selected action, shape (batch_size, 1)

        # loss = E[log(p(s,a))*r]
        self.policy_loss = -tf.reduce_mean(
            tf.log(self.prob + 0.00000001) * self.reward
        )
        optimizer = tf.train.AdamOptimizer(learning_rate=1e-6)
        g_gvs = optimizer.compute_gradients(
            self.policy_loss,
            var_list=[
                v for v in tf.global_variables() if 'policy_net' in v.name
            ])
        self.train_op = optimizer.apply_gradients(g_gvs)
        self.pred = tf.multinomial(self.policy_logit,
                                   1)  # sample action from distribution

    def select_action(self, input_screen, sess):
        input_screen = np.array(input_screen).transpose([1, 2, 0])
        feed_dict = {
            self.input_screen: input_screen[None, :],
        }
        action = sess.run(
            self.pred,
            feed_dict=feed_dict)[0][0]  # sameple action from distribution
        return action

    def update_policy(self, input_screens, actions, rewards,
                      input_screens_plum):
        esti_V = sess.run(
            self.v_output, feed_dict={
                self.input_screen: np.array(input_screens_plum).transpose([0, 2, 3, 1]),
            }
        ).flatten()
        td_target = rewards + self.discount_factor * esti_V

        esti_V = sess.run(
            self.v_output, feed_dict={
                self.input_screen: np.array(input_screens).transpose([0, 2, 3, 1]),
            }
        ).flatten()
        td_error = td_target - esti_V

        V_loss, _ = sess.run(
            [self.V_loss, self.V_train_op], feed_dict={
                self.input_screen: np.array(input_screens).transpose([0, 2, 3, 1]),
                self.tar_V: td_target,
                self.reward: rewards,
            }
        )

        policy_loss, _ = sess.run(
            [self.policy_loss, self.train_op],
            feed_dict={
                self.input_screen: np.array(input_screens).transpose([0, 2, 3, 1]),
                self.action: actions,
                self.reward: td_error
            }
        )
        return V_loss, policy_loss

    def update_parameters(self, episode):
        if self.exploring_rate > MIN_EXPLORING_RATE:
            self.exploring_rate -= (0.1 - MIN_EXPLORING_RATE) / 3000000

    def shutdown_explore(self):
        # make action selection greedy
        self.exploring_rate = 0

In [28]:
# init agent
tf.reset_default_graph()
# agent for frequently updating
ac_agent = Actor_critic('PG_Agent', num_action)
# init all
config = tf.ConfigProto()
config.gpu_options.allow_growth = True
sess = tf.InteractiveSession(config=config)
sess.run(tf.global_variables_initializer())

In [29]:
from IPython.display import Image, display

update_every_episode = 1
print_every_episode = 10
save_video_every_episode = 100
NUM_EPISODE = 100
NUM_EXPLORE = 0
reward_values = {
    "positive": 1,
    "tick": 0.1,  # reward per timestamp
    "loss": -1,
}
for episode in range(0, NUM_EPISODE + 1):

    # Reset the environment
    game = FlappyBird()
    env = PLE(
        game,
        fps=30,
        display_screen=False,
        reward_values=reward_values,
        rng=np.random.RandomState(1))
    env.reset_game()
    env.act(0)  # dummy input to make sure input screen is correct

    # record frame
    if episode % save_video_every_episode == 0:
        frames = [env.getScreenRGB()]

    # grayscale input screen for this episode
    input_screens = [preprocess(env.getScreenGrayscale())] * 4

    # cumulate reward for this episode
    cum_reward = 0

    experiences = []
    t = 0
    while not env.game_over():
        # feed four previous screen, select an action
        action = ac_agent.select_action(input_screens[-4:], sess)

        # execute the action and get reward
        reward = env.act(env.getActionSet()[action])

        # record frame
        if episode % save_video_every_episode == 0:
            frames.append(env.getScreenRGB())

        # cumulate reward
        cum_reward += reward

        # append grayscale screen for this episode
        input_screens.append(preprocess(env.getScreenGrayscale()))

        # append experience for this episode
        experiences.append(
            [input_screens[-5:-1], action, reward, input_screens[-4:]])

        t += 1

    def discount_reward(x, discount_rate):
        discounted_r = np.zeros(len(x))
        num_r = len(x)
        for i in range(num_r):
            discounted_r[i] = x[i] * math.pow(discount_rate, i)
        discounted_r = np.cumsum(discounted_r[::-1])
        return discounted_r[::-1]

    rewards = [e[2] for e in experiences]
    discounted_reward = discount_reward(rewards, ac_agent.discount_factor)

    # normalize
    discounted_reward -= np.mean(discounted_reward)
    discounted_reward /= np.std(discounted_reward)
    train_screens = []
    train_actions = []
    train_rewards = []
    train_input_screens_plum = []
    for i in range(len(experiences)):
        experiences[i][2] = discounted_reward[i]
        train_screens.append(experiences[i][0])
        train_actions.append(experiences[i][1])
        train_rewards.append(experiences[i][2])
        train_input_screens_plum.append(experiences[i][3])
    loss = ac_agent.update_policy(train_screens, train_actions, train_rewards,
                                  train_input_screens_plum)

    if episode % print_every_episode == 0 and episode > NUM_EXPLORE:
        print("[{}] time live:{}, cumulated reward: {}, loss: {}".format(
            episode, t, cum_reward, loss))

    if episode % save_video_every_episode == 0 and episode > NUM_EXPLORE:  # for every 5000 episode, record an animation
        clip = make_anim(frames, fps=60, true_image=True).rotate(-90)
        clip.write_videofile("movie/ac_{}.webm".format(episode), fps=60)
        #display(clip.ipython_display(fps=60, autoplay=1, loop=1))

[10] time live:62, cumulated reward: 5.199999999999994, loss: (3.9600937, 0.0059588035)
[20] time live:48, cumulated reward: 3.799999999999999, loss: (3.9601042, 0.01066937)
[30] time live:53, cumulated reward: 4.299999999999997, loss: (3.9601014, -0.0039891838)
[40] time live:38, cumulated reward: 2.8000000000000025, loss: (3.9600999, -0.014589799)
[50] time live:57, cumulated reward: 4.699999999999996, loss: (3.9600956, -0.010471678)
[60] time live:45, cumulated reward: 3.5, loss: (3.9601038, 0.0020485348)
[70] time live:50, cumulated reward: 3.9999999999999982, loss: (3.9601028, -0.002130928)
[80] time live:54, cumulated reward: 4.399999999999997, loss: (3.9600999, -0.01099922)
[90] time live:43, cumulated reward: 3.3000000000000007, loss: (3.960104, 0.0063050734)
[100] time live:55, cumulated reward: 4.4999999999999964, loss: (3.9600995, 0.018882439)
[MoviePy] >>>> Building video movie/ac_100.webm
[MoviePy] Writing video movie/ac_100.webm


 98%|█████████▊| 56/57 [00:01<00:00, 41.31it/s]

[MoviePy] Done.
[MoviePy] >>>> Video ready: movie/ac_100.webm 






In [30]:
from moviepy.editor import *
clip = VideoFileClip("movie/ac_100.webm")
display(clip.ipython_display(fps=60, autoplay=1, loop=1))

100%|██████████| 56/56 [00:00<00:00, 206.42it/s]
