In [1]:
import random
from collections import deque

import tensorflow as tf
import gym
import numpy as np
import os

# When True, the latest checkpoint found under CHECKPOINT_PATH is restored at startup.
resume = True
# Directory where network checkpoints are written and looked up.
CHECKPOINT_PATH = 'deep_q_pong'
# Length of the one-hot action vector and width of the network's output layer.
ACTIONS_COUNT = 3
# Dimensions of the pre-processed frame fed to the network.
SCREEN_WIDTH, SCREEN_HEIGHT = (80, 80)
# Discount applied to the best next-state Q-value when building training targets.
FUTURE_REWARD_DISCOUNT = 0.99
# Training (and the decay of the random-action probability) only starts once the
# replay memory holds more than this many transitions.
OBSERVATION_STEPS = 50000
# Number of decay steps used to move the random-action probability from its
# initial value to its final value.
EXPLORE_STEPS = 2000000
# Starting probability of picking a random action.
INITIAL_RANDOM_ACTION_PROB = 1.0 
# The random-action probability stops being decreased once it is no longer above this value.
FINAL_RANDOM_ACTION_PROB = 0.05  
# Maximum number of transitions kept in the replay memory.
MEMORY_SIZE = 100000  
# Number of transitions sampled from the replay memory for each training step.
MINI_BATCH_SIZE = 100  
# Number of consecutive frames stacked along the last axis to form one state.
STATE_FRAMES = 2
# Positions of the fields inside each stored transition tuple.
OBS_LAST_STATE_INDEX, OBS_ACTION_INDEX, OBS_REWARD_INDEX, OBS_CURRENT_STATE_INDEX, OBS_TERMINAL_INDEX = range(5)
# A checkpoint is saved whenever the training-step counter is a multiple of this.
SAVE_EVERY_X_STEPS = 10000
# Learning rate passed to the Adam optimizer.
LEARN_RATE = 1e-6
# Maximum number of recent non-zero rewards kept for logging; also used as the
# divisor of the logged "scores differential" (note: this is a float).
STORE_SCORES_LEN = 1000.
# When True, the Q-values are printed every time an action is chosen greedily.
verbose_logging = True


def _create_network():
    """Build the convolutional Q-network graph.

    The network is three conv + ReLU + 2x2 max-pool stages followed by one
    hidden fully connected layer and a linear output layer with one value
    per action.

    Returns:
        A tuple ``(input_layer, output_layer)``: the float placeholder of
        shape ``[None, SCREEN_WIDTH, SCREEN_HEIGHT, STATE_FRAMES]`` and the
        tensor holding ``ACTIONS_COUNT`` outputs per input state.
    """

    def new_weights(shape):
        # Small truncated-normal initialisation for kernels and matrices.
        return tf.Variable(tf.truncated_normal(shape, stddev=0.01))

    def new_bias(size):
        # Biases start at a small positive constant.
        return tf.Variable(tf.constant(0.01, shape=[size]))

    def conv_relu(inputs, kernel, bias, stride):
        conv = tf.nn.conv2d(inputs, kernel, strides=[1, stride, stride, 1], padding="SAME")
        return tf.nn.relu(conv + bias)

    def max_pool_2x2(inputs):
        return tf.nn.max_pool(inputs, ksize=[1, 2, 2, 1], strides=[1, 2, 2, 1], padding="SAME")

    # Network parameters. The variables carry no explicit names, so keep this
    # creation order unchanged (existing checkpoints presumably rely on the
    # auto-generated names -- TODO confirm before reordering).
    kernel_1 = new_weights([8, 8, STATE_FRAMES, 32])
    bias_1 = new_bias(32)

    kernel_2 = new_weights([4, 4, 32, 64])
    bias_2 = new_bias(64)

    kernel_3 = new_weights([3, 3, 64, 64])
    bias_3 = new_bias(64)

    dense_weights = new_weights([256, 256])
    dense_bias = new_bias(256)

    output_weights = new_weights([256, ACTIONS_COUNT])
    output_bias = new_bias(ACTIONS_COUNT)

    input_layer = tf.placeholder("float", [None, SCREEN_WIDTH, SCREEN_HEIGHT, STATE_FRAMES])

    # Convolutional feature extractor: strides 4, 2 and 1, each followed by pooling.
    stage_1 = max_pool_2x2(conv_relu(input_layer, kernel_1, bias_1, 4))
    stage_2 = max_pool_2x2(conv_relu(stage_1, kernel_2, bias_2, 2))
    stage_3 = max_pool_2x2(conv_relu(stage_2, kernel_3, bias_3, 1))

    # Flatten to 256 features per example (2 * 2 * 64 for the 80x80 input).
    flattened = tf.reshape(stage_3, [-1, 256])

    hidden = tf.nn.relu(tf.matmul(flattened, dense_weights) + dense_bias)

    # Linear read-out: one value per action.
    output_layer = tf.matmul(hidden, output_weights) + output_bias

    return input_layer, output_layer


# Build the graph: the Q-network plus the training objective.
_session = tf.Session()
_input_layer, _output_layer = _create_network()

# One-hot encoding of the action taken, and the target value for that action.
_action = tf.placeholder("float", [None, ACTIONS_COUNT])
_target = tf.placeholder("float", [None])

# Network output for the action that was actually taken (the one-hot mask
# zeroes out the other actions before summing).
# `axis` replaces the deprecated `reduction_indices` argument.
readout_action = tf.reduce_sum(tf.multiply(_output_layer, _action), axis=1)

# Mean squared error between the target and the network output for that action.
cost = tf.reduce_mean(tf.square(_target - readout_action))
_train_operation = tf.train.AdamOptimizer(LEARN_RATE).minimize(cost)

# Replay memory of transitions and the most recent non-zero rewards.
_observations = deque()
_last_scores = deque()

# One-hot action used for the very first step.
# NOTE(review): index 1 is translated to key press 2 by
# _key_presses_from_action, which its comments label "up" rather than
# "stay" -- confirm which initial action is intended.
_last_action = np.zeros(ACTIONS_COUNT)
_last_action[1] = 1

_last_state = None
_probability_of_random_action = INITIAL_RANDOM_ACTION_PROB
_time = 0

# tf.initialize_all_variables() is deprecated; use its documented replacement.
_session.run(tf.global_variables_initializer())

saver = tf.train.Saver()

if not os.path.exists(CHECKPOINT_PATH):
    os.mkdir(CHECKPOINT_PATH)

if resume:
    checkpoint = tf.train.get_checkpoint_state(CHECKPOINT_PATH)
    # Only restore when a checkpoint state exists and names a model file.
    if checkpoint and checkpoint.model_checkpoint_path:
        saver.restore(_session, checkpoint.model_checkpoint_path)


def _choose_next_action():
    """Pick the next action with an epsilon-greedy rule.

    With probability ``_probability_of_random_action`` a uniformly random
    action is chosen; otherwise the action with the largest network output
    for ``_last_state`` is chosen.

    Returns:
        A one-hot numpy array of length ``ACTIONS_COUNT``.
    """
    explore = random.random() <= _probability_of_random_action

    if explore:
        # Exploration: any action, uniformly at random.
        chosen_index = random.randrange(ACTIONS_COUNT)
    else:
        # Exploitation: evaluate the network on the latest state.
        q_values = _session.run(_output_layer, feed_dict={_input_layer: [_last_state]})[0]
        if verbose_logging:
            print("Action Q-Values are %s" % q_values)
        chosen_index = np.argmax(q_values)

    one_hot_action = np.zeros([ACTIONS_COUNT])
    one_hot_action[chosen_index] = 1
    return one_hot_action


def pre_process(screen_image):
    """Convert a raw RGB frame into a binary float image.

    Rows 35..194 of the frame are kept, every second pixel is taken along
    both axes, and only the first colour channel is used. For the 210x160x3
    frame this was written for, the result is 80x80.

    Args:
        screen_image: Array indexed as ``[row, column, channel]``. It is
            not modified.

    Returns:
        A new ``float64`` array that is 0.0 where the sampled pixel value is
        0, 144 or 109 (the two background shades) and 1.0 everywhere else
        (paddles and ball).
    """
    # Crop and down-sample by 2 in one slicing step, first channel only.
    playfield = screen_image[35:195:2, ::2, 0]

    # Build a fresh mask instead of assigning into the slice: the slice is a
    # view, so in-place writes would corrupt the caller's frame.
    foreground = (playfield != 0) & (playfield != 144) & (playfield != 109)

    # np.float was removed from NumPy; float64 is the dtype it used to alias.
    return foreground.astype(np.float64)


def _key_presses_from_action(action_set):
    # 1 = 유지
    # 2 = 위로
    # 3 = 아래로

    if action_set[0] == 1:
        return 1
    elif action_set[1] == 1:
        return 2
    elif action_set[2] == 1:
        return 3
    raise Exception("원하지않는 액션발생 ")


def _train():
    """Run one training step on a random mini-batch from the replay memory.

    The target for each sampled transition is its reward, plus -- unless the
    transition is terminal -- the discounted maximum network output for the
    following state. A checkpoint is written whenever ``_time`` is a
    multiple of ``SAVE_EVERY_X_STEPS``.
    """
    # Draw the transitions to learn from.
    batch = random.sample(_observations, MINI_BATCH_SIZE)

    # Split the transitions into per-field lists.
    states_before = [transition[OBS_LAST_STATE_INDEX] for transition in batch]
    actions_taken = [transition[OBS_ACTION_INDEX] for transition in batch]
    rewards = [transition[OBS_REWARD_INDEX] for transition in batch]
    states_after = [transition[OBS_CURRENT_STATE_INDEX] for transition in batch]

    # Network outputs for every action in each following state.
    next_q_values = _session.run(_output_layer, feed_dict={_input_layer: states_after})

    # Terminal transitions earn nothing further; the others add the
    # discounted best value of the following state.
    targets = [
        reward if transition[OBS_TERMINAL_INDEX]
        else reward + FUTURE_REWARD_DISCOUNT * np.max(q_values)
        for transition, reward, q_values in zip(batch, rewards, next_q_values)
    ]

    # Move the output for the taken action towards its target.
    feed = {
        _input_layer: states_before,
        _action: actions_taken,
        _target: targets,
    }
    _session.run(_train_operation, feed_dict=feed)

    # Periodically persist the network so training can be resumed later.
    if _time % SAVE_EVERY_X_STEPS == 0:
        saver.save(_session, CHECKPOINT_PATH + '/network', global_step=_time)

# Main loop: play Pong forever, store every transition in the replay memory,
# and train the network once enough transitions have been collected.
env = gym.make("Pong-v0")
observation = env.reset()
# Key code sent to the environment on the very first step.
next_action = 1

while True:
    env.render()

    observation, reward, done, info = env.step(next_action)

    # NOTE(review): the observation returned by env.reset() is discarded; the
    # last frame of the finished episode is what gets processed below.
    if done:
        env.reset()

    terminal = False

    screen_binary = pre_process(observation)

    # Any non-zero reward marks the transition as terminal and is recorded
    # in the bounded list of recent scores.
    if reward != 0.0:
        terminal = True
        _last_scores.append(reward)
        if len(_last_scores) > STORE_SCORES_LEN:
            _last_scores.popleft()

    # The very first frame is handled separately: there is no previous state yet.
    if _last_state is None:
        # _last_state holds the image data of the last STATE_FRAMES frames;
        # initially the first frame is simply repeated.
        _last_state = np.stack(tuple(screen_binary for _ in range(STATE_FRAMES)), axis=2)
        next_action = _key_presses_from_action(_last_action)
    else:
        # Drop the oldest frame and append the newest along the last axis.
        screen_binary = np.reshape(screen_binary,
                                   (SCREEN_WIDTH, SCREEN_HEIGHT, 1))
        current_state = np.append(_last_state[:, :, 1:], screen_binary, axis=2)

        # Store the transition from the previous state to the current one.
        _observations.append((_last_state, _last_action, reward, current_state, terminal))

        # Keep the replay memory bounded by discarding the oldest transition.
        if len(_observations) > MEMORY_SIZE:
            _observations.popleft()

        # Only start training once the observation phase is over.
        # _time counts training steps, not environment steps.
        if len(_observations) > OBSERVATION_STEPS:
            _train()
            _time += 1

        # Advance to the new state.
        _last_state = current_state

        _last_action = _choose_next_action()

        # Gradually reduce the probability of taking a random action
        # (only after the observation phase has finished).
        if _probability_of_random_action > FINAL_RANDOM_ACTION_PROB \
                and len(_observations) > OBSERVATION_STEPS:
            _probability_of_random_action -= \
                (INITIAL_RANDOM_ACTION_PROB - FINAL_RANDOM_ACTION_PROB) / EXPLORE_STEPS

        # The logged "scores differential" is the sum of the stored rewards
        # divided by STORE_SCORES_LEN, not by the number of stored rewards.
        print("Time: %s random_action_prob: %s reward %s scores differential %s" %
              (_time, _probability_of_random_action, reward,
               sum(_last_scores) / STORE_SCORES_LEN))

        next_action = _key_presses_from_action(_last_action)

Instructions for updating:
Use `tf.global_variables_initializer` instead.


  result = entry_point.load(False)


Time: 0 random_action_prob: 1.0 reward 0.0 scores differential 0.0
Time: 0 random_action_prob: 1.0 reward 0.0 scores differential 0.0
Time: 0 random_action_prob: 1.0 reward 0.0 scores differential 0.0
Time: 0 random_action_prob: 1.0 reward 0.0 scores differential 0.0
Time: 0 random_action_prob: 1.0 reward 0.0 scores differential 0.0
Time: 0 random_action_prob: 1.0 reward 0.0 scores differential 0.0
Time: 0 random_action_prob: 1.0 reward 0.0 scores differential 0.0
Time: 0 random_action_prob: 1.0 reward 0.0 scores differential 0.0
Time: 0 random_action_prob: 1.0 reward 0.0 scores differential 0.0
Time: 0 random_action_prob: 1.0 reward 0.0 scores differential 0.0
Time: 0 random_action_prob: 1.0 reward 0.0 scores differential 0.0
Time: 0 random_action_prob: 1.0 reward 0.0 scores differential 0.0
Time: 0 random_action_prob: 1.0 reward 0.0 scores differential 0.0
Time: 0 random_action_prob: 1.0 reward 0.0 scores differential 0.0
Time: 0 random_action_prob: 1.0 reward 0.0 scores differential

Time: 0 random_action_prob: 1.0 reward 0.0 scores differential -0.002
Time: 0 random_action_prob: 1.0 reward 0.0 scores differential -0.002
Time: 0 random_action_prob: 1.0 reward 0.0 scores differential -0.002
Time: 0 random_action_prob: 1.0 reward 0.0 scores differential -0.002
Time: 0 random_action_prob: 1.0 reward 0.0 scores differential -0.002
Time: 0 random_action_prob: 1.0 reward 0.0 scores differential -0.002
Time: 0 random_action_prob: 1.0 reward 0.0 scores differential -0.002
Time: 0 random_action_prob: 1.0 reward 0.0 scores differential -0.002
Time: 0 random_action_prob: 1.0 reward 0.0 scores differential -0.002
Time: 0 random_action_prob: 1.0 reward 0.0 scores differential -0.002
Time: 0 random_action_prob: 1.0 reward 0.0 scores differential -0.002
Time: 0 random_action_prob: 1.0 reward 0.0 scores differential -0.002
Time: 0 random_action_prob: 1.0 reward 0.0 scores differential -0.002
Time: 0 random_action_prob: 1.0 reward 0.0 scores differential -0.002
Time: 0 random_actio

Time: 0 random_action_prob: 1.0 reward 0.0 scores differential -0.004
Time: 0 random_action_prob: 1.0 reward 0.0 scores differential -0.004
Time: 0 random_action_prob: 1.0 reward 0.0 scores differential -0.004
Time: 0 random_action_prob: 1.0 reward 0.0 scores differential -0.004
Time: 0 random_action_prob: 1.0 reward 0.0 scores differential -0.004
Time: 0 random_action_prob: 1.0 reward 0.0 scores differential -0.004
Time: 0 random_action_prob: 1.0 reward 0.0 scores differential -0.004
Time: 0 random_action_prob: 1.0 reward 0.0 scores differential -0.004
Time: 0 random_action_prob: 1.0 reward 0.0 scores differential -0.004
Time: 0 random_action_prob: 1.0 reward 0.0 scores differential -0.004
Time: 0 random_action_prob: 1.0 reward 0.0 scores differential -0.004
Time: 0 random_action_prob: 1.0 reward 0.0 scores differential -0.004
Time: 0 random_action_prob: 1.0 reward 0.0 scores differential -0.004
Time: 0 random_action_prob: 1.0 reward 0.0 scores differential -0.004
Time: 0 random_actio

Time: 0 random_action_prob: 1.0 reward -1.0 scores differential -0.005
Time: 0 random_action_prob: 1.0 reward 0.0 scores differential -0.005
Time: 0 random_action_prob: 1.0 reward 0.0 scores differential -0.005
Time: 0 random_action_prob: 1.0 reward 0.0 scores differential -0.005
Time: 0 random_action_prob: 1.0 reward 0.0 scores differential -0.005
Time: 0 random_action_prob: 1.0 reward 0.0 scores differential -0.005
Time: 0 random_action_prob: 1.0 reward 0.0 scores differential -0.005
Time: 0 random_action_prob: 1.0 reward 0.0 scores differential -0.005
Time: 0 random_action_prob: 1.0 reward 0.0 scores differential -0.005
Time: 0 random_action_prob: 1.0 reward 0.0 scores differential -0.005
Time: 0 random_action_prob: 1.0 reward 0.0 scores differential -0.005
Time: 0 random_action_prob: 1.0 reward 0.0 scores differential -0.005
Time: 0 random_action_prob: 1.0 reward 0.0 scores differential -0.005
Time: 0 random_action_prob: 1.0 reward 0.0 scores differential -0.005
Time: 0 random_acti

Time: 0 random_action_prob: 1.0 reward 0.0 scores differential -0.008
Time: 0 random_action_prob: 1.0 reward 0.0 scores differential -0.008
Time: 0 random_action_prob: 1.0 reward 0.0 scores differential -0.008
Time: 0 random_action_prob: 1.0 reward 0.0 scores differential -0.008
Time: 0 random_action_prob: 1.0 reward 0.0 scores differential -0.008
Time: 0 random_action_prob: 1.0 reward 0.0 scores differential -0.008
Time: 0 random_action_prob: 1.0 reward 0.0 scores differential -0.008
Time: 0 random_action_prob: 1.0 reward 0.0 scores differential -0.008
Time: 0 random_action_prob: 1.0 reward 0.0 scores differential -0.008
Time: 0 random_action_prob: 1.0 reward 0.0 scores differential -0.008
Time: 0 random_action_prob: 1.0 reward 0.0 scores differential -0.008
Time: 0 random_action_prob: 1.0 reward 0.0 scores differential -0.008
Time: 0 random_action_prob: 1.0 reward 0.0 scores differential -0.008
Time: 0 random_action_prob: 1.0 reward 0.0 scores differential -0.008
Time: 0 random_actio

Time: 0 random_action_prob: 1.0 reward 0.0 scores differential -0.011
Time: 0 random_action_prob: 1.0 reward 0.0 scores differential -0.011
Time: 0 random_action_prob: 1.0 reward 0.0 scores differential -0.011
Time: 0 random_action_prob: 1.0 reward 0.0 scores differential -0.011
Time: 0 random_action_prob: 1.0 reward 0.0 scores differential -0.011
Time: 0 random_action_prob: 1.0 reward 0.0 scores differential -0.011
Time: 0 random_action_prob: 1.0 reward 0.0 scores differential -0.011
Time: 0 random_action_prob: 1.0 reward 0.0 scores differential -0.011
Time: 0 random_action_prob: 1.0 reward 0.0 scores differential -0.011
Time: 0 random_action_prob: 1.0 reward 0.0 scores differential -0.011
Time: 0 random_action_prob: 1.0 reward 0.0 scores differential -0.011
Time: 0 random_action_prob: 1.0 reward 0.0 scores differential -0.011
Time: 0 random_action_prob: 1.0 reward 0.0 scores differential -0.011
Time: 0 random_action_prob: 1.0 reward 0.0 scores differential -0.011
Time: 0 random_actio

KeyboardInterrupt: 