In [None]:

import random
from collections import deque

# TensorFlow must be imported before the gym library.
import tensorflow as tf
import gym
import numpy as np

# Environment the agent is trained on; the network sizes below are read
# from its observation and action spaces.
env = gym.make('CartPole-v0')

# Number of discrete actions. Read from the environment (like INPUT_NODES)
# instead of being hard-coded, so both network dimensions follow the env.
ACTIONS_COUNT = int(env.action_space.n)
# Discount applied to the best predicted next-state value in the targets.
FUTURE_REWARD_DISCOUNT = 0.9
# Number of transitions to collect before any training happens.
OBSERVATION_STEPS = 5000
# Number of steps over which the random-action probability is annealed
# from its initial value to its final value.
EXPLORE_STEPS = 15000
# Probability of taking a random action at the start of the run.
INITIAL_RANDOM_ACTION_PROB = 1.0
# Probability of taking a random action once annealing has finished.
FINAL_RANDOM_ACTION_PROB = 0.0
# Maximum number of transitions kept in the replay memory.
MEMORY_SIZE = 20000
# Number of transitions sampled for each training step.
MINI_BATCH_SIZE = 100
# Positions of the fields inside a stored transition tuple.
OBS_LAST_STATE_INDEX, OBS_ACTION_INDEX, OBS_REWARD_INDEX, OBS_CURRENT_STATE_INDEX, OBS_TERMINAL_INDEX = range(5)
# Learning rate handed to the optimizer.
LEARN_RATE = 1e-3
# Number of most recent episode scores kept for the reported mean.
STORE_SCORES_LEN = 100
# Size of one observation vector, i.e. the width of the network input.
INPUT_NODES = env.observation_space.shape[0]
# Width of the single hidden layer.
HIDDEN_NODES = 20

# TensorFlow 1.x session used for every forward pass and training step.
session = tf.Session()

# Layer 1 (observation -> hidden): small random weights, zero biases.
feed_forward_weights_1 = tf.Variable(tf.truncated_normal([INPUT_NODES, HIDDEN_NODES], stddev=0.01))
feed_forward_bias_1 = tf.Variable(tf.constant(0.0, shape=[HIDDEN_NODES]))

# Layer 2 (hidden -> one value per action): small random weights, zero biases.
feed_forward_weights_2 = tf.Variable(tf.truncated_normal([HIDDEN_NODES, ACTIONS_COUNT], stddev=0.01))
feed_forward_bias_2 = tf.Variable(tf.constant(0.0, shape=[ACTIONS_COUNT]))

# Batch of observations -> tanh hidden layer -> linear output layer that
# holds one estimated value per action.
input_placeholder = tf.placeholder("float", [None, INPUT_NODES])
hidden_layer = tf.nn.tanh(tf.matmul(input_placeholder, feed_forward_weights_1) + feed_forward_bias_1)
output_layer = tf.matmul(hidden_layer, feed_forward_weights_2) + feed_forward_bias_2

# One-hot encoded actions that were taken, and the target value for each.
action_placeholder = tf.placeholder("float", [None, ACTIONS_COUNT])
target_placeholder = tf.placeholder("float", [None])

# Multiplying by the one-hot action and summing over the action axis keeps
# only the network output of the action that was actually taken.
# `axis` replaces the deprecated `reduction_indices` alias.
readout_action = tf.reduce_sum(tf.multiply(output_layer, action_placeholder), axis=1)

# Mean squared error between target and predicted value, minimized with Adam.
cost = tf.reduce_mean(tf.square(target_placeholder - readout_action))
train_operation = tf.train.AdamOptimizer(LEARN_RATE).minimize(cost)

# Replay memory of (last_state, action, reward, current_state, terminal)
# tuples; the deque discards the oldest entry once MEMORY_SIZE is reached.
observations = deque(maxlen=MEMORY_SIZE)

# Total reward of the most recent episodes, used for the reported mean.
scores = deque(maxlen=STORE_SCORES_LEN)


# Placeholder one-hot action (index 1 set). The main loop replaces it with
# the result of choose_next_action() before the first environment step.
last_action = np.zeros(ACTIONS_COUNT)
last_action[1] = 1

# Current chance of acting randomly; annealed by the main loop.
probability_of_random_action = INITIAL_RANDOM_ACTION_PROB
# Number of training steps performed so far.
time = 0

# tf.initialize_all_variables() is deprecated; this is the replacement that
# TensorFlow's own deprecation warning asks for.
session.run(tf.global_variables_initializer())


def choose_next_action(state):
    """Pick the next action epsilon-greedily and return it one-hot encoded.

    With probability ``probability_of_random_action`` a uniformly random
    action index is drawn. Otherwise the network output for ``state`` is
    evaluated and the index of its largest value is used.

    Args:
        state: A single observation; it is wrapped into a batch of one
            before being fed to ``input_placeholder``.

    Returns:
        A NumPy vector of length ``ACTIONS_COUNT`` that is 1 at the chosen
        index and 0 everywhere else.
    """
    explore = random.random() <= probability_of_random_action
    if explore:
        # Exploration: any action index, uniformly at random.
        chosen_index = random.randrange(ACTIONS_COUNT)
    else:
        # Exploitation: evaluate the network and take the best-valued action.
        batch_output = session.run(output_layer, feed_dict={input_placeholder: [state]})
        chosen_index = np.argmax(batch_output[0])

    one_hot = np.zeros([ACTIONS_COUNT])
    one_hot[chosen_index] = 1
    return one_hot


def train():
    """Run one training step on a random mini-batch from the replay memory.

    Samples ``MINI_BATCH_SIZE`` transitions from ``observations``, builds a
    target value for each of them and runs ``train_operation`` once.

    The target of a terminal transition is its reward alone. For any other
    transition it is the reward plus ``FUTURE_REWARD_DISCOUNT`` times the
    largest network output for the state that followed.
    """
    batch = random.sample(observations, MINI_BATCH_SIZE)

    # Transpose the list of transition tuples into one sequence per field.
    columns = list(zip(*batch))
    states_before = list(columns[OBS_LAST_STATE_INDEX])
    taken_actions = list(columns[OBS_ACTION_INDEX])
    batch_rewards = list(columns[OBS_REWARD_INDEX])
    states_after = list(columns[OBS_CURRENT_STATE_INDEX])
    terminal_flags = columns[OBS_TERMINAL_INDEX]

    # Network output for every follow-up state, one row per transition.
    next_state_values = session.run(output_layer, feed_dict={input_placeholder: states_after})

    # No future reward is added once the episode has ended.
    targets = [
        reward if is_terminal else reward + FUTURE_REWARD_DISCOUNT * np.max(values)
        for reward, is_terminal, values in zip(batch_rewards, terminal_flags, next_state_values)
    ]

    # Fit the value of the action taken in each earlier state to its target.
    session.run(train_operation, feed_dict={
        input_placeholder: states_before,
        action_placeholder: taken_actions,
        target_placeholder: targets})


# Main interaction/training loop. It has no exit condition and runs until
# the process is interrupted.
last_state = env.reset()
total_reward = 0

while True:
    env.render()
    last_action = choose_next_action(last_state)
    # The environment expects an action index, not the one-hot vector.
    current_state, reward, terminal, info = env.step(np.argmax(last_action))
    total_reward += reward

    if terminal:
        # The stored transition gets a reward of -1 for the final step;
        # total_reward was already updated with the environment's reward.
        reward = -1.
        scores.append(total_reward)

        # The value printed after "scores differential" is the mean of the
        # stored episode scores.
        print("Time: %s random_action_prob: %s reward %s scores differential %s" %
              (time, probability_of_random_action, total_reward,
               np.mean(scores)))
        total_reward = 0

    # Store the transition that was just observed.
    observations.append((last_state, last_action, reward, current_state, terminal))

    # Train only once the initial observation phase is over.
    if len(observations) > OBSERVATION_STEPS:
        train()
        time += 1

    # Move on to the next state, starting a new episode if this one ended.
    if terminal:
        last_state = env.reset()
    else:
        last_state = current_state

    # Gradually lower the random-action probability. Repeated float
    # subtraction does not land exactly on the final value, so the result is
    # clamped to keep it from dropping below FINAL_RANDOM_ACTION_PROB.
    if probability_of_random_action > FINAL_RANDOM_ACTION_PROB \
            and len(observations) > OBSERVATION_STEPS:
        probability_of_random_action = max(
            FINAL_RANDOM_ACTION_PROB,
            probability_of_random_action
            - (INITIAL_RANDOM_ACTION_PROB - FINAL_RANDOM_ACTION_PROB) / EXPLORE_STEPS)

Instructions for updating:
Use `tf.global_variables_initializer` instead.
Time: 0 random_action_prob: 1.0 reward 17.0 scores differential 17.0
Time: 0 random_action_prob: 1.0 reward 18.0 scores differential 17.5
Time: 0 random_action_prob: 1.0 reward 25.0 scores differential 20.0
Time: 0 random_action_prob: 1.0 reward 28.0 scores differential 22.0
Time: 0 random_action_prob: 1.0 reward 12.0 scores differential 20.0
Time: 0 random_action_prob: 1.0 reward 15.0 scores differential 19.166666666666668
Time: 0 random_action_prob: 1.0 reward 25.0 scores differential 20.0
Time: 0 random_action_prob: 1.0 reward 35.0 scores differential 21.875
Time: 0 random_action_prob: 1.0 reward 40.0 scores differential 23.88888888888889
Time: 0 random_action_prob: 1.0 reward 30.0 scores differential 24.5
Time: 0 random_action_prob: 1.0 reward 12.0 scores differential 23.363636363636363
Time: 0 random_action_prob: 1.0 reward 44.0 scores differential 25.083333333333332
Time: 0 random_action_prob: 1.0 reward 9.

Time: 0 random_action_prob: 1.0 reward 23.0 scores differential 23.02
Time: 0 random_action_prob: 1.0 reward 14.0 scores differential 22.91
Time: 0 random_action_prob: 1.0 reward 32.0 scores differential 22.88
Time: 0 random_action_prob: 1.0 reward 10.0 scores differential 22.58
Time: 0 random_action_prob: 1.0 reward 29.0 scores differential 22.57
Time: 0 random_action_prob: 1.0 reward 32.0 scores differential 22.77
Time: 0 random_action_prob: 1.0 reward 35.0 scores differential 22.68
Time: 0 random_action_prob: 1.0 reward 28.0 scores differential 22.87
Time: 0 random_action_prob: 1.0 reward 14.0 scores differential 22.86
Time: 0 random_action_prob: 1.0 reward 12.0 scores differential 22.85
Time: 0 random_action_prob: 1.0 reward 24.0 scores differential 22.99
Time: 0 random_action_prob: 1.0 reward 14.0 scores differential 22.8
Time: 0 random_action_prob: 1.0 reward 25.0 scores differential 22.7
Time: 0 random_action_prob: 1.0 reward 11.0 scores differential 22.64
Time: 0 random_action_

Time: 0 random_action_prob: 1.0 reward 15.0 scores differential 20.84
Time: 0 random_action_prob: 1.0 reward 40.0 scores differential 20.94
Time: 1 random_action_prob: 0.9999333333333333 reward 17.0 scores differential 20.62
Time: 37 random_action_prob: 0.9975333333333336 reward 36.0 scores differential 20.88
Time: 58 random_action_prob: 0.9961333333333338 reward 21.0 scores differential 20.98
Time: 74 random_action_prob: 0.9950666666666672 reward 16.0 scores differential 20.98
Time: 87 random_action_prob: 0.9942000000000006 reward 13.0 scores differential 20.99
Time: 133 random_action_prob: 0.9911333333333343 reward 46.0 scores differential 21.29
Time: 168 random_action_prob: 0.9888000000000012 reward 35.0 scores differential 21.45
Time: 196 random_action_prob: 0.9869333333333348 reward 28.0 scores differential 21.05
Time: 212 random_action_prob: 0.9858666666666682 reward 16.0 scores differential 21.1
Time: 229 random_action_prob: 0.984733333333335 reward 17.0 scores differential 21.1

Time: 2178 random_action_prob: 0.854800000000016 reward 59.0 scores differential 23.12
Time: 2234 random_action_prob: 0.8510666666666831 reward 56.0 scores differential 23.55
Time: 2267 random_action_prob: 0.8488666666666833 reward 33.0 scores differential 23.75
Time: 2281 random_action_prob: 0.8479333333333501 reward 14.0 scores differential 23.52
Time: 2314 random_action_prob: 0.8457333333333503 reward 33.0 scores differential 23.7
Time: 2337 random_action_prob: 0.8442000000000172 reward 23.0 scores differential 23.53
Time: 2361 random_action_prob: 0.8426000000000173 reward 24.0 scores differential 23.6
Time: 2411 random_action_prob: 0.8392666666666844 reward 50.0 scores differential 23.74
Time: 2440 random_action_prob: 0.8373333333333512 reward 29.0 scores differential 23.82
Time: 2465 random_action_prob: 0.8356666666666848 reward 25.0 scores differential 23.91
Time: 2491 random_action_prob: 0.8339333333333516 reward 26.0 scores differential 24.04
Time: 2502 random_action_prob: 0.83

Time: 7154 random_action_prob: 0.5230666666667192 reward 135.0 scores differential 51.44
Time: 7189 random_action_prob: 0.5207333333333861 reward 35.0 scores differential 51.66
Time: 7317 random_action_prob: 0.5122000000000537 reward 128.0 scores differential 52.67
Time: 7459 random_action_prob: 0.5027333333333881 reward 142.0 scores differential 53.92
Time: 7659 random_action_prob: 0.48940000000005623 reward 200.0 scores differential 55.58
Time: 7859 random_action_prob: 0.47606666666672437 reward 200.0 scores differential 57.4
Time: 8059 random_action_prob: 0.4627333333333925 reward 200.0 scores differential 58.81
Time: 8244 random_action_prob: 0.45040000000006053 reward 185.0 scores differential 60.1
Time: 8361 random_action_prob: 0.4426000000000614 reward 117.0 scores differential 60.94
Time: 8557 random_action_prob: 0.42953333333339616 reward 196.0 scores differential 62.76
Time: 8757 random_action_prob: 0.4162000000000643 reward 200.0 scores differential 64.43
Time: 8863 random_ac