In [62]:
!pip install tf-agents[reverb]



In [63]:
import abc
import tensorflow as tf
import numpy as np

from tf_agents.environments import py_environment
from tf_agents.environments import tf_environment
from tf_agents.environments import tf_py_environment
from tf_agents.environments import utils
from tf_agents.environments import wrappers
from tf_agents.environments import suite_gym
from tf_agents.trajectories import time_step as ts
from tf_agents.trajectories import trajectory     # 1월 12일
from tf_agents.specs import array_spec # --> obs_space, action_space

In [64]:
# 직접 환경 클래스 만들기
# ITA_RL_WJK_lecture_98slides 12page MDP
class RecyclerEnv(py_environment.PyEnvironment):
  def __init__(self):
    self._action_spec = array_spec.BoundedArraySpec(
        shape=(), dtype=np.int32, minimum=0, maximum=2, name='action'
    )
    # shape=() 스칼라 값 하나
    # 정수 타입
    # 최소 0; 최대 2; 0-search, 1-wait, 2-rescue
    # full observability일 땐 agent가 state 전체를 봅니다
    # 즉, state가 observation
    self._state_spec = array_spec.BoundedArraySpec(
        shape=(1,), dtype=np.int32, minimum=0, maximum=1, name='observation'
    )
    self._state = 1
    self._done = False
    self._gamma = 0.99
    self._alpha = 0.7
    self._beta = 0.95
    # 상태가 LOW인데 critical번 search를 나가면 종료(done=True)
    self._low = 0
    self._critical = 5

    self._reward = 0    # cans
    return

  def action_spec(self):
    return self._action_spec

  def observation_spec(self):
    return self._state_spec

  def _reset(self):
    self._state = 1     # 1=충전된 채로 시작
                        # 확률적으로 0 또는 1을 반환해도 됨
    self._done = False
    self._low = 0
    self._reward = 0
    return ts.restart(np.array([self._state], dtype=np.int32))

  # 이론적으론 step 함수만 있어도 됨
  # 위의 다른 함수들은 코드에서 손실/기울기 등 계산을 하기 위해
  # 또는 action/state 차원(모양)을 알기 위해-->신경망 입출력단 설계에 쓰임
  def _step(self, action):
    if self._reward > 100:
      self._done = True
    if self._done:
      return self._reset()

    # action마다 무슨 일이 일어날지
    if action == 0:   # search
      # 확률에 따른 상태 이동 구현 방법
      # self._state = np.random.choice(2, 1, p=[self._beta, 1-self._beta])
      # 2: 0 또는 1이 출력됨
      # 1: 난수 몇개 생성할지
      # p: 0이 나올 확률, 1이 나올 확률
      p = np.random.rand()    # uniform distribution[0, 1]
      if self._state == 0:    # low에서 search
        self._low += 1
        if (self._low > self._critical): # low에서 search를 너무 많이 나갔을 때
          self._done = True
          reward = -10.0
          self._reward += reward   # 1월 12일
          return ts.termination(np.array([self._state], dtype=np.int32), reward)
        if p < self._beta:    # low에서 search (beta)
          self._state = 0
          reward = 1
        else:                 # low에서 search (1-beta)
          self._state = 1
          reward = -3
      else:                   # high에서 search
        #self._state = np.random.choice(2, 1, p=[1.0 - self._alpha, self._alpha])
        #reward = np.random.choice([1, 1], p=[1.0-self._alpha, self._alpha])
        sp_and_r = np.array([[0, 1], [1, 1]])
        idx = np.random.choice(len(sp_and_r),
                              p=[1.0-self._alpha,self._alpha])
        self._state, reward = sp_and_r[idx][0], sp_and_r[idx][1]
      # search했을 때 함수 호출 한번으로 상태 이동 정의
      self._reward += reward       # 1월 12일
      return ts.transition(np.array([self._state], dtype=np.int32),
                           reward, discount=self._gamma)
    elif action == 1: # wait
      if self._state == 0:
        self._state = 0
        reward = -0.1
      else:
        self._state = 0
        reward = -0.1
      self._reward += reward       # 1월 12일
      return ts.transition(np.array([self._state], dtype=np.int32),
                    reward, discount=self._gamma)
    else:             # recharge
      self._done = False
      if self._state == 0:
        self._state = 1

      reward = 0.0
      # transition(상태 이동)
      self._reward += reward       # 1월 12일
      return ts.transition(
          np.array([self._state], dtype=np.int32), reward, discount=self._gamma)

In [65]:
_env = RecyclerEnv()

In [66]:
utils.validate_py_environment(_env, episodes=5)

In [67]:
recycler_env = tf_py_environment.TFPyEnvironment(_env)

In [68]:
# TF-agents DQN을 가져와서 위 환경에서 학습
from tf_agents.agents.dqn import dqn_agent
from tf_agents.eval import metric_utils
from tf_agents.metrics import tf_metrics
from tf_agents.networks import sequential
from tf_agents.policies import random_tf_policy
from tf_agents.policies import py_tf_eager_policy
from tf_agents.replay_buffers import reverb_replay_buffer
from tf_agents.replay_buffers import reverb_utils
from tf_agents.specs import tensor_spec
from tf_agents.utils import common

In [69]:
# 학습 파라미터
# 강화학습 특정 하이퍼파라미터
num_iter = 10000
random_episodes = 50
collect_per_iteration = 1
replay_buffer_size = 100
n_step_update = 2

# 강화학습 외의 구성 요소 (딥러닝) 하이퍼파라미터
batch_size = 33
learning_rate = 0.001
log_interval = 200

num_eval_episodes = 10      # evaluation을 몇 episode에 걸쳐서 할지
eval_interval = 100         # evaluation 몇 에피소드마다 할지

In [70]:
# TF-agents DQN
fc_layer_params = (16, 32, 8)        # Dense layer 폭들의 어레이

action_tensor_spec = tensor_spec.from_spec(recycler_env.action_spec())
num_actions = action_tensor_spec.maximum - action_tensor_spec.minimum + 1
# num_actions # search, wait, rescue

def dense_layer(num_units):
  return tf.keras.layers.Dense(
      num_units,
      activation="relu")

dense_layers = [dense_layer(num_units) for num_units in fc_layer_params]
q_values_layer = tf.keras.layers.Dense(
    num_actions,
    activation=None)

deepqn = sequential.Sequential(dense_layers + [q_values_layer])
# keras Sequential은 인자로 리스트를 받습니다
# [list1] + [list2] = [list1, list2]
# 리스트의 원소는 각각 layer [Dense(), Dense(), Dense(), Dense(num_actions)]

In [71]:
optimizer = tf.keras.optimizers.Adam(learning_rate=learning_rate)
train_step = tf.Variable(0)   # 지금 몇번째 스텝인지 추적하기 위한 0


### TF-Agents DQN ###
agent = dqn_agent.DqnAgent(
    recycler_env.time_step_spec(),
    recycler_env.action_spec(),
    q_network=deepqn,
    optimizer=optimizer,
    n_step_update=n_step_update,
    td_errors_loss_fn=common.element_wise_squared_loss,
    train_step_counter= train_step)

agent.initialize()

In [72]:
# 할일
# Replay Buffer
# Average Return 계산
# PyDriver로 루프 --> for loop 돌기에 최적화된 클래스
# 데이터 축적
def avg_return(env, policy, num_episodes=10):
  # print(env.__dir__())
  # print(type(env))
  # print(env._reset().is_last())
  total_return = 0.0 # 누적 합
  time_step = env._reset()
  # done = env._done
  for _ in range(num_episodes):
    time_step = env._reset()
    episode_return = 0.0

    #while not time_step.is_last(): # 이 에피소드의 마지막 타임스텝까지
    for _ in range(10):

      action_step = policy.action(time_step)
      time_step = env._step(action_step.action)
      episode_return += time_step.reward # 보상들의 합
      # done = env._done
    total_return += episode_return
    time_step = env._reset()
  avg = total_return / num_episodes
  return avg.numpy()[0]

In [73]:
# 1월 12일
from tf_agents.replay_buffers import tf_uniform_replay_buffer

In [74]:
replay_buffer = tf_uniform_replay_buffer.TFUniformReplayBuffer(
    data_spec=agent.collect_data_spec,
    batch_size=recycler_env.batch_size,
    max_length=replay_buffer_size)

random_policy = random_tf_policy.RandomTFPolicy(recycler_env.time_step_spec(),
                                                recycler_env.action_spec())

def collect_step(environment, policy):
  time_step = environment.current_time_step()
  action_step = policy.action(time_step)
  next_time_step = environment.step(action_step.action)
  traj = trajectory.from_transition(time_step, action_step, next_time_step)

  # Add trajectory to the replay buffer
  replay_buffer.add_batch(traj)

for _ in range(random_episodes):
  collect_step(recycler_env, random_policy)

In [77]:
dataset = replay_buffer.as_dataset(
    num_parallel_calls=3, sample_batch_size=batch_size,
    num_steps=n_step_update+1).prefetch(3)

iterator = iter(dataset) # 버퍼에서 샘플링한 경험을 순회하는 객체

Instructions for updating:
Use `tf.data.Dataset.counter(...)` instead.
Instructions for updating:
Use `as_dataset(..., single_deterministic_pass=False) instead.


In [78]:
agent.train_step_counter.assign(0)

avg_return_value = avg_return(recycler_env, agent.policy, num_eval_episodes)
returns = [avg_return_value]

for _ in range(num_iter):

  # Collect a few steps using collect_policy and save to the replay buffer.
  for _ in range(collect_per_iteration):
    collect_step(recycler_env, agent.collect_policy)

  # Sample a batch of data from the buffer and update the agent's network.
  experience, unused_info = next(iterator) # 그 다음 경험에 대한 학습
  train_loss = agent.train(experience) # (s, a, r, s')

  step = agent.train_step_counter.numpy()

  if step % log_interval == 0:
    print('step = {0}: loss = {1}'.format(step, train_loss.loss))

  if step % eval_interval == 0:
    avg_return_value = avg_return(recycler_env, agent.policy, num_eval_episodes)
    print('step = {0}: Average Return = {1:.2f}'.format(step, avg_return_value))
    returns.append(avg_return_value)

Instructions for updating:
back_prop=False is deprecated. Consider using tf.stop_gradient instead.
Instead of:
results = tf.foldr(fn, elems, back_prop=False)
Use:
results = tf.nest.map_structure(tf.stop_gradient, tf.foldr(fn, elems))


step = 100: Average Return = 2.60
step = 200: loss = 20.205564498901367
step = 200: Average Return = 7.70
step = 300: Average Return = 8.50
step = 400: loss = 42.7346076965332
step = 400: Average Return = 0.00
step = 500: Average Return = 0.00
step = 600: loss = 6.958929538726807
step = 600: Average Return = 0.00
step = 700: Average Return = 7.80
step = 800: loss = 7.691587448120117
step = 800: Average Return = 8.10
step = 900: Average Return = 7.50
step = 1000: loss = 1.3839343786239624
step = 1000: Average Return = -0.50
step = 1100: Average Return = 7.70
step = 1200: loss = 0.6158702969551086
step = 1200: Average Return = 7.60
step = 1300: Average Return = 7.80
step = 1400: loss = 0.24681100249290466
step = 1400: Average Return = 7.70
step = 1500: Average Return = -0.50
step = 1600: loss = 0.20735685527324677
step = 1600: Average Return = 7.70
step = 1700: Average Return = 8.10
step = 1800: loss = 0.34770673513412476
step = 1800: Average Return = 8.20
step = 1900: Average Return = 8

KeyboardInterrupt: ignored