# Tic Tac Toe
Ein simpler Test von RL mit einem 2D Environnement, wobei die States 2D sind. Das würde für unser Drawing System ein einfachreres System ermöglichen.

## Imports

In [1]:
import abc
import tensorflow as tf
import numpy as np
import random

from tf_agents.environments import py_environment
from tf_agents.environments import tf_environment
from tf_agents.environments import tf_py_environment
from tf_agents.environments import utils
from tf_agents.specs import array_spec
from tf_agents.environments import wrappers
from tf_agents.environments import suite_gym
from tf_agents.trajectories import time_step as ts
from tf_agents.agents.dqn import dqn_agent
from tf_agents.drivers import py_driver
from tf_agents.environments import suite_gym
from tf_agents.environments import tf_py_environment
from tf_agents.eval import metric_utils
from tf_agents.metrics import tf_metrics
from tf_agents.networks import sequential
from tf_agents.policies import py_tf_eager_policy
from tf_agents.policies import random_tf_policy
from tf_agents.replay_buffers import reverb_replay_buffer
from tf_agents.replay_buffers import reverb_utils
from tf_agents.trajectories import trajectory
from tf_agents.specs import tensor_spec
from tf_agents.utils import common
from tf_agents.environments import utils

import reverb

## Tic Tac Toe Environment

In [170]:
class Game(py_environment.PyEnvironment):
    def __init__(self):
        self._action_spec = array_spec.BoundedArraySpec(
        shape=(), dtype=np.int32, minimum=0, maximum=8, name='action')
        self._observation_spec = array_spec.BoundedArraySpec(
            shape=(9,), dtype=np.int32, name='observation')
        self._state = [0, 0, 0, 0, 0, 0, 0, 0, 0]
        self.status = np.array(self._state).reshape(3, 3)
        self._episode_ended = False

    def action_spec(self):
        return self._action_spec

    def observation_spec(self):
        return self._observation_spec

    def _reset(self):
        self._state = [0, 0, 0, 0, 0, 0, 0, 0, 0]
        self.status = np.array(self._state).reshape(3, 3)
        self._episode_ended = False
        return ts.restart(np.array(self._state, dtype=np.int32))

    def _step(self, action):

        self.status = np.array(self._state).reshape(3, 3)

        if self._episode_ended:
            # The last action ended the episode. Ignore the current action and start
             # a new episode.
            return self.reset()

        reward = 0.0
        #
        # Tic Tac Toe Logic
        #
        
        y = action // 3
        x = action % 3
        if self.status[y][x] > 0:
            reward -= 1
        else:
            self.status[y][x] = 1
        
        full = True
        for i in self.status:
            for e in i:
                if e == 0:
                    full = False
        
        win = [False, False]
        for i in range(1, 3):
            # Row
            for e in self.status:
                if e[0] == i and e[1] == i and e[2] == i:
                    win[i-1] = True
            
            # Column
            for e in range(3):
                if self.status[0][e] == i and self.status[1][e] == i and self.status[2][e] == i:
                    win[i-1] = True
            
            # Diagonal
            if self.status[0][0] == i and self.status[1][1] == i and self.status[2][2] == i:
                win[i-1] = True
                
            # Anti Diagonal
            if self.status[0][2] == i and self.status[1][1] == i and self.status[2][0] == i:
                win[i-1] = True
        
        if full:
            self._episode_ended = True
        if win[0]:
            self._episode_ended = True
            reward += 20
        if win[1]:
            self._episode_ended = True
            reward -= 20
        
        if self._episode_ended:
            return ts.termination(np.array(self._state, dtype=np.int32), reward)
            
        while True:
            x = random.randint(0, 2)
            y = random.randint(0, 2)
            if self.status[y][x] == 0:
                self.status[y][x] = 2
                break

        self._state = list(self.status.reshape(9,))
        
        return ts.transition(np.array(self._state, dtype=np.int32), reward=reward, discount=1.0)
    
    def render(self):
        for i in self.status:
            row = ""
            for e in i:
                row += str(e) + "|"
            print(row)
            print("-|-|-")

In [171]:
env_py = Game()
env = tf_py_environment.TFPyEnvironment(env_py)

In [172]:
env_py.render()

0|0|0|
-|-|-
0|0|0|
-|-|-
0|0|0|
-|-|-


In [173]:
timestep = env_py._step(2)
timestep

TimeStep(
{'discount': array(1., dtype=float32),
 'observation': array([2, 0, 1, 0, 0, 0, 0, 0, 0], dtype=int32),
 'reward': array(0., dtype=float32),
 'step_type': array(1, dtype=int32)})

In [174]:
timestep.is_last()

False

In [175]:
env_py._state

[2, 0, 1, 0, 0, 0, 0, 0, 0]

In [176]:
env_py.reset()

TimeStep(
{'discount': array(1., dtype=float32),
 'observation': array([0, 0, 0, 0, 0, 0, 0, 0, 0], dtype=int32),
 'reward': array(0., dtype=float32),
 'step_type': array(0, dtype=int32)})

### Testing Env Bounds

In [177]:
env.action_spec(), env.observation_spec(), env.time_step_spec()

(BoundedTensorSpec(shape=(), dtype=tf.int32, name='action', minimum=array(0, dtype=int32), maximum=array(8, dtype=int32)),
 BoundedTensorSpec(shape=(9,), dtype=tf.int32, name='observation', minimum=array(-2147483648, dtype=int32), maximum=array(2147483647, dtype=int32)),
 TimeStep(
 {'discount': BoundedTensorSpec(shape=(), dtype=tf.float32, name='discount', minimum=array(0., dtype=float32), maximum=array(1., dtype=float32)),
  'observation': BoundedTensorSpec(shape=(9,), dtype=tf.int32, name='observation', minimum=array(-2147483648, dtype=int32), maximum=array(2147483647, dtype=int32)),
  'reward': TensorSpec(shape=(), dtype=tf.float32, name='reward'),
  'step_type': TensorSpec(shape=(), dtype=tf.int32, name='step_type')}))

In [178]:
env.reset()

TimeStep(
{'discount': <tf.Tensor: shape=(1,), dtype=float32, numpy=array([1.], dtype=float32)>,
 'observation': <tf.Tensor: shape=(1, 9), dtype=int32, numpy=array([[0, 0, 0, 0, 0, 0, 0, 0, 0]], dtype=int32)>,
 'reward': <tf.Tensor: shape=(1,), dtype=float32, numpy=array([0.], dtype=float32)>,
 'step_type': <tf.Tensor: shape=(1,), dtype=int32, numpy=array([0], dtype=int32)>})

In [179]:
env._step(6)

TimeStep(
{'discount': <tf.Tensor: shape=(1,), dtype=float32, numpy=array([1.], dtype=float32)>,
 'observation': <tf.Tensor: shape=(1, 9), dtype=int32, numpy=array([[0, 0, 0, 0, 2, 0, 1, 0, 0]], dtype=int32)>,
 'reward': <tf.Tensor: shape=(1,), dtype=float32, numpy=array([0.], dtype=float32)>,
 'step_type': <tf.Tensor: shape=(1,), dtype=int32, numpy=array([1], dtype=int32)>})

In [180]:
env.time_step_spec()

TimeStep(
{'discount': BoundedTensorSpec(shape=(), dtype=tf.float32, name='discount', minimum=array(0., dtype=float32), maximum=array(1., dtype=float32)),
 'observation': BoundedTensorSpec(shape=(9,), dtype=tf.int32, name='observation', minimum=array(-2147483648, dtype=int32), maximum=array(2147483647, dtype=int32)),
 'reward': TensorSpec(shape=(), dtype=tf.float32, name='reward'),
 'step_type': TensorSpec(shape=(), dtype=tf.int32, name='step_type')})

## Model and Agent Creation

In [181]:
fc_layer_params = (100, 50) # Die Anzahl der Dense Units in einem Layer
action_tensor_spec = tensor_spec.from_spec(env.action_spec())
num_actions = action_tensor_spec.maximum - action_tensor_spec.minimum + 1
print("Num Actions: ", num_actions)

# Helper function um die Dense Layer zu kreeieren.
def dense_layer(num_units):
  return tf.keras.layers.Dense(
      num_units,
      activation=tf.keras.activations.relu,
      kernel_initializer=tf.keras.initializers.VarianceScaling(
          scale=2.0, mode='fan_in', distribution='truncated_normal'))

# Create the dense layer array
dense_layers = [dense_layer(num_units) for num_units in fc_layer_params]

# Create the output layer
q_values_layer = tf.keras.layers.Dense(
    num_actions,
    activation=None,
    kernel_initializer=tf.keras.initializers.RandomUniform(
        minval=-0.03, maxval=0.03),
    bias_initializer=tf.keras.initializers.Constant(-0.2))

# Sequentialize the layers into a tensorflow model
q_net = sequential.Sequential(dense_layers + [q_values_layer])

Num Actions:  9


In [182]:
optimizer = tf.keras.optimizers.Adam(learning_rate=1e-3)

train_step_counter = tf.Variable(0)

agent = dqn_agent.DqnAgent(
    env.time_step_spec(),
    env.action_spec(),
    q_network=q_net,
    optimizer=optimizer,
    td_errors_loss_fn=common.element_wise_squared_loss,
    train_step_counter=train_step_counter)

agent.initialize()

In [183]:
def compute_avg_return(environment, policy, num_episodes=10):
    total_return = 0.0
    for _ in range(num_episodes):

        time_step = environment.reset()
        episode_return = 0.0

        while not time_step.is_last():
            action_step = policy.action(time_step)
            time_step = environment.step(action_step.action)
            episode_return += time_step.reward
            total_return += episode_return

    avg_return = total_return / num_episodes
    return avg_return.numpy()[0]

In [187]:
compute_avg_return(env,
                   random_tf_policy.RandomTFPolicy(env.time_step_spec(),
                                                   env.action_spec()),
                   1000)

-18.72

In [188]:
table_name = 'uniform_table'
replay_buffer_signature = tensor_spec.from_spec(
      agent.collect_data_spec)
replay_buffer_signature = tensor_spec.add_outer_dim(
    replay_buffer_signature)

table = reverb.Table(
    table_name,
    max_size=100000,
    sampler=reverb.selectors.Uniform(),
    remover=reverb.selectors.Fifo(),
    rate_limiter=reverb.rate_limiters.MinSize(1),
    signature=replay_buffer_signature)

reverb_server = reverb.Server([table])

replay_buffer = reverb_replay_buffer.ReverbReplayBuffer(
    agent.collect_data_spec,
    table_name=table_name,
    sequence_length=2,
    local_server=reverb_server)

rb_observer = reverb_utils.ReverbAddTrajectoryObserver(
    replay_buffer.py_client,
    table_name,
    sequence_length=2)

[reverb/cc/platform/tfrecord_checkpointer.cc:150]  Initializing TFRecordCheckpointer in /tmp/tmp5p0xhj0_.
[reverb/cc/platform/tfrecord_checkpointer.cc:386] Loading latest checkpoint from /tmp/tmp5p0xhj0_
[reverb/cc/platform/default/server.cc:71] Started replay server on port 18906


In [192]:
dataset = replay_buffer.as_dataset(
    num_parallel_calls=3,
    sample_batch_size=64,
    num_steps=2).prefetch(3)
iterator = iter(dataset)

## AI Training

In [189]:
agent.train = common.function(agent.train)
agent.train_step_counter.assign(0)

<tf.Variable 'UnreadVariable' shape=() dtype=int32, numpy=0>

In [190]:
time_step = env_py.reset()

In [191]:
# Create a driver to collect experience.
collect_driver = py_driver.PyDriver(
    env_py,
    py_tf_eager_policy.PyTFEagerPolicy(
        agent.collect_policy, use_tf_function=True),
    [rb_observer],
    max_steps=1)

In [193]:
returns = []
for _ in range(1):
    
  # Collect a few steps and save to the replay buffer.
  time_step, _ = collect_driver.run(time_step)

  # Sample a batch of data from the buffer and update the agent's network.
  experience, unused_info = next(iterator)
  train_loss = agent.train(experience).loss

  step = agent.train_step_counter.numpy()

  if step % 10 == 0:
    print('step = {0}: loss = {1}'.format(step, train_loss))

  if step % 10 == 0:
    avg_return = compute_avg_return(env, agent.policy, 100)
    print('step = {0}: Average Return = {1}'.format(step, avg_return))
    returns.append(avg_return)

[reverb/cc/client.cc:165] Sampler and server are owned by the same process (11220) so Table uniform_table is accessed directly without gRPC.
[reverb/cc/client.cc:165] Sampler and server are owned by the same process (11220) so Table uniform_table is accessed directly without gRPC.
[reverb/cc/client.cc:165] Sampler and server are owned by the same process (11220) so Table uniform_table is accessed directly without gRPC.
[reverb/cc/client.cc:165] Sampler and server are owned by the same process (11220) so Table uniform_table is accessed directly without gRPC.
[reverb/cc/client.cc:165] Sampler and server are owned by the same process (11220) so Table uniform_table is accessed directly without gRPC.
[reverb/cc/client.cc:165] Sampler and server are owned by the same process (11220) so Table uniform_table is accessed directly without gRPC.
