# Tic Tac Toe
Ein simpler Test von RL mit einem 2D Environnement, wobei die States 2D sind. Das würde für unser Drawing System ein einfachreres System ermöglichen.

## Imports

In [15]:
import abc
import tensorflow as tf
import numpy as np
import random

from tf_agents.environments import py_environment
from tf_agents.environments import tf_environment
from tf_agents.environments import tf_py_environment
from tf_agents.environments import utils
from tf_agents.specs import array_spec
from tf_agents.environments import wrappers
from tf_agents.environments import suite_gym
from tf_agents.trajectories import time_step as ts
from tf_agents.agents.dqn import dqn_agent
from tf_agents.drivers import py_driver
from tf_agents.environments import suite_gym
from tf_agents.environments import tf_py_environment
from tf_agents.eval import metric_utils
from tf_agents.metrics import tf_metrics
from tf_agents.networks import sequential
from tf_agents.policies import py_tf_eager_policy
from tf_agents.policies import random_tf_policy
from tf_agents.replay_buffers import reverb_replay_buffer
from tf_agents.replay_buffers import reverb_utils
from tf_agents.trajectories import trajectory
from tf_agents.specs import tensor_spec
from tf_agents.utils import common
from tf_agents.environments import utils

import reverb

## Tic Tac Toe Environment

In [20]:
class Game(py_environment.PyEnvironment):
    def __init__(self):
        self._action_spec = array_spec.BoundedArraySpec(
        shape=(2,), dtype=np.int32, minimum=[0, 0], maximum=[2, 2], name='action')
        self._observation_spec = array_spec.BoundedArraySpec(
            shape=(3, 3), dtype=np.int32, minimum=[[0, 0, 0], [0, 0, 0], [0, 0, 0]], maximum=[[2, 2, 2], [2, 2, 2], [2, 2, 2]], name='observation')
        self._state = [[0, 0, 0], [0, 0, 0], [0, 0, 0]]
        self._episode_ended = False

    def action_spec(self):
        return self._action_spec

    def observation_spec(self):
        return self._observation_spec

    def _reset(self):
        self._state = [[0, 0, 0], [0, 0, 0], [0, 0, 0]]
        self._episode_ended = False
        return ts.restart(np.array([self._state], dtype=np.int32))

    def _step(self, action):

        if self._episode_ended:
            # The last action ended the episode. Ignore the current action and start
             # a new episode.
            return self.reset()

        reward = 0.0
        #
        # Tic Tac Toe Logic
        #
        
        if self._state[action[0]][action[1]] > 0:
            reward -= 1
        else:
            self._state[action[0]][action[1]] = 1
        
        full = True
        for i in self._state:
            for e in i:
                if e == 0:
                    full = False
        
        win = [False, False]
        for i in range(1, 3):
            # Row
            for e in self._state:
                if e[0] == i and e[1] == i and e[2] == i:
                    win[i-1] = True
            
            # Column
            for e in range(3):
                if self._state[0][e] == i and self._state[1][e] == i and self._state[2][e] == i:
                    win[i-1] = True
            
            # Diagonal
            if self._state[0][0] == i and self._state[1][1] == i and self._state[2][2] == i:
                win[i-1] = True
                
            # Anti Diagonal
            if self._state[0][2] == i and self._state[1][1] == i and self._state[2][0] == i:
                win[i-1] = True
        
        if full:
            self._episode_ended = True
        if win[0]:
            self._episode_ended = True
            reward += 20
        if win[1]:
            self._episode_ended = True
            reward -= 20
            
        if not self._episode_ended:
            while True:
                x = random.randint(0, 2)
                y = random.randint(0, 2)
                if self._state[y][x] == 0:
                    self._state[y][x] = 2
                    break
            
        return ts.termination(np.array([self._state], dtype=np.int32), reward)
    
    def render(self):
        for i in self._state:
            row = ""
            for e in i:
                row += str(e) + "|"
            print(row)
            print("-|-|-")

In [21]:
env_py = Game()
env = tf_py_environment.TFPyEnvironment(env_py)

In [31]:
env_py.render()

0|0|0|
-|-|-
0|0|0|
-|-|-
0|0|0|
-|-|-


In [27]:
env_py._step([2, 2])

TimeStep(
{'discount': array(0., dtype=float32),
 'observation': array([[[1, 0, 2],
        [0, 1, 0],
        [2, 0, 1]]], dtype=int32),
 'reward': array(20., dtype=float32),
 'step_type': array(2, dtype=int32)})

In [29]:
env_py._episode_ended

True

In [30]:
env_py.reset()

TimeStep(
{'discount': array(1., dtype=float32),
 'observation': array([[[0, 0, 0],
        [0, 0, 0],
        [0, 0, 0]]], dtype=int32),
 'reward': array(0., dtype=float32),
 'step_type': array(0, dtype=int32)})