<a href="https://colab.research.google.com/github/HEP-Dexan3327/AI-Diary-3-ReinforcementLearning/blob/main/code/ai2048.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [None]:
!wget https://github.com/korakot/kora/releases/download/v0.10/py310.sh
!bash ./py310.sh -b -f -p /usr/local
!python -m ipykernel install --name "py310" --user

--2023-01-10 12:07:34--  https://github.com/korakot/kora/releases/download/v0.10/py310.sh
Resolving github.com (github.com)... 140.82.113.3
Connecting to github.com (github.com)|140.82.113.3|:443... connected.
HTTP request sent, awaiting response... 302 Found
Location: https://objects.githubusercontent.com/github-production-release-asset-2e65be/266951884/0d0623be-3dec-4820-9e7b-69a3a5a75ef7?X-Amz-Algorithm=AWS4-HMAC-SHA256&X-Amz-Credential=AKIAIWNJYAX4CSVEH53A%2F20230110%2Fus-east-1%2Fs3%2Faws4_request&X-Amz-Date=20230110T120734Z&X-Amz-Expires=300&X-Amz-Signature=4a4b70e2b52090e2eba41184764e19552482f1e37fa2c280023510ad98958aa2&X-Amz-SignedHeaders=host&actor_id=0&key_id=0&repo_id=266951884&response-content-disposition=attachment%3B%20filename%3Dpy310.sh&response-content-type=application%2Foctet-stream [following]
--2023-01-10 12:07:34--  https://objects.githubusercontent.com/github-production-release-asset-2e65be/266951884/0d0623be-3dec-4820-9e7b-69a3a5a75ef7?X-Amz-Algorithm=AWS4-HMAC-S

In [None]:
!python --version

Python 3.10.6


In [None]:
!pip install pygame

Looking in indexes: https://pypi.org/simple, https://us-python.pkg.dev/colab-wheels/public/simple/
Collecting pygame
  Using cached pygame-2.1.2-cp310-cp310-manylinux_2_17_x86_64.manylinux2014_x86_64.whl (21.9 MB)
Installing collected packages: pygame
Successfully installed pygame-2.1.2
[0m

In [50]:
import time

import gym
from gym import spaces
import numpy as np


"Arrows Key to play, Ctrl + Z to undo. You can only undo once consecutively."


class Typical2048Env(gym.Env):
    metadata = {"render_modes": ["ai", "human", "rgb_array"], "render_fps": 20, "window_size": 16}

    def __init__(self, render_mode=None, size=4, window_size=16):
        self._grid = None
        self._last_grid = None
        self._merged = None
        self._epoch = 0

        self.size = size  # The size of the square grid
        self.ws = window_size
        self.window_size = 512 * window_size / 16  # The size of the PyGame window
        self.bar_size = 100 * window_size / 16
        self.bar = np.array((0, self.bar_size))

        self.prob = (.9, .1)  # (.9, .1)
        self.action = -1

        self.reward_list_length = 1
        self.score = 0
        self.undo_score = 0
        self.reward = 0
        self.rewards = [0,]
        self.max_score = 0
        self.undo_unused = True
        self.punishment = -50

        self.available_dirs = np.array([True, True, True, True])

        # Observations are 16-element lists, storing the numbers at each cell.
        # There are 16 possible numbers, ranging from 2**1 to 2**16.
        # The id 0 is reserved for EMPTY cell.
        self.observation_space = spaces.Box(0, size * size, shape=(size * size,), dtype=int)

        # We have 4 actions, corresponding to "right", "down", "left", "up" and "undo"
        self.action_space = spaces.Discrete(4)

        """
        The following dictionary maps abstract actions from `self.action_space` to 
        the direction we will walk in if that action is taken.
        I.e. 0 corresponds to "right", 1 to "down" etc.
        """
        self._action_to_direction = {
            0: np.array([1, 1]),  # right  1st axis
            1: np.array([0, 1]),  # down   0th axis
            2: np.array([1, -1]),  # left   1st axis
            3: np.array([0, -1]),  # up     0th axis
        }

        assert render_mode is None or render_mode in self.metadata["render_modes"]
        self.render_mode = render_mode
        self.ai = render_mode == 'ai'
        if self.ai:
            self.render_mode = 'human'

        """
        If human-rendering is used, `self.window` will be a reference
        to the window that we draw to. `self.clock` will be a clock that is used
        to ensure that the environment is rendered at the correct framerate in
        human-mode. They will remain `None` until human-mode is used for the
        first time.
        """
        self.window = None
        self.clock = None

    def _get_obs(self):
        return self._grid

    def _get_info(self):
        return {
            "highTile": max(self._grid),
            "score": self.score,
            "available_dir": self.available_dirs
        }

    def reset(self, seed=None, options=None):
        # We need the following line to seed self.np_random
        super().reset()
        # self.np_random.integers
        self._epoch += 1

        self.max_score = max(self.max_score, self.score)

        self.punishment = -50
        self.action = -1
        self.score = 0
        self.reward = 0
        self.undo_score = 0
        self.rewards = [0,]
        self.available_dirs = np.array([True, True, True, True])
        self.undo_unused = True
        # Spawn the grid with 2 random tiles (2 or 4, i.e. code = 1 or 2)
        self._grid = np.random.permutation((0,) * (self.size ** 2 - 2) +
                                           tuple(np.random.choice((1, 2), size=(2,), p=self.prob)))
        self._last_grid = self._grid.copy()
        self._merged = np.zeros((self.size**2,))

        observation = self._get_obs()
        info = self._get_info()

        if self.render_mode == "human":
            self._render_frame()

        return observation, info

    def _move_row(self, p: np.ndarray, m: np.ndarray, direction: int, do_reward=False):
        """
        :param p: an ndarray row representing the numbers in a row (or column)
        :param m: an ndarray row representing whether the number is JUST merged.
        :param direction: an integer signifying whether to slide in the POSITIVE (+1) or NEGATIVE (-1) direction.
        :param do_reward: an bool signifying if we update the reward value.
        """
        out = np.zeros_like(p)
        mout = np.zeros_like(m)
        last = 0
        lastidx = 0
        direction *= -1
        # if direction is -1, i.e., towards the LEFT, then we DON'T need to reverse the array.
        # similarly, we NEED to reverse the row if direction is 1.

        idx = -1
        for i, e in enumerate(p[::direction]):
            if e == 0:
                continue
            if e != last or m[i] != 0 or m[lastidx] != 0:  # either not equal, or one of them is used.
                idx += 1
                out[idx] = e
                mout[idx] = m[i]
                last = int(e)
            else:
                out[idx] = last+1   # merge tiles
                mout[idx] = 1
                if do_reward:
                    self.reward += 2 ** (last + 1)
                    self.score += 2 ** (last + 1)
                last = 0
        m[:] = mout
        return not np.all(p-out[::direction] == 0), out[::direction]

    def _move_tiles(self, grid: np.ndarray, merge_grid: np.ndarray, action=0, do_reward=False) -> bool:
        g = grid.reshape((self.size, self.size))
        mg = merge_grid.reshape((self.size, self.size))
        # Map the action (element of {0,1,2,3}) to the direction we walk in
        direction = self._action_to_direction[action]
        # direction[0]: axis.
        # direction[1]: +/- value for that axis.
        OUT = False
        for i in range(self.size):
            p = g[:, i] if direction[0] else g[i, :]
            m = mg[:, i] if direction[0] else mg[i, :]
            out, p[:] = self._move_row(p, m, direction[1], do_reward=do_reward)
            if out:
                OUT = True
                if not do_reward:
                    return True

        grid[:] = g.reshape((-1,))
        return OUT

    def is_full(self) -> bool:
        ar = [self._move_tiles(self._grid.copy(), self._merged.copy(), action=action) for action in range(4)]
        self.available_dirs = np.array(ar)# + [self.undo_unused])
        return not (np.any(self.available_dirs))

    def _spawn_tile(self):
        empty_tiles = self._grid[self._grid == 0]
        self._grid[self._grid == 0] = np.random.permutation((0,)*(len(empty_tiles)-1) +
                                                            tuple(np.random.choice((1, 2), size=(1,), p=self.prob)))

    def step(self, action):
        self.reward = 0
        self.action = action
        if action == 4:
            if self.undo_unused:
                self._grid[:] = self._last_grid.copy()
                self.score = self.undo_score
                self.undo_unused = False
                self.reward = self.punishment / 10
            else:
                self.reward = self.punishment
                self.punishment -= 50
        elif action is not None:
            self._last_grid = self._grid.copy()
            self.undo_score = self.score
            self.undo_unused = True
            if not self.is_full():
                moved = 0
                while self._move_tiles(self._grid, self._merged, action=action, do_reward=True):
                    moved = 1
                self._merged = np.zeros_like(self._merged)
                if moved:
                    self._spawn_tile()
                else:
                    self.reward = 0
        # An episode is done iff the agent has reached the target
        terminated = self.is_full()
        if terminated:
            self.reward -= 10
        self.rewards += [self.reward]
        if len(self.rewards) > self.reward_list_length:
            self.rewards = self.rewards[1:]
        observation = self._get_obs()
        info = self._get_info()

        if self.render_mode == "human":
            self._render_frame()

        return observation, self.reward, terminated, False, info

    def _get_color(self, tile: int):
        colors = (
            (237, 228, 218),  # 2,      1
            (236, 223, 199),  # 4,      2
            (243, 177, 121),  # 8,      3
            (245, 149, 99),   # 16,     4
            (245, 124, 97),   # 32,     5
            (237, 87, 55),    # 64,     6
            (236, 206, 113),  # 128,    7
            (237, 204, 98),   # 256,    8
            (236, 199, 80),   # 512,    9
            (236, 197, 64),   # 1024,   10
            (236, 197, 1),    # 2048,   11
            (94, 220, 151),   # 4096,   12
            (236, 77, 88),    # 8192,   13
            (37, 186, 99),    # 16384,  14
            (0, 124, 189),    # 32768,  15
            (0, 0, 0)         # 65536,  16
        )
        return colors[tile-1]

    def render(self):
        if self.render_mode == "rgb_array":
            return self._render_frame()

    def _render_frame(self):
        pass

    def close(self):
        pass


In [32]:
def check(func):
        def inner(*args, **kwargs):
            if not Output.output:
                return
            return func(*args, **kwargs)
        return inner

In [34]:
import gc
import os
import sys

import numpy as np
import pandas as pd
import matplotlib
import matplotlib.pyplot as plt
from itertools import filterfalse


class Output:
    has_epsilon = False

    def __init__(self, file_input: str, mode: str = 'results', output_every_n=100, random_overlay=False):
        """ mode: either 'results' or 'model'.
            ... -o '<path>/<file>.txt'  # outputs experiment data to txt format
            ... -o '<path>/<file>.csv'  # outputs experiment data to csv format
            ... -o '<path>/<file>.png'  # outputs experiment data chart (provided matplotlib) to png or jpg format
            ... -o '<path>/<file>.h5'   # when mode == 'model', outputs neural network model here.
            ... -o '<path>'             # when there is no file extension (.xx),
                                        # a .txt file, a .csv file and (if mode == 'model')
                                            a list of .h5 file checkpoints are stored in this folder.
            ... -o None
             or -o 'None'               # show plotted chart on screen

            # by default, there will be printing onto the console. there is no way to turn it off.
            """
        self.mode = mode
        Output.output = (file_input is not None)
        if not self.output:
            return

        file = file_input.rsplit(".", 1)
        file[0] = "./" + file[0]
        self.files = {}
        self.output_every_n = output_every_n
        self.random_overlay = random_overlay
        self.df = pd.DataFrame({'episode': [],
                                'step': [],
                                'score': [], 'epsilon': [], 'good': [], 'best': []})
        if file_input.endswith(".txt"):
            self.files.update({"txt": file[0].rsplit('/', 1)[1]})
        elif file_input.endswith(".csv"):
            self.files.update({"csv": file[0].rsplit('/', 1)[1]})
        elif file_input.endswith(".png"):
            self.files.update({"png": file[0].rsplit('/', 1)[1]})
        elif file_input.endswith(".jpg"):
            self.files.update({"jpg": file[0].rsplit('/', 1)[1]})
        elif file_input.endswith(".h5") and mode == 'model':
            self.save_to_single = True
            self.files.update({"h5": file[0].rsplit('/', 1)[1]})
        elif file_input.endswith(".h5s") and mode == 'model':
            self.save_to_single = False
            self.files.update({"h5": file[0].rsplit('/', 1)[1]})
        else:
            file[0] = file[0][2:] if file[0].startswith("./") else file[0]
            if "." not in file_input: # is a directory
                self.files.update({"txt": "data", "csv": "data", "png": "evaluation_graph"})
                if mode == 'model':
                    self.files.update({"h5": "model"})
                file = [file_input+"/"]
            else:
                print(f"File format {file[1]} not supported. "
                      f"Train data and files (if any) is now saved to the directory {file[0]}.")
            try:
                os.mkdir(file[0])
            except Exception as e:
                print(e)
        file[0] = file[0][2:] if file[0].startswith("./") else file[0]
        file_input = file[0].rsplit("/", 1) if "/" in file[0] else (".", file[0])
        # ignore file format if the format is not supported
        self.dir = file_input[0]
        print(f"The following items will be outputted in folder {self.dir}:\n"
              + '\n'.join(map(lambda x: '\t- '+x[1]+'.'+x[0], self.files.items())))

        self.actions = {"txt": self.output_txt, "csv": self.output_csv, "png": self.output_img,
                        "jpg": self.output_img, "h5": self.output_model}
        self.actions = dict(list(filterfalse(lambda x: x[0] not in self.files, self.actions.items())))

        matplotlib.use('Agg')


    @check
    def output_txt(self, output="", **kwargs):
        with open(f"{self.dir}/{self.files['txt']}.txt", "a+") as f:
            f.write(output + "\n")

    @check
    def output_csv(self, episode=0, step=0, info=None, **kwargs):
        self.df.to_csv(f"{self.dir}/{self.files['csv']}.csv")

    @check
    def output_img(self, episode=0, output_every_n=1, do_output=True, **kwargs):
        if self.mode != 'model' and episode >= 0 or not do_output:
            return
        """
        # there is some issue with plt and pygame.
        # plt resizes the pygame window by ignoring Window's screen resize ratio (200% in my display)
        # and thus shrinks the window size of pygame after every plt call.
        # Currently, I cannot find any resources online that could help me solve this problem, so
        # this would NOT be implemented directly. Only the last data.png is saved.
        """
        if episode % output_every_n != output_every_n-1 and episode >= 0:
            return
        mode = 'png' if 'png' in self.files else 'jpg'

        fig, axs = plt.subplots(nrows=3 if self.has_epsilon else 2, ncols=2,
                                figsize=(8, 9 if self.has_epsilon else 6), num=1, clear=True)
        fig.tight_layout()
        self.df = self.df.sort_values(by='episode')
        self.df.set_index('episode')
        rolling = self.df.rolling(10, on='episode').mean()

        # 'best' is used as 'random_score' when self.random_overlay is True.
        axs[0, 0].plot(rolling['score'].dropna())
        try:
            if self.random_overlay:
                axs[0, 0].plot(rolling['best'], alpha=.5)
                axs[0, 0].legend(['score', 'random'])
            else:
                axs[0, 0].plot(self.df['good'].dropna())
                axs[0, 0].plot(self.df['best'].dropna())
                axs[0, 0].legend(['score', 'good', 'best'])
        except KeyError:
            pass
        magicmax = max(5000, (self.df['score'].max()//1000+1) * 1000)
        self.df['score'].hist(ax=axs[0, 1], range=[0, magicmax], bins=100)
        if self.random_overlay:
            self.df['best'].hist(ax=axs[0, 1], range=[0, magicmax], bins=100, alpha=0.5)
        axs[0, 1].set_xlabel("score")
        axs[0, 1].set_ylabel("frequency")

        # 'good' is used as 'random_step' when self.random_overlay is True.
        axs[1, 0].plot(rolling['step'].dropna())
        if self.random_overlay:
            axs[1, 0].plot(rolling['good'], alpha=.5)
            axs[1, 0].legend(['step', 'random'])
        self.df['step'].hist(ax=axs[1, 1], range=[-5, 2000], bins=100)
        if self.random_overlay:
            self.df['good'].hist(ax=axs[1, 1], range=[-5, 2000], bins=100, alpha=0.5)
        axs[1, 1].set_xlabel("step")
        axs[1, 1].set_ylabel("frequency")
        if self.has_epsilon:
            axs[2, 0].set_ylim([0, 1])
            for i in range(15, -1, -1):
                self.df[f'epsilon{i}'].plot(ax=axs[2, 0], label=f"{2**i}")
                self.df[f'epsilon{i}'].hist(ax=axs[2, 1], range=[0, 1], bins=100, label=f"{2**i}", alpha=.5)
            axs[2, 1].set_xlabel("epsilon")
            axs[2, 1].set_ylabel("frequency")
            axs[2, 1].legend()
        del rolling

        plt.savefig(f"{self.dir}/{self.files[mode]}.{mode}")
        plt.close(fig)
        plt.close("all")
        gc.collect()

    @check
    def output_model(self, episode=0, model=None, output_every_n=1, **kwargs):
        if episode % output_every_n != output_every_n-1 and episode >= 0:
            return
        if episode == -1:
            filename = f"{self.dir}/{self.files['h5']}_best.h5"
        elif episode < 0:
            filename = f"{self.dir}/{self.files['h5']}_best_replacedAt{-episode}.h5"
        else:
            filename = f"{self.dir}/{self.files['h5']}_epoch{episode}.h5"
        model.save(filename)

    def log(self, done, episode, step, info, model=None, do_output=True, epsilon: list = [-1.], training=False,
            best: int = None):
        if done:
            output = f"Episode {episode} succeeded in {step} steps with score {info['score']}... epsilon {epsilon} ... training {training}"
        else:
            output = f"Episode {episode} truncated ... in {step} steps with score {info['score']} ... epsilon {epsilon}"

        print(output)

        if not self.output:
            return
        self.has_epsilon = epsilon[0] != -1
        entry = {'episode': [episode], 'step': [step],
                 'score': [info['score']], 'best': [np.NAN], 'good': [np.NAN]}
        entry.update({f'epsilon{i}': eps for i, eps in enumerate(epsilon)})
        self.concat(entry)
        [a(output=output, episode=episode, step=step, info=info, output_every_n=self.output_every_n,
           model=model, do_output=do_output) for a in self.actions.values()]

    @check
    def logs(self, output: str):
        if "txt" in self.actions:
            self.output_txt(output)

    def concat(self, dic: dict):
        self.df = pd.concat(
            [self.df, pd.DataFrame(dic)],
            ignore_index=True
        ).groupby('episode').sum(numeric_only=True, min_count=1).reset_index()

In [35]:
import collections

import numpy as np

Experience = collections.namedtuple('Experience',
                                    field_names=['state', 'action', 'reward', 'done', 'new_state'])


class ExperienceReplay:
    """
    Reference: https://towardsdatascience.com/deep-q-network-dqn-ii-b6bf911b6b2c
    There is not much that I could improve the code. @credit: Jordi TORRES.AI
    """
    def __init__(self, capacity, best_capacity=256):
        self.buffer = collections.deque(maxlen=capacity)
        self.best = []
        self.age = 0
        self.best_capacity = best_capacity

    def __len__(self):
        return len(self.buffer)

    def append(self, experience):
        self.age += 1
        self.buffer.append(experience)
        # based on the concept that high reward should be prioritized
        # we would make sure that older experiences with high reward values is not dumped
        self.best += [(self.age, experience)]
        self.best = sorted(self.best, key=lambda x: x[1][2]+x[0]/10000, reverse=True)
        if len(self.best) > self.best_capacity:
            self.best.pop()
        # i.e. reward + age/1000

    def sample(self, batch_size):
        indices = np.random.choice(len(self.buffer), batch_size,
                                   replace=False)
        states, actions, rewards, dones, next_states \
            = zip(*([self.buffer[idx] for idx in indices]
                    + [x[1] for x in self.best]))

        return np.array(states), np.array(actions), \
               np.array(rewards, dtype=np.float32), \
               np.array(dones, dtype=np.uint8), \
               np.array(next_states)

In [36]:
import gym
import numpy as np
import tensorflow as tf


def choose(env: gym.Env, _q_values: tf.Tensor, available_dirs: np.ndarray) -> np.ndarray:
    if env.is_full():
        return np.array(-1)
    li = (_q_values[0] * available_dirs).numpy()
    li[available_dirs*1 == 0] = np.nan
    return np.nanargmax(li)


num_inps = 4


def vectorize(_state: tf.Tensor, available_dirs: np.ndarray, type='normal', normalized=True, expand=True) -> tf.Tensor:
    """ turns the state into a vector before feeding into the neural network.
    :param available_dirs: available directions.
    :param _state:         input observations, in a shape (16,)
    :param type:           either 'normal' (default) or 'one-hot'.
                            output shape is (1,16,1) for 'normal', and is (1,16,16=#options) for 'one-hot'.
    """
    if type in ('one-hot', 'one-hot-17'):
        options_per_cell = 17 if type == 'one-hot-17' else 16
        out = tf.math.multiply(tf.expand_dims(tf.one_hot(_state, options_per_cell), 0),
                               available_dirs.reshape((num_inps, 1, 1)))
        if expand:
            out = tf.expand_dims(out, 0)
        if normalized:
            out /= options_per_cell
        return out
    out = tf.math.multiply(tf.expand_dims(_state, 0),
                           available_dirs.reshape((num_inps, 1)))
    if expand:
        out = tf.expand_dims(out, 0)
    return out


def get_input_type(shape: tuple) -> str:
    return 'one-hot' if shape[2] == 16 else \
        'one-hot-17' if shape[2] == 17 else 'normal'

In [60]:
# Test the agent
test_episodes = 10000
max_steps = 10000

In [None]:
import time
from collections import Counter

import gym

import sys
import os

import matplotlib.pyplot as plt
import numpy as np
import random

import tensorflow as tf
from keras.models import Sequential
from keras.layers import Dense, Reshape, Conv2D
from keras.optimizers import Adam

import copy
import threading

size = 4
mode = 'rgb_array'
ws   = 16

save_interval = 500

learning_rate = .00075
lr_decay = 1
opt = Adam(learning_rate=learning_rate)
gamma = .95  # or .95
epsilon_decay = 1-5*1e-4  # 1-5*1e-4
local_epsilon = [.5]*16

# for random agent, use play v1 script with -m human_rand
assert mode != "human_rand"

window_size = 16

env = Typical2048Env(render_mode=mode, size=size, window_size=window_size)

env.metadata["render_fps"] = 1000000000

env.action_space.seed(None)


#threading
num_threads = 10
envs = [copy.deepcopy(env) for i in range(num_threads)]

buffer_capacity = 32768
best_capacity = 0
min_buffer_length = 256
buffer = ExperienceReplay(buffer_capacity, best_capacity=best_capacity)

ofile = f"data/qtable_{time.strftime('%Y%m%d%H%M')}"
output = Output(ofile, 'model', output_every_n=save_interval)  # does any output job.
folder = output.dir

# DQN model
num_classes = 4
options_per_cell = 16  # 16 if onehot / all models on or before 202212110239
train_type = 'one-hot'  #'one-hot' if output 16
input_shape = (num_classes, size ** 2, options_per_cell)
epsilon_min = 0
epsilon = .5

file = None
input_file = None

if file is not None:
    model = tf.keras.models.load_model(file, compile=False)
    print(model.summary())
    train_type = get_input_type(model.shape)
else:
    "Dimensionality reduction by obtaining Q-value rows by using a neural network."
    model = Sequential(
        [
            tf.keras.Input(shape=input_shape),
            Reshape((num_classes, size, size, options_per_cell)),
            Conv2D(128, kernel_size=(2, 2), activation="relu", padding='SAME'),
            Conv2D(32, kernel_size=(2, 2), activation="relu", padding='SAME'),
            Reshape((-1,)),
            Dense(128, activation='relu'),
            Dense(num_classes)
        ]
    )
    model.build()


print(model.summary())
with open(f"{folder}/model_structure.txt", "a+") as f:
    model.summary(print_fn=lambda x: f.write(x + "\n"))

with open(f"{folder}/model_structure.txt", "a+") as f:
    f.write(f"Model trained from loaded file: {input_file}\n")
    f.write(f"Parameters: \tbuffer size: {buffer_capacity}\n")
    f.write(f"\t\t\t\t* best capacity: {best_capacity}\n")
    f.write(f"\t\t\t\t* gamma: {gamma}, epsilon: {epsilon} (decay = {epsilon_decay}, min = {epsilon_min})\n")
    f.write(f"\t\t\t\t* lr: {learning_rate} (decay = {lr_decay})\n")
    model.summary(print_fn=lambda x: f.write(x + "\n"))


def end():
    if episode > 0:
        print(f"Average score: {total_score / episode:.2f}\n" +
              f"Maximum score: {max_score:d}\n" + f"Highest tile: {2 ** high_tile:d}\n" +
              f"Average steps: {total_steps / episode:.2f} ([{min_steps_achieved} to {max_steps_achieved}])")

    env.close()

episode = 0
total_score = 0
max_score = 0
high_tile = 0
total_steps = 0
min_steps_achieved = (2 << 15)
max_steps_achieved = 0

len_top_tiles = 10  # maximum number of games that we are keeping track of (the high score)

running = True
step = 0

top_tiles = []

# threading
class PlayModel (threading.Thread):
    def __init__(self, env, episode):
        threading.Thread.__init__(self)
        self.env = env
        self.episode = episode
    def run(self):
        global max_score, high_tile, total_score, top_tiles, epsilon, \
            max_steps_achieved, min_steps_achieved, total_steps, output
        env = self.env
        episode = self.episode

        info = {'available_dir': np.array([True, True, True, True]), 'score': 0, 'highTile': 0}

        
        state = env.reset(seed=None)[0]  # [0] for observation only
        state = vectorize(state, info['available_dir'], type=train_type)
        total_testing_rewards = 0

        for step in range(max_steps):
            "Obtain Q-values from network."
            q_values = model(state)

            "Select action using epsilon-greedy strategy."
            sample_epsilon = np.random.rand()
            thisgame_hi = info['highTile']
            self_epsilon = local_epsilon[thisgame_hi]
            if sample_epsilon <= self_epsilon:
                action = env.action_space.sample(mask=info['available_dir'].astype(np.int8))
            else:
                action = choose(env, q_values, info['available_dir'])
            "Obtain q-value for the selected action."
            q_value = q_values[0, action]

            "Deterimine next state."
            new_state, reward, done, truncated, info = env.step(action)  # take action and get reward
            new_state = vectorize(new_state, info['available_dir'], type=train_type)
            buffer.append(Experience(state, action, reward, done, new_state))

            state = new_state

            "From the Q-learning update formula, we have:"
            "   Q'(S, A) = Q(S, A) + a * {R + λ argmax[a, Q(S', a)] - Q(S, A)}"
            "Target of Q' is given by: "
            "   R + λ argmax[a, Q(S', a)]"
            "Hence, MSE loss function is given by: "
            "   L(w) = E[(R + λ argmax[a, Q(S', a, w)] - Q(S, a, w))**2]"
            next_q_values = model(new_state)
            next_action = choose(env, next_q_values, info['available_dir'])
            next_q_value = next_q_values[0, next_action]

            observed_q_value = reward + (gamma * next_q_value)
            loss = (observed_q_value - q_value) ** 2

            def decay(ep: float) -> float:
                ep *= epsilon_decay
                return max(ep, epsilon_min)

            self_epsilon = decay(self_epsilon)
            epsilon = decay(epsilon)
            for i in range(thisgame_hi+1):
                local_epsilon[i] = min(decay(local_epsilon[i]), decay(local_epsilon[thisgame_hi]))

            # print(state, action)
            if done or truncated:
                total_score += info['score']
                max_score = max(max_score, info['score'])
                high_tile = max(high_tile, info['highTile'])
                top_tiles += [2 ** info['highTile']]
                if len(top_tiles) > len_top_tiles:
                    del top_tiles[0]

                soutput = f"Episode {episode} succeeded in {step} steps with score {info['score']}," \
                          f" high tile {2 ** info['highTile']}..., \n" \
                          f"Highest tile frequencies: {top_tiles}" \
                          f"\nepsilon: {self_epsilon}; q_values: {q_values}"
                print(soutput)

                with open(f"{folder}/_descriptions.txt", "a+") as f:
                    f.write(soutput + "\n")
                with open(f"{folder}/_data.txt", "a+") as f:
                    f.write(f"{episode}\t{step}\t{info['score']}\t{info['highTile']}\n")

                output.log(done, episode, step, info, model=model, do_output=False, epsilon=local_epsilon)

                total_steps += step
                max_steps_achieved = max(max_steps_achieved, step)
                min_steps_achieved = min(min_steps_achieved, step)
                break
    def join(self):
        threading.Thread.join(self)

class TrainModel (threading.Thread):

    def __init__(self, experience, episode):
        threading.Thread.__init__(self)
        self.experience = experience
        self.episode = episode

    @staticmethod
    def squeeze(inp: np.ndarray):
        return np.squeeze(inp, axis=1)

    def run(self):
        global max_score, total_score, epsilon, \
            max_steps_achieved, min_steps_achieved, total_steps, output, high
        experience = self.experience
        episode = self.episode
        state, action, reward, done, new_state = experience

        with tf.GradientTape() as tape:  # tracing and computing the gradients ourselves.
            "Obtain Q-values from network."
            q_values = model(self.squeeze(state))

            "Obtain q-value for the selected action."
            q_value = tf.gather(q_values, tf.constant(action), axis=1)#q_values[action]
            #print(q_values)

            "From the Q-learning update formula, we have:"
            "   Q'(S, A) = Q(S, A) + a * {R + λ argmax[a, Q(S', a)] - Q(S, A)}"
            "Target of Q' is given by: "
            "   R + λ argmax[a, Q(S', a)]"
            "Hence, MSE loss function is given by: "
            "   L(w) = E[(R + λ argmax[a, Q(S', a, w)] - Q(S, a, w))**2]"
            next_q_values = tf.stop_gradient(model(self.squeeze(new_state)))
            next_actions = tf.math.argmax(next_q_values, 1)
            next_q_value = tf.gather(next_q_values, next_actions, axis=1)

            observed_q_value = reward + (gamma * next_q_value)
            loss = (observed_q_value - q_value) ** 2

            "Computing and applying gradients"
            grads = tape.gradient(loss, model.trainable_variables)
            opt.apply_gradients(zip(grads, model.trainable_variables))

    def join(self):
        threading.Thread.join(self)

save_interval = 50
train_episode = 0

print("Training started ...")
trainThreads = []
for episode in range(test_episodes):
    thread = PlayModel(envs[episode % num_threads], episode)
    thread.start()
    trainThreads.append(thread)
    if episode % num_threads == num_threads-1:
        [trainThread.join() for trainThread in trainThreads]
        trainThreads = []
    if episode % save_interval > save_interval - 5:
        output.concat({'episode': [episode], 'best': [np.NAN], 'good': [np.NAN]})
        output.output_img(episode=-1)

    if len(buffer) > min_buffer_length:
        for j in range(num_threads):
            thread = TrainModel(buffer.sample(512), train_episode)
            thread.start()
            trainThreads.append(thread)
            if j == num_threads - 1:
                [trainThread.join() for trainThread in trainThreads]
                trainThreads = []
            if train_episode % save_interval > save_interval - 5:
                output.concat({'episode': [episode], 'best': [np.NAN], 'good': [np.NAN]})
                output.output_img(episode=-1)
            train_episode += 1


end()


[Errno 17] File exists: 'data/qtable_202301101231/'
The following items will be outputted in folder data/qtable_202301101231:
	- data.txt
	- data.csv
	- evaluation_graph.png
	- model.h5
Model: "sequential_16"
_________________________________________________________________
 Layer (type)                Output Shape              Param #   
 reshape_32 (Reshape)        (None, 4, 4, 4, 16)       0         
                                                                 
 conv2d_32 (Conv2D)          (None, 4, 4, 4, 128)      8320      
                                                                 
 conv2d_33 (Conv2D)          (None, 4, 4, 4, 32)       16416     
                                                                 
 reshape_33 (Reshape)        (None, 2048)              0         
                                                                 
 dense_32 (Dense)            (None, 128)               262272    
                                                                 