In [1]:
import numpy as np
import gym
import random
import sys

import tensorflow as tf
from tensorflow import keras
from tensorflow.keras import Sequential
from tensorflow.keras.layers import Dense, Activation, Flatten, Embedding, Reshape, Input, Conv2D, MaxPool2D
from tensorflow.keras.optimizers import Adam

from rl.agents.dqn import DQNAgent
from rl.policy import EpsGreedyQPolicy
from rl.memory import SequentialMemory

from Engine import Blockudoku

# Report the interpreter and framework versions this notebook was run with.
print(f"Python: {sys.version}")
print(f"Tensorflow version: {tf.__version__}")
print(f"Keras version: {tf.keras.__version__}")


pygame 2.0.2 (SDL 2.0.16, Python 3.9.6)
Hello from the pygame community. https://www.pygame.org/contribute.html
Python: 3.9.6 (tags/v3.9.6:db3ff76, Jun 28 2021, 15:26:21) [MSC v.1929 64 bit (AMD64)]
Tensorflow version: 2.6.2
Keras version: 2.6.0


In [2]:
from rl.core import Processor
class CustomProcessor(Processor):
    """Coupling layer that sits between the agent and the environment."""

    def process_state_batch(self, batch):
        """Remove the second dimension (axis 1) from a batch of states.

        That axis carries no information and stops the batch from being
        fed into the CNN, so it is squeezed away here.
        (Presumably it is the length-1 window axis added by the replay
        memory -- TODO confirm.)

        Args:
            batch: array-like batch of states whose axis 1 has length 1.

        Returns:
            The batch with axis 1 removed.

        Raises:
            ValueError: raised by ``np.squeeze`` if axis 1 is not of length 1.
        """
        squeezed = np.squeeze(batch, axis=1)
        return squeezed

In [3]:
# Create the Blockudoku game environment (project-local Engine module) and
# draw its current board; render() prints a text grid as shown in the output.
env = Blockudoku()
env.render()

+-----+-----+-----+
|[0;30;42m [0;0m: : | : : | : : |
| : : | : : | : : |
| : : | : : | : : |
+-----+-----+-----+
| : : | : : | : : |
| : : | : : | : : |
| : : | : : | : : |
+-----+-----+-----+
| : : | : : | : : |
| : : | : : | : : |
| : : | : : | : : |
+-----+-----+-----+


In [4]:
# Show the size (`.n`) of the action space and of the observation space.
for label, space in (("actions", env.action_space), ("states", env.observation_space)):
    print("Number of %s: %d" % (label, space.n))

Number of actions: 5
Number of states: 512


In [5]:
# Cache the sizes of both spaces; action_size sets the width of the
# network's output layer and the agent's nb_actions further down.
action_size, state_size = env.action_space.n, env.observation_space.n

In [6]:
# Seed every random number generator this notebook relies on, not just NumPy
# and the environment: the stdlib `random` module is imported above and the
# Keras model's weight initialisation draws from TensorFlow's RNG.
SEED = 123
random.seed(SEED)          # Python standard-library RNG
np.random.seed(SEED)       # NumPy global RNG
tf.random.set_seed(SEED)   # TensorFlow global seed (e.g. weight initialisers)
# Kept as the last statement so the cell displays the same value as before.
env.seed(SEED)

In [7]:
# Reset the environment, apply one randomly sampled action, and display the
# first element of the step result (the observation under the usual gym
# step() convention -- confirm against Engine.Blockudoku).
env.reset()
random_action = env.action_space.sample()
step_result = env.step(random_action)
step_result[0]

array([[[1., 1.],
        [0., 0.],
        [0., 0.],
        [0., 0.],
        [0., 0.],
        [0., 0.],
        [0., 0.],
        [0., 0.],
        [0., 0.]],

       [[1., 0.],
        [0., 1.],
        [0., 0.],
        [0., 0.],
        [0., 0.],
        [0., 0.],
        [0., 0.],
        [0., 0.],
        [0., 0.]],

       [[0., 0.],
        [0., 0.],
        [0., 0.],
        [0., 0.],
        [0., 0.],
        [0., 0.],
        [0., 0.],
        [0., 0.],
        [0., 0.]],

       [[0., 0.],
        [0., 0.],
        [0., 0.],
        [0., 0.],
        [0., 0.],
        [0., 0.],
        [0., 0.],
        [0., 0.],
        [0., 0.]],

       [[0., 0.],
        [0., 0.],
        [0., 0.],
        [0., 0.],
        [0., 0.],
        [0., 0.],
        [0., 0.],
        [0., 0.],
        [0., 0.]],

       [[0., 0.],
        [0., 0.],
        [0., 0.],
        [0., 0.],
        [0., 0.],
        [0., 0.],
        [0., 0.],
        [0., 0.],
        [0., 0.]],

       [[0., 0.]

In [8]:
# Q-network: two Conv2D + MaxPool2D stages over the board state, two Dense
# layers, and a linear output layer with one unit per action.
# NOTE(review): the Dense layers come *before* Flatten, so they operate on the
# last axis only (the summary reports (None, 2, 2, 69) for them). Confirm this
# is intended rather than the more common Flatten -> Dense ordering.
layers = [
    # Input shape is taken from the first three dimensions of the env state.
    Input(shape=tuple(env.state.shape[:3])),
    Conv2D(16, 3, activation="relu", padding="same", name="Conv2D_layer1"),
    MaxPool2D(),
    Conv2D(32, 3, activation="relu", padding="same", name="Conv2D_layer2"),
    MaxPool2D(),
    Dense(69, activation="relu", name="Dense_layer1"),
    Dense(69, activation="relu", name="Dense_layer2"),
    Flatten(),
    Dense(action_size, activation="linear", name="output"),
]

# Alternative fully-connected variant over the flattened state (disabled):
# layers = [Input(shape=(len(env.state.flatten()),)),
#                       Dense(69, activation="relu", name="Dense_layer1"),
#                       Dense(69, activation="relu", name="Dense_layer2"),
#                       Dense(action_size, activation="linear", name="output")]

model = Sequential(layers)

model.compile(loss='mse', optimizer="adam")

# summary() prints the table itself and returns None; wrapping it in print()
# would append a stray "None" line to the output.
model.summary()

Model: "sequential"
_________________________________________________________________
Layer (type)                 Output Shape              Param #   
Conv2D_layer1 (Conv2D)       (None, 9, 9, 16)          304       
_________________________________________________________________
max_pooling2d (MaxPooling2D) (None, 4, 4, 16)          0         
_________________________________________________________________
Conv2D_layer2 (Conv2D)       (None, 4, 4, 32)          4640      
_________________________________________________________________
max_pooling2d_1 (MaxPooling2 (None, 2, 2, 32)          0         
_________________________________________________________________
Dense_layer1 (Dense)         (None, 2, 2, 69)          2277      
_________________________________________________________________
Dense_layer2 (Dense)         (None, 2, 2, 69)          4830      
_________________________________________________________________
flatten (Flatten)            (None, 276)               0

In [None]:
# Replay memory and exploration policy for the agent.
memory = SequentialMemory(limit=50000, window_length=1)
policy = EpsGreedyQPolicy()

# DQN agent built around `model`; CustomProcessor reshapes state batches
# before they reach the network.
dqn = DQNAgent(
    model=model,
    nb_actions=action_size,
    memory=memory,
    nb_steps_warmup=500,
    target_model_update=1e-2,
    policy=policy,
    processor=CustomProcessor(),
)
dqn.compile(Adam(learning_rate=1e-3), metrics=["mae"])

# Train on the environment, capping each episode at 99 steps and logging
# once every 100000 steps.
dqn.fit(
    env,
    nb_steps=1000000,
    visualize=False,
    verbose=1,
    nb_max_episode_steps=99,
    log_interval=100000,
)

Training for 1000000 steps ...
Interval 1 (0 steps performed)
     1/100000 [..............................] - ETA: 5:01:36 - reward: -10.0000



1010 episodes - episode_reward: -230.889 [-948.000, 6.000] - loss: 91.268 - mae: 28.536 - mean_q: 35.693

Interval 2 (100000 steps performed)
1010 episodes - episode_reward: -522.265 [-966.000, -40.000] - loss: 154322.609 - mae: 1560.881 - mean_q: 2072.901

Interval 3 (200000 steps performed)

In [None]:
# Run the trained agent for five episodes (at most 99 steps each), no rendering.
dqn.test(
    env,
    nb_episodes=5,
    visualize=False,
    nb_max_episode_steps=99,
)

In [None]:
# Persist the agent's weights, replacing any existing file of the same name.
dqn.save_weights("dqn_Blockudoku_weights.h5f", overwrite=True)