In [2]:
import numpy as np
from scene import Scene
import tensorflow as tf
import tensorflow.keras as keras
from keras.layers import Conv2D, MaxPooling2D, Dense, Flatten

# Environment instance shared by the cells below: the model reads its height, width and
# elements_count for the input shape, and the training loop calls reset() and move() on it.
# NOTE(review): `init_randomly=True` presumably randomizes the initial layout — confirm in scene.py.
scene = Scene(init_randomly=True)
# Number of discrete actions: random actions are drawn from range(N_OUTPUTS) and the
# one-hot action mask used in the training step has this depth.
N_OUTPUTS = 4


In [17]:
# Online Q-network: maps a one-hot encoded scene to one Q-value per action.
# The input shape is (scene.height, scene.width, scene.elements_count).
model = keras.Sequential([
    Conv2D(
        32, (3, 3),
        activation='relu',
        padding="same",
        kernel_initializer='he_normal',
        input_shape=(scene.height, scene.width, scene.elements_count),
    ),
    Conv2D(16, (3, 3), activation='relu', padding="valid", kernel_initializer='he_normal'),
    Flatten(),
    Dense(32, activation='relu', kernel_initializer='he_normal'),
    # One linear output per action; sized from N_OUTPUTS so the head always agrees with
    # the policy's random-action range and the one-hot action mask.
    Dense(N_OUTPUTS, activation='linear'),
])

model.summary()

# The outputs are unbounded Q-values (a regression target), so the compiled loss is MSE,
# matching the loss applied in the custom training step. Cross-entropy and accuracy
# assume class probabilities and do not apply to a linear Q-value head.
model.compile(optimizer='adam', loss='mse')

# Target network: same architecture, starting from the online network's weights.
target = keras.models.clone_model(model)
target.set_weights(model.get_weights())


Model: "sequential_11"
_________________________________________________________________
 Layer (type)                Output Shape              Param #   
 conv2d_20 (Conv2D)          (None, 5, 5, 32)          1184      
                                                                 
 conv2d_21 (Conv2D)          (None, 3, 3, 16)          4624      
                                                                 
 flatten_12 (Flatten)        (None, 144)               0         
                                                                 
 dense_23 (Dense)            (None, 32)                4640      
                                                                 
 dense_24 (Dense)            (None, 4)                 132       
                                                                 
Total params: 10580 (41.33 KB)
Trainable params: 10580 (41.33 KB)
Non-trainable params: 0 (0.00 Byte)
_________________________________________________________________


In [9]:
def epsilon_greedy_policy(state, epsilon=0):
    """Choose an action index for `state` using an epsilon-greedy rule.

    With probability `epsilon` a uniformly random action in range(N_OUTPUTS)
    is returned; otherwise the action with the highest Q-value predicted by
    `model` is returned.

    Args:
        state: a single (unbatched) observation; a batch axis is prepended
            before it is passed to the network.
        epsilon: exploration probability (default 0, i.e. always greedy).

    Returns:
        The index of the selected action.
    """
    explore = np.random.rand() < epsilon
    if explore:
        return np.random.randint(N_OUTPUTS)

    # Greedy branch: score the single state as a batch of one.
    batched_state = state[np.newaxis]
    predicted = model.predict(batched_state, verbose=0)
    return np.argmax(predicted[0])


In [10]:
from collections import deque

# Bounded buffer of transitions; each entry is a tuple
# (state, action, reward, next_state, done). Oldest entries are dropped
# once 20000 transitions are stored.
replay_memory = deque(maxlen=20000)

def sample_experiences(batch_size):
    """Draw `batch_size` transitions uniformly at random, with replacement.

    Args:
        batch_size: number of transitions to sample from `replay_memory`.

    Returns:
        A tuple (states, actions, rewards, next_states, dones) of numpy
        arrays, each stacking the corresponding field of the sampled
        transitions along the first axis.
    """
    picks = np.random.randint(len(replay_memory), size=batch_size)
    sampled = [replay_memory[pick] for pick in picks]

    # Regroup the sampled tuples field by field (5 fields per transition).
    columns = []
    for field_index in range(5):
        column = [transition[field_index] for transition in sampled]
        columns.append(np.array(column))

    states, actions, rewards, next_states, dones = columns
    return states, actions, rewards, next_states, dones


In [11]:
def play_one_step(scene, state, epsilon):
    """Take one epsilon-greedy step in `scene` and record the transition.

    Args:
        scene: environment object; `scene.move(action)` must return
            (next_state, reward, done) and it must expose `elements_count`.
        state: the current (already one-hot encoded) observation.
        epsilon: exploration probability forwarded to the policy.

    Returns:
        (next_state, reward, done), where next_state is the one-hot encoded
        observation returned by the environment.
    """
    chosen_action = epsilon_greedy_policy(state, epsilon)
    raw_next_state, reward, done = scene.move(chosen_action)

    # Encode the new observation the same way the current state is encoded.
    encoded_next_state = tf.one_hot(raw_next_state, scene.elements_count)

    transition = (state, chosen_action, reward, encoded_next_state, done)
    replay_memory.append(transition)
    return encoded_next_state, reward, done


In [12]:
# Hyperparameters of the Q-learning update.
batch_size = 32          # transitions sampled per gradient step
discount_rate = 0.99     # weight applied to the bootstrapped next-state value
optimizer = keras.optimizers.legacy.Adam(learning_rate=1e-3)
loss_fn = keras.losses.mean_squared_error

def training_step(batch_size):
    """Run one gradient update of `model` on a sampled batch of transitions.

    Targets are reward + discount_rate * max_a' Q_target(next_state, a'),
    with the bootstrap term zeroed for transitions flagged as done. Only the
    Q-value of the action actually taken contributes to the loss.

    Args:
        batch_size: number of transitions to sample from the replay memory.
    """
    states, actions, rewards, next_states, dones = sample_experiences(batch_size)

    # Bootstrapped value estimates come from the separate `target` network.
    target_predictions = target.predict(next_states, verbose=False)
    best_next_q = np.max(target_predictions, axis=1)
    not_done = 1 - dones
    td_targets = rewards + not_done * discount_rate * best_next_q
    td_targets = td_targets.reshape(-1, 1)

    # One-hot mask that keeps only the Q-value of the action that was taken.
    action_mask = tf.one_hot(actions, N_OUTPUTS)

    with tf.GradientTape() as tape:
        predicted_q = model(states)
        taken_q = tf.reduce_sum(predicted_q * action_mask, axis=1, keepdims=True)
        loss = tf.reduce_mean(loss_fn(td_targets, taken_q))

    gradients = tape.gradient(loss, model.trainable_variables)
    optimizer.apply_gradients(zip(gradients, model.trainable_variables))


In [13]:
# Training
episodes_count = 1500
reward_per_batch = 0
# Exploration rate decays linearly from 1 to 0.01 over the whole run.
epsilons = np.linspace(1, 0.01, episodes_count)
max_steps = 50

# Intervals, all measured in episodes.
warmup_episodes = 50      # no gradient steps before this episode
target_sync_every = 50    # copy online weights into the target network
report_every = 50         # print the average episode reward
checkpoint_every = 500    # save the online network to disk

for episode in range(1, episodes_count + 1):
    scene.reset()
    state = scene.scene_as_matrix()
    state = tf.one_hot(state, scene.elements_count)
    epsilon = epsilons[episode - 1]
    done = False

    # Play one episode, capped at max_steps moves.
    steps = 0
    while not done and steps < max_steps:
        steps += 1
        state, reward, done = play_one_step(scene, state, epsilon)
        reward_per_batch += reward

    # One gradient step per episode once the warm-up period is over.
    if episode >= warmup_episodes:
        training_step(batch_size)
        if episode % target_sync_every == 0:
            target.set_weights(model.get_weights())

    if episode % report_every == 0:
        print("Episode number: ", episode)
        print("Average reward: ", reward_per_batch / report_every)
        reward_per_batch = 0

    # Checkpoint after the episode named in the file has actually been played and
    # trained on. The previous `(episode + 1) % 500` test saved one episode early
    # (499, 999, 1499) under the names 500/1000/1500 and never saved the final model.
    if episode % checkpoint_every == 0:
        model.save(f"models/1p_{episode}_episodes.h5")


Episode number:  50
Average reward:  -16.0
Episode number:  100
Average reward:  -14.28
Episode number:  150
Average reward:  -13.34
Episode number:  200
Average reward:  -15.1
Episode number:  250
Average reward:  -14.66
Episode number:  300
Average reward:  -14.58
Episode number:  350
Average reward:  -14.36
Episode number:  400
Average reward:  -14.78
Episode number:  450
Average reward:  -13.56


  saving_api.save_model(


Episode number:  500
Average reward:  -14.14
Episode number:  550
Average reward:  -16.1
Episode number:  600
Average reward:  -14.72
Episode number:  650
Average reward:  -14.04
Episode number:  700
Average reward:  -14.96
Episode number:  750
Average reward:  -14.16
Episode number:  800
Average reward:  -15.7
Episode number:  850
Average reward:  -13.6
Episode number:  900
Average reward:  -13.48
Episode number:  950
Average reward:  -13.84
Episode number:  1000
Average reward:  -15.64
Episode number:  1050
Average reward:  -14.62
Episode number:  1100
Average reward:  -13.24
Episode number:  1150
Average reward:  -13.42
Episode number:  1200
Average reward:  -15.62
Episode number:  1250
Average reward:  -11.26
Episode number:  1300
Average reward:  -10.6
Episode number:  1350
Average reward:  -11.82
Episode number:  1400
Average reward:  -3.28
Episode number:  1450
Average reward:  -6.22
Episode number:  1500
Average reward:  8.4
