In [1]:
import numpy as np
from scene import Scene
import tensorflow as tf
import tensorflow.keras as keras
from keras.layers import Conv2D, MaxPooling2D, Dense, Flatten

scene = Scene(init_randomly=True)
N_OUTPUTS = 4


In [2]:
model = keras.Sequential([
        Conv2D(12, (3, 3), activation='relu', padding="same", kernel_initializer='he_normal', input_shape=(scene.height, scene.width, scene.elements_count)),
        Conv2D(12, (3, 3), activation='relu', strides=2, padding="same", kernel_initializer='he_normal'),
        Flatten(),    
        Dense(16, activation='relu', kernel_initializer='he_normal'),
        Dense(4, activation='linear')
    ])

model.summary()

model.compile(optimizer='adam', loss='categorical_crossentropy', metrics=['accuracy'])

target = keras.models.clone_model(model)
target.set_weights(model.get_weights())


Model: "sequential"
_________________________________________________________________
 Layer (type)                Output Shape              Param #   
 conv2d (Conv2D)             (None, 5, 5, 12)          444       
                                                                 
 conv2d_1 (Conv2D)           (None, 3, 3, 12)          1308      
                                                                 
 flatten (Flatten)           (None, 108)               0         
                                                                 
 dense (Dense)               (None, 16)                1744      
                                                                 
 dense_1 (Dense)             (None, 4)                 68        
                                                                 
Total params: 3564 (13.92 KB)
Trainable params: 3564 (13.92 KB)
Non-trainable params: 0 (0.00 Byte)
_________________________________________________________________


In [3]:
model = keras.Sequential([
    Conv2D(16, (3, 3), activation='relu', padding="same", kernel_initializer='he_normal', input_shape=(scene.height, scene.width, scene.elements_count)),
    MaxPooling2D(2),
    Conv2D(32, (3, 3), activation='relu', padding="same", kernel_initializer='he_normal'),
    MaxPooling2D(2),
    Flatten(),    
    Dense(16, activation='relu', kernel_initializer='he_normal'),
    Dense(4, activation='linear')
])

model.summary()

model.compile(optimizer='adam', loss='categorical_crossentropy', metrics=['accuracy'])

target = keras.models.clone_model(model)
target.set_weights(model.get_weights())


Model: "sequential_1"
_________________________________________________________________
 Layer (type)                Output Shape              Param #   
 conv2d_2 (Conv2D)           (None, 5, 5, 16)          592       
                                                                 
 max_pooling2d (MaxPooling2  (None, 2, 2, 16)          0         
 D)                                                              
                                                                 
 conv2d_3 (Conv2D)           (None, 2, 2, 32)          4640      
                                                                 
 max_pooling2d_1 (MaxPoolin  (None, 1, 1, 32)          0         
 g2D)                                                            
                                                                 
 flatten_1 (Flatten)         (None, 32)                0         
                                                                 
 dense_2 (Dense)             (None, 16)               

In [3]:
def epsilon_greedy_policy(state, epsilon=0):
    if np.random.rand() < epsilon:
        return np.random.randint(N_OUTPUTS)
    else:
        Q_values = model.predict(state[np.newaxis], verbose=0)
        return np.argmax(Q_values[0])


In [4]:
from collections import deque

# (state, action, reward, next_state, done)
replay_memory = deque(maxlen=20000)

def sample_experiences(batch_size):
    indices = np.random.randint(len(replay_memory), size=batch_size)
    batch = [replay_memory[index] for index in indices]
    states, actions, rewards, next_states, dones = [np.array([experience[field_index] for experience in batch])
                                                            for field_index in range(5)]
    return states, actions, rewards, next_states, dones


In [5]:
def play_one_step(scene, state, epsilon):
    action = epsilon_greedy_policy(state, epsilon)
    next_state, reward, done = scene.move(action)
    next_state = tf.one_hot(next_state, scene.elements_count)
    replay_memory.append((state, action, reward, next_state, done))
    return next_state, reward, done


In [7]:
batch_size = 32
discount_rate = 0.95
optimizer = keras.optimizers.legacy.Adam(learning_rate=1e-3)
loss_fn = keras.losses.mean_squared_error

def training_step(batch_size):
    states, actions, rewards, next_states, dones = sample_experiences(batch_size)
    next_Q_values = target.predict(next_states, verbose=False)
    max_next_Q_values = np.max(next_Q_values, axis=1)
    target_Q_values = (rewards + (1 - dones) * discount_rate * max_next_Q_values).reshape(-1, 1)
    mask = tf.one_hot(actions, N_OUTPUTS)
    with tf.GradientTape() as tape:
        all_Q_values = model(states)
        Q_values = tf.reduce_sum(all_Q_values * mask, axis=1, keepdims=True)
        loss = tf.reduce_mean(loss_fn(target_Q_values, Q_values))
    grads = tape.gradient(loss, model.trainable_variables)
    optimizer.apply_gradients(zip(grads, model.trainable_variables))


In [8]:
# Training
episodes_count = 1500
reward_per_batch = 0

for episode in range(1, episodes_count + 1):
    #scene.reset()
    state = scene.scene_as_matrix()
    state = tf.one_hot(state, scene.elements_count)
    # aux = episode // 20
    # epsilon = 1 - aux / 20
    epsilon = 1 / (np.linspace(1, 20, episodes_count))[episode - 1] #(1 / (np.linspace(1, 8, episodes_count)**(1/2)))[episode - 1]
    done = False
    
    steps = 0
    while not done and steps < 50:
        steps += 1
        state, reward, done = play_one_step(scene, state, epsilon)
        reward_per_batch += reward
        
    if episode >= 50:
        training_step(batch_size)
        if episode % 50 == 0:
            target.set_weights(model.get_weights())
            reward_per_batch = 0
    
    if episode % 50 == 0:
        print("Episode number: ", episode)
        print("Average reward: ", reward_per_batch / 50)

    if (episode + 1) % 500 == 0:
        model.save(f"models/1p_{episode + 1}_episodes.h5")


Episode number:  50
Average reward:  0.0
Episode number:  100
Average reward:  0.0
Episode number:  150
Average reward:  0.0


KeyboardInterrupt: 