In [1]:
import numpy as np
from scene import Scene
import tensorflow as tf
import tensorflow.keras as keras
from keras.layers import Conv2D, MaxPooling2D, Dense, Flatten

scene = Scene(using_cnn=False, init_randomly=True)

pygame 2.1.3 (SDL 2.0.22, Python 3.11.4)
Hello from the pygame community. https://www.pygame.org/contribute.html


In [2]:
input_shape = scene.feature_count
model = keras.Sequential([
    Dense(input_shape, activation='relu', kernel_initializer='he_normal', input_shape=(input_shape,)),
    Dense(4, activation='linear')
])

model.compile(optimizer='adam', loss='categorical_crossentropy', metrics=['accuracy'])

target = keras.models.clone_model(model)
target.set_weights(model.get_weights())

In [3]:
def epsilon_greedy_policy(state, epsilon=0):
    if np.random.rand() < epsilon:
        return np.random.randint(4)
    else:
        Q_values = model.predict(state[np.newaxis], verbose=False)
        return np.argmax(Q_values[0])

In [4]:
from collections import deque

# (state, action, reward, next_state, done)
replay_memory = deque(maxlen=2000)

def sample_experiences(batch_size):
    indices = np.random.randint(len(replay_memory), size=batch_size)
    batch = [replay_memory[index] for index in indices]
    states, actions, rewards, next_states, dones = [np.array([experience[field_index]
                                                            for experience in batch])
                                                            for field_index in range(5)]
    return states, actions, rewards, next_states, dones

In [5]:
def play_one_step(scene, state, epsilon):
    action = epsilon_greedy_policy(state, epsilon)
    scene.snake.change_direction(action)
    next_state, reward, done = scene.move()
    replay_memory.append((state, action, reward, next_state, done))
    return next_state, reward, done

In [6]:
batch_size = 32
discount_rate = 0.95
optimizer = keras.optimizers.legacy.Adam(learning_rate=1e-3)
loss_fn = keras.losses.mean_squared_error

def training_step(batch_size):
    states, actions, rewards, next_states, dones = sample_experiences(batch_size)
    next_Q_values = model.predict(next_states, verbose=False)
    best_next_actions = np.argmax(next_Q_values, axis=1)
    next_mask = tf.one_hot(best_next_actions, 4).numpy()
    next_best_Q_values = (target.predict(next_states, verbose=False) * next_mask).sum(axis=1)
    target_Q_values = (rewards + (1 - dones) * discount_rate * next_best_Q_values).reshape(-1, 1)
    mask = tf.one_hot(actions, 4)
    with tf.GradientTape() as tape:
        all_Q_values = model(states)
        Q_values = tf.reduce_sum(all_Q_values * mask, axis=1, keepdims=True)
        loss = tf.reduce_mean(loss_fn(target_Q_values, Q_values))
    grads = tape.gradient(loss, model.trainable_variables)
    optimizer.apply_gradients(zip(grads, model.trainable_variables))

In [7]:
episodes_count = 8000
reward_per_batch = 0

for episode in range(1, episodes_count + 1):
    state = scene.scene_as_feature_vector()
    epsilon = (1 / (np.linspace(1, 8, episodes_count)**(1/2)))[episode - 1]
    done = False
    
    steps = 0
    while not done:
        steps += 1
        state, reward, done = play_one_step(scene, state, epsilon)
        reward_per_batch += reward
        
    if episode > 50:
        training_step(batch_size)
        if episode % 50 == 0:
            print("Episode number: ", episode)
            target.set_weights(model.get_weights())
            print("Average reward: ", reward_per_batch / 50)
            reward_per_batch = 0
            
    if episode % 1000 == 0:
        model.save(f"models/features/{episode}.h5")

Episode number:  100
Average reward:  -62.06
Episode number:  150
Average reward:  -30.4
Episode number:  200
Average reward:  -30.44
Episode number:  250
Average reward:  -29.44
Episode number:  300
Average reward:  -30.44
Episode number:  350
Average reward:  -29.14
Episode number:  400
Average reward:  -31.5
Episode number:  450
Average reward:  -30.98
Episode number:  500
Average reward:  -29.2
Episode number:  550
Average reward:  -29.46
Episode number:  600
Average reward:  -29.9
Episode number:  650
Average reward:  -27.88
Episode number:  700
Average reward:  -25.06
Episode number:  750
Average reward:  -27.46
Episode number:  800
Average reward:  -26.02
Episode number:  850
Average reward:  -26.78
Episode number:  900
Average reward:  -24.88
Episode number:  950
Average reward:  -25.24
Episode number:  1000
Average reward:  -24.22


  saving_api.save_model(


Episode number:  1050
Average reward:  -21.7
Episode number:  1100
Average reward:  -25.06
Episode number:  1150
Average reward:  -21.78
Episode number:  1200
Average reward:  -24.24
Episode number:  1250
Average reward:  -21.62
Episode number:  1300
Average reward:  -23.86
Episode number:  1350
Average reward:  -18.06
Episode number:  1400
Average reward:  -19.86
Episode number:  1450
Average reward:  -16.12
Episode number:  1500
Average reward:  -16.58
Episode number:  1550
Average reward:  -17.4
Episode number:  1600
Average reward:  -20.06
Episode number:  1650
Average reward:  -17.58
Episode number:  1700
Average reward:  -22.54
Episode number:  1750
Average reward:  -17.62
Episode number:  1800
Average reward:  -20.0
Episode number:  1850
Average reward:  -16.06
Episode number:  1900
Average reward:  -14.64
Episode number:  1950
Average reward:  -18.6
Episode number:  2000
Average reward:  -16.8
Episode number:  2050
Average reward:  -16.78
Episode number:  2100
Average reward:  

KeyboardInterrupt: 