In [1]:
import numpy as np
import pandas as pd
import tensorflow as tf
from tensorflow import keras
from Environment import Tetris
from pygame.time import Clock
from collections import deque

pygame 1.9.6
Hello from the pygame community. https://www.pygame.org/contribute.html


In [2]:
tf.random.set_seed(42)
np.random.seed(42)

In [3]:
env = Tetris()

In [4]:
observation_shape = env.observation_shape()
n_inputs = observation_shape[0]*observation_shape[1]

In [5]:
n_outputs = env.action_scope_size()

In [6]:
model = keras.Sequential([keras.layers.Dense(512, activation='relu', input_shape=[n_inputs]),
                          keras.layers.Dense(256, activation='relu'),
                          keras.layers.Dense(n_outputs)
                         ])

In [7]:
def epsilon_greedy_policy(state, epsilon=0):
    if np.random.rand() < epsilon:
        return np.random.randint(n_outputs)
    else:
        Q_values = model.predict(obs.reshape(1, -1))
    
    return np.argmax(Q_values)

In [8]:
replay_memory = deque(maxlen=10000)

In [9]:
try:
    dataset = pd.read_csv('./checkpoints/dataset_tetris.csv')

    dataset.drop('Unnamed: 0', axis=1, inplace=True)

    state_string = dataset['state'].to_numpy()
    action = dataset['action'].to_numpy()
    reward = dataset['reward'].to_numpy()
    next_state_string = dataset['next_state'].to_numpy()
    done = dataset['done'].to_numpy()

    state = []
    next_state = []

    for i in range(len(state_string)):
        list_a = []
        list_b = []
        for j in range(len(state_string[i])):
            list_a.append(int(state_string[i][j]))
            list_b.append(int(next_state_string[i][j]))

        state.append(np.array(list_a, dtype=np.float32))
        next_state.append(np.array(list_b, dtype=np.float32))

    replay_memory.append((np.array(state), action, reward, np.array(next_state), done))
except FileNotFoundError:
    print('None training memory. Starting from the beginning.')

In [10]:
def play_one_step(env, state, epsilon):
    action = epsilon_greedy_policy(state, epsilon)
    next_state, reward, done = env.step(action)
    replay_memory.append((state, action, reward, next_state.flatten(), done))
    return next_state, reward, done

In [11]:
def sample_experiences(batch_size):
    indices = np.random.randint(len(replay_memory), size=batch_size)
    batch = [replay_memory[index] for index in indices]
    states, actions, rewards, next_states, dones = [
        np.array([experience[field_index] for experience in batch])
        for field_index in range(5)]
    return states, actions, rewards, next_states, dones

In [12]:
discount_rate = 0.95
optimizer = keras.optimizers.Adam(lr=1e-3)
loss_fn = keras.losses.mean_squared_error

In [13]:
def training_step(batch_size):
    states, actions, rewards, next_states, dones = sample_experiences(batch_size)
#     print(len(next_states[0]))
    next_states.shape()
    next_Q_values = model.predict(next_states)
    max_next_Q_values = np.max(next_Q_values, axis=1)
    target_Q_values = (rewards +
                       (1 - dones) * discount_rate * max_next_Q_values)
    target_Q_values = target_Q_values.reshape(-1, 1)
    mask = tf.one_hot(actions, n_outputs)
    with tf.GradientTape() as tape:
        all_Q_values = model(states)
        Q_values = tf.reduce_sum(all_Q_values * mask, axis=1, keepdims=True)
        loss = tf.reduce_mean(loss_fn(target_Q_values, Q_values))
    grads = tape.gradient(loss, model.trainable_variables)
    optimizer.apply_gradients(zip(grads, model.trainable_variables))

In [14]:
best_score = 0
batch_size = 500

In [15]:
first_episode = 0

for episode in range(0, 50000, 1000):
    try:
        model.load_weights('./checkpoints/' + str(episode) + '_episodes_tetris')
        first_episode = episode + 1
    except tf.errors.NotFoundError:
        break

In [16]:
def save_memory(episode):
    state, action, reward, next_state, done = [[replay_memory[index][field_index] for index in range(len(replay_memory))] for field_index in range(5)]
    
    state_string = []
    next_state_string = []

    for i in range(len(state)):
        string_a = ""
        string_b = ""
        for j in range(200):
            string_a += str(int(state[i][j]))
            string_b += str(int(next_state[i][j]))
        state_string.append(string_a)
        next_state_string.append(string_b)
        
    dataset = pd.DataFrame(columns=['state', 'action', 'reward', 'next_state', 'done'])
    dataset['state'] = state_string
    dataset['action'] = action
    dataset['reward'] = reward
    dataset['next_state'] = next_state_string
    dataset['done'] = done
    
    dataset.to_csv('./checkpoints/dataset_tetris.csv', mode='w')

In [17]:
for episode in range(first_episode, 50000):
    obs = env.reset()
    state = obs.flatten()
    
    score = 0
    for step in range(1000):
        epsilon = max(1 - episode / 41900, 0.01)
        obs, reward, done = play_one_step(env, state, epsilon)
        if done:
            break
        else:
            score = env.get_score()
    
    if score > best_score:
        best_weights = model.get_weights()
        best_score = score
    print("\rEpisode: {}, Steps: {},  Score: {}, Best score: {}, eps: {:.3f}".format(episode, step + 1, score,best_score, epsilon), end="")
    
    if episode > 4190:
        training_step(batch_size)
        
    if episode % 1000 == 0:
        model.save_weights('./checkpoints/' + str(episode) + '_episodes_tetris')
        save_memory(episode)

        
model.set_weights(best_weights)

Episode: 5001, Steps: 160,  Score: 159, Best score: 159, eps: 0.881

ValueError: Error when checking input: expected dense_input to have shape (200,) but got array with shape (1,)

In [None]:
# clock = Clock()

In [None]:
# obs = env.reset()
# done = False

# while not done:
#     env.render()
    
#     state = obs.flatten()
#     action = epsilon_greedy_policy(state)
    
#     obs, reward, done = env.step(action)
    
#     clock.tick(16)
    
# env.close()

In [None]:
# model.save_weights('./checkpoints/10000_episodes_tetris')