In [2]:
import random
import os
import numpy as np
from collections import deque
from keras.models import Sequential
from keras.layers import Dense, Flatten
from keras.optimizers import Adam
from keras import backend as K
from envs.tetris_env import TetrisEnv

env = TetrisEnv()

learning_rate = 0.001

model = Sequential()
model.add(Dense(24, input_shape=env.state_size, activation='relu'))
model.add(Flatten())
model.add(Dense(24, activation='relu'))
model.add(Dense(env.action_size, activation='linear'))
model.compile(loss='mse', optimizer=Adam(lr=learning_rate))

memory = deque(maxlen=2000)
gamma = 0.95    
epsilon = 1.0  # exploration rate
epsilon_min = 0.01
epsilon_decay = 0.995

Using TensorFlow backend.


In [12]:
observetime = 500
ob = env.reset()
print(ob)
state = np.expand_dims(ob, axis=0) 
print(state)
print(state.shape)
done = False
for t in range(observetime):
    if np.random.rand() <= epsilon:
        action = np.random.randint(0, env.action_size, size=1)[0]
    else:
        Q = model.predict(state)          # Q-values predictions
        action = np.argmax(Q)             # Move with highest Q-value is the chosen one
    observation_new, reward, done, info = env.step(action)     # See state of the game, reward... after performing the action
    state_new = np.expand_dims(observation_new, axis=0)          # (Formatting issues)
    memory.append((state, action, reward, state_new, done))         # 'Remember' action and consequence
    state = state_new         # Update state
    if done:
        observation = env.reset()           # Restart game if it's finished
        state = np.expand_dims(observation, axis=0)     # (Formatting issues) Making the observation the first element of a batch of inputs 
        
print('Observing Finished')

[[0 1 1 0 0]
 [0 1 1 0 0]
 [0 0 0 0 0]
 [0 0 0 0 0]
 [0 0 0 0 0]
 [0 0 0 0 0]
 [0 0 0 0 0]
 [0 0 0 0 0]
 [0 0 0 0 0]
 [0 0 0 0 0]]
[[[0 1 1 0 0]
  [0 1 1 0 0]
  [0 0 0 0 0]
  [0 0 0 0 0]
  [0 0 0 0 0]
  [0 0 0 0 0]
  [0 0 0 0 0]
  [0 0 0 0 0]
  [0 0 0 0 0]
  [0 0 0 0 0]]]
(1, 10, 5)
Observing Finished


In [13]:
mb_size = 50
minibatch = random.sample(memory, mb_size)                              # Sample some moves
print(minibatch[10])
inputs_shape = (mb_size,) + state.shape[1:]
inputs = np.zeros(inputs_shape)
targets = np.zeros((mb_size, env.action_size))

for i in range(0, mb_size):
    state = minibatch[i][0]
    action = minibatch[i][1]
    reward = minibatch[i][2]
    state_new = minibatch[i][3]
    done = minibatch[i][4]
    
# Build Bellman equation for the Q function
    inputs[i:i+1] = np.expand_dims(state, axis=0)
    targets[i] = model.predict(state)
    Q_sa = model.predict(state_new)
    
    if done:
        targets[i, action] = reward
    else:
        targets[i, action] = reward + gamma * np.max(Q_sa)

# Train network to output the Q function
    model.train_on_batch(inputs, targets)
print('Learning Finished')

(array([[[0, 0, 0, 0, 0],
        [0, 0, 0, 0, 0],
        [0, 0, 0, 0, 0],
        [0, 1, 1, 0, 0],
        [0, 1, 1, 0, 0],
        [0, 0, 0, 0, 0],
        [0, 0, 0, 0, 0],
        [0, 0, 0, 0, 1],
        [0, 0, 0, 0, 1],
        [0, 0, 0, 1, 1]]]), 2, 0, array([[[0, 0, 0, 0, 0],
        [0, 0, 0, 0, 0],
        [0, 0, 0, 0, 0],
        [0, 0, 0, 0, 0],
        [0, 1, 1, 0, 0],
        [0, 1, 1, 0, 0],
        [0, 0, 0, 0, 0],
        [0, 0, 0, 0, 1],
        [0, 0, 0, 0, 1],
        [0, 0, 0, 1, 1]]]), False)
Learning Finished


In [14]:
observation = env.reset()
state = np.expand_dims(observation, axis=0)
done = False
tot_reward = 0.0
while not done:
    Q = model.predict(state)        
    action = np.argmax(Q)         
    observation, reward, done, info = env.step(action)
    state = np.expand_dims(observation, axis=0)   
    tot_reward += reward
print('Game ended! Total reward: {}'.format(reward))

Game ended! Total reward: 0
