In [2]:
import random
import numpy as np
import pandas as pd
import tensorflow as tf
#import matplotlib.pyplot as plt
from collections import deque
from time import time

In [3]:
# Fix the random seeds so that a fresh "Restart & Run All" reproduces the same run.
seed = 1546847731  # or try a new seed by using: seed = int(time())
random.seed(seed)     # Python's `random` module
np.random.seed(seed)  # NumPy's global generator, so any np.random call is repeatable as well
# NOTE(review): TensorFlow is imported but not seeded here; the right call depends on
# the installed TF version -- confirm and add it if TF randomness is used further down.
print('Seed: {}'.format(seed))

Seed: 1546847731



### Game design
The game the Q-agents need to learn is played on a board with 4 cells. The agent receives a +1 reward every time it fills a vacant cell, and a -1 penalty when it tries to fill an already filled cell. The game ends when the board is full.

In [4]:
class Game:
    """Board-filling game used as the learning environment.

    The board is a 1-D numpy array with ``board_size`` cells, where 0 marks a
    vacant cell and 1 marks a filled one.
    """

    # Class-level defaults; both are replaced per instance by __init__ / reset.
    board = None
    board_size = 0

    def __init__(self, board_size=4):
        """Create a game with ``board_size`` cells, all of them vacant."""
        self.board_size = board_size
        self.reset()

    def reset(self):
        """Start a new game by making every cell vacant (0)."""
        self.board = np.zeros(self.board_size)

    def play(self, cell):
        """Try to fill ``cell`` and report the outcome.

        Args:
            cell: index of the board cell to fill.

        Returns:
            tuple: ``(reward, game_over)``. ``reward`` is 1 when a vacant cell
            was filled and -1 when the cell was already filled. ``game_over``
            is True whenever no vacant cell is left on the board.
        """
        if self.board[cell] == 0:
            self.board[cell] = 1
            reward = 1
        else:
            reward = -1
        # Derive the flag from the board itself so that a move attempted on an
        # already-full board still reports the game as over.
        game_over = not np.any(self.board == 0)
        return (reward, game_over)

In [6]:
def state_to_str(state):
    """Render a board array as the string form of a list of ints, e.g. '[0, 1, 0, 1]'."""
    cells = [int(value) for value in state.tolist()]
    return str(cells)

# Every board configuration (each of the 4 cells is either 0 or 1) as a string key.
# np.ndindex yields the index tuples with the last position changing fastest,
# which is the same order four nested range(2) loops produce.
all_states = [state_to_str(np.array(cells)) for cells in np.ndindex(2, 2, 2, 2)]

print('All possible states:')
print('\n'.join(all_states))

All possible states:
[0, 0, 0, 0]
[0, 0, 0, 1]
[0, 0, 1, 0]
[0, 0, 1, 1]
[0, 1, 0, 0]
[0, 1, 0, 1]
[0, 1, 1, 0]
[0, 1, 1, 1]
[1, 0, 0, 0]
[1, 0, 0, 1]
[1, 0, 1, 0]
[1, 0, 1, 1]
[1, 1, 0, 0]
[1, 1, 0, 1]
[1, 1, 1, 0]
[1, 1, 1, 1]


In [7]:
# Game instance with the default board size (4 cells).
game = Game()

### Q Learning
Starting off with a table-based Q-learning algorithm.

In [8]:
# Number of games the agent plays while learning.
num_of_games = 2000
# Probability of taking a random action instead of the one with the highest Q-value.
epsilon = 0.1
# Discount factor applied to the best Q-value of the next state.
gamma = 1

In [9]:
# Initializing the Q-table: one column per board state (keyed by its string form)
# and one row per action (the index of the cell to fill, 0-3); every entry starts at 0.
q_table = pd.DataFrame(0, index=np.arange(4), columns=all_states)

In [10]:
# Letting the agent play and learn (table-based Q-learning with an epsilon-greedy policy).

r_list = []  # store the total reward of each game so we can plot it later

for g in range(num_of_games):
    game_over = False
    game.reset()
    total_reward = 0
    while not game_over:
        # String key of the board *before* the move; taken now because play() mutates the board.
        state_key = state_to_str(game.board)
        if random.random() < epsilon:
            # Explore: pick any cell uniformly at random.
            action = random.randint(0, game.board_size - 1)
        else:
            # Exploit: idxmax returns the first action holding the highest Q-value.
            action = q_table[state_key].idxmax()
        reward, game_over = game.play(action)
        total_reward += reward
        if game_over:  # terminal state: no future reward to add
            next_state_max_q_value = 0
        else:
            next_state_max_q_value = q_table[state_to_str(game.board)].max()
        # The entry is overwritten with the new estimate (no learning-rate blending).
        q_table.loc[action, state_key] = reward + gamma * next_state_max_q_value
    r_list.append(total_reward)

q_table

Unnamed: 0,"[0, 0, 0, 0]","[0, 0, 0, 1]","[0, 0, 1, 0]","[0, 0, 1, 1]","[0, 1, 0, 0]","[0, 1, 0, 1]","[0, 1, 1, 0]","[0, 1, 1, 1]","[1, 0, 0, 0]","[1, 0, 0, 1]","[1, 0, 1, 0]","[1, 0, 1, 1]","[1, 1, 0, 0]","[1, 1, 0, 1]","[1, 1, 1, 0]","[1, 1, 1, 1]"
0,4,3,3,2,3,2,2,0,2,1,1,-1,1,0,0,0
1,4,1,0,0,0,0,0,0,3,2,2,1,1,0,0,0
2,4,3,2,0,3,0,-1,0,3,2,1,0,2,1,0,0
3,4,2,3,0,0,0,0,0,3,0,2,0,2,0,1,0


In [11]:
# Let's verify that the agent indeed learned a correct strategy: for every board that
# still has a vacant cell, show the Q-values, the greedy action, and whether that
# action targets a vacant cell.
for cells in np.ndindex(2, 2, 2, 2):
    b = np.array(cells)
    if not (b == 0).any():
        continue  # full board: there is no move left to evaluate
    q_values = q_table[state_to_str(b)]
    action = q_values.idxmax()
    pred = q_values.tolist()
    is_vacant = b[action] == 0
    print('board: {b}\tpredicted Q values: {p} \tbest action: {a}\tcorrect action? {s}'
          .format(b=b, p=pred, a=action, s=is_vacant))

board: [0 0 0 0]	predicted Q values: [4, 4, 4, 4] 	best action: 0	correct action? True
board: [0 0 0 1]	predicted Q values: [3, 1, 3, 2] 	best action: 0	correct action? True
board: [0 0 1 0]	predicted Q values: [3, 0, 2, 3] 	best action: 0	correct action? True
board: [0 0 1 1]	predicted Q values: [2, 0, 0, 0] 	best action: 0	correct action? True
board: [0 1 0 0]	predicted Q values: [3, 0, 3, 0] 	best action: 0	correct action? True
board: [0 1 0 1]	predicted Q values: [2, 0, 0, 0] 	best action: 0	correct action? True
board: [0 1 1 0]	predicted Q values: [2, 0, -1, 0] 	best action: 0	correct action? True
board: [0 1 1 1]	predicted Q values: [0, 0, 0, 0] 	best action: 0	correct action? True
board: [1 0 0 0]	predicted Q values: [2, 3, 3, 3] 	best action: 1	correct action? True
board: [1 0 0 1]	predicted Q values: [1, 2, 2, 0] 	best action: 1	correct action? True
board: [1 0 1 0]	predicted Q values: [1, 2, 1, 2] 	best action: 1	correct action? True
board: [1 0 1 1]	predicted Q values: [-1, 