# Simulation

In [1]:
# %load ./../../../games/connect-four/connect-four.py
import numpy as np

class ConnectFourSimulator:
	"""Creates a connect-4 board and simulates it, returning states and rewards for any taken action.

	The creates board is a 6 x 7 (rows x cols) array. Empty fields are denoted by 0.
	Tokens placed by player one are denoted by '1' and player two uses '-1'.
	Every field is part of the state and has it's own index, simply counting from 0 to 41 along the rows
	like so [
		[0, 1, 2, 3, 4, 5, 6],
		[7, 8, 9, 10, 11, 12, 13],
		...
		[35, 36, 37, 38, 39, 40, 41]
	]
	"""
	def __init__(self):
		self.width = 7
		self.height = 6
		self.board = np.zeros(shape=(self.height, self.width))
		self.PLAYER1 = 1
		self.PLAYER2 = -1
		self.DRAW = 0
		self.current_player = self.PLAYER1
		self.__game_over = False

	def take_action(self, action):
		"""Executes the action and returns the next state and the received reward."""
		active_player = self.current_player
		inactive_player = self.__negated_player(active_player)
		if not self.__action_is_valid(action):
			return self.__game_over, self.board, active_player, -2, inactive_player, 0

		self.__play_move(action)

		self.__game_over = self.__game_is_over(action)
		if self.__game_over:
			winner = self.__winner(action)
			if winner == self.DRAW:
				return self.__game_over, self.board, active_player, 0, inactive_player, 0
			elif winner == self.PLAYER1:
				return self.__game_over, self.board, active_player, 10, inactive_player, -10
			else:
				return self.__game_over, self.board, active_player, -10, inactive_player, 10

		return self.__game_over, self.board, active_player, 0, inactive_player, 0

	def print_board(self):
		print(self.board)

	def __play_move(self, action):
		"""Takes an action and executes it."""
		x, y = self.__coordinates_from_action(action)
		self.board[y][x] = self.current_player
		self.current_player = self.__negated_player(self.current_player)

	def __action_is_valid(self, action):
		"""Checks if the intended action is a valid one or if it breaks the rules of the game."""
		if action < 0:
			return False
		x, y = self.__coordinates_from_action(action)
		if x >= self.width or y >= self.height:
			return False

		height_x = self.__column_height(x)

		if y != height_x:
			return False
		return True

	def __column_height(self, x):
		"""Returns the height of a column which is equal to the amount of tokens placed."""
		column = self.board[:, x]
		return np.count_nonzero(column)

	def __game_is_over(self, last_action):
		"""Returns True if the game is over and False otherwise."""
		if np.count_nonzero(self.board) == 0:
			return True

		lines = self.__extract_lines(last_action)

		for line in lines:
			if self.__winner_in_line(line) != 0:
				return True

		return False

	def __extract_lines(self, last_action):
		"""Extracts the horizontal, vertical and the diagonal lines going through the last action"""
		x, y = self.__coordinates_from_action(last_action)

		row = self.board[y]
		column = self.board[:, x]
		top_down_diagonal = self.board.diagonal(x - y)

		mirrored_x = self.width - 1 - x
		bot_up_diagonal = np.fliplr(self.board).diagonal(mirrored_x - y)

		return row, column, top_down_diagonal, bot_up_diagonal

	def __winner(self, last_action):
		"""Returns the winner's number or 0 if the game resulted in a draw (Requires the game to have ended)."""
		lines = self.__extract_lines(last_action)

		for line in lines:
			winner = self.__winner_in_line(line)
			if winner != 0:
				return winner

		return 0

	def __winner_in_line(self, line):
		"""Checks if a line contains a winner and returns his number if yes and 0 otherwise."""
		token_sum = 0
		for token in line:
			token_sum += token
			if token_sum == 4 * self.PLAYER1:
				return self.PLAYER1
			if token_sum == 4 * self.PLAYER2:
				return self.PLAYER2
			if token_sum < 0 < token or token_sum > 0 > token:
				token_sum = 0
		return 0

	def __coordinates_from_action(self, action):
		"""Translates an action into (x, y) / (column, row) coordinates."""
		x = action % self.width
		y = action // self.width
		return x, y

	def __negated_player(self, player):
		"""Returns the player not passed to the function (Player1 if Player2 is passed and the other way around)."""
		return self.PLAYER2 if self.current_player == self.PLAYER1 else self.PLAYER1


In [2]:
game = ConnectFourSimulator()

val = input()
while val != "q":
    game_over, board, _, _, _, _ = game.take_action(int(val))
    print(game_over)
    print(board)
    print("------------------------------------")
    val = input()

print(game.take_action(3))
print(game.take_action(4))
print(game.take_action(10))
print(game.take_action(5))
print(game.take_action(17))
print(game.take_action(6))
print(game.take_action(24))

# Agent

In [None]:
import torch.nn as nn
import torch.nn.functional as F

class Model(nn.Module):
    def __init__(self):
        super(Model, self).__init__()
        self.fc1 = nn.Linear(42, 64)
        self.fc2 = nn.Linear(64, 64)
        self.fc3 = nn.Linear(42)
        
    def forward(self, x):
        x = F.sigmoid(self.fc1(x))
        x = F.sigmoid(self.fc2(x))
        x = self.fc3(x)
        return x

In [3]:
import random
import tensorflow as tf
import numpy as np

import torch.optim as optim
'''
# create your optimizer
optimizer = optim.SGD(net.parameters(), lr=0.01)

# in your training loop:
optimizer.zero_grad()   # zero the gradient buffers
output = net(input)
loss = criterion(output, target)
loss.backward()
optimizer.step()    # Does the update
'''
class DeepQTensorflowAgent:
    def __init__(self, learning_rate=0.1, discount=0.95, exploration_rate=1.0, iterations=10_000):
        self.q_table = np.zeros(shape=(42, 42))
        self.learning_rate = learning_rate
        self.discount = discount
        self.exploration_rate = exploration_rate
        self.exploration_delta = exploration_rate / iterations
        
        self.input_count = 42
        self.output_count = 42
        
        self.session = tf.Session()
        self.define_model()
        self.session.run(self.initializer)
    
    def define_model(self):
        self.model_input = tf.placeholder(dtype=tf.float32, shape=[ None, self.input_count ])
        
        fc1 = tf.layers.dense(self.model_input, 16, activation=tf.sigmoid, kernel_initializer=tf.constant_initializer(np.zeros((self.input_count, 5))))
        fc2 = tf.layers.dense(fc1, 16, activation=tf.sigmoid, kernel_initializer=tf.constant_initializer(np.zeros((6, self.output_count))))
        
        self.model_output = tf.layers.dense(fc2, self.output_count)
        
        self.target_output = tf.placeholder(shape=[ None, self.output_count ], dtype=tf.float32)
        loss = tf.losses.mean_squared_error(self.target_output, self.model_output)
        self.optimizer = tf.train.GradientDescentOptimizer(learning_rate=self.learning_rate).minimize(loss)
        
        self.initializer = tf.global_variables_initializer()
    
    def get_Q(self, state):
        return self.session.run(self.model_output, feed_dict={ self.model_input: [state.flatten()] })[0]# Batching!! Dimensions!
        
    def next_action(self, state):
        if random.random() < self.exploration_rate:
            return self.random_action()
        else:
            return self.greedy_action(state)
        
    def random_action(self):
        return random.randrange(0, 42) # Maybe change the probability distribution?
    
    def greedy_action(self, state):
        return np.argmax(self.get_Q(state))
    
    def update(self, old_state, new_state, action, reward):
        self.train(old_state, new_state, action, reward)
        # TODO: Maybe change algorithm?
        if self.exploration_rate > 0:
            self.exploration_rate = max(0.2, self.exploration_rate - self.exploration_delta)
        
    def train(self, old_state, new_state, action, reward):
        old_state_values = self.get_Q(old_state)
        new_state_values = self.get_Q(new_state)
        
        new_reward = reward + self.discount * np.amax(new_state_values)
        old_state_values[action] = new_reward
        
        training_input = [old_state.flatten()]
        target_output = [ old_state_values ]
        training_data = { self.model_input: training_input, self.target_output: target_output }
        
        self.session.run(self.optimizer, feed_dict=training_data)
        

# Orchestration

In [4]:
iterations = 10000

In [5]:
deep_Q_learning = DeepQTensorflowAgent(iterations=iterations)
deep_Q_dummy = DeepQTensorflowAgent(iterations=iterations)

Instructions for updating:
Use keras.layers.dense instead.
Instructions for updating:
Colocations handled automatically by placer.
Instructions for updating:
Use tf.cast instead.


In [6]:
game = ConnectFourSimulator()

In [7]:
#print("Input:", input())

In [8]:
invalids = []
invalid = 0
for step in range(10000):
    if step % 250 == 0:
        print("Still running at iteration %d", step)
        
    
    old_state = game.board
    action = deep_Q_learning.next_action(old_state)
    game_over, new_state, cur_player, cur_reward, _, _ = game.take_action(action)
    
    if (step + 1) % 100 == 0:
            invalids.append(invalid)
            invalid = 0
    if cur_reward < 0: # Invalid action
        deep_Q_learning.update(old_state, new_state, action, cur_reward)
        invalid += 1
        continue
        
    #print(game_over)
    #print(new_state)
    #print(cur_player)
    #print(cur_reward)
    
    if game_over:
        deep_Q_learning.update(old_state, new_state, action, cur_reward)
        game = ConnectFourSimulator()
        continue
    
    next_action = deep_Q_dummy.next_action(new_state)
    game_over, next_state, _, active_reward, passive_player, passive_reward = game.take_action(action)
    
    counting_stars = 0
    while active_reward < 0: # Invalid move (infinite loop possible?)
        next_action = deep_Q_dummy.next_action(new_state)
        game_over, next_state, _, active_reward, passive_player, passive_reward = game.take_action(next_action)
        counting_stars += 1
        if counting_stars % 1000 == 0:
            print("Counting:", counting_stars)
            print("Using action:", action)
    
    #print(game_over)
    #print(next_state)
    #print(passive_player)
    #print(passive_reward)
    
    if game_over:
        deep_Q_learning.update(old_state, new_state, action, passive_reward)
        game = ConnectFourSimulator()
    else:
        deep_Q_learning.update(old_state, new_state, action, cur_reward)
    # Missing: check for invalid move
    # Maybe add passive mode to game (for the 2nd player)
    #print("----------------------------")
        
print("Invalids:", invalids)
print("Total:", sum(invalids))

Still running at iteration %d 0
Still running at iteration %d 250
Still running at iteration %d 500
Still running at iteration %d 750
Still running at iteration %d 1000
Still running at iteration %d 1250
Still running at iteration %d 1500
Still running at iteration %d 1750
Still running at iteration %d 2000
Still running at iteration %d 2250
Still running at iteration %d 2500
Still running at iteration %d 2750
Still running at iteration %d 3000
Still running at iteration %d 3250
Still running at iteration %d 3500
Still running at iteration %d 3750
Still running at iteration %d 4000
Still running at iteration %d 4250
Still running at iteration %d 4500
Still running at iteration %d 4750
Still running at iteration %d 5000
Still running at iteration %d 5250
Still running at iteration %d 5500
Still running at iteration %d 5750
Still running at iteration %d 6000
Still running at iteration %d 6250
Still running at iteration %d 6500
Still running at iteration %d 6750
Still running at iteration