# Simulation

In [1]:
# %load ./../../../games/connect-four/connect-four.py
import numpy as np

class ConnectFourSimulator:
    """Creates a connect-4 board and simulates it, returning states and rewards for any taken action.

    The created board is a 6 x 7 (rows x cols) array. Empty fields are denoted by 0.
    Tokens placed by player one are denoted by 1 and player two uses -1.
    Every field is part of the state and has its own index, simply counting from 0 to 41 along the rows
    like so [
        [0, 1, 2, 3, 4, 5, 6],
        [7, 8, 9, 10, 11, 12, 13],
        ...
        [35, 36, 37, 38, 39, 40, 41]
    ]
    An action is the index of the field the token is placed on. Row 0 acts as
    the bottom of the board: at the start only the fields 0..6 are playable and
    playing field ``a`` unlocks field ``a + width`` directly above it.
    """

    def __init__(self):
        self.width = 7
        self.height = 6
        self.board = np.zeros(shape=(self.height, self.width))
        self.PLAYER1 = 1
        self.PLAYER2 = -1
        self.DRAW = 0
        self.current_player = self.PLAYER1
        # Field indices that may currently be played (one per non-full column).
        self.valid_actions = list(range(self.width))
        self.__game_over = False

    def take_action(self, action):
        """Executes the action and returns the next state and the received reward.

        Returns the tuple ``(game_over, board, active_player, active_reward,
        inactive_player, inactive_reward)`` where ``active_player`` is the
        player who made this move.

        * invalid action: the board is left untouched, it stays the same
          player's turn and the active player receives -2.
        * winning move: the active player receives 1, the inactive player -1.
        * draw or ordinary move: both rewards are 0.

        ``board`` is the live internal array, not a copy; callers that store it
        have to copy it themselves.
        """
        active_player = self.current_player
        inactive_player = self.__negated_player(active_player)
        if not self.__action_is_valid(action):
            return self.__game_over, self.board, active_player, -2, inactive_player, 0

        # Normalise tensor / numpy scalars so valid_actions stays a list of plain ints.
        action = int(action)
        self.__play_move(action)

        self.__game_over = self.__game_is_over(action)
        if self.__game_over and self.__winner(action) != self.DRAW:
            # Only the player who just moved can have completed a line.
            return self.__game_over, self.board, active_player, 1, inactive_player, -1

        return self.__game_over, self.board, active_player, 0, inactive_player, 0

    def print_board(self):
        """Prints the board with 'X' for player one, 'O' for player two and '-' for empty fields."""
        symbols = np.full(self.board.shape, "-")
        symbols[self.board == self.PLAYER1] = "X"
        symbols[self.board == self.PLAYER2] = "O"
        print(symbols)

    def __play_move(self, action):
        """Places the current player's token, updates the valid actions and passes the turn."""
        x, y = self.__coordinates_from_action(action)
        self.board[y][x] = self.current_player

        # The field above the played one becomes playable, the played one is used up.
        field_above = action + self.width
        if field_above < self.width * self.height:
            self.valid_actions.append(field_above)
        self.valid_actions.remove(action)

        self.current_player = self.__negated_player(self.current_player)

    def __action_is_valid(self, action):
        """Checks if the intended action is a valid one or if it breaks the rules of the game."""
        return action in self.valid_actions

    def __column_height(self, x):
        """Returns the height of a column which is equal to the amount of tokens placed."""
        column = self.board[:, x]
        return np.count_nonzero(column)

    def __game_is_over(self, last_action):
        """Returns True if the board is full or the last action completed a line, False otherwise."""
        if np.count_nonzero(self.board) >= self.width * self.height:
            return True
        return self.__winner(last_action) != self.DRAW

    def __extract_lines(self, last_action):
        """Extracts the horizontal, vertical and the diagonal lines going through the last action"""
        x, y = self.__coordinates_from_action(last_action)

        row = self.board[y]
        column = self.board[:, x]
        top_down_diagonal = self.board.diagonal(x - y)

        # The anti-diagonal is the main diagonal of the horizontally mirrored board.
        mirrored_x = self.width - 1 - x
        bot_up_diagonal = np.fliplr(self.board).diagonal(mirrored_x - y)

        return row, column, top_down_diagonal, bot_up_diagonal

    def __winner(self, last_action):
        """Returns the winner's number or 0 if no line through the last action is won."""
        for line in self.__extract_lines(last_action):
            winner = self.__winner_in_line(line)
            if winner != self.DRAW:
                return winner
        return self.DRAW

    def __winner_in_line(self, line):
        """Checks if a line contains four equal, consecutive tokens.

        Returns the owning player's number if so and 0 otherwise. The run is
        tracked by owner and length instead of by a running sum, so an empty
        field or an opposing token always starts a new run.
        """
        run_owner = 0
        run_length = 0
        for token in line:
            if token != 0 and token == run_owner:
                run_length += 1
            else:
                run_owner = token
                run_length = 0 if token == 0 else 1
            if run_length >= 4:
                return self.PLAYER1 if run_owner == self.PLAYER1 else self.PLAYER2
        return 0

    def __coordinates_from_action(self, action):
        """Translates an action into (x, y) / (column, row) coordinates."""
        x = action % self.width
        y = action // self.width
        return x, y

    def __negated_player(self, player):
        """Returns the player not passed to the function (Player1 if Player2 is passed and the other way around)."""
        return self.PLAYER2 if player == self.PLAYER1 else self.PLAYER1


In [2]:
# Manual smoke test of the simulator: every line read from stdin is played as
# an action (field index) until "q" is entered.
game = ConnectFourSimulator()

val = input()
while val != "q":
    game_over, board, _, _, _, _ = game.take_action(int(val))
    print(game_over)
    print(board)
    print("------------------------------------")
    val = input()

# Scripted moves, played on the same game object as the interactive loop above.
# If nothing was played interactively, player one stacks the fields
# 3, 10, 17 and 24 (all in column 3) while player two plays 4, 5 and 6.
print(game.take_action(3))
print(game.take_action(4))
print(game.take_action(10))
print(game.take_action(5))
print(game.take_action(17))
print(game.take_action(6))
print(game.take_action(24))

# Agent

In [3]:
import torch.nn as nn
import torch.nn.functional as F

class Model(nn.Module):
    """Fully connected network with two sigmoid hidden layers and a linear output.

    With the default sizes it maps a flattened 6 x 7 board (42 values) to one
    output per board field (42 values).

    Args:
        input_size: Number of input features.
        hidden_size: Width of both hidden layers.
        output_size: Number of outputs.
    """

    def __init__(self, input_size=42, hidden_size=64, output_size=42):
        super().__init__()
        self.fc1 = nn.Linear(input_size, hidden_size)
        self.fc2 = nn.Linear(hidden_size, hidden_size)
        self.fc3 = nn.Linear(hidden_size, output_size)

    def forward(self, x):
        """Returns the raw (unbounded) outputs for a batch of inputs ``x``."""
        # NOTE: `torch` itself is imported by a later cell of this notebook; that
        # cell has to run before the first forward pass.
        x = torch.sigmoid(self.fc1(x))
        x = torch.sigmoid(self.fc2(x))
        return self.fc3(x)

In [4]:
import random

import torch
import torch.optim as optim

class DeepQPytorchAgent:
    """Epsilon-greedy deep Q-learning agent backed by a PyTorch model.

    Args:
        learning_rate: Learning rate of the Adam optimizer.
        discount: Discount factor applied to the best next-state Q-value.
        exploration_rate: Initial probability of choosing a random action.
        iterations: Number of ``update`` calls over which the exploration rate
            would decay linearly from its initial value to zero.
        trained_model: Optional model to use instead of a fresh ``Model()``.
        min_exploration_rate: Floor below which the decay never pushes the
            exploration rate.
    """

    def __init__(self, learning_rate=0.0001, discount=0.95, exploration_rate=1.0, iterations=10_000, trained_model=None, min_exploration_rate=0.2):
        # Not used by any method of this class; kept for backward compatibility.
        self.q_table = np.zeros(shape=(42, 42))
        self.learning_rate = learning_rate
        self.discount = discount
        self.exploration_rate = exploration_rate
        self.exploration_delta = exploration_rate / iterations
        self.min_exploration_rate = min_exploration_rate

        self.input_count = 42
        self.output_count = 42

        self.define_model(trained_model)

    def define_model(self, trained_model):
        """Moves the model to the GPU if one is available and creates its optimizer."""
        self.device = torch.device("cuda:0" if torch.cuda.is_available() else "cpu")
        # Compare against None explicitly: modules defining __len__ (e.g. an
        # empty nn.Sequential) are falsy.
        model = Model() if trained_model is None else trained_model
        self.model = model.to(self.device)
        self.optimizer = optim.Adam(self.model.parameters(), lr=self.learning_rate)

    def get_Q(self, state_batch):
        """Returns the model output for a batch of states."""
        return self.model(state_batch)

    def next_action(self, state, valid_actions):
        """Returns a random valid action with probability ``exploration_rate``, else the greedy one."""
        if random.random() < self.exploration_rate:
            return self.random_action(valid_actions)
        return self.greedy_action(state, valid_actions)

    def random_action(self, valid_actions):
        """Returns a uniformly chosen element of ``valid_actions`` as a plain int.

        Raises:
            ValueError: If ``valid_actions`` is empty.
        """
        candidates = [int(action) for action in valid_actions]
        if not candidates:
            raise ValueError("No valid actions available")
        return random.choice(candidates)

    def greedy_action(self, state_batch, valid_actions):
        """Returns the valid action with the highest Q-value for the first state of the batch.

        Invalid actions are masked with -inf, so a valid action wins even when
        all valid Q-values are very negative. The result is a plain int.

        Raises:
            ValueError: If ``valid_actions`` is empty.
        """
        if len(valid_actions) == 0:
            raise ValueError("No valid actions available")
        with torch.no_grad():
            Q_values = self.get_Q(state_batch)[0].clone()
        Q_values = self.normalized_Q(Q_values, valid_actions, invalid_value=float("-inf"))
        return int(torch.argmax(Q_values).item())

    def normalized_Q(self, Q_values, valid_actions, invalid_value=-1.0):
        '''Takes a single Q value array and sets invalid actions to ``invalid_value`` (default -1).

        The tensor is modified in place and returned. Every index, including
        the last one, is masked unless it appears in ``valid_actions``.
        '''
        invalid = torch.ones(Q_values.shape[0], dtype=torch.bool, device=Q_values.device)
        valid_index = [int(action) for action in valid_actions]
        if valid_index:
            invalid[torch.tensor(valid_index, dtype=torch.long, device=Q_values.device)] = False
        Q_values[invalid] = invalid_value
        return Q_values

    def update(self, old_states, new_states, actions, rewards, valid_actions_batch):
        """Trains on one batch and then decays the exploration rate by one step."""
        self.train(old_states, new_states, actions, rewards, valid_actions_batch)
        # Only decay while above the floor, so a rate that already is below it
        # (e.g. 0 for evaluation) is never raised.
        if self.exploration_rate > self.min_exploration_rate:
            self.exploration_rate = max(self.min_exploration_rate, self.exploration_rate - self.exploration_delta)

    def train(self, old_states, next_states, actions, rewards, valid_actions_batch):
        """Performs one optimizer step towards ``reward + discount * max Q(next_state)``.

        Only the Q-value of the action taken in each sample gets a new target;
        all other outputs are regressed onto their current values.
        ``valid_actions_batch[i]`` lists the actions left unmasked when taking
        the maximum over the next-state Q-values of sample ``i``.
        """
        old_state_values = self.get_Q(old_states)
        with torch.no_grad():
            next_state_values = self.get_Q(next_states)

        for index, valid_actions in enumerate(valid_actions_batch):
            # Indexing returns a view, so the masking happens inside next_state_values.
            self.normalized_Q(next_state_values[index], valid_actions)

        new_rewards = rewards + self.discount * torch.max(next_state_values, dim=1)[0]

        updated_state_values = old_state_values.detach().clone()
        sample_index = torch.arange(updated_state_values.shape[0], device=updated_state_values.device)
        action_index = torch.as_tensor(actions, device=updated_state_values.device).long()
        updated_state_values[sample_index, action_index] = new_rewards.to(updated_state_values.dtype)

        self.optimizer.zero_grad()   # zero the gradient buffers
        loss = F.smooth_l1_loss(old_state_values, updated_state_values)
        loss.backward()
        self.optimizer.step()    # Does the update

# Orchestration

In [None]:
from collections import namedtuple

# One stored experience: the state before the move, the state after it, the
# action taken, the reward received and the actions that were valid.
Transition = namedtuple(
    'Transition',
    ('old_state', 'next_state', 'action', 'reward', 'valid_actions'),
)


class ReplayMemory:
    """Fixed-size ring buffer of ``Transition`` records.

    Once ``capacity`` entries are stored, each new entry overwrites the oldest one.
    """

    def __init__(self, capacity):
        self.capacity = capacity
        self.memory = []
        self.position = 0

    def push(self, *args):
        """Stores ``Transition(*args)`` in the slot the write cursor points at."""
        slot = self.position
        # Grow the list until the capacity is reached; afterwards slots are reused.
        if len(self.memory) < self.capacity:
            self.memory.append(None)
        self.memory[slot] = Transition(*args)
        self.position = (slot + 1) % self.capacity

    def sample(self, batch_size):
        """Returns ``batch_size`` distinct stored transitions chosen at random."""
        return random.sample(self.memory, batch_size)

    def __len__(self):
        """Returns the number of transitions currently stored."""
        return len(self.memory)

In [None]:
def transition(game, active, passive):
    """Plays one move of ``active`` followed by one valid reply of ``passive``.

    Returns the tuple ``(game_over, old_state, next_state, action, reward,
    valid_actions)``:

    * ``old_state``: flattened board before the active agent's move.
    * ``next_state``: flattened board right after the active agent's move
      (the passive agent's reply is not part of it).
    * ``action``: whatever ``active.next_action`` returned.
    * ``reward``: the active agent's reward for its own move, or, if the
      passive agent's reply ended the game, the reward ``take_action``
      reported for the player who did not make that reply.
    * ``valid_actions``: the actions that were valid before the active move.
    """
    # Snapshot the board and the valid actions before they are mutated by take_action.
    old_state = np.copy(game.board)
    old_state = torch.tensor(old_state.flatten(), device=active.device).float()
    valid_actions = torch.tensor(game.valid_actions, device=active.device).clone().long()

    action = active.next_action(torch.unsqueeze(old_state, dim=0), valid_actions)
    #action = torch.tensor(action, device=active.device).long()

    game_over, next_state, _, reward, _, _ = game.take_action(action)
    # flatten() copies, so next_state is decoupled from the live game board.
    next_state = torch.tensor(next_state.flatten(), device=active.device).float()
    reward = torch.tensor(reward, device=active.device).float()

    if game_over:
        return True, old_state, next_state, action, reward, valid_actions

    # An invalid move (negative reward) does not change the board; report it
    # without letting the passive agent reply.
    if reward < 0:
        return False, old_state, next_state, action, reward, valid_actions

    # Let the passive agent retry until it makes a valid move.
    passive_reward = -1
    counting_stars = 0
    while passive_reward < 0:
        passive_action = passive.next_action(torch.unsqueeze(next_state, dim=0), game.valid_actions)
        game_over, _, _, passive_reward, _, cur_reward = game.take_action(passive_action)

        # Debug output to notice a passive agent that keeps choosing invalid moves.
        counting_stars += 1
        if counting_stars % 1000 == 0:
                print("Counting:", counting_stars)

    cur_reward = torch.tensor(cur_reward, device=active.device).float()
    if game_over:
        return True, old_state, next_state, action, cur_reward, valid_actions
    return False, old_state, next_state, action, reward, valid_actions

In [None]:
def optimize_model(active, passive, memory, batch_size=128):
    """Runs one training step of ``active`` on a random minibatch from ``memory``.

    Nothing happens (``None`` is returned) while ``memory`` holds fewer than
    ``batch_size`` transitions. ``passive`` is accepted but not used.
    Otherwise the result of ``active.update`` is returned.
    """
    if len(memory) < batch_size:
        return

    # Turn the list of Transitions into one Transition whose fields are tuples.
    columns = Transition(*zip(*memory.sample(batch_size)))
    device = active.device

    old_states = torch.stack(columns.old_state, dim=0)
    next_states = torch.stack(columns.next_state, dim=0)
    taken_actions = torch.tensor(columns.action, device=device)
    received_rewards = torch.tensor(columns.reward, device=device)

    # The valid actions stay a tuple of per-sample entries; they are not stacked.
    return active.update(old_states, next_states, taken_actions, received_rewards, columns.valid_actions)

In [None]:
# Fresh game instance (the training loop below creates its own game per epoch).
game = ConnectFourSimulator()

In [None]:
#%matplotlib notebook
import matplotlib.pyplot as plt

In [None]:
# Training configuration and the objects shared by the cells below.
epochs = 150 # Number of games to play
batch_size = 128  # Transitions sampled from the replay memory per training step
memory = ReplayMemory(10000)  # Keeps at most the 10000 most recent transitions
# `iterations` only controls how fast the exploration rate decays per update call.
active = DeepQPytorchAgent(iterations=epochs*batch_size*20)
passive = DeepQPytorchAgent(iterations=epochs*batch_size*20)

In [None]:
# Hand-built position: three tokens of player one (1) in column 3 and three
# tokens of player two (-1) in column 4, occupying rows 0 to 2.
example_board = np.array([
    [0,0,0,1,-1,0,0],
    [0,0,0,1,-1,0,0],
    [0,0,0,1,-1,0,0],
    [0,0,0,0,0,0,0],
    [0,0,0,0,0,0,0],
    [0,0,0,0,0,0,0]
])
# Flatten to the 42-element float vector the model expects.
example_board = torch.tensor(example_board.flatten(), device=active.device).float()

In [None]:
# Inspect the untrained agent on the example position (batch of size one).
y_batch = torch.unsqueeze(example_board, dim=0)
print(y_batch)
print(active.next_action(y_batch, [0, 1, 2, 3, 4, 5, 6]))
print(active.get_Q(y_batch))
# Best action after masking everything that is not in the given valid actions.
print(torch.max(active.normalized_Q(active.get_Q(y_batch)[0], list(range(7))), 0)[1])

tensor([[ 0.,  0.,  0.,  1., -1.,  0.,  0.,  0.,  0.,  0.,  1., -1.,  0.,  0.,
          0.,  0.,  0.,  1., -1.,  0.,  0.,  0.,  0.,  0.,  0.,  0.,  0.,  0.,
          0.,  0.,  0.,  0.,  0.,  0.,  0.,  0.,  0.,  0.,  0.,  0.,  0.,  0.]],
       device='cuda:0')
3
tensor([[ 9.0921e-01, -2.6158e-01, -9.2689e-02,  4.4667e-01,  2.0170e-01,
         -1.0691e-01,  1.4898e-02, -5.5755e-02, -5.1560e-02, -7.2132e-01,
         -1.6313e-01,  2.0654e-03, -5.5427e-02, -6.0464e-01, -8.1798e-04,
         -6.1640e-01,  1.1654e-01,  3.3788e-01,  4.7357e-01, -4.5214e-01,
         -3.1527e-02, -6.9223e-01, -1.3575e-01,  1.7507e-01,  3.2202e-02,
         -1.0946e-01, -4.6527e-01, -6.0050e-01,  1.5571e-01, -8.6690e-03,
         -8.8917e-01, -5.1978e-01, -2.1948e-01,  4.3289e-01,  1.6174e-01,
         -5.1315e-02,  3.3754e-01,  9.2839e-02, -2.2890e-01,  4.0602e-02,
         -2.4755e-01,  4.2710e-01]], device='cuda:0', grad_fn=<AddmmBackward>)
tensor(0, device='cuda:0')


In [None]:
import time

# Using memory replay
# total_rewards[epoch] holds the running reward sum of that game, `total` the
# running sum over all games; both start with a placeholder 0.
total_rewards = [ [torch.tensor(0)] for epoch in range(epochs) ]
total = [torch.tensor(0)]
print(total_rewards)
start = int(round(time.time() * 1000))  # wall-clock start in milliseconds
for epoch in range(epochs):
    #invalids = []
    #invalid = 0
    #for iteration in range(1, iterations + 1):
    print("Epoch:", epoch)
    game_over = False
    game = ConnectFourSimulator()  # one epoch is one complete game
    while not game_over:
        # Train on a replay batch, then copy the active weights to the opponent
        # so the agent always plays against its current self.
        optimize_model(active, passive, memory, batch_size)
        passive.model.load_state_dict(active.model.state_dict())

        game_over, old_state, next_state, action, reward, valid_actions = transition(game, active, passive)

        memory.push(old_state, next_state, action, reward, valid_actions)
        total_rewards[epoch].append(total_rewards[epoch][-1] + reward)
        total.append(total[-1] + reward)
end = int(round(time.time() * 1000))
print("Time taken:", (end - start))
print("Time taken in sec:", (end - start) / 1000)
# Time without batching: 657sec (10), 95 (5), 399 (5)
# Time with batching: 12 (5)

[[tensor(0)], [tensor(0)], [tensor(0)], [tensor(0)], [tensor(0)], [tensor(0)], [tensor(0)], [tensor(0)], [tensor(0)], [tensor(0)], [tensor(0)], [tensor(0)], [tensor(0)], [tensor(0)], [tensor(0)], [tensor(0)], [tensor(0)], [tensor(0)], [tensor(0)], [tensor(0)], [tensor(0)], [tensor(0)], [tensor(0)], [tensor(0)], [tensor(0)], [tensor(0)], [tensor(0)], [tensor(0)], [tensor(0)], [tensor(0)], [tensor(0)], [tensor(0)], [tensor(0)], [tensor(0)], [tensor(0)], [tensor(0)], [tensor(0)], [tensor(0)], [tensor(0)], [tensor(0)], [tensor(0)], [tensor(0)], [tensor(0)], [tensor(0)], [tensor(0)], [tensor(0)], [tensor(0)], [tensor(0)], [tensor(0)], [tensor(0)], [tensor(0)], [tensor(0)], [tensor(0)], [tensor(0)], [tensor(0)], [tensor(0)], [tensor(0)], [tensor(0)], [tensor(0)], [tensor(0)], [tensor(0)], [tensor(0)], [tensor(0)], [tensor(0)], [tensor(0)], [tensor(0)], [tensor(0)], [tensor(0)], [tensor(0)], [tensor(0)], [tensor(0)], [tensor(0)], [tensor(0)], [tensor(0)], [tensor(0)], [tensor(0)], [tensor(0)]

In [None]:
# Convert the accumulated 0-dim tensors to plain Python numbers.
total_rewards = [ [ val.tolist() for val in rewards ] for rewards in total_rewards ]
total = [ val.tolist() for val in total ]

In [None]:
# `total` starts with a placeholder 0, so it holds one entry more than the
# number of recorded transitions.
print("Average moves to finish:", (len(total) - 1) / epochs)

In [None]:
# Plot the cumulative reward of the active agent over all recorded transitions.
plt.figure()
#plt.plot(total_rewards[0])
plt.plot(total)

# `invalids` is only assigned in commented-out code of the training loop, so
# printing it here raised a NameError; the print was removed.

In [None]:
# The error at the end comes from the network predicting a wrong action; since exploration is very low by then, it almost
# always repeats that same wrong action. It should still learn from this, though (maybe replay is necessary?).

# Notes
- Use memory replay --> DONE
- Maybe higher rewards needed for backpropagation of Q values?
- View reward function by playing vs the network
- View network output for certain states
<br>
<br>
- Do I even backpropagate the reward to other states than the winning one in any way?
- Maybe the problem is too few games (not enough possibilities learned) -> More iterations, e.g. 10_000 games instead of iterations
- Learning rate?
<br>
<br>
- Rework memory replay batch size and epochs analog to pytorch tutorial
- Plot metrics (e.g. total reward every iteration)
- Rework code --> Readability and reusability
- Maybe rework greedy policy
- Test the pytorch agent on the dungeon example --> DONE: Works
- Try increasing the performance (For running in the cloud) -> Use timer
- Maybe no punishment for invalid moves?
- Pass possible moves to network?
- Only give out copies of the state... --> FIXED (This literally ruined every single state in the memory...)
- Copy pytorch tensors via .clone().detach() (maybe a more efficient way exists as well?)
<br>
<br>
- How to choose rewards and how does the agent learn the rules (punishment for invalid moves?)
- How much training is needed for a game?
- Evaluation tactics: Total reward
- Model too big?
- Don't copy model to update agent? --> Constantly creating optimizer and agent again and again
- Only learning negative values atm --> Why?
- Ignore invalid moves
- Limit reward to between -1 and 1
- Let AI learn both sides at the same time, so playing against it makes more sense?!
<br>
<br>
Takeaways:
- Batching is so much quicker, it is absurd

In [None]:
# Empty 6 x 7 board as a float batch of size one (shape 1 x 42) on the agent's device.
empty_board = np.zeros(shape=(6, 7))
empty_board = torch.tensor(empty_board.flatten(), device=active.device).float()
empty_board = torch.unsqueeze(empty_board, dim=0)

In [None]:
# Position 1: three tokens of player one (1) in column 3 and three tokens of
# player two (-1) in column 4.
example_board = np.array([
    [0,0,0,1,-1,0,0],
    [0,0,0,1,-1,0,0],
    [0,0,0,1,-1,0,0],
    [0,0,0,0,0,0,0],
    [0,0,0,0,0,0,0],
    [0,0,0,0,0,0,0]
])
# Position 2: same as position 1 plus a token of player one on field 2.
example_board2 = np.array([
    [0,0,1,1,-1,0,0],
    [0,0,0,1,-1,0,0],
    [0,0,0,1,-1,0,0],
    [0,0,0,0,0,0,0],
    [0,0,0,0,0,0,0],
    [0,0,0,0,0,0,0]
])

# Convert both positions to float batches of size one on the agent's device.
example_board = torch.tensor(example_board.flatten(), device=active.device).float()
example_board = torch.unsqueeze(example_board, dim=0)
example_board2 = torch.tensor(example_board2.flatten(), device=active.device).float()
example_board2 = torch.unsqueeze(example_board2, dim=0)

In [None]:
# Position 1: fields 24 and 25 are the free fields on top of columns 3 and 4.
print(active.next_action(example_board, [0, 1, 2, 24, 25, 5, 6]))
print(active.get_Q(example_board))
print(torch.max(active.normalized_Q(active.get_Q(example_board)[0], [0, 1, 2, 24, 25, 5, 6]), 0)[1])

In [None]:
# Position 2: field 2 is taken, so field 9 above it is valid instead.
print(active.next_action(example_board2, [0, 1, 9, 24, 25, 5, 6]))
print(active.get_Q(example_board2))
print(torch.max(active.normalized_Q(active.get_Q(example_board2)[0], [0, 1, 9, 24, 25, 5, 6]), 0)[1])

In [None]:
# Empty board: only the fields 0..6 are valid.
print(active.next_action(empty_board, range(7)))
print(active.get_Q(empty_board))
print(torch.max(active.normalized_Q(active.get_Q(empty_board)[0], range(7)), 0)[1])

# Manual game against the agent: the human enters a field index ("q" quits),
# then the agent proposes a move. Entering "c" plays the proposal, "r" asks for
# a new one and anything else skips the agent's move.
game = ConnectFourSimulator()
val = input()
while val != "q":
    game_over, board, _, _, _, _ = game.take_action(int(val))
    print(game_over)
    print(board)
    print("------------------------------------")
    if not game_over:
        confirmation = "r"
        while confirmation == "r":
            # next_action needs a float batch of flattened boards plus the
            # currently valid actions; passing the raw numpy board alone
            # raised a TypeError.
            state = torch.tensor(board.flatten(), device=active.device).float()
            pc_action = active.next_action(torch.unsqueeze(state, dim=0), game.valid_actions)
            print(pc_action)
            confirmation = input()
            if confirmation == "c":
                game_over, board, _, _, _, _ = game.take_action(pc_action)
                print(game_over)
                print(board)
    val = input()

In [None]:
# All stored transitions in random order.
games = memory.sample(len(memory))

In [None]:
# The simulator hands out 1 for a win and -1 for a loss (0 otherwise, -2 for an
# invalid move), so decisive final transitions are the ones with reward +/-1.
# The former check for +/-100 never matched. Drawn games end with reward 0 and
# cannot be told apart from ordinary moves here.
terminal_states = [ sample for sample in games if sample[3] == 1 or sample[3] == -1 ]

In [None]:
# Each entry is a Transition: index 0 = old state, 1 = next state, 2 = action.
for state in terminal_states:
    print("Before:\n", state[0])
    print("After:\n", state[1])
    print("With action:", state[2])
    print("---------------------------------------")

In [None]:
# Dump the first 42 slots of the replay memory (index 3 of a Transition is the reward).
for state in memory.memory[:42]:
    print("Before:\n", state[0])
    print("After:\n", state[1])
    print("With action:", state[2])
    print("Reward:", state[3])
    print("---------------------------------------")

In [None]:
class PlayAI:
    """Console game of a human against an agent; the agent makes the first move.

    Attributes set by ``__init__``:
        ai: The agent; it has to provide ``device`` and ``next_action``.
        game: The ``ConnectFourSimulator`` being played.
        game_started: True once ``start_game`` has been called.
        random_actions: How often the agent proposed an invalid action and a
            random valid one was played instead.
    """

    def __init__(self, ai):
        self.ai = ai
        self.game = ConnectFourSimulator()
        self.game_started = False
        self.random_actions = 0

    def start_game(self):
        """Lets the agent make the opening move and prints the resulting board."""
        self.__ai_move()
        self.game.print_board()
        print("Valid actions:", self.game.valid_actions)
        self.game_started = True

    def __ai_move(self):
        """Plays one agent move and returns whether the game is over afterwards."""
        valid_actions = self.game.valid_actions
        state = torch.tensor(self.game.board.flatten(), device=self.ai.device).clone().float()
        action = self.ai.next_action(torch.unsqueeze(state, dim=0), valid_actions)
        if action not in valid_actions:
            # Fall back to a random valid action (random.random takes no
            # arguments, so the former random.random(0, n) raised a TypeError).
            action = random.choice(valid_actions)
            self.random_actions += 1
        game_over, _, _, _, _, _ = self.game.take_action(action)
        return game_over

    def play(self):
        """Alternates human input and agent moves until the game ends or "q" is entered.

        Raises:
            RuntimeError: If ``start_game`` has not been called before.
        """
        if not self.game_started:
            raise RuntimeError("Game has not yet been started")

        while True:
            action = input()
            if action == "q":
                return
            try:
                action = int(action)
            except ValueError:
                print("Invalid action!")
                continue

            game_over, _, _, reward, _, _ = self.game.take_action(action)
            if reward < 0:
                # An invalid move leaves the board untouched, so simply ask again.
                print("Invalid action!")
                print("Valid actions:", self.game.valid_actions)
                continue

            self.game.print_board()
            if game_over:
                # The human can also end the game by filling the board (reward 0).
                print("You won!" if reward > 0 else "Draw!")
                return

            game_over = self.__ai_move()
            self.game.print_board()
            print("Valid actions:", self.game.valid_actions)
            if game_over:
                print("You lost :/")
                return

In [None]:
# Play an interactive game against the trained agent (the agent moves first).
vs_game = PlayAI(active)
vs_game.start_game()
vs_game.play()