In [15]:
import numpy as np

In [16]:
# Initialize the single-layer network and set the hyperparameters.
input_size = 9  # length of the vector that forward_pass multiplies with W1
output_size = 9  # number of logits produced by forward_pass
epsilon = 0.3 # Probability of taking a random action (compared against a uniform sample in take_action)
gamma = 0.7 # Discount factor applied per step in get_discounted_rewards

# Weights and bias are standard-normal draws scaled by 1 / sqrt(output_size).
# W1 is drawn before b1; with the global NumPy RNG this order determines the values.
W1 = np.random.randn(input_size, output_size) / np.sqrt(output_size)
b1 = np.random.randn(output_size) / np.sqrt(output_size)

def forward_pass(x):
    """Apply the single linear layer and return the raw logits.

    Computes ``x · W1 + b1`` using the module-level weight matrix ``W1`` and
    bias vector ``b1``. No activation function is applied.
    """
    weighted = np.dot(x, W1)
    return weighted + b1

def get_propabilities(x):
    """Turn logits into probabilities with a softmax over all elements of ``x``.

    The largest logit is subtracted before exponentiating. This does not change
    the mathematical result, but it keeps ``np.exp`` from overflowing to ``inf``
    (which would make the division return ``nan``) when logits are large.

    Parameters
    ----------
    x : array_like
        Logits. Normalization runs over every element; no axis is selected.

    Returns
    -------
    numpy.ndarray
        Values with the same shape as ``x`` that sum to 1. An empty input
        yields an empty array.
    """
    logits = np.asarray(x)
    if logits.size == 0:
        # np.max raises on empty input; there is nothing to normalize anyway.
        return np.exp(logits)
    # Exponentiate once and reuse the result for numerator and denominator.
    exp_shifted = np.exp(logits - np.max(logits))
    return exp_shifted / np.sum(exp_shifted)


# Show the initial parameters; the `=` specifier prints each name next to its value.
print(f'{W1=}\n {b1=}') 

W1=array([[-0.11569188,  0.18758434,  0.4064702 ,  0.30077469, -0.14208686,
         0.21211038, -0.05373123, -0.0565887 ,  0.13017197],
       [ 0.63724119,  0.3343816 ,  0.77357117, -0.21281862, -0.04372519,
         0.417956  ,  0.22006791, -0.02329753, -0.16515111],
       [ 0.32296089, -0.01441343, -0.17309183, -0.80075857,  0.35474248,
         0.00347848, -0.14952959, -0.49801301, -0.09529567],
       [ 0.35812143,  0.15035082,  0.88417077,  0.11678249, -0.35460181,
         0.33435319,  0.16843777,  0.17523144,  0.32437906],
       [-0.07964315, -0.27466987,  0.469438  , -0.23678733, -0.38010117,
         0.27911987,  0.43397022,  0.19890074, -0.01413051],
       [-0.24390604, -0.5080575 , -0.17373522, -0.29707844,  0.31877921,
        -0.32188755, -0.10922462, -0.07035495,  0.09345557],
       [ 0.00426819, -0.12890278,  0.4050655 , -0.15011606, -0.00100129,
        -0.11921062, -0.57004374,  0.14386591, -0.65519038],
       [-0.19085414,  0.18704138,  0.97152796, -0.38945102,

In [8]:
# Smoke test: push a random 9-element vector through the network and the softmax.
dummy_x = np.random.randn(9)

logits = forward_pass(dummy_x)
probabilities = get_propabilities(logits)
# NOTE(review): the recorded output of this cell ends with an extra `1.0` line that this
# print statement does not produce, so the saved output looks stale; re-run to refresh.
print(f'{logits=}\n {probabilities=}')

logits=array([-2.4192771 ,  0.26096579,  0.9994659 ,  0.02865598,  0.13620215,
       -0.80573426,  1.72681883, -0.10610988,  3.08538362])
 probabilities=array([0.0025335 , 0.03696036, 0.07735042, 0.02929849, 0.03262511,
       0.01271962, 0.16008405, 0.02560456, 0.62282388]) 
 1.0


In [54]:
def print_board(s):
    """Print a 9-cell board as three bracketed rows.

    ``s`` is indexed at positions 0..8; each value selects a symbol:
    0 -> '-', 1 -> 'x', 2 -> 'o'. A blank line follows the board.
    """
    symbols = ('-', 'x', 'o')
    rendered_rows = []
    for row_start in (0, 3, 6):
        cells = ' '.join(symbols[s[pos]] for pos in range(row_start, row_start + 3))
        rendered_rows.append(f' [{cells}] \n')
    print(''.join(rendered_rows))

def get_discounted_rewards(actions, reward, discount=None):
    """Spread the final reward of a game backwards over its moves.

    The last move receives ``reward`` unchanged; a move followed by ``k``
    further moves receives ``reward * discount**k``.

    Parameters
    ----------
    actions : sequence
        Moves taken during the game. Only its length is used.
    reward : number
        Final reward of the game.
    discount : number, optional
        Per-step discount factor. When omitted, the module-level ``gamma`` is
        used, so existing two-argument calls behave exactly as before.

    Returns
    -------
    list
        One discounted reward per action, in chronological order.
    """
    if discount is None:
        # Fall back to the notebook-wide setting only when no value was passed.
        discount = gamma
    rewards = [reward * discount**i for i in range(len(actions))]
    return rewards[::-1]

In [57]:
# Define the game loop. The AI always moves first and the opponent moves second.
ai_token = 1  # board value written for the AI's moves (print_board renders it as 'x')
opponent_token = 2  # board value written for the opponent's moves (print_board renders it as 'o')

def take_random_action(board, token):
    """Place ``token`` on a uniformly chosen empty cell.

    Empty cells are those holding 0. The board is modified in place and is
    also returned, together with the chosen index.
    """
    open_cells = []
    for position, mark in enumerate(board):
        if mark == 0:
            open_cells.append(position)
    chosen = np.random.choice(open_cells)
    board[chosen] = token
    return chosen, board

def take_action(board, logits, token):
    """Pick a move for ``token`` with an epsilon-greedy-style policy.

    With probability ``epsilon`` a uniformly random empty cell is taken.
    Otherwise a cell is sampled from the softmax of the logits restricted to
    the empty cells. The board is modified in place; the chosen index and the
    board are returned.
    """
    explore = epsilon > np.random.random_sample()
    if explore:
        return take_random_action(board, token)

    legal_moves = [pos for pos, mark in enumerate(board) if mark == 0]
    # Renormalize over legal moves only, so occupied cells can never be sampled.
    move_probs = get_propabilities(logits[legal_moves])
    chosen = np.random.choice(legal_moves, p=move_probs)
    board[chosen] = token
    return chosen, board

def check_winner(board):
    """Return the token that owns a complete line, or 0 if there is none.

    Lines are checked in a fixed order: rows, then columns, then the two
    diagonals. The first complete non-empty line decides the result.
    """
    lines = (
        (0, 1, 2), (3, 4, 5), (6, 7, 8),  # rows
        (0, 3, 6), (1, 4, 7), (2, 5, 8),  # columns
        (0, 4, 8), (2, 4, 6),             # diagonals
    )
    for first, second, third in lines:
        mark = board[first]
        if mark == board[second] and board[second] == board[third] and board[third] != 0:
            return mark
    return 0

def get_reward(winner):
    """Map the winning token to the AI's reward.

    Returns 1 when the AI won, -1 when the opponent won and 0 otherwise.
    """
    if winner == ai_token:
        return 1
    if winner == opponent_token:
        return -1
    return 0

def play_game():
    """Play one game: the AI moves on even turns, a random opponent on odd turns.

    Returns
    -------
    tuple
        ``(boards, actions, logits, reward)``, where the first three lists hold
        one entry per AI move: the board before the move, the chosen cell and
        the logits computed for that board. ``reward`` is the value of
        ``get_reward`` after the last move played.
    """
    board = [0] * 9
    state_log, move_log, logit_log = [], [], []
    for turn in range(9):
        ai_to_move = turn % 2 == 0
        if not ai_to_move:
            _, board = take_random_action(board, opponent_token)
        else:
            # Snapshot the position first; take_action mutates the board in place.
            state_log.append(board.copy())
            scores = forward_pass(board)
            move, board = take_action(board, scores, ai_token)
            move_log.append(move)
            logit_log.append(scores)
        reward = get_reward(check_winner(board))
        if reward:
            # A non-zero reward means somebody completed a line.
            break
    return state_log, move_log, logit_log, reward

# Play one game to inspect the recorded data. `logits` is rebound here to a list
# holding one logits array per AI move.
boards, actions, logits, reward = play_game()

In [58]:
# Discounted reward for each AI move of the game played above, in chronological order.
rewards = get_discounted_rewards(actions, reward)
print(rewards)

[-0.3429999999999999, -0.48999999999999994, -0.7, -1.0]


In [59]:
# Inspect the cells the AI chose and the logits computed before each of those moves.
print(f'{actions=} \n {logits=}')

actions=[1, 5, 2, 4] 
 logits=[array([-0.21401941,  0.13248778,  0.10323213,  0.05765872,  0.09922875,
        0.28954263, -0.88812378, -0.37292827,  0.34824297]), array([ 0.19183801,  0.84203806,  1.68974371,  0.44638948, -0.22867017,
        1.13171939, -0.77551832, -0.5094032 ,  0.44343579]), array([-0.4337763 ,  0.70806332,  3.45906442, -0.629591  ,  1.19870031,
        0.14802567, -0.90408497, -0.89403561,  0.84819851]), array([-0.10227903,  0.43584433,  4.0961036 , -1.73058169,  1.55144021,
       -0.08691709, -2.19370203, -1.1043168 , -0.55747791])]


In [62]:
episodes = 2 # Number of training episodes (outer loop)
n = 10 # Number of games played per episode (inner loop)

def get_ydiff(logits, actions):
    """Return, per move, the one-hot action vector minus the softmax of its logits.

    ``logits`` and ``actions`` are walked in parallel. Each result is a
    9-element array: the one-hot encoding of the chosen cell minus
    ``get_propabilities`` of the matching logits.
    """
    differences = []
    for step_logits, chosen in zip(logits, actions):
        one_hot = [0] * 9
        one_hot[chosen] = 1
        step_probs = get_propabilities(step_logits)
        differences.append(one_hot - step_probs)
    return differences

# Collect training data: each episode plays n games and stores, per game, the board
# states, the (one-hot action - softmax) differences and the discounted rewards.
# NOTE(review): X, y_diff and R are re-created at the start of every episode, so only
# the last episode's data survives the loop, and no update of W1/b1 is visible here.
# TODO confirm whether the training step is still to be added.
for e in range(episodes):
    X = [] # Training Data of board states
    y_diff = [] # Difference between action taken and output probabilities
    R = [] # Rewards for the (X,y) samples
    for i in range(n):
        boards, actions, logits, reward = play_game()
        rewards = get_discounted_rewards(actions, reward)
        diff = get_ydiff(logits, actions)
        # Each append adds one nested list per game (one entry per AI move).
        X.append(boards)
        y_diff.append(diff)
        R.append(rewards)

# Dump the buffers filled during the final episode.
print(f'{X=} \n {y_diff=} \n {R=}')


X=[[[0, 0, 0, 0, 0, 0, 0, 0, 0], [0, 0, 0, 2, 0, 1, 0, 0, 0], [1, 2, 0, 2, 0, 1, 0, 0, 0], [1, 2, 1, 2, 0, 1, 2, 0, 0], [1, 2, 1, 2, 2, 1, 2, 1, 0]], [[0, 0, 0, 0, 0, 0, 0, 0, 0], [0, 1, 0, 0, 0, 0, 0, 2, 0], [0, 1, 1, 0, 2, 0, 0, 2, 0], [0, 1, 1, 0, 2, 2, 1, 2, 0], [2, 1, 1, 0, 2, 2, 1, 2, 1]], [[0, 0, 0, 0, 0, 0, 0, 0, 0], [0, 2, 0, 0, 0, 1, 0, 0, 0], [1, 2, 2, 0, 0, 1, 0, 0, 0], [1, 2, 2, 0, 2, 1, 0, 1, 0], [1, 2, 2, 2, 2, 1, 1, 1, 0]], [[0, 0, 0, 0, 0, 0, 0, 0, 0], [0, 0, 0, 0, 0, 1, 0, 2, 0], [0, 0, 2, 0, 1, 1, 0, 2, 0], [0, 2, 2, 0, 1, 1, 0, 2, 1]], [[0, 0, 0, 0, 0, 0, 0, 0, 0], [0, 0, 0, 0, 0, 1, 0, 2, 0], [1, 0, 0, 0, 0, 1, 0, 2, 2], [1, 0, 2, 0, 0, 1, 1, 2, 2], [1, 2, 2, 0, 1, 1, 1, 2, 2]], [[0, 0, 0, 0, 0, 0, 0, 0, 0], [0, 1, 0, 2, 0, 0, 0, 0, 0], [2, 1, 1, 2, 0, 0, 0, 0, 0], [2, 1, 1, 2, 1, 0, 0, 2, 0], [2, 1, 1, 2, 1, 1, 0, 2, 2]], [[0, 0, 0, 0, 0, 0, 0, 0, 0], [0, 0, 0, 0, 2, 0, 0, 1, 0], [2, 0, 1, 0, 2, 0, 0, 1, 0], [2, 0, 1, 0, 2, 2, 0, 1, 1], [2, 2, 1, 1, 2, 2, 0, 1, 1]