In [129]:
import numpy as np

In [298]:
# initialize NN and set parameters
input_size = 9 
output_size = 9
epsilon = 0.0 # Probabilty to take random actions
gamma = 0.7 # Discount factor
learning_rate = 1e-3

W1 = np.random.randn(input_size, output_size) / np.sqrt(output_size)
b1 = np.random.randn(output_size) / np.sqrt(output_size)

def forward_pass(x):
    return np.dot(x,W1) + b1

def backward_pass(X, Y_diff):
    dW = np.dot(X.T,Y_diff)
    db = np.mean(Y_diff, axis = 0)
    return dW, db

def get_propabilities(x):
    return np.exp(x)/np.sum(np.exp(x))


print(f'{W1=}\n {b1=}') 

W1=array([[ 0.16398025,  0.13782668,  0.01902972,  0.2762423 , -0.05351312,
        -0.10859805, -0.48211178, -1.01243345, -0.18728845],
       [ 0.34425722,  0.63490566, -0.38263529, -0.10981032, -0.1485874 ,
        -0.36745441, -0.22019292,  0.65648781, -0.06217177],
       [ 0.00910355,  0.11803658, -0.39973997, -0.10408328, -0.06160152,
         0.64590874,  0.0557594 , -0.42742566,  0.4278595 ],
       [ 0.8069631 ,  0.35126014,  0.51547828, -0.25349181, -0.04062494,
        -0.44898085, -0.2765095 ,  0.12688572,  0.01766297],
       [-0.02135362,  0.36555394, -0.07914732,  0.09160859, -0.09910657,
        -0.31553757,  0.05947518, -0.20134149,  0.31935984],
       [-0.70245869,  0.38564473,  0.1562766 ,  0.26826128,  0.4609437 ,
         0.07650569, -0.06513652,  0.36672963, -0.13891527],
       [ 0.16617793, -0.38890609,  0.13127798,  0.16183769,  1.07110408,
         0.02249781,  0.02657527, -0.13402129,  0.44636766],
       [ 0.01739804, -0.20990466, -0.17972936, -0.20266385,

In [131]:
dummy_x = np.random.randn(9)

logits = forward_pass(dummy_x)
probabilities = get_propabilities(logits)
print(f'{logits=}\n {probabilities=}')

logits=array([-0.44920216,  0.2333591 ,  0.50280206, -0.22774109,  0.49907184,
       -1.63922816,  0.18299617, -0.15587364, -1.04781754])
 probabilities=array([0.07420931, 0.14685577, 0.1922687 , 0.0926057 , 0.19155283,
       0.02257546, 0.13964283, 0.09950602, 0.04078336])


In [132]:
def print_board(s):
    x = ['-', 'x', 'o']
    print(f' [{x[s[0]]} {x[s[1]]} {x[s[2]]}] \n [{x[s[3]]} {x[s[4]]} {x[s[5]]}] \n [{x[s[6]]} {x[s[7]]} {x[s[8]]}] \n')

def get_discounted_rewards(actions, reward):
    rewards = [reward * gamma**i for i in range(len(actions))]
    return rewards[::-1]    

In [187]:
# define game loop. Start with the Ai always starting and the opponent going second
ai_token = 1
opponent_token = 2

def take_random_action(board, token):
    free_indices = [i for i, val in enumerate(board) if val == 0]
    action = np.random.choice(free_indices)
    board[action] = token
    return action, board

def take_action(board, logits, token):
    if epsilon > np.random.random_sample():
        action, board = take_random_action(board, token)
    else:
        free_indices = [i for i, val in enumerate(board) if val == 0]
        l_free = logits[free_indices] # get logits of possible moves
        p = get_propabilities(l_free)
        action = np.random.choice(free_indices, p=p)
        board[action] = token
    return action, board

def check_winner(board):

    winning_combinations = [
        (0, 1, 2), (3, 4, 5), (6, 7, 8),
        (0, 3, 6), (1, 4, 7), (2, 5, 8),
        (0, 4, 8), (2, 4, 6)
    ]
    for combo in winning_combinations:
        if board[combo[0]] == board[combo[1]] == board[combo[2]] != 0:
            return board[combo[0]] 
    return 0

def get_reward(winner):
    if winner == ai_token:
        return 1
    elif winner == opponent_token:
        return -1
    else:
        return 0

def play_game(verbose = False):
    board = [0] * 9
    boards_arr = []
    actions_arr = []
    logits_arr = []
    for t in range(9):
        if t%2 == 0:
            boards_arr.append(board.copy())
            logits = forward_pass(board)
            action, board = take_action(board, logits, ai_token)
            actions_arr.append(action) 
            logits_arr.append(logits)
        else:
            action, board = take_random_action(board, opponent_token)   
        winner = check_winner(board)
        reward = get_reward(winner)
        if verbose:
            print_board(board)
        if reward:
            break    
    return boards_arr, actions_arr, logits_arr, reward

boards, actions, logits, reward = play_game()

In [134]:
rewards = get_discounted_rewards(actions, reward)
print(rewards)

[0.3429999999999999, 0.48999999999999994, 0.7, 1.0]


In [135]:
print(f'{actions=} \n {logits=}')

actions=[4, 0, 3, 6] 
 logits=[array([-0.07448157, -0.51780425, -0.08879829,  0.41894532, -0.51326728,
       -0.71489169, -0.0603893 , -0.03256772, -0.33317235]), array([ 1.39514956, -2.20877258,  0.25565649,  0.28250359, -1.6408395 ,
        0.39951564,  0.19126359, -0.4568058 , -0.42526926]), array([ 0.93492332, -0.6777083 , -1.07536591,  0.08331873, -1.65254216,
        1.24927242,  0.76636498, -0.21948233, -1.48464362]), array([ 1.46695489, -1.42480559, -1.94351699,  0.65118643, -2.50910046,
        0.90646256,  1.26078212,  0.17534536, -0.90762202])]


In [305]:
episodes = 10 # Do 10 training episodes
n = 1000 # play 1000 games

def get_ydiff(logits, actions):
    y_diff = []
    for lg, a in zip(logits, actions):
        act_arr = [0] * 9
        act_arr[a] = 1
        y_diff.append(act_arr - get_propabilities(lg))
    return y_diff

for e in range(episodes):
    X, y_diff, R = [], [], [] # X : Training Data of board states, y_diff Difference between action taken and output probabilities
    R_sum = 0
    # R: Rewards for the (X,y) samples
    for i in range(n):
        boards, actions, logits, reward = play_game()
        rewards = get_discounted_rewards(actions, reward)
        diff = get_ydiff(logits, actions)

        X.append(boards)
        y_diff.append(diff)
        R.append(rewards)
        R_sum += reward

    # reduce dimensionality to avoid problems with different game length
    Xeps = np.array([b for g in X for b in g]) 
    yeps = np.array([y for g in y_diff for y in g])
    Reps = np.array([r for g in R for r in g])

    # normalize the rewards
    Reps -= np.mean(Reps)
    Reps /= np.std(Reps)

    yeps *= Reps[:, np.newaxis] # mutiply gradient and reward

    dW, db = backward_pass(Xeps, yeps)
    W1 += learning_rate*dW
    b1 += learning_rate*db

    print(R_sum/n)

print(Xeps.shape)
print(yeps.shape)
print(Reps.shape)


0.554
0.593
0.555
0.562
0.58
0.575
0.564
0.538
0.551
0.574
(3898, 9)
(3898, 9)
(3898,)


In [258]:
board = [0] * 9
board[3] = 1
board[1] = 2
logits = forward_pass(board)
probs = get_propabilities(logits)
probs

array([0.1735514 , 0.09363312, 0.12925895, 0.03711028, 0.27139854,
       0.01675   , 0.13985467, 0.01363825, 0.12480478])

In [240]:
epsilon = 0.0
test = play_game(verbose = True)


 [- - -] 
 [- x -] 
 [- - -] 

 [o - -] 
 [- x -] 
 [- - -] 

 [o - x] 
 [- x -] 
 [- - -] 

 [o - x] 
 [- x -] 
 [o - -] 

 [o - x] 
 [- x -] 
 [o - x] 

 [o - x] 
 [o x -] 
 [o - x] 



In [139]:
np.sum(R)

ValueError: setting an array element with a sequence. The requested array has an inhomogeneous shape after 1 dimensions. The detected shape was (10,) + inhomogeneous part.

In [128]:
print(np.dot(Xeps.T,yeps))
print(np.mean(yeps, axis = 0))

[[-2.9257598  -0.12808648  0.67169048 -0.46612859 -1.24246901  5.51359058
  -5.19300185  1.01586     2.75430468]
 [ 1.73272651 -0.90027208  2.97492357 -0.76716662 -2.69652687  3.08612168
  -1.5684573  -3.09438909  1.2330402 ]
 [-0.53691396  0.8506162  -1.9221383   1.62784548 -2.74632659  2.66225984
  -1.06017609 -0.69583394  1.82066737]
 [ 1.88056057 -0.58182148 -2.22157024 -0.93973921 -2.29289809  4.598211
  -4.42971036  1.65559914  2.33136868]
 [-1.41193656 -0.30103781  4.46352023  2.41074382 -6.91654863  3.13049463
  -0.75365271 -1.33327531  0.71169234]
 [-0.19486346  1.47750293 -0.7810992   1.1693585   0.57564645 -0.82242128
  -4.02281438  0.05167864  2.5470118 ]
 [-1.9737401  -0.15485452  2.55924902 -0.57359852 -0.91462379  3.60481575
  -3.53194071 -0.82379213  1.80848501]
 [ 0.20398151  0.69563055 -1.09549952  2.536647   -2.79908921  2.29578348
  -2.68699572 -0.80254082  1.65208273]
 [-4.33880452  1.64694909  4.70475007  3.31689319 -9.44965072  5.91526758
   0.67899243 -1.9633674