In [3]:
import numpy as np

In [17]:
# Initialize the single-layer network and set the hyperparameters.
# NOTE(review): no random seed is set in the visible cells, so every run
# starts from different weights and plays different games.
input_size = 9 
output_size = 9
epsilon = 0.0 # Probability of taking a random action
gamma = 0.7 # Discount factor
learning_rate = 1e-3

# Weights and bias: standard-normal draws scaled by 1/sqrt(output_size).
W1 = np.random.randn(input_size, output_size) / np.sqrt(output_size)
b1 = np.random.randn(output_size) / np.sqrt(output_size)

def forward_pass(x):
    """Return the raw network outputs ``x . W1 + b1`` for the input ``x``.

    Reads the notebook-level weight matrix ``W1`` and bias ``b1``.
    """
    weighted_input = np.dot(x, W1)
    return weighted_input + b1

def backward_pass(X, Y_diff):
    """Compute the gradients of the linear layer ``X . W + b``.

    Args:
        X: Layer inputs, one row per sample, shape ``(n_samples, n_inputs)``.
        Y_diff: Gradient with respect to the layer outputs, one row per
            sample, shape ``(n_samples, n_outputs)``.

    Returns:
        Tuple ``(dW, db)`` where ``dW`` has shape ``(n_inputs, n_outputs)``
        and ``db`` has shape ``(n_outputs,)``. Both are summed over samples.
    """
    X = np.asarray(X)
    Y_diff = np.asarray(Y_diff)
    dW = np.dot(X.T, Y_diff)
    # Sum rather than average: dW is a sum over samples, so averaging db
    # would shrink the bias step by a factor of n_samples relative to the
    # weight step and the two would no longer follow the same objective.
    db = np.sum(Y_diff, axis=0)
    return dW, db

def get_propabilities(x):
    """Convert logits into probabilities with a softmax over all elements.

    Args:
        x: Array-like of logits.

    Returns:
        Float array of the same shape as ``x`` whose entries sum to 1
        (an empty array is returned unchanged).
    """
    x = np.asarray(x, dtype=float)
    if x.size == 0:
        return x
    # Subtracting the maximum leaves the result unchanged mathematically but
    # keeps exp() from overflowing to inf (and the ratio from becoming NaN).
    exps = np.exp(x - np.max(x))
    return exps / np.sum(exps)


#print(f'{W1=}\n {b1=}') 

In [None]:
class LinearModel:
    """Single linear layer ``W1 . x + b1`` with a softmax helper.

    ``W1`` has shape ``(output_size, input_size)`` and ``b1`` has shape
    ``(output_size,)``. Both sizes are read from the notebook-level
    ``input_size`` and ``output_size`` constants.
    """

    def __init__(self, hl_size = 9):
        """Draw random initial weights. ``hl_size`` is accepted but unused."""
        self.W1 = np.random.randn(output_size, input_size) / np.sqrt(input_size)
        self.b1 = np.random.randn(output_size) / np.sqrt(output_size)

    def forward_pass(self, x):
        """Return the logits for one input vector or a batch of row vectors.

        Args:
            x: Shape ``(input_size,)`` or ``(n_samples, input_size)``.

        Returns:
            Logits of shape ``(output_size,)`` or ``(n_samples, output_size)``.
        """
        x = np.asarray(x)
        # x . W1^T equals W1 . x for a single vector and also handles batches.
        return np.dot(x, self.W1.T) + self.b1

    def backward_pass(self, X, Y_diff):
        """Compute gradients for ``W1`` and ``b1``, summed over the samples.

        Args:
            X: Inputs, shape ``(n_samples, input_size)``.
            Y_diff: Gradient w.r.t. the logits, shape ``(n_samples, output_size)``.

        Returns:
            Dict with ``'W1'`` (same shape as ``self.W1``) and ``'b1'``
            (same shape as ``self.b1``).
        """
        X = np.atleast_2d(X)
        Y_diff = np.atleast_2d(Y_diff)
        # W1 is stored as (output, input), so the gradient is Y_diff^T . X;
        # X^T . Y_diff would be its transpose.
        dW = np.dot(Y_diff.T, X)
        # Summed like dW so both gradients share the same scale.
        db = np.sum(Y_diff, axis=0)
        return {'W1':dW, 'b1':db}

    def get_propabilities(self, x):
        """Softmax over all elements of ``x``, shifted by the max for stability."""
        x = np.asarray(x, dtype=float)
        if x.size == 0:
            return x
        exps = np.exp(x - np.max(x))
        return exps / np.sum(exps)

In [None]:
class NonlinearModel:
    """Two-layer network without biases: ``W2 . relu(W1 . x)``.

    ``W1`` has shape ``(hl_size, input_size)`` and ``W2`` has shape
    ``(output_size, hl_size)``. ``input_size`` and ``output_size`` are read
    from the notebook-level constants.
    """

    def __init__(self, hl_size = 9):
        """Draw random initial weights for a hidden layer of ``hl_size`` units."""
        self.W1 = np.random.randn(hl_size, input_size) / np.sqrt(input_size)
        self.W2 = np.random.randn(output_size, hl_size) / np.sqrt(hl_size)

    def forward_pass(self, x):
        """Run one input vector or a batch of row vectors through the network.

        Args:
            x: Shape ``(input_size,)`` or ``(n_samples, input_size)``.

        Returns:
            Tuple ``(logits, hidden)``; ``hidden`` holds the post-ReLU
            activations that ``backward_pass`` expects as ``H``.
        """
        x = np.asarray(x)
        hidden = np.maximum(np.dot(x, self.W1.T), 0) # relu
        logits = np.dot(hidden, self.W2.T)
        return logits, hidden

    def backward_pass(self, X, H, Y_diff):
        """Backpropagate ``Y_diff`` and return gradients summed over samples.

        Args:
            X: Inputs, shape ``(n_samples, input_size)``.
            H: Hidden activations from ``forward_pass``, shape
                ``(n_samples, hl_size)``.
            Y_diff: Gradient w.r.t. the logits, shape ``(n_samples, output_size)``.

        Returns:
            Dict with ``'W1'`` and ``'W2'`` gradients, each shaped like the
            corresponding weight matrix.
        """
        X = np.atleast_2d(X)
        H = np.atleast_2d(H)
        Y_diff = np.atleast_2d(Y_diff)
        # W2 is a matrix (output, hidden), so its gradient is Y_diff^T . H.
        # Flattening it, or using np.outer for dh, is only valid when W2 is a
        # single-output weight vector.
        dW2 = np.dot(Y_diff.T, H)
        dh = np.dot(Y_diff, self.W2)
        dh[H <= 0] = 0 # backprop through relu
        dW1 = np.dot(dh.T, X)
        return {'W1':dW1, 'W2':dW2}

    def get_propabilities(self, x):
        """Softmax over all elements of ``x``, shifted by the max for stability."""
        x = np.asarray(x, dtype=float)
        if x.size == 0:
            return x
        exps = np.exp(x - np.max(x))
        return exps / np.sum(exps)


In [5]:
# Smoke test: push a random 9-element input through the network and show
# the raw logits next to their softmax probabilities.
dummy_x = np.random.randn(9)

logits = forward_pass(dummy_x)
probabilities = get_propabilities(logits)
print(f'{logits=}\n {probabilities=}')

logits=array([-0.73663234,  1.68002054,  1.49501772,  0.00455982,  0.80650807,
        0.54566942,  0.65658166, -1.5193994 , -0.70114445])
 probabilities=array([0.02671856, 0.29946911, 0.24888935, 0.05606718, 0.12502313,
       0.09631847, 0.10761633, 0.0122141 , 0.02768377])


In [6]:
def print_board(s):
    """Print the 9-cell board ``s`` as three rows.

    Cell values index into the symbols: 0 -> '-', 1 -> 'x', 2 -> 'o'.
    """
    symbols = ('-', 'x', 'o')
    rendered = ''
    for row_start in (0, 3, 6):
        left, middle, right = [symbols[s[row_start + k]] for k in range(3)]
        rendered += f' [{left} {middle} {right}] \n'
    print(rendered)

def get_discounted_rewards(actions, reward):
    """Return one discounted reward per action.

    The last action receives ``reward`` itself and each earlier action is
    discounted by one more factor of the notebook-level ``gamma``.
    """
    n_moves = len(actions)
    return [reward * gamma**(n_moves - 1 - k) for k in range(n_moves)]

In [7]:
# Game loop definitions. The AI always moves first and the opponent moves second.
ai_token = 1
opponent_token = 2

def take_random_action(board, token):
    """Place ``token`` on a uniformly random empty cell of ``board``.

    The board is modified in place. Returns ``(action, board)`` where
    ``action`` is the index of the chosen cell.
    """
    empty_cells = [idx for idx in range(len(board)) if board[idx] == 0]
    chosen = np.random.choice(empty_cells)
    board[chosen] = token
    return chosen, board

def take_action(board, logits, token):
    """Choose a move for ``token`` and apply it to ``board`` in place.

    With probability ``epsilon`` (notebook-level) a uniformly random empty
    cell is taken. Otherwise the move is sampled from the softmax of the
    logits restricted to the empty cells. Returns ``(action, board)``.
    """
    explore = epsilon > np.random.random_sample()
    if explore:
        action, board = take_random_action(board, token)
        return action, board

    open_cells = [pos for pos, mark in enumerate(board) if mark == 0]
    # Only the logits of legal moves take part in the softmax.
    move_probs = get_propabilities(logits[open_cells])
    action = np.random.choice(open_cells, p=move_probs)
    board[action] = token
    return action, board

def check_winner(board):
    """Return the token that owns a complete line, or 0 if nobody has won."""
    rows = [(r, r + 1, r + 2) for r in (0, 3, 6)]
    columns = [(c, c + 3, c + 6) for c in (0, 1, 2)]
    diagonals = [(0, 4, 8), (2, 4, 6)]

    for a, b, c in rows + columns + diagonals:
        same_mark = board[a] == board[b] and board[b] == board[c]
        if same_mark and board[c] != 0:
            return board[a]
    return 0

def get_reward(winner):
    """Map a winner token to the AI's reward: 1 for a win, -1 for a loss, else 0."""
    if winner == ai_token:
        return 1
    if winner == opponent_token:
        return -1
    return 0

def play_game(verbose = False):
    """Play one game: the network moves on even turns, a random opponent on odd turns.

    Args:
        verbose: When true, print the board after every move.

    Returns:
        Tuple ``(boards, actions, logits, reward)``. The first three are
        lists with one entry per AI move (the board before the move, the
        chosen cell, and the network output); ``reward`` is the result of
        ``get_reward`` after the final move.
    """
    board = [0] * 9
    seen_boards, chosen_actions, seen_logits = [], [], []
    for turn in range(9):
        ai_to_move = turn % 2 == 0
        if ai_to_move:
            # Record the position before the move so it pairs with the logits.
            seen_boards.append(board.copy())
            logits = forward_pass(board)
            action, board = take_action(board, logits, ai_token)
            chosen_actions.append(action)
            seen_logits.append(logits)
        else:
            action, board = take_random_action(board, opponent_token)
        reward = get_reward(check_winner(board))
        if verbose:
            print_board(board)
        if reward:
            # A non-zero reward means somebody completed a line.
            break
    return seen_boards, chosen_actions, seen_logits, reward

# Play a single game with the untrained network to inspect the recorded data.
boards, actions, logits, reward = play_game()

In [8]:
# Discounted reward for each AI move of the sample game (all zeros when the reward is 0).
rewards = get_discounted_rewards(actions, reward)
print(rewards)

[0.0, 0.0, 0.0, 0.0, 0.0]


In [9]:
# Show the chosen cells and the network outputs recorded for each AI move.
print(f'{actions=} \n {logits=}')

actions=[np.int64(8), np.int64(6), np.int64(1), np.int64(3), np.int64(5)] 
 logits=[array([-0.24099705,  0.13863381, -0.13835625,  0.33455209,  0.08784616,
        0.29888912,  0.56022063, -0.08711018,  0.22725257]), array([-1.69209372, -0.249083  ,  0.05398859, -0.47040208, -0.14453246,
       -0.03746941,  0.61012092, -0.61968493, -1.43690049]), array([-1.78871478, -0.25637051, -0.85152251, -0.15397983,  0.80712391,
       -0.12914771,  0.64956039,  0.201317  , -2.50482501]), array([-1.9241332 , -0.20261841, -0.0610732 , -0.29182636,  1.10871276,
       -0.51740419,  0.00783651,  0.6875912 , -1.97572366]), array([-1.94186841,  0.08769638, -0.76587041, -0.19434156,  0.75114375,
       -0.06317836, -0.03367245,  1.09518468, -1.10606512])]


In [10]:
episodes = 10 # Number of training episodes (one parameter update each)
n = 1000 # Games played per episode

def get_ydiff(logits, actions):
    """Return ``one_hot(action) - softmax(logits)`` for every recorded move.

    Args:
        logits: Sequence of logit arrays, one per move.
        actions: Sequence of chosen action indices, aligned with ``logits``.

    Returns:
        List of float arrays, each the same length as its logit array.
    """
    # NOTE(review): these probabilities are a softmax over all logits, while
    # take_action samples from a softmax over the free cells only, so this is
    # not exactly the gradient of the policy that produced the actions.
    y_diff = []
    for lg, a in zip(logits, actions):
        probs = get_propabilities(lg)
        # Size the one-hot vector from the logits instead of assuming 9 outputs.
        one_hot = np.zeros_like(probs, dtype=float)
        one_hot[a] = 1
        y_diff.append(one_hot - probs)
    return y_diff

for e in range(episodes):
    # X: board states seen by the AI
    # y_diff: difference between the action taken and the output probabilities
    # R: discounted rewards for the (X, y_diff) samples
    X, y_diff, R = [], [], []
    R_sum = 0
    for i in range(n):
        boards, actions, logits, reward = play_game()
        rewards = get_discounted_rewards(actions, reward)
        diff = get_ydiff(logits, actions)

        X.append(boards)
        y_diff.append(diff)
        R.append(rewards)
        R_sum += reward

    # Flatten the per-game lists into one sample axis, since games differ in length.
    Xeps = np.array([b for g in X for b in g]) 
    yeps = np.array([y for g in y_diff for y in g])
    Reps = np.array([r for g in R for r in g])

    # Normalize the rewards. When every reward in the episode is identical the
    # standard deviation is 0; skip the division so the centred (all-zero)
    # rewards give a zero update instead of writing NaN into W1 and b1.
    Reps -= np.mean(Reps)
    reward_std = np.std(Reps)
    if reward_std > 0:
        Reps /= reward_std

    yeps *= Reps[:, np.newaxis] # multiply gradient and reward

    dW, db = backward_pass(Xeps, yeps)
    W1 += learning_rate*dW
    b1 += learning_rate*db

    # Average undiscounted reward per game in this episode.
    print(R_sum/n)

# Shapes of the last episode's flattened training data.
print(Xeps.shape)
print(yeps.shape)
print(Reps.shape)


0.281
0.302
0.415
0.355
0.336
0.399
0.423
0.469
0.415
0.475
(4046, 9)
(4046, 9)
(4046,)


In [11]:
# Probe the trained network on a hand-built position: the AI token on cell 3
# and the opponent token on cell 1. The softmax here covers all 9 cells,
# including the two occupied ones.
board = [0] * 9
board[3] = 1
board[1] = 2
logits = forward_pass(board)
probs = get_propabilities(logits)
probs

array([0.10333687, 0.15810862, 0.14699808, 0.08881846, 0.13631235,
       0.0741378 , 0.11383991, 0.05616346, 0.12228444])

In [12]:
# Turn off random exploration and play one game, printing the board after every move.
epsilon = 0.0
test = play_game(verbose = True)


 [- - -] 
 [- - -] 
 [- - x] 

 [- - -] 
 [- - -] 
 [- o x] 

 [- - -] 
 [- x -] 
 [- o x] 

 [- o -] 
 [- x -] 
 [- o x] 

 [- o x] 
 [- x -] 
 [- o x] 

 [- o x] 
 [- x -] 
 [o o x] 

 [x o x] 
 [- x -] 
 [o o x] 



In [None]:
# Inspect the two expressions backward_pass computes for the last episode:
# the weight gradient (X^T . y_diff) and the per-output mean of y_diff.
print(np.dot(Xeps.T,yeps))
print(np.mean(yeps, axis = 0))

[[-2.9257598  -0.12808648  0.67169048 -0.46612859 -1.24246901  5.51359058
  -5.19300185  1.01586     2.75430468]
 [ 1.73272651 -0.90027208  2.97492357 -0.76716662 -2.69652687  3.08612168
  -1.5684573  -3.09438909  1.2330402 ]
 [-0.53691396  0.8506162  -1.9221383   1.62784548 -2.74632659  2.66225984
  -1.06017609 -0.69583394  1.82066737]
 [ 1.88056057 -0.58182148 -2.22157024 -0.93973921 -2.29289809  4.598211
  -4.42971036  1.65559914  2.33136868]
 [-1.41193656 -0.30103781  4.46352023  2.41074382 -6.91654863  3.13049463
  -0.75365271 -1.33327531  0.71169234]
 [-0.19486346  1.47750293 -0.7810992   1.1693585   0.57564645 -0.82242128
  -4.02281438  0.05167864  2.5470118 ]
 [-1.9737401  -0.15485452  2.55924902 -0.57359852 -0.91462379  3.60481575
  -3.53194071 -0.82379213  1.80848501]
 [ 0.20398151  0.69563055 -1.09549952  2.536647   -2.79908921  2.29578348
  -2.68699572 -0.80254082  1.65208273]
 [-4.33880452  1.64694909  4.70475007  3.31689319 -9.44965072  5.91526758
   0.67899243 -1.9633674