In [123]:
import random

# If torch is not installed yet, run `%pip install torch` in its own cell first
import torch
from torch import nn
import numpy as np

# One shared seed for every random number generator the notebook relies on,
# so that a fresh "Restart & Run All" reproduces the same numbers.
seed = 0

# Seed Python's, PyTorch's and NumPy's global generators, in that order.
for seed_rng in (random.seed, torch.manual_seed, np.random.seed):
    seed_rng(seed)

# Run Classes.ipynb for relevant classes

In [124]:
%run ./Classes.ipynb

# Initialize Players and Game

In [125]:
## Players, game engine and arena
# Seat encoding used for `pos` throughout: 0 -> 'top', 1 -> 'bottom'

# Number of training games (int(2e5) is the longer alternative)
num_iterations = int(2e4)
# Handed to both DQN agents as their `epsilon_stop`; derived from
# num_iterations, so it follows automatically when that value is changed.
epsilon_stop = int(num_iterations / 2 * 3 / 5)

## AI agents
# Both agents share every hyperparameter and the display name 'AI';
# the only difference between them is the seat they play from.
dqn_settings = dict(
    gamma=0.97,
    learning_rate=1e-2,
    batch_size=128,
    target_net_update_steps=1000,
    replay_memory_capacity=1e5,
    epsilon_start=1000,
    epsilon_stop=epsilon_stop,
    epsilon_min=0.05,
    weight_decay=1e-4,
    policy='softmax',
    name='AI',
)
ai0, ai1 = (DQNPlayer(pos=seat, **dqn_settings) for seat in (0, 1))

## Baseline opponents, one per seat
# Greedy players (always choose the action with max reward)
greedy0, greedy1 = (GreedyPlayer(pos=seat) for seat in (0, 1))
# Random players
random0, random1 = (RandomPlayer(pos=seat) for seat in (0, 1))

## Human player, seated at position 1 ('bottom')
human = HumanPlayer(pos=1)

## The game engine
game = Mancala(seed=seed, disp=False)

## Everything meets in the 'Arena'; disp=False avoids output
arena = Arena(game, ai0, ai1, disp=False)

Memory Initialized
Memory Initialized
Memory Initialized
Memory Initialized
Memory Initialized
Memory Initialized
Memory Initialized


Every player has its own Memory. 

This is only relevant for the AIs.

The other ones get one for simplicity.

## Play a first game: human vs greedy

The possible actions are 0-11.

0 - 5:  choose bowl 0-5, put NO bean into 'Score'

6 - 11: choose bowl 0-5 put A bean into 'Score'

use 12 to end the game.

In [126]:
#testarena = Arena(game, greedy0, human, disp=True)
#testarena.reset()
#testarena.play()

# Start the game, train the AIs

Takes a couple of minutes (~10-30min?)

Maybe num_iterations = 2e5 is overkill (epsilon_stop is derived from num_iterations in the initialization cell, so it adjusts automatically when num_iterations is changed)

In [127]:
# num_iterations comes from the player-initialization cell

# Per-agent buffers (index 0 -> ai0, index 1 -> ai1) used to save results.
# They are emptied after every progress report, so each report covers only
# the games played since the previous one.
res = [[], []]
loss = [[], []]

agents = (ai0, ai1)
report_every = 1000

# Training loop: one complete game per iteration
for i in range(num_iterations):

    # start from a fresh game, then play it to the end
    arena.reset()
    arena.play()

    # remember the final score of each agent ...
    for bucket, agent in zip(res, agents):
        bucket.append(agent.score)

    # ... and the loss value each agent currently holds
    for bucket, agent in zip(loss, agents):
        bucket.append(agent.loss)

    # progress report
    games_done = i + 1
    if games_done % report_every == 0:
        print(f'Game {games_done} / {int(num_iterations)}')

        mean_score0 = sum(res[0]) / len(res[0])
        mean_score1 = sum(res[1]) / len(res[1])
        print(f'Average Score:  ai0: {mean_score0:.3f}    ai1: {mean_score1:.3f}')

        eps0 = ai0.epsilon()
        eps1 = ai1.epsilon()
        print(f'Epsilon:        ai0: {eps0:.3f}     ai1: {eps1:.3f}')

        mean_loss0 = sum(loss[0]) / len(loss[0])
        mean_loss1 = sum(loss[1]) / len(loss[1])
        print(f'Av. Last Loss:  ai0: {mean_loss0:.4f}    ai1: {mean_loss1:.4f}')

        print('-------------------------------------------')

        # start new, empty buffers for the next reporting window
        res = [[], []]
        loss = [[], []]

    # Only the agent that lost this game trains; when the scores are equal
    # it is ai0 that takes the update step.
    loser = ai1 if ai0.score > ai1.score else ai0
    loser.update_step()

Updating target network...
Game 1000 / 20000
Average Score:  ai0: 24.232    ai1: 23.768
Epsilon:        ai0: 0.997     ai1: 1.000
Av. Last Loss:  ai0: 3.3111    ai1: 3.2676
-------------------------------------------
Updating target network...
Updating target network...
Updating target network...
Game 2000 / 20000
Average Score:  ai0: 24.040    ai1: 23.960
Epsilon:        ai0: 0.895     ai1: 0.915
Av. Last Loss:  ai0: 3.0507    ai1: 3.1192
-------------------------------------------
Updating target network...
Game 3000 / 20000
Average Score:  ai0: 24.124    ai1: 23.876
Epsilon:        ai0: 0.796     ai1: 0.824
Av. Last Loss:  ai0: 3.0155    ai1: 3.0171
-------------------------------------------
Updating target network...
Game 4000 / 20000
Average Score:  ai0: 23.759    ai1: 24.241
Epsilon:        ai0: 0.695     ai1: 0.735
Av. Last Loss:  ai0: 3.0183    ai1: 3.0702
-------------------------------------------
Updating target network...
Game 5000 / 20000
Average Score:  ai0: 24.304    ai

Average Loss has nothing to do with performance
since the target_net changes regularly.

So far, we do not have a good metric to rate
the performance of the agents. 

We will figure out performance by testing 
the AI against random, greedy and human players. 

Before doing so, we investigate which move is the 
optimal initial move. 

In [128]:
# The position that the printed text calls the initial board:
# a 1x12 float tensor filled with 4s.
initial_board = torch.tensor([[4]*12], dtype=torch.float32)

# Show what each agent's target network outputs for that board.
prompts = (
    ('What (score(ai1) - score(ai0) = ?) does ai0 predict from the initial board?', ai0),
    ('How about ai1?', ai1),
)
for question, agent in prompts:
    print(question)

    print(agent.target_net(initial_board))




What (score(ai1) - score(ai0) = ?) does ai0 predict from the initial board?
tensor([[-0.5466, -0.5519, -0.5708, -0.5928, -0.6589, -0.8565, -0.1591, -0.1143,
          0.1243, -0.0187,  0.1029,  0.2972]], grad_fn=<AddmmBackward0>)
How about ai1?
tensor([[-0.4516, -0.4513, -0.5255, -0.6923, -0.9040, -0.8740, -0.3449, -0.2172,
          0.0736,  0.0119,  0.0199,  0.2223]], grad_fn=<AddmmBackward0>)


From this we can learn:

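The best first actions in a game (ranked by the predicted values printed above) are:

11 > 8 > 10 > 9 > 7 > 6 > 0 > 1 > 2 > 3 > 4 > 5
according to ai0. 

ai1 largely agrees (11 > 8 > 10 > 9 > 7 > 6 > 1 > 0 > 2 > 3 > 5 > 4); the two only differ in the order of a few low-ranked moves.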

# Final Test
Test performance against various players:
- ai0 vs ai1
- vs random
- vs greedy
- vs human

## ai0 vs ai1

In [129]:
# ai0 vs ai1
# Let the two trained agents play each other for 500 games with display off:
# ai0 was created with pos=0 ('top'), ai1 with pos=1 ('bottom').

arena.test(ai0, ai1, num_games=500, disp=False)

Average Score: Top AI: 19.736 - Bottom AI: 28.264


## VS Random Player

In [130]:
# ai0 vs random1
# Trained agent on top (pos=0) against the random player on the bottom (pos=1),
# 500 games with display off.

arena.test(ai0, random1, num_games=500, disp=False)

Average Score: Top AI: 27.81 - Bottom Random: 20.19


In [131]:
# random0 vs ai1
# Same comparison with the seats swapped: random player on top (pos=0),
# trained agent on the bottom (pos=1), 500 games with display off.

arena.test(random0, ai1, num_games=500, disp=False)

Average Score: Top Random: 20.86 - Bottom AI: 27.14


AI is better than random players.

## VS Greedy Player

In [132]:
# ai0 vs greedy1
# Trained agent on top (pos=0) against the greedy player on the bottom (pos=1),
# 500 games with display off.

arena.test(ai0, greedy1, num_games=500, disp=False)

Average Score: Top AI: 18.132 - Bottom Greedy: 29.868


In [133]:
# greedy0 vs ai1
# Same comparison with the seats swapped: greedy player on top (pos=0),
# trained agent on the bottom (pos=1), 500 games with display off.

arena.test(greedy0, ai1, num_games=500, disp=False)

Average Score: Top Greedy: 29.972 - Bottom AI: 18.028


AI is worse than a greedy player.

## VS Human

Play a round against ai0 (AI on top position)

In [51]:
# One displayed game: ai0 on top (pos=0) against the human player on the
# bottom (pos=1). The human's moves are typed in at a prompt (0-11, 12 ends
# the game), so this cell needs someone at the keyboard.
arena.test(ai0, human, num_games=1, disp=True)

Game initialized!
Top Player:    AI. Score: 1
Bottom Player: Human. Score: 11
[7, 0, 0, 6, 10, 0]
[1, 0, 1, 0, 9, 2]End = False
Next Player: Human
[tensor([-1.2467,  0.0699, -0.0493, -0.2444, -0.3837, -1.2272,  0.1459,  0.1590,
         0.6562, -0.1922,  0.4287,  0.0182], grad_fn=<UnbindBackward0>)]
AI chooses 8
Reward: 1
Top Player:    AI. Score: 1
Bottom Player: Human. Score: 0
[5, 5, 5, 0, 4, 4]
[4, 4, 4, 4, 4, 4]End = False
Next Player: AI
[tensor([-2.3094, -1.9140, -1.3966, -2.2820, -0.5283, -1.2177, -2.0655, -2.3037,
         0.3879,  0.3351, -0.3050, -1.2057], grad_fn=<UnbindBackward0>)]
AI chooses 9
Reward: 1
Top Player:    AI. Score: 2
Bottom Player: Human. Score: 0
[6, 6, 0, 0, 4, 4]
[5, 5, 4, 4, 4, 4]End = False
Next Player: Human
It is Humans turn. Whats your next move? (input 0-11, end: 12)12
Human chooses 12
Reward: 0
Game Over
Average Score: Top AI: 22.0 - Bottom Human: 0.0


It seems that the AI is not very good at the moment. 

In [None]:
#%run ./Classes.ipynb
#arena = Arena(game, ai0, ai1, disp=False) # disp=False to avoid output