In [12]:
import random

# Maybe you have to !pip3 install torch or !pip install torch
import torch
from torch import nn
import numpy as np

# Single seed for the whole notebook; it is also handed to the game engine below
seed = 0

# The three generators are independent, so the order of these calls is irrelevant.
# random.seed() stays last: it returns None, so the cell displays no output.
np.random.seed(seed)
torch.manual_seed(seed)
random.seed(seed)

# Run Classes.ipynb for relevant classes

In [13]:
# Execute Classes.ipynb in this kernel. The cells below use DQNPlayer, GreedyPlayer,
# RandomPlayer, HumanPlayer, Mancala and Arena, presumably all defined there (confirm).
%run ./Classes.ipynb

# Initialize Players and Game

In [14]:
## Players
# pos=0 plays the 'top' side of the board, pos=1 the 'bottom' side

## AI agents: two DQN players with identical hyperparameters, one per seat.
## The list is built left to right, so ai0 (top) is constructed before ai1 (bottom).
ai0, ai1 = [
    DQNPlayer(gamma=0.97,
              learning_rate=1e-2,
              batch_size=128,
              target_net_update_steps=1000,
              replay_memory_capacity=1e6,
              epsilon_start=1000,
              epsilon_stop=8e4,
              epsilon_min=0.05,
              weight_decay=1e-4,
              pos=seat,
              name='AI')
    for seat in (0, 1)
]

## Greedy players (always choose the action with max reward), top then bottom
greedy0, greedy1 = [GreedyPlayer(pos=seat) for seat in (0, 1)]

## Random players, top then bottom
random0, random1 = [RandomPlayer(pos=seat) for seat in (0, 1)]

## Human player, always seated at the bottom (pos=1)
human = HumanPlayer(pos=1)

## Game engine, seeded with the notebook-wide seed
game = Mancala(seed=seed, disp=False)

## The 'Arena' ties the game and the two AI agents together
arena = Arena(game, ai0, ai1, disp=False)  # disp=False to avoid output

Memory Initialized
Memory Initialized
Memory Initialized
Memory Initialized
Memory Initialized
Memory Initialized
Memory Initialized


Every player has its own Memory. 

This is only relevant for the AIs.

The other ones get one for simplicity.

## Play a first game: human vs greedy

The possible actions are 0-11.

0 - 5:  choose bowl 0-5, put NO bean into 'Score'

6 - 11: choose bowl 0-5, put A bean into 'Score'

Use 12 to end the game.

In [15]:
# Optional warm-up game: greedy player on top vs. the human at the bottom, with the
# board displayed. Presumably left commented out because the human player waits for
# keyboard input; uncomment the three lines below to play.
#testarena = Arena(game, greedy0, human, disp=True)
#testarena.reset()
#testarena.play()

# Start the game, train the AIs

Takes a couple of minutes (~10-20min?)

num_iterations = 2e5 may be overkill (if you change it, epsilon_stop should be adjusted as well).

In [16]:
# Number of self-play games used for training
num_iterations = int(2e5)

# Per-agent buffers (index 0 -> ai0, index 1 -> ai1) with the values collected
# since the last progress report; used to save some intermediate results
res = [[], []]
loss = [[], []]


def _mean(values):
    """Return the arithmetic mean of a non-empty sequence."""
    return sum(values) / len(values)


# Training loop: one iteration == one complete game
for i in range(num_iterations):

    # start from a fresh game and play it to the end
    arena.reset()
    arena.play()

    # record both final scores ...
    res[0].append(ai0.score)
    res[1].append(ai1.score)

    # ... and the loss attribute of both agents
    loss[0].append(ai0.loss)
    loss[1].append(ai1.loss)

    # progress report every 1000 games, averaged over the games since the last report
    games_played = i + 1
    if games_played % 1000 == 0:
        print(f'Game {games_played} / {num_iterations}')
        print(f'Average Score:  ai0: {_mean(res[0]):.3f}    ai1: {_mean(res[1]):.3f}')
        print(f'Epsilon:        ai0: {ai0.epsilon():.3f}     ai1: {ai1.epsilon():.3f}')
        print(f'Av. Last Loss:  ai0: {_mean(loss[0]):.4f}    ai1: {_mean(loss[1]):.4f}')
        print('-------------------------------------------')
        # open a new averaging window
        res, loss = [[], []], [[], []]

    # Only the loser of this game calls update_step(); note that a draw
    # (equal scores) falls into the else-branch, i.e. ai0 is the one that trains.
    loser = ai1 if ai0.score > ai1.score else ai0
    loser.update_step()

Updating target network...
Game 1000 / 200000
Average Score:  ai0: 23.448    ai1: 24.552
Epsilon:        ai0: 0.999     ai1: 1.000
Av. Last Loss:  ai0: 3.8422    ai1: 3.8210
-------------------------------------------
Updating target network...
Updating target network...
Updating target network...
Game 2000 / 200000
Average Score:  ai0: 24.353    ai1: 23.647
Epsilon:        ai0: 0.993     ai1: 0.995
Av. Last Loss:  ai0: 3.8496    ai1: 3.8616
-------------------------------------------
Updating target network...
Game 3000 / 200000
Average Score:  ai0: 23.971    ai1: 24.029
Epsilon:        ai0: 0.987     ai1: 0.989
Av. Last Loss:  ai0: 3.8450    ai1: 3.8690
-------------------------------------------
Updating target network...
Game 4000 / 200000
Average Score:  ai0: 23.910    ai1: 24.090
Epsilon:        ai0: 0.980     ai1: 0.984
Av. Last Loss:  ai0: 3.8442    ai1: 3.8791
-------------------------------------------
Updating target network...
Game 5000 / 200000
Average Score:  ai0: 24.072 

Updating target network...
Game 39000 / 200000
Average Score:  ai0: 23.903    ai1: 24.097
Epsilon:        ai0: 0.760     ai1: 0.783
Av. Last Loss:  ai0: 3.9142    ai1: 3.8886
-------------------------------------------
Updating target network...
Game 40000 / 200000
Average Score:  ai0: 23.961    ai1: 24.039
Epsilon:        ai0: 0.754     ai1: 0.777
Av. Last Loss:  ai0: 3.9351    ai1: 3.8896
-------------------------------------------
Updating target network...
Game 41000 / 200000
Average Score:  ai0: 23.935    ai1: 24.065
Epsilon:        ai0: 0.747     ai1: 0.772
Av. Last Loss:  ai0: 3.9208    ai1: 3.8706
-------------------------------------------
Updating target network...
Game 42000 / 200000
Average Score:  ai0: 24.279    ai1: 23.721
Epsilon:        ai0: 0.741     ai1: 0.766
Av. Last Loss:  ai0: 3.9418    ai1: 3.8598
-------------------------------------------
Updating target network...
Game 43000 / 200000
Average Score:  ai0: 24.980    ai1: 23.020
Epsilon:        ai0: 0.735     ai1

Updating target network...
Game 77000 / 200000
Average Score:  ai0: 25.067    ai1: 22.933
Epsilon:        ai0: 0.527     ai1: 0.559
Av. Last Loss:  ai0: 3.9274    ai1: 3.8650
-------------------------------------------
Updating target network...
Game 78000 / 200000
Average Score:  ai0: 24.050    ai1: 23.950
Epsilon:        ai0: 0.521     ai1: 0.553
Av. Last Loss:  ai0: 3.9398    ai1: 3.9226
-------------------------------------------
Updating target network...
Game 79000 / 200000
Average Score:  ai0: 23.908    ai1: 24.092
Epsilon:        ai0: 0.514     ai1: 0.548
Av. Last Loss:  ai0: 3.8999    ai1: 3.8958
-------------------------------------------
Updating target network...
Game 80000 / 200000
Average Score:  ai0: 23.521    ai1: 24.479
Epsilon:        ai0: 0.508     ai1: 0.542
Av. Last Loss:  ai0: 3.8898    ai1: 3.8887
-------------------------------------------
Updating target network...
Game 81000 / 200000
Average Score:  ai0: 24.305    ai1: 23.695
Epsilon:        ai0: 0.502     ai1

Updating target network...
Game 115000 / 200000
Average Score:  ai0: 25.384    ai1: 22.616
Epsilon:        ai0: 0.294     ai1: 0.335
Av. Last Loss:  ai0: 3.9134    ai1: 3.9009
-------------------------------------------
Updating target network...
Game 116000 / 200000
Average Score:  ai0: 25.018    ai1: 22.982
Epsilon:        ai0: 0.289     ai1: 0.328
Av. Last Loss:  ai0: 3.9001    ai1: 3.8824
-------------------------------------------
Updating target network...
Game 117000 / 200000
Average Score:  ai0: 25.680    ai1: 22.320
Epsilon:        ai0: 0.283     ai1: 0.322
Av. Last Loss:  ai0: 3.8659    ai1: 3.8887
-------------------------------------------
Updating target network...
Game 118000 / 200000
Average Score:  ai0: 23.707    ai1: 24.293
Epsilon:        ai0: 0.277     ai1: 0.316
Av. Last Loss:  ai0: 3.9049    ai1: 3.9279
-------------------------------------------
Updating target network...
Game 119000 / 200000
Average Score:  ai0: 23.976    ai1: 24.024
Epsilon:        ai0: 0.270   

Updating target network...
Game 153000 / 200000
Average Score:  ai0: 24.872    ai1: 23.128
Epsilon:        ai0: 0.066     ai1: 0.106
Av. Last Loss:  ai0: 3.9554    ai1: 3.9025
-------------------------------------------
Updating target network...
Game 154000 / 200000
Average Score:  ai0: 25.585    ai1: 22.415
Epsilon:        ai0: 0.061     ai1: 0.099
Av. Last Loss:  ai0: 3.8988    ai1: 3.9146
-------------------------------------------
Updating target network...
Game 155000 / 200000
Average Score:  ai0: 23.725    ai1: 24.275
Epsilon:        ai0: 0.054     ai1: 0.094
Av. Last Loss:  ai0: 3.8916    ai1: 3.8815
-------------------------------------------
Updating target network...
Game 156000 / 200000
Average Score:  ai0: 25.306    ai1: 22.694
Epsilon:        ai0: 0.050     ai1: 0.088
Av. Last Loss:  ai0: 3.9005    ai1: 3.8994
-------------------------------------------
Updating target network...
Game 157000 / 200000
Average Score:  ai0: 25.346    ai1: 22.654
Epsilon:        ai0: 0.050   

Updating target network...
Game 191000 / 200000
Average Score:  ai0: 24.331    ai1: 23.669
Epsilon:        ai0: 0.050     ai1: 0.050
Av. Last Loss:  ai0: 3.8763    ai1: 3.9303
-------------------------------------------
Updating target network...
Game 192000 / 200000
Average Score:  ai0: 23.460    ai1: 24.540
Epsilon:        ai0: 0.050     ai1: 0.050
Av. Last Loss:  ai0: 3.9178    ai1: 3.8924
-------------------------------------------
Updating target network...
Game 193000 / 200000
Average Score:  ai0: 24.539    ai1: 23.461
Epsilon:        ai0: 0.050     ai1: 0.050
Av. Last Loss:  ai0: 3.9210    ai1: 3.8905
-------------------------------------------
Updating target network...
Game 194000 / 200000
Average Score:  ai0: 20.508    ai1: 27.492
Epsilon:        ai0: 0.050     ai1: 0.050
Av. Last Loss:  ai0: 3.9150    ai1: 3.9129
-------------------------------------------
Updating target network...
Game 195000 / 200000
Average Score:  ai0: 22.961    ai1: 25.039
Epsilon:        ai0: 0.050   

In [17]:
# Ask both target networks for their per-action values on the initial board,
# encoded as a batch of one row with twelve entries of 4.
initial_board = torch.tensor([[4]*12], dtype=torch.float32)

# This is pure inference: under no_grad() no autograd graph is built, so the
# printed tensors show only the values (no grad_fn) and nothing is kept alive
# for a backward pass that never happens. The values themselves are unchanged.
with torch.no_grad():
    print('What (score(ai1) - score(ai0) = ?) does ai0 predict from the initial board?')
    print(ai0.target_net(initial_board))

    print('How about ai1?')
    print(ai1.target_net(initial_board))



What (score(ai1) - score(ai0) = ?) does ai0 predict from the initial board?
tensor([[-0.6346, -0.8255, -0.8171, -0.8774, -1.0938, -1.9241, -0.4624, -0.6428,
         -0.2846, -0.4909, -0.4240, -0.1323]], grad_fn=<AddmmBackward0>)
How about ai1?
tensor([[-0.6623, -0.7241, -0.6190, -0.8943, -1.0796, -1.5911, -0.3514, -0.4626,
         -0.3372, -0.5279, -0.3493,  0.0069]], grad_fn=<AddmmBackward0>)


From this we can learn:

The best first actions in a game are:

Roughly, for the run shown above (the two networks differ slightly in the middle of the ranking): 11 > 8 > 10 > 6 > 7, 9 > 0, 1, 2 > 3 > 4 > 5

# Final Test
Test performance against various players:
- ai0 vs ai1
- vs random
- vs greedy
- vs human

## ai0 vs ai1

In [18]:
# ai0 (top) vs ai1 (bottom): the two trained agents play 500 games against each other

arena.test(ai0, ai1, num_games=500, disp=False)

Average Score: Top AI: 27.796 - Bottom AI: 20.204


## VS Random Player

In [19]:
# ai0 (top) vs random1 (bottom), 500 games without board display

arena.test(ai0, random1, num_games=500, disp=False)

Average Score: Top AI: 33.122 - Bottom Random: 14.878


In [20]:
# random0 (top) vs ai1 (bottom), 500 games without board display

arena.test(random0, ai1, num_games=500, disp=False)

Average Score: Top Random: 16.076 - Bottom AI: 31.924


AI is much better than random players.

## VS Greedy Player

In [21]:
# ai0 (top) vs greedy1 (bottom), 500 games without board display

arena.test(ai0, greedy1, num_games=500, disp=False)

Average Score: Top AI: 25.438 - Bottom Greedy: 22.562


In [22]:
# greedy0 (top) vs ai1 (bottom), 500 games without board display

arena.test(greedy0, ai1, num_games=500, disp=False)

Average Score: Top Greedy: 24.554 - Bottom AI: 23.446


The AI is at best slightly better than a greedy player: on average it wins from the top position but loses narrowly from the bottom position.

## VS Human

Play a round against ai0 (AI on top position)

In [23]:
# Interactive: a single displayed game, ai0 (top) vs the human (bottom).
# The human is prompted for each move (input 0-11, or 12 to end the game).
arena.test(ai0, human, num_games=1, disp=True)

Game initialized!
Top Player:    AI. Score: 24
Bottom Player: Human. Score: 0
[0, 0, 1, 1, 1, 0]
[0, 0, 0, 0, 0, 0]End = True
Next Player: Human
[tensor([-0.5821, -0.7393, -0.8915, -0.9270, -0.9370, -1.7775, -0.4072, -0.4386,
        -0.5224, -0.3173, -0.2789, -0.1769], grad_fn=<UnbindBackward0>)]
AI chooses 11
Reward: 1
Top Player:    AI. Score: 1
Bottom Player: Human. Score: 0
[0, 4, 4, 4, 4, 4]
[5, 5, 5, 4, 4, 4]End = False
Next Player: Human
It is Humans turn. Whats your next move? (input 0-11, end: 12)12
Human chooses 12
Reward: 0
Game Over
Average Score: Top AI: 21.0 - Bottom Human: 0.0


It seems that the AI is able to play solidly.

But not great. 

In [None]:
# Scratch cell, not executed in the saved notebook. Presumably meant for re-running
# Classes.ipynb and rebuilding the arena after editing the classes -- confirm or delete.
#%run ./Classes.ipynb
#arena = Arena(game, ai0, ai1, disp=False) # disp=False to avoid output