In [None]:
import random

# Maybe you have to !pip3 install torch or !pip install torch
import torch
from torch import nn
import numpy as np

seed = 0

random.seed(seed)
torch.manual_seed(seed)
np.random.seed(seed)

# Run Classes.ipynb for relevant classes

In [19]:
%run ./Classes.ipynb

# Initialize Players and Game

In [7]:
## Initialize Players
# 0 for position 'top'
# 1 for position 'bottom'

## AI Agent
ai0 = DQNPlayer(gamma=0.97, 
                learning_rate = 1e-2,
                batch_size = 128,
                target_net_update_steps = 1000,
                replay_memory_capacity = 1e6, 
                epsilon_start = 1000,
                epsilon_stop = 8e4,
                epsilon_min = 0.05,
                pos = 0,
                name = 'AI')

## AI Agent
ai1 = DQNPlayer(gamma=0.97, 
                learning_rate = 1e-2,
                batch_size = 128,
                target_net_update_steps = 1000,
                replay_memory_capacity = 1e6, 
                epsilon_start = 1000,
                epsilon_stop = 8e4,
                epsilon_min = 0.05,
                pos = 1,
                name = 'AI')

## Greedy Players (always choose action with max reward)
greedy0 = GreedyPlayer(pos=0)
greedy1 = GreedyPlayer(pos=1)

## Random Players
random0 = RandomPlayer(pos=0)
random1 = RandomPlayer(pos=1)

## Human Player (position: 1-'bottom')
human = HumanPlayer(pos=1)

## The Game Engine
game = Mancala(seed=seed, disp=False)

## Everything meets in the 'Arena'
arena = Arena(game, ai0, ai1, disp=False) # disp=False to avoid output

Memory Initialized
Memory Initialized
Memory Initialized
Memory Initialized
Memory Initialized
Memory Initialized
Memory Initialized


Every player has its own Memory. 

This is only relevant for the AIs.

The other ones get one for simplicity.

## Play one first game human vs greedy

The possible actions are 0-11.

0 - 5:  choose bowl 0-5, put NO bean into 'Score'

6 - 11: choose bowl 0-5 put A bean into 'Score'

use 12 to end the game.

In [26]:
testarena = Arena(game, greedy0, human, disp=True)
testarena.reset()
testarena.play()

Game initialized!
Top Player:    Greedy. Score: 0
Bottom Player: Human. Score: 28
[0, 0, 0, 0, 0, 0]
[0, 0, 0, 0, 0, 0]End = True
Next Player: Greedy
It is Humans turn. Whats your next move? (input 0-11, end: 12)0
Human chooses 0
Reward: 0
Top Player:    Greedy. Score: 0
Bottom Player: Human. Score: 0
[4, 4, 4, 4, 4, 4]
[0, 5, 5, 5, 5, 4]End = False
Next Player: Greedy
Greedy chooses 7
Reward: 0
Top Player:    Greedy. Score: 0
Bottom Player: Human. Score: 0
[5, 5, 5, 5, 0, 4]
[0, 5, 5, 5, 5, 4]End = False
Next Player: Human
It is Humans turn. Whats your next move? (input 0-11, end: 12)12
Human chooses 12
Reward: 0
Game Over


# Start the game, train the AIs

Takes a couple of minutes

In [10]:
# Nr games for training
num_iterations = int(2e5)

# To safe some results
res = [[],[]]
loss = [[],[]]

# Training Loop
for i in range(num_iterations):
    
    # reset game
    arena.reset()
    
    # play one game
    arena.play()
    
    # save final score
    res[0].append(ai0.score)
    res[1].append(ai1.score)
    
    # save loss
    loss[0].append(ai0.loss)
    loss[1].append(ai1.loss)
    
    # print development
    if (i+1) % 1000 == 0:
        print(f'Game {i+1} / {int(num_iterations)}')
        print(f'Average Score:  ai0: {sum(res[0])/len(res[0]):.3f}    ai1: {sum(res[1])/len(res[1]):.3f}')
        print(f'Epsilon:        ai0: {ai0.epsilon() :.3f}     ai1: {ai1.epsilon() :.3f}')
        print(f'Av. Last Loss:  ai0: {sum(loss[0])/len(loss[0]) :.4f}    ai1: {sum(loss[1])/len(loss[1]) :.4f}')
        print('-------------------------------------------')
        res = [[],[]]
        loss = [[],[]]
    
    # The one that lost trains
    if ai0.score > ai1.score:
        ai1.update_step()
    else:
        ai0.update_step()

Updating target network...
Game 1000 / 200000
Score:        ai0: 24.663, ai1: 23.337
Epsilon:      ai0: 0.799,  ai1: 0.817
Average Loss: ai0: 0.9340,  ai1: 0.9577
-------------------------------------------
Updating target network...
Game 2000 / 200000
Score:        ai0: 24.307, ai1: 23.693
Epsilon:      ai0: 0.792,  ai1: 0.811
Average Loss: ai0: 0.9401,  ai1: 0.9393
-------------------------------------------
Updating target network...
Game 3000 / 200000
Score:        ai0: 24.177, ai1: 23.823
Epsilon:      ai0: 0.786,  ai1: 0.805
Average Loss: ai0: 0.9286,  ai1: 0.9298
-------------------------------------------
Updating target network...
Game 4000 / 200000
Score:        ai0: 24.279, ai1: 23.721
Epsilon:      ai0: 0.780,  ai1: 0.800
Average Loss: ai0: 0.9323,  ai1: 0.9438
-------------------------------------------
Updating target network...
Game 5000 / 200000
Score:        ai0: 24.435, ai1: 23.565
Epsilon:      ai0: 0.774,  ai1: 0.794
Average Loss: ai0: 0.9339,  ai1: 0.9416
---------

Updating target network...
Game 41000 / 200000
Score:        ai0: 24.551, ai1: 23.449
Epsilon:      ai0: 0.554,  ai1: 0.581
Average Loss: ai0: 0.9311,  ai1: 0.9481
-------------------------------------------
Updating target network...
Game 42000 / 200000
Score:        ai0: 24.374, ai1: 23.626
Epsilon:      ai0: 0.548,  ai1: 0.574
Average Loss: ai0: 0.9244,  ai1: 0.9534
-------------------------------------------
Updating target network...
Game 43000 / 200000
Score:        ai0: 24.629, ai1: 23.371
Epsilon:      ai0: 0.542,  ai1: 0.569
Average Loss: ai0: 0.9399,  ai1: 0.9516
-------------------------------------------
Updating target network...
Game 44000 / 200000
Score:        ai0: 24.570, ai1: 23.430
Epsilon:      ai0: 0.536,  ai1: 0.563
Average Loss: ai0: 0.9197,  ai1: 0.9525
-------------------------------------------
Updating target network...
Game 45000 / 200000
Score:        ai0: 24.438, ai1: 23.562
Epsilon:      ai0: 0.530,  ai1: 0.557
Average Loss: ai0: 0.9326,  ai1: 0.9514
----

Updating target network...
Game 81000 / 200000
Score:        ai0: 23.888, ai1: 24.112
Epsilon:      ai0: 0.314,  ai1: 0.339
Average Loss: ai0: 0.9298,  ai1: 0.9265
-------------------------------------------
Updating target network...
Game 82000 / 200000
Score:        ai0: 24.403, ai1: 23.597
Epsilon:      ai0: 0.308,  ai1: 0.334
Average Loss: ai0: 0.9308,  ai1: 0.9381
-------------------------------------------
Updating target network...
Game 83000 / 200000
Score:        ai0: 24.617, ai1: 23.383
Epsilon:      ai0: 0.302,  ai1: 0.328
Average Loss: ai0: 0.9470,  ai1: 0.9302
-------------------------------------------
Updating target network...
Game 84000 / 200000
Score:        ai0: 24.387, ai1: 23.613
Epsilon:      ai0: 0.296,  ai1: 0.322
Average Loss: ai0: 0.9380,  ai1: 0.9317
-------------------------------------------
Updating target network...
Game 85000 / 200000
Score:        ai0: 24.736, ai1: 23.264
Epsilon:      ai0: 0.290,  ai1: 0.316
Average Loss: ai0: 0.9232,  ai1: 0.9485
----

Updating target network...
Game 121000 / 200000
Score:        ai0: 23.218, ai1: 24.782
Epsilon:      ai0: 0.058,  ai1: 0.115
Average Loss: ai0: 0.9424,  ai1: 0.9559
-------------------------------------------
Game 122000 / 200000
Score:        ai0: 23.773, ai1: 24.227
Epsilon:      ai0: 0.050,  ai1: 0.110
Average Loss: ai0: 0.9166,  ai1: 0.9167
-------------------------------------------
Updating target network...
Updating target network...
Game 123000 / 200000
Score:        ai0: 23.479, ai1: 24.521
Epsilon:      ai0: 0.050,  ai1: 0.106
Average Loss: ai0: 0.9297,  ai1: 0.9188
-------------------------------------------
Updating target network...
Game 124000 / 200000
Score:        ai0: 23.409, ai1: 24.591
Epsilon:      ai0: 0.050,  ai1: 0.102
Average Loss: ai0: 0.9315,  ai1: 0.9487
-------------------------------------------
Updating target network...
Game 125000 / 200000
Score:        ai0: 23.559, ai1: 24.441
Epsilon:      ai0: 0.050,  ai1: 0.098
Average Loss: ai0: 0.9367,  ai1: 0.9239

Updating target network...
Game 161000 / 200000
Score:        ai0: 22.556, ai1: 25.444
Epsilon:      ai0: 0.050,  ai1: 0.050
Average Loss: ai0: 0.9447,  ai1: 0.9527
-------------------------------------------
Updating target network...
Game 162000 / 200000
Score:        ai0: 21.642, ai1: 26.358
Epsilon:      ai0: 0.050,  ai1: 0.050
Average Loss: ai0: 0.9332,  ai1: 0.9513
-------------------------------------------
Updating target network...
Game 163000 / 200000
Score:        ai0: 22.670, ai1: 25.330
Epsilon:      ai0: 0.050,  ai1: 0.050
Average Loss: ai0: 0.9278,  ai1: 0.9453
-------------------------------------------
Updating target network...
Game 164000 / 200000
Score:        ai0: 22.579, ai1: 25.421
Epsilon:      ai0: 0.050,  ai1: 0.050
Average Loss: ai0: 0.9307,  ai1: 0.9612
-------------------------------------------
Updating target network...
Game 165000 / 200000
Score:        ai0: 22.635, ai1: 25.365
Epsilon:      ai0: 0.050,  ai1: 0.050
Average Loss: ai0: 0.9384,  ai1: 0.9131

In [12]:
print('What (score(ai1) - score(ai0) = ?) does ai0 predict from the initial board?')

print(ai0.target_net(torch.tensor([[4]*12], dtype=torch.float32)))

print('How about ai1?')

print(ai1.target_net(torch.tensor([[4]*12], dtype=torch.float32)))



What (score(ai1) - score(ai0) = ?) does ai0 predict from the initial board?
tensor([[-0.2839, -0.2945, -0.3810, -0.4163, -0.4798, -0.5579, -0.0037,  0.1415,
          0.2691,  0.3647,  0.5293,  0.6231]], grad_fn=<AddmmBackward0>)
How about ai1?
tensor([[-0.2990, -0.2997, -0.4171, -0.3973, -0.4636, -0.5963, -0.0836,  0.1288,
          0.2255,  0.3620,  0.4615,  0.5946]], grad_fn=<AddmmBackward0>)


From this we can learn:

The best first actions in a game are:

11 > 10 > 9 > 8 > 7 > 6 > 0, 1 > 2, 3 > 4 > 5

# Final Test
Test performance against various players:
- ai0 vs ai1
- vs random
- vs greedy
- vs human

## ai0 vs ai1

In [13]:
# ai0 vs ai1

arena.test(ai0, ai1, num_games=500, disp=False)

Average Score: Top AI: 24.516 - Bottom AI: 23.484


## VS Random Player

In [14]:
# ai0 vs random1

arena.test(ai0, random1, num_games=500, disp=False)

Average Score: Top AI: 32.79 - Bottom Random: 15.21


In [15]:
# random0 vs ai1

arena.test(random0, ai1, num_games=500, disp=False)

Average Score: Top Random: 16.066 - Bottom AI: 31.934


AI is much better than random players.

## VS Greedy Player

In [16]:
# ai0 vs greedy1

arena.test(ai0, greedy1, num_games=500, disp=False)

Average Score: Top AI: 24.762 - Bottom Greedy: 23.238


In [17]:
# greedy0 vs ai1

arena.test(ai0, greedy1, num_games=500, disp=False)

Average Score: Top AI: 25.114 - Bottom Greedy: 22.886


AI is only slightly better than a greedy player.

## VS Human

Play a round against ai0 (AI on top position)

In [23]:
arena.test(ai0, human, num_games=1, disp=True)

Game initialized!
Top Player:    AI. Score: 24
Bottom Player: Human. Score: 0
[4, 4, 4, 4, 4, 4]
[4, 4, 4, 4, 4, 4]End = True
Next Player: Human
AI chooses 11
Reward: 1
Top Player:    AI. Score: 1
Bottom Player: Human. Score: 0
[0, 4, 4, 4, 4, 4]
[5, 5, 5, 4, 4, 4]End = False
Next Player: Human
It is Humans turn. Whats your next move? (input 0-11, end: 12)9
Human chooses 9
Reward: 1
Top Player:    AI. Score: 1
Bottom Player: Human. Score: 1
[0, 4, 4, 4, 4, 5]
[5, 5, 5, 0, 5, 5]End = False
Next Player: AI
AI chooses 10
Reward: 1
Top Player:    AI. Score: 2
Bottom Player: Human. Score: 1
[1, 0, 4, 4, 4, 5]
[6, 6, 5, 0, 5, 5]End = False
Next Player: Human
It is Humans turn. Whats your next move? (input 0-11, end: 12)7
Human chooses 7
Reward: 1
Top Player:    AI. Score: 2
Bottom Player: Human. Score: 2
[1, 0, 4, 4, 4, 6]
[6, 0, 6, 1, 6, 6]End = False
Next Player: AI
AI chooses 11
Reward: 1
Top Player:    AI. Score: 3
Bottom Player: Human. Score: 2
[0, 0, 4, 4, 4, 6]
[6, 0, 6, 1, 6, 6]End =

It seems that the AI is able to play solid.

But not great. 