# Zoning Game AlphaZero Demo

## Individually construct bits and pieces

In [1]:
%autoreload 1

from nsai_experiments.general_az_1p.utils import disable_numpy_multithreading, use_deterministic_cuda
disable_numpy_multithreading()
use_deterministic_cuda()

import copy

# we have to imports a bit oddly to get autoreload to work
%aimport nsai_experiments.general_az_1p.game
Game = nsai_experiments.general_az_1p.game.Game
%aimport nsai_experiments.general_az_1p.policy_value_net
PolicyValueNet = nsai_experiments.general_az_1p.policy_value_net.PolicyValueNet
%aimport nsai_experiments.general_az_1p.agent
Agent = nsai_experiments.general_az_1p.agent.Agent

%aimport nsai_experiments.general_az_1p.zoning_game.zoning_game_az_impl
ZoningGameGame = nsai_experiments.general_az_1p.zoning_game.zoning_game_az_impl.ZoningGameGame
%aimport nsai_experiments.general_az_1p.zoning_game.zoning_game_az_impl
ZoningGamePolicyValueNet = nsai_experiments.general_az_1p.zoning_game.zoning_game_az_impl.ZoningGamePolicyValueNet

### The `Game`

In [2]:
# Build the zoning game, check that it is an instance of the generic `Game`
# class, and reset it with a fixed seed.
mygame = ZoningGameGame()
assert isinstance(mygame, Game)
mygame.reset_wrapper(seed=47)
# render() is consumed via .read() here; the type-ignore suggests its declared
# return type is a union (presumably Optional) -- TODO confirm.
print(mygame.render().read())  # type: ignore[union-attr]

Tile grid:
[[0 0 0 5 1 0]
 [0 4 0 0 0 0]
 [0 3 0 3 2 4]
 [0 0 0 0 0 0]
 [2 0 0 0 0 0]
 [0 0 0 0 3 1]]
Tile queue (leftmost next): [1 4 2 1 5 2 3 3 2 3 1 1 1 4 2 2 1 5 5 2 1 5 3 2 5 1 0 0 0 0 0 0 0 0 0 0]
where 0 = EMPTY, 1 = RESIDENTIAL, 2 = COMMERCIAL, 3 = INDUSTRIAL, 4 = DOWNTOWN, 5 = PARK.
After 0 moves, current grid score is 3; terminated = False, truncated = False.



### The `PolicyValueNet`

In [3]:
import torch
from nsai_experiments.zoning_game.notebook_utils import get_zg_data
from nsai_experiments.zoning_game.zg_policy import create_policy_indiv_greedy

# Seed torch's global RNG before any random call in this cell.
torch.manual_seed(47)

# Settings for the supervised dataset and its split.
n_games = 20_000
savedir = "../../zoning_game/zg_data"
valid_frac = 0.15
test_frac = 0.35

states_tensor, values_tensor, moves_tensor = get_zg_data(
    create_policy_indiv_greedy,
    n_games=n_games,
    savedir=savedir,
)

# Index all three tensors with one shared permutation so rows stay aligned.
# Note the dataset column order is (states, moves, values), which differs from
# the (states, values, moves) order in which get_zg_data returns them.
indices = torch.randperm(len(values_tensor))
full_dataset_3 = torch.utils.data.TensorDataset(
    states_tensor[indices],
    moves_tensor[indices],
    values_tensor[indices],
)

# Validation and test sizes are truncated to whole numbers; the training split
# receives every remaining example.
valid_size_3 = int(valid_frac * len(full_dataset_3))
test_size_3 = int(test_frac * len(full_dataset_3))
train_size_3 = len(full_dataset_3) - valid_size_3 - test_size_3
train_dataset_3, valid_dataset_3, test_dataset_3 = torch.utils.data.random_split(
    full_dataset_3,
    [train_size_3, valid_size_3, test_size_3],
)
print("Done loading, shuffling, splitting data")

Loading data from disk: ../../zoning_game/zg_data/create_policy_indiv_greedy__20000
Done loading, shuffling, splitting data


In [4]:
# Create the zoning-game network with a fixed seed and check that it is an
# instance of the generic `PolicyValueNet` class.
mynet = ZoningGamePolicyValueNet(random_seed=47)
assert isinstance(mynet, PolicyValueNet)
# Keep a deep copy of the current observation so a later cell can query the
# network on the same input again.
initial_obs = copy.deepcopy(mygame.obs)
# Displayed as a pair of arrays -- presumably (policy, value); TODO confirm.
mynet.predict(initial_obs)

Neural network training will occur on device 'mps'


(array([0.02805594, 0.02738343, 0.02839885, 0.02788622, 0.0291001 ,
        0.02763609, 0.02681519, 0.02864029, 0.02788756, 0.02838092,
        0.02778699, 0.02877449, 0.02703443, 0.02906117, 0.02899825,
        0.02776492, 0.02859806, 0.02702451, 0.02705516, 0.02807434,
        0.02658405, 0.02678845, 0.02949526, 0.02644843, 0.02754996,
        0.02703502, 0.02698193, 0.02768094, 0.02770268, 0.02843505,
        0.02523457, 0.02692246, 0.02690465, 0.0284914 , 0.02925407,
        0.02813419], dtype=float32),
 array(0.02830549, dtype=float32))

### The `Agent` and `MCTS`

In [5]:
import logging

# Set the root logger's level to WARNING.
# TODO: carried over from the original cell, which does not say what is pending.
logging.getLogger().setLevel(logging.WARNING)

# Seeds are passed explicitly under the "mcts", "train" and "eval" keys.
myagent = Agent(
    mygame,
    mynet,
    random_seeds={"mcts": 48, "train": 49, "eval": 50},
)

RNG seeds are fully specified


In [10]:
# Gather examples from 100 calls to the agent's example-generation routine and
# report how many were collected in total.
# NOTE(review): `_play_for_examples` is underscore-prefixed, i.e. private by
# convention -- confirm there is no public entry point for this.
# NOTE(review): the arguments (i, i+1, i+2) overlap between consecutive
# iterations; presumably they are RNG seeds -- TODO confirm the overlap is intended.
real_examples = []
for i in range(100):
    res = myagent._play_for_examples(i, i+1, i+2)
    real_examples.extend(res)
print(len(real_examples))

2166


In [7]:
# Snapshot the agent with a deep copy before any training; `oldagent` is the
# opponent reused by the later pit() calls. At this point both agents are
# copies of the same object, so this first comparison is agent-vs-itself.
oldagent = copy.deepcopy(myagent)
myagent.pit(oldagent)

..evaluation done in 3.05 seconds
Old network+MCTS average reward: 0.26, min: -0.13, max: 0.64, stdev: 0.20
New network+MCTS average reward: 0.26, min: -0.13, max: 0.64, stdev: 0.20
Old bare network average reward: 0.29, min: -0.04, max: 0.66, stdev: 0.20
New bare network average reward: 0.29, min: -0.04, max: 0.66, stdev: 0.20
New network won 0 and tied 20 out of 20 games (50.00% wins where ties are half wins)


np.float64(0.5)

Does it perform better with some supervised pretraining?

In [8]:
# Validate the network on the held-out supervised split before the training
# cell runs. The printed result is a 2-tuple; its meaning is not shown here.
print(mynet.validate(valid_dataset_3, needs_reshape=False))
# Validation on the collected `real_examples` is left disabled in this cell:
# print(mynet.validate(real_examples, needs_reshape=True))

(2121.322389602661, 0.029563357558799908)


In [9]:
# Call train() five times on the supervised split, validating against the
# held-out split. The saved log shows 10 epochs per call.
# NOTE(review): the saved output ends in a KeyboardInterrupt during the first
# call, so results in later cells come from a partially completed run.
for i in range(5):
    mynet.train(train_dataset_3, valid_dataset_3, needs_reshape=False, print_all_epochs=True);  # takes a little while

Training with 212 batches of size 1024
Validating with 64 batches of size 1024
Epoch 1/10, Train Loss: 226.1332, Train Mean Max: 0.0916, Val Loss: 67.2739, Val Mean Max: 0.5250
Epoch 2/10, Train Loss: 41.5601, Train Mean Max: 0.0719, Val Loss: 33.3013, Val Mean Max: 0.0802
Epoch 3/10, Train Loss: 29.9196, Train Mean Max: 0.0701, Val Loss: 35.4818, Val Mean Max: 0.2063
Epoch 4/10, Train Loss: 26.7608, Train Mean Max: 0.0721, Val Loss: 24.6123, Val Mean Max: 0.1887
Epoch 5/10, Train Loss: 23.6819, Train Mean Max: 0.0786, Val Loss: 21.8252, Val Mean Max: 0.1003
Epoch 6/10, Train Loss: 21.8756, Train Mean Max: 0.0959, Val Loss: 23.6329, Val Mean Max: 0.1294
Epoch 7/10, Train Loss: 20.2668, Train Mean Max: 0.1186, Val Loss: 21.0380, Val Mean Max: 0.1770
Epoch 8/10, Train Loss: 18.9612, Train Mean Max: 0.1509, Val Loss: 17.6440, Val Mean Max: 0.1900
Epoch 9/10, Train Loss: 17.6445, Train Mean Max: 0.1874, Val Loss: 18.9868, Val Mean Max: 0.2482
Epoch 10/10, Train Loss: 16.0391, Train Mean Ma

KeyboardInterrupt: 

In [11]:
# Validate the current network on the held-out supervised split and on the
# collected `real_examples`. The latter is passed with needs_reshape=True;
# presumably its examples are stored in a different layout -- TODO confirm.
print(mynet.validate(valid_dataset_3, needs_reshape=False))
print(mynet.validate(real_examples, needs_reshape=True))

(10.231495648622513, 0.4422969068400562)
(1130.5799153645833, 0.36118146777153015)


In [None]:
# NOTE(review): this cell has the same code as the cell before it and has no
# execution count or saved output; consider deleting it.
print(mynet.validate(valid_dataset_3, needs_reshape=False))
print(mynet.validate(real_examples, needs_reshape=True))

In [13]:
# Query the network again on the observation saved in `initial_obs`, for
# comparison with the prediction displayed when the network was created.
mynet.predict(initial_obs)

(array([0.00171641, 0.06226806, 0.14863802, 0.01154106, 0.00627946,
        0.19635542, 0.11545606, 0.001392  , 0.04622583, 0.0365124 ,
        0.01825142, 0.03437214, 0.00270969, 0.00790704, 0.00148205,
        0.00158032, 0.01038758, 0.01097031, 0.01023317, 0.00970493,
        0.00760836, 0.00303479, 0.01117896, 0.08993824, 0.01089501,
        0.00764156, 0.0113241 , 0.00556701, 0.00761443, 0.08500791,
        0.00693498, 0.00597418, 0.00562198, 0.00591261, 0.00083107,
        0.0009315 ], dtype=float32),
 array(1.9298165, dtype=float32))

In [20]:
# Compare the current agent against the `oldagent` snapshot.
# NOTE(review): n_procs = -1 presumably means "use all available processes"
# -- TODO confirm against Agent. This mutates `myagent` in place, so the
# setting also applies to any cell run afterwards.
myagent.n_procs = -1
myagent.pit(oldagent)

..evaluation done in 14.31 seconds
Old network+MCTS average reward: 0.34, min: -0.04, max: 0.74, stdev: 0.23
New network+MCTS average reward: 0.44, min: 0.18, max: 0.85, stdev: 0.19
Old bare network average reward: 0.30, min: 0.03, max: 0.75, stdev: 0.20
New bare network average reward: 0.41, min: -0.03, max: 0.90, stdev: 0.23
New network won 15 and tied 0 out of 20 games (75.00% wins where ties are half wins)


np.float64(0.75)

In [21]:
# Repeat the comparison after setting the MCTS "temperature" parameter to
# 1/20 (0.05). Only `myagent` is modified; `oldagent` is a separate deep copy
# taken earlier, so presumably it keeps its previous setting -- TODO confirm.
myagent.mcts_params["temperature"] = 1/20.
myagent.pit(oldagent)

..evaluation done in 13.23 seconds
Old network+MCTS average reward: 0.34, min: -0.22, max: 0.97, stdev: 0.24
New network+MCTS average reward: 0.44, min: 0.04, max: 0.93, stdev: 0.25
Old bare network average reward: 0.31, min: -0.26, max: 0.95, stdev: 0.28
New bare network average reward: 0.36, min: 0.02, max: 0.91, stdev: 0.22
New network won 15 and tied 1 out of 20 games (77.50% wins where ties are half wins)


np.float64(0.775)