In [2]:
import random
from typing import Dict, List, Tuple
from functools import partial
from check_submission import check_submission
from game_mechanics import (
    Cell,
    WildTictactoeEnv,
    choose_move_randomly,
    load_dictionary,
    play_wild_ttt_game,
    render,
    save_dictionary,
)
import tqdm

In [4]:
def choose_move_no_value_fn(board: List[str]) -> Tuple[int, str]:
    return choose_move(board, {})

all_possible_moves = [(cell, mark) for cell in range(9) for mark in 'XO']

def make_move(board, move):
    return board[:move[0]] + move[1] + board[move[0]+1:]

In [151]:
0.99999**1_00000

0.3678776017682465

In [139]:
def choose_move(board: List[str], value_function: Dict, epsilon=0.05) -> Tuple[int, str]:
    board = ''.join(board)
    action_values = value_function.get(board, {})
    possible_moves = [m for m in all_possible_moves if board[m[0]] == ' ']
    if random.random() > epsilon:
        move_values = [-value_function.get(make_move(board, m), 0) for m in possible_moves]
        best_move_value = max(move_values)
        best_moves = [m for (m, v) in zip(possible_moves, move_values) if (v - best_move_value) < 0.001]
        best_move = random.choice(best_moves)
        return best_move
    else:
        return random.choice(possible_moves)


In [140]:
def train(n_episodes = 100_000, gamma = 0.99, alpha = 0.95) -> Dict:
    """Write this function to train your algorithm.

    Returns:
         Value function dictionary used by your agent. You can
         structure this how you like, however your choose_move must
         be able to use it.
    """
    env = WildTictactoeEnv(choose_move_no_value_fn)
    value_fn = {}
    for episode in tqdm.tqdm(range(n_episodes)):
        state, reward, done, info = env.reset(0)
        while not done:
            old_state = ''.join(state)
            old_reward = reward
            move = choose_move(state, value_fn)
            state, reward, done, info = env.step(move, 0)
            state = ''.join(state)
            old_evaluation = value_fn.get(old_state,0)
            new_evaluation = value_fn.get(state,0)
            value_fn[old_state] = old_evaluation*(1-alpha) + alpha*(old_reward + gamma*new_evaluation)
        value_fn[state] = (1 - alpha) * value_fn.get(state,0) + alpha * (reward)
    return value_fn

In [141]:
vf = train()

100%|██████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████| 100000/100000 [00:12<00:00, 8039.81it/s]


In [144]:
total_return = 0
for i in range(100):
    total_return += play_wild_ttt_game(
        your_choose_move=partial(choose_move, value_function=vf, epsilon=0),
        opponent_choose_move=choose_move_randomly,
        game_speed_multiplier=10000,
        verbose=False,
    )
total_return

-11

-4

In [138]:
vf

{'         ': 0.7951294339073957,
 '    X  O ': 0.7727947330467161,
 'O   XX O ': 0.9737924953166055,
 'O  XXX O ': 0.99999999921875,
 '   X  O  ': 0.9603961398014262,
 ' X XX O  ': 0.8293184705221751,
 ' X XXXO  ': 0.9999999999609375,
 ' O      X': -0.8691032941258319,
 'OO     XX': 0.9870359284007724,
 'OOX O  XX': -0.9899997030882296,
 'OOXOOO XX': -0.9999999999999998,
 '    OX   ': 0.8686349733651157,
 ' O  OXO  ': 0.9635442201137883,
 ' OX OXOO ': -0.9999996875,
 'O      X ': 0.9620464041832569,
 'O   X  XX': 0.9755402985915583,
 'O   X XXX': 0.999999984375,
 '   OX    ': 0.7868475319533874,
 '   OX  OO': -0.7964124121937362,
 ' O OX XOO': 0.04222499641960674,
 'XO OXXXOO': 2.939449218629466e-07,
 'XOXOXXXOO': 1.0,
 '  O   X  ': -0.00023246339958405554,
 'X O  XX  ': 0.9862800376911898,
 'X OOXXX  ': -0.896151070828571,
 'X OOXXXXX': -0.999875,
 ' XO X X  ': -0.7962858220680882,
 ' XOOXXX  ': 0.08844186109096908,
 ' XOOXXXOX': 0.942968827931915,
 'XXOOXXXOX': 1.0,
 'XX       ': 0.

In [12]:
env = WildTictactoeEnv(choose_move_no_value_fn)
value_fn = {}

In [97]:
state, reward, done, info = env.reset(1)


Game starts!
[[' ' ' ' ' ']
 [' ' ' ' ' ']
 [' ' ' ' ' ']]



In [13]:
old_state = ''.join(state)
old_state

'         '

In [90]:
env

[[' ' ' ' 'X']
 [' ' 'X' 'O']
 [' ' ' ' ' ']]

In [91]:
reward

0

In [96]:
state, reward, done, info = env.reset(1)
gamma = 0.99
alpha = 0.8
while not done:
    old_state = ''.join(state)
    move = choose_move(state, value_fn)
    print(move)
    state, reward, done, info = env.step(move, 0)
    print(env)
    print(reward)
    state = ''.join(state)
    old_evaluation = value_fn.get(old_state,0)
    new_evaluation = value_fn.get(state,0)
    value_fn[old_state] = old_evaluation*(1-alpha) + alpha*(reward + gamma*new_evaluation)
value_fn[state]

Game starts!
[[' ' ' ' ' ']
 [' ' ' ' ' ']
 [' ' ' ' ' ']]

opponent makes a move!
[[' ' ' ' ' ']
 [' ' 'X' ' ']
 [' ' ' ' ' ']]

(7, 'X')
[[' ' ' ' ' ']
 [' ' 'X' ' ']
 [' ' 'X' 'O']]

0
(5, 'X')
[['O' ' ' ' ']
 [' ' 'X' 'X']
 [' ' 'X' 'O']]

0
(6, 'X')
[['O' 'O' ' ']
 [' ' 'X' 'X']
 ['X' 'X' 'O']]

0
(2, 'X')
[['O' 'O' 'X']
 [' ' 'X' 'X']
 ['X' 'X' 'O']]

1


In [102]:
my_value_fn = load_dictionary("Deep Learners")

In [110]:
env.__dict__

{'opponent_choose_move': <function __main__.choose_move_no_value_fn(board: List[str]) -> Tuple[int, str]>,
 'done': False,
 'board': [[' ', ' ', ' '], [' ', ' ', ' '], [' ', ' ', ' ']],
 'player_move': 'player',
 'went_first': 'player'}

In [129]:
env.went_first

'opponent'

In [125]:
state, reward, done, info = env.reset(1)

Game starts!
[[' ' ' ' ' ']
 [' ' ' ' ' ']
 [' ' ' ' ' ']]

opponent makes a move!
[[' ' ' ' ' ']
 [' ' ' ' 'O']
 [' ' ' ' ' ']]



In [126]:
env.board = [[' ', 'X', 'O'],['O', 'X', 'X'],['X', 'X', 'O']]

In [120]:
env

[[' ' 'X' 'O']
 ['O' 'X' 'X']
 ['X' 'X' 'O']]

In [127]:
state, reward, done, info = env.step([0, 'O'], 1)

player makes a move!
[['O' 'X' 'O']
 ['O' 'X' 'X']
 ['X' 'X' 'O']]

You win!


In [128]:
reward

1

In [115]:
reward

1

In [103]:
my_value_fn

{'   O     ': 0.8734993165996106,
 '   O XX  ': 0.836964082681598,
 ' X O XX O': 0.7934652322704216,
 ' XOOXXX O': 0.00011670940385706434,
 ' XOOXXXXO': -0.9000000311757806,
 ' O       ': -0.8661021873895837,
 ' OO  O   ': -0.7910540917212813,
 ' OOO O  X': -0.8883877347116156,
 'XOOO O OX': -0.8914691614403375,
 'XOOOOO OX': 0.9002499984414062,
 '        O': -0.8749730513806928,
 'O  O    O': -0.037447385447724184,
 'O XO O  O': 0.8862943930399841,
 'OXXO O XO': 0.9805837979027031,
 'OXXOXO XO': -0.9047505937537206,
 '    O    ': 0.7974958713132871,
 'X O O    ': -0.8739111932006637,
 'X O O X X': -0.9829125546328007,
 'XOO O XOX': 0.999999998511728,
 'XOX      ': -0.8636860542233669,
 'XOX O O  ': 0.8783273566606712,
 'XOXXO OX ': -0.00011723927642705764,
 'XOXXOXOXO': 0.0,
 '         ': -0.9171188271664883,
 ' X X     ': -0.8046962755171903,
 'OX XO    ': 0.9623604521842671,
 'OXXXOX   ': -0.8105244853337181,
 'OXXXOXO X': 0.904762468824414,
 '  O   X  ': 0.8007928713566167,
 'O O  