Copyright **`(c)`** 2023 Giovanni Squillero `<giovanni.squillero@polito.it>`  
[`https://github.com/squillero/computational-intelligence`](https://github.com/squillero/computational-intelligence)  
Free for personal or classroom use; see [`LICENSE.md`](https://github.com/squillero/computational-intelligence/blob/master/LICENSE.md) for details.  

**The following is an implementation based on the code by Professor Giovanni Squillero.
The state-value function is optimized with the Monte Carlo method; evaluation results are reported as the fraction of wins, losses and draws for each method. In the baseline experiment, every move is chosen completely at random.**



In [2]:
from itertools import combinations
from collections import namedtuple, defaultdict
from random import choice
from copy import deepcopy
from tqdm.auto import tqdm
import numpy as np

In [3]:
# A position: the sets of magic-square numbers claimed by X and by O.
State = namedtuple("State", "x o")

# Board cells in row-major order, labelled with a 3x3 magic square: every row,
# column and diagonal sums to 15, so holding a full line is the same as
# holding three numbers that add up to 15.
MAGIC = [
    2, 7, 6,
    9, 5, 1,
    4, 3, 8,
]

In [4]:
def print_board(pos):
    """Print the position as a 3x3 grid, followed by a blank line.

    Cells whose number is in ``pos.x`` are drawn as ``X``, cells whose number
    is in ``pos.o`` as ``O`` and all other cells as ``.``. ``MAGIC`` gives the
    number of each cell in row-major order.
    """
    for row_start in range(0, 9, 3):
        symbols = []
        for cell in MAGIC[row_start:row_start + 3]:
            if cell in pos.x:
                symbols.append('X')
            elif cell in pos.o:
                symbols.append('O')
            else:
                symbols.append('.')
        print(''.join(symbols))
    print()

In [5]:
def win(elements):
    """Return True if any three of `elements` sum to 15, i.e. form a line."""
    for triple in combinations(elements, 3):
        if sum(triple) == 15:
            return True
    return False

def state_value(pos: State):
    """Score a position from the first player's (X's) point of view.

    Returns +1 when X holds a winning triple, -1 when O does, and 0 when
    neither does. X is checked first.
    """
    if win(pos.x):
        return 1
    return -1 if win(pos.o) else 0

In [6]:
def random_game():
    """Play one game in which both players pick their moves at random.

    X moves first. A snapshot of the position is recorded after every move,
    so the returned list holds one ``State`` per move and its last element is
    the final position (a win for either side, or a full board).

    The function has no side effects on module-level state.

    Returns:
        list: copies of the position after each move, in playing order.
    """
    trajectory = []
    state = State(set(), set())
    available = set(range(1, 9 + 1))
    while available:
        # X's move
        x = choice(list(available))
        state.x.add(x)
        # `state` keeps being mutated, so store an independent copy.
        trajectory.append(deepcopy(state))
        available.remove(x)
        if win(state.x) or not available:
            break

        # O's move
        o = choice(list(available))
        state.o.add(o)
        trajectory.append(deepcopy(state))
        available.remove(o)
        if win(state.o):
            break
    return trajectory

In [8]:
# Outcome counters, from X's point of view.
wins = draws = looses = 0

# Learned value per visited position, keyed by (frozenset(x), frozenset(o));
# positions not seen before read as 0.0.
value_dictionary = defaultdict(float)
# Number of times each position has been visited.
hit_state = defaultdict(int)
epsilon = 0.001  # step size of the incremental value update below
total_iteration = 500_000

for steps in tqdm(range(total_iteration)):
    trajectory = random_game()
    final_reward = state_value(trajectory[-1])

    # Tally the outcome of this game.
    if final_reward == -1:
        looses += 1
    elif final_reward == 1:
        wins += 1
    else:
        draws += 1

    # Move the value of every visited position a small step towards the
    # outcome of the game it appeared in.
    for state in trajectory:
        hashable_state = (frozenset(state.x), frozenset(state.o))
        hit_state[hashable_state] += 1
        value_dictionary[hashable_state] += epsilon * (
            final_reward - value_dictionary[hashable_state]
        )


print(f'Wins {wins/total_iteration}')
print(f'Draws {draws/total_iteration}')
print(f'Looses {looses/total_iteration}')

  0%|          | 0/500000 [00:00<?, ?it/s]

Wins 0.58533
Draws 0.126752
Looses 0.287918


In [9]:
# The ten positions with the highest learned value, best first.
top_items = sorted(
    value_dictionary.items(),
    key=lambda entry: entry[1],
    reverse=True,
)[:10]
top_items

[((frozenset({1, 5, 7, 8, 9}), frozenset({2, 3, 4, 6})), 0.9176064673406595),
 ((frozenset({1, 2, 3, 6, 8}), frozenset({4, 5, 7, 9})), 0.914499223392184),
 ((frozenset({1, 2, 5, 7, 8}), frozenset({3, 4, 6, 9})), 0.9134665137729492),
 ((frozenset({3, 5, 7, 8, 9}), frozenset({1, 2, 4, 6})), 0.9130325453047803),
 ((frozenset({1, 4, 6, 7, 8}), frozenset({2, 3, 5, 9})), 0.9125089093789029),
 ((frozenset({2, 3, 5, 8, 9}), frozenset({1, 4, 6, 7})), 0.9124213307096125),
 ((frozenset({1, 3, 6, 8, 9}), frozenset({2, 4, 5, 7})), 0.9123336643739864),
 ((frozenset({4, 5, 6, 7, 9}), frozenset({1, 2, 3, 8})), 0.9118058204467994),
 ((frozenset({1, 2, 3, 5, 7}), frozenset({4, 6, 8, 9})), 0.9116291671519361),
 ((frozenset({1, 4, 5, 6, 7}), frozenset({2, 3, 8, 9})), 0.9112747983417009)]

**Implementation of the ε-greedy method for state-value optimization with the Monte Carlo method.**





In [10]:
# Outcome counters, from X's point of view.
wins = draws = looses = 0

# Learned value per position, keyed by (frozenset(x), frozenset(o));
# positions not seen before read as 0.0.
value_dictionary = defaultdict(float)
hit_state = defaultdict(int)  # visit count per position
# NOTE: epsilon is used both as the exploration probability and as the step
# size of the value update at the end of each game.
epsilon = 0.001
total_iteration = 500_000

for steps in tqdm(range(total_iteration)):

    trajectory = []
    state = State(set(), set())
    available = set(range(1, 9 + 1))
    while available:
        # X's turn: epsilon-greedy over the values of the resulting positions.
        if np.random.rand() >= epsilon:
            # Exploitation: play the move whose resulting position has the
            # highest learned state value. Candidates are scanned in
            # ascending order, so ties go to the lowest-numbered cell.
            x = max(
                sorted(available),
                key=lambda move: value_dictionary[
                    (frozenset(state.x.union({move})), frozenset(state.o))
                ],
            )
        else:
            # Exploration: play a random free cell.
            x = choice(list(available))

        state.x.add(x)
        trajectory.append(deepcopy(state))
        available.remove(x)
        if win(state.x) or not available:
            break

        # O's turn: always a random free cell.
        o = choice(list(available))
        state.o.add(o)
        trajectory.append(deepcopy(state))
        available.remove(o)

        if win(state.o):
            break

    final_reward = state_value(trajectory[-1])
    # Tally the outcome of this game.
    if final_reward == -1:
        looses += 1
    elif final_reward == 1:
        wins += 1
    else:
        draws += 1

    # Move the value of every visited position a small step towards the
    # outcome of the game.
    for state in trajectory:
        hashable_state = (frozenset(state.x), frozenset(state.o))
        hit_state[hashable_state] += 1
        value_dictionary[hashable_state] += epsilon * (
            final_reward - value_dictionary[hashable_state]
        )

print(f'Wins {wins/total_iteration}')
print(f'Draws {draws/total_iteration}')
print(f'Looses {looses/total_iteration}')

  0%|          | 0/500000 [00:00<?, ?it/s]

Wins 0.675356
Draws 0.28765
Looses 0.036994


**Implementation of the ε-greedy method with Q-learning, optimizing the action-value function.**

In [87]:
total_iteration = 1_000_000
wins = 0
draws = 0
looses = 0

# Q-learning parameters
epsilon = 0.001  # probability that X plays a random (exploratory) move
alpha = 0.1      # step size of the bootstrapped update used after a draw
gamma = 1        # discount applied to the successor position's value

# Learned values, keyed by the position reached after a move:
# (frozenset(x), frozenset(o)). Positions not seen before read as 0.0.
q_values = defaultdict(float)

for steps in tqdm(range(total_iteration)):
    state = State(set(), set())
    available = set(range(1, 9 + 1))
    trajectory = []

    while available:
        # Player X's turn
        if np.random.rand() < epsilon:
            # Exploration: choose a random free cell
            x = choice(list(available))
        else:
            # Exploitation: choose the move leading to the highest-valued
            # position. `available` already holds exactly the free cells;
            # sorting keeps the ascending scan order, so ties still go to the
            # lowest-numbered cell.
            x = max(
                sorted(available),
                key=lambda move: q_values[
                    (frozenset(state.x.union({move})), frozenset(state.o))
                ],
            )
        state.x.add(x)
        trajectory.append(deepcopy(state))
        available.remove(x)

        if win(state.x):
            wins += 1
            break
        if not available:
            draws += 1
            break

        # Player O's turn: always a random free cell
        o = choice(list(available))
        state.o.add(o)
        trajectory.append(deepcopy(state))
        available.remove(o)

        if win(state.o):
            looses += 1
            break

    # Final position: large signed reward (+10 win, -10 loss, 0 draw).
    final_reward = state_value(state)
    hashable_state = (frozenset(state.x), frozenset(state.o))
    q_values[hashable_state] += 10 * final_reward

    # Every earlier position, paired with the position that followed it.
    for state, next_state in zip(trajectory, trajectory[1:]):
        hashable_state = (frozenset(state.x), frozenset(state.o))
        hashable_next_state = (frozenset(next_state.x), frozenset(next_state.o))

        if final_reward == 1:
            q_values[hashable_state] += 2
        elif final_reward == -1:
            q_values[hashable_state] -= 1
        else:
            # Draw: pull the value towards the value of the position that
            # actually followed. The trajectory fixes a single successor, so
            # there is nothing to maximise over; the former max() iterated
            # over moves it never used and always produced this same value.
            max_q_value = q_values[hashable_next_state]
            q_values[hashable_state] += alpha * (
                gamma * max_q_value - q_values[hashable_state]
            )


print(f'Wins {wins/total_iteration}')
print(f'Draws {draws/total_iteration}')
print(f'Looses {looses/total_iteration}')

  0%|          | 0/1000000 [00:00<?, ?it/s]

Wins 0.805522
Draws 0.114861
Looses 0.079617


**Implementation of the ε-greedy method with Q-learning, optimizing the action-value function with the following parameters: a large reward (+15) for a final winning state, a large penalty (-15) for a final losing state, and zero for a draw. If the final state is a win, the other states in the trajectory receive a +1 reward; if the final state is a loss, they receive a -1 reward.**

In [12]:
total_iteration = 1_000_000
wins = 0
draws = 0
looses = 0

# Q-learning parameters
epsilon = 0.001  # probability that X plays a random (exploratory) move
alpha = 0.1      # step size of the bootstrapped update used after a draw
gamma = 1        # discount applied to the successor position's value

# Learned values, keyed by the position reached after a move:
# (frozenset(x), frozenset(o)). Positions not seen before read as 0.0.
q_values = defaultdict(float)

for steps in tqdm(range(total_iteration)):
    state = State(set(), set())
    available = set(range(1, 9 + 1))
    trajectory = []

    while available:
        # Player X's turn
        if np.random.rand() < epsilon:
            # Exploration: choose a random free cell
            x = choice(list(available))
        else:
            # Exploitation: choose the move leading to the highest-valued
            # position. `available` already holds exactly the free cells;
            # sorting keeps the ascending scan order, so ties still go to the
            # lowest-numbered cell.
            x = max(
                sorted(available),
                key=lambda move: q_values[
                    (frozenset(state.x.union({move})), frozenset(state.o))
                ],
            )
        state.x.add(x)
        trajectory.append(deepcopy(state))
        available.remove(x)

        if win(state.x):
            wins += 1
            break
        if not available:
            draws += 1
            break

        # Player O's turn: always a random free cell
        o = choice(list(available))
        state.o.add(o)
        trajectory.append(deepcopy(state))
        available.remove(o)

        if win(state.o):
            looses += 1
            break

    # Final position: large signed reward (+15 win, -15 loss, 0 draw).
    final_reward = state_value(state)
    hashable_state = (frozenset(state.x), frozenset(state.o))
    q_values[hashable_state] += 15 * final_reward

    # Every earlier position, paired with the position that followed it.
    for state, next_state in zip(trajectory, trajectory[1:]):
        hashable_state = (frozenset(state.x), frozenset(state.o))
        hashable_next_state = (frozenset(next_state.x), frozenset(next_state.o))

        if final_reward == 1:
            q_values[hashable_state] += 1
        elif final_reward == -1:
            q_values[hashable_state] -= 1
        else:
            # Draw: pull the value towards the value of the position that
            # actually followed. The trajectory fixes a single successor, so
            # there is nothing to maximise over; the former max() iterated
            # over moves it never used and always produced this same value.
            max_q_value = q_values[hashable_next_state]
            q_values[hashable_state] += alpha * (
                gamma * max_q_value - q_values[hashable_state]
            )


print(f'Wins {wins/total_iteration}')
print(f'Draws {draws/total_iteration}')
print(f'Looses {looses/total_iteration}')

  0%|          | 0/1000000 [00:00<?, ?it/s]

Wins 0.88047
Draws 0.066197
Looses 0.053333
