# Q-Learning

In [1]:
import os
import random
import sys

# Put the parent of the current working directory at the front of sys.path.
# Presumably this lets the `environments` package imported below resolve when
# the notebook is run from a sub-directory of the project -- confirm layout.
cwd = os.getcwd()

parentdir = os.path.dirname(cwd)
sys.path.insert(0, parentdir)

from environments.tic_tac_toe.tic_tac_toe import TicTacToe
from environments.game import PlayRoom

In [2]:
# Module-level PlayRoom wrapping a fresh TicTacToe game.
# NOTE(review): QLearning.__init__ builds its own PlayRoom, and this global is
# not referenced by any code visible here -- confirm it is still needed.
playroom = PlayRoom(game = TicTacToe())

class Transistion:
  """Plain record of one environment step.

  Attributes:
    state_code: Code of the state the action was taken in.
    action: The action that was taken.
    reward: Reward received for the action.
    next_state_code: Code of the state reached after the action.
  """

  def __init__(self, state_code: str, action: int, reward: int, next_state_code: str) -> None:
    # Group the "before" half and the "after" half of the step.
    self.state_code, self.action = state_code, action
    self.reward, self.next_state_code = reward, next_state_code

In [3]:
##EPSILON STILL NEEDED

class QLearning():
  """Tabular Q-learning agent with epsilon-greedy exploration.

  Q-values live in ``Q_table``, a dict keyed by ``'<state_code>-<action>'``.
  The environment is accessed through ``self.playroom``, which must expose
  ``reset()``, ``step(action=...)`` (returning a 5-tuple whose second item is
  the reward and third item is the episode-finished flag),
  ``game_state_code`` and ``possible_actions``.
  """

  def __init__(self, learning_rate: float, discount_factor: float, epsilon: float):
    """Create an agent with an empty Q-table and its own TicTacToe PlayRoom.

    Args:
      learning_rate: Weight given to the new target in each Q-value update.
      discount_factor: Weight given to the bootstrapped next-state value.
      epsilon: Probability of taking a random action while exploring.
    """
    self.Q_table = {}
    self.playroom = PlayRoom(game = TicTacToe())
    self.epsilon = epsilon
    self.learning_rate = learning_rate
    self.discount_factor = discount_factor

  @staticmethod
  def _q_key(state_code, action) -> str:
    """Return the Q-table key for a (state, action) pair."""
    return f'{state_code}-{action}'

  def learn(self, n_epochs: int, eval_freq: int, num_eval_episodes: int):
    """Run Q-learning updates for ``n_epochs`` environment steps.

    Each "epoch" is a single environment step, not a full episode. After every
    ``eval_freq`` steps the greedy policy is evaluated and the result printed;
    the environment is reset afterwards, so an episode that was in progress at
    that point is abandoned.

    Args:
      n_epochs: Number of environment steps to train for.
      eval_freq: Evaluate after every this many steps.
      num_eval_episodes: Number of episodes per evaluation.
    """
    self.playroom.reset()

    for epoch in range(n_epochs):
      action = self._determine_action(deterministic = False)
      state_action_code = self._q_key(self.playroom.game_state_code, action)
      # Unseen (state, action) pairs start at 0.
      old_value = self.Q_table.setdefault(state_action_code, 0)

      _, reward, done, _, _ = self.playroom.step(action = action)

      if done:
        # Terminal step: nothing to bootstrap from.
        target = reward
        self.playroom.reset()
      else:
        # Bootstrap from the greedy action in the state just reached.
        next_optimal_state_action_code = self._q_key(
          self.playroom.game_state_code,
          self._determine_action(deterministic = True),
        )
        next_value = self.Q_table.setdefault(next_optimal_state_action_code, 0)
        target = reward + self.discount_factor * next_value

      self.Q_table[state_action_code] = (
        (1 - self.learning_rate) * old_value + self.learning_rate * target
      )

      if (epoch + 1) % eval_freq == 0:
        print(f'Evaluate {epoch + 1}/{n_epochs}: {self._evaluate(num_eval_episodes = num_eval_episodes)}')
        self.playroom.reset()

  def _determine_action(self, deterministic):
    """Choose an action for the environment's current state.

    With probability ``epsilon`` (only when ``deterministic`` is false) a
    random possible action is returned. Otherwise the possible action with the
    highest stored Q-value for exactly this state is returned; if no possible
    action has a stored value yet, a random possible action is returned.

    Args:
      deterministic: If true, never explore.

    Returns:
      One element of ``self.playroom.possible_actions``.
    """
    if not deterministic and random.random() < self.epsilon:
      return random.choice(self.playroom.possible_actions)

    state_code = self.playroom.game_state_code
    # Look up each legal action by its exact key. A substring match against
    # every key could pick up entries that belong to a different state whose
    # key merely contains this state code.
    known_q_values = {}
    for action in self.playroom.possible_actions:
      key = self._q_key(state_code, action)
      if key in self.Q_table:
        known_q_values[action] = self.Q_table[key]

    if not known_q_values:
      return random.choice(self.playroom.possible_actions)

    return max(known_q_values, key = known_q_values.get)

  def _evaluate(self, num_eval_episodes: int):
    """Play episodes with the greedy policy and return the mean final reward.

    Only the reward of the last step of each episode is counted.

    Args:
      num_eval_episodes: Number of episodes to play.

    Returns:
      Sum of the final-step rewards divided by ``num_eval_episodes``.
    """
    total_reward = 0
    for _ in range(num_eval_episodes):
      self.playroom.reset()
      done = False
      while not done:
        _, reward, done, _, _ = self.playroom.step(action = self._determine_action(deterministic = True))
      total_reward += reward
    return total_reward / num_eval_episodes

In [8]:
# Train an agent for 50,000 environment steps; every 5,000 steps the greedy
# policy is evaluated over 100 episodes and the mean final reward is printed.
q_learing = QLearning(learning_rate = 0.9, discount_factor = 0.9, epsilon = 0.5)
q_learing.learn(50_000, eval_freq = 5_000, num_eval_episodes = 100)

Evaluate 5000/50000: -0.18
Evaluate 10000/50000: 0.08
Evaluate 15000/50000: 0.12
Evaluate 20000/50000: 0.4
Evaluate 25000/50000: 0.28
Evaluate 30000/50000: 0.18
Evaluate 35000/50000: 0.52
Evaluate 40000/50000: 0.3
Evaluate 45000/50000: 0.32
Evaluate 50000/50000: 0.64
