In [1176]:
import gym
from gym import spaces
from gym.utils.env_checker import check_env
import matplotlib.pyplot as plt
import numpy as np
from bucket_env import plot_stats
%matplotlib inline

In [1177]:
import scipy.stats as stat

In [1178]:
class HanoiEnv(gym.Env):

    def __init__(self, disks=3) -> None:
        self.disks = disks
        self.reward = 0
        self.done = False
        self.columns = tuple([0 for _ in range(disks)])
        self.action_space = spaces.Tuple((spaces.Discrete(disks), spaces.Discrete(2)))
        self.observation_space = spaces.Tuple(tuple([spaces.Discrete(3) for _ in range(disks)]))
    

    def is_move_allowed(self, action):
        disk_chosen, tower_destination = action

        for i in range(disk_chosen + 1, len(tuple(self.columns))): # Vérif pas de disques au dessus du disque choisi
            if self.columns[disk_chosen] == self.columns[i]: 
                return False
        for i in range(disk_chosen + 1, len(tuple(self.columns))):
            if self.columns[i] == tower_destination:
                return False
        return True



    def step(self, action):
        disk_chosen, tower_destination = action
        if not self.is_move_allowed(action):
            self.reward = float('-inf')
            self.done = True
            return self.columns, self.reward, self.done, {}
        columns = list(self.columns)
        columns[disk_chosen] = tower_destination
        self.columns = tuple(columns)
        if not self.columns == tuple([2 for _ in range(self.disks)]):
            self.reward = -1
            self.done = False
        else:
            self.done = True
        return self.columns, self.reward, self.done, {}

    def reset(self):
        self.reward = 0
        self.done = False
        self.columns = tuple([0 for _ in range(self.disks)])
        return self.columns
    
    def render(self, mode='human'):
        print("a b c")
        pass

    def close(self):
        pass



In [1179]:
env = HanoiEnv()
check_env(env)

In [1180]:
env = HanoiEnv(disks=3)
env.reset()


(0, 0, 0)

In [1181]:
env.step((2,1)) # Impossible donc donne -inf et True
env.reset()
env.step((2,1)) # Passe le petit disque en colonne du milieu
env.step((1,1)) # Passe le moyen en colone du milieu -> impossible donc donne -inf et True

((0, 0, 1), -inf, True, {})

In [1182]:
from tqdm import tqdm

In [1183]:
def exploratory_policy(state):
  random_disk = np.random.randint(0,3)
  random_tower = np.random.randint(0,3)
  return (random_disk, random_tower)

In [1184]:
action_values = np.zeros(((3, 3, 3) + (2,1)))
print(action_values.shape)

(3, 3, 3, 2, 1)


In [1185]:
def target_policy(state):
    av = action_values[state]
    return np.random.choice(np.flatnonzero(av == av.max()))

In [1186]:
def q_learning(exploratory_policy, target_policy, action_values, episodes, alpha=0.1, gamma=0.99):
    stats = {'Returns': []}
    for episode in tqdm(range(1, episodes + 1)):
        state = env.reset()
        done = False
        ep_return = 0
        while not done:
            action = exploratory_policy(state)
            next_state, reward, done, _ = env.step(action)
            ep_return += reward
            next_action = target_policy(next_state) if not done else None
            qsa = action_values[state][action]
            next_qsa = action_values[next_state][next_action] if not done else 0.
            action_values[state][action] = qsa + alpha * (reward + gamma * next_qsa - qsa)
            print(qsa)
            state = next_state
        stats['Returns'].append(ep_return)
    return stats

In [1187]:
stats = q_learning(exploratory_policy, target_policy, action_values, episodes=10000)
plot_stats(stats)

  0%|          | 0/10000 [00:00<?, ?it/s]


IndexError: index 2 is out of bounds for axis 0 with size 2