1er ejercicio: 4-puzzle


In [None]:

import numpy as np
import matplotlib.pyplot as plt


In [None]:
# Action identifiers: consecutive integers starting at 0, so they can double
# as column indices into a per-state row of action values.
A_UP = 0
A_DOWN = 1
A_LEFT = 2
A_RIGHT = 3

# Every available action, in index order.
ACTIONS = [A_UP, A_DOWN, A_LEFT, A_RIGHT]

# (row delta, column delta) applied to the blank cell for each action.
DIRS = {
    A_UP: (-1, 0),
    A_DOWN: (1, 0),
    A_LEFT: (0, -1),
    A_RIGHT: (0, 1),
}

In [None]:
class Gridworld:
    """Sliding-tile puzzle environment (2x2 by default) with an episodic API.

    A state is a tuple of ``h * w`` tile values in row-major order, where
    ``0`` marks the blank cell. An action moves the blank one cell in the
    direction given by the module-level ``DIRS`` table; a move that would
    leave the board is a no-op (the state is unchanged but the step still
    counts and is still rewarded/penalised).

    Each episode starts from a state obtained by applying
    ``scramble_steps`` random moves to ``goal``, so every start state can be
    brought back to the goal with legal moves.
    """

    def __init__(self, h=2, w=2, goal=(1, 2, 3, 0),
                 step_reward=-1.0, goal_reward=10.0, max_steps=60, seed=123,
                 scramble_steps=20):
        """Create the environment and draw the first start state.

        Args:
            h: Number of board rows.
            w: Number of board columns.
            goal: Target arrangement, ``h * w`` values in row-major order
                containing exactly one blank encoded as ``0``.
            step_reward: Reward returned by every step that does not end on
                the goal.
            goal_reward: Reward returned by the step that reaches the goal.
            max_steps: Step count at which an unfinished episode is ended.
            seed: Seed for the internal NumPy random generator.
            scramble_steps: Number of random moves applied to ``goal`` on
                each ``reset()``. The default of 20 was chosen during review
                because the original code read this attribute without ever
                defining it.

        Raises:
            ValueError: If ``goal`` does not have ``h * w`` entries or does
                not contain the blank tile ``0``.
        """
        # Normalise to a tuple so `self.s == self.goal` works even when the
        # caller supplies a list (states are always tuples).
        goal = tuple(goal)
        if len(goal) != h * w:
            raise ValueError(
                f"goal must have h*w = {h * w} entries, got {len(goal)}"
            )
        if 0 not in goal:
            raise ValueError("goal must contain the blank tile 0")

        self.h, self.w = h, w
        self.goal = goal
        self.step_reward, self.goal_reward = step_reward, goal_reward
        self.max_steps = max_steps
        self.scramble_steps = scramble_steps
        self.rng = np.random.default_rng(seed)
        self.reset()

    def _slide_blank(self, state, a):
        """Return `state` with the blank moved by action `a`.

        The input tuple is returned unchanged when the move would take the
        blank off the board.
        """
        blank = state.index(0)
        i, j = divmod(blank, self.w)  # row-major index -> (row, column)
        di, dj = DIRS[a]
        ni, nj = i + di, j + dj
        if not (0 <= ni < self.h and 0 <= nj < self.w):
            return state
        target = ni * self.w + nj
        tiles = list(state)
        # Swap the blank with the neighbouring tile.
        tiles[blank], tiles[target] = tiles[target], tiles[blank]
        return tuple(tiles)

    def _mezclar_desde_goal(self):
        """Set `self.s` to the goal scrambled by `scramble_steps` random moves.

        Off-board moves are no-ops, so fewer than `scramble_steps` tiles may
        actually be moved and the result can coincide with the goal itself.
        """
        self.s = self.goal
        for _ in range(self.scramble_steps):
            # Pick a random action.
            a = ACTIONS[int(self.rng.integers(len(ACTIONS)))]
            self.s = self._slide_blank(self.s, a)

    def reset(self):
        """Start a new episode and return its initial state.

        The random generator is not re-seeded, so successive resets continue
        the same random stream and generally yield different start states.
        """
        self.t = 0
        self._mezclar_desde_goal()
        return self.s

    def step(self, a):
        """Move the blank (0) in direction `a`.

        Args:
            a: Action identifier, a key of ``DIRS``.

        Returns:
            Tuple ``(new_state, reward, done)``. ``done`` is True when the
            new state equals the goal, or when ``max_steps`` steps have been
            taken since the last reset.
        """
        self.t += 1
        self.s = self._slide_blank(self.s, a)

        # Reward and termination.
        if self.s == self.goal:
            r = self.goal_reward
            done = True
        else:
            r = self.step_reward
            done = (self.t >= self.max_steps)

        return self.s, r, done


In [None]:
def eps_greedy(Q_row, epsilon, rng):
    """Choose an action index with an epsilon-greedy rule.

    Args:
        Q_row: Action values for one state (presumably a 1-D array or
            sequence; its length is taken as the number of actions).
        epsilon: Probability of picking a uniformly random action instead of
            a greedy one.
        rng: NumPy random ``Generator`` used for every random draw.

    Returns:
        The selected action index as a plain ``int``.
    """
    explore = rng.uniform(0, 1) < epsilon
    if explore:
        return int(rng.integers(len(Q_row)))

    # Greedy branch: gather every index whose value is (numerically) equal to
    # the maximum and break ties uniformly at random.
    best_value = np.max(Q_row)
    is_best = np.isclose(Q_row, best_value)
    tied_actions = np.flatnonzero(is_best)
    return int(rng.choice(tied_actions))

def play_episode(env, Q, epsilon, rng):
    """Run one episode in `env`, acting epsilon-greedily with respect to `Q`.

    Args:
        env: Environment providing ``reset()``, ``step(a)`` and a
            ``max_steps`` attribute.
        Q: Mapping indexed by state whose entries are passed to
            ``eps_greedy`` as the action-value row (presumably a dict or
            defaultdict keyed by state tuples; verify against the caller).
        epsilon: Exploration probability forwarded to ``eps_greedy``.
        rng: Random generator forwarded to ``eps_greedy``.

    Returns:
        Tuple ``(traj, done, steps)`` where ``traj`` is the list of
        ``(state, action, reward)`` triples in visit order, ``done`` is the
        last flag reported by the environment (False if no step was taken),
        and ``steps`` is the number of steps taken.
    """
    state = env.reset()
    history = []
    finished = False
    # One triple is recorded per step, so len(history) is the step count.
    while not (finished or len(history) >= env.max_steps):
        action = eps_greedy(Q[state], epsilon, rng)
        next_state, reward, finished = env.step(action)
        history.append((state, action, reward))
        state = next_state
    return history, finished, len(history)



            

