A simple gridworld environment for testing RL algorithms in discrete action and state spaces.

In [1]:
import numpy as np
import plotly.express as px

In [2]:
class GridEnv():
    def __init__(self) -> None:
        self.grid = np.ones((10,10)) * -1
        self.grid[0, 1:-1] = -10
        
    def reset(self):
        self.player_position = [0,0]
    
    def step(self, action): # up=1, down=2, left=3, right=4

        next_state = self.player_position
        if action == "up":
            if next_state[0] != 0:
                next_state[0] -= 1
        elif action == "down":
            if next_state[0] != 9:
                next_state[0] += 1 
        elif action == "left":
            if next_state[1] != 0:
                next_state[1] -= 1
        elif action == "right":
            if next_state[1] != 9:
                next_state[1] += 1

        reward = self.grid[tuple(next_state)]
        done = True if self.player_position == [0,9] else False

        return next_state, reward, done

In [3]:
def random_policy():
    return np.random.choice(["left", "right", "up", "down"])

In [60]:
env = GridEnv()

n_episodes = 1000
all_rewards = []
trajectories = []

for _ in range(n_episodes):
    env.reset()
    done = False
    trajectory = []
    rewards = []
    n = 0
    n_max = 100
    
    while not done:
        

        n += 1
        action = random_policy()
        state=env.player_position
        trajectory.append((action, state.copy()))

        next_state, reward, done = env.step(action)
        rewards.append(reward)
        state=next_state

    trajectories.append(trajectory)
    all_rewards.append(rewards)


Monte Carlo Estimation

In [92]:
# express Q estimation as V estimation
state_values =  {i:np.zeros((10,10)) for i in ["left", "right", "up", "down"]}

In [93]:
state_counts = {i:np.zeros((10,10)) for i in ["left", "right", "up", "down"]}
for j, traj in enumerate(trajectories):
    for idx, val in enumerate(traj):
        action = val[0]
        position= tuple(val[1])
        
        state_counts[action][position] += 1
        state_values[action][position] += sum(all_rewards[j][idx:])

TD Evaluation

In [None]:
state_values =  {i:np.zeros((10,10)) for i in ["left", "right", "up", "down"]}

In [None]:
n = 2
alpha = 0.03

for j, traj in enumerate(trajectories):
    for idx, val in enumerate(traj):
        action = val[0]
        position= tuple(val[1])

        gt = all_rewards[j][idx:idx+n] + state_values[traj[idx+n][0]][tuple(traj[idx+n][1])]

        state_values[action][position] += alpha * sum(gt - state_values[action][position])

In [88]:
#state_values =  {i:np.zeros((10,10)) for i in ["left", "right", "up", "down"]}
#alpha=0.10
#for j, traj in enumerate(trajectories):
#    for idx, val in enumerate(traj):
#        action = val[0]
#        position= tuple(val[1])
#
#        state_values[action][position] += alpha * (sum(all_rewards[j][:idx])-state_values[action][position])

In [94]:
px.imshow(state_values["right"]/state_counts["right"])


invalid value encountered in true_divide



In [95]:
px.imshow(state_values["left"]/state_counts["left"])


invalid value encountered in true_divide

