# 🏔 Monte Carlo Every-Visit

In [1]:
import gym
import numpy as np

# Single seed shared by every source of randomness configured in this cell.
SEED = 42

env = gym.make("FrozenLake-v0")

# The agent draws its epsilon coin-flips and argmax tie-breaks from NumPy's
# global RNG, so seeding only the environment leaves runs non-repeatable.
np.random.seed(SEED)
# NOTE(review): env.action_space.sample() may use a separate RNG that neither
# call here seeds -- confirm against the installed gym version.
env.seed(SEED)

[42]

In [2]:
def argmax(array):
    """Return the index of a maximal entry, breaking ties uniformly at random.

    Args:
        array: Array-like of numbers (ndarray, list, tuple, ...).

    Returns:
        Index into the flattened input of one of its largest entries. When
        several entries tie for the maximum, one of them is drawn with
        ``np.random.choice``.

    Raises:
        ValueError: If ``array`` is empty.
    """
    # Accept plain sequences as well as ndarrays; ndarrays pass through as-is.
    values = np.asarray(array)
    if values.size == 0:
        raise ValueError("argmax() requires a non-empty array")
    # flatnonzero lists every position holding the maximum, so ties are not
    # always resolved in favour of the first index (as np.argmax would do).
    return np.random.choice(np.flatnonzero(values == values.max()))

In [3]:
import numpy as np
from collections import defaultdict

class MonteCarloAgent:
    """Tabular Monte Carlo agent that averages returns over every visit.

    Action values start at 1 for each newly seen state. Every occurrence of a
    (state, action) pair in an episode contributes its discounted return to a
    running mean kept for that pair.
    """

    def __init__(self, gamma, action_space):
        """Create an agent with empty value and visit-count tables.

        Args:
            gamma: Discount factor applied to the return at each step.
            action_space: Object exposing ``n`` (used as the length of each
                per-state array) and ``sample()`` (used for random actions).
        """
        def _initial_values():
            # Evaluated lazily, once per newly seen state.
            return np.ones(action_space.n)

        def _initial_counts():
            return np.zeros(action_space.n)

        self.gamma = gamma
        self.action_space = action_space
        self.experiences = []
        self.q_values = defaultdict(_initial_values)
        self.times_visited = defaultdict(_initial_counts)

    def step(self, state, epsilon=0):
        """Choose an action for ``state`` with an epsilon-greedy rule.

        Args:
            state: Key into the action-value table.
            epsilon: Probability threshold for taking a sampled action
                instead of the greedy one.

        Returns:
            A sampled action, or the index of a highest-valued action.
        """
        explore = np.random.random() < epsilon
        if explore:
            return self.action_space.sample()
        return argmax(self.q_values[state])

    def store_experience(self, state, action, reward):
        """Append one (state, action, reward) transition to the episode buffer."""
        transition = (state, action, reward)
        self.experiences.append(transition)

    def update(self):
        """Fold the buffered episode into the value table, then clear the buffer."""
        discounted_return = 0
        # Walk the episode backwards so the return accumulates in one pass.
        for state, action, reward in self.experiences[::-1]:
            discounted_return = self.gamma * discounted_return + reward

            visits = self.times_visited[state]
            visits[action] += 1
            count = visits[action]

            # Running mean: re-weight the old estimate by (count - 1) samples.
            values = self.q_values[state]
            values[action] = ((count - 1) * values[action] + discounted_return) / count

        self.experiences = []

In [4]:
# Discount factor 0.9; actions are taken from the environment's action space.
agent = MonteCarloAgent(gamma=0.9, action_space=env.action_space)

In [5]:
from collections import deque

# Training hyperparameters, named here so they can be tuned in one place.
NUM_TRAIN_EPISODES = 100_000  # episodes are numbered 1..NUM_TRAIN_EPISODES
TRAIN_EPSILON = 0.1           # epsilon passed to agent.step while training
RETURN_WINDOW = 1000          # how many recent episode returns are averaged
LOG_EVERY = 500               # progress is printed every this many episodes

# Rolling window of the latest episode returns; its mean is what the progress
# line reports as "Success Rate".
returns = deque(maxlen=RETURN_WINDOW)

for episode in range(1, NUM_TRAIN_EPISODES + 1):
    state = env.reset()
    done = False

    ep_return = 0

    while not done:
        action = agent.step(state, epsilon=TRAIN_EPSILON)
        next_state, reward, done, _ = env.step(action)
        # Only buffer the transition; estimates change in agent.update() below.
        agent.store_experience(state, action, reward)
        state = next_state
        ep_return += reward

    returns.append(ep_return)
    agent.update()

    if episode % LOG_EVERY == 0:
        # "\r" with end="" rewrites the same output line instead of scrolling.
        print(f"Episode: {episode:5d} Success Rate: {np.mean(returns):5.4f}\r", end="")

Episode: 100000 Success Rate: 0.3980

In [6]:
from collections import deque

# Evaluation run: actions are chosen with epsilon=0, nothing is stored on the
# agent and agent.update() is never called.
returns = deque(maxlen=1000)

for episode in range(1, 1001):
    state, ret = env.reset(), 0

    # Step until the environment reports the episode is over.
    while True:
        action = agent.step(state, epsilon=0)
        state, reward, done, _ = env.step(action)
        ret += reward
        if done:
            break

    returns.append(ret)

# Printed once, after the last episode, using the final loop counter.
print(f"Episode: {episode:5d} Success Rate: {np.mean(returns):5.4f}\r", end="")

Episode:  1000 Success Rate: 0.7190