In [185]:
import numpy as np
import pandas as pd

import gym
import timeit
from gym import wrappers

In [285]:
max_iterations = 10000000
t_max = 10000
epsilon = 1e-100

env_name = 'FrozenLake8x8-v0'
env = gym.make(env_name)
env.seed(0)
np.random.seed(0)

In [187]:
# https://medium.com/@m.alzantot/deep-reinforceme
# nt-learning-demysitifed-episode-2-policy-iteration
# -value-iteration-and-q-978f9e89ddaa

def run_episode(env, policy, gamma = 1.0, render = False):
    """ Evaluates policy by using it to run an episode and finding its
    total reward.
    args:
    env: gym environment.
    policy: the policy to be used.
    gamma: discount factor.
    render: boolean to turn rendering on/off.
    returns:
    total reward: real value of the total reward recieved by agent under policy.
    """
    obs = env.reset()
    total_reward = 0
    step_idx = 0
    while True:
        if render:
            env.render()
        obs, reward, done , _ = env.step(int(policy[obs]))
        total_reward += (gamma ** step_idx * reward)
        step_idx += 1
        if done:
            break
    return total_reward


def evaluate_policy(env, policy, gamma = 1.0,  n = 1000):
    """ Evaluates a policy by running it n times.
    returns:
    average total reward
    """
    scores = [
            run_episode(env, policy, gamma = gamma, render = False)
            for _ in range(n)]
    return np.mean(scores)

In [188]:
def value_iteration(env, gamma = 1, v = None):
    v = np.zeros(env.env.nS)
    
    for i in range(max_iterations):
        old_v = np.copy(v)
        for s in range(env.env.nS):
            q_sa = [sum([gamma * p * (r + old_v[s_]) for p, s_, r, _ in env.env.P[s][a]]) for a in range(env.env.nA)]
            v[s] = max(q_sa)
        if np.sum(np.fabs(v - old_v)) <= epsilon:
            print ('Value-iteration converged at iteration #%d.' %(i+1))
            break
    
    return v

start = timeit.default_timer()
values = value_iteration(env, gamma = 1)
froz_lake_vi = timeit.default_timer() - start

pd.Series(values, name="value").to_csv("value_iteration.csv", header=True)
froz_lake_vi

Value-iteration converged at iteration #2357.


1.748051114998816

In [189]:
def extract_policy(v, gamma = 1.0):
    """ Extract the policy given a value-function """
    policy = np.zeros(env.env.nS)
    for s in range(env.env.nS):
        q_sa = np.zeros(env.env.action_space.n)
        for a in range(env.env.action_space.n):
            for next_sr in env.env.P[s][a]:
                # next_sr is a tuple of (probability, next state, reward, done)
                p, s_, r, _ = next_sr
                q_sa[a] += (p * (r + gamma * v[s_]))
        policy[s] = np.argmax(q_sa)
    return policy

policy = extract_policy(values)
pd.Series(policy, name="policy").to_csv("vi_policy.csv", header=True)

In [190]:
evaluate_policy(env, policy)

0.854

In [191]:
run_episode(env, policy, render=True)


[41mS[0mFFFFFFF
FFFFFFFF
FFFHFFFF
FFFFFHFF
FFFHFFFF
FHHFFFHF
FHFFHFHF
FFFHFFFG
  (Down)
[41mS[0mFFFFFFF
FFFFFFFF
FFFHFFFF
FFFFFHFF
FFFHFFFF
FHHFFFHF
FHFFHFHF
FFFHFFFG
  (Down)
S[41mF[0mFFFFFF
FFFFFFFF
FFFHFFFF
FFFFFHFF
FFFHFFFF
FHHFFFHF
FHFFHFHF
FFFHFFFG
  (Right)
S[41mF[0mFFFFFF
FFFFFFFF
FFFHFFFF
FFFFFHFF
FFFHFFFF
FHHFFFHF
FHFFHFHF
FFFHFFFG
  (Right)
SFFFFFFF
F[41mF[0mFFFFFF
FFFHFFFF
FFFFFHFF
FFFHFFFF
FHHFFFHF
FHFFHFHF
FFFHFFFG
  (Up)
SFFFFFFF
FF[41mF[0mFFFFF
FFFHFFFF
FFFFFHFF
FFFHFFFF
FHHFFFHF
FHFFHFHF
FFFHFFFG
  (Up)
SFFFFFFF
FFF[41mF[0mFFFF
FFFHFFFF
FFFFFHFF
FFFHFFFF
FHHFFFHF
FHFFHFHF
FFFHFFFG
  (Up)
SFFFFFFF
FF[41mF[0mFFFFF
FFFHFFFF
FFFFFHFF
FFFHFFFF
FHHFFFHF
FHFFHFHF
FFFHFFFG
  (Up)
SF[41mF[0mFFFFF
FFFFFFFF
FFFHFFFF
FFFFFHFF
FFFHFFFF
FHHFFFHF
FHFFHFHF
FFFHFFFG
  (Right)
SF[41mF[0mFFFFF
FFFFFFFF
FFFHFFFF
FFFFFHFF
FFFHFFFF
FHHFFFHF
FHFFHFHF
FFFHFFFG
  (Right)
SF[41mF[0mFFFFF
FFFFFFFF
FFFHFFFF
FFFFFHFF
FFFHFFFF
FHHFFFHF
FHFFHFHF
FFFHFFFG
  (Right)
SFFFFFFF
FF

1.0

# Policy Iteration

In [192]:
def compute_policy_v(env, policy, gamma=1.0):
    """ Iteratively evaluate the value-function under policy.
    Alternatively, we could formulate a set of linear equations in iterms of v[s] 
    and solve them to find the value function.
    """
    v = np.zeros(env.env.nS)
    eps = 1e-10
    while True:
        prev_v = np.copy(v)
        for s in range(env.env.nS):
            policy_a = policy[s]
            v[s] = sum([p * (r + gamma * prev_v[s_]) for p, s_, r, _ in env.env.P[s][policy_a]])
        if (np.sum((np.fabs(prev_v - v))) <= eps):
            # value converged
            break
    return v

def policy_iteration(env, gamma = 1.0):
    """ Policy-Iteration algorithm """
    policy = np.random.choice(env.env.nA, size=(env.env.nS))  # initialize a random policy
    for i in range(max_iterations):
        old_policy_v = compute_policy_v(env, policy, gamma)
        new_policy = extract_policy(old_policy_v, gamma)
        if (np.all(policy == new_policy)):
            print ('Policy-Iteration converged at step %d.' %(i+1))
            break
        policy = new_policy
    return policy

In [193]:
start = timeit.default_timer()
pi_policy = policy_iteration(env)
froz_lake_pi = timeit.default_timer() - start

pd.Series(pi_policy, name="policy").to_csv("pi_policy.csv", header=True)

Policy-Iteration converged at step 8.


In [194]:
froz_lake_pi

2.5997387309980695

In [290]:
def q_learning(env, gamma = 1.0, alpha = 0.9):
    # np.abs(np.random.randn())
    first_reward = True
    Q = np.zeros((env.observation_space.n, env.action_space.n))
    eps = 0.99999999
    total_reward = 0
    for i in range(max_iterations):
        
        if (i + 1) % 10000 == 0:
            print((i + 1) / 10000, "% complete -- alpha: ", round(alpha, 2),
                  "-- epsilon: ", round(eps, 2), "-- reward:", round(total_reward, 2))
            total_reward = 0
            
        obs = env.reset()
        for t in range(t_max):
            if np.random.uniform(0, 1) < eps:
                action = np.random.choice(env.action_space.n)
            else:
                action = np.argmax(Q[obs])
            
            old_obs = obs
            obs, reward, done, _ = env.step(action)
                                    
            predict = Q[old_obs, action]
            target = reward + gamma * np.max(Q[obs])
            Q[old_obs, action] = predict + alpha * (target - predict)
            
            total_reward += reward
            
            if reward != 0 and first_reward:
                first_reward = False
                print("first reward at iteration ", i, ". reward: ", reward)
            
            if done:
                break
        
        alpha = alpha * .999999
        eps = eps * 0.999999
    
    return Q

def extract_q_policy(env, Q):
    policy = np.zeros(env.observation_space.n)
    for state in range(env.observation_space.n):
        policy[state] = np.argmax(Q[state])
        
    return policy

In [291]:
Q = q_learning(env)

first reward at iteration  104 . reward:  1.0
1.0 % complete -- alpha:  0.89 -- epsilon:  0.99 -- reward: 22.0
2.0 % complete -- alpha:  0.88 -- epsilon:  0.98 -- reward: 22.0
3.0 % complete -- alpha:  0.87 -- epsilon:  0.97 -- reward: 6.0
4.0 % complete -- alpha:  0.86 -- epsilon:  0.96 -- reward: 19.0
5.0 % complete -- alpha:  0.86 -- epsilon:  0.95 -- reward: 24.0
6.0 % complete -- alpha:  0.85 -- epsilon:  0.94 -- reward: 19.0
7.0 % complete -- alpha:  0.84 -- epsilon:  0.93 -- reward: 20.0
8.0 % complete -- alpha:  0.83 -- epsilon:  0.92 -- reward: 23.0
9.0 % complete -- alpha:  0.82 -- epsilon:  0.91 -- reward: 26.0
10.0 % complete -- alpha:  0.81 -- epsilon:  0.9 -- reward: 36.0
11.0 % complete -- alpha:  0.81 -- epsilon:  0.9 -- reward: 26.0
12.0 % complete -- alpha:  0.8 -- epsilon:  0.89 -- reward: 27.0
13.0 % complete -- alpha:  0.79 -- epsilon:  0.88 -- reward: 28.0
14.0 % complete -- alpha:  0.78 -- epsilon:  0.87 -- reward: 19.0
15.0 % complete -- alpha:  0.77 -- epsilon:

KeyboardInterrupt: 

In [282]:
new_policy = extract_q_policy(env, Q)
pd.Series(new_policy, name="policy").to_csv("q_policy.csv", header=True)

In [283]:
evaluate_policy(env, new_policy)

0.0