In [1]:
import numpy as np

In [662]:
################ Environment ################

import numpy as np
import contextlib

# Configures numpy print options
@contextlib.contextmanager
def _printoptions(*args, **kwargs):
    original = np.get_printoptions()
    np.set_printoptions(*args, **kwargs)
    try:
        yield
    finally: 
        np.set_printoptions(**original)

        
class EnvironmentModel:
    def __init__(self, n_states, n_actions, seed=None):
        self.n_states = n_states
        self.n_actions = n_actions
        
        self.random_state = np.random.RandomState(seed)
        
    def p(self, next_state, state, action):
        raise NotImplementedError()
    
    def r(self, next_state, state, action):
        raise NotImplementedError()
        
    def draw(self, state, action):
        p = [self.p(ns, state, action) for ns in range(self.n_states)]
        next_state = self.random_state.choice(self.n_states, p=p)
        reward = self.r(next_state, state, action)
        
        return next_state, reward

        
class Environment(EnvironmentModel):
    def __init__(self, n_states, n_actions, max_steps, pi, seed=None):
        EnvironmentModel.__init__(self, n_states, n_actions, seed)
        
        self.max_steps = max_steps
        
        self.pi = pi
        if self.pi is None:
            self.pi = np.full(n_states, 1./n_states)
        
    def reset(self):
        self.n_steps = 0
        self.state = self.random_state.choice(self.n_states, p=self.pi)
        
        return self.state
        
    def step(self, action):
        if action < 0 or action >= self.n_actions:
            raise Exception('Invalid action.')
        
        self.n_steps += 1
        done = (self.n_steps >= self.max_steps)
        
        self.state, reward = self.draw(self.state, action)
        
        return self.state, reward, done
    
    def render(self, policy=None, value=None):
        raise NotImplementedError()

        
class FrozenLake(Environment):
    def __init__(self, lake, slip, max_steps, seed=None):
        """
        lake: A matrix that represents the lake. For example:
         lake =  [['&', '.', '.', '.'],
                  ['.', '#', '.', '#'],
                  ['.', '.', '.', '#'],
                  ['#', '.', '.', '$']]
        slip: The probability that the agent will slip
        max_steps: The maximum number of time steps in an episode
        seed: A seed to control the random number generator (optional)
        """
        # start (&), frozen (.), hole (#), goal ($)
        self.lake = np.array(lake)
        self.lake_flat = self.lake.reshape(-1)
        
        self.slip = slip
        self.max_steps = max_steps
        
        self.n_states = self.lake.size + 1
        self.n_actions = 4
        
        self.pi = np.zeros(self.n_states, dtype=float)
        self.pi[np.where(self.lake_flat == '&')[0]] = 1.0
        
        self.pin = np.zeros(self.n_states, dtype=float)
        self.pin[np.where(self.lake_flat == '&')[0]] = 1.0
        self.pin[np.where(self.lake_flat == '#')[0]] = -1.0
        
        self.holes = []
        i = 0
        for index in self.pin:
            if index == -1:
                self.holes.append(i)
            i += 1
        
        self.absorbing_state = self.n_states - 1
        
        # TODO:
        self.rows = np.sqrt(self.n_states-1)
        
        self.random_state = np.random.RandomState(seed)
        
    def step(self, action):
        state, reward, done = Environment.step(self, action)
        
        done = (state == self.absorbing_state) or done
        
        return state, reward, done
        
    def p(self, next_state, state, action):
        
        # TODO:
        prob = 0
        
        if state in self.holes or state == self.rows*self.rows - 1 or state == self.rows*self.rows:
            if next_state == self.rows*self.rows:
                prob += 1
            return prob
#         elif state == self.rows*self.rows:
#             if next_state == self.rows*self.rows:
#                 prob += 1
                
        else:
            if next_state == state:
                if state < self.rows:
                    if state == 0:
                        prob += self.slip/2
                        if action == 0 or action == 1:
                            prob += 1-self.slip
                    elif state == self.rows - 1:
                        prob += self.slip/2
                        if action == 0 or action == 3:
                            prob += 1-self.slip
                    else:
                        prob += self.slip/4
                        if action == 0:
                            prob += 1-self.slip
                elif state%self.rows == 0 and not state == self.rows*self.rows:
                    if state == self.rows*(self.rows-1):
                        prob += self.slip/2
                        if action == 1 or action == 2:
                            prob += 1-self.slip
                    else:
                        prob += self.slip/4
                        if action == 1:
                            prob += 1-self.slip
                elif state%self.rows == self.rows-1:
                    if state == self.rows*(self.rows)-1:
                        prob += self.slip/2
                        if action == 2 or action == 3:
                            prob += 1-self.slip
                    else:
                        prob += self.slip/4
                        if action == 3:
                            prob += 1-self.slip
                elif state > self.rows*(self.rows-1) and state < self.rows*self.rows-1:
                    prob += self.slip/4
                    if action == 2:
                        prob += 1-self.slip

            if next_state - state == 1 and not next_state%self.rows == 0 :
                if action == 3:
                    prob += 1 - self.slip*3/4
                else:
                    prob += self.slip/4
            if next_state - state == -1 and not state%self.rows == 0:
                if action == 1:
                    prob += 1 - self.slip*3/4
                else:
                    prob += self.slip/4
            if next_state - state == self.rows and not next_state == self.rows*self.rows:
                if action == 2:
                    prob += 1 - self.slip*3/4
                else:
                    prob += self.slip/4
            if next_state - state == -self.rows:
                if action == 0:
                    prob += 1 - self.slip*3/4
                else:
                    prob += self.slip/4
                
        return prob
                
                
    
    def r(self, next_state, state, action):
        # TODO:
        reward = 0
        if next_state == self.rows*self.rows:
            if state == self.rows*self.rows - 1:
                reward += 1 
        return reward
   
    def render(self, policy=None, value=None):
        if policy is None:
            lake = np.array(self.lake_flat)
            
            if self.state < self.absorbing_state:
                lake[self.state] = '@'
                
            print(lake.reshape(self.lake.shape))
        else:
            # UTF-8 arrows look nicer, but cannot be used in LaTeX
            # https://www.w3schools.com/charsets/ref_utf_arrows.asp
            actions = ['^', '<', '_', '>']
            
            print('Lake:')
            print(self.lake)
        
            print('Policy:')
            policy = np.array([actions[a] for a in policy[:-1]])
            print(policy.reshape(self.lake.shape))
            
            print('Value:')
            with _printoptions(precision=3, suppress=True):
                print(value[:-1].reshape(self.lake.shape))
                
    def reset(self):
        self.n_steps = 0
        self.state = self.random_state.choice(self.n_states, p=self.pi)
        
        return self.state
                
def play(env):
    actions = ['w', 'a', 's', 'd']
    
    state = env.reset()
    env.render()
    
    done = False
    while not done:
        c = input('\nMove: ')
        if c not in actions:
            raise Exception('Invalid action')
            
        state, r, done = env.step(actions.index(c))
        
        env.render()
        print('Reward: {0}.'.format(r))

################ Model-based algorithms ################

def policy_evaluation(env, policy, gamma, theta, max_iterations):
    val = np.zeros(env.n_states, dtype=np.float)
    for i in range (max_iterations):
        delta = 0
        for s in range (env.n_states):
            v = 0
            for ns in range (env.n_states):
                v += env.p(ns, s, policy[s])*(env.r(ns, s, policy[s])+gamma*val[ns])
            delta = max(delta, abs(v-val[s]))
            val[s] = v
        if delta < theta:
            break
    return val
    
def policy_improvement(env, policy, value, gamma):
    
    policy_new = np.copy(policy)
    
    for s in range(env.n_states):
        bestAction = 0
        bestValue = 0
        for a in range(env.n_actions):
            result = 0
            for ns in range(env.n_states):
                result += env.p(ns, s, a)*(env.r(ns, s, a)+gamma*value[ns])
            if result>bestValue:
                bestAction = a
                bestValue = result
        policy_new[s] = bestAction
    return policy_new

    
def policy_iteration(env, gamma, theta, max_iterations, policy=None):
    if policy is None:
        policy = np.zeros(env.n_states, dtype=int)
    else:
        policy = np.array(policy, dtype=int)
        
    iters = 0
    for i in range(max_iterations):
        iters += 1
        value = policy_evaluation (env, policy, gamma, theta, max_iterations)
        policy_new = policy_improvement(env, policy, value, gamma)
        if np.array_equal(policy_new, policy): 
            break
        policy = policy_new
    return policy, value, iters
        
    
def value_iteration(env, gamma, theta, max_iterations, value=None):
    if value is None:
        value = np.zeros(env.n_states)
    else:
        value = np.array(value, dtype=np.float)
        
    policy = np.zeros(env.n_states, dtype=np.int)
    
    iters = 0
    
    #Value optimization
    
    for i in range (max_iterations):
        iters += 1
        delta = 0
        for s in range(env.n_states):
            val = 0
            for a in range(env.n_actions):
                result = 0
                for ns in range(env.n_states):
                    result += env.p(ns, s, a)*(env.r(ns, s, a)+gamma*value[ns])
                val = max(result, val)
            delta = max(delta, abs(val-value[s]))
            value[s] = val
        if delta < theta:
            break
            
    #Policy retrival
    for s in range(env.n_states):
        val = 0
        bestAction = 0
        for a in range(env.n_actions):
            result = 0
            for ns in range(env.n_states):
                result += env.p(ns, s, a)*(env.r(ns, s, a)+gamma*value[ns])
            if result > val:
                val = result
                bestAction = a
        policy[s] = int(bestAction)
            
    return policy, value, iters

################ Tabular model-free algorithms ################

def epsilon_greedy(env, q, epsilon, state, random_state):
    
    bestAction = np.argmax(q[state, :])
    p = np.zeros(env.n_actions)
    for i in range(env.n_actions):
        if i == bestAction: p[i]=1-epsilon
        else: p[i]=epsilon/(env.n_actions-1)                          
    return random_state.choice(env.n_actions, p=p)
    
#     if np.random.random() < epsilon:
#           return np.random.randint(env.n_actions)
#     else:
#           return np.argmax(q[state, :])

def sarsa(env, max_episodes, eta, gamma, epsilon, optimalValue, seed=None):

    random_state = np.random.RandomState(seed)
    
    eta = np.linspace(eta, 0, max_episodes)
    epsilon = np.linspace(epsilon, 0, max_episodes)
    
    q = np.zeros((env.n_states, env.n_actions))
    
    iters = 0
    for i in range(max_episodes):
        iters +=1
        s = env.reset()
        # TODO:
        a = epsilon_greedy(env, q, epsilon[i], s, random_state)
        done = False
        while not done:
            ns, reward, done = env.step(a)
            na = epsilon_greedy(env, q, epsilon[i], ns, random_state)
            q[s, a] += eta[i] * (reward + (gamma * q[ns, na]) - q[s, a])
            s = ns
            a = na
            
        policy = q.argmax(axis=1)
        valueForPolicy = policy_evaluation(env, policy, gamma, 0.01, 100)
        if np.array_equal(valueForPolicy, optimalValue):
            print(iters)
            break
            
    policy = q.argmax(axis=1)   
    value = q.max(axis=1)
        
    return policy, value
    
def q_learning(env, max_episodes, eta, gamma, epsilon, optimalValue, seed=None):
    random_state = np.random.RandomState(seed)
    
    eta = np.linspace(eta, 0, max_episodes)
    epsilon = np.linspace(epsilon, 0, max_episodes)
    
    q = np.zeros((env.n_states, env.n_actions))
    
    iters = 0
    for i in range(max_episodes):
        iters +=1
        s = env.reset()
        # TODO:
        done = False
        while not done:
            a = epsilon_greedy(env, q, epsilon[i], s, random_state)
            ns, reward, done = env.step(a)
            maxQ = 0
            for action in range(env.n_actions):
                if q[ns, action]>maxQ: maxQ = q[ns, action]
            q[s, a] += eta[i] * (reward + (gamma * maxQ) - q[s, a])
            s = ns

        policy = q.argmax(axis=1)
        valueForPolicy = policy_evaluation(env, policy, gamma, 0.01, 100)
        if np.array_equal(valueForPolicy, optimalValue):
            print(iters)
            break
        
#     policy = q.argmax(axis=1)
    value = q.max(axis=1)
        
    return policy, value

################ Non-tabular model-free algorithms ################

class LinearWrapper:
    def __init__(self, env):
        self.env = env
        
        self.n_actions = self.env.n_actions
        self.n_states = self.env.n_states
        self.n_features = self.n_actions * self.n_states
        
    def encode_state(self, s):
        features = np.zeros((self.n_actions, self.n_features))
        for a in range(self.n_actions):
            i = np.ravel_multi_index((s, a), (self.n_states, self.n_actions))
            features[a, i] = 1.0
          
        return features
    
    def decode_policy(self, theta):
        policy = np.zeros(self.env.n_states, dtype=int)
        value = np.zeros(self.env.n_states)
        
        for s in range(self.n_states):
            features = self.encode_state(s)
            q = features.dot(theta)
            
            policy[s] = np.argmax(q)
            value[s] = np.max(q)
        
        return policy, value
        
    def reset(self):
        return self.encode_state(self.env.reset())
    
    def step(self, action):
        state, reward, done = self.env.step(action)
        
        return self.encode_state(state), reward, done
    
    def render(self, policy=None, value=None):
        self.env.render(policy, value)
        
def epsilon_greedy_linear(env, q, epsilon, random_state):
    
    bestAction = np.argmax(q[:])
    p = np.zeros(env.n_actions)
    for i in range(env.n_actions):
        if i == bestAction: p[i]=1-epsilon
        else: p[i]=epsilon/(env.n_actions-1)                          
    return random_state.choice(env.n_actions, p=p)
        
def linear_sarsa(env, max_episodes, eta, gamma, epsilon, seed=None):
    random_state = np.random.RandomState(seed)
    
    eta = np.linspace(eta, 0, max_episodes)
    epsilon = np.linspace(epsilon, 0, max_episodes)
    
    theta = np.zeros(env.n_features)
    
    for i in range(max_episodes):
        features = env.reset()
        
        q = features.dot(theta)

        done = False

        while not done:
            a = epsilon_greedy_linear(env, q, epsilon[i], random_state)
            nfeatures, r, done = env.step(a)
            delta = r - q[a]
            q = np.dot(nfeatures, theta)
            na = epsilon_greedy_linear(env, q, epsilon[i], random_state)
            delta += gamma * q[na]
            theta += eta[i] * delta * features[a]
            features = nfeatures
            a = na
    
    return theta
    
def linear_q_learning(env, max_episodes, eta, gamma, epsilon, seed=None):
    random_state = np.random.RandomState(seed)
    
    eta = np.linspace(eta, 0, max_episodes)
    epsilon = np.linspace(epsilon, 0, max_episodes)
    
    theta = np.zeros(env.n_features)
    
    for i in range(max_episodes):
        features = env.reset()
        
        q = features.dot(theta)

        done = False

        while not done:
            a = epsilon_greedy_linear(env, q, epsilon[i], random_state)
            features_prime, r, done = env.step(a)
            delta = r - q[a]
            q = np.dot(features_prime, theta)
            maxQ = 0
            for action in range(env.n_actions):
                if q[action]>maxQ: maxQ = q[action]
            delta += gamma * maxQ
            theta += eta[i] * delta * features[a]
            features = features_prime

    return theta    

################ Main function ################

def main():
    seed = 0
    
    # Small lake
    lake =   [['&', '.', '.', '.'],
              ['.', '#', '.', '#'],
              ['.', '.', '.', '#'],
              ['#', '.', '.', '$']]

    env = FrozenLake(lake, slip=0.1, max_steps=16, seed=seed)
    
    print('# Model-based algorithms')
    gamma = 0.9
    theta = 0.001
    max_iterations = 100
    
    print('')
    
    print('## Policy iteration')
    policy, value, iters = policy_iteration(env, gamma, theta, max_iterations)
    env.render(policy, value)
    print("Number of iterations: "+ str(iters))
    print('')
    
    print('## Value iteration')
    policy, value, iters = value_iteration(env, gamma, theta, max_iterations)
    env.render(policy, value)
    print("Number of iterations: "+ str(iters))
    
    print('')
    
    print('# Model-free algorithms')
    max_episodes = 200000
    eta = 0.5
    epsilon = 0.5
    
    print('')
    
    print('## Sarsa')
    policy, value = sarsa(env, max_episodes, eta, gamma, epsilon, seed=seed)
    env.render(policy, value)
    
    print('')
    
    print('## Q-learning')
    policy, value = q_learning(env, max_episodes, eta, gamma, epsilon, seed=seed)
    env.render(policy, value)
    
    print('')
    
    linear_env = LinearWrapper(env)
    
    print('## Linear Sarsa')
    
    parameters = linear_sarsa(linear_env, max_episodes, eta,
                              gamma, epsilon, seed=seed)
    policy, value = linear_env.decode_policy(parameters)
    linear_env.render(policy, value)
    
    print('')
    
    print('## Linear Q-learning')
    
    parameters = linear_q_learning(linear_env, max_episodes, eta,
                                   gamma, epsilon, seed=seed)
    policy, value = linear_env.decode_policy(parameters)
    linear_env.render(policy, value)

def main_large():
    seed = 0
    
    # Large lake
    lake =   [['&', '.', '.', '.', '.', '.', '.', '.'],
              ['.', '.', '.', '.', '.', '.', '.', '.'],
              ['.', '.', '.', '#', '.', '.', '.', '.'],
              ['.', '.', '.', '.', '.', '#', '.', '.'],
              ['.', '.', '.', '#', '.', '.', '.', '.'],
              ['.', '#', '#', '.', '.', '.', '#', '.'],
              ['.', '#', '.', '.', '#', '.', '#', '.'],
              ['.', '.', '.', '#', '.', '.', '.', '$']]

    env = FrozenLake(lake, slip=0.1, max_steps=16, seed=seed)
    
    print('# Model-based algorithms')
    gamma = 0.9
    theta = 0.001
    max_iterations = 100
    
    print('')
    
    print('## Policy iteration')
    policy, value = policy_iteration(env, gamma, theta, max_iterations)
    env.render(policy, value)
    
    print('')
    
    print('## Value iteration')
    policy, value = value_iteration(env, gamma, theta, max_iterations)
    env.render(policy, value)
    
    print('')
    
    print('# Model-free algorithms')
    max_episodes = 2000
    eta = 0.5
    epsilon = 0.5
    
    print('')
    
    print('## Sarsa')
    policy, value = sarsa(env, max_episodes, eta, gamma, epsilon, seed=seed)
    env.render(policy, value)
    
    print('')
    
    print('## Q-learning')
    policy, value = q_learning(env, max_episodes, eta, gamma, epsilon, seed=seed)
    env.render(policy, value)
    
    print('')
    
    linear_env = LinearWrapper(env)
    
    print('## Linear Sarsa')
    
    parameters = linear_sarsa(linear_env, max_episodes, eta,
                              gamma, epsilon, seed=seed)
    policy, value = linear_env.decode_policy(parameters)
    linear_env.render(policy, value)
    
    print('')
    
    print('## Linear Q-learning')
    
    parameters = linear_q_learning(linear_env, max_episodes, eta,
                                   gamma, epsilon, seed=seed)
    policy, value = linear_env.decode_policy(parameters)
    linear_env.render(policy, value)


In [663]:
seed = 0
    
# Small lake
lake =   [['&', '.', '.', '.'],
          ['.', '#', '.', '#'],
          ['.', '.', '.', '#'],
          ['#', '.', '.', '$']]

env = FrozenLake(lake, slip=0.1, max_steps=100, seed=seed)

In [664]:
for j in range (17):
    sumof = 0 
    for i in range(17):
        sumof += env.p(i,j,3)
    if not sumof == 1.0:
        print(sumof, j)

In [665]:
env.p(6,5,3)

0

In [679]:
_, value, _ = policy_iteration(env, 0.9, 0.01, 500)

Deprecated in NumPy 1.20; for more details and guidance: https://numpy.org/devdocs/release/1.20.0-notes.html#deprecations
  val = np.zeros(env.n_states, dtype=np.float)


In [680]:
sarsa(env, 20000, 0.5, 0.9, 0.5, value, seed=0)

Deprecated in NumPy 1.20; for more details and guidance: https://numpy.org/devdocs/release/1.20.0-notes.html#deprecations
  val = np.zeros(env.n_states, dtype=np.float)


466


(array([2, 3, 2, 1, 2, 0, 2, 0, 3, 2, 2, 0, 0, 3, 3, 2, 0]),
 array([0.01341984, 0.02940991, 0.01892601, 0.01154619, 0.03946482,
        0.        , 0.20411077, 0.        , 0.05141579, 0.08845898,
        0.25184301, 0.        , 0.        , 0.33342023, 0.77470978,
        0.99942679, 0.        ]))

In [681]:
q_learning(env, 20000, 0.5, 0.9, 0.5, value, seed=0)

Deprecated in NumPy 1.20; for more details and guidance: https://numpy.org/devdocs/release/1.20.0-notes.html#deprecations
  val = np.zeros(env.n_states, dtype=np.float)


1651


(array([2, 3, 2, 1, 2, 0, 2, 0, 3, 2, 2, 0, 0, 3, 3, 0, 0]),
 array([0.49287918, 0.48811403, 0.546153  , 0.47842626, 0.55752958,
        0.        , 0.58983145, 0.        , 0.63440045, 0.72372404,
        0.69095815, 0.        , 0.        , 0.80978268, 0.89999719,
        1.        , 0.        ]))

In [669]:
value_iteration(env, 0.9, 0.01, 500)


Deprecated in NumPy 1.20; for more details and guidance: https://numpy.org/devdocs/release/1.20.0-notes.html#deprecations
  policy = np.zeros(env.n_states, dtype=np.int)


(array([2, 3, 2, 1, 2, 0, 2, 0, 3, 2, 2, 0, 0, 3, 3, 0, 0]),
 array([0.45445927, 0.50358629, 0.57904696, 0.50476979, 0.50802096,
        0.        , 0.65265184, 0.        , 0.5842415 , 0.67227225,
        0.76831633, 0.        , 0.        , 0.77097869, 0.88709375,
        1.        , 0.        ]),
 10)

In [433]:
sarsa(env, 2000, 0.5, 0.9, 0.5, seed=0)

(array([2, 3, 2, 1, 2, 0, 2, 0, 3, 2, 2, 0, 0, 3, 3, 2, 0]),
 array([0.42626583, 0.38584404, 0.47352703, 0.31483086, 0.48382264,
        0.        , 0.58289165, 0.        , 0.53801505, 0.64805014,
        0.7202116 , 0.        , 0.        , 0.75037442, 0.87733795,
        1.        , 0.        ]))

In [435]:
q_learning(env, 2000, 0.5, 0.9, 0.5, seed=0)

(array([3, 3, 2, 1, 0, 0, 2, 0, 0, 3, 2, 0, 0, 3, 3, 1, 0]),
 array([0.44506135, 0.51224076, 0.59070131, 0.50008675, 0.3898141 ,
        0.        , 0.67083654, 0.        , 0.31950971, 0.66594854,
        0.78180981, 0.        , 0.        , 0.77643929, 0.88959393,
        1.        , 0.        ]))

In [499]:
linear_env = LinearWrapper(env)
    
print('## Linear Sarsa')

parameters = linear_sarsa(linear_env, 2000, 0.5,
                          0.9, 0.5, seed=0)
policy, value = linear_env.decode_policy(parameters)
print(policy)
print(value)

## Linear Sarsa
[3 3 2 1 2 0 2 0 3 2 2 0 0 3 3 0 0]
[0.40068182 0.45837644 0.54904965 0.37239854 0.38160776 0.
 0.63738049 0.         0.4349277  0.63081756 0.76329268 0.
 0.         0.75816945 0.87870467 1.         0.        ]


In [350]:
parameters = linear_q_learning(linear_env, 20000, 0.5,
                          0.9, 0.5, seed=0)
policy, value = linear_env.decode_policy(parameters)
print(policy)
print(value)

[2 3 2 1 2 0 2 0 3 3 2 0 0 3 3 0 0]
[0.45880241 0.49445097 0.5694942  0.49983582 0.51589666 0.
 0.63861416 0.         0.59008092 0.67945124 0.77423918 0.
 0.         0.7438747  0.88429343 1.         0.        ]


In [672]:
seed = 0
    
# Large lake
lake =   [['&', '.', '.', '.', '.', '.', '.', '.'],
          ['.', '.', '.', '.', '.', '.', '.', '.'],
          ['.', '.', '.', '#', '.', '.', '.', '.'],
          ['.', '.', '.', '.', '.', '#', '.', '.'],
          ['.', '.', '.', '#', '.', '.', '.', '.'],
          ['.', '#', '#', '.', '.', '.', '#', '.'],
          ['.', '#', '.', '.', '#', '.', '#', '.'],
          ['.', '.', '.', '#', '.', '.', '.', '$']]

envLarge = FrozenLake(lake, slip=0.1, max_steps=16, seed=seed)

In [673]:
import time

In [676]:
start = time.process_time()
_, val, _ = policy_iteration(envLarge, 0.9, 0.01, 500)
print(time.process_time() - start)


Deprecated in NumPy 1.20; for more details and guidance: https://numpy.org/devdocs/release/1.20.0-notes.html#deprecations
  val = np.zeros(env.n_states, dtype=np.float)


0.8734360000003107


In [615]:
start = time.process_time()
value_iteration(envLarge, 0.9, 0.01, 500)
print(time.process_time() - start)


Deprecated in NumPy 1.20; for more details and guidance: https://numpy.org/devdocs/release/1.20.0-notes.html#deprecations
  policy = np.zeros(env.n_states, dtype=np.int)


0.6775689999999486


In [678]:
sarsa(envLarge, 20000, 0.5, 0.9, 0.5, val, seed=0)

Deprecated in NumPy 1.20; for more details and guidance: https://numpy.org/devdocs/release/1.20.0-notes.html#deprecations
  val = np.zeros(env.n_states, dtype=np.float)


(array([0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
        0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
        0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0]),
 array([0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0.,
        0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0.,
        0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0.,
        0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0.]))