# Reinforcement Learning Prediction

In [79]:
import numpy as np
import gym

from itertools import count
from collections import deque
from tqdm.notebook import tqdm

# utility functions from local folder
from utils.helper import create_random_policy
from utils.helper import print_state_value_func, print_policy
from utils.helper import generate_episode

## First Visit Monte Carlo

In [75]:
def mc_fv_predict(env, pi, alpha=0.001, gamma=0.99, num_episodes=10000):
    '''
    Calculates the state value function of a policy pi using first visit monte carlo

    For every episode the discounted return following each time step is
    computed, and the value of a state is moved towards the return that
    followed its first occurrence in that episode.

    Args:
        env:           OpenAI gym environment to interact with
                       (must expose a discrete observation space via
                       env.observation_space.n)
        pi:            Policy that is used to select actions
        alpha:         Learning rate
        gamma:         Discounting rate
        num_episodes:  Number of episodes to play

    Returns:
        V: Value function, a float64 array with one entry per state
    '''
    nS = env.observation_space.n
    V = np.zeros(nS, dtype=np.float64)

    for _ in tqdm(range(num_episodes)):
        # Each step is unpacked below as a 5-tuple; index 2 is used as the
        # reward (presumably (obs, action, reward, next_obs, done) -- confirm
        # against utils.helper.generate_episode).
        generated_episode = generate_episode(env, pi)

        # Discounted return from every time step, built in a single backward
        # pass: G_t = r_t + gamma * G_{t+1}. This replaces re-slicing the
        # episode and re-building a table of powers at every step.
        returns = []
        running_return = 0.0
        for step in reversed(generated_episode):
            running_return = step[2] + gamma * running_return
            returns.append(running_return)
        returns.reverse()

        # Builtin bool instead of the np.bool alias, which newer NumPy
        # releases no longer provide.
        visited = np.zeros(nS, dtype=bool)

        for t, (obs, _, _, _, _) in enumerate(generated_episode):
            # first visit: only the first occurrence of a state per episode counts
            if visited[obs]:
                continue
            visited[obs] = True

            # update value function towards the Monte Carlo target
            V[obs] += alpha * (returns[t] - V[obs])

    return V

## TD (Temporal Difference) Learning

In [77]:
def td_predict(env, pi, alpha=0.001, gamma=0.99, num_episodes=10000):
    '''
    Estimates the state value function of a policy pi with one-step temporal difference learning

    After every environment step the value of the state that was just left
    is moved towards reward + gamma * V[next state]; the bootstrap term is
    dropped when the step ended the episode.

    Args:
        env:           OpenAI gym environment to interact with
                       (reset() is used as the initial observation and
                       step() is unpacked into four values)
        pi:            Policy that is used to select actions
        alpha:         Learning rate
        gamma:         Discounting rate
        num_episodes:  Number of episodes to play

    Returns:
        V: Value function
    '''
    V = np.zeros(env.observation_space.n, dtype=np.float64)

    for _ in tqdm(range(num_episodes)):
        state = env.reset()
        finished = False
        while not finished:
            successor, reward, finished, _ = env.step(pi(state))

            # multiplying by (not finished) zeroes the bootstrap on terminal steps
            td_target = reward + gamma * V[successor] * (not finished)
            td_error = td_target - V[state]
            V[state] = V[state] + alpha * td_error

            state = successor

    return V

## n - step Learning

In [101]:
def n_step_predict(env, pi, n=2, alpha=0.001, gamma=0.99, num_episodes=10000):
    '''
    Calculates the state value function of a policy pi using n_step temporal difference

    A sliding window of at most n transitions is kept. Once the window is
    full (or the episode has ended) the state at the front of the window is
    updated towards

        r_1 + gamma * r_2 + ... + gamma**(k-1) * r_k + gamma**k * V[s_k]

    where k is the current window length and the bootstrap term is dropped
    if the last transition in the window ended the episode.

    Args:
        env:           OpenAI gym environment to interact with
        pi:            Policy that is used to select actions
        n:             Number of steps (intermediate value between TD and full Monte Carlo)
        alpha:         Learning rate
        gamma:         Discounting rate
        num_episodes:  Number of episodes to play

    Returns:
        V: Value function

    Raises:
        ValueError: if n is smaller than 1
    '''
    if n < 1:
        raise ValueError('n must be at least 1')

    nS = env.observation_space.n
    V = np.zeros(nS, dtype=np.float64)

    for _ in tqdm(range(num_episodes)):
        experiences = deque(maxlen=n)
        obs, done = env.reset(), False

        # Keep going while the episode is running, then drain what is left
        # in the window. Ending the loop only when the episode is over AND the
        # window is empty keeps n=1 working: there the window is empty after
        # every single update.
        while not done or experiences:
            if not done:
                action = pi(obs)
                next_obs, reward, done, _ = env.step(action)
                experiences.append((obs, action, reward, next_obs, done))
                obs = next_obs

                # warm-up: wait until n transitions are available
                if len(experiences) < n and not done:
                    continue

            adjust_state = experiences[0][0]
            _, _, _, last_next_state, last_done = experiences[-1]

            # calculating the target value: the i-th reward in the window lies
            # i steps after adjust_state and is therefore discounted by gamma**i
            target = 0.0
            for i, (_, _, step_reward, _, _) in enumerate(experiences):
                target += step_reward * gamma**i
            if not last_done:
                target += gamma**len(experiences) * V[last_next_state]

            experiences.popleft()

            V[adjust_state] += alpha * (target - V[adjust_state])

    return V

## TD (Lambda) with eligibility traces

In [113]:
def td_lambda_predict(env, pi, elig_lambda=0.5, alpha=0.001, gamma=0.99, num_episodes=10000):
    '''
    Estimates the state value function of a policy pi with temporal difference learning and eligibility traces

    Every visit adds 1 to the trace of the visited state. The one-step TD
    error is then applied to all states in proportion to their trace, after
    which all traces are scaled by gamma * elig_lambda. Traces are reset to
    zero at the start of every episode.

    Args:
        env:           OpenAI gym environment to interact with
        pi:            Policy that is used to select actions
        elig_lambda:   Factor for discounting eligibility traces
        alpha:         Learning rate
        gamma:         Discounting rate
        num_episodes:  Number of episodes to play

    Returns:
        V: Value function
    '''
    n_states = env.observation_space.n
    V = np.zeros(n_states, dtype=np.float64)
    trace_decay = gamma * elig_lambda

    for _ in tqdm(range(num_episodes)):
        traces = np.zeros(n_states, dtype=np.float64)
        state = env.reset()
        finished = False

        while not finished:
            # accumulate the trace of the state that is being visited
            traces[state] += 1

            successor, reward, finished, _ = env.step(pi(state))

            # one-step TD error; no bootstrapping from a terminal successor
            td_target = reward + gamma * V[successor] * (not finished)
            td_error = td_target - V[state]

            # spread the error over all states according to their trace
            V += alpha * td_error * traces
            traces *= trace_decay

            state = successor

    return V

## Frozen Lake

In [7]:
# Create the 'FrozenLake-v0' environment used by all experiments below.
env = gym.make('FrozenLake-v0')

In [8]:
# Show the environment; the output below is a 4x4 grid of S/F/H/G tiles.
env.render()


[41mS[0mFFF
FHFH
FFFH
HFFG


In [9]:
# Sizes of the observation (state) space and the action space.
nS = env.observation_space.n
nA = env.action_space.n

In [53]:
# Policy to evaluate, built by the local helper; presumably seed=0 makes the
# random choice reproducible -- confirm in utils.helper.
pi = create_random_policy(nS, nA, seed=0)

In [61]:
# Display the policy; the output below is laid out as a 4-column grid
# (the third argument is presumably the column count -- confirm in utils.helper).
print_policy(pi, nS, 4, name='Random Policy')



[1mRandom Policy[0m


         ↑          ↑          ←          →
         ↑          ■          →          ■
         →          ↓          ↓          ■
         ■          ←          →          ■


In [76]:
# Evaluate the policy with first visit Monte Carlo (default alpha and gamma).
V = mc_fv_predict(env, pi, num_episodes=10000)

HBox(children=(HTML(value=''), FloatProgress(value=0.0, max=10000.0), HTML(value='')))




In [58]:
# Print the estimated state values; name='' passes an empty title.
print_state_value_func(V, 4, name='')

0.09235 0.09504 0.10056 0.00000
0.00885 0.00000 0.10983 0.00000
0.03353 0.10363 0.24142 0.00000
0.00000 0.03315 0.53212 0.00000


In [78]:
# Evaluate the same policy with one-step TD learning (default alpha and gamma).
V = td_predict(env, pi, num_episodes=10000)

HBox(children=(HTML(value=''), FloatProgress(value=0.0, max=10000.0), HTML(value='')))




In [60]:
# Print the TD estimate of the state values.
print_state_value_func(V, 4)

0.04787 0.05113 0.05820 0.00000
0.00334 0.00000 0.07137 0.00000
0.00543 0.04562 0.16876 0.00000
0.00000 0.00776 0.51077 0.00000


In [107]:
# Evaluate the same policy with 3-step TD learning (default alpha and gamma).
V= n_step_predict(env, pi, n=3, num_episodes=10000)

HBox(children=(HTML(value=''), FloatProgress(value=0.0, max=10000.0), HTML(value='')))




In [109]:
# Print the n-step TD estimate of the state values.
print_state_value_func(V, 4)

0.09234 0.09566 0.10431 0.00000
0.01162 0.00000 0.11249 0.00000
0.02423 0.09897 0.24186 0.00000
0.00000 0.03344 0.60675 0.00000


In [114]:
# Evaluate the same policy with TD(lambda), using elig_lambda=0.5 for the traces.
V = td_lambda_predict(env, pi, elig_lambda=.5, num_episodes=10000)

HBox(children=(HTML(value=''), FloatProgress(value=0.0, max=10000.0), HTML(value='')))




In [115]:
# Print the TD(lambda) estimate of the state values.
print_state_value_func(V, 4)

0.08022 0.08210 0.08663 0.00000
0.00921 0.00000 0.09414 0.00000
0.01747 0.07470 0.20643 0.00000
0.00000 0.02157 0.57060 0.00000
