# Reinforcement Learning Prediction

In [2]:
import numpy as np
import gym

## First Visit Monte Carlo

In [89]:
def generate_episode(env, pi):
    '''
    Roll out one complete episode by following the policy pi.

    The environment is reset and then stepped with the actions chosen by pi
    until it reports that the episode is over. Every step is recorded as an
    experience tuple (state, action, reward, next_state, done).

    Args:
        env: OpenAI gym environment to interact with
        pi:  Policy, a callable mapping an observation to an action

    Returns:
        List of experience tuples in the order they occurred
    '''
    trajectory = []
    state = env.reset()

    # At least one step is always taken; the loop ends on the terminal transition.
    while True:
        chosen_action = pi(state)
        next_state, reward, finished, _ = env.step(chosen_action)
        trajectory.append((state, chosen_action, reward, next_state, finished))
        if finished:
            break
        state = next_state

    return trajectory

In [88]:
def monte_carlo_fv_prediction(env, pi, alpha=0.001, gamma=0.99, num_episodes=10000):
    '''
    Calculates the state value function of a policy pi using first visit monte carlo

    For every generated episode, the discounted return that follows the first
    occurrence of each state is used as the target of the incremental update
    V[s] = V[s] + alpha * (G - V[s]). Later occurrences of the same state within
    the episode are ignored.

    Args:
        env:           OpenAI gym environment to interact with
        pi:            Policy that is used to select actions
        alpha:         Learning rate
        gamma:         Discounting rate
        num_episodes:  Number of episodes to play

    Returns:
        V: Value function, a float64 array with one entry per state
    '''
    nS = env.observation_space.n
    V = np.zeros(nS, dtype=np.float64)

    for _ in range(num_episodes):
        generated_episode = generate_episode(env, pi)
        num_steps = len(generated_episode)

        # Discounted return from every time step, obtained in a single backward
        # sweep with the recursion G_t = r_t + gamma * G_(t+1).
        returns = np.zeros(num_steps, dtype=np.float64)
        discounted_return = 0.0
        for t in range(num_steps - 1, -1, -1):
            reward = generated_episode[t][2]
            discounted_return = reward + gamma * discounted_return
            returns[t] = discounted_return

        # Builtin bool is used as dtype; the np.bool alias is not available
        # in current NumPy releases.
        visited = np.zeros(nS, dtype=bool)

        for t, experience in enumerate(generated_episode):
            obs = experience[0]
            if visited[obs]:
                continue
            visited[obs] = True

            # update value function towards the return of the first visit
            V[obs] = V[obs] + alpha * (returns[t] - V[obs])

    return V

## TD (Temporal Difference) Learning

In [115]:
def td_prediction(env, pi, alpha=0.001, gamma=0.99, num_episodes=10000):
    '''
    Estimates the state value function of a policy pi with one-step temporal
    difference updates.

    After every environment step the value of the state that was left is moved
    towards reward + gamma * V[next_state]; the bootstrap term is switched off
    on the transition that ends the episode.

    Args:
        env:           OpenAI gym environment to interact with
        pi:            Policy that is used to select actions
        alpha:         Learning rate
        gamma:         Discounting rate
        num_episodes:  Number of episodes to play

    Returns:
        V: Value function, a float64 array with one entry per state
    '''
    values = np.zeros(env.observation_space.n, dtype=np.float64)

    for _ in range(num_episodes):
        state = env.reset()
        finished = False
        while not finished:
            next_state, reward, finished, _ = env.step(pi(state))

            # (not finished) evaluates to 0 on the terminal transition,
            # removing the bootstrapped estimate from the target.
            td_target = reward + gamma * values[next_state] * (not finished)
            td_error = td_target - values[state]
            values[state] = values[state] + alpha * td_error

            state = next_state

    return values

## n-step Learning

## TD (Lambda)

## TESTS

In [53]:
# Create the FrozenLake environment on which the prediction functions are tried out.
env = gym.make('FrozenLake-v0')

In [54]:
# Print the environment's current state (the output is shown below).
env.render()


[41mS[0mFFF
FHFH
FFFH
HFFG


In [90]:
# Action encoding:
#   0 = LEFT, 1 = DOWN, 2 = RIGHT, 3 = UP

# Alternative baseline policy that always picks action 0 (LEFT):
# strategy = {s: 0 for s in range(env.observation_space.n)}

# Hand-written deterministic policy mapping state index -> action.
# The actions are listed in state order (0..15), four per line.
strategy = dict(enumerate([
    2, 2, 1, 0,
    1, 0, 1, 0,
    2, 2, 1, 0,
    0, 2, 2, 0,
]))

In [91]:
def pi(s):
    '''
    Deterministic policy: returns the action stored in strategy for state s.
    '''
    action = strategy[s]
    return action

In [111]:
# Evaluate the hand-written policy with first-visit Monte Carlo over 100,000 episodes.
V = monte_carlo_fv_prediction(env, pi, num_episodes=100000)

In [112]:
# Display the Monte Carlo value estimates (one entry per state).
V

array([0.03517821, 0.0290139 , 0.05970268, 0.02996512, 0.04660632,
       0.        , 0.11588832, 0.        , 0.09765313, 0.25246365,
       0.3186096 , 0.        , 0.        , 0.45735877, 0.68128761,
       0.        ])

In [116]:
# Evaluate the same policy with TD prediction over 100,000 episodes.
V = td_prediction(env, pi, num_episodes=100000)

In [117]:
# Display the TD value estimates (one entry per state).
V

array([0.03105021, 0.0202269 , 0.04051933, 0.02002985, 0.04384035,
       0.        , 0.08993763, 0.        , 0.08579095, 0.21719199,
       0.27030225, 0.        , 0.        , 0.39716272, 0.62556548,
       0.        ])