# Reinforcement Learning Control

In [2]:
import numpy as np
import gym

## Helper

In [3]:
def select_action(Q, state, epsilon):
    '''
    Picks an action with an epsilon-greedy policy.

    A uniform random number is drawn first. If it is larger than epsilon the
    action with the highest value in Q[state] is returned, otherwise an action
    index is drawn uniformly from all entries of Q[state].

    Args:
        Q:       Action value function, indexable by state
        state:   State for which an action is needed
        epsilon: Probability with which a random action is selected

    Returns:
        Index of the selected action
    '''
    action_values = Q[state]
    greedy = np.random.rand() > epsilon

    if not greedy:
        # exploration: every action is equally likely
        return np.random.randint(len(action_values))

    # exploitation: best action according to the current estimate
    return np.argmax(action_values)

In [4]:
def generate_episode(env, pi, Q, epsilon):
    '''
    Plays one episode in the environment while following the policy pi.

    The environment is reset and then stepped until it reports done. Every
    step is stored as an experience tuple of the form
    (state, action, reward, next_state, done).

    Args:
        env:     OpenAI gym environment to interact with
        pi:      Policy, called as pi(Q, state, epsilon) to obtain an action
        Q:       Action value function that is handed to the policy
        epsilon: Exploration probability that is handed to the policy

    Returns:
        List of experience tuples in the order they were observed
    '''
    trajectory = []
    state = env.reset()

    while True:
        chosen_action = pi(Q, state, epsilon)
        successor, reward, terminal, _ = env.step(chosen_action)
        trajectory.append((state, chosen_action, reward, successor, terminal))

        if terminal:
            return trajectory

        state = successor

In [5]:
def linear_decay(start_value, end_value, decay_duration, full_duration):
    '''
    Generates an array of float values that moves linearly from start_value to end_value
    and then stays at end_value for the remaining episodes.

    Args:
        start_value:      The starting value for a float value
        end_value:        The value that is reached at the end of the decay and kept afterwards
        decay_duration:   Number of episodes the value is decreasing
        full_duration:    Number of episodes the value is needed

    Returns:
        decay_values:     Numpy array of length full_duration. If decay_duration is larger
                          than full_duration only the first full_duration values of the
                          decay are returned. A decay_duration of 1 yields start_value
                          followed by end_value, a decay_duration of 0 yields only end_value.

    Raises:
        ValueError:       If decay_duration or full_duration is negative
    '''

    if decay_duration < 0 or full_duration < 0:
        raise ValueError('decay_duration and full_duration must not be negative')

    if decay_duration > 1:
        # fractions run from 1 down to 0, so the ramp starts at start_value and ends at end_value
        fractions = np.arange(decay_duration)[::-1] / (decay_duration - 1)
        decay_values = fractions * (start_value - end_value) + end_value
    else:
        # fewer than two steps leave nothing to interpolate; dividing by
        # (decay_duration - 1) would produce NaN values here
        decay_values = np.full(decay_duration, start_value, dtype=np.float64)

    # after the decay the value is held at end_value
    hold_duration = max(full_duration - decay_duration, 0)
    hold_values = np.full(hold_duration, end_value, dtype=np.float64)

    # the slice cuts the decay short when it is longer than the requested duration
    return np.concatenate((decay_values, hold_values))[:full_duration]

## Monte Carlo First Visit Control

In [163]:
def mc_fv_control(env, eps_start, eps_end, alpha_start, alpha_end, decay_duration, gamma=0.99, num_episodes=20000):
    '''
    Estimates the action value function with first-visit Monte Carlo control.

    Every episode is generated with an epsilon-greedy policy over the current Q.
    For the first occurrence of each (state, action) pair in the episode, Q is
    moved towards the discounted return that was observed from that step on.

    Args:
        env:            OpenAI gym environment with discrete observation and action spaces
        eps_start:      Start value of the exploration probability epsilon
        eps_end:        End value of the exploration probability epsilon
        alpha_start:    Start value of the learning rate alpha
        alpha_end:      End value of the learning rate alpha
        decay_duration: Number of episodes over which epsilon and alpha are decayed linearly
        gamma:          Discount factor
        num_episodes:   Number of episodes that are generated

    Returns:
        Q: Numpy array of shape (nS, nA) with the estimated action values
        V: Numpy array of shape (nS,) with the maximum of Q over the actions
    '''

    nS = env.observation_space.n
    nA = env.action_space.n
    epsilons = linear_decay(eps_start, eps_end, decay_duration, num_episodes)
    alphas = linear_decay(alpha_start, alpha_end, decay_duration, num_episodes)

    Q = np.zeros((nS, nA), dtype=np.float32)

    for episode in range(num_episodes):
        generated_episode = generate_episode(env, select_action, Q, epsilons[episode])
        num_steps = len(generated_episode)

        # discounted return following every time step, accumulated backwards
        # through the episode: G_t = r_t + gamma * G_(t+1)
        returns = np.zeros(num_steps, dtype=np.float64)
        discounted_return = 0.0
        for t in range(num_steps - 1, -1, -1):
            discounted_return = generated_episode[t][2] + gamma * discounted_return
            returns[t] = discounted_return

        # the builtin bool is used as dtype, the np.bool alias no longer exists in current NumPy
        visited = np.zeros_like(Q, dtype=bool)

        for t, (obs, action, _, _, _) in enumerate(generated_episode):
            # first visit: later occurrences of the same pair in this episode are ignored
            if visited[obs][action]:
                continue
            visited[obs][action] = True

            mc_target = returns[t]
            Q[obs][action] = Q[obs][action] + alphas[episode] * (mc_target - Q[obs][action])
    V = np.max(Q, axis=1)
    return Q, V

## SARSA (TD On Policy learning)

In [175]:
def sarsa(env, eps_start, eps_end, alpha_start, alpha_end, decay_duration, gamma=0.99, num_episodes=20000):
    '''
    Estimates the action value function with SARSA (on-policy TD control).

    The action for the next state is selected with the same epsilon-greedy
    policy that is used to act, and its value is used as bootstrap target.

    Args:
        env:            OpenAI gym environment with discrete observation and action spaces
        eps_start:      Start value of the exploration probability epsilon
        eps_end:        End value of the exploration probability epsilon
        alpha_start:    Start value of the learning rate alpha
        alpha_end:      End value of the learning rate alpha
        decay_duration: Number of episodes over which epsilon and alpha are decayed linearly
        gamma:          Discount factor
        num_episodes:   Number of episodes to learn from

    Returns:
        Q: Numpy array of shape (nS, nA) with the estimated action values
        V: Numpy array of shape (nS,) with the maximum of Q over the actions
    '''
    n_states = env.observation_space.n
    n_actions = env.action_space.n
    eps_schedule = linear_decay(eps_start, eps_end, decay_duration, num_episodes)
    lr_schedule = linear_decay(alpha_start, alpha_end, decay_duration, num_episodes)

    Q = np.zeros((n_states, n_actions), dtype=np.float32)

    # one (epsilon, alpha) pair per episode
    for eps, lr in zip(eps_schedule, lr_schedule):
        state = env.reset()
        finished = False
        action = select_action(Q, state, eps)

        while not finished:
            successor, reward, finished, _ = env.step(action)
            successor_action = select_action(Q, successor, eps)

            # no bootstrapping from the successor once the episode has ended
            td_target = reward + gamma * Q[successor][successor_action] * (not finished)
            Q[state][action] += lr * (td_target - Q[state][action])

            state, action = successor, successor_action

    return Q, np.max(Q, axis=1)

## Q-Learning (TD Off Policy learning)

In [179]:
def q_learning(env, eps_start, eps_end, alpha_start, alpha_end, decay_duration, gamma=0.99, num_episodes=20000):
    '''
    Estimates the action value function with Q-Learning (off-policy TD control).

    Actions are selected with an epsilon-greedy policy, while the bootstrap
    target always uses the greedy action of the next state.

    Args:
        env:            OpenAI gym environment with discrete observation and action spaces
        eps_start:      Start value of the exploration probability epsilon
        eps_end:        End value of the exploration probability epsilon
        alpha_start:    Start value of the learning rate alpha
        alpha_end:      End value of the learning rate alpha
        decay_duration: Number of episodes over which epsilon and alpha are decayed linearly
        gamma:          Discount factor
        num_episodes:   Number of episodes to learn from

    Returns:
        Q: Numpy array of shape (nS, nA) with the estimated action values
        V: Numpy array of shape (nS,) with the maximum of Q over the actions
    '''
    n_states = env.observation_space.n
    n_actions = env.action_space.n
    eps_schedule = linear_decay(eps_start, eps_end, decay_duration, num_episodes)
    lr_schedule = linear_decay(alpha_start, alpha_end, decay_duration, num_episodes)

    Q = np.zeros((n_states, n_actions), dtype=np.float32)

    # one (epsilon, alpha) pair per episode
    for eps, lr in zip(eps_schedule, lr_schedule):
        state = env.reset()
        finished = False

        while not finished:
            action = select_action(Q, state, eps)
            successor, reward, finished, _ = env.step(action)

            # target uses the greedy successor action, regardless of what is played next
            greedy_successor_action = np.argmax(Q[successor])
            q_target = reward + gamma * Q[successor][greedy_successor_action] * (not finished)
            Q[state][action] += lr * (q_target - Q[state][action])

            state = successor

    return Q, np.max(Q, axis=1)

## Double Q-Learning (reducing maximization bias)

In [185]:
def double_q_learning(env, eps_start, eps_end, alpha_start, alpha_end, decay_duration, gamma=0.99, num_episodes=20000):
    '''
    Estimates the action value function with Double Q-Learning.

    Two tables are kept. Actions are selected epsilon-greedily on their mean.
    In every step a coin flip decides which table is updated: that table picks
    the greedy action of the next state and the other table provides its value.

    Args:
        env:            OpenAI gym environment with discrete observation and action spaces
        eps_start:      Start value of the exploration probability epsilon
        eps_end:        End value of the exploration probability epsilon
        alpha_start:    Start value of the learning rate alpha
        alpha_end:      End value of the learning rate alpha
        decay_duration: Number of episodes over which epsilon and alpha are decayed linearly
        gamma:          Discount factor
        num_episodes:   Number of episodes to learn from

    Returns:
        Q: Numpy array of shape (nS, nA), the mean of both tables
        V: Numpy array of shape (nS,) with the maximum of Q over the actions
    '''
    n_states = env.observation_space.n
    n_actions = env.action_space.n
    eps_schedule = linear_decay(eps_start, eps_end, decay_duration, num_episodes)
    lr_schedule = linear_decay(alpha_start, alpha_end, decay_duration, num_episodes)

    table_a = np.zeros((n_states, n_actions), dtype=np.float32)
    table_b = np.zeros((n_states, n_actions), dtype=np.float32)

    # one (epsilon, alpha) pair per episode
    for eps, lr in zip(eps_schedule, lr_schedule):
        state = env.reset()
        finished = False

        while not finished:
            action = select_action((table_a + table_b) / 2, state, eps)
            successor, reward, finished, _ = env.step(action)

            # the learner selects the greedy action, the evaluator supplies its value
            if np.random.random() < 0.5:
                learner, evaluator = table_a, table_b
            else:
                learner, evaluator = table_b, table_a

            greedy_successor_action = np.argmax(learner[successor])
            target = reward + gamma * evaluator[successor][greedy_successor_action] * (not finished)
            learner[state][action] += lr * (target - learner[state][action])

            state = successor

    Q = (table_a + table_b) / 2
    return Q, np.max(Q, axis=1)

## Testing

In [106]:
# Environment shared by all experiments in this section
env = gym.make('FrozenLake-v0')

In [107]:
def value_iteration(env, gamma=0.99, delta=1e-10):
    '''
    Computes a greedy policy and its state values with value iteration.

    The transition model is read from env.env.P, where P[state][action] holds
    tuples of (probability, next_state, reward, done). The backup is repeated
    until the largest change of a state value is smaller than delta.

    Args:
        env:     openai gym environment
        gamma:   discount factor
        delta:   threshold for the largest value change below which the iteration stops

    Returns:
        pi: function that maps a state to its greedy action
        V:  Numpy array with one value per state
    '''
    n_states = env.observation_space.n
    n_actions = env.action_space.n
    V = np.zeros(n_states, dtype=np.float64)
    transitions = env.env.P

    converged = False
    while not converged:
        Q = np.zeros(shape=(n_states, n_actions), dtype=np.float64)
        for s in range(n_states):
            for a in range(n_actions):
                # expected one-step return; nothing is bootstrapped from terminal transitions
                Q[s, a] = sum(
                    prob * (reward + gamma * V[successor] * (not done))
                    for prob, successor, reward, done in transitions[s][a]
                )

        improved_V = np.max(Q, axis=1)
        converged = np.max(np.abs(V - improved_V)) < delta
        V = improved_V

    greedy_actions = dict(enumerate(np.argmax(Q, axis=1)))

    def pi(s):
        return greedy_actions[s]

    return pi, V

In [198]:
# State values computed by value iteration from the environment's transition model
_, V = value_iteration(env, gamma=0.99)

In [199]:
print(V.reshape(4, 4))

[[0.54202593 0.49880319 0.47069569 0.4568517 ]
 [0.55845096 0.         0.35834807 0.        ]
 [0.59179874 0.64307982 0.61520756 0.        ]
 [0.         0.74172044 0.86283743 0.        ]]


In [200]:
# Parameters shared by the learning runs below:
# num_episodes   - number of episodes each algorithm is run for
# decay_duration - number of episodes over which epsilon and alpha are decayed linearly
num_episodes = 10000
decay_duration = 5000

In [201]:
# First-visit Monte Carlo control; epsilon decays from 1.0 to 0.01, alpha from 0.05 to 0.01
_, V = mc_fv_control(env, eps_start=1.0, eps_end=0.01, alpha_start=0.05, alpha_end=0.01, \
                     decay_duration=decay_duration, num_episodes=num_episodes)

In [202]:
print(V.reshape(4, 4))

[[0.48527676 0.2907662  0.19433592 0.08634876]
 [0.4999712  0.         0.2172153  0.        ]
 [0.52989995 0.57752734 0.5292619  0.        ]
 [0.         0.684992   0.8154243  0.        ]]


In [203]:
# SARSA; epsilon decays from 1.0 to 0.01, alpha from 0.05 to 0.01
_, V = sarsa(env, eps_start=1.0, eps_end=0.01, alpha_start=0.05, alpha_end=0.01, \
             decay_duration=decay_duration, num_episodes=num_episodes)

In [204]:
print(V.reshape(4, 4))

[[0.38965628 0.3313999  0.2796144  0.03633502]
 [0.40628433 0.         0.2583913  0.        ]
 [0.4462176  0.5085071  0.49611053 0.        ]
 [0.         0.6055947  0.7751972  0.        ]]


In [205]:
# Q-Learning; epsilon decays from 1.0 to 0.01, alpha starts higher here (0.5) and decays to 0.01
_, V = q_learning(env, eps_start=1.0, eps_end=0.01, alpha_start=0.5, alpha_end=0.01, \
                  decay_duration=decay_duration, num_episodes=num_episodes)

In [206]:
print(V.reshape(4, 4))

[[0.5226891  0.46078998 0.42943972 0.41381168]
 [0.54267126 0.         0.31804314 0.        ]
 [0.57574356 0.62868685 0.59484756 0.        ]
 [0.         0.73314536 0.84061    0.        ]]


In [207]:
# Double Q-Learning; epsilon decays from 1.0 to 0.01, alpha starts higher here (0.5) and decays to 0.01
_, V = double_q_learning(env, eps_start=1.0, eps_end=0.01, alpha_start=0.5, alpha_end=0.01, \
                         decay_duration=decay_duration, num_episodes=num_episodes)

In [208]:
print(V.reshape(4, 4))

[[0.49951607 0.43738    0.39781386 0.3816498 ]
 [0.5215821  0.         0.32810134 0.        ]
 [0.5581316  0.61367244 0.5924568  0.        ]
 [0.         0.7297714  0.856019   0.        ]]
