# Crossentropy method

This notebook will teach you to solve reinforcement learning problems with the cross-entropy method.

In [None]:
import gym
import numpy as np
import pandas as pd

# Create the Taxi environment, reset it to an initial state and draw that state.
# `env` is used as a module-level global by the cells below.
env = gym.make("Taxi-v2")
env.reset()
env.render()

In [None]:
# Sizes of the observation and action spaces (their `.n` attributes);
# both are used below as the dimensions of the tabular policy.
n_states, n_actions = env.observation_space.n, env.action_space.n

print("n_states=%i, n_actions=%i"%(n_states, n_actions))

# Create stochastic policy

This time our policy should be a probability distribution.

```policy[s,a] = P(take action a | in state s)```

Since we still use integer state and action representations, you can use a 2-dimensional array to represent the policy.

Please initialize policy __uniformly__, that is, probabilities of all actions should be equal.


In [None]:
# Uniform stochastic policy: one row per state, one column per action.
# Every entry equals 1/n_actions, so each row sums to 1.
policy = np.ones((n_states, n_actions)) / n_actions

In [None]:
# The policy must be a NumPy array/matrix, uniform, and row-normalised.
assert type(policy) in (np.ndarray, np.matrix)
assert np.allclose(policy, 1. / n_actions)
assert np.allclose(policy.sum(axis=1), 1)
print('You nailed it!')

# Play the game

Just like before, but we also record all states and actions we took.

In [None]:
def generate_session(policy, t_max=10**4):
    """
    Play a single episode in the global `env`, sampling actions from `policy`.

    The episode stops when the environment reports `done` or after `t_max`
    steps, whichever comes first.

    Parameters
    ----------
    policy : np.array, shape (n_states, n_actions)
        Row `s` holds the action probabilities used when in state `s`
    t_max : int
        Maximum number of steps to play

    Returns
    -------
    states : list
        The state observed before each action
    actions : list
        The action sampled at each step (same length as `states`)
    total_reward : float
        The sum of the rewards collected during the episode
    """
    visited_states, chosen_actions = [], []
    episode_return = 0.

    state = env.reset()

    for _ in range(t_max):
        # Passing the integer n_actions makes np.random.choice draw from
        # np.arange(n_actions), weighted by the current state's policy row.
        action = np.random.choice(n_actions, p=policy[state, :])

        next_state, reward, finished, _info = env.step(action)

        # Log the state the action was taken *from*, not the one it led to.
        visited_states.append(state)
        chosen_actions.append(action)
        episode_return += reward

        state = next_state
        if finished:
            break

    return visited_states, chosen_actions, episode_return
        

In [None]:
# Smoke-test generate_session: parallel state/action lists and a float return.
s,a,r = generate_session(policy)
assert type(s) == type(a) == list
assert len(s) == len(a)
# `np.float` was only an alias of the builtin float and no longer exists in
# NumPy >= 1.24; accept the builtin float and any NumPy floating scalar instead.
assert isinstance(r, (float, np.floating))
print('You nailed it again!!!')

In [None]:
#let's see the initial reward distribution
import matplotlib.pyplot as plt
%matplotlib inline

# Total rewards of 200 sessions played with the current policy,
# each capped at 1000 steps.
sample_rewards = [generate_session(policy, t_max=1000)[-1] for _ in range(200)]

plt.hist(sample_rewards, bins=20);
# Mark the median and the 90th percentile of the sampled rewards.
for pct, pct_label, pct_color in ((50, "50'th percentile", 'green'),
                                  (90, "90'th percentile", 'red')):
    plt.vlines([np.percentile(sample_rewards, pct)], [0], [100], label=pct_label, color=pct_color)
plt.legend()

### Crossentropy method steps (2pts)

In [None]:
def select_elites(states_batch, actions_batch, rewards_batch, percentile=50):
    """
    Select states and actions from games that have rewards >= percentile.

    Notes
    -----
    It is not assumed that states are integers (they'll get different later)

    Parameters
    ----------
    states_batch : list
        List of list of states given as
        >>> states_batch[session_i][t]
        Where session_i is the session and t is the step
    actions_batch : list
        List of list of actions given as
        >>> actions_batch[session_i][t]
        Where session_i is the session and t is the step
    rewards_batch : list
        List of total rewards, one per session
    percentile : float
        The reward percentile (0-100) used as the elite threshold.
        Sessions whose reward is >= that percentile of rewards_batch are kept

    Returns
    -------
    elite_states : list
        A list of the states where the elite actions took place
        Sorted by session number and timestep within session
    elite_actions : list
        A list of the elite actions
        Sorted by session number and timestep within session
    """
    # Explicit array conversion: rewards_batch may be a list or a tuple
    # (e.g. the result of zip(*sessions)), and the comparison below must be
    # element-wise regardless of the container type.
    rewards = np.asarray(rewards_batch)

    # Nothing to select from; also avoids taking a percentile of no data.
    if rewards.size == 0:
        return [], []

    # Minimum reward a session needs in order to count as elite
    reward_threshold = np.percentile(rewards, percentile)
    is_elite = rewards >= reward_threshold

    # One pass over the sessions, testing a per-session flag instead of
    # searching the elite index array for every single step.
    elite_states, elite_actions = [], []
    for keep, session_states, session_actions in zip(is_elite, states_batch, actions_batch):
        if keep:
            elite_states.extend(session_states)
            elite_actions.extend(session_actions)

    return elite_states, elite_actions

In [None]:
# Three toy games with rewards 3, 4 and 5, used to check select_elites
# at several percentile thresholds.
states_batch = [
    [1,2,3],   #game1
    [4,2,0,2], #game2
    [3,1]      #game3
]

actions_batch = [
    [0,2,4],   #game1
    [3,2,0,1], #game2
    [3,3]      #game3
]
rewards_batch = [
    3,         #game1
    4,         #game2
    5,         #game3
]

test_result_0 = select_elites(states_batch, actions_batch, rewards_batch, percentile=0)
test_result_30 = select_elites(states_batch, actions_batch, rewards_batch, percentile=30)
test_result_90 = select_elites(states_batch, actions_batch, rewards_batch, percentile=90)
test_result_100 = select_elites(states_batch, actions_batch, rewards_batch, percentile=100)

assert np.all(test_result_0[0] == [1, 2, 3, 4, 2, 0, 2, 3, 1])  \
   and np.all(test_result_0[1] == [0, 2, 4, 3, 2, 0, 1, 3, 3]),\
        "For percentile 0 you should return all states and actions in chronological order"
# The 30th percentile of rewards (3, 4, 5) lies between 3 and 4,
# so only game2 and game3 qualify.
assert np.all(test_result_30[0] == [4, 2, 0, 2, 3, 1]) and \
        np.all(test_result_30[1] ==[3, 2, 0, 1, 3, 3]),\
        "For percentile 30 you should only select states/actions from the two best games"
assert np.all(test_result_90[0] == [3,1]) and \
        np.all(test_result_90[1] == [3,3]),\
        "For percentile 90 you should only select states/actions from one game"
assert np.all(test_result_100[0] == [3,1]) and\
       np.all(test_result_100[1] == [3,3]),\
        "Please make sure you use >=, not >. Also double-check how you compute percentile."
print("Ok!")

In [None]:
from collections import Counter

def update_policy(elite_states, elite_actions):
    """
    Build a new policy from the elite states/actions returned by select_elites.

    Each action probability is proportional to how often the pair occurred:

    policy[s_i, a_i] ~ #[occurences of si and ai in elite states/actions]

    Rows are normalized to valid probability distributions. States that never
    occur in elite_states get the uniform distribution 1./n_actions, which
    also avoids the 0/0 case.

    The table dimensions come from the module-level n_states and n_actions.

    Parameters
    ----------
    elite_states : list
        A list of the states where the elite actions took place
        Sorted by session number and timestep within session
    elite_actions : list
        A list of the elite actions
        Sorted by session number and timestep within session

    Returns
    -------
    new_policy : np.array, shape (n_states, n_actions)
        The array containing the updated probabilities for the actions given a state
    """
    # Raw occurrence counts of every (state, action) pair among the elites
    pair_counts = np.zeros([n_states, n_actions])
    for (state, action), n_seen in Counter(zip(elite_states, elite_actions)).items():
        pair_counts[state, action] = n_seen

    # A state was visited iff its row of counts is non-zero; this replaces a
    # linear search through elite_states for every row of the table.
    state_visits = pair_counts.sum(axis=1)
    visited = state_visits > 0

    # Start from the uniform fallback, then overwrite the visited rows with
    # their normalized counts.
    new_policy = np.full([n_states, n_actions], 1. / n_actions)
    new_policy[visited] = pair_counts[visited] / state_visits[visited, np.newaxis]

    return new_policy

In [None]:
# Hand-made elite sample for checking update_policy against known probabilities.
elite_states = [1, 2, 3, 4, 2, 0, 2, 3, 1]
elite_actions = [0, 2, 4, 3, 2, 0, 1, 3, 3]

new_policy = update_policy(elite_states, elite_actions)

assert np.isfinite(new_policy).all(), "Your new policy contains NaNs or +-inf. Make sure you don't divide by zero."
assert np.all(new_policy >= 0), "Your new policy can't have negative action probabilities"
assert np.allclose(new_policy.sum(axis=-1), 1), "Your new policy should be a valid probability distribution over actions"

# Expected action probabilities for states 0-3 (first five actions).
reference_answer = np.array([
    [1., 0., 0., 0., 0.],
    [0.5, 0., 0., 0.5, 0.],
    [0., 0.33333333, 0.66666667, 0., 0.],
    [0., 0., 0., 0.5, 0.5],
])
assert np.allclose(new_policy[:4, :5], reference_answer)
print("Ok!")

# Training loop
Generate sessions, select N best and fit to those.

In [None]:
from IPython.display import clear_output

def show_progress(batch_rewards, log, percentile, reward_range=[-990,+10]):
    """
    Record and plot the training progress for the latest batch.

    Appends [mean reward, percentile threshold] of batch_rewards to log,
    clears the cell output, prints both numbers and draws two panels:
    the logged history and a histogram of the current batch's rewards.
    """
    mean_reward = np.mean(batch_rewards)
    threshold = np.percentile(batch_rewards, percentile)
    log.append([mean_reward, threshold])

    clear_output(True)
    print("mean reward = %.3f, threshold=%.3f"%(mean_reward, threshold))

    # Transpose the log so each logged quantity becomes one series.
    history = list(zip(*log))

    plt.figure(figsize=[8,4])

    # Left panel: mean reward and threshold over all logged batches.
    plt.subplot(1,2,1)
    plt.plot(history[0], label='Mean rewards')
    plt.plot(history[1], label='Reward thresholds')
    plt.legend()
    plt.grid()

    # Right panel: reward histogram of this batch with the threshold marked.
    plt.subplot(1,2,2)
    plt.hist(batch_rewards, range=reward_range);
    plt.vlines([threshold], [0], [100], label="percentile", color='red')
    plt.legend()
    plt.grid()

    plt.show()

In [None]:
#reset policy just in case
policy = np.ones([n_states, n_actions]) / n_actions 

n_sessions = 250  # number of sessions sampled in every training iteration
percentile = 50  # reward percentile passed to select_elites as the elite threshold
learning_rate = 0.5  # weight of new_policy when it is blended with the current policy below

# show_progress appends one [mean reward, threshold] entry per iteration
log = []

for i in range(100):
    
    # Play n_sessions episodes with the current policy (the line magic times this step)
    %time sessions = [generate_session(policy) for i in range(n_sessions)]
    
    # Regroup the per-session (states, actions, reward) triples into three parallel tuples
    batch_states, batch_actions, batch_rewards = zip(*sessions)

    elite_states, elite_actions = select_elites(batch_states, batch_actions, batch_rewards, percentile=percentile)
    
    new_policy = update_policy(elite_states, elite_actions)
    
    # Move only part of the way towards new_policy instead of replacing the policy outright
    policy = learning_rate * new_policy + (1-learning_rate) * policy
    
    #display results on chart
    show_progress(batch_rewards, log, percentile)

### Reflecting on results

You may have noticed that the taxi problem quickly converges from <-1000 to a near-optimal score and then descends back into -50/-100. This is in part because the environment has some innate randomness. Namely, the starting points of passenger/driver change from episode to episode.

In case CEM failed to learn how to win from one distinct starting point, it will simply discard it because no sessions from that starting point will make it into the "elites".

To mitigate that problem, you can either reduce the threshold for elite sessions (duct tape way) or  change the way you evaluate strategy (theoretically correct way). You can first sample an action for every possible state and then evaluate this choice of actions by running _several_ games and averaging rewards.

### Submit to coursera

In [None]:
from submit import submit_taxi
# Fill in your own credentials before running this cell.
# Avoid committing or sharing the notebook with a real token filled in.
EMAIL = ''
TOKEN = ''
# submit_taxi comes from the local `submit` module (not shown in this notebook);
# it is given the session generator and the trained policy.
submit_taxi(generate_session, policy, EMAIL, TOKEN)