In [1]:
%matplotlib inline

import gym
import matplotlib
import numpy as np
import sys
import time

from collections import defaultdict

from blackjack import BlackjackEnv
import plotting

matplotlib.style.use('ggplot')

In [2]:
env = BlackjackEnv()

In [3]:
def make_epsilon_greedy_policy(Q, epsilon, nA):
    """
    Creates an epsilon-greedy policy based on a given Q-function and epsilon.
    
    Args:
        Q: A dictionary that maps from state -> action-values.
            Each value is a numpy array of length nA (see below)
        epsilon: The probability to select a random action . float between 0 and 1.
        nA: Number of actions in the environment.
    
    Returns:
        A function that takes the observation as an argument and returns
        the probabilities for each action in the form of a numpy array of length nA.
    
    """
    def policy_fn(observation):
        # np.random.seed(0)
        optimal_action = np.argmax(Q[observation])
        pi = np.full(nA, epsilon / nA)
        pi[optimal_action] = epsilon / nA + 1 - epsilon
        
        return pi
    return policy_fn 

Why nested function?

In [14]:
from collections import namedtuple
def mc_control_epsilon_greedy(env, num_episodes, discount_factor=1.0, epsilon=0.1):
    """
    Monte Carlo Control using Epsilon-Greedy policies.
    Finds an optimal epsilon-greedy policy.
    
    Args:
        env: OpenAI gym environment.
        num_episodes: Number of episodes to sample.
        discount_factor: Gamma discount factor.
        epsilon: Chance the sample a random action. Float betwen 0 and 1.
    
    Returns:
        A tuple (Q, policy).
        Q is a dictionary mapping state -> action values.
        policy is a function that takes an observation as an argument and returns
        action probabilities
    """

    returns_sum = defaultdict(float)
    returns_count = defaultdict(float)
    
    # state -> action-value array (whose index is action). 
    Q = defaultdict(lambda: np.zeros(env.action_space.n)) # initial policy evaluation
    policy = make_epsilon_greedy_policy(Q, epsilon, env.action_space.n) # initial policy improvement
    action_space = list(range(env.action_space.n))
    
    # Generate episodes (i.e. full blackjack games)
    Experience = namedtuple('Experience', ['state', 'action', 'reward'])
    for i_episode in range(1, num_episodes + 1):
            
        # Generate experiences (state, action, reward); store in list 'experiences'
        experiences = []
        state = env.reset()
        for t in range(100):
            action = np.random.choice(action_space, p=policy(state))
            next_state, reward, done, _ = env.step(action)
            experiences.append(Experience(state, action, reward))
            if done:
                break
            state = next_state
            
        # Compute average returns in states experienced in this episode
        Q = evaluate_policy(experiences, returns_sum, returns_count, Q, discount_factor)
        policy = make_epsilon_greedy_policy(Q, epsilon, env.action_space.n)
        
    #total number of policy improvement cycles
    print("policy improvement ", sum(returns_count.values()))
    return Q, policy

In [15]:
def evaluate_policy(experiences, returns_sum, returns_count, Q, discount_factor):
    experienced_state_actions = set([(experience.state, experience.action) for experience in experiences])
    for state_action in experienced_state_actions:
        # Find the first occurance of the state in the episode
        first_occurence_idx = next(i for i, experience in enumerate(experiences) if (experience.state, experience.action) == state_action)
        # Sum up all rewards since the first occurance
        G = sum([experience.reward * (discount_factor**i) for i, experience in enumerate(experiences[first_occurence_idx:])])
        # Calculate average return for this state over all sampled episodes
        returns_sum[state_action] += G
        returns_count[state_action] += 1.0
            
        s, a = state_action
        Q[s][a] = returns_sum[state_action] / returns_count[state_action]
        
    return Q
    

In [16]:
start = time.clock()

Q, policy = mc_control_epsilon_greedy(env, num_episodes=1000000, epsilon=0.1)

end = time.clock()
print("\n",end - start,"s")

  """Entry point for launching an IPython kernel.


policy improvement  1276598.0

 143.35381099999998 s


  """


In [13]:
# For plotting: Create value function from action-value function
# by picking the best action at each state
#total number of policy improvement cycles
print("policy improvement ", sum(returns_count.values()))
V = defaultdict(float)
for state, actions in Q.items():
    action_value = np.max(actions)
    V[state] = action_value
plotting.plot_value_function(V, title="Optimal Value Function")

NameError: name 'returns_count' is not defined