In [1]:
%matplotlib inline

import gym
import matplotlib
import numpy as np
import sys

from collections import defaultdict
if "../" not in sys.path:
  sys.path.append("../") 
from lib.envs.blackjack import BlackjackEnv
from lib import plotting

matplotlib.style.use('ggplot')

In [2]:
# Create the CartPole-v0 environment through gym's registry and reset it.
# The bare last expression makes the notebook display the initial observation
# returned by reset().
env = gym.make("CartPole-v0")
env.reset()

array([-0.01228541,  0.04647694, -0.04716877, -0.0305521 ])

In [3]:
# Reset again and convert the observation to a tuple, i.e. the hashable form
# that the cells below use as dictionary keys for Q.
tuple(env.reset())

(-0.0099407758236392252,
 -0.012253603932741074,
 0.0076925279818170628,
 -0.048329171708184439)

In [3]:
def make_epsilon_greedy_policy(Q, epsilon, nA):
    """
    Build an epsilon-greedy action-probability function on top of Q.

    Args:
        Q: Mapping from state (tuple) to a numpy array of nA action-values.
            The mapping is indexed on every call, so a defaultdict will gain
            an entry for each state that is looked up.
        epsilon: Probability mass spread uniformly over all actions.
            Float between 0 and 1.
        nA: Number of actions in the environment.

    Returns:
        A function that takes an observation and returns a numpy array of
        length nA with the selection probability of each action: every action
        gets epsilon / nA and the greedy action gets an extra 1 - epsilon.
    """
    def policy_fn(observation):
        # Observations may arrive as arrays or lists; tuples are hashable.
        action_values = Q[tuple(observation)]
        greedy_action = np.argmax(action_values)

        # Uniform exploration share for every action ...
        probabilities = np.full(nA, epsilon, dtype=float) / nA
        # ... plus the remaining mass on the currently best action.
        probabilities[greedy_action] += 1.0 - epsilon
        return probabilities

    return policy_fn

In [9]:
def mc_control_epsilon_greedy(env, num_episodes, discount_factor=1.0,
                              epsilon=0.3, max_steps=500):
    """
    First-visit Monte Carlo control using an epsilon-greedy policy.

    Each episode is sampled with the current epsilon-greedy policy. For every
    (state, action) pair the discounted return following its first occurrence
    is averaged into Q. On top of the return a shaping bonus that depends on
    the episode's total reward is added:
        total > 100  -> bonus of 2 * total
        total > 50   -> bonus of total
        otherwise    -> no bonus

    Args:
        env: OpenAI gym environment. Must provide action_space.n, reset() and
            step(action) returning (observation, reward, done, info).
        num_episodes: Number of episodes to sample.
        discount_factor: Gamma discount factor.
        epsilon: Chance to sample a random action. Float between 0 and 1.
        max_steps: Maximum number of steps sampled per episode.

    Returns:
        A tuple (Q, policy).
        Q is a dictionary mapping state -> numpy array of action values.
        policy is a function that takes an observation as an argument and
        returns action probabilities.

    Note:
        States are used as dictionary keys exactly as returned by the
        environment (converted to tuples). For environments with continuous
        observations almost every visited state is a new key, so Q grows with
        the number of sampled steps.
    """
    nA = env.action_space.n

    # Keeps track of sum and count of returns for each (state, action) pair
    # to calculate an average. We could use an array to save all
    # returns (like in the book) but that's memory inefficient.
    returns_sum = defaultdict(float)
    returns_count = defaultdict(float)

    # The final action-value function: state -> array of action-values.
    Q = defaultdict(lambda: np.zeros(nA))

    # The policy closes over Q, so it is improved implicitly whenever Q is
    # updated. Building it once (outside the loop) also guarantees that it
    # exists when num_episodes is 0.
    policy = make_epsilon_greedy_policy(Q, epsilon, nA)

    for i_episode in range(1, num_episodes + 1):
        # Print out which episode we're on, useful for debugging.
        if i_episode % 1000 == 0:
            print("\rEpisode {}/{}.".format(i_episode, num_episodes), end="")
            sys.stdout.flush()

        # Generate an episode.
        # An episode is a list of (state, action, reward) tuples.
        episode = []
        total_score = 0
        state = tuple(env.reset())
        for _step in range(max_steps):
            probs = policy(state)
            action = np.random.choice(np.arange(len(probs)), p=probs)
            next_state, reward, done, _info = env.step(action)
            episode.append((state, action, reward))
            total_score += reward
            if done:
                break
            state = tuple(next_state)

        # Discounted return from every time step, computed in one backward
        # pass: G_t = r_t + gamma * G_{t+1}.
        returns_to_go = [0.0] * len(episode)
        running_return = 0.0
        for t in range(len(episode) - 1, -1, -1):
            running_return = episode[t][2] + discount_factor * running_return
            returns_to_go[t] = running_return

        # Shaping bonus for long episodes. The larger threshold has to be
        # tested first, otherwise the "> 100" case can never be reached.
        if total_score > 100:
            bonus = total_score * 2
        elif total_score > 50:
            bonus = total_score
        else:
            bonus = 0

        # Index of the first occurrence of each (state, action) pair.
        first_visit = {}
        for t, (visited_state, taken_action, _reward) in enumerate(episode):
            first_visit.setdefault((visited_state, taken_action), t)

        for sa_pair, t in first_visit.items():
            visited_state, taken_action = sa_pair
            G = returns_to_go[t] + bonus
            # Calculate average return for this pair over all sampled episodes
            returns_sum[sa_pair] += G
            returns_count[sa_pair] += 1.0
            Q[visited_state][taken_action] = (
                returns_sum[sa_pair] / returns_count[sa_pair]
            )

    return Q, policy

In [10]:
# Run Monte Carlo control on the CartPole environment for 500,000 episodes
# (default discount_factor) and keep the resulting Q-table and policy function.
Q, policy = mc_control_epsilon_greedy(env, num_episodes=500000)

Episode 500000/500000.

In [23]:
import pickle
# Persist the learned Q-table to disk. Q is converted to a plain dict first;
# presumably because the defaultdict's factory is a lambda, which pickle
# cannot serialize.
# NOTE(review): the recorded run of this cell ended in a MemoryError, so the
# table as produced above did not fit through pickle.dump on that machine.
with open('Q_cart_pole.pkl', 'wb') as f:
    pickle.dump(dict(Q), f)

MemoryError: 

In [None]:
# Load a dictionary back from a .npy file. The .item() call implies the file
# holds a 0-d object array (presumably a dict written with np.save), and object
# arrays are stored pickled: current NumPy refuses to load them unless
# allow_pickle=True is passed explicitly.
# SECURITY: unpickling can execute arbitrary code - only load files you created.
read_dictionary = np.load('my_file.npy', allow_pickle=True).item()
print(read_dictionary['hello']) # displays "world"

In [16]:
# Scratch check of the verbose policy on a fresh, empty Q-table.
# NOTE(review): this rebinds the notebook-level names `Q` and `policy`, so any
# trained values previously held under those names are no longer reachable
# through them after this cell runs.
Q = defaultdict(lambda: np.zeros(env.action_space.n))
print('Q:',Q,'env.action_space.n:',env.action_space.n)
# make_epsilon_greedy_policy_breakdown is defined in a cell further down the
# notebook; that cell has to be executed before this one.
policy = make_epsilon_greedy_policy_breakdown(Q, 0.1, env.action_space.n)
state=tuple(env.reset())
print('state:',state)
# Bare last expression: displays the action probabilities for the reset state.
policy(state)

Q: defaultdict(<function <lambda> at 0x000000000A067BF8>, {}) env.action_space.n: 2
state: (-0.040211342124594034, 0.0058351505182356878, -0.023569106267655762, -0.031373358129583341)
**************************
Q: defaultdict(<function <lambda> at 0x000000000A067BF8>, {(-0.040211342124594034, 0.0058351505182356878, -0.023569106267655762, -0.031373358129583341): array([ 0.,  0.])}) Q[observation]: (-0.040211342124594034, 0.0058351505182356878, -0.023569106267655762, -0.031373358129583341) [ 0.  0.]
best_action: 0 --A: [ 0.95  0.05]
**************************


array([ 0.95,  0.05])

In [7]:
def make_epsilon_greedy_policy_breakdown(Q, epsilon, nA):
    """
    Verbose variant of the epsilon-greedy policy factory: identical
    probabilities, but every call prints the intermediate values so the
    computation can be followed step by step.

    Args:
        Q: A dictionary that maps from state -> action-values.
            Each value is a numpy array of length nA (see below).
            The mapping is indexed on every call, so a defaultdict will gain
            an entry for each state that is looked up.
        epsilon: The probability to select a random action. Float between 0 and 1.
        nA: Number of actions in the environment.

    Returns:
        A function that takes the observation as an argument and returns
        the probabilities for each action in the form of a numpy array of length nA.
        The observation may be a tuple, list or array; it is converted to a
        tuple before being used as a key into Q.
    """
    def policy_fn(observation):
        # Arrays and lists are not hashable; convert so that raw environment
        # observations work as dictionary keys (same as the non-verbose policy).
        state_key = tuple(observation)

        # Every action starts with the uniform exploration share epsilon / nA.
        A = np.ones(nA, dtype=float) * epsilon / nA
        print('**************************')
        best_action = np.argmax(Q[state_key])
        # The greedy action receives the remaining 1 - epsilon.
        A[best_action] += (1.0 - epsilon)
        print('Q:',Q, 'Q[observation]:',observation,Q[state_key])
        print('best_action:',best_action, '--A:',A)
        print('**************************')
        return A
    return policy_fn

In [11]:
# Display the Q-table: keys are observation tuples, values are arrays holding
# one action-value per action.
Q

defaultdict(<function __main__.mc_control_epsilon_greedy.<locals>.<lambda>>,
            {(-0.036487387285506136,
              -0.0055654912989295605,
              0.045450637312893977,
              0.022128094560223494): array([ 11.,   0.]),
             (-0.036598697111484724,
              -0.20130878529751134,
              0.045893199204098449,
              0.32879770592644564): array([ 10.,   0.]),
             (-0.040624872817434952,
              -0.39705300447959802,
              0.05246915332262736,
              0.63559274844901759): array([ 0.,  9.]),
             (-0.048565932907026911,
              -0.20270060189427849,
              0.065181008291607709,
              0.35988419748473971): array([ 8.,  0.]),
             (-0.052619944944912483,
              -0.39868562870320323,
              0.0723786922413025,
              0.67238711755407365): array([ 7.,  0.]),
             (-0.060593657518976551,
              -0.5947350600402691,
              0.08582643459

In [8]:
import random

# Play 10 games acting greedily with respect to Q and report the mean score.
scores = []
choices = []  # not used in this cell; kept so the notebook-level name still exists
for each_game in range(10):
    score = 0
    state = tuple(env.reset())
    for _ in range(500):

        if state in Q:
            # Known state: take the action with the highest learned value.
            print('got it')
            action = np.argmax(Q[state])
        else:
            # Unknown state: fall back to a uniformly random action.
            action = random.randrange(env.action_space.n)

        new_observation, reward, done, info = env.step(action)
        score+=reward
        if done: break
        # Advance to the observation just received; without this every step
        # of the game would be decided from the initial state.
        state = tuple(new_observation)

    scores.append(score)

print('Average Score:',sum(scores)/len(scores))


Average Score: 20.1


In [6]:
# Display the Q-table: keys are observation tuples, values are arrays holding
# one action-value per action.
Q

defaultdict(<function __main__.mc_control_epsilon_greedy.<locals>.<lambda>>,
            {(-0.022609533136250571,
              0.017421658164445047,
              0.049736360066494997,
              0.022580046359963593): array([ 32.,   0.]),
             (-0.022261099972961669,
              -0.17837697388304471,
              0.050187960993694269,
              0.33053127017470346): array([  0.,  31.]),
             (-0.025828639450622564,
              0.015995969887382361,
              0.056798586397188336,
              0.054088038936751126): array([ 30.,   0.]),
             (-0.025508720052874918,
              -0.17989245270572932,
              0.057880347175923358,
              0.36413664377519384): array([  0.,  29.]),
             (-0.029106569106989506,
              0.014361181045149746,
              0.065163080051427236,
              0.090251460986007948): array([  0.,  28.]),
             (-0.028819345486086511,
              0.20849152816312486,
              0.06