### Single Action Cart

Mountain car actions:

* 0 - Apply left force
* 1 - Apply no force
* 2 - Apply right force

State values:

* state[0] - Position 
* state[1] - Velocity

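A cart that simply applies full force toward the goal cannot climb the hill: it is not strong enough on its own and needs to build momentum from the hill behind it. The code below therefore uses Q-learning to learn which force to apply in each state, updating its Q-table with the following rule: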

$ Q^{new}(s_{t},a_{t}) \leftarrow (1-\alpha) \cdot \underbrace{Q(s_{t},a_{t})}_{\text{old value}} + \underbrace{\alpha}_{\text{learning rate}} \cdot  \overbrace{\bigg( \underbrace{r_{t}}_{\text{reward}} + \underbrace{\gamma}_{\text{discount factor}} \cdot \underbrace{\max_{a}Q(s_{t+1}, a)}_{\text{estimate of optimal future value}} \bigg) }^{\text{learned value}} $

In [1]:
import gym
import numpy as np

# Create the MountainCar environment used by every cell below.
env = gym.make("MountainCar-v0")

# Number of buckets per observation dimension, ordered [position, velocity].
DISCRETE_GRID_SIZE = [10, 10]
# Width of a single bucket along each observation dimension.
buckets = (
    env.observation_space.high - env.observation_space.low
) / DISCRETE_GRID_SIZE

def calc_discrete_state(state):
    """Map a continuous observation onto its Q-table grid cell.

    Args:
        state: Observation array from the environment
            (``state[0]`` position, ``state[1]`` velocity).

    Returns:
        Tuple of integer bucket indices, one per observation dimension,
        each within ``[0, DISCRETE_GRID_SIZE[i] - 1]``.
    """
    scaled = (state - env.observation_space.low) / buckets
    # ``np.int`` was deprecated in NumPy 1.20 and removed in 1.24; the builtin
    # ``int`` is the exact type it used to alias.
    indices = scaled.astype(int)
    # An observation sitting exactly on the upper bound scales to the grid
    # size itself, one past the last valid index, so clamp into range.
    indices = np.clip(indices, 0, np.asarray(DISCRETE_GRID_SIZE) - 1)
    return tuple(indices)

def show_q_table(q_table):
    """Print the highest-valued action for every discretised state.

    Rows of the printed table are position buckets (``p-*``) and columns are
    velocity buckets (``v-*``); each cell holds the index of the action with
    the largest Q-value for that state.

    Args:
        q_table: Array indexed as ``[position, velocity, action]``.
    """
    import pandas as pd

    best_actions = q_table.argmax(axis=2)
    n_position, n_velocity = best_actions.shape
    df = pd.DataFrame(best_actions)
    # Label each axis from the table's own shape: rows follow axis 0
    # (position) and columns follow axis 1 (velocity), so a non-square grid
    # is labelled correctly instead of raising a length-mismatch error.
    df.index = [f'p-{x}' for x in range(n_position)]
    df.columns = [f'v-{x}' for x in range(n_velocity)]
    print(df)




In [2]:
# Q-learning hyperparameters.
LEARNING_RATE = 0.5  # alpha: weight given to the newly learned value
DISCOUNT = 0.95      # gamma: weight given to the estimated future value
EPISODES = 80        # number of training episodes to play

# Episode window during which epsilon is reduced.
START_EPSILON_DECAYING = 1
END_EPSILON_DECAYING = EPISODES // 2

# Probability of taking a random action, and the amount subtracted from it
# after each episode inside the decay window.
epsilon = 1
epsilon_change = epsilon / (END_EPSILON_DECAYING - START_EPSILON_DECAYING)

In [3]:
# One Q-value per (position bucket, velocity bucket, action), all starting at 0.
q_table = np.zeros((*DISCRETE_GRID_SIZE, env.action_space.n))
show_q_table(q_table)


# np.argmax(q_table[(2,0)])

     v-0  v-1  v-2  v-3  v-4  v-5  v-6  v-7  v-8  v-9
p-0    0    0    0    0    0    0    0    0    0    0
p-1    0    0    0    0    0    0    0    0    0    0
p-2    0    0    0    0    0    0    0    0    0    0
p-3    0    0    0    0    0    0    0    0    0    0
p-4    0    0    0    0    0    0    0    0    0    0
p-5    0    0    0    0    0    0    0    0    0    0
p-6    0    0    0    0    0    0    0    0    0    0
p-7    0    0    0    0    0    0    0    0    0    0
p-8    0    0    0    0    0    0    0    0    0    0
p-9    0    0    0    0    0    0    0    0    0    0


In [4]:
def run_game(q_table, render=True):
    """Play one episode, updating ``q_table`` in place with Q-learning.

    Actions are chosen epsilon-greedily using the module-level ``epsilon``:
    with probability ``epsilon`` a random action is taken, otherwise the
    action with the highest Q-value for the current state.

    Args:
        q_table: Array indexed as ``[position, velocity, action]``; it is
            modified in place.
        render: Draw the environment after every step. Defaults to True,
            which matches the previous behaviour; pass False to train
            without rendering.

    Returns:
        Sum of the rewards received during the episode.
    """
    done = False
    total_reward = 0
    discrete_state = calc_discrete_state(env.reset())

    while not done:
        # Exploit or explore
        if np.random.random() > epsilon:
            # Exploit - take the action the Q-table currently rates best
            action = np.argmax(q_table[discrete_state])
        else:
            # Explore - take a uniformly random action
            action = np.random.randint(0, env.action_space.n)

        new_state, reward, done, _ = env.step(action)
        total_reward += reward
        new_state_disc = calc_discrete_state(new_state)

        # Update q-table: blend the old estimate with the learned value
        # (reward plus discounted best Q-value of the next state).
        max_future_q = np.max(q_table[new_state_disc])
        state_action = discrete_state + (action,)
        current_q = q_table[state_action]
        new_q = (1 - LEARNING_RATE) * current_q + LEARNING_RATE * (reward + DISCOUNT * max_future_q)
        q_table[state_action] = new_q

        discrete_state = new_state_disc

        if render:
            env.render()

    return total_reward

In [5]:
# Train for EPISODES episodes, decaying epsilon during the decay window.
episode = 0

try:
    while episode < EPISODES:

        run_game(q_table)

        # Move epsilon towards its ending value, if it still needs to move
        if END_EPSILON_DECAYING >= episode >= START_EPSILON_DECAYING:
            # The window is inclusive at both ends, so it contains one more
            # episode than the number of steps epsilon_change was sized for;
            # clamp at 0 so epsilon stays a valid probability instead of
            # ending up negative.
            epsilon = max(0.0, epsilon - epsilon_change)

        episode += 1
finally:
    # Release the environment (and its render window) even if training is
    # interrupted or raises.
    env.close()

In [6]:
# Print the highest-Q action for every position/velocity bucket of the current Q-table.
show_q_table(q_table)

     v-0  v-1  v-2  v-3  v-4  v-5  v-6  v-7  v-8  v-9
p-0    0    0    0    2    0    2    2    0    0    0
p-1    0    1    1    2    1    0    2    2    2    0
p-2    0    2    0    0    1    2    2    2    2    0
p-3    0    2    0    0    1    2    2    2    1    0
p-4    2    1    1    0    0    0    2    1    2    1
p-5    0    2    2    0    0    1    1    1    2    0
p-6    0    1    0    1    2    1    2    2    2    0
p-7    0    0    1    0    0    0    0    2    0    0
p-8    0    0    0    1    2    1    0    1    0    0
p-9    0    0    0    0    0    0    1    0    0    0
