In [1]:
import gym
import time
import numpy as np

In [2]:
env = gym.make('CartPole-v1')
env.reset()

for step in range(100):
    env.render()
    action = env.action_space.sample()
#     print(action)
    # sample: (array([ 0.20307615,  0.24289648, -5.11968664, -5.43760231]), 0.0, True, {})
    observation, reward, done, info = env.step(action)
    time.sleep(0.02)
env.close()



In [3]:
env.step(env.action_space.sample())

(array([-1.94200436, -1.45084837, -2.09692214, -6.45398374]), 0.0, True, {})

In [4]:
env.action_space.n

2

In [5]:
env.observation_space

Box(4,)

In [6]:
action_size = env.action_space.n 

In [7]:
def craete_bins(number_of_bins=10):
    bins_cart_position = np.linspace(-4.8, 4.8, number_of_bins)
    bins_cart_velocity = np.linspace(-5, 5, number_of_bins)
    bins_pole_angle = np.linspace(-0.418, 0.418, number_of_bins)
    bins_pole_angular_velocity = np.linspace(-5, 5, number_of_bins)
    
    bins = np.array([bins_cart_position, bins_cart_velocity, bins_pole_angle, bins_pole_angular_velocity])
    
    return bins

In [8]:
# Return the indices of the bins to which each value in input array belongs.
# Example:
bins_smple = [1,2,3,4,5,6]
np.digitize(4.4, bins_smple)

4

In [34]:
num_bins = 9
all_bins = craete_bins(num_bins)
def digitised_observation(observation):
    digitised_bins = list()
    for i, obs in enumerate(observation):
        digitised_bins.append(np.digitize(obs, all_bins[i]))
        
    return tuple(digitised_bins)

In [35]:
digitised_observation([-0.24667234,  0.65470702, -4.31367009,  5.02866106])

(4, 5, 0, 9)

In [11]:
# Creating Q-Table
q_table = np.zeros([num_bins, num_bins, num_bins, num_bins, env.action_space.n])

In [12]:
q_table.shape

(10, 10, 10, 10, 2)

In [13]:
np.argmax(q_table[0,0,0,0])

0

In [14]:
env.action_space.sample()

1

In [29]:
q_table[(5, 6, 0, 10) + (1,)]

IndexError: index 10 is out of bounds for axis 3 with size 10

In [72]:
# Hyperparameters
# Exploration vs. Exploitation parameters
epsilon = 1.0                 # Exploration rate
max_epsilon = 1.0             # Exploration probability at start
min_epsilon = 0.01            # Minimum exploration probability
decay_rate = 0.001             # Exponential decay rate for exploration prob


# It is common to leave Hyperparameters in ALL CAPS to easily locate them

EPOCHS=20000  # number of epochs/episodes to train for
ALPHA = 0.8 # aka the learning rate
GAMMA = 0.95 # aka the discount rate
# MAX_EPISODES = 100  # optional, also defined in env setup above

In [73]:
q_table[(5, 6, 0, 9)].shape

(2,)

In [74]:
# q_table[(5, 6, 0, 9),:,6:,8:,9:].shape

In [75]:
def epsilon_greedy_action_selection(epsilon, q_table, discrete_state):
    '''
    Returns an action for the agent. Note how it uses a random number to decide on
    exploration versus explotation trade-off.
    '''
    random_number = np.random.random()

    # EXPLOITATION, USE BEST Q(s,a) Value
    if random_number > epsilon:
        # Action row for a particular state
        action = np.argmax(q_table[discrete_state])

    # EXPLORATION, USE A RANDOM ACTION
    else:
        # Return a random 0,1 action
        action = env.action_space.sample()

    return action

In [76]:
BURN_IN = 1
EPSILON_END = 10000
EPSILON_REDUCE = 0.0001

def reduce_epsilon(epsilon,epoch):
    if BURN_IN <= epoch <= EPSILON_END:
        epsilon -= EPSILON_REDUCE
    return epsilon

In [77]:
def fail(done, points, reward):
    if done and points < 150:
        reward = -200
    return reward

In [78]:
def compute_next_q_value(old_q_value, reward, next_optimal_q_value):

    return old_q_value +  ALPHA * (reward + GAMMA * next_optimal_q_value - old_q_value)

In [79]:
env.reset()

array([ 0.02973516,  0.04505068, -0.01412676, -0.01070191])

In [80]:
env.step(1)

(array([ 0.03063617,  0.24037235, -0.0143408 , -0.30780831]), 1.0, False, {})

In [81]:
digitised_observation(env.reset())

(5, 5, 5, 5)

In [82]:
# Training Process
points_log = []  # to store all achieved points
epochs = []  # store the epoch for plotting
rewards = []
epoch = 200
for episode in range(EPOCHS):
    state = env.reset()
    done = False
    epochs.append(epoch)
    points = 0  # store result
    digitised_state = digitised_observation(state)
    while not done:
        action = epsilon_greedy_action_selection(epsilon, q_table, digitised_state)
        new_state, reward, done, info = env.step(action)
        reward = fail(done, points, reward)  # Check if reward or fail state
        digitised_new_state = digitised_observation(new_state)
        
        # Look up current/old qtable value Q(s_t,a_t)
        old_q_value =  q_table[digitised_state + (action,)]  

        # Get the next optimal Q-Value
        next_optimal_q_value = np.max(q_table[digitised_new_state])  
#         print("next_optimal_q_value: ", next_optimal_q_value)
        
        # Compute next q value
        next_q = compute_next_q_value(old_q_value, reward, next_optimal_q_value) 
#         print("next_q: ", next_q)

        # Update Q Table
        q_table[digitised_state + (action,)] = next_q
        
        points += 1
        
                
        # Our new state is state
        digitised_state = digitised_new_state
        
#     episode += 1
    # Reduce epsilon (because we need less and less exploration)
    epsilon = reduce_epsilon(epsilon,episode) 
    rewards.append(total_rewards)


env.close()
        

In [85]:
observation = env.reset()
rewards = 0
for _ in range(1000):
    env.render()
    discrete_state = digitised_observation(observation)  # get bins
    action = np.argmax(q_table[discrete_state])  # and chose action from the Q-Table
    observation, reward, done, info = env.step(action) # Finally perform the action
    rewards+=1
    if done:
        print(f"You got {rewards} points!")
        break
env.close()

You got 200 points!
