In [1]:
import gym
from time import sleep
import numpy as np
import math
import random
env = gym.make('CartPole-v0')
env.reset()

array([ 0.04686096,  0.03516489, -0.00973275,  0.03354434])

In [2]:
no_buckets = (1, 1, 6, 3)
no_actions = env.action_space.n

state_value_bounds = list(zip(env.observation_space.low, env.observation_space.high))
state_value_bounds[1] = [-0.5, 0.5]
state_value_bounds[3] = [-math.radians(50), math.radians(50)]

action_index = len(no_buckets)
q_value_table = np.zeros(no_buckets + (no_actions,))

min_exp_rate = 0.01
min_learning_rate = 0.1

max_episodes = 1000
max_time_steps = 250
streak_to_end = 120
solved_time = 199
discount = 0.99
no_streaks = 0

In [3]:
def select_action(state_value, explore_rate): #epsilon greedy action selection
    if random.random() < explore_rate:
        action = env.action_space.sample() #random action
    else:
        action = np.argmax(q_value_table[state_value]) #greedy action

def select_explore_rate(x):
    return max(min_exp_rate, min(1, 1.0-math.log10((x+1)/25)))

def select_learning_rate(x):
    return max(min_learning_rate, min(0.5, 1.0-math.log10((x+1)/25)))

def bucketize_state_value(state_value):
    bucket_indexes = []
    for i in range(len(state_value)):
        if state_value[i] <= state_value_bounds[i][0]:
            bucket_index = 0
        elif state_value[i] >= state_value_bounds[i][1]:
            bucket_index = no_buckets[i] - 1
        else:
            bound_width = state_value_bounds[i][1] - state_value_bounds[i][0]
            offset = (no_buckets[i]-1) * state_value_bounds[i][0]/bound_width
            scaling =  (no_buckets[i]-1)/bound_width
            bucket_index = int(round(scaling * state_value[i] - offset))

        bucket_indexes.append(bucket_index)
        return tuple(bucket_indexes)

In [4]:
for episode in range(max_episodes):
    explore_rate = select_explore_rate(episode)
    learning_rate = select_learning_rate(episode)

    observation = env.reset()

    start_state_value = bucketize_state_value(observation)
    previous_state_value = start_state_value

    for time_step in range(max_time_steps):
        env.render(close=True)
        selected_action = select_action(previous_state_value, explore_rate)
        observation, reward, completed, info = env.step(selected_action)
        sleep(0.02)
        state_value = bucketize_state_value(observation)
        best_q_value = np.amax(q_value_table[state_value])

        q_value_table[previous_state_value + (selected_action,)] = learning_rate * (
        reward + discount * best_q_value - q_value_table[previous_state_value, + (selected_action,)]
        )

        print('Episode : {}'.format(episode))
        print('Time step : {}'.format(time_step))
        print('Selected action : {}'.format(selected_action))
        print('Current state : {}'.format(str(state_value)))
        print('Reward : {}'.format(reward))
        print('Best Q-value : {}'.format(best_q_value))
        print('Learning Rate : {}'.format(learning_rate))
        print('Explore Rate : %f'.format(explore_rate))

        if completed:
            print('Episode {} finished after {} time steps'.format(episode, time_step))
            if time_step>=solved_time:
                no_streaks+=1
            else:
                no_streaks = 0
            break

        previous_state_value = state_value_bounds

        if no_streaks > streak_to_end:
            break
env.close()

TypeError: render() got an unexpected keyword argument 'close'