In [2]:
import numpy as np
import gym
import time
import math

In [3]:
env = gym.make("CartPole-v1")
Observation = [30, 30, 30, 50]
np_array_win_size = np.array([0.25, 0.25, 0.01, 0.1])
# q = np.random.uniform(low=0, high=1, size=(Observation + [env.action_space.n]))
q = np.loadtxt("q.txt", delimiter=' ').reshape(30, 30, 30, 50, 2)

In [5]:
def get_discrete_state(state):
    discrete_state = state / np_array_win_size + np.array([15, 10, 1, 10])
    return tuple(discrete_state.astype(int))

In [9]:
def train(q_table):
    LEARNING_RATE = 0.1
    gamma = 0.95
    EPISODES = 150000

    total = 0
    total_reward = 0
    prior_reward = 100

    epsilon = 1
    epsilon_decay_value = 0.99995
    for episode in range(EPISODES + 1):
        t0 = time.time()
        discrete_state = get_discrete_state(env.reset())
        done = False
        episode_reward = 0

        if episode % 10000 == 0:
            print("Episode: " + str(episode))
        while not done:

            if np.random.random() > epsilon:
                action = np.argmax(q_table[discrete_state])
            else:
                action = np.random.randint(0, env.action_space.n)
            new_state, reward, done, _ = env.step(action)
            episode_reward += reward
            new_discrete_state = get_discrete_state(new_state)

            if episode % 2000 == 0:
                env.render()

            if not done:
                max_future_q = np.max(q_table[new_discrete_state])
                current_q = q_table[discrete_state + (action,)]
                new_q = (1 - LEARNING_RATE) * current_q + LEARNING_RATE * (reward + gamma * max_future_q)
                q_table[discrete_state + (action,)] = new_q

            discrete_state = new_discrete_state
        if epsilon > 0.05:
            if episode_reward > prior_reward and episode > 10000:
                epsilon = math.pow(epsilon_decay_value, episode - 10000)
        t1 = time.time()
        episode_total = t1 - t0
        total = total + episode_total

        total_reward += episode_reward
        prior_reward = episode_reward

        if episode % 1000 == 0:
            mean = total / 1000
            total = 0

            mean_reward = total_reward / 1000
            print("Mean Reward: " + str(mean_reward))
            total_reward = 0


In [10]:
train(q_table=q)
np.savetxt("q.txt", q.reshape(-1, 2), delimiter=' ')
discrete_state_ = get_discrete_state(env.reset())
for i in range(1000):
    env.render()
    action_ = np.argmax(q[discrete_state_])
    state_, _, d, _ = env.step(action_)  # take a random action
    if d:
        print(i)
        break
    else:
        discrete_state_ = get_discrete_state(state_)
env.close()

Episode: 0
Mean Reward: 0.033
Mean Reward: 21.804
Mean Reward: 22.643
Mean Reward: 22.544
Mean Reward: 21.93
Mean Reward: 22.924
Mean Reward: 22.855
Mean Reward: 22.565
Mean Reward: 23.029
Mean Reward: 22.791
Episode: 10000
Mean Reward: 21.497
Mean Reward: 23.136
Mean Reward: 25.835
Mean Reward: 27.576
Mean Reward: 31.382
Mean Reward: 34.759
Mean Reward: 37.405
Mean Reward: 40.296
Mean Reward: 44.896
Mean Reward: 48.617
Episode: 20000
Mean Reward: 52.787
Mean Reward: 58.838
Mean Reward: 61.158
Mean Reward: 68.71
Mean Reward: 74.856
Mean Reward: 86.972
Mean Reward: 93.526
Mean Reward: 99.103
Mean Reward: 114.704
Mean Reward: 123.261
Episode: 30000
Mean Reward: 123.299
Mean Reward: 141.675
Mean Reward: 144.547
Mean Reward: 152.087
Mean Reward: 167.813
Mean Reward: 183.005
Mean Reward: 188.725
Mean Reward: 217.505
Mean Reward: 204.906
Mean Reward: 223.79
Episode: 40000
Mean Reward: 249.745
Mean Reward: 247.735
Mean Reward: 256.644
Mean Reward: 272.664
Mean Reward: 280.0
Mean Reward: 265.5

In [14]:
discrete_state_ = get_discrete_state(env.reset())
for i in range(1000):
    env.render()
    action_ = np.argmax(q[discrete_state_])
    state_, _, d, _ = env.step(action_)
    if d:
        print(i)
        break
    else:
        discrete_state_ = get_discrete_state(state_)
env.close()

325
