In [28]:
import gym
import numpy as np
from tqdm import tqdm_notebook as tqdm

In [3]:
# Build the Taxi environment used by every cell below and reset it to start an
# episode. The value returned by reset() is displayed as this cell's output.
env = gym.make("Taxi-v2")
env.reset()

[2017-10-24 22:04:37,207] Making new env: Taxi-v2


268

In [4]:
# Draw the environment's current state (the text grid shown in the output).
env.render()

+---------+
|[35mR[0m: | : :G|
| : : : : |
| : : :[43m [0m: |
| | : | : |
|[34;1mY[0m| : |B: |
+---------+



In [11]:
# Sizes of the discrete observation and action spaces; the policy table built
# later has one row per state and one column per action.
n_states, n_actions = env.observation_space.n, env.action_space.n
print(n_states, n_actions, sep="\n")

500
6


In [17]:
def generate_session(policy, t_max=10 ** 4):
    """Play one episode in the module-level ``env`` following ``policy``.

    Args:
        policy: table indexed by state; ``policy[state]`` is used as the
            probability vector over the ``n_actions`` actions.
        t_max: upper bound on the number of steps taken in the episode.

    Returns:
        A ``(total_reward, session)`` tuple, where ``session`` is the list of
        ``(state, action)`` pairs in the order they were visited. Each pair
        holds the state the action was chosen in, not the resulting state.
    """
    state = env.reset()
    trajectory = []
    episode_return = 0
    for _step in range(t_max):
        # Sample an action from the policy's distribution for this state.
        chosen = np.random.choice(n_actions, p=policy[state])
        trajectory.append((state, chosen))
        state, reward, finished, _ = env.step(chosen)
        episode_return += reward
        if finished:
            break
    return episode_return, trajectory

In [18]:
def get_uniform_policy(eps=1 / n_actions):
    """Return an ``(n_states, n_actions)`` table with every entry set to ``eps``.

    With the default ``eps`` each row holds ``n_actions`` copies of
    ``1 / n_actions``, i.e. a uniform distribution over actions. Other values
    give a constant table that is not normalised.
    """
    table_shape = (n_states, n_actions)
    return np.add(np.zeros(table_shape), eps)

In [19]:
def policy_from_session(session):
    """Estimate a policy table from observed ``(state, action)`` pairs.

    Every cell starts at 0.1 (so no action ends up with zero probability),
    each observed pair adds 1 to its cell, and each row is then divided by
    its sum so that it forms a probability distribution over actions.

    Args:
        session: iterable of ``(state, action)`` index pairs.

    Returns:
        Row-normalised array with the same shape as ``get_uniform_policy()``.
    """
    counts = get_uniform_policy(0.1)
    for pair in session:
        # `pair` is used directly as a 2-D index: counts[state, action].
        counts[pair] += 1
    row_totals = counts.sum(axis=1, keepdims=True)
    return counts / row_totals

In [40]:
# Current policy table; starts as the default table from get_uniform_policy()
# and is reassigned by the training loop each epoch.
curr = get_uniform_policy()

In [41]:
def evaluate(policy, times=1000):
    """Return the mean total reward of ``policy`` over ``times`` fresh sessions."""
    episode_returns = []
    for _ in range(times):
        total_reward, _session = generate_session(policy)
        episode_returns.append(total_reward)
    return np.mean(episode_returns)

In [51]:
# Training-loop settings.
n_epochs = 25      # number of policy-update iterations
n_samples = 5000   # sessions generated with the current policy in each epoch
# Selection level the training loop uses to derive the elite score threshold.
# NOTE(review): np.percentile takes q on a 0-100 scale; confirm whether 0.9 is
# meant as a fraction (i.e. the 90th percentile).
percentile = 0.9

In [110]:
# Cross-entropy method: sample sessions with the current policy, keep the
# best-scoring ones, and refit the policy to the (state, action) pairs they
# contain.
for epoch in tqdm(range(n_epochs)):
    print("Epoch " + str(epoch) + ":")
    scores, samples = zip(*[generate_session(curr) for _ in range(n_samples)])

    # np.percentile needs the whole score sample and a q on the 0-100 scale.
    # Passing it the scalar (max - min) just returns that scalar, which made
    # the threshold equal to max(scores) and left the elite set empty.
    # `percentile` is configured as a fraction (0.9), so rescale it; a value
    # above 1 is assumed to already be on the 0-100 scale.
    q = percentile * 100 if percentile <= 1 else percentile
    threshold = np.percentile(scores, q)

    # Keep ties (>=) so the elite set is never empty, even when many sessions
    # share the same score. Flatten with a comprehension rather than
    # sum(lists, []), which copies the growing list on every addition.
    elite_pairs = [
        pair
        for session, score in zip(samples, scores)
        if score >= threshold
        for pair in session
    ]
    curr = policy_from_session(elite_pairs)
    print("score:", np.mean(scores))

Epoch 0:

score: -74.9394
Epoch 1:
score: -11.8054
Epoch 2:
score: 0.291
Epoch 3:
score: 7.1052
Epoch 4:


KeyboardInterrupt: 

In [112]:
# Mean total reward of the trained policy over evaluate()'s default number of
# sessions; displayed as the cell output.
evaluate(curr)

7.6479999999999997