## Random Guessing Algorithm for CartPole OpenAI Gym
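This notebook implements the simplest Reinforcement Learning solution to the intro-to-RL CartPole problem from the OpenAI Gym (https://openai.com/requests-for-research/#cartpole): the "*random guessing algorithm*". It samples a batch of random weight vectors for a linear policy, evaluates each one on the environment, and keeps the vector with the highest average reward.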

In [1]:
from __future__ import print_function
import gym
import numpy as np

In [2]:
# Create the CartPole environment once; every later cell uses this global `env`.
env = gym.make('CartPole-v0')
# Show the bounds of the observation space so the size of the input vector is visible.
print("Highs:", env.observation_space.high)
print("Lows: ", env.observation_space.low)

# Show the action space (the recorded output is Discrete(2)).
print(env.action_space)

[2016-07-11 06:19:34,660] Making new env: CartPole-v0


Highs: [  4.80000000e+00   3.40282347e+38   4.18879020e-01   3.40282347e+38]
Lows:  [ -4.80000000e+00  -3.40282347e+38  -4.18879020e-01  -3.40282347e+38]
Discrete(2)


In [3]:
# Reset the environment to get a starting observation; it is kept as a global
# and reused below as a sample input for model_step.
observation = env.reset()
# Bare expression so the notebook displays the observation array.
observation

array([ 0.04240727, -0.00041078,  0.0115244 ,  0.02211273])

In [4]:
# Here I'm just keeping track of my personal best. Nothing in this notebook writes
# to these two values, so they have to be updated manually after a better run.
# ... I happened to randomly get a perfect model in the first 100 I tried (number 63)!
# They seem to happen in about 1/150 of the models I generate.
# NOTE(review): 1000 looks like the highest average the evaluation can produce
# (episodes are capped at 1000 steps and the recorded full-length episodes score
# 1000.0). If so, the strict `>` comparison in the search cell can never report a
# new record against this value -- confirm.
personal_best_reward = 1000
personal_best_weight = np.array([ 0.10047517,  0.45675998,  0.99510988,  0.75130867])

In [5]:
def random_range(a,b,shape):
    ''' Draw an array of the given shape with values sampled uniformly from [a, b).

    np.random.random yields samples in [0, 1); they are stretched by the
    interval width and then shifted so the interval starts at a.
    '''
    width = b - a
    unit_samples = np.random.random(shape)
    return a + width * unit_samples

In [6]:
# Create 100 random weight vectors, each with the same shape as an observation
# and with entries drawn from [-1, 1).
# That way, you can dot-product the input and the weight to make a choice.
# NOTE(review): no random seed is set in the cells shown, so the candidates
# differ on every run -- confirm that is intended.
Ws = [random_range(-1,1,env.observation_space.shape) for _ in range(100)]
# Preview only the first five candidates instead of dumping the whole list.
Ws[:5]

[array([ 0.5325113 ,  0.87059501, -0.61881113,  0.91631689]),
 array([-0.59392468, -0.90155906, -0.83717758, -0.25264425]),
 array([-0.88522628, -0.41954885,  0.42927628,  0.38708858]),
 array([ 0.37939376, -0.58231667,  0.36979234,  0.58605575]),
 array([-0.15183026, -0.20212978, -0.75272117, -0.86942718])]

In [7]:
def model_step(W,x):
    ''' Simplest model ever: Just the linear multiplication, i.e. dot product!

    W -- weight vector, same shape as the observation.
    x -- observation vector.

    Returns the action as a plain Python int: 1 when the dot product is
    non-negative, 0 otherwise.
    '''
    y = np.dot(W,x)
    # Use the sign of the result to decide left or right. Branch on it directly
    # rather than indexing [0,1] with the comparison: `y >= 0` is a NumPy boolean
    # scalar here, and NumPy has deprecated using np.bool_ as an index.
    return 1 if y >= 0 else 0
# Try the policy once: first candidate weights applied to the initial observation.
model_step(Ws[0],observation)

1

In [8]:
def test_model(W, num_batches=100, max_steps=1000):
    ''' Score weight vector W by running it on the global `env` for several episodes.

    W           -- weight vector handed to model_step at every step.
    num_batches -- number of episodes to run (default 100, the original value).
    max_steps   -- upper bound on steps per episode (default 1000, the original value).

    Returns (total_reward, average_reward): the reward summed over all episodes
    and that sum divided by num_batches.

    Raises ValueError if num_batches is less than 1, since the average would be
    undefined.
    '''
    if num_batches < 1:
        raise ValueError("num_batches must be at least 1")

    total_reward = 0
    for i_episode in range(num_batches):
        observation = env.reset()
        batch_reward = 0
        for _ in range(max_steps):
            action = model_step(W,observation)
            observation, reward, done, info = env.step(action)
            batch_reward += reward
            if done:
                # The environment ended the episode before the step cap.
                break
        total_reward += batch_reward
    # float() keeps this a true division on Python 2 (the notebook only imports
    # print_function from __future__), even if rewards were integers.
    average_reward = total_reward / float(num_batches)
    return total_reward,average_reward

# Smoke-test the evaluator on one candidate; displays (total_reward, average_reward).
test_model(Ws[2])

(8201.0, 82.01)

In [9]:
# Best-so-far trackers for the candidate search: the winning weight vector,
# its position in Ws, and its average reward.
# Starting the reward at 0 means a candidate must score above 0 to be recorded.
best_weights = None
best_weights_idx = 0
best_weight_reward = 0

In [10]:
# Evaluate every candidate in Ws and remember the one with the highest average reward.
# The best-so-far trackers are reset here, in the same cell as the loop, so that
# re-running this cell after Ws has been regenerated cannot leave best_weights_idx
# pointing at a position from a previous candidate list.
best_weights = None
best_weights_idx = 0
best_weight_reward = 0

for idx,W in enumerate(Ws):
    total_reward,average_reward = test_model(W)
    print("{0}/{1}: Average Reward: {2} Total Reward: {3}".format(idx, len(Ws), average_reward, total_reward))
    # Strict comparison: on a tie the earliest candidate is kept.
    if average_reward > best_weight_reward:
        best_weight_reward = average_reward
        best_weights = W
        best_weights_idx = idx
print("Best Reward:", best_weight_reward)
# Note: this prints the index of the best candidate in Ws, not the vector itself.
print("Best Weight:", best_weights_idx)

# Compare against the manually maintained personal best and show the weights
# so they can be copied into the personal-best cell by hand.
if best_weight_reward > personal_best_reward:
    print("It's a NEW LAP RECORD!: {0}".format(best_weight_reward))
    print(best_weights)


0/100: Average Reward: 49.61 Total Reward: 4961.0
1/100: Average Reward: 25.28 Total Reward: 2528.0
2/100: Average Reward: 86.94 Total Reward: 8694.0
3/100: Average Reward: 89.66 Total Reward: 8966.0
4/100: Average Reward: 8.9 Total Reward: 890.0
5/100: Average Reward: 108.13 Total Reward: 10813.0
6/100: Average Reward: 139.21 Total Reward: 13921.0
7/100: Average Reward: 39.98 Total Reward: 3998.0
8/100: Average Reward: 9.17 Total Reward: 917.0
9/100: Average Reward: 9.69 Total Reward: 969.0
10/100: Average Reward: 40.86 Total Reward: 4086.0
11/100: Average Reward: 52.21 Total Reward: 5221.0
12/100: Average Reward: 9.08 Total Reward: 908.0
13/100: Average Reward: 60.44 Total Reward: 6044.0
14/100: Average Reward: 25.94 Total Reward: 2594.0
15/100: Average Reward: 9.55 Total Reward: 955.0
16/100: Average Reward: 9.08 Total Reward: 908.0
17/100: Average Reward: 44.31 Total Reward: 4431.0
18/100: Average Reward: 9.27 Total Reward: 927.0
19/100: Average Reward: 32.85 Total Reward: 3285.0
2

In [11]:
# Display the winning weight vector by looking it up through its recorded index.
Ws[best_weights_idx]

array([ 0.11573203, -0.28547568,  0.769634  ,  0.14903186])

In [12]:
def render_model(W, num_batches=5, max_steps=1000, render=False):
    ''' Run weight vector W on the global `env` for a few episodes, printing each episode's reward.

    W           -- weight vector handed to model_step at every step.
    num_batches -- number of episodes to run (default 5, the original value).
    max_steps   -- upper bound on steps per episode (default 1000, the original value).
    render      -- when True, call env.render() before every step. Off by default,
                   matching the original, where the call was commented out because
                   it did not seem to work from MyBinder.

    Returns (total_reward, average_reward): the reward summed over all episodes
    and that sum divided by num_batches.

    Raises ValueError if num_batches is less than 1, since the average would be
    undefined.
    '''
    if num_batches < 1:
        raise ValueError("num_batches must be at least 1")

    total_reward = 0
    for i_episode in range(num_batches):
        observation = env.reset()
        batch_reward = 0
        # Announce the episode before it starts, since a rendered run can take a while.
        print("{0}/{1}:".format(i_episode, num_batches))
        for _ in range(max_steps):
            if render:
                env.render()
            action = model_step(W,observation)
            observation, reward, done, info = env.step(action)
            batch_reward += reward
            if done:
                # The environment ended the episode before the step cap.
                break
        print("{0}/{1}: Batch Reward: {2}".format(i_episode, num_batches, batch_reward))
        total_reward += batch_reward
    # float() keeps this a true division on Python 2 (the notebook only imports
    # print_function from __future__), even if rewards were integers.
    average_reward = total_reward / float(num_batches)
    return total_reward,average_reward

In [13]:
# Replay the best candidate from this search for a few episodes, printing each episode's reward.
render_model(Ws[best_weights_idx])

0/5:
0/5: Batch Reward: 1000.0
1/5:
1/5: Batch Reward: 1000.0
2/5:
2/5: Batch Reward: 1000.0
3/5:
3/5: Batch Reward: 1000.0
4/5:
4/5: Batch Reward: 1000.0


(5000.0, 1000.0)

In [14]:
# Replay the manually recorded personal-best weights the same way, for comparison.
render_model(personal_best_weight)

0/5:
0/5: Batch Reward: 1000.0
1/5:
1/5: Batch Reward: 1000.0
2/5:
2/5: Batch Reward: 1000.0
3/5:
3/5: Batch Reward: 1000.0
4/5:
4/5: Batch Reward: 1000.0


(5000.0, 1000.0)