Trains an agent with (stochastic) Policy Gradients on Pong. Uses OpenAI Gym.

1 -> Import modules

In [1]:
from __future__ import print_function

import os
import random
import time

import gym
import numpy as np

2 -> Define parameters

In [2]:
# Network shape.
input_dim = 80*80 # length of the flattened frame produced by preprocess()
hidden_dim = 200 # number of hidden units
output_dim = 1 # a single output probability

# Optimisation.
learning_rate = 1e-4
weight_decay = 0.99 # NOTE(review): not referenced by any cell visible in this notebook
gamma = 0.99 # reward discount factor

# Schedule.
batch_size = 5 # update weights every %batch_size% episodes
test_interval = 10 # test every %test_interval% episodes
test_episode = 10 # run %test_episode% episodes per test
snapshot = 20 # save model every %snapshot% episodes

# Output locations.
snapshot_path = "./model/"
log_dir = "./log/"
# Create the output directories up front: open() below (and np.save() in
# save_model) raise if the target directory is missing, which is always the
# case on a fresh checkout.
for _out_dir in (snapshot_path, log_dir):
    if _out_dir and not os.path.isdir(_out_dir):
        os.makedirs(_out_dir)
log_path = log_dir + str(time.time()) + ".log"
logfile = open(log_path, "w")

3 -> Initialize policy parameters

In [3]:
# Policy network weights: W1 maps input -> hidden, W2 maps hidden -> output.
# Weights are drawn from a zero-mean Gaussian scaled by 1/sqrt(fan_in).
# Uniform [0, 1) weights (np.random.rand) are all positive and unscaled, so the
# sum over hundreds of hidden units drives the output sigmoid into saturation
# and every hidden unit starts out responding with the same sign.
model = {}
model["W1"] = np.random.randn(hidden_dim, input_dim) / np.sqrt(input_dim)
model["W2"] = np.random.randn(output_dim, hidden_dim) / np.sqrt(hidden_dim)

4 -> Preprocess

In [4]:
def preprocess(image, background=(144, 109)):
    """Turn a raw game frame into a flat binary float vector.

    The frame is cropped to rows 35..194, subsampled by 2 along both spatial
    axes keeping only channel 0, background pixels are zeroed and every
    remaining pixel is set to 1.

    Parameters
    ----------
    image : ndarray
        Frame indexed as [row, column, channel]; a 210x160x3 uint8 frame
        yields an 80*80 = 6400 element result.
    background : iterable of int, optional
        Channel-0 values treated as background and erased. The default covers
        both background shades used by the reference Pong preprocessing
        (144 and 109); erasing only 109 leaves the 144 background to be
        binarised to 1, which turns the whole frame into ones.

    Returns
    -------
    ndarray
        1-D float64 array of zeros and ones.
    """
    # Slicing returns a view; copy so the caller's frame is not overwritten.
    frame = image[35:195][::2, ::2, 0].copy()
    for value in background:
        frame[frame == value] = 0 # erase background
    frame[frame != 0] = 1 # paddles, ball set to 1
    # np.float was removed in NumPy 1.24; np.float64 is the same dtype.
    return frame.astype(np.float64).ravel()

5 -> Policy forward

In [5]:
def softmax(x):
    """Element-wise logistic sigmoid 1 / (1 + exp(-x)).

    Despite the name this is not a softmax over several classes; it squashes
    each element of ``x`` independently.
    """
    denominator = 1.0 + np.exp(-x)
    return 1.0 / denominator
def forward(x):
    """Run the two-layer policy network on one state.

    ``x`` is reshaped to an (input_dim, 1) column, multiplied by
    ``model["W1"]``, negative activations are clamped to zero, and the result
    is multiplied by ``model["W2"]`` and passed through ``softmax``.

    Returns
    -------
    (hidden, prob)
        The clamped hidden activations and the network's output after
        ``softmax``.
    """
    column = x.reshape(input_dim, 1)
    pre_activation = np.dot(model["W1"], column)
    # Clamp negatives to zero; non-negative entries pass through unchanged.
    hidden = np.where(pre_activation < 0, 0, pre_activation)
    logit = np.dot(model["W2"], hidden)
    return hidden, softmax(logit)

6 -> Policy backward

In [6]:
def backward(experience):
    """Compute the policy-gradient estimate for one recorded episode.

    Parameters
    ----------
    experience : dict
        Buffers built by ``insert_experience`` with one row per time step:
        "states" (T, input_dim), "hiddens" (T, hidden_dim), and "aprobs",
        "actions", "rewards" each (T, 1).

    Returns
    -------
    dict
        ``{"W1": ..., "W2": ...}`` with the same shapes as the entries of
        ``model``: the return-weighted sum over time steps of the gradient of
        the log-probability of the taken action.

    Notes
    -----
    The result equals the per-step outer-product formulation, but it is
    computed with two matrix products. A (T, hidden_dim, input_dim) tensor
    would need roughly 10 GB for a 1000-step episode at the default sizes.
    The buffers in ``experience`` are left unmodified.
    """
    # Discount a private float copy so the caller's reward buffer is never
    # overwritten, whichever way discounted_rewards treats its argument.
    returns = discounted_rewards(np.array(experience["rewards"], dtype=np.float64))
    advantages = returns.reshape(-1, 1)

    # Gradient of the log-probability w.r.t. the output logit. "actions" is
    # stored as 0/1, so (1 - action) is the label matched against "aprobs".
    dlogit = (1 - experience["actions"] - experience["aprobs"]).reshape(-1, 1)
    weighted = advantages * dlogit # (T, 1)

    hiddens = experience["hiddens"]
    grad = {}
    grad["W2"] = np.dot(weighted.T, hiddens) # (1, hidden_dim)

    # Back-propagate through the hidden layer: only units with a positive
    # activation pass gradient.
    relu_mask = hiddens > 0
    dhidden = weighted * np.asarray(model["W2"]).reshape(1, hidden_dim) * relu_mask
    states = experience["states"].reshape(-1, input_dim)
    grad["W1"] = np.dot(dhidden.T, states) # (hidden_dim, input_dim)
    return grad

In [7]:
def zero_grad():
    """Return a fresh gradient accumulator.

    The result has one all-zero array per entry of ``model``, keyed by the
    same name and with the same shape as the corresponding weight.
    """
    return {name: np.zeros(weight.shape) for name, weight in model.items()}

In [8]:
def discounted_rewards(rewards, discount=None):
    """Return the discounted cumulative reward at every time step.

    Entry ``t`` of the result is ``r[t] + discount * r[t+1] +
    discount**2 * r[t+2] + ...`` accumulated to the end of the sequence.

    Parameters
    ----------
    rewards : array-like
        Per-step rewards, indexed by time along the first axis.
    discount : float, optional
        Discount factor; defaults to the module-level ``gamma``.

    Returns
    -------
    ndarray
        A new float64 array with the same shape as ``rewards``. The input is
        not modified, and integer inputs are not truncated.
    """
    if discount is None:
        discount = gamma
    # Accumulate in a float copy: writing in place would clobber the caller's
    # data and silently truncate when the input has an integer dtype.
    returns = np.array(rewards, dtype=np.float64)
    for t in range(len(returns) - 2, -1, -1):
        returns[t] = returns[t] + discount * returns[t + 1]
    return returns

In [9]:
def update_weights(grad):
    """Move every weight in ``model`` along its gradient.

    Each entry is replaced by ``weight + learning_rate * grad[name]``; the
    step is added, not subtracted. ``grad`` must hold an entry for every key
    of ``model``.
    """
    for name in list(model.keys()):
        step = learning_rate * grad[name]
        model[name] = model[name] + step

In [10]:
def clear_experience():
    """Return an empty episode buffer.

    The buffer is a dict with the keys "states", "hiddens", "aprobs",
    "actions" and "rewards", each mapped to its own empty array.
    """
    fields = ("states", "hiddens", "aprobs", "actions", "rewards")
    return {field: np.array([]) for field in fields}

In [11]:
def insert_experience(experience, state, hidden, aprob, action, reward):
    """Append one time step to the episode buffer and return the buffer.

    ``state`` and ``hidden`` are reshaped to single rows of width
    ``input_dim`` and ``hidden_dim``; ``aprob``, ``action`` and ``reward``
    become 1x1 rows. If the buffer is empty the rows become the buffers,
    otherwise they are stacked underneath the existing rows.
    """
    # Decide once, before any field changes, whether this is the first step.
    buffer_is_empty = experience["states"].shape[0] == 0

    def _append(key, row):
        # The first row replaces the empty placeholder; later rows are stacked.
        if buffer_is_empty:
            experience[key] = row
        else:
            experience[key] = np.vstack((experience[key], row))

    _append("states", state.reshape(1, input_dim))
    _append("hiddens", hidden.reshape(1, hidden_dim))
    _append("aprobs", np.array(aprob).reshape(1, 1))
    _append("actions", np.array(action).reshape(1, 1))
    _append("rewards", np.array(reward).reshape(1, 1))
    return experience

In [12]:
def save_model(episode_num):
    """Write the current ``model`` dict to a .npy snapshot file.

    The file is named ``Pongv0-PG-episode-<episode_num>.npy`` inside
    ``snapshot_path``. The action is reported on stdout and in the log file.

    ``model`` is a dict, so np.save stores it as a pickled object; loading it
    back requires ``np.load(path, allow_pickle=True).item()``.
    """
    prefix = "Pongv0-PG-episode-"
    suffix = ".npy"
    # np.save does not create missing directories.
    if snapshot_path and not os.path.isdir(snapshot_path):
        os.makedirs(snapshot_path)
    path = snapshot_path + prefix + str(episode_num) + suffix
    np.save(path, model)
    message = "Save model " + path
    print(message)
    # Terminate the record; without the newline all log entries run together.
    logfile.write(message + "\n")

In [None]:
def test():
    """Play ``test_episode`` episodes with the current policy and report the mean reward.

    Uses the module-level ``env``, which must already exist when this is
    called. Actions are sampled from the policy exactly as during training,
    but no experience is recorded and no weights are changed. Progress and
    the average total reward per episode go to stdout and to the log file.
    """
    test_num = 0
    total_reward = 0
    while test_num < test_episode:
        observation = env.reset()
        prev_obs = None
        cur_obs = preprocess(observation)
        while True:
            # The network input is the difference between consecutive frames
            # (the first frame is used as-is).
            state = cur_obs - (prev_obs if prev_obs is not None else 0)
            hidden, aprob = forward(state)
            # aprob is treated as the probability of choosing action 2.
            action = 2 if random.random() <= aprob else 3
            observation, reward, done, info = env.step(action)
            total_reward += reward
            prev_obs = cur_obs
            cur_obs = preprocess(observation)
            if done:
                message = "Test episode " + str(test_num+1) + " terminated!"
                print(message)
                # Newline-terminate each record so the log stays line-oriented.
                logfile.write(message + "\n")
                break
        test_num = test_num + 1
    avg_reward = float(total_reward) / float(test_episode)
    message = "Average rewards: " + str(avg_reward)
    print(message)
    logfile.write(message + "\n")

7 -> Main loop

In [None]:
# Training driver: play episodes indefinitely, accumulate the per-episode
# gradients, and apply them every `batch_size` episodes. The policy is
# evaluated every `test_interval` episodes and saved every `snapshot` episodes.
env = gym.make("Pong-v0")
episode_num = 1
grads = zero_grad()
try:
    while True:
        observation = env.reset()
        experience = clear_experience()
        prev_obs = None
        cur_obs = preprocess(observation)
        while True:
            # The network input is the difference between consecutive frames
            # (the first frame is used as-is).
            state = cur_obs - (prev_obs if prev_obs is not None else 0)
            hidden, aprob = forward(state)
            # aprob is treated as the probability of choosing action 2.
            action = 2 if random.random() <= aprob else 3
            observation, reward, done, info = env.step(action)
            # Store the action as 0 (env action 2) or 1 (env action 3);
            # backward() turns it into a label with (1 - action).
            action = 0 if action == 2 else 1
            experience = insert_experience(experience, state, hidden, aprob, action, reward)
            prev_obs = cur_obs
            cur_obs = preprocess(observation)
            if done:
                message = "Train episode " + str(episode_num) + " terminated!"
                print(message)
                # Newline-terminate each record so the log stays line-oriented.
                logfile.write(message + "\n")
                grad = backward(experience)
                for name in grad:
                    grads[name] = grads[name] + grad[name]
                break
        if episode_num % batch_size == 0:
            update_weights(grads)
            grads = zero_grad()
        if episode_num % test_interval == 0:
            test()
        if episode_num % snapshot == 0:
            save_model(episode_num)
        episode_num = episode_num + 1
finally:
    # The loop never exits normally, so a close() placed after it would be
    # unreachable. Closing here flushes the log when the run is interrupted
    # or fails; the exception itself still propagates.
    logfile.close()

[2017-04-13 21:19:05,409] Making new env: Pong-v0


Train episode 1 terminated!
Train episode 2 terminated!
Train episode 3 terminated!
Train episode 4 terminated!
Train episode 5 terminated!
Train episode 6 terminated!
Train episode 7 terminated!
Train episode 8 terminated!
Train episode 9 terminated!
Train episode 10 terminated!
Test episode 1 terminated!
Test episode 2 terminated!
Test episode 3 terminated!
Test episode 4 terminated!
Test episode 5 terminated!
Test episode 6 terminated!
Test episode 7 terminated!
Test episode 8 terminated!
Test episode 9 terminated!
Test episode 10 terminated!
Average rewards: -20.3
Train episode 11 terminated!
Train episode 12 terminated!
Train episode 13 terminated!
Train episode 14 terminated!
Train episode 15 terminated!
Train episode 16 terminated!
Train episode 17 terminated!
Train episode 18 terminated!
Train episode 19 terminated!
Train episode 20 terminated!
Test episode 1 terminated!
Test episode 2 terminated!
Test episode 3 terminated!
Test episode 4 terminated!
Test episode 5 terminated!
