## Model-based Reinforcement Learning

In [1]:
import tensorflow as tf
import numpy as np

tf.logging.set_verbosity(tf.logging.ERROR)

In [2]:
import gym

env = gym.make('CartPole-v0')

### ***Hyper parameters***

In [3]:
# H: number of hidden layer neurons
# gamma: discount factor
# decay_rate: decay factor for RMSProp leaky sum of grad^2
# resume: resume from previous checkpoint?
# model_bs: batch size when learning from model
# real_bs: batch size when learning from real environment

H = 8
lr = 1e-2
gamma = 0.99
decay_rate = 0.99
resume = False
model_bs = 3
real_bs = 3

### ***Policy gradient network***

In [4]:
tf.reset_default_graph()
observations = tf.placeholder(shape=[None,4], dtype=tf.float32, name='input_x')

W1 = tf.get_variable(shape=[4,H], name='W1', initializer=tf.contrib.layers.xavier_initializer())
layer1 = tf.nn.relu(tf.matmul(observations, W1))

W2 = tf.get_variable(shape=[H,1], name='W2', initializer=tf.contrib.layers.xavier_initializer())
score = tf.nn.relu(tf.matmul(layer1, W2))

probability = tf.nn.sigmoid(score)

tvars = tf.trainable_variables()
input_y = tf.placeholder(shape=[None,1], dtype=tf.float32, name='input_y')
advantages = tf.placeholder(dtype=tf.float32, name='reward_signal')

loglik = tf.log(input_y*(input_y-probability)+(1-input_y)*(input_y+probability))
loss = -tf.reduce_mean(loglik*advantages)
newGrads = tf.gradients(loss, tvars)

adam = tf.train.AdamOptimizer(learning_rate=lr)
W1Grad = tf.placeholder(tf.float32, name='batch_grad1')
W2Grad = tf.placeholder(tf.float32, name='batch_grad2')
batchGrad = [W1Grad, W2Grad]
updateGrads = adam.apply_gradients(zip(batchGrad, tvars))

### ***Model network***

In [6]:
mH = 256
input_data = tf.placeholder(shape=[None,5], dtype=tf.float32)
previous_state = tf.placeholder(shape=[None,5], dtype=tf.float32, name='previous_state')

W1M = tf.get_variable(shape=[5,mH], name='W1M', initializer=tf.contrib.layers.xavier_initializer())
B1M = tf.Variable(tf.zeros([mH]), name='B1M')
layer1M = tf.nn.relu(tf.matmul(previous_state, W1M)+B1M)

W2M = tf.get_variable(shape=[mH,mH], name='W2M', initializer=tf.contrib.layers.xavier_initializer())
B2M = tf.Variable(tf.zeros([mH]), name='B2M')
layer2M = tf.nn.relu(tf.matmul(layer1M, W2M)+B2M)

WO = tf.get_variable(shape=[mH,4], name='WO', initializer=tf.contrib.layers.xavier_initializer())
WR = tf.get_variable(shape=[mH,1], name='WR', initializer=tf.contrib.layers.xavier_initializer())
WD = tf.get_variable(shape=[mH,1], name='WD', initializer=tf.contrib.layers.xavier_initializer())

bO = tf.Variable(tf.zeros([4]), name='bO')
bR = tf.Variable(tf.zeros([1]), name='bR')
bD = tf.Variable(tf.zeros([1]), name='bD')

predicted_observation = tf.matmul(layer2M, WO, name='predicted_observation')+bO
predicted_reward = tf.matmul(layer2M, WR, name='predicted_reward')+bR
predicted_done = tf.sigmoid(tf.matmul(layer2M, WD, name='predicted_done')+bD)

true_observation = tf.placeholder(shape=[None,4], dtype=tf.float32, name='true_observation')
true_reward = tf.placeholder(shape=[None,1], dtype=tf.float32, name='true_reward')
true_done = tf.placeholder(shape=[None,1], dtype=tf.float32, name='true_done')

predicted_state = tf.concat([predicted_observation, predicted_reward, predicted_done], 1)

observation_loss = tf.square(true_observation-predicted_observation)
reward_loss = tf.square(true_reward-predicted_reward)
done_loss = tf.multiply(predicted_done, true_done)+tf.multiply(1-predicted_done, 1-true_done)
done_loss = -tf.log(done_loss)

model_loss = tf.reduce_mean(observation_loss+done_loss+reward_loss)
model_adam = tf.train.AdamOptimizer(learning_rate=lr)
updateModel = model_adam.minimize(model_loss)

### ***Helper-functions***

In [11]:
# initializer gradBuffer
def resetGradBuffer(gradBuffer) :
    for idx, grad in enumerate(gradBuffer) :
        gradBuffer[idx] = grad*0
        
    return gradBuffer

# get discounting reward
def discount_rewards(r) :
    discounted = np.zeros_like(r)
    running_add = 0
    
    for t in reversed(range(0, r.size)) :
        running_add = running_add*gamma+r[t]
        discounted[t] = running_add
        
    return discounted

# to produce a new state when given a previous state and action
def stepModel(sess, xs, action) :
    toFeed = np.reshape(np.hstack([xs[-1][0], np.array(action)]), [1,5])
    myPredict = sess.run([predicted_state], feed_dict={previous_state: toFeed})
    reward = myPredict[0][:,4]
    
    observation = myPredict[0][:,0:4]
    observation[:,0] = np.clip(observation[:,0], -2.4, 2.4)
    observation[:,2] = np.clip(observation[:,2], -0.4, 0.4)
    
    doneP = np.clip(myPredict[0][:,5], 0, 1)
    
    if doneP > 0.1 or len(xs) >= 300 :
        done = True
    else :
        done = False
        
    return observation, reward, done

### ***Training***

In [16]:
xs, drs, ys, ds = [], [], [], []
running_reward = None
reward_sum = 0
episode_number = 1
EPOCHS = 5000
real_episodes = 1
init = tf.global_variables_initializer()
batch_size = real_bs

drawFromModel = False
trainTheModel = True
trainThePolicy = False
switch_point = 1

with tf.Session() as sess :
    rendering = False
    sess.run(init)
    observation = env.reset()
    x = observation
    gradBuffer = sess.run(tvars)
    gradBuffer = resetGradBuffer(gradBuffer)
    
    while episode_number <= EPOCHS :
        if (reward_sum/batch_size > 150 and not drawFromModel) or rendering :
            rendering = True
            
        x = np.reshape(observation, [1,4])
        
        tfprob = sess.run(probability, feed_dict={observations: x})
        action = 1 if np.random.uniform() < tfprob else 0
        
        xs.append(x)
        
        y = 0 if action else 1
        ys.append(y)
        
        if not drawFromModel :
            observation, reward, done, info = env.step(action)
        else :
            observation, reward, done = stepModel(sess, xs, action)
            
        reward_sum += reward
        
        ds.append(done*1)
        drs.append(reward)
        
        if done :
            if not drawFromModel :
                real_episodes += 1
                
            episode_number += 1
            
            epx = np.vstack(xs)
            epy = np.vstack(ys)
            epr = np.vstack(drs)
            epd = np.vstack(ds)
            xs, ys, drs, ds = [], [], [], []
            
            if trainTheModel :
                actions = np.array([np.abs(y-1) for y in epy][:-1])
                state_prevs = epx[:-1,:]
                state_prevs = np.hstack([state_prevs, actions])
                
                state_nexts = epx[1:,:]
                rewards = np.array(epr[1:,:])
                dones = np.array(epd[1:,:])
                
                state_nextsAll = np.hstack([state_nexts, rewards, dones])
                
                feed_dict = {previous_state: state_prevs, true_observation: state_nexts, true_done: dones, true_reward: rewards}
                loss, pState, _ = sess.run([model_loss, predicted_state, updateModel], feed_dict=feed_dict)
                
            if trainThePolicy :
                discounted_epr = discount_rewards(epr).astype('float32')
                discounted_epr -= np.mean(discounted_epr)
                discounted_epr /= np.std(discounted_epr)
                
                tGrad = sess.run(newGrads, feed_dict={observations: epx, input_y: epy, advantages: discounted_epr})
                
                if np.sum(tGrad[0]==tGrad[0]) == 0 :
                    break
                    
                for idx, grad in enumerate(tGrad) :
                    gradBuffer[idx] += grad
                    
            if switch_point+batch_size == episode_number :
                switch_point = episode_number
                
                if trainThePolicy :
                    sess.run(updateGrads, feed_dict={W1Grad: gradBuffer[0], W2Grad: gradBuffer[1]})
                    gradBuffer = resetGradBuffer(gradBuffer)
                    
                running_reward = reward_sum if running_reward is None else running_reward*0.99+reward_sum*0.01
                
                if not drawFromModel :
                    print("World Perf: Episode %f. Reward %f. action %f. mean reward %f." %(real_episodes, reward_sum/real_bs, action, running_reward/real_bs))
                    
                    if reward_sum/batch_size > 200 :
                        break
                        
                reward_sum = 0
                
                if episode_number > 100 :
                    drawFromModel = not drawFromModel
                    trainTheModel = not trainTheModel
                    trainThePolicy = not trainThePolicy
                    
            if drawFromModel :
                observation = np.random.uniform(-0.1, 0.1, [4])
                batch_size = model_bs
            else :
                observation = env.reset()
                batch_size = real_bs
                
print('\nReal episodes is : ', real_episodes)

World Perf: Episode 4.000000. Reward 20.000000. action 1.000000. mean reward 20.000000.
World Perf: Episode 7.000000. Reward 28.000000. action 1.000000. mean reward 20.080000.
World Perf: Episode 10.000000. Reward 35.000000. action 0.000000. mean reward 20.229200.
World Perf: Episode 13.000000. Reward 21.333333. action 1.000000. mean reward 20.240241.
World Perf: Episode 16.000000. Reward 19.333333. action 0.000000. mean reward 20.231172.
World Perf: Episode 19.000000. Reward 25.000000. action 0.000000. mean reward 20.278861.
World Perf: Episode 22.000000. Reward 19.666667. action 1.000000. mean reward 20.272739.
World Perf: Episode 25.000000. Reward 17.666667. action 0.000000. mean reward 20.246678.
World Perf: Episode 28.000000. Reward 23.666667. action 1.000000. mean reward 20.280878.
World Perf: Episode 31.000000. Reward 28.333333. action 0.000000. mean reward 20.361402.
World Perf: Episode 34.000000. Reward 14.666667. action 1.000000. mean reward 20.304455.
World Perf: Episode 37.

World Perf: Episode 286.000000. Reward 24.666667. action 1.000000. mean reward 24.948709.
World Perf: Episode 289.000000. Reward 13.666667. action 1.000000. mean reward 24.702614.
World Perf: Episode 292.000000. Reward 17.000000. action 0.000000. mean reward 24.604385.
World Perf: Episode 295.000000. Reward 20.333333. action 0.000000. mean reward 24.409613.
World Perf: Episode 298.000000. Reward 37.000000. action 1.000000. mean reward 24.511774.
World Perf: Episode 301.000000. Reward 16.000000. action 0.000000. mean reward 24.739769.
World Perf: Episode 304.000000. Reward 26.333333. action 0.000000. mean reward 24.652517.
World Perf: Episode 307.000000. Reward 14.000000. action 0.000000. mean reward 24.366297.
World Perf: Episode 310.000000. Reward 18.666667. action 1.000000. mean reward 24.134148.
World Perf: Episode 313.000000. Reward 23.666667. action 0.000000. mean reward 23.958311.
World Perf: Episode 316.000000. Reward 19.000000. action 1.000000. mean reward 23.741823.
World Perf

World Perf: Episode 562.000000. Reward 13.000000. action 0.000000. mean reward 36.560833.
World Perf: Episode 565.000000. Reward 30.333333. action 0.000000. mean reward 39.039845.
World Perf: Episode 568.000000. Reward 14.333333. action 1.000000. mean reward 38.494415.
World Perf: Episode 571.000000. Reward 27.666667. action 1.000000. mean reward 38.085220.
World Perf: Episode 574.000000. Reward 14.333333. action 1.000000. mean reward 37.515320.
World Perf: Episode 577.000000. Reward 13.666667. action 0.000000. mean reward 37.021221.
World Perf: Episode 580.000000. Reward 14.666667. action 1.000000. mean reward 36.599194.
World Perf: Episode 583.000000. Reward 16.000000. action 0.000000. mean reward 36.474545.
World Perf: Episode 586.000000. Reward 30.000000. action 1.000000. mean reward 36.265682.
World Perf: Episode 589.000000. Reward 22.000000. action 1.000000. mean reward 35.873997.
World Perf: Episode 592.000000. Reward 29.333333. action 0.000000. mean reward 35.948647.
World Perf

World Perf: Episode 841.000000. Reward 32.000000. action 0.000000. mean reward 27.122278.
World Perf: Episode 844.000000. Reward 19.666667. action 1.000000. mean reward 26.861830.
World Perf: Episode 847.000000. Reward 27.333333. action 1.000000. mean reward 26.745920.
World Perf: Episode 850.000000. Reward 14.000000. action 1.000000. mean reward 26.490095.
World Perf: Episode 853.000000. Reward 33.666667. action 1.000000. mean reward 26.362869.
World Perf: Episode 856.000000. Reward 32.000000. action 0.000000. mean reward 26.284185.
World Perf: Episode 859.000000. Reward 13.666667. action 0.000000. mean reward 26.008108.
World Perf: Episode 862.000000. Reward 19.333333. action 0.000000. mean reward 25.746595.
World Perf: Episode 865.000000. Reward 15.000000. action 0.000000. mean reward 26.971769.
World Perf: Episode 868.000000. Reward 12.666667. action 1.000000. mean reward 27.111052.
World Perf: Episode 871.000000. Reward 28.666667. action 0.000000. mean reward 26.927345.
World Perf

World Perf: Episode 1114.000000. Reward 18.333333. action 1.000000. mean reward 33.611271.
World Perf: Episode 1117.000000. Reward 20.666667. action 0.000000. mean reward 33.219845.
World Perf: Episode 1120.000000. Reward 28.666667. action 0.000000. mean reward 33.005207.
World Perf: Episode 1123.000000. Reward 16.000000. action 1.000000. mean reward 32.725849.
World Perf: Episode 1126.000000. Reward 21.666667. action 1.000000. mean reward 32.460846.
World Perf: Episode 1129.000000. Reward 18.000000. action 0.000000. mean reward 32.154789.
World Perf: Episode 1132.000000. Reward 28.666667. action 0.000000. mean reward 31.924726.
World Perf: Episode 1135.000000. Reward 49.000000. action 1.000000. mean reward 31.879950.
World Perf: Episode 1138.000000. Reward 23.333333. action 1.000000. mean reward 31.672838.
World Perf: Episode 1141.000000. Reward 24.333333. action 1.000000. mean reward 31.380806.
World Perf: Episode 1144.000000. Reward 18.333333. action 1.000000. mean reward 31.032249.

World Perf: Episode 1393.000000. Reward 16.000000. action 0.000000. mean reward 26.239737.
World Perf: Episode 1396.000000. Reward 21.000000. action 0.000000. mean reward 26.066895.
World Perf: Episode 1399.000000. Reward 22.333333. action 1.000000. mean reward 25.951075.
World Perf: Episode 1402.000000. Reward 12.666667. action 1.000000. mean reward 25.884535.
World Perf: Episode 1405.000000. Reward 21.000000. action 1.000000. mean reward 25.684275.
World Perf: Episode 1408.000000. Reward 21.666667. action 1.000000. mean reward 25.561884.
World Perf: Episode 1411.000000. Reward 33.666667. action 0.000000. mean reward 27.506384.
World Perf: Episode 1414.000000. Reward 24.333333. action 1.000000. mean reward 27.586182.
World Perf: Episode 1417.000000. Reward 27.333333. action 1.000000. mean reward 27.434164.
World Perf: Episode 1420.000000. Reward 20.666667. action 0.000000. mean reward 27.171844.
World Perf: Episode 1423.000000. Reward 29.000000. action 0.000000. mean reward 27.046503.

World Perf: Episode 1672.000000. Reward 41.666667. action 0.000000. mean reward 23.894197.
World Perf: Episode 1675.000000. Reward 14.333333. action 0.000000. mean reward 23.723923.
World Perf: Episode 1678.000000. Reward 25.666667. action 1.000000. mean reward 23.567999.
World Perf: Episode 1681.000000. Reward 20.333333. action 1.000000. mean reward 23.370895.
World Perf: Episode 1684.000000. Reward 16.333333. action 1.000000. mean reward 23.175203.
World Perf: Episode 1687.000000. Reward 13.000000. action 0.000000. mean reward 23.039980.
World Perf: Episode 1690.000000. Reward 22.666667. action 1.000000. mean reward 23.000658.
World Perf: Episode 1693.000000. Reward 16.000000. action 1.000000. mean reward 22.782225.
World Perf: Episode 1696.000000. Reward 19.000000. action 0.000000. mean reward 22.698626.
World Perf: Episode 1699.000000. Reward 17.000000. action 0.000000. mean reward 22.513479.
World Perf: Episode 1702.000000. Reward 17.333333. action 1.000000. mean reward 22.383787.

World Perf: Episode 1945.000000. Reward 23.000000. action 0.000000. mean reward 27.145586.
World Perf: Episode 1948.000000. Reward 21.000000. action 1.000000. mean reward 26.917532.
World Perf: Episode 1951.000000. Reward 20.333333. action 1.000000. mean reward 26.785295.
World Perf: Episode 1954.000000. Reward 24.666667. action 1.000000. mean reward 26.679064.
World Perf: Episode 1957.000000. Reward 36.000000. action 0.000000. mean reward 26.600637.
World Perf: Episode 1960.000000. Reward 25.333333. action 1.000000. mean reward 26.403154.
World Perf: Episode 1963.000000. Reward 23.000000. action 0.000000. mean reward 26.342203.
World Perf: Episode 1966.000000. Reward 23.000000. action 0.000000. mean reward 26.232216.
World Perf: Episode 1969.000000. Reward 18.000000. action 1.000000. mean reward 26.989401.
World Perf: Episode 1972.000000. Reward 13.666667. action 0.000000. mean reward 26.675489.
World Perf: Episode 1975.000000. Reward 24.333333. action 0.000000. mean reward 26.485975.

World Perf: Episode 2224.000000. Reward 15.000000. action 0.000000. mean reward 22.209459.
World Perf: Episode 2227.000000. Reward 18.333333. action 0.000000. mean reward 22.038946.
World Perf: Episode 2230.000000. Reward 20.000000. action 1.000000. mean reward 21.942160.
World Perf: Episode 2233.000000. Reward 18.000000. action 1.000000. mean reward 21.929031.
World Perf: Episode 2236.000000. Reward 15.666667. action 1.000000. mean reward 21.789824.
World Perf: Episode 2239.000000. Reward 17.333333. action 1.000000. mean reward 21.835510.
World Perf: Episode 2242.000000. Reward 20.666667. action 0.000000. mean reward 21.755476.
World Perf: Episode 2245.000000. Reward 19.666667. action 0.000000. mean reward 22.955725.
World Perf: Episode 2248.000000. Reward 17.666667. action 0.000000. mean reward 22.764265.
World Perf: Episode 2251.000000. Reward 28.666667. action 1.000000. mean reward 22.714914.
World Perf: Episode 2254.000000. Reward 35.666667. action 1.000000. mean reward 22.780844.