In [51]:
import numpy as np
import gym
import time
import tensorflow as tf
import math

from keras.models import Sequential
from keras.layers import Dense
from keras.optimizers import Adam

Meaning of actions:
- action = 0 - go left \
- action = 1 - stand still \
- action = 2 - go right

In [29]:
env = gym.make('MountainCar-v0')
obs = env.reset()

for t in range(10):
    env.render()
    # observation, reward, done, info = env.step(env.action_space.sample())
    obs, reward, done, info = env.step(2)
    time.sleep(0.01)
    print(obs, reward, done, info)
    
env.close()

[-0.56521661  0.00132115] -1.0 False {}
[-0.56258414  0.00263248] -1.0 False {}
[-0.55865994  0.0039242 ] -1.0 False {}
[-0.55347326  0.00518667] -1.0 False {}
[-0.54706283  0.00641043] -1.0 False {}
[-0.53947656  0.00758627] -1.0 False {}
[-0.53077125  0.00870531] -1.0 False {}
[-0.52101216  0.0097591 ] -1.0 False {}
[-0.51027246  0.0107397 ] -1.0 False {}
[-0.49863269  0.01163978] -1.0 False {}


In [3]:
### Building the policy network ###
model = Sequential()
model.add(Dense(8, input_shape = (2, ), activation = 'relu'))
model.add(Dense(3, activation = 'softmax'))

# Parameters
learning_rate = 0.001
batch_size = 1

opt = Adam(learning_rate = learning_rate)
model.compile(optimizer = opt, loss = 'mse')
model.summary()

Model: "sequential_1"
_________________________________________________________________
Layer (type)                 Output Shape              Param #   
dense_1 (Dense)              (None, 8)                 24        
_________________________________________________________________
dense_2 (Dense)              (None, 3)                 27        
Total params: 51
Trainable params: 51
Non-trainable params: 0
_________________________________________________________________


In [31]:
### Forward pass of the policy network (one fully-connected layer) ###
def forward_pass(x, w, b):
    return np.dot(x, w) + b

In [None]:
### Initialize network ### 
w = np.random.normal(size = (num_features))

In [70]:
class simple_perceptron:
    
    def __init__(self, num_inputs, num_outputs):
        self.num_inputs = num_inputs
        self.num_outputs = num_outputs
        self.w = np.random.normal(size = (num_inputs, num_outputs))
        self.b = np.random.normal(size = num_outputs)
        
    # Using softmax as the activation function!
    def forward_pass(self, x):
        h = np.dot(x, self.w) + self.b
        return np.exp(h) / np.sum(np.exp(h))
    
    def get_w(self):
        return self.w
    
    def get_b(self):
        return self.b

In [69]:
model = simple_perceptron(2, 3)
x = np.array([1, 2])
print(x.shape)
y = model.forward_pass(x)
print(y)
#print(model.get_w().shape)
#print(model.get_b().shape)

(2,)
[0.98596315 0.00650538 0.00753146]


In [4]:
### Trying to make some initial predictions to make sure dimensions etc are as expected ###
print(obs.shape)
obs = np.reshape(obs, (1, 2))
#t = np.array([[1, 2], [3, 4], [5, 6], [7, 8]])
#print(t, t.shape)
print('obs = ', obs, obs.shape)
pred = model.predict(obs)
#pred = model.predict(t)
print(pred)

(2,)
obs =  [[-0.47847833  0.00948019]] (1, 2)
[[0.40123922 0.29580644 0.30295435]]


In [5]:
print(pred)

[[0.40123922 0.29580644 0.30295435]]


In [6]:
samples = tf.random.categorical(tf.math.log(pred), 1)
samples = np.array(samples)
print(samples, type(samples), samples.shape)

[[1]] <class 'numpy.ndarray'> (1, 1)


In [21]:
batch_size = 32 # Number of episodes to run before updating the network
num_epochs = 1
t_max = 100 # Number of time-steps before the episode is automatically terminated

env = gym.make('MountainCar-v0')


def train_one_epoch(policy_network, num_episodes):
    
    batch_obs = []
    batch_rets = []
    batch_lens = []
    
    for episode in range(num_episodes):
    
        obs = env.reset()             # Initial observation
        obs = np.reshape(obs, (1, 2)) # Reshape to make it fit the network
        
        ep_obs = []
        ep_ret = 0
    
        for t in range(t_max):

            env.render()
            time.sleep(0.01) # For visualizing the episode
            
            pred = policy_network.predict(obs)                # Making a prediction from the policy network, remember that the last layer is a softmax layer
            act = tf.random.categorical(tf.math.log(pred), 1) # Sample an action based on the probabilities given from the softmax layer
            act = int(np.array(act))
            #print('act = ', act)
            
            obs, reward, done, info = env.step(act)
            #print(obs, reward, done, info)
            ep_ret += reward
            ep_obs.append(obs)
            obs = np.reshape(obs, (1, 2))
            
            
            # If the agent has reached its goal
            if done:
                break
        
        env.close()
        
        batch_obs.append(ep_obs)
        batch_rets.append(ep_ret)
        batch_lens.append(t+1)
        
    return batch_obs, batch_rets, batch_lens

In [24]:
batch_obs, batch_rets, batch_lens = train_one_epoch(policy_network = model, num_episodes = 2)

In [27]:
print('batch_obs = ', batch_obs, len(batch_obs))
print('----------')
print('batch_rets = ', batch_rets)
print('batch_lens = ', batch_lens)

batch_obs =  [[array([-0.49846256, -0.0011973 ]), array([-0.4998482 , -0.00138564]), array([-0.50241182, -0.00256362]), array([-0.50613423, -0.00372241]), array([-0.51098757, -0.00485334]), array([-0.51693547, -0.0059479 ]), array([-0.52293334, -0.00599787]), array([-0.5289362 , -0.00600286]), array([-0.53389903, -0.00496283]), array([-0.53978462, -0.00588559]), array([-0.54554887, -0.00576425]), array([-0.55114861, -0.00559974]), array([-0.55554196, -0.00439335]), array([-0.5606961 , -0.00515414]), array([-0.56657259, -0.00587649]), array([-0.57312766, -0.00655508]), array([-0.57931264, -0.00618497]), array([-0.5860817 , -0.00676906]), array([-0.59138488, -0.00530318]), array([-0.59618316, -0.00479828]), array([-0.60144136, -0.00525819]), array([-0.60612102, -0.00467966]), array([-0.60918807, -0.00306705]), array([-0.61262023, -0.00343216]), array([-0.61439264, -0.00177241]), array([-6.14492481e-01, -9.98429354e-05]), array([-0.61291904,  0.00157344]), array([-0.61168368,  0.00123536]