# found in 
https://github.com/philtabor/Youtube-Code-Repository/blob/master/ReinforcementLearning/PolicyGradient/reinforce/reinforce_keras.py

In [4]:
#from agents import PolicyGradientAgent

In [29]:
from tensorflow.keras.layers import Dense, Activation, Input
from tensorflow.keras.models import Model, load_model
from tensorflow.keras.optimizers import Adam
import tensorflow.keras.backend as K
import numpy as np

class PolicyGradientAgent(object):
    def __init__(self, alpha, gamma=0.99, n_actions=4,
                 layer1_size=16, layer2_size=16, input_dims=128,
                 fname='reinforce.h5'):
        self.gamma = gamma
        self.lr = alpha
        self.G = 0
        self.input_dims = input_dims
        self.fc1_dims = layer1_size
        self.fc2_dims = layer2_size
        self.n_actions = n_actions
        self.state_memory = []
        self.action_memory = []
        self.reward_memory = []
        self.policy, self.predict = self.build_policy_network()
        self.action_space = [i for i in range(n_actions)]

        self.model_file = fname

    def build_policy_network(self):
        _input = Input(shape=(self.input_dims,))
        advantages = Input(shape=[1])
        dense1 = Dense(self.fc1_dims, activation='relu')(_input)
        dense2 = Dense(self.fc2_dims, activation='relu')(dense1)
        probs = Dense(self.n_actions, activation='softmax')(dense2)

        def custom_loss(y_true, y_pred):
            out = K.clip(y_pred, 1e-8, 1-1e-8)
            log_lik = y_true*K.log(out)

            return K.sum(-log_lik*advantages)

        policy = Model([_input, advantages], outputs=[probs])

        policy.compile(optimizer=Adam(lr=self.lr), loss=custom_loss)

        predict = Model([_input], outputs=[probs])

        return policy, predict

    def choose_action(self, observation):
        state = observation[np.newaxis, :]
        probabilities = self.predict.predict(state)[0]
        action = np.random.choice(self.action_space, p=probabilities)

        return action

    def store_transition(self, observation, action, reward):
        self.state_memory.append(observation)
        self.action_memory.append(action)
        self.reward_memory.append(reward)

    def learn(self):
        state_memory = np.array(self.state_memory)
        action_memory = np.array(self.action_memory)
        reward_memory = np.array(self.reward_memory)

        actions = np.zeros([len(action_memory), self.n_actions])
        actions[np.arange(len(action_memory)), action_memory] = 1

        G = np.zeros_like(reward_memory)
        for t in range(len(reward_memory)):
            G_sum = 0
            discount = 1
            for k in range(t, len(reward_memory)):
                G_sum += reward_memory[k] * discount
                discount *= self.gamma
            G[t] = G_sum
        mean = np.mean(G)
        std = np.std(G) if np.std(G) > 0 else 1
        self.G = (G - mean) / std

        cost = self.policy.train_on_batch([state_memory, self.G], actions)

        self.state_memory = []
        self.action_memory = []
        self.reward_memory = []

        return cost

    def save_model(self):
        self.policy.save(self.model_file)

    def load_model(self):
        self.policy = load_model(self.model_file)

In [None]:

import os
# for keras the CUDA commands must come before importing the keras libraries
os.environ['CUDA_DEVICE_ORDER'] = 'PCI_BUS_ID'
os.environ['CUDA_VISIBLE_DEVICES'] = '0'
os.environ['TF_FORCE_GPU_ALLOW_GROWTH'] = 'true'
import gym

env = gym.make('CartPole-v1')
state_size = env.observation_space.shape[0]
action_size = env.action_space.n

pg_agent = PolicyGradientAgent(alpha=0.0005, gamma=0.99, n_actions=action_size, 
                               input_dims=state_size)
n_games = 500
pg_scores = []


for i in range(n_games):
    done = False
    score = 0
    observation = env.reset()
    for time in range(500):
        action = pg_agent.choose_action(observation)
        observation_, reward, done, info = env.step(action)
        reward = reward if not done else -10
        score += reward
        pg_agent.store_transition(observation, action, reward)
        observation = observation_
        #ddqn_agent.learn()
        if done:
            break
            
    pg_scores.append(score)

    avg_score = np.mean(pg_scores[max(0, i-100):(i+1)])
    print('episode: ', i,'score: %.2f' % score,
          ' average score %.2f' % avg_score)

    if i % 10 == 0 and i > 0:
        #pg_agent.save_model()

        pass
    
x = [i+1 for i in range(n_games)]


episode:  0 score: 4.00  average score 4.00
episode:  1 score: 47.00  average score 25.50
episode:  2 score: 7.00  average score 19.33
episode:  3 score: -2.00  average score 14.00
episode:  4 score: 1.00  average score 11.40
episode:  5 score: 22.00  average score 13.17
episode:  6 score: 1.00  average score 11.43
episode:  7 score: 4.00  average score 10.50
episode:  8 score: -1.00  average score 9.22
episode:  9 score: 6.00  average score 8.90
episode:  10 score: -2.00  average score 7.91
episode:  11 score: 20.00  average score 8.92
episode:  12 score: -2.00  average score 8.08
episode:  13 score: -1.00  average score 7.43
episode:  14 score: 14.00  average score 7.87
episode:  15 score: 4.00  average score 7.62
episode:  16 score: 11.00  average score 7.82
episode:  17 score: -1.00  average score 7.33
episode:  18 score: 0.00  average score 6.95
episode:  19 score: 12.00  average score 7.20
episode:  20 score: 1.00  average score 6.90
episode:  21 score: 18.00  average score 7.41


In [3]:
from tensorflow.keras.models import Model

In [18]:
Model(inputs )

NameError: name 'inputs' is not defined

In [17]:
!pip

/bin/sh: pip: command not found


In [20]:
_input = Input(shape=(8,))
dense1 = Dense(64, activation='relu')(_input)
dense2 = Dense(64, activation='relu')(dense1)
probs = Dense(4, activation='softmax')(dense2)

In [26]:
Model([_input])

<tensorflow.python.keras.engine.training.Model at 0x130eded10>