In [None]:
import gym
from keras.models import Sequential
from keras.layers import Dense, Input
import numpy as np
import tensorflow as tf
import math
from keras import backend

In [None]:
# CartPole-v1 environment shared by the cells below (its spaces size the models,
# and the training loop steps it).
env = gym.make('CartPole-v1')

In [None]:
# Creating the actor and critic models

def customLoss(yTrue, yPred):
    """Weighted negative log-likelihood, summed over every element.

    Args:
        yTrue: Per-element weights applied to the negative log of ``yPred``.
        yPred: Predicted values; clipped to ``[1e-8, 1]`` before the log.

    Returns:
        The sum of ``-log(clip(yPred)) * yTrue``.
    """
    # The lower clip bound keeps log() finite when a prediction is exactly 0.
    safePred = backend.clip(yPred, 1e-8, 1)
    negLogPred = -backend.log(safePred)
    return backend.sum(negLogPred * yTrue)

def createActor(inputSize, outputSize):
    """Build and compile the policy (actor) network.

    Two 20-unit ReLU hidden layers feed a softmax layer that emits one
    probability per action. The model is compiled with ``customLoss``, so the
    ``y`` given to ``fit`` is a per-action weight vector, not a class label.

    Args:
        inputSize: Number of features in one observation.
        outputSize: Number of discrete actions (width of the softmax layer).

    Returns:
        The compiled ``Sequential`` model (Adam optimizer, ``customLoss``).
    """
    model = Sequential()
    model.add(Input(shape=(inputSize,)))
    model.add(Dense(20, activation='relu'))
    model.add(Dense(20, activation='relu'))
    model.add(Dense(outputSize, activation='softmax'))
    # No 'accuracy' metric: the fit targets are weights for customLoss rather
    # than labels, so an accuracy figure would be meaningless and only add
    # per-step overhead.
    model.compile(optimizer='adam', loss=customLoss)
    return model

def createCritic(inputSize, outputSize):
    """Build and compile the value (critic) network.

    Two 50-unit ReLU hidden layers feed a linear output layer, trained by
    mean squared error.

    Args:
        inputSize: Number of features in one observation.
        outputSize: Width of the linear output layer.

    Returns:
        The compiled ``Sequential`` model (Adam optimizer, MSE loss).
    """
    model = Sequential()
    model.add(Input(shape=(inputSize,)))
    model.add(Dense(50, activation='relu'))
    model.add(Dense(50, activation='relu'))
    model.add(Dense(outputSize, activation='linear'))
    # No 'accuracy' metric: this is a regression head with continuous targets,
    # for which classification accuracy carries no information.
    model.compile(optimizer='adam', loss='mean_squared_error')
    return model

In [None]:
# Hyperparameters and model setup for the training loop
EPISODES = 100  # number of episodes the training loop runs
GAMMA = 0.99  # discount applied to the critic's next-state value in train()
inputSize = env.observation_space.shape[0]  # length of one observation vector
actionSize = env.action_space.n  # number of discrete actions
actor = createActor(inputSize, actionSize)  # softmax policy over the actions
critic = createCritic(inputSize, 1)  # single-output value estimator

def train(curState, action, nextState, reward, done):
    """Apply one online actor-critic update from a single transition.

    The critic is fitted towards the one-step TD target
    ``reward + GAMMA * V(nextState)`` (no bootstrap when ``done``), and the
    actor is fitted with the TD error as the weight on the taken action.

    Args:
        curState: 1-D observation in which the action was taken.
        action: Per-action vector whose argmax identifies the action taken
            (a probability vector or a one-hot vector both work).
        nextState: 1-D observation reached after the action.
        reward: Scalar reward received for the transition.
        done: Truthy when the episode ended on this transition.
    """
    curState = curState[np.newaxis, :]
    nextState = nextState[np.newaxis, :]
    actionIndex = int(np.argmax(action))

    # Both values are read before the critic is fitted, so the TD error is
    # measured against the pre-update estimate. Plain floats avoid assigning
    # a shape-(1,) array into a scalar slot below.
    curValue = float(critic.predict(curState, verbose=0)[0][0])
    nextValue = float(critic.predict(nextState, verbose=0)[0][0])
    target = reward + GAMMA * (1 - int(done)) * nextValue
    critic.fit(curState, np.array([[target]], dtype=np.float32), verbose=0)
    delta = target - curValue

    # customLoss is sum(-log(p) * yTrue), so the weight on the taken action
    # must be +delta: minimising -delta * log(p) raises the probability of
    # actions that turned out better than expected. (Using -delta here would
    # push the policy away from good actions.)
    advantages = np.zeros((1, actionSize))
    advantages[0, actionIndex] = delta
    actor.fit(curState, advantages, verbose=0)

# Online training: one actor/critic update per environment step.
try:
    for i in range(EPISODES):
        resetResult = env.reset()
        # gym >= 0.26 returns (observation, info); older releases return the
        # observation alone.
        observation = resetResult[0] if isinstance(resetResult, tuple) else resetResult
        rewards = 0
        done = False
        while not done:
            env.render()
            probs = actor.predict(observation[np.newaxis, :], verbose=0)[0]
            # Sample from the policy rather than taking the argmax: a greedy
            # choice never explores, and the policy-gradient update assumes
            # the action was drawn from the current policy. The float64
            # renormalisation keeps np.random.choice's sum-to-one check happy.
            probs = np.asarray(probs, dtype=np.float64)
            probs = probs / probs.sum()
            chosen = int(np.random.choice(actionSize, p=probs))

            stepResult = env.step(chosen)
            if len(stepResult) == 5:
                # gym >= 0.26: (obs, reward, terminated, truncated, info)
                nextState, reward, terminated, truncated, info = stepResult
                done = bool(terminated or truncated)
            else:
                nextState, reward, done, info = stepResult

            # train() identifies the taken action via argmax, so hand it a
            # one-hot vector for the sampled action.
            action = np.zeros(actionSize)
            action[chosen] = 1.0
            train(observation, action, nextState, reward, done)
            rewards += reward
            observation = nextState
        print('Episode', i, 'Rewards:', rewards)
finally:
    # Release the environment (and any render window) even when the cell is
    # interrupted or a step raises.
    env.close()