In [1]:
import numpy as np
import itertools
import random
import gym
from libr.envs.cliff_walking2 import CliffWalkingEnv

In [2]:
# Build the cliff-walking environment instance that is later passed to actorCriticJC.
env = CliffWalkingEnv()

In [3]:
 def actorCriticJC(env, numEpisodes, discount, alpha=0.1, beta=0.1):
    '''Tabular one-step actor-critic with a Gibbs (softmax) policy.

    The critic keeps a state-value table V and the actor keeps a preference
    table p(s, a). After every environment step both are moved in the
    direction of the one-step TD error
    ``reward + discount * V[nextState] - V[currState]``.
    For example, see Section 6.6 of Sutton and Barto for the discussion on
    the algorithm.

    Inputs - (i) env: environment exposing ``observation_space.n``,
                 ``action_space.n``, ``reset()`` returning an integer state and
                 ``step(action)`` returning ``(nextState, reward, done, info)``
            (ii) numEpisodes: number of episodes to run
            (iii) discount: the discount factor for infinite horizon discounted DP
            (iv) alpha: learning rate for V (default 0.1)
            (v) beta: learning rate for p (default 0.1)
    Outputs - (V, rList): V is an array of shape (number of states, 1) with the
              learned state values; rList is a list holding the total
              (undiscounted) reward collected in each episode.

    Actions are sampled with ``np.random.choice``, so results depend on
    NumPy's global random state.
    '''

    nS = env.observation_space.n
    nA = env.action_space.n

    #initialize
    p = np.zeros([nS, nA])  #preference matrix, p(s,a)
    V = np.zeros([nS, 1])   #state values, kept as a column vector for callers
    rList = []

    def gibbsSoftmax(state, p, nA):
        '''Return the softmax probability vector over the actions of `state`.'''
        prefs = p[state, :nA]
        # Subtracting the largest preference leaves the distribution unchanged
        # but keeps exp() from overflowing (inf/inf) or underflowing to an
        # all-zero vector (0/0), both of which would yield NaN probabilities.
        weights = np.exp(prefs - np.max(prefs))
        return weights / np.sum(weights)

    for _episode in range(numEpisodes):
        currState = env.reset()
        rTotalFromEpisode = 0

        while True:
            policy = gibbsSoftmax(currState, p, nA)  #probability vector for currState
            currAction = np.random.choice(nA, 1, p=policy)[0]

            #generate a sample
            nextState, reward, done, _ = env.step(currAction)
            rTotalFromEpisode += reward

            # Index column 0 explicitly so the TD error is a scalar rather than
            # a length-1 array being squeezed into a single table cell.
            TDerror = reward + discount * V[nextState, 0] - V[currState, 0]
            p[currState, currAction] += beta * TDerror
            V[currState, 0] += alpha * TDerror

            #Alternative calculation of preference
            #p[currState,currAction] += beta*TDerror*(1-policy[currAction])

            # Only the state that was acted from is updated, so a state that is
            # reached solely as the final nextState keeps its initial value 0.
            if done:
                break

            currState = nextState
        rList.append(rTotalFromEpisode)
    return V, rList
    
    

In [4]:
# Train for 500 episodes with discount factor 0.9; the cell output is the returned (V, rList) tuple.
actorCriticJC(env, 500, 0.9)

(array([[-9.41162522],
        [-9.17854484],
        [-8.534734  ],
        [-8.30322732],
        [-8.26428793],
        [-7.96271968],
        [-7.32685501],
        [-6.61316899],
        [-5.94425163],
        [-4.95795457],
        [-4.21743785],
        [-4.99707015],
        [-9.15054399],
        [-8.28237576],
        [-6.92791991],
        [-6.56567912],
        [-6.18789014],
        [-5.79229716],
        [-5.32032608],
        [-4.76167806],
        [-4.20477052],
        [-3.52875405],
        [-2.73239472],
        [-1.90430659],
        [-7.79960687],
        [-7.53561026],
        [-7.25160129],
        [-8.38293499],
        [-8.0621505 ],
        [-8.00638226],
        [-8.25105478],
        [-7.82808449],
        [-8.60394439],
        [-6.52671392],
        [-5.20233664],
        [-1.00014558],
        [-8.01723595],
        [ 0.        ],
        [ 0.        ],
        [ 0.        ],
        [ 0.        ],
        [ 0.        ],
        [ 0.        ],
        [ 0