In [1]:
import numpy as np
#from itertools import permutations, repeat, product
import itertools
import gym
from libr.envs.cliff_walking2 import CliffWalkingEnv

In [2]:
# Instantiate the cliff-walking environment imported from libr.envs.cliff_walking2;
# this instance is the environment handed to TD0JC in the evaluation cell.
env = CliffWalkingEnv()

In [3]:
def TD0JC(env, numEpisodes, discount, alpha=0.1):
    '''Estimate state values of a random policy with tabular TD(0).

    The policy being evaluated picks every action with
    ``env.action_space.sample()`` and ignores the current state.

    Inputs - (i) env: environment exposing ``observation_space.n``,
                 ``action_space.sample()``, ``reset()`` returning the start
                 state and ``step(action)`` returning the 4-tuple
                 ``(nextState, reward, done, info)``; states are used
                 directly as indices into the value table
            (ii) numEpisodes: number of episodes to sample
            (iii) discount: the discount factor applied to the value of the
                 successor state
            (iv) alpha: learning rate (step size) of the TD update,
                 defaults to 0.1
    Outputs - V: numpy array of length ``env.observation_space.n`` holding the
              estimated value of every state. Entries start at 0 and only the
              state an action was taken from is ever updated, so a state that
              is never acted from keeps the value 0.
    For the pseudocode, see tabular TD(0) in the temporal-difference learning
    chapter (chapter 6) of Sutton and Barto.
    '''
    # Value table, one entry per state, initialised to zero.
    V = np.zeros(env.observation_space.n)

    # The random policy to be evaluated; the state argument is unused.
    def randomPolicy(state):
        return env.action_space.sample()

    for _ in range(numEpisodes):
        currState = env.reset()
        done = False
        # `done` starts False, so every episode takes at least one step.
        while not done:
            currAction = randomPolicy(currState)

            # Generate one sample transition from the environment.
            nextState, reward, done, _info = env.step(currAction)

            # TD(0) update: move V[currState] a fraction alpha towards the
            # one-step bootstrapped target reward + discount * V[nextState].
            TDerror = reward + discount * V[nextState] - V[currState]
            V[currState] += alpha * TDerror

            currState = nextState

    return V
            
        

In [4]:
# Evaluate the random policy on `env`: 500 sampled episodes, discount factor 0.9.
# NOTE(review): no RNG seed is set in the visible cells, so the displayed values
# are expected to differ from run to run - confirm whether the env can be seeded.
TD0JC(env, numEpisodes=500, discount=0.9)

array([-23.51589307, -24.35856523, -22.98883872, -19.44972571,
       -16.57030757,  -9.34143972,  -6.41473664,  -5.2962863 ,
        -2.73257334,  -0.54575598,   0.        ,   0.        ,
       -32.65740979, -31.87972409, -30.64372715, -25.62375194,
       -20.13704463, -15.15471674, -13.00239544, -13.83155877,
        -5.66359979,  -1.946058  ,   0.        ,   0.        ,
       -42.92492506, -59.5908419 , -64.75663691, -57.83832799,
       -56.40492452, -40.66806481, -37.04360635, -40.17850863,
        -7.71273572,  -9.23842075,   0.        ,   0.        ,
       -68.6033008 ,   0.        ,   0.        ,   0.        ,
         0.        ,   0.        ,   0.        ,   0.        ,
         0.        ,   0.        ,   0.        ,   0.        ])