In [1]:
import numpy as np
import random
import itertools 
import gym
from libr.envs.cliff_walking2 import CliffWalkingEnv

In [2]:
env = CliffWalkingEnv()

In [3]:
def offPolicyMCJC(env, numEpisodes, discount, epsilon, lenEp):
    '''This is the main function for computing off-policy (epsilon-soft policy) Monte Carlo method.
    Inputs - (i) env: OpenAI gym environment
            (ii) numEpisodes: number of episodes
            (ii) discount: the discount factor for infinite horizon discounted DP
            (iv) epsilon: 
            (v) lenEp:
    Outputs - Q-value and estimation policy  
    For example, see Figure 5.7 of Sutton and Barto for the pseudocode
    '''
    
    #Initialize
    Q = np.zeros([env.observation_space.n,env.action_space.n])
    N = np.zeros_like(Q)
    D = np.zeros_like(Q)
    aEstPolicy = np.zeros([env.observation_space.n,1])
    
    def estPolicy(Q, state):
        estPol = np.argmax(Q[state])
        return estPol
    def behaviorSoftPolicy(Q, state, epsilon): #nA
        if np.random.rand(1) > epsilon:
            a = np.argmax(Q[state])
        else:
            #a = np.random.choice(nA,1)
            a=env.action_space.sample()
        return a
    
    for ep in range(numEpisodes):
        episode = [] # start with an empty episode
        currState = env.reset()
        notMatchInstVec = [0]
        rewardEp = 0 #initialize total discounted return from the episode
        scale = 1
        for t in range(lenEp):
            #currAction = behaviorSoftPolicy(Q, currState, epsilon, env.action_space.n)
            currAction = behaviorSoftPolicy(Q, currState, epsilon)
            #generate a sample which returns nextstate, reward and if the episode terminates (done) 
            nextState, reward, done, _ = env.step(currAction)
            #add the sample, which is a tuple (state,action,reward), to the episode
            episode.append((currState, currAction, reward))
            if done:
                break
            currState = nextState
            rewardEp += discount*scale*reward #total discounted return from the episode
            scale *= discount
            
            #lenEpisode = sum(1 for x in episode) # get the length of episode <= lenEp
        saOccurence = set([(x[0], x[1]) for x in episode])
        
        for state, action in saOccurence:
            aEstPolicyState = estPolicy(Q, state)
            aEstPolicy[state] = aEstPolicyState  #estimation policy vector
            if aEstPolicyState != action:
                notMatchInstVec.append(t)
                tau = t
            else:
                tau = notMatchInstVec[-1]
            print("tau", tau)
            print("lenEpi", len(episode))
            #saTuple = (state,action)
            #first occurence of (state,action) in episode after tau
            episodeTauOnwards = episode[tau:]
            print("episodeTau",episodeTauOnwards)
            first_occurence_idx = next(i for i,x in enumerate(episodeTauOnwards)
                                       if x[0] == state and x[1] == action)

            W = 1.0 #start backwards with the action of last instant in the episode
            for k in range(len(episode)-1,first_occurence_idx,-1):
                W *= 1/(episode[k][1])
            N[state,action] += W*rewardEp
            D[state,action] += W
            Q[state,action] = N[state,action]/D[state,action]

    return Q, aEstPolicy


In [4]:
offPolicyMCJC(env,500, 0.9, 0.1, 100)

tau 99
lenEpi 100
episodeTau [(0, 0, -1.0)]


StopIteration: 