Short Corridor with Switched Actions
---
Consider the small corridor gridworld shown inset in the graph below. The reward is -1 per step, as usual. In each of the three nonterminal states there are only two actions, right and left. These actions have their usual consequences in the first and third states (left causes no movement in the first state), but in the second state they are reversed, so that right moves to the left and left moves to the right. The problem is difficult because all the states appear identical under the function approximation. In particular, we define `x(s, right) = [1, 0]` and `x(s, left) = [0, 1]`, for all s.

<img src="corridor.png" width="600">

MC Policy Gradient
---
<img src="mc_policy_gradient.png" width="600">

In [3]:
import numpy as np
import matplotlib.pyplot as plt

In [75]:
class ShortCorridor:
    """REINFORCE (Monte Carlo policy gradient) agent for the short corridor.

    The corridor has three nonterminal states (0, 1, 2) and a terminal
    state (3).  Every step yields a reward of -1.  The effect of the two
    actions is reversed in state 1.  All states share the same action
    features, so the policy is a single softmax over two action preferences.

    Attributes:
        actions: Action labels, ordered ``["left", "right"]``.
        x: Feature matrix; column ``j`` is the feature vector of ``actions[j]``.
        theta: Policy parameters, updated in place by ``run``.
        state: Current state index (3 is terminal).
        gamma: Discount factor.
        alpha: Step size.
    """

    def __init__(self, alpha=0.2, gamma=0.8):
        """Create an agent in state 0 with step size alpha and discount gamma."""
        self.actions = ["left", "right"]
        # Columns: x(s, left) = [0, 1], x(s, right) = [1, 0], identical for all s.
        self.x = np.array([[0, 1], [1, 0]])
        self.theta = np.array([-1.47, 1.47])
        self.state = 0  # initial state 0
        self.gamma = gamma
        self.alpha = alpha

    def softmax(self, vector):
        """Return the softmax of ``vector`` (normalised along axis 0).

        The maximum is subtracted before exponentiating so that large
        preferences do not overflow to inf/nan; the result is otherwise
        mathematically unchanged.
        """
        scores = np.asarray(vector, dtype=float)
        if scores.size == 0:
            return scores
        exps = np.exp(scores - np.max(scores, axis=0))
        return exps / np.sum(exps, axis=0)

    def _policy(self):
        """Return the current [P(left), P(right)]; it is the same in every state."""
        return self.softmax(np.dot(self.theta, self.x))

    def chooseAction(self):
        """Sample an action label from the current softmax policy."""
        return np.random.choice(self.actions, p=self._policy())

    def takeAction(self, action):
        """Return the state reached by ``action`` from ``self.state``.

        ``self.state`` itself is not modified.
        """
        if self.state == 0:
            nxtState = 0 if action == "left" else 1
        elif self.state == 1:
            nxtState = 2 if action == "left" else 0  # reversed
        elif self.state == 2:
            nxtState = 1 if action == "left" else 3
        else:
            nxtState = 2 if action == "left" else 3
        return nxtState

    def giveReward(self):
        """Return 0 when ``self.state`` is the terminal state, otherwise -1."""
        if self.state == 3:
            return 0
        return -1

    def reset(self):
        """Put the agent back in the start state."""
        self.state = 0

    def _generateEpisode(self):
        """Act until the terminal state is reached.

        Returns:
            (actions, rewards) where ``rewards[t]`` is the reward received
            for taking ``actions[t]``.
        """
        actions = []
        rewards = []
        while True:
            action = self.chooseAction()
            nxtState = self.takeAction(action)
            # The reward is evaluated on the state the action was taken from.
            rewards.append(self.giveReward())
            actions.append(action)
            self.state = nxtState
            if self.state == 3:
                return actions, rewards

    def _discountedReturns(self, rewards):
        """Return G_t for every step, where G_t = sum_k gamma**k * rewards[t + k].

        Computed backwards with G_t = rewards[t] + gamma * G_{t+1}, which is
        linear in the episode length.  The return includes the immediate
        reward ``rewards[t]``, so the last action of the episode is updated.
        """
        returns = np.zeros(len(rewards))
        G = 0.0
        for t in reversed(range(len(rewards))):
            G = rewards[t] + self.gamma * G
            returns[t] = G
        return returns

    def _updatePolicy(self, actions, rewards):
        """Apply one REINFORCE update per step of a finished episode."""
        returns = self._discountedReturns(rewards)
        for t, (action, G) in enumerate(zip(actions, returns)):
            j = 1 if action == "right" else 0
            # The policy is re-evaluated every step because theta changes.
            prob = self._policy()
            # grad of ln pi(a|s) for a linear softmax: x(s, a) - sum_b pi(b|s) x(s, b)
            grad = self.x[:, j] - np.dot(self.x, prob)
            self.theta += self.alpha * np.power(self.gamma, t) * G * grad

    def run(self, rounds=100):
        """Train for ``rounds`` episodes, updating ``theta`` after each one.

        Prints "end state" whenever an episode reaches the terminal state and
        leaves the agent back in the start state afterwards.
        """
        for _ in range(rounds):
            actions, rewards = self._generateEpisode()
            print("end state")
            self._updatePolicy(actions, rewards)
            self.reset()

In [76]:
# Create the agent with its default settings (alpha=0.2, gamma=0.8).
sc = ShortCorridor()

In [79]:
# Run 10 episodes; "end state" is printed each time the terminal state is reached.
sc.run(10)

end state
end state
end state
end state
end state
end state
end state
end state
end state
end state
