Environment based on https://towardsdatascience.com/implement-grid-world-with-q-learning-51151747b455<br>
Very simple grid world with a wall at 1,1 <br>
A win state at 1,3 <br>
A lose state at 2,3 (Hole)<br>
A start at 2,0 <br>

In [69]:

import numpy as np

# Grid dimensions; cells are addressed as (row, col) tuples.
BOARD_ROWS, BOARD_COLS = 3, 4

# Special cells of the grid, each given as (row, col).
START = (2, 0)       # default position of a freshly created State
WALL = (1, 1)        # cell the agent cannot move into
WIN_STATE = (1, 3)   # terminal cell, reward +1
LOSE_STATE = (2, 3)  # terminal cell (the hole), reward -1

class State:
    """Position of the agent on the grid world board.

    Attributes:
        board: BOARD_ROWS x BOARD_COLS numpy array holding 1 at WIN_STATE,
            -1 at LOSE_STATE and 0 everywhere else.
        state: current (row, col) position of the agent.
        isEnd: True once isEndFunc() has seen a terminal position.
    """

    def __init__(self, state=START):
        self.board = np.zeros([BOARD_ROWS, BOARD_COLS])
        self.board[WIN_STATE] = 1
        self.board[LOSE_STATE] = -1
        self.state = state
        self.isEnd = False

    def giveReward(self):
        """Return +1 on the win cell, -1 on the lose cell, 0 otherwise."""
        if self.state == WIN_STATE:
            return 1
        if self.state == LOSE_STATE:
            return -1
        return 0

    def isEndFunc(self):
        """Set isEnd to True when the current position is terminal."""
        if self.state in (WIN_STATE, LOSE_STATE):
            self.isEnd = True

    def nxtPosition(self, action):
        """Return the position reached by taking `action` from self.state.

        action: 0 = up, 1 = down, 2 = left, any other value = right

        Board coordinates are (row, col):

              col 0 | 1 | 2 | 3
        row 0
        row 1
        row 2

        A move that would leave the board or enter WALL is illegal; in that
        case the current position is returned unchanged.
        """
        row, col = self.state
        if action == 0:
            row -= 1
        elif action == 1:
            row += 1
        elif action == 2:
            col -= 1
        else:
            col += 1
        nxtState = (row, col)

        # Bounds and the wall come from the module constants so that changing
        # the board layout in one place keeps the movement rules in sync.
        onBoard = (0 <= row < BOARD_ROWS) and (0 <= col < BOARD_COLS)
        if onBoard and nxtState != WALL:
            return nxtState
        return self.state

    def showBoard(self):
        """Print the board: W = win, L = lose, S = start, # = wall, 0 = free."""
        # Each cell prints as "| x " (4 chars) plus one closing bar.
        separator = '-' * (4 * BOARD_COLS + 1)
        for i in range(BOARD_ROWS):
            print(separator)
            out = '| '
            for j in range(BOARD_COLS):
                if (i, j) == WIN_STATE:
                    token = 'W'
                elif (i, j) == LOSE_STATE:
                    token = 'L'
                elif (i, j) == START:
                    token = 'S'
                elif (i, j) == WALL:
                    token = '#'
                else:
                    token = '0'
                out += token + ' | '
            print(out)
        print(separator)

Q(sigma) based on https://github.com/makaveli10/reinforcementLearning/blob/c06d3842e837e198642740eccd4f87d29dbde9b0/MultiStepBootstrapping/n_step_q_sigma.py#L21
and Sutton and Barto 2020 Reinforcement learning book. Available from http://incompleteideas.net/book/the-book-2nd.html

In [70]:
class Agent:
    """Off-policy n-step Q(sigma) learner for the grid world.

    Actions are sampled from an epsilon-greedy behaviour policy (bActionP)
    while the values of a much greedier epsilon-greedy target policy
    (piActionP) are learned.

    Attributes:
        Q: dict mapping (row, col) -> {action: value}, initialised to 0.
        actions: list of action ids, 0 = up, 1 = down, 2 = left, 3 = right.
        nA: number of actions.
        State: the State object holding the agent's current position.
        alpha, gamma, n: step size, discount factor and number of steps.
    """

    # Arrow drawn by replay() for each action id.
    ACTION_ARROWS = {0: '^', 1: 'v', 2: '<', 3: '>'}

    def __init__(self, alpha = 0.2, gamma = 0.99, n = 3):
        # states / actionsList / rewards are not used by any method of this
        # class; they are kept so existing code reading them keeps working.
        self.states = []
        self.actions = [0, 1, 2, 3] # 0 = up, 1 = down 2 = left 3 = right
        self.State = State()
        self.nA = 4
        self.isEnd = self.State.isEnd
        self.alpha = alpha
        self.gamma = gamma
        self.actionsList = []
        self.rewards = []
        self.n = n

        self.Q = {}
        for i in range(BOARD_ROWS):
            for j in range(BOARD_COLS):
                self.Q[(i, j)] = {a: 0 for a in self.actions}

    def printHyper(self):
        """Print gamma, alpha and n."""
        print("gamma = ", self.gamma)
        print("alpha = ", self.alpha)
        print("n-step =", self.n)

    def _epsilonGreedyP(self, state, epsilon):
        """Return epsilon-greedy action probabilities for `state` under Q."""
        probs = np.full(self.nA, epsilon / self.nA, dtype=float)
        qValues = self.Q[state]
        # Ties resolve to the first maximal action in dict order.
        bestAction = max(qValues, key=qValues.get)
        probs[bestAction] += 1.0 - epsilon
        return probs

    def bActionP(self, state, epsilon = 0.3):
        """Behaviour policy: epsilon-greedy action probabilities for `state`."""
        return self._epsilonGreedyP(state, epsilon)

    def piActionP(self, state, epsilon = 0.01):
        """Target policy: epsilon-greedy (not softmax) probabilities for `state`."""
        return self._epsilonGreedyP(state, epsilon)

    def takeAction(self, action):
        """Return a new State at the position reached by `action`."""
        position = self.State.nxtPosition(action)
        return State(state=position)

    def reset(self):
        """Replace self.State with a fresh default State."""
        self.State = State()
        self.isEnd = self.State.isEnd

    def printQ(self):
        """Print the action values of every cell, row by row."""
        for i in range(BOARD_ROWS):
            for j in range(BOARD_COLS):
                print(i,j)
                print(self.Q[(i, j)])

    def chooseSigma(self):
        """Draw sigma uniformly from [0, 1)."""
        return np.random.uniform(0, 1)

    def play(self, episodes=10):
        """Run `episodes` episodes of off-policy n-step Q(sigma), updating Q.

        Per-step data (state, action, reward, importance ratio, sigma) is kept
        in dicts indexed by the time step modulo n + 1, which is enough slots
        for the window of steps tau .. t + 1 that a backup reads.
        """
        n = self.n
        size = n + 1
        for _ in range(episodes):
            self.reset()
            T = np.inf
            t = 0
            stored_actions = {}
            stored_states = {}
            stored_rewards = {}
            stored_pho = {}
            stored_sigma = {}

            currentState = self.State.state
            b_prob = self.bActionP(currentState)
            pi_prob = self.piActionP(currentState)
            action = np.random.choice(self.actions, p=b_prob)

            stored_actions[0] = action
            stored_states[0] = currentState
            stored_pho[0] = pi_prob[action] / b_prob[action]
            stored_rewards[0] = 0

            while True:
                if t < T:
                    self.State = self.takeAction(action)
                    currentState = self.State.state
                    slot = (t + 1) % size
                    stored_states[slot] = currentState
                    stored_rewards[slot] = self.State.giveReward()

                    self.State.isEndFunc()
                    self.isEnd = self.State.isEnd
                    if self.State.isEnd:
                        T = t + 1
                    else:
                        b_prob = self.bActionP(currentState)
                        pi_prob = self.piActionP(currentState)
                        action = np.random.choice(self.actions, p=b_prob)
                        stored_actions[slot] = action
                        # Keep sigma with its own time step: the backup for
                        # step k must use sigma_k, not the latest draw.
                        stored_sigma[slot] = self.chooseSigma()
                        stored_pho[slot] = pi_prob[action] / b_prob[action]

                # tau is the time step whose estimate is updated now.
                tau = t - n + 1
                if tau >= 0:
                    if t + 1 < T:
                        # Bootstrap from the newest state-action value.
                        slot = (t + 1) % size
                        G = self.Q[stored_states[slot]][stored_actions[slot]]
                    for k in range(min(t + 1, T), tau, -1):
                        if k == T:
                            G = stored_rewards[T % size]
                        else:
                            slot = k % size
                            s_k = stored_states[slot]
                            a_k = stored_actions[slot]
                            r_k = stored_rewards[slot]
                            pho_k = stored_pho[slot]
                            sigma_k = stored_sigma[slot]

                            pi_k = self.piActionP(s_k)
                            q_k = self.Q[s_k]
                            # Expected value of s_k under the target policy.
                            VBar = sum(pi_k[a] * q_k[a] for a in self.actions)

                            weight = sigma_k * pho_k + (1 - sigma_k) * pi_k[a_k]
                            G = (r_k
                                 + self.gamma * weight * (G - q_k[a_k])
                                 + self.gamma * VBar)

                    s_tau = stored_states[tau % size]
                    a_tau = stored_actions[tau % size]
                    self.Q[s_tau][a_tau] += self.alpha * (G - self.Q[s_tau][a_tau])

                if tau >= (T - 1):
                    break
                t = t + 1

    def replay(self):
        """Follow the greedy (max-Q) action from the start and print the path.

        Visited cells are drawn with the arrow of the action taken there.
        The walk stops at a terminal cell, or as soon as it returns to a cell
        it has already acted in: Q is not modified here and the greedy choice
        is deterministic, so such a walk would repeat forever.
        """
        self.reset()
        state = self.State.state
        stored_actions = {}
        while True:
            qValues = self.Q[state]
            action = max(qValues, key=qValues.get)
            stored_actions[state] = self.ACTION_ARROWS[action]
            self.State = self.takeAction(action)
            state = self.State.state
            self.State.isEndFunc()
            if self.State.isEnd:
                break
            if state in stored_actions:
                print("Greedy policy revisits {}; stopping replay".format(state))
                break

        # Each cell prints as "| x " (4 chars) plus one closing bar.
        separator = '-' * (4 * BOARD_COLS + 1)
        for i in range(BOARD_ROWS):
            print(separator)
            out = '| '
            for j in range(BOARD_COLS):
                if (i, j) == WIN_STATE:
                    token = 'W'
                elif (i, j) == LOSE_STATE:
                    token = 'L'
                elif (i, j) == START:
                    token = 'S'
                elif (i, j) == WALL:
                    token = '#'
                elif (i, j) in stored_actions:
                    token = stored_actions[(i, j)]
                else:
                    token = '0'
                out += token + ' | '
            print(out)
        print(separator)

Chosen hyperparameters:<br>

Learning rate(alpha) = 0.2<br>
Discount factor (gamma) = 0.99<br>
N-step = 3<br>

Action space<br>
0 = up<br>
1 = down<br>
2 = left<br>
3 = right<br>
<br>
Play 100 episodes

In [76]:

# Draw the board layout.
st = State()
st.showBoard()

# Build the learner and report its hyperparameters.
ag = Agent(n=3, gamma=0.99, alpha=0.2)
print("Hyperparameters \n")
ag.printHyper()

# Train for 100 episodes, then dump the learned action values per cell.
ag.play(episodes=100)
print("Q-values ... \n")
ag.printQ()

# Follow the max-Q action from the start cell and draw the resulting path.
print("\n")
print("Optimal play via max q values\n")
ag.replay()

-----------------
| 0 | 0 | 0 | 0 | 
-----------------
| 0 | # | 0 | W | 
-----------------
| S | 0 | 0 | L | 
-----------------
Hyperparameters 

gamma =  0.99
alpha =  0.2
n-step = 3
Q-values ... 

0 0
{0: 0.8899384404250286, 1: 0.841080164938459, 2: 0.8516756853196443, 3: 0.9669093638975295}
0 1
{0: 0.835611487833921, 1: 0.8614485567464135, 2: 0.7944791943040375, 3: 0.9778812709589971}
0 2
{0: 0.7749393935026497, 1: 0.8535126183289126, 2: 0.8802462448093975, 3: 0.9883177207031081}
0 3
{0: 0.6648992478675471, 1: 0.999999999681713, 2: 0.6554274132861289, 3: 0.8762367652672477}
1 0
{0: 0.9569780878273034, 1: 0.8653590296693154, 2: 0.7960015420828969, 3: 0.7753943016979519}
1 1
{0: 0, 1: 0, 2: 0, 3: 0}
1 2
{0: 0.9530757539396533, 1: 0.3668605048065946, 2: 0.13413738055317637, 3: 0.2}
1 3
{0: 0, 1: 0, 2: 0, 3: 0}
2 0
{0: 0.9467481681248493, 1: 0.6807487360610697, 2: 0.8519748781497763, 3: 0.6518738169599784}
2 1
{0: 0.00018411072057645698, 1: 0, 2: 0.18082970912558424, 3: 0.7604544336732