In [835]:
import numpy as np

# Inspired by OpenAI's "Frozen Lake" gym environment (https://github.com/openai/gym); our interface loosely follows gym.
# Also inspired by Arthur Juliani's Q-Learning tutorial: https://medium.com/emergent-future/simple-reinforcement-learning-with-tensorflow-part-0-q-learning-with-tables-and-neural-networks-d195264329d0

# The signal reward is issued only once per episode.

class temple:
    """A 3x3 grid world with two altars, a signal tile and one memory cell.

    Each episode one altar is randomly chosen to pay ``altarReward`` while the
    other pays ``altarPenalty``; reaching either altar ends the episode.
    The signal tile reveals which altar is the rewarding one, but only while
    the agent stands on it, so the agent has to use its memory cell to carry
    that information to the altars.

    The observation is the tuple ``(x, y, memory, visibleSignal)``.

    Actions:
        0: move -x      1: move +x
        2: move -y      3: move +y
        4: set memory to 1
        5: set memory to 2
        6: set memory to 0 (only when ``memoryClearAvailable`` is True)

    ``reset()`` creates the per-episode attributes, so it must be called
    before ``step()``, ``getState()`` or ``render()``.
    """

    # These are the spatial coordinates
    # of the temple.

    xDim = 3
    yDim = 3

    # Memory has three states.
    # Always initialized to zero.
    # Agent manipulates as it sees fit.

    memoryDim = 3

    # The visible signal will have three states:
    #  0: not visible
    #  1: signal indicates reward is at altar 1
    #  2: signal indicates reward is at altar 2

    signalDim = 3

    altarOnePosition = (0, 2)
    altarTwoPosition = (2, 2)
    signalPosition   = (1, 1)

    def __init__(self, altarReward = 1, altarPenalty = -1, signalReward = 0.2, memoryClearAvailable = False):
        """Configure the rewards and the size of the action set.

        altarReward:          reward for stepping onto the rewarding altar.
        altarPenalty:         reward for stepping onto the other altar.
        signalReward:         one-off reward issued on the signal tile.
        memoryClearAvailable: when True, action 6 (clear memory) exists.
        """
        self.altarReward  = altarReward
        self.altarPenalty = altarPenalty
        self.signalReward = signalReward
        self.memoryClearable = memoryClearAvailable
        # Both spaces are lists of dimension sizes so they can be concatenated
        # into the shape of a Q-table.
        self.observationSpace = [self.xDim, self.yDim, self.memoryDim, self.signalDim]
        self.actionSpace      = [7 if self.memoryClearable else 6]

    def getState(self):
        """Return the observation ``(x, y, memory, visibleSignal)``.

        The signal component is 0 unless the agent is on the signal tile.
        """
        onSignalTile = (self.x, self.y) == self.signalPosition
        # Parenthesised so it is obvious the conditional applies to the last
        # tuple element only.
        return self.x, self.y, self.memory, (self.signal if onSignalTile else 0)

    def reset(self):
        """Start a new episode and return its first observation.

        The agent is placed at (1, 0) with memory 0, and the rewarding altar
        is drawn at random with equal probability.
        """
        self.x = 1
        self.y = 0
        self.memory = 0
        self.signal = 0
        self.done  = False
        self.signalRewardIssued = False

        if np.random.randint(2):
            self.rewardPosition  = self.altarOnePosition
            self.penaltyPosition = self.altarTwoPosition
            self.signal = 1
        else:
            self.rewardPosition  = self.altarTwoPosition
            self.penaltyPosition = self.altarOnePosition
            self.signal = 2

        return self.getState()

    def step(self, a):
        """Apply action ``a`` and return ``(state, reward, done)``.

        Moves that would leave the grid keep the agent in place. Unknown
        actions (including 6 when memory clearing is disabled) change nothing.
        Once the episode is done every further call is a no-op with reward 0.
        """
        reward = 0

        if not self.done:
            if   a == 0:  # try to move -x
                if self.x > 0:
                    self.x -= 1
            elif a == 1:  # try to move +x
                if self.x < self.xDim - 1:
                    self.x += 1
            elif a == 2:  # try to move -y
                if self.y > 0:
                    self.y -= 1
            elif a == 3:  # try to move +y
                if self.y < self.yDim - 1:
                    self.y += 1
            elif a == 4:  # set memory to 1
                self.memory = 1
            elif a == 5:  # set memory to 2
                self.memory = 2
            elif a == 6 and self.memoryClearable:  # set memory to 0
                self.memory = 0

            if (self.x, self.y) == self.rewardPosition:
                reward = self.altarReward
                self.done = True

            elif (self.x, self.y) == self.penaltyPosition:
                reward = self.altarPenalty
                self.done = True

            elif (self.x, self.y) == self.signalPosition and not self.signalRewardIssued:
                # The signal reward is paid at most once per episode.
                self.signalRewardIssued = True
                reward = self.signalReward

        return (self.getState(), # state
                reward,          # reward
                self.done)       # done?

    # Not referenced by any method in this class; kept so existing code that
    # reads the attribute keeps working.
    renderSymbols = {0:'', 1:'*'}

    def render(self):
        """Print an ASCII picture of the temple followed by the memory value.

        Symbols: '+' rewarding altar, '-' penalising altar, '<' / '>' the
        signal pointing at altar one / altar two, '*' the agent. The agent is
        drawn last, so it hides whatever tile it is standing on.
        """
        # Plain nested lists indexed [y][x]; np.chararray is a legacy class
        # that strips trailing whitespace from elements, which makes blank
        # cells (and therefore the column alignment) unreliable.
        grid = [[' '] * self.xDim for _ in range(self.yDim)]

        def mark(position, symbol):
            px, py = position
            grid[py][px] = symbol

        mark(self.rewardPosition, '+')
        mark(self.penaltyPosition, '-')
        mark(self.signalPosition, '<' if self.signal == 1 else '>')
        mark((self.x, self.y), '*')

        border = ' ' + '---' * self.xDim
        print(border)
        for row in grid:
            print('|' + ''.join(' ' + c + ' ' for c in row) + '|')
        print(border + '  memory: ' + str(self.memory))
        
# Smoke test: build an environment, start an episode, take one "+x" step,
# then show the resulting (state, reward, done) tuple and the board.
a = temple()
a.reset()
firstTransition = a.step(1)
print(firstTransition)
a.render()

((2, 0, 0, 0), 0, False)
 ---------
|       * |
|    >    |
| -     + |
 ---------  memory: 0


In [1102]:
env = temple()

# Q-table indexed by (x, y, memory, signal, action).
Q = np.zeros(env.observationSpace + env.actionSpace)
# Set learning parameters
lr = .8             # learning rate
y = .95             # discount applied to the best value of the next state
num_episodes = 5000
max_steps = 99      # step cap per episode

# Total reward collected in each episode.
rList = []
for i in range(num_episodes):
    # Reset environment and get first new observation
    s = env.reset()
    rAll = 0

    # j counts the steps taken so far in this episode (1-based).
    for j in range(1, max_steps + 1):
        # Choose an action greedily from the Q-table, with Gaussian noise that
        # decays over episodes. The noise must not be scaled by
        # np.argmax(Q[s]): that is an action *index*, and it is 0 for an
        # all-zero table, which removes all exploration.
        noise = np.random.randn(env.actionSpace[0]) * (1. / (i + 1))
        a = int(np.argmax(Q[s] + noise))

        # Get new state and reward from environment
        s1, r, d = env.step(a)  # state, reward, done?

        # Update Q-table with new knowledge.
        # Q[s1] (s1 is a tuple) selects the action values of the next state.
        # Q[s1, :] would instead use the tuple as a fancy index on axis 0 and
        # take the max over unrelated states.
        Q[s + (a,)] += lr * (r + y * np.max(Q[s1]) - Q[s + (a,)])

        rAll += r
        s = s1
        if d:
            break
    rList.append(rAll)

In [1108]:
# Start a fresh episode; reset() returns the initial observation, kept in `s`.
s = env.reset()
# Print the board and memory value for the new episode.
env.render()

 ---------
|    *    |
|    >    |
| -     + |
 ---------  memory: 0


In [1119]:
# Take one (almost) greedy step with the learned Q-table; re-run this cell to
# advance the episode step by step. `i` is the last episode index left behind
# by the training loop, so the noise here is very small.
a = np.argmax(Q[s] + np.random.randn(1,env.actionSpace[0])*(1./(i+1)))
# Keep the observation returned by step(): without this `s` stays at the
# state it had before the cell ran and every re-run would pick its action
# for that stale state instead of the agent's current one.
s, r, d = env.step(a)
env.render()

 ---------
|         |
|    >    |
| *     + |
 ---------  memory: 1


In [1040]:
# Display the current value of `a` as this cell's output.
a

3