In [12]:
import numpy as np

'''==================================================
Initial set up
=================================================='''

#Hyperparameters
DELTA = 0.001
GAMMA = 0.999         
NOISE = 0  

#Define all states
all_states=[]
all_states.append('W')
all_states.append('N')
all_states.append('E')
all_states.append('S')
all_states.append('C')

#Define rewards for all states
rewards = {}
for s in all_states:
    rewards[s] = 0

#Dictionnary of possible actions. We have two "end" states (1,2 and 2,2)
actions = {
    'W':('RIGHT', 'STAY', 'SHOOT', 'NONE'), 
    'N':('DOWN', 'STAY', 'CRAFT', 'NONE'),
    'E':('LEFT', 'STAY', 'SHOOT', 'HIT', 'NONE'),
    'S':('UP', 'STAY', 'GATHER', 'NONE'),
    'C':('UP', 'DOWN', 'LEFT', 'RIGHT', 'STAY', 'SHOOT', 'HIT', 'NONE')
    }

#Define an initial policy
policy={}
for s in actions.keys():
    policy[s] = np.random.choice(actions[s])
print(policy)

#Define success probabilities for states
probability = {}
for s in all_states:
    if s == 'E' or s == 'W':
        probability[s] = 1
    else:
        probability[s] = 0.85
print(probability)
        
#Define fail action for states
fail = {}
for s in all_states:
    if s == 'E' or s == 'W':
        fail[s] = s
    else:
        fail[s] = 'E'
print(fail)

#Define initial value function 
V={}
for s in all_states:
    if s in actions.keys():
        V[s] = 0
#     if s ==(2,2):
#         V[s]=-1
#     if s == (1,2):
#         V[s]=-1
#     if s == (2,3):
#         V[s]=1
print(V)

{'W': 'NONE', 'N': 'DOWN', 'E': 'NONE', 'S': 'GATHER', 'C': 'UP'}
{'W': 1, 'N': 0.85, 'E': 1, 'S': 0.85, 'C': 0.85}
{'W': 'W', 'N': 'E', 'E': 'E', 'S': 'E', 'C': 'E'}
{'W': 0, 'N': 0, 'E': 0, 'S': 0, 'C': 0}


In [19]:
'''==================================================
Value Iteration
=================================================='''

iteration = 0
while True:
    biggest_change = 0
    for s in all_states: 
        print("State: ",s,"  Value: ", V[s])           
        if s in policy:
            
            old_v = V[s]
            new_v = 0
            
            for a in actions[s]:
                if a == 'UP':
                    if s == 'S':
                        nxt = 'C'
                    else:
                        nxt = 'N'
                elif a == 'DOWN':
                    if s == 'N':
                        nxt = 'C'
                    else:
                        nxt = 'S'
                elif a == 'LEFT':
                    if s == 'E':
                        nxt = 'C'
                    else:
                        nxt = 'W'
                elif a == 'RIGHT':
                    if s == 'W':
                        nxt = 'C'
                    else:
                        nxt = 'E'
                elif a == 'STAY':
                    nxt = s
                else:
                    nxt = s
                    
#                 if a == 'SHOOT':
#                     nxt = [s[0]-1, s[1]]
#                 if a == 'HIT':
#                     nxt = [s[0]-1, s[1]]
#                 if a == 'CRAFT':
#                     nxt = [s[0]-1, s[1]]
#                 if a == 'GATHER':
#                     nxt = [s[0]-1, s[1]]
#                 if a == 'NONE':
#                     nxt = [s[0]-1, s[1]]
                
                #Choose a new random action to do (transition probability)
                nxt_2 = fail[s]
#                 print(probability[s])
                print(nxt)
                print(V[nxt])
                print(V[nxt_2])
                sigma = probability[s]*V[nxt] + (1-probability[s])*V[nxt_2]

                #Calculate the value
                v = rewards[s] + (GAMMA * sigma) 
                if v > new_v: #Is this the best action so far? If so, keep it
                    new_v = v
                    policy[s] = a

       #Save the best of all actions for the state                                 
            V[s] = new_v
            biggest_change = max(biggest_change, np.abs(old_v - V[s]))
         
   #See if the loop should stop now         
    if biggest_change < DELTA:
        break
    iteration += 1

for s in all_states: 
    print("State: ",s,"  Value: ", V[s])

State:  W   Value:  0
C
0
0
W
0
0
W
0
0
W
0
0
State:  N   Value:  0
C
0
0
N
0
0
N
0
0
N
0
0
State:  E   Value:  0
C
0
0
E
0
0
E
0
0
E
0
0
E
0
0
State:  S   Value:  0
C
0
0
S
0
0
S
0
0
S
0
0
State:  C   Value:  0
N
0
0
S
0
0
W
0
0
E
0
0
C
0
0
C
0
0
C
0
0
C
0
0
State:  W   Value:  0
State:  N   Value:  0
State:  E   Value:  0
State:  S   Value:  0
State:  C   Value:  0
