In [111]:
import numpy as np

In [112]:
class GridWorld:
    """A fixed 5x5 grid environment addressed by (row, column) pairs.

    The agent position is kept in ``self.i`` (row) and ``self.j`` (column) and
    starts at (4, 0).  ``self.reward`` maps every cell to the reward returned
    when the agent moves onto it; ``self.action`` maps each cell the agent can
    act from to the set of moves allowed there.  A cell with no entry in
    ``self.action`` is reported as terminal.
    """

    def __init__(self):
        """Set the grid size and start position, then build the tables."""
        self.row, self.column = 5, 5
        self.i, self.j = 4, 0
        self.config()

    def config(self):
        """Build the reward and action tables and install them via ``set``."""
        # Only two cells carry a non-zero reward; every other cell yields 0.
        nonzero = {(0, 4): 1, (1, 4): -1}
        reward = {
            (p, q): nonzero.get((p, q), 0)
            for p in range(self.row)
            for q in range(self.column)
        }

        # Moves: U = row - 1, D = row + 1, L = column - 1, R = column + 1.
        # (0, 4), (1, 4), (2, 1) and (3, 3) have no entry on purpose, which is
        # what makes terminate()/is_terminal() report them as terminal.
        action = {
            (0, 0): {'D', 'R'},
            (0, 1): {'D', 'L', 'R'},
            (0, 2): {'D', 'L', 'R'},
            (0, 3): {'D', 'L', 'R'},
            (1, 0): {'U', 'D', 'R'},
            (1, 1): {'U', 'D', 'L', 'R'},
            (1, 2): {'U', 'D', 'L', 'R'},
            (1, 3): {'U', 'D', 'L', 'R'},
            (2, 0): {'U', 'D', 'R'},
            (2, 2): {'U', 'D', 'L', 'R'},
            (2, 3): {'U', 'D', 'L', 'R'},
            (2, 4): {'U', 'D', 'L'},
            (3, 0): {'U', 'D', 'R'},
            (3, 1): {'U', 'D', 'L', 'R'},
            (3, 2): {'U', 'D', 'L', 'R'},
            (3, 4): {'U', 'D', 'L'},
            (4, 0): {'U', 'R'},
            (4, 1): {'U', 'L', 'R'},
            (4, 2): {'U', 'L', 'R'},
            (4, 3): {'U', 'L', 'R'},
            (4, 4): {'U', 'L'},
        }

        self.set(reward, action)

    def set(self, reward, action):
        """Store the reward table and the action table on the instance."""
        self.reward = reward
        self.action = action

    def get_nextState(self, si, sj, action):
        """Return the cell reached from (si, sj) by ``action``.

        No bounds or legality check is performed; an action other than
        "U", "D", "R" or "L" leaves the position unchanged.
        """
        offsets = {"U": (-1, 0), "D": (1, 0), "R": (0, 1), "L": (0, -1)}
        di, dj = offsets.get(action, (0, 0))
        return si + di, sj + dj

    def get_reward(self, si, sj):
        """Look up the reward stored for cell (si, sj)."""
        return self.reward[(si, sj)]

    def move(self, action):
        """Apply ``action`` to the agent and return the new cell's reward."""
        self.i, self.j = self.get_nextState(self.i, self.j, action)
        return self.get_reward(self.i, self.j)

    def terminate(self):
        """Return True when the agent's current cell has no action entry."""
        return (self.i, self.j) not in self.action

    def set_state(self, i, j):
        """Place the agent on cell (i, j)."""
        self.i, self.j = i, j

    def current_state(self):
        """Return the agent position as an (i, j) tuple."""
        return self.i, self.j

    def is_terminal(self, i, j):
        """Return True when cell (i, j) has no action entry."""
        return (i, j) not in self.action

In [113]:
# Shared environment instance; other cells read it through the global name `g`.
g = GridWorld()

In [114]:
def set_policy():
    """Return the fixed policy: a dict mapping (row, col) to 'U'/'D'/'L'/'R'.

    Cells (2, 1) and (3, 3) have no entry.
    """
    return {
        (0, 0): 'R', (0, 1): 'R', (0, 2): 'R', (0, 3): 'R',
        (1, 0): 'U', (1, 1): 'U', (1, 2): 'U', (1, 3): 'U',
        (2, 0): 'U', (2, 2): 'U', (2, 3): 'U', (2, 4): 'L',
        (3, 0): 'U', (3, 1): 'R', (3, 2): 'U', (3, 4): 'U',
        (4, 0): 'U', (4, 1): 'U', (4, 2): 'U', (4, 3): 'L', (4, 4): 'L',
        # The two reward cells are listed last.
        (0, 4): 'D', (1, 4): 'U',
    }


# Module-level policy table used by the rest of the notebook.
policy = set_policy()

In [115]:
# Render a policy as a text grid.

def print_policy(policy, rows=None, cols=None):
    """Print ``policy`` as a bordered grid of action symbols.

    Args:
        policy: Mapping from (row, col) to the symbol to show.  Cells without
            an entry are printed as a blank.
        rows: Number of grid rows.  Defaults to ``g.row`` of the module-level
            environment.
        cols: Number of grid columns.  Defaults to ``g.column``.
    """
    if rows is None:
        rows = g.row
    if cols is None:
        cols = g.column

    # Every cell prints as "X | " (4 characters) after the leading "|", so the
    # rule is 4 * cols + 1 wide -- 21 dashes for the 5-column grid.
    rule = "-" * (4 * cols + 1)

    print("POLICY")
    for i in range(rows):
        print(rule)
        print("|", end=" ")
        for j in range(cols):
            print(policy.get((i, j), " "), "|", end=" ")
        print("")
    print(rule)

# Show the module-level policy as a grid.
print_policy(policy)

POLICY
---------------------
| R | R | R | R | D | 
---------------------
| U | U | U | U | U | 
---------------------
| U |   | U | U | L | 
---------------------
| U | R | U |   | U | 
---------------------
| U | U | U | L | L | 
---------------------


In [116]:
# Render a value table as a text grid.
def plot_value(V, rows=None, cols=None):
    """Print the value table ``V`` as a bordered grid with two decimals.

    Args:
        V: Mapping from (row, col) to a number; every cell in the printed
            range must have an entry.
        rows: Number of grid rows.  Defaults to ``g.row`` of the module-level
            environment.
        cols: Number of grid columns.  Defaults to ``g.column``.
    """
    if rows is None:
        rows = g.row
    if cols is None:
        cols = g.column

    # A non-negative value below 10 prints as "d.dd| " (6 characters) after
    # the leading "|", so 6 * cols + 1 dashes span the whole row.
    rule = "-" * (6 * cols + 1)

    print("Value")
    for i in range(rows):
        print(rule)
        print("|", end=" ")
        for j in range(cols):
            print("%.2f|" % V[(i, j)], end=" ")
        print("")
    print(rule)

In [121]:
def play_game(max_step=20, grid=None, policy_map=None):
    """Play one episode from a random non-terminal start under a fixed policy.

    Args:
        max_step: The episode is cut off once this many moves have been made.
            One move is always taken from the (non-terminal) start state
            before the limit is checked.
        grid: Environment to play in; defaults to the module-level ``g``.
            It must provide ``action``, ``set_state``, ``current_state``,
            ``terminate`` and ``move``.
        policy_map: Mapping from state to action; defaults to the module-level
            ``policy``.

    Returns:
        ``(states, rewards)`` -- two lists of equal length.  ``states[0]`` is
        the start state and ``rewards[0]`` is a placeholder 0; for t >= 1,
        ``rewards[t]`` is the reward returned by the move into ``states[t]``.
    """
    world = g if grid is None else grid
    chooser = policy if policy_map is None else policy_map

    # Start uniformly at random among the states that have an action entry.
    candidates = list(world.action.keys())
    start = candidates[np.random.choice(len(candidates))]
    world.set_state(*start)

    states = [world.current_state()]
    rewards = [0]

    # Count moves from zero and increment with "+=" so that max_step really
    # bounds the episode ("step =+1" would reset the counter to 1 each time).
    step = 0
    while not world.terminate():
        r = world.move(chooser[states[-1]])
        states.append(world.current_state())
        rewards.append(r)

        step += 1
        if step >= max_step:
            break
    return states, rewards


In [118]:
# Start every cell with a value estimate of 0 and an empty list of sampled
# returns; both tables are keyed by (row, col).
V, returns = {}, {}

for i in range(g.row):
    for j in range(g.column):
        V[(i, j)], returns[(i, j)] = 0, []



In [127]:
GAMMA = 0.9        # discount factor applied to future rewards
N_EPISODES = 100   # episodes sampled each time this cell runs

# First-visit Monte Carlo evaluation of the fixed policy: sample episodes and
# average, per state, the discounted return observed after its first visit.
for episode in range(N_EPISODES):
    states, rewards = play_game()
    G = 0
    T = len(states)

    # Walk the episode backwards so G holds the discounted return that
    # follows states[t].  The final state has no successor, hence T-2.
    for t in range(T - 2, -1, -1):
        s = states[t]
        r = rewards[t + 1]
        G = r + GAMMA * G

        # This check must run for every t; outside the loop it would only
        # ever update the episode's start state.  Record G only when s does
        # not occur earlier in the episode (first-visit rule).
        if s not in states[:t]:
            returns[s].append(G)
            V[s] = np.mean(returns[s])

In [128]:
# Display the value estimates currently held in V.
plot_value(V)

Value
---------------------
| 0.73| 0.81| 0.90| 1.00| 0.00| 
---------------------
| 0.66| 0.73| 0.81| 0.90| 0.00| 
---------------------
| 0.59| 0.00| 0.73| 0.81| 0.73| 
---------------------
| 0.53| 0.59| 0.66| 0.00| 0.66| 
---------------------
| 0.48| 0.53| 0.59| 0.53| 0.48| 
---------------------
