In [22]:
class environment:
  def __init__(self):
    self.i_min = 0
    self.i_max = 9
    self.j_min = 0
    self.j_max = 9
    self.goals = [(3, 3), (8, 9)]
    self.states = []
    for i in range(self.i_max + 1):
      self.states.extend([(i, j) for j in range(self.j_max + 1)])
    #self.states = [[(i, j) for j in range(self.j_max + 1)] for i in range(self.i_max + 1)] 

  def get_state_reward(self, state, action):
    #0, 1, 2, 3 - up, down, left, right
    collision = False
    if action == 0:
      if state[0] - 1 >= self.i_min:
        state = (state[0] - 1, state[1])
      else:
        collision = True
    elif action == 1:
      if state[0] + 1 <= self.i_max:
        state = (state[0] + 1, state[1])
      else:
        collision = True
    elif action == 2:
      if state[1] - 1 >= self.j_min:
        state = (state[0], state[1] - 1)
      else:
        collision = True
    elif action == 3:
      if state[1] + 1 <= self.j_max:
        state = (state[0], state[1] + 1)
      else:
        collision = True
      
    if state in self.goals:
      reward = 10
    else:
      reward = -1
    if collision:
      reward = -5

    return reward, state

In [26]:
import numpy as np

class EstimateImprove:
  def __init__(self, env):
    self.policy = None
    self.env_states = env.states
    self.actions = [0, 1, 2, 3] #up down left right
    self.env = env
    self.lamb = 0.5

  def train(self):  
    if not self.policy:
      self.policy = {}
      for state in self.env_states:
        self.policy[state] = {i : 1 / 4 for i in range(4)}
    else:
      pass
    V = {}
    for state in self.env_states:
      V[state] = 0
    theta = 0.1

    while True:
      V_old = V
      V = self.Estimate()
      self.Improve(V)
      
      #оцениваем по отношению к предыдущей value function 
      delta = 0
      for state in self.env_states:
        delta = max(delta, abs(V_old[state] - V[state]))

      if delta <= theta:
        break


  def Estimate(self):
    V = {}
    for state in self.env_states:
      V[state] = 0
    theta = 0.1

    while True:
      delta = 0

      for state in self.env_states:
        V_old = V[state]
        for action in self.actions:
          r, state_prime = self.env.get_state_reward(state, action)
          temp_sum = self.policy[state][action] * 1 * (r + self.lamb * V[state_prime]) 
        V[state] = temp_sum
        delta = max(delta, abs(V_old - V[state]))

      if delta <= theta:
        break

    return V


  def Improve(self, V):
    values = []
    for state in self.env_states:
      values = []
      for action in self.actions:
        r, state_prime = self.env.get_state_reward(state, action)
        values.append(1 * (r + self.lamb * V[state_prime]))

      act_to_max = np.argmax(np.array(values))
      self.policy[state] = [0] * len(self.policy[state])
      self.policy[state][act_to_max] = 1

In [27]:
env = environment()
agent = EstimateImprove(env)

agent.train()

In [42]:
for i in range(10):
  for j in range(10):
    if (i, j) in env.goals:
      print("\u274E", end="   ")
    else:
      if np.argmax(np.array(agent.policy[(i, j)])) == 0:
        print("\u2191", end="   ")
      if np.argmax(np.array(agent.policy[(i, j)])) == 1:
        print("\u2193", end="   ")
      if np.argmax(np.array(agent.policy[(i, j)])) == 2:
        print("\u2190", end="   ")
      if np.argmax(np.array(agent.policy[(i, j)])) == 3:
        print("\u2192", end="   ")
  print()

↓   ↓   ↓   ↓   ↓   ↓   ↓   ↓   ↓   ↓   
↑   ↑   ↑   ↑   ↑   ↑   ↑   ↑   ↑   ↑   
↓   ↓   ↓   ↓   ↑   ↑   ↑   ↑   ↑   ↑   
→   →   →   ❎   ←   ↑   ↑   ↑   ↑   ↑   
↑   ↑   ↑   ↑   ↑   ↑   ↑   ↑   ↑   ↑   
↑   ↑   ↑   ↑   ↑   ↑   ↑   ↑   ↑   ↑   
↑   ↑   ↑   ↑   ↑   ↑   ↑   ↑   ↑   ↑   
↑   ↑   ↑   ↑   ↑   ↑   ↓   ↓   ↓   ↓   
↑   ↑   ↑   ↑   ↑   →   →   →   →   ❎   
↑   ↑   ↑   ↑   ↑   ←   ↑   ↑   ↑   ↑   
