In [1]:
from model import NSFrozenLake
import numpy as np
from amalearn.agent import AgentBase

In [2]:
class NSFrozenLake_new(NSFrozenLake):
    def __init__(self, studentNum: int = 256, nonStationary=False):
        super().__init__(studentNum, nonStationary)
        self.actions = {0:(0,-1), 1:(1,0), 2:(0,1), 3:(-1,0)}
        self.states = np.array([[(i,j) for j in range (0,4)] for i in range(0,4)])

In [3]:
class valueIterationAgent(AgentBase):

    def __init__(self, env, id, discount, theta):
        super(valueIterationAgent, self).__init__(id, env)
        self.gamma = discount
        self.theta = theta
        self.values = np.zeros((4,4))
        # print("values shape: ", self.values.shape)
        self.q_table = np.zeros((4,4,4))
        self.policy = np.random.randint(0, 4, size=(4,4))

    
    def calc_qsa(self, action, state_now):
        final_reward = 0
        # print(state_now)
        next_states, probs, fail_probs, dones = self.environment.possible_consequences(action, tuple(state_now))
        
        for i in range(len(next_states)):
            final_reward += probs[i] * (fail_probs[i]*(-10) + (-1) + (dones[i] * 50) + self.gamma*self.values[tuple(next_states[i])])

        return final_reward

    def gather_qvalues(self, state):
        q_values_s = np.array([])
        for action in list(self.environment.actions.keys()):
            qsa = self.calc_qsa(action, state)
            self.q_table[action][tuple(state)] = qsa
            q_values_s = np.append(q_values_s, qsa)
        return q_values_s 
    
    def value_iteration(self):
        steps = 0
        while(True):
            delta = 0
            for row in range(len(self.environment.states)):
                for s in self.environment.states[row]:
                    # print("state: ", s)
                    temp_value_s = self.values[tuple(s)]
                    q_values_s = self.gather_qvalues(s)
                    # print("max_qvalues_s: ", np.max(q_values_s))
                    # print(tuple(s))
                    self.values[tuple(s)] = np.max(q_values_s)
                    # print("values[tuple(s)]: ", self.values[tuple(s)])

                    delta = max(delta, abs(temp_value_s - self.values[tuple(s)]))
            steps += 1
            if delta < self.theta:
                # print("okay that's enough!")
                break

        for row in range(len(self.environment.states)):
            for s in self.environment.states[row]:
                self.policy[tuple(s)] = np.argmax(self.gather_qvalues(s)) 
        
        return self.policy, steps

In [4]:
STUDENT_NUM = 810196662
env = NSFrozenLake_new(STUDENT_NUM)
state = env.reset()
agent = valueIterationAgent(env, 0, 0.9, 0.5)


In [5]:
final_policy, steps = agent.value_iteration()
print("final policy: \n", final_policy)
print("final q_values: \n", agent.q_table)
print(steps)

final policy: 
 [[1 1 1 1]
 [2 1 1 1]
 [2 2 1 1]
 [2 2 2 1]]
final q_values: 
 [[[225.55948707 227.64942052 253.97105688 279.05107082]
  [252.81168329 255.46652121 287.57242763 322.72949219]
  [277.81026012 280.84092442 325.02396053 365.5663774 ]
  [304.3107108  308.02773237 355.93695765 411.05017888]]

 [[250.65460557 281.56842809 316.90758012 348.03495889]
  [274.90616454 317.21003296 357.8536099  400.39791549]
  [302.24847875 348.19760894 403.9768947  455.29978427]
  [304.3107108  350.94619827 407.1822378  459.19221116]]

 [[249.01329661 274.06313565 301.28768743 303.40779579]
  [281.63557236 316.7617355  349.05102326 351.7367626 ]
  [317.45929062 357.8515145  402.24408637 405.42494365]
  [347.2291767  402.19147841 455.32427009 459.19221116]]

 [[225.55948707 251.10323006 276.93096246 303.40779579]
  [227.71656478 253.82521225 280.06713519 307.1095995 ]
  [255.71577887 287.57033224 324.57566307 356.76379076]
  [279.87249218 323.23854424 365.59086322 409.31737054]]]
43
