In [None]:
%run Imports.ipynb
%run Discrete_Agent.ipynb

In [None]:
class ValueIterationAgent_rsa(DiscreteAgent):
    """Tabular value-iteration agent for rewards keyed by ``(state, action)``.

    The environment object is read through these attributes:

    * ``env.states`` -- iterable of states.
    * ``env.actions[state]`` -- sequence of actions available in ``state``.
    * ``env.rewards[(state, action)]`` -- reward for taking ``action``.
    * ``env.transitions[(state, action, dest)]`` -- transition probability;
      missing keys are treated as probability zero.
    * ``env.gamma`` -- discount factor.
    """

    def __init__(self, env):
        """Initialise all state values to 0 and pick a random initial policy."""
        self.env = env
        self.values = {}
        self.delta = 1.0      # largest value change seen in the latest sweep
        self.theta = 0.01     # convergence threshold on ``delta``
        self.policy = {}

        self.sweep_no = 0
        self.is_converged = False
        self.max_sweeps = 1000

        for state in self.env.states:
            self.values[state] = 0
            actions_list = self.env.actions[state]
            self.policy[state] = random.choice(actions_list)

    def update(self):
        """Run value-iteration sweeps until convergence or ``max_sweeps``.

        Values are updated in place, so states visited later in a sweep
        already see the new values of states visited earlier.

        Returns:
            Tuple ``(sweep_no, is_converged, values, policy)`` where
            ``sweep_no`` is the number of sweeps performed by this call.
        """
        env = self.env
        self.sweep_no = 0
        while self.delta >= self.theta and self.sweep_no < self.max_sweeps:
            self.sweep_no += 1
            self.delta = 0
            for state in env.states:
                old_value = self.values[state]

                # The backup is a max over actions only. Seeding the maximum
                # with the old state value would stop values from ever
                # decreasing, which breaks problems with negative rewards.
                best_value = None
                best_actions = []

                for act in env.actions[state]:
                    val_sum = env.rewards[(state, act)]

                    for dest in env.states:
                        if (state, act, dest) in env.transitions:
                            val_sum += (env.gamma * self.values[dest]) * env.transitions[(state, act, dest)]

                    if best_value is None or val_sum > best_value:
                        best_value = val_sum
                        best_actions = [act]
                    elif val_sum == best_value:
                        # Exact float equality: only bit-identical returns
                        # count as ties.
                        best_actions.append(act)

                if best_value is None:
                    # No action available: leave value and policy untouched.
                    continue

                self.values[state] = best_value
                # Break ties uniformly over all maximising actions.
                if len(best_actions) == 1:
                    self.policy[state] = best_actions[0]
                else:
                    self.policy[state] = random.choice(best_actions)

                self.delta = max(self.delta, abs(old_value - best_value))

        self.is_converged = self.delta < self.theta

        # ``self.policy`` now holds the greedy deterministic policy.
        return self.sweep_no, self.is_converged, self.values, self.policy

    def get_action(self, state):
        """Return the action the current policy assigns to ``state``."""
        return self.policy[state]

In [None]:
class ValueIterationAgent_rsas(DiscreteAgent):
    """Tabular value-iteration agent for rewards keyed by ``(state, action, dest)``.

    The environment object is read through these attributes:

    * ``env.states`` -- iterable of states.
    * ``env.actions[state]`` -- sequence of actions available in ``state``.
    * ``env.rewards[(state, action, dest)]`` -- reward for that transition.
    * ``env.transitions[(state, action, dest)]`` -- transition probability;
      missing keys are treated as probability zero.
    * ``env.gamma`` -- discount factor.
    """

    def __init__(self, env, theta):
        """Initialise all state values to 0 and pick a random initial policy.

        Args:
            env: environment exposing the attributes listed on the class.
            theta: convergence threshold on the largest per-sweep value change.
        """
        self.env = env
        self.values = {}
        # Start at infinity so the first sweep always runs, whatever ``theta``
        # the caller supplies.
        self.delta = float("inf")
        self.theta = theta
        self.policy = {}

        self.sweep_no = 0
        self.is_converged = False
        self.max_sweeps = 1000

        for state in self.env.states:
            self.values[state] = 0
            actions_list = self.env.actions[state]
            self.policy[state] = random.choice(actions_list)

    def update(self):
        """Run value-iteration sweeps until convergence or ``max_sweeps``.

        Values are updated in place, so states visited later in a sweep
        already see the new values of states visited earlier.

        Returns:
            Tuple ``(sweep_no, is_converged, values, policy)`` where
            ``sweep_no`` is the number of sweeps performed by this call.
        """
        env = self.env
        self.sweep_no = 0
        while self.delta >= self.theta and self.sweep_no < self.max_sweeps:
            self.sweep_no += 1
            self.delta = 0
            for state in env.states:
                old_value = self.values[state]

                # The backup is a max over actions only; the old state value
                # must not take part, or values could never decrease.
                best_value = None
                best_actions = []

                for act in env.actions[state]:
                    val_sum = 0
                    for dest in env.states:
                        if (state, act, dest) in env.transitions:
                            prob = env.transitions[(state, act, dest)]
                            val_sum += (env.gamma * self.values[dest]) * prob
                            val_sum += env.rewards[(state, act, dest)] * prob

                    # Greedy selection over the computed action values.
                    if best_value is None or val_sum > best_value:
                        best_value = val_sum
                        best_actions = [act]
                    elif val_sum == best_value:
                        # Exact float equality: only bit-identical returns
                        # count as ties.
                        best_actions.append(act)

                if best_value is None:
                    # No action available: leave value and policy untouched.
                    continue

                self.values[state] = best_value
                # Break ties uniformly over all maximising actions.
                if len(best_actions) == 1:
                    self.policy[state] = best_actions[0]
                else:
                    self.policy[state] = random.choice(best_actions)

                self.delta = max(self.delta, abs(old_value - best_value))

        self.is_converged = self.delta < self.theta

        # ``self.policy`` now holds the greedy deterministic policy.
        return self.sweep_no, self.is_converged, self.values, self.policy

    def get_action(self, state):
        """Return the action the current policy assigns to ``state``."""
        return self.policy[state]