# 11.1.2 preparation

In [None]:
import sys
%matplotlib inline

In [None]:
# Add ../scripts/ to the module search path so the modules imported in the following cells can be found.
sys.path.append('../scripts/')

In [None]:
from dp_policy_agent import *

In [None]:
class QAgent(DpPolicyAgent):
    """Initial version of the Q-learning agent.

    At this stage the class adds nothing to DpPolicyAgent: the constructor
    simply forwards every argument, unchanged and in the same order, to the
    parent constructor.
    """

    def __init__(
        self,
        time_interval,
        estimator,
        goal,
        puddle_coeff=100,
        widths=np.array([0.2,0.2,math.pi/18]).T,
        lowerleft=np.array([-4,-4]).T,
        upperright=np.array([4,4]).T,
    ):
        # Forward positionally so nothing depends on the parent's parameter names.
        parent_args = (time_interval, estimator, goal, puddle_coeff, widths, lowerleft, upperright)
        super().__init__(*parent_args)

In [None]:
def trial():
    """Build the simulation world, draw it, and return the agent.

    The world contains a map with four landmarks, one goal, two puddles and
    a single red robot driven by a QAgent that uses a Kalman filter as its
    estimator.

    Returns:
        The QAgent instance attached to the robot.
    """
    dt = 0.1
    world = PuddleWorld(400000, dt, debug=False)  # large value to allow a long animation time

    # Create the map and add four landmarks to it.
    landmark_map = Map()
    landmark_positions = ((-4, 2), (2, -3), (4, 4), (-4, -4))
    for x, y in landmark_positions:
        landmark_map.append_landmark(Landmark(x, y))
    world.append(landmark_map)

    # Add the goal.
    goal = Goal(-3, -3)
    world.append(goal)

    # Add the puddles.
    puddle_specs = (
        ((-2, 0), (0, 2), 0.1),
        ((-0.5, -2), (2.5, 1), 0.1),
    )
    for spec in puddle_specs:
        world.append(Puddle(*spec))

    # Put one robot into the world.
    start_pose = np.array([3, 3, 0]).T
    estimator = KalmanFilter(landmark_map, start_pose)
    # NOTE(review): goal is passed as the third positional argument. That matches
    # the initial QAgent, but the second-version QAgent defined later in this file
    # has puddle_coeff in that position -- confirm this call when that class is
    # the one in scope.
    agent = QAgent(dt, estimator, goal)
    camera = Camera(landmark_map, distance_bias_rate_stddev=0, direction_bias_stddev=0)
    robot = Robot(start_pose, sensor=camera, agent=agent, color="red", bias_rate_stds=(0,0))
    world.append(robot)

    world.draw()
    return agent

In [None]:
# Run the simulation with the QAgent defined above and keep the returned agent.
a = trial()

# 11.1.3 set up Q

In [None]:
class StateInfo:
    """Per-state container for the action values Q(s, a).

    Attributes:
        q: 1-D float array with one entry per action, initialised to zero.
    """

    def __init__(self, action_num=3):
        self.q = np.zeros(action_num)

    def greedy(self):
        """Return the index of the action with the largest Q value."""
        best = self.q.argmax()
        return best

    def pi(self):
        """Return the action index chosen by the policy (currently the greedy one)."""
        chosen = self.greedy()
        return chosen

In [None]:
class QAgent(DpPolicyAgent):
    """Second version: agent that keeps a table of action values Q(s, a).

    The table is initialised from a state-value file together with the
    policy already held in ``self.policy_data`` (provided by DpPolicyAgent).
    For every state, the action that the existing policy selects receives
    the state's value and every other action receives that value minus
    ``self.smallval``, so the greedy policy over Q initially reproduces the
    existing policy.

    Unlike the initial version, this constructor takes no ``goal`` argument;
    ``None`` is forwarded to the parent constructor in its place.
    """

    def __init__(self, time_interval, estimator, puddle_coeff=100, widths=np.array([0.2,0.2,math.pi/18]).T,
                 lowerleft=np.array([-4,-4]).T, upperright=np.array([4,4]).T):
        super().__init__(time_interval, estimator, None, puddle_coeff, widths, lowerleft, upperright)

        # Constant: amount subtracted from a state's initial value for every
        # action that is not the one chosen by the existing policy.
        self.smallval = 0.1

        nx, ny, nt = self.index_nums  # from DpPolicyAgent: number of cells along each of the three index axes
        self.indexes = list(itertools.product(range(nx), range(ny), range(nt)))
        # Distinct actions that appear anywhere in the existing policy
        # (presumably (forward, turn) pairs -- confirm against DpPolicyAgent).
        self.actions = list({tuple(self.policy_data[i]) for i in self.indexes})
        self.statespace = self.set_action_value_function()

    def set_action_value_function(self, value_file="../section_reinforcement_learning/puddle_ignore_values.txt"):
        """Build the initial Q table from a state-value file.

        Each non-blank line of ``value_file`` must hold at least four
        whitespace-separated fields: three integer state indices followed by
        the float value of that state.

        Args:
            value_file: Path of the text file holding the state values.

        Returns:
            dict mapping an ``(ix, iy, it)`` index tuple to a StateInfo whose
            ``q`` array is aligned with ``self.actions``.

        Raises:
            OSError: If ``value_file`` cannot be opened.
            ValueError, IndexError: If a non-blank line is malformed.
        """
        statespace = {}
        # Use a context manager so the file handle is closed even if parsing fails.
        with open(value_file, 'r') as f:
            for line in f:
                d = line.split()
                if not d:  # tolerate blank lines (e.g. a trailing newline at the end of the file)
                    continue

                index = (int(d[0]), int(d[1]), int(d[2]))
                value = float(d[3])
                statespace[index] = StateInfo(len(self.actions))  # init state space

                # The policy's action for this state does not depend on the loop below.
                policy_action = tuple(self.policy_data[index])
                for i, a in enumerate(self.actions):
                    # The policy's own action keeps the state value; the others start slightly lower.
                    statespace[index].q[i] = value if policy_action == a else value - self.smallval

        return statespace

    def policy(self, pose):
        """Return the action selected by the Q table for the given pose.

        Args:
            pose: Pose to be discretised with ``to_index`` (from DpPolicyAgent).

        Returns:
            The element of ``self.actions`` chosen by the state's ``pi()``.
        """
        index = self.to_index(pose, self.pose_min, self.index_nums, self.widths)  # from DpPolicyAgent class
        a = self.statespace[tuple(index)].pi()  # a is the index of the action.
        return self.actions[a]

In [None]:
# Run the simulation again, now with the second-version QAgent.
# NOTE(review): trial() passes goal as the third positional argument to QAgent,
# but in the second-version QAgent that position is puddle_coeff -- confirm.
trial()