In [10]:
from mdp import MDP
import typing

抽象クラス等のインポート

In [20]:
class GridMDP(MDP):
    """Grid-world MDP whose states are the (x, y) cells of a 2-D reward grid.

    Cells whose reward is None are not added to the state set.
    """

    def __init__(self, grid, terminals, init=(0, 0), gamma=.9):
        """Build the grid world from a list of reward rows.

        Args:
            grid: list of rows, given top row first; each entry is the reward
                of that cell, or None for a cell that is not a state.
            terminals: terminal states, forwarded unchanged to MDP.__init__.
            init: initial state, forwarded unchanged to MDP.__init__.
            gamma: discount factor, forwarded unchanged to MDP.__init__.
        """
        # Work on a reversed copy so that row 0 is the bottom row WITHOUT
        # flipping the caller's list in place (the original grid.reverse()
        # silently changed the argument, so reusing the same list for a
        # second GridMDP produced an upside-down world).
        grid = list(reversed(grid))
        # NOTE(review): `orientations` is not defined in this block; it must
        # be provided by the surrounding module/notebook before instantiation.
        MDP.__init__(self, init, actlist=orientations,
                     terminals=terminals, gamma=gamma)
        self.grid = grid
        self.rows = len(grid)
        self.cols = len(grid[0])
        # self.reward and self.states are presumably created by MDP.__init__
        # (not visible here) -- TODO confirm.
        for x in range(self.cols):
            for y in range(self.rows):
                self.reward[x, y] = grid[y][x]
                if grid[y][x] is not None:
                    self.states.add((x, y))

# 抽象クラスの具象化

## init
### 引数について
- grid : 各状態での報酬を並べた2次元リスト(行のリスト)。値が `None` のマスは状態として登録されない
- terminals : 終了状態
- init : エージェントの初期位置
- gamma : 割引係数

In [12]:
    def T(self, state, action):
        """Transition model: list of (probability, next_state) pairs.

        For a None action a single (0.0, state) pair is returned. Otherwise
        the result has three entries: 0.8 for moving in `action`, and 0.1
        each for the directions given by turn_right(action) and
        turn_left(action), every move being resolved through self.go.
        """
        if action is not None:
            intended = self.go(state, action)
            veer_right = self.go(state, turn_right(action))
            veer_left = self.go(state, turn_left(action))
            return [(0.8, intended), (0.1, veer_right), (0.1, veer_left)]
        return [(0.0, state)]


## T
    遷移モデル。状態sで行動aをとった時の、遷移確率と次状態のタプル (probability, s') のリストを返す

In [13]:
     def go(self, state, direction):
        """Return the state reached by moving from `state` in `direction`.

        The candidate is vector_add(state, direction); if it is not a member
        of self.states the original state is returned unchanged.
        """
        candidate = vector_add(state, direction)
        if candidate in self.states:
            return candidate
        return state

## go
    指定された方向へ移動した時の状態を返す

In [14]:
    def to_grid(self, mapping) -> list:
        """Lay out a {(x, y): value} mapping as a list of rows, top row first.

        Cells missing from `mapping` are filled with None. Rows are built for
        y = 0 .. self.rows - 1 and then reversed, so the highest y comes first.
        """
        rows_bottom_up = []
        for y in range(self.rows):
            row = [mapping.get((x, y), None) for x in range(self.cols)]
            rows_bottom_up.append(row)
        rows_bottom_up.reverse()
        return rows_bottom_up

In [15]:
    def to_arrows(self, policy) -> list:
        """Render a {state: action} policy as a grid of arrow characters.

        Each action vector is replaced by '>', '^', '<' or 'v' (None becomes
        '.') and the resulting mapping is passed to self.to_grid.
        """
        arrow_for = {(1, 0): '>', (0, 1): '^', (-1, 0): '<', (0, -1): 'v', None: '.'}
        symbols = {}
        for state, action in policy.items():
            symbols[state] = arrow_for[action]
        return self.to_grid(symbols)

### to_grid & to_arrows
    表示用メソッド。listを作成している


## 抽象クラスで実装されている他の関数
### actions
    各状態で取れる行動のリストを返す

# 価値反復法の実装

In [16]:
    def value_iteration(mdp, epsilon=0.001):
        """Compute state utilities by value iteration.

        Args:
            mdp: object exposing `states`, `R(s)`, `T(s, a)` (returning
                (probability, next_state) pairs), `actions(s)` and `gamma`.
            epsilon: convergence tolerance; sweeps stop once the largest
                per-state change is at most epsilon * (1 - gamma) / gamma.

        Returns:
            dict mapping each state to its utility, taken from the start of
            the final sweep (i.e. the values before the last update).
        """
        U1 = {s: 0 for s in mdp.states}
        R, T, gamma = mdp.R, mdp.T, mdp.gamma
        # Loop-invariant stopping threshold, computed once.
        threshold = epsilon * (1 - gamma) / gamma
        while True:
            U = U1.copy()
            delta = 0
            for s in mdp.states:
                U1[s] = R(s) + gamma * max(sum(p * U[s1] for (p, s1) in T(s, a))
                                           for a in mdp.actions(s))
                delta = max(delta, abs(U1[s] - U[s]))
            # `<=` rather than `<`: with gamma == 1 the threshold is 0, and a
            # strict comparison could never succeed even after a sweep that
            # changed nothing (delta == 0), looping forever.
            if delta <= threshold:
                return U

## value_iteration
### 引数
- mdp  
    GridMDPのインスタンス
- epsilon  
    ε 微小値。なぜわざわざ与える？ → 収束判定の許容誤差として使うため。1回の反復での U の最大変化量 delta が ε(1-γ)/γ まで小さくなった時点で反復を打ち切る。  

### 出力
    各状態におけるU(s)      

- U(s)  
    価値反復法における期待利得。割引累積報酬の計算。現在の状態sから将来にわたって最適な行動を取り続けた時の期待利得。
    
#### 例
 ```
 >> value_iteration(sequential_decision_environment)
{(0, 0): 0.2962883154554812,
 (0, 1): 0.3984432178350045,
 (0, 2): 0.5093943765842497,
 (1, 0): 0.25386699846479516,
 (1, 2): 0.649585681261095,
 (2, 0): 0.3447542300124158,
 (2, 1): 0.48644001739269643,
 (2, 2): 0.7953620878466678,
 (3, 0): 0.12987274656746342,
 (3, 1): -1.0,
 (3, 2): 1.0}
 ```

In [17]:
    def best_policy(mdp, U):
        """Derive a policy from a utility table.

        For every state in mdp.states, picks (via argmax) the action from
        mdp.actions(state) with the highest expected_utility under U.
        Returns a dict mapping state -> chosen action.
        """
        return {
            state: argmax(mdp.actions(state),
                          key=lambda action: expected_utility(action, state, U, mdp))
            for state in mdp.states
        }


## best_policy
value_iteration で計算した各U(s)データから最適な政策を求める

In [18]:
    def expected_utility(a, s, U, mdp):
        """Expected utility of taking action `a` in state `s`.

        Sums probability * U[next_state] over the (probability, next_state)
        pairs returned by mdp.T(s, a).
        """
        total = 0
        for probability, next_state in mdp.T(s, a):
            total += probability * U[next_state]
        return total

In [19]:
if __name__ == "__main__":
    # Build a 4x3 grid world. Rows are listed top row first; None marks a cell
    # that is not a state, and (3, 2) / (3, 1) are passed as terminal states.
    sequential_decision_environment = GridMDP([[-0.04, -0.04, -0.04, +1],
                                           [-0.04, None,  -0.04, -1],
                                           [-0.04, -0.04, -0.04, -0.04]],
                                          terminals=[(3, 2), (3, 1)])

    # Run value iteration with epsilon = .01, then derive the policy from the utilities.
    pi = best_policy(sequential_decision_environment, value_iteration(sequential_decision_environment, .01))

    # Show the policy as a grid of arrow characters.
    # NOTE(review): print_table is not defined in the visible notebook -- confirm where it comes from.
    print_table(sequential_decision_environment.to_arrows(pi))

NameError: name 'orientations' is not defined

- 2018/10/03実行メモ
    分析しつつ写して実行。  

```

---------------------------------------------------------------------------
NameError                                 Traceback (most recent call last)
<ipython-input-19-133b3e9e1ee9> in <module>()
      3                                            [-0.04, None,  -0.04, -1],
      4                                            [-0.04, -0.04, -0.04, -0.04]],
----> 5                                           terminals=[(3, 2), (3, 1)])
      6 
      7     pi = best_policy(sequential_decision_environment, value_iteration(sequential_decision_environment, .01))

<ipython-input-11-18941023d420> in __init__(self, grid, terminals, init, gamma)
      2     def __init__(self,grid,terminals,init=(0,0),gamma=.9):
      3         grid.reverse()  # because we want row 0 on bottom, not on top
----> 4         MDP.__init__(self, init, actlist=orientations,
      5                      terminals=terminals, gamma=gamma)
      6         self.grid = grid

NameError: name 'orientations' is not defined

```

`MDP.__init__` の `actlist` に渡している `orientations` がこのノートブック内で定義もインポートもされていないので、どこにあるのか探すことに

## 参考文献
[Qiita:Pythonではじめる強化学習](https://qiita.com/Hironsan/items/56f6c0b2f4cfd28dd906#%E5%AE%9F%E8%B7%B5%E7%B7%A8)