In [1]:
import numpy as np
import pandas as pd
import scipy
from typing import TypeVar,Mapping, Set, Generic, Sequence

### MP/MRP/MDP

**Elements**
* State set $S$
* Action set $A$
* Transition matrix $P$ following the Markov property: $P(X_{t+1}|X_t) = P(X_{t+1}|X_t,...,X_0)$
    - Or $P_{ss'}^a = P(X_{t+1} = s'| X_t = s, A_t = a)$
* Reward: 
    - 1) $R_s = \mathbb{E}[R_{t+1}|S_t = s]$ (or $\mathbb{E}[R_{t+1}|S_t = s, A_t = a]$)
    - 2) $r(s,s')$, where $R_s = \sum_{s' \in S} P_{ss'}\, r(s,s')$
    - 3) $r(s)$ which is assigned to each state (and action)
* Discount factor: $\gamma$
* Policy: $\pi(a|s) = P[A_t = a | S_t = s]$<br>

**Processes**
* Markov Process = state set $S$ + transition probability matrix
    - $<S, P>$ 
* Markov Reward Process  = MP + Reward + Discount factor
    - $<S, P, R, \gamma>$ 
    - Return: $G_t = R_{t+1} + \gamma R_{t+2} + ... = \sum^{\infty}_{i=t+1}\gamma^{i-t-1} R_i$ 
    - Value: $v(s) = \mathbb{E}[G_{t}|S_t = s]$
* Markov Decision Process = MRP (environment uncertainty) + action (agent uncertainty)
    - $<S, P, R, \gamma, A>$ 
    - MDP + policy = MRP, $P_{ss'} = \sum_{a \in A} \pi(a|s) P_{ss'}^a$, $R_{s} = \sum_{a \in A} \pi(a|s) R_{s}^a$
    - State-value function $v_{\pi}(s) = \mathbb{E}[G_{t}|S_t = s]$ (follow $\pi$ after state $s$)
    - Action-value function $q_{\pi}(s,a) = \mathbb{E}[G_{t}|S_t = s, A_t = a]$ (follow $\pi$ after action $a$)
    

### Bellman equation

<script type="text/javascript" src="http://cdn.mathjax.org/mathjax/latest/MathJax.js?config=default"></script>
**MRP**
\begin{equation*}
\begin{split}
    v(s) & = \mathbb{E}[G_{t}|S_t = s] \\
         & = \mathbb{E}[R_{t+1} + \gamma G_{t+1}|S_t = s] \\
         & = \mathbb{E}[R_{t+1} + \gamma v(S_{t+1})|S_t = s] \\
         & = R_s + \mathbb{E}[\gamma G_{t+1}|S_t = s] \\
         & = R_s + \gamma \sum_{s' \in S} P_{ss'}\mathbb{E}[G_{t+1}|S_{t+1} = s']\\
         & = R_s + \gamma \sum_{s' \in S} P_{ss'}v(s')
\end{split}
\end{equation*}

**MDP**
* Non-optimal v-q (1.1) 
\begin{equation*}
\begin{split}
    v_{\pi}(s) & = \sum_{a \in A} \pi(a|s)q_{\pi}(s,a)
\end{split}
\end{equation*}
* Non-optimal q-v (1.2) 
\begin{equation*}
\begin{split}
    q_{\pi}(s,a) & = R_s^{a} + \gamma \sum_{s' \in S} P_{ss'}^a v_{\pi}(s')
\end{split}
\end{equation*}
* Non-optimal v-v (1.3)
\begin{equation*}
\begin{split}
    v_{\pi}(s) & = \sum_{a \in A} \pi(a|s)q_{\pi}(s,a) \\
    & = \sum_{a \in A} \pi(a|s)\left(R_s^{a} + \gamma \sum_{s' \in S} P_{ss'}^a v_{\pi}(s')\right)
\end{split}
\end{equation*}
or
\begin{equation*}
\begin{split}
    v_{\pi}(s) & = \mathbb{E}_{\pi}[G_{t}|S_t = s] \\
    & = \mathbb{E}_{\pi}[R_{t+1} + \gamma G_{t+1}|S_t = s] \\
    & = \mathbb{E}_{\pi}[R_{t+1} + \gamma v_{\pi}(S_{t+1})|S_t = s] 
\end{split}
\end{equation*}
* Non-optimal q-q (1.4)
\begin{equation*}
\begin{split}
    q_{\pi}(s,a) & = R_s^{a} + \gamma \sum_{s' \in S} P_{ss'}^a v_{\pi}(s')\\
    & = R_s^{a} + \gamma \sum_{s' \in S} P_{ss'}^a \left(\sum_{a' \in A} \pi(a'|s')q_{\pi}(s',a')\right) 
\end{split}
\end{equation*}
or
\begin{equation*}
\begin{split}
    q_{\pi}(s,a) & = \mathbb{E}_{\pi}[G_{t}|S_t = s, A_t = a] \\
    & = \mathbb{E}_{\pi}[R_{t+1} + \gamma G_{t+1}|S_t = s, A_t = a] \\
    & = \mathbb{E}_{\pi}[R_{t+1} + \gamma q_{\pi}(S_{t+1},A_{t+1})|S_t = s, A_t = a] 
\end{split}
\end{equation*}
* Optimal v-q (2.1)
\begin{equation*}
\begin{split}
    v_{\ast}(s) & = \max_{a} q_{\ast}(s,a)
\end{split}
\end{equation*}
* Optimal q-v (2.2)
\begin{equation*}
\begin{split}
    q_{\ast}(s,a) & = \max_{\pi} \left( R_s^{a} + \gamma \sum_{s' \in S} P_{ss'}^a v_{\pi}(s') \right)\\
    & = R_s^{a} + \gamma \sum_{s' \in S} P_{ss'}^a v_{\ast}(s')
\end{split}
\end{equation*}
* Optimal v-v (2.3)
\begin{equation*}
\begin{split}
    v_{\ast}(s) & = \max_{a} q_{\ast}(s,a) \\
    & = \max_{a} \left( R_s^{a} + \gamma \sum_{s' \in S} P_{ss'}^a v_{\ast}(s') \right)
\end{split}
\end{equation*}
* Optimal q-q (2.4)
\begin{equation*}
\begin{split}
    q_{\ast}(s,a) & = R_s^{a} + \gamma \sum_{s' \in S} P_{ss'}^a v_{\ast}(s')\\
    & = R_s^{a} + \gamma \sum_{s' \in S} P_{ss'}^a \max_{a'} q_{\ast}(s',a')
\end{split}
\end{equation*}

### Class design for MP
**Write code to generate the stationary distribution for an MP** <br>
We know that if $\pi$ is the stationary distribution of an MP with transition matrix $A$, then $\pi = \pi A$. <br>
Equivalently, $\pi$ is a left eigenvector of $A$ with eigenvalue 1, normalized so that its entries sum to 1. Thus, finding the stationary distribution amounts to solving an eigenvector problem.

In [24]:
# Helper functions
T = TypeVar("T",str,int,float)

# Identity helper function for str, int and float
def ind(x: T, y: T, tol: float = 1e-5) -> bool:
    """Return True when x and y are considered identical.

    Strings are compared by plain equality; numbers are identical when they
    are equal or differ by strictly less than `tol`.

    Parameters
    ----------
    x, y : str, int or float
        The two values to compare.
    tol : float, optional
        Absolute tolerance used for numeric comparison (default 1e-5).

    Returns
    -------
    bool
        True if the values are identical in the sense above, else False.
    """
    # Subtraction is undefined for strings, so they must never reach the
    # numeric branch (unequal strings used to raise TypeError there).
    if isinstance(x, str) or isinstance(y, str):
        return x == y
    return bool(x == y or abs(x - y) < tol)
    
# Get state helper function
def get_states_helper(in_graph: dict) -> dict:
    """Number the top-level keys of `in_graph` in iteration order.

    Returns a dict mapping each key to its 0-based position, which is used
    as that state's row/column index in the transition matrix.
    """
    return {name: position for position, name in enumerate(in_graph)}

# Get transition matrix helper function
def get_transition_helper(in_graph: dict) -> np.ndarray:
    """Build the square transition matrix described by `in_graph`.

    `in_graph` maps each source state to a dict of {target state: probability}.
    Entry [r, c] of the result holds the probability of moving from the state
    with index r to the state with index c (indices come from
    `get_states_helper`). Pairs that are not listed stay at 0.
    """
    index_of = get_states_helper(in_graph)
    n_states = len(index_of)
    matrix = np.zeros((n_states, n_states))
    for source, targets in in_graph.items():
        for target, prob in targets.items():
            r = index_of[source]
            c = index_of[target]
            # Only fill cells that are still (numerically) zero.
            if not ind(matrix[r, c], 0):
                continue
            matrix[r, c] = prob
    return matrix

In [25]:
# Smoke-test the helper functions on a four-state weather chain
Input = {
    'Sunny':  {'Sunny': 0.1,  'Cloudy': 0.2,  'Rainy': 0.3,  'Windy': 0.4},
    'Cloudy': {'Sunny': 0.25, 'Cloudy': 0.25, 'Rainy': 0.3,  'Windy': 0.2},
    'Rainy':  {'Sunny': 0.1,  'Cloudy': 0.2,  'Rainy': 0.3,  'Windy': 0.4},
    'Windy':  {'Sunny': 0.25, 'Cloudy': 0.25, 'Rainy': 0.25, 'Windy': 0.25},
}
# One print call, one result per line: state index map, then transition matrix.
print(get_states_helper(Input), get_transition_helper(Input), sep="\n")

{'Sunny': 0, 'Cloudy': 1, 'Rainy': 2, 'Windy': 3}
[[0.1  0.2  0.3  0.4 ]
 [0.25 0.25 0.3  0.2 ]
 [0.1  0.2  0.3  0.4 ]
 [0.25 0.25 0.25 0.25]]


In [75]:
# Define MP by Graph
class MP:
    """Finite Markov process defined by a nested transition dictionary.

    E.g.,
    Input = {'Sunny': {'Sunny': 0.1, 'Cloudy': 0.2, 'Rainy': 0.3, 'Windy': 0.4},
             'Cloudy': {'Sunny': 0.25, 'Cloudy': 0.25, 'Rainy': 0.3, 'Windy': 0.2},
             'Rainy': {'Sunny': 0.1, 'Cloudy': 0.2, 'Rainy': 0.3, 'Windy': 0.4},
             'Windy': {'Sunny': 0.25, 'Cloudy': 0.25, 'Rainy': 0.25, 'Windy': 0.25}}
    Meaning: Today's weather => tmr's weather

    Attributes
    ----------
    graph : dict
        The input dictionary, stored as given.
    state : dict
        Maps each state name to its row/column index in `tran_mat`.
    tran_mat : np.ndarray
        Square matrix; entry [i, j] is the probability of moving from the
        state with index i to the state with index j.
    """

    # Initiate state dict & transition matrix
    def __init__(self, in_graph: dict) -> None:
        """Build and validate the state index and transition matrix.

        Raises
        ------
        ValueError
            If the matrix size does not match the state set, if any
            probability is negative, or if the rows do not sum to 1
            (Euclidean norm of the row-sum error above 1e-5).
        """
        self.graph = in_graph
        state = get_states_helper(in_graph)
        tran_mat = get_transition_helper(in_graph)
        # Check transition matrix and match state set with transition probs
        if len(state) != tran_mat.shape[0]:
            raise ValueError("transition matrix size does not match the number of states")
        if np.any(tran_mat < 0):
            raise ValueError("transition probabilities must be non-negative")
        row_sums = np.sum(tran_mat, axis=1)
        if np.linalg.norm(row_sums - np.ones(tran_mat.shape[0])) > 1e-5:
            raise ValueError("each row of the transition matrix must sum to 1")
        self.state: dict = state
        self.tran_mat: np.ndarray = tran_mat

    # Get all states
    def get_states(self) -> dict:
        """Return the dict mapping state name -> matrix index."""
        return self.state

    # Get the transition matrix
    def get_tran_mat(self) -> np.ndarray:
        """Return the transition matrix."""
        return self.tran_mat

    # Compute stationary distribution using eigenvalue decomposition
    def stationary_dist(self) -> np.ndarray:
        """Return a stationary distribution pi satisfying pi = pi @ tran_mat.

        pi is a left eigenvector of the transition matrix for eigenvalue 1,
        i.e. a (right) eigenvector of its transpose, rescaled to sum to 1.
        The first eigenvalue within 1e-5 of 1 is used.

        Returns
        -------
        np.ndarray
            Real-valued 1-D array ordered by the indices in `self.state`.
        """
        e_value, e_vec = np.linalg.eig(self.tran_mat.T)
        out = np.array(e_vec[:, np.where(np.abs(e_value - 1.) < 1e-5)[0][0]])
        out = out / np.sum(out)
        # eig returns a complex array as soon as ANY eigenvalue is complex,
        # even though the eigenvector for the real eigenvalue 1 has zero
        # imaginary part; drop it so callers always get real probabilities.
        return np.real(out)

In [76]:
# Smoke-test the MP class on the four-state weather chain
Input = {
    'Sunny':  {'Sunny': 0.1,  'Cloudy': 0.2,  'Rainy': 0.3,  'Windy': 0.4},
    'Cloudy': {'Sunny': 0.25, 'Cloudy': 0.25, 'Rainy': 0.3,  'Windy': 0.2},
    'Rainy':  {'Sunny': 0.1,  'Cloudy': 0.2,  'Rainy': 0.3,  'Windy': 0.4},
    'Windy':  {'Sunny': 0.25, 'Cloudy': 0.25, 'Rainy': 0.25, 'Windy': 0.25},
}
test_MP = MP(Input)
# One result per line: state index map, transition matrix, stationary distribution.
print(
    test_MP.get_states(),
    test_MP.get_tran_mat(),
    test_MP.stationary_dist(),
    sep="\n",
)

{'Sunny': 0, 'Cloudy': 1, 'Rainy': 2, 'Windy': 3}
[[0.1  0.2  0.3  0.4 ]
 [0.25 0.25 0.3  0.2 ]
 [0.1  0.2  0.3  0.4 ]
 [0.25 0.25 0.25 0.25]]
[0.18027211 0.22675737 0.2845805  0.30839002]


### Class design for MRP
- Separately implement the $r(s,s')$ and the $R(s) = \sum_{s'} p(s,s') * r(s,s')$ definitions of MRP
- Write code to convert/cast the $r(s,s')$ definition of MRP to the $R(s)$ definition of MRP (put some thought into code design here)<br>

**Given** $v(s) = \mathbb{E}[G_{t}|S_t = s]= \mathbb{E}[\sum^{\infty}_{i=0}\gamma^{i} R_{t+i+1}|S_t = s]$ <br>
**In vector form** (with $v, R \in \mathbb{R}^{|S|}$ the vectors of state values and expected rewards, and $P$ the transition matrix), the Bellman equation $v = R + \gamma P v$ gives $v = \sum^{\infty}_{i=0}\gamma^{i} P^i R = (I-\gamma P)^{-1}R$ (valid for $\gamma < 1$).

In [77]:
# Convert reward helper function
def convert_reward(_2nd_def_reward: dict, tran_mat: np.ndarray, state: dict) -> dict:
    """Cast transition rewards r(s, s') to expected state rewards R(s).

    Computes R(s) = sum over s' of p(s, s') * r(s, s').

    Parameters
    ----------
    _2nd_def_reward : dict
        Nested dict {s: {s': r(s, s')}}; pairs that are not listed count as
        reward 0.
    tran_mat : np.ndarray
        Square transition matrix indexed consistently with `state`.
    state : dict
        Maps each state name to its row/column index in `tran_mat`.

    Returns
    -------
    dict
        {state name: R(s)}, in the key order of `state`.
    """
    reward_mat = np.zeros((len(state), len(state)))
    # Create reward matrix. The original indexed and assigned into the scalar
    # loop variable `reward` instead of `reward_mat`, which always raised.
    for i, row in _2nd_def_reward.items():
        for j, reward in row.items():
            reward_mat[state[i], state[j]] = reward
    # Cast to 1st def reward vector: row-wise sum of p(s, s') * r(s, s')
    reward_vec = np.sum(tran_mat * reward_mat, axis=1)
    reward_dict = dict(zip(state.keys(), reward_vec))
    return reward_dict

In [79]:
# Define MRP by Graph
class MRP(MP):
    """Markov reward process: an MP plus per-state rewards and a discount factor.

    E.g.,
    in_graph = {'Sunny': {'Sunny': 0.1, 'Cloudy': 0.2, 'Rainy': 0.3, 'Windy': 0.4},
                'Cloudy': {'Sunny': 0.25, 'Cloudy': 0.25, 'Rainy': 0.3, 'Windy': 0.2},
                'Rainy': {'Sunny': 0.1, 'Cloudy': 0.2, 'Rainy': 0.3, 'Windy': 0.4},
                'Windy': {'Sunny': 0.25, 'Cloudy': 0.25, 'Rainy': 0.25, 'Windy': 0.25}}
    state_reward = {'Rainy': 1, 'Sunny': 2, 'Cloudy': 3, 'Windy': 4}
    gamma = 0.5
    Meaning: Today's weather => tmr's weather

    Attributes (in addition to those set by MP.__init__)
    ----------------------------------------------------
    reward : np.ndarray
        Reward per state, ordered by the indices in `self.state`.
    gamma : float
        Discount factor in [0, 1].
    """

    # Initiate state with reward and discount
    def __init__(self, state_reward: dict, gamma: float, in_graph: dict = None) -> None:
        """Build the underlying MP and attach rewards and the discount.

        Parameters
        ----------
        state_reward : dict
            {state name: reward}; must contain every state of `in_graph`.
        gamma : float
            Discount factor, must lie in [0, 1].
        in_graph : dict
            Nested transition dictionary passed on to MP.__init__. It is
            declared last with a default only to keep the existing positional
            order; it is required in practice.

        Raises
        ------
        ValueError
            If gamma is outside [0, 1], if a state has no reward, or if
            MP.__init__ rejects the transition graph.
        TypeError
            If `in_graph` is not supplied.
        """
        if gamma < 0 or gamma > 1:
            raise ValueError("gamma must lie in [0, 1]")
        if in_graph is None:
            raise TypeError("MRP requires the transition graph `in_graph`")
        # The base initializer creates self.state and self.tran_mat, which the
        # reward vector below and both value functions depend on.
        super().__init__(in_graph)
        missing = [key for key in self.state if key not in state_reward]
        if missing:
            raise ValueError("state_reward is missing states: {}".format(missing))
        reward_vec = np.zeros(len(self.state))
        for key, idx in self.state.items():
            reward_vec[idx] = state_reward[key]
        self.reward: np.ndarray = reward_vec
        self.gamma: float = gamma

    # Compute value function R(s)
    def value_func(self) -> np.ndarray:
        """Return the state values v solving (I - gamma * P) v = R.

        The result is ordered by the indices in `self.state`. With gamma == 1
        the system matrix I - P is singular, so no unique solution exists.
        """
        system = np.identity(len(self.state)) - self.gamma * self.tran_mat
        # Solve the linear system directly instead of forming the inverse.
        return np.linalg.solve(system, self.reward)

    # Compute value function r(s,s')
    def value_func_2nd(self, _2nd_def_reward) -> np.ndarray:
        """Return the state values for rewards given as r(s, s').

        The transition rewards are cast to expected state rewards with
        `convert_reward`. Note that this REPLACES `self.reward` with the
        converted vector before delegating to `value_func`.
        """
        reward_dict = convert_reward(_2nd_def_reward, self.tran_mat, self.state)
        reward_vec = np.zeros(len(self.state))
        for key, idx in self.state.items():
            reward_vec[idx] = reward_dict[key]
        self.reward = reward_vec
        return self.value_func()