In [1]:
%load_ext autoreload
%autoreload 2
import sys
sys.path.insert(0,'../../modules')

In [111]:
import numpy as np
import matplotlib.pyplot as plt
import matplotlib.colors as colors
import matplotlib.animation as animation
from matplotlib import animation, rc
from IPython.display import HTML,Image
rc('animation', html='html5')

# Markov Decision Processes
Many problems require making a sequence of decisions (e.g. chess, Go, card games). <br>
One approach to sequential problems is to assume a Markov condition. In a Markov decision process (MDP) we have a set of actions $A$ and a set of states $S$. At each timestep we take an action, which moves the current state $S_t$ to a new state $S_{t+1}$ with probability $P(S_{t+1}|S_t,A_t)$. At each timestep a reward $R_t$ is also received, based on the current state and action, with probability $P(R_t|S_t,A_t)$. The Markov condition is that the next state depends only on the current state and the current action.

When we make a finite number of decisions the utility (which we want to maximize) is the sum of the rewards over all timesteps, $\sum_{t=1}^n R_t$. With an infinite number of steps this sum can become infinite, so a discount factor $0 \le \lambda < 1$ is often included, which weights a reward less the further in the future it occurs: $\sum_{t=1}^\infty \lambda^{t-1} R_t$. Alternatively we can use the average reward instead, but that can be computationally difficult.

Because we don't know the reward at each given time step we need to take an expectation over all possible rewards so the above becomes $\sum_{t=1}^\infty \lambda^{t-1} \sum_i P(R_t^i)R_t^i$. Because of the markov condition we know the probability of $R_t$ given $S_t$ and $A_t$ and can rewrite this as:
$$\sum_{t=1}^\infty \lambda^{t-1} \sum_i P(R_t^i|S_t,A_t)R_t^i$$

While there are non-stationary MDPs, it is useful to assume $P(S_{t+1}|S_t,A_t)$ and $P(R_t|S_t,A_t)$ are the same for all $t$. In a stationary MDP the transition from one state to another is described by a function $T(s'|s,a)$ which doesn't depend on $t$. The expected reward for taking action $a$ in the current state $s$ is written $R(s,a)$. So the above is:
$$\sum_{t=1}^\infty \lambda^{t-1} R(S_t,A_t)$$

In a MDP problem we are trying to find a good policy, which tells us which action to take given previous actions and the current state. With an infinite time stationary MDP we get policies not depending on $t$. We call the stationary policy $\pi$, which maps state to action $\pi(s)$. We also have a stationary transition function $T(s'|s,a)$ which is the probability of moving from a state and action to a new state.

With this in mind the above overall utility function can be written:
$$\sum_{t=1}^\infty \lambda^{t-1} R(S_t,\pi(S_t))$$
Let $U_k^\pi(s)=\sum_{t=1}^k \lambda^{t-1} R(S_t,\pi(S_t))$ with starting state $S_1$ (counting $t$ from 1 **relative** to $s$, so $S_1$=$s$). <br>
So, $U_1^\pi(s)=R(s,\pi(s))$<br>
Then $U_\infty^\pi(s)$ is the value we want to maximize. <br>
$$
\begin{aligned}
    U_{k+1}^\pi(s)&=\sum_{t=1}^{k+1} \lambda^{t-1} R(S_t,\pi(S_t)) \\
    &= R(s,\pi(s)) + \lambda \sum_{t=2}^{k+1} \lambda^{t-2} R(S_t,\pi(S_t)) \\
    &= R(s,\pi(s)) + \lambda U_k^\pi(s')
\end{aligned}
$$
Where $s'$ is the state that follows $s$. However, as we don't know the actual value of $s'$ we use the expectation:
$$U_{k+1}^\pi(s)=R(s,\pi(s)) + \lambda \sum_{s'} T(s'|s,\pi(s)) U_k^\pi(s')$$
This formula is very intuitive. The value of a state ($U^\pi(s)$) is the immediate reward plus the expected value of the probable future states.

For an infinite horizon, $U^\pi$ is the fixed point of this update, which can be found by repeating the update until the values stop changing:
$$U^\pi(s)=R(s,\pi(s))+\lambda\sum_{s'}T(s'|s,\pi(s))U^\pi(s')$$

### Evaluating a simple decision:
**W** is a wall <br>
**F** is a fire <br>
**G** is gold <br>
**S** is empty <br>
**Y** is you. <br>
Actions are moving in a direction. Moving into a wall leaves you in the same place. <br>
You have a $90\%$ chance of going the way you choose to move, a $5\%$ chance of slipping one step to the left of that direction and a $5\%$ chance of slipping one step to the right of it. <br>
The possible states are just where you are.

In [171]:
def get_states(world):
    """Return the coordinates of every non-wall cell of `world`.

    Parameters
    ----------
    world : 2-D numpy array of one-character strings
        Grid description; cells equal to 'W' are walls and are skipped.

    Returns
    -------
    numpy.ndarray of shape (n_states, 2)
        One ``[row, col]`` pair per non-wall cell, in row-major order.
        The shape is ``(0, 2)`` when the grid contains only walls, so the
        result can always be indexed as rows of two coordinates.
    """
    # argwhere lists the indices of the True entries in row-major order,
    # which is the order the explicit row/column loops would visit them.
    return np.argwhere(world != 'W')

def check_state_valid(state,all_states):
    """Tell whether `state` appears among the entries of `all_states`.

    Only the first two components of `state` and of each entry are compared.
    Returns True on the first match and False when nothing matches.
    """
    return any(state[0]==known[0] and state[1]==known[1] for known in all_states)

def step_func(step_vec):
    """Create a movement function that shifts a state by `step_vec`.

    The returned ``step(state, all_states)`` computes ``state + step_vec`` and
    returns it when ``check_state_valid`` accepts it; otherwise the original
    `state` object is returned unchanged.
    """
    def step(state,all_states):
        candidate = state+step_vec
        return candidate if check_state_valid(candidate,all_states) else state
    return step

# Each step is an offset added to a state; the first component moves between
# rows (negative = up) and the second between columns (negative = left).
left_step, right_step, up_step, down_step = (
    step_func(np.array(offset))
    for offset in ([0,-1],[0,1],[-1,0],[1,0])
)

def left_action(state,all_states):
    """Possible results of trying to move left from `state`.

    Returns ``(outcomes, probabilities)``: the results of ``left_step``,
    ``up_step`` and ``down_step`` (in that order), paired with the
    probabilities 0.9, 0.05 and 0.05.
    """
    outcomes = [move(state,all_states) for move in (left_step,up_step,down_step)]
    return outcomes,[0.9,0.05,0.05]

def right_action(state,all_states):
    """Possible results of trying to move right from `state`.

    Returns ``(outcomes, probabilities)``: the results of ``right_step``,
    ``up_step`` and ``down_step`` (in that order), paired with the
    probabilities 0.9, 0.05 and 0.05.
    """
    outcomes = [move(state,all_states) for move in (right_step,up_step,down_step)]
    return outcomes,[0.9,0.05,0.05]

def down_action(state,all_states):
    """Possible results of trying to move down from `state`.

    Returns ``(outcomes, probabilities)``: the results of ``down_step``,
    ``left_step`` and ``right_step`` (in that order), paired with the
    probabilities 0.9, 0.05 and 0.05.
    """
    outcomes = [move(state,all_states) for move in (down_step,left_step,right_step)]
    return outcomes,[0.9,0.05,0.05]

def up_action(state,all_states):
    """Possible results of trying to move up from `state`.

    Returns ``(outcomes, probabilities)``: the results of ``up_step``,
    ``left_step`` and ``right_step`` (in that order), paired with the
    probabilities 0.9, 0.05 and 0.05.
    """
    outcomes = [move(state,all_states) for move in (up_step,left_step,right_step)]
    return outcomes,[0.9,0.05,0.05]

def _render_frames(states,world):
    """Rasterise one image per state: the world's tiles with the person on top.

    Pixel values are colour codes: 0 empty, 1 wall, 2 fire, 3 gold, 4 person.
    """
    empty_tile = np.zeros((8,8))
    wall_tile = np.ones((8,8))
    fire_tile = np.array([[0,0,0,0,0,0,0,0],
                          [0,0,0,0,0,0,0,0],
                          [0,0,0,0,0,1,0,0],
                          [0,0,0,0,0,0,0,0],
                          [0,0,0,0,1,0,0,0],
                          [0,0,1,0,1,0,0,0],
                          [0,0,1,1,0,1,0,0],
                          [0,1,1,0,1,1,1,0]])*2
    gold_tile = np.array([[0,0,0,0,0,0,0,0],
                          [0,0,0,0,0,0,0,0],
                          [0,0,0,1,1,0,0,0],
                          [0,0,1,0,0,1,0,0],
                          [0,0,1,0,0,1,0,0],
                          [0,0,0,1,1,0,0,0],
                          [0,0,0,0,0,0,0,0],
                          [0,0,0,0,0,0,0,0]])*3
    person_tile = np.array([[0,0,0,1,0,0,0,0],
                            [0,1,1,1,1,1,0,0],
                            [0,0,0,1,0,0,0,0],
                            [0,0,0,1,0,0,0,0],
                            [0,0,1,1,1,0,0,0],
                            [0,0,1,0,1,0,0,0],
                            [0,0,1,0,1,0,0,0],
                            [0,0,1,0,1,0,0,0]])*4
    symbol_to_tile = {'S':empty_tile,'W':wall_tile,'F':fire_tile,'G':gold_tile,'Y':person_tile}
    tile_h,tile_w = person_tile.shape
    n_rows,n_cols = world.shape[0],world.shape[1]

    # The world itself never changes, so draw it once and copy it per frame.
    background = np.zeros((n_rows*tile_h,n_cols*tile_w))
    for row in range(n_rows):
        for col in range(n_cols):
            background[row*tile_h:(row+1)*tile_h,col*tile_w:(col+1)*tile_w] = symbol_to_tile[world[row,col]]

    person_pixels = person_tile>0
    frames = []
    for state in states:
        frame = background.copy()
        row,col = state[0],state[1]
        # A state outside the grid is simply not drawn.
        if(0<=row<n_rows and 0<=col<n_cols):
            cell = frame[row*tile_h:(row+1)*tile_h,col*tile_w:(col+1)*tile_w]
            # Only the person's own pixels are overwritten, so the tile
            # underneath (e.g. fire or gold) stays visible around the figure.
            cell[person_pixels] = person_tile[person_pixels]
        frames.append(frame)
    return frames

def make_animation(states,world,interval=200):
    """Display an inline animation of the person moving through `world`.

    Parameters
    ----------
    states : sequence of integer (row, col) pairs
        Position of the person in each frame; must not be empty.
    world : 2-D numpy array of the symbols 'S', 'W', 'F', 'G' (or 'Y')
        Grid to draw; each cell is rendered as an 8x8 pixel tile.
    interval : int, optional
        Delay between frames in milliseconds (default 200).

    Raises
    ------
    ValueError
        If `states` is empty, since there is no first frame to show.

    The animation is shown with ``display``; nothing is returned.
    """
    if(len(states)==0):
        raise ValueError("states must contain at least one state to animate")

    frames = _render_frames(states,world)

    fig = plt.figure(figsize=(world.shape[1]/2,world.shape[0]/2))
    fig.subplots_adjust(left=0, bottom=0, right=1, top=1, wspace=None, hspace=None)
    # Pin the colour range to the tile codes 0..4 so the code -> colour mapping
    # does not depend on which values happen to occur in the first frame.
    im = plt.imshow(frames[0],
                    cmap=colors.ListedColormap(['grey','black','red','yellow','pink']),
                    vmin=0,vmax=4)

    def animate_func(i):
        im.set_array(frames[i])
        return

    anim = animation.FuncAnimation(fig,
                                   animate_func,
                                   frames=len(frames),
                                   interval=interval
                                   )

    html = HTML(anim.to_jshtml())
    display(html)
    # Close this specific figure so the static first frame is not shown as well.
    plt.close(fig)

def numpy_2d_index(state,all_states):
    """Locate `state` inside `all_states`.

    Returns the index of the first entry whose elements all equal those of
    `state`, or None when no entry matches.
    """
    matches = (position for position,candidate in enumerate(all_states)
               if (candidate==state).all())
    return next(matches,None)
    
def get_transition_matrix(all_states,policy):
    """Assemble the state-transition matrix produced by following `policy`.

    ``policy[s]`` is called as ``policy[s](all_states[s], all_states)`` and is
    expected to return ``(result_states, probabilities)``. Entry ``[j, s]`` of
    the returned square matrix accumulates the probability of ending in state
    ``j`` when starting from state ``s``, so each column belongs to one
    source state.
    """
    n_states = len(all_states)
    matrix = np.zeros((n_states,n_states))
    for source,state in enumerate(all_states):
        outcomes,probabilities = policy[source](state,all_states)
        for k,outcome in enumerate(outcomes):
            # Several outcomes can land on the same state (e.g. bumping into
            # walls), hence the accumulation instead of plain assignment.
            target = numpy_2d_index(outcome,all_states)
            matrix[target,source] += probabilities[k]
    return matrix

def get_transition_matrix_gold_restart(restart_state,all_states,world,policy):
    """Transition matrix for `policy` in which gold cells send you to `restart_state`.

    Starts from ``get_transition_matrix(all_states, policy)`` and, for every
    state whose cell in `world` is 'G', replaces that state's column so that
    the next state is `restart_state` with probability 1.

    Parameters
    ----------
    restart_state : (row, col) pair that must be one of `all_states`
    all_states : array of (row, col) pairs, one per state
    world : 2-D array of cell symbols indexed as ``world[row, col]``
    policy : sequence with one action function per entry of `all_states`

    Raises
    ------
    ValueError
        If `restart_state` is not contained in `all_states`.
    """
    restart_index = numpy_2d_index(restart_state,all_states)
    if(restart_index is None):
        # Used as an index, None would act as np.newaxis and silently
        # overwrite a whole row of the matrix instead of a single entry.
        raise ValueError("restart_state %r is not one of all_states" % (restart_state,))

    matrix = get_transition_matrix(all_states,policy)
    for s,state in enumerate(all_states):
        if(world[state[0],state[1]]=='G'):
            matrix[:,s]=0
            matrix[restart_index,s]=1
    return matrix

In [183]:
world = np.array([['W','W','W','W','W'],
                  ['W','S','S','G','W'],
                  ['W','S','F','S','W'],
                  ['W','S','F','S','W'],
                  ['W','S','S','S','W'],
                  ['W','W','W','W','W']])
all_states = get_states(world)
# Starting position, also used as the position to return to after the gold.
state = np.array([3,1])
# One action per entry of all_states, written in the same layout as the
# non-wall cells of `world` (three cells per row, top row first).
policy = [right_action,right_action,right_action,
          up_action,   up_action,   up_action,
          up_action,   right_action,up_action,
          right_action,right_action,up_action]
assert len(policy)==len(all_states), "policy needs exactly one action per state"
transition_matrix = get_transition_matrix_gold_restart(state,all_states,world,policy)
state_index = numpy_2d_index(state,all_states)

In [186]:
# Sample a 200-step trajectory: column `state_index` of `transition_matrix`
# is used as the probability distribution over the next state index.
# NOTE: `state` and `state_index` are overwritten on every step, so re-running
# this cell continues from wherever the previous run ended.
state_hist = [state]
for iteration in range(200):
    state_index = np.random.choice(len(all_states),p=transition_matrix[:,state_index])
    state = all_states[state_index]
    state_hist.append(state)

In [187]:
# Show the sampled trajectory as an inline animation of the grid world.
make_animation(state_hist,world)