In [None]:
import numpy as np

In [None]:
# Cell types used in the grid-world environments of this notebook.
TYPE_STATE = ['normal', 'stone', 'gem', 'hole']

# Available moves. The order matters: the index of an action in this list is
# used as its column index in the Q-table.
ACTIONS = ['up', 'down', 'left', 'right']

# Arrow shown for each action when a policy is displayed (same order as ACTIONS).
ACTIONSYMBOLS = dict(zip(ACTIONS, ["↑", "↓", "←", "→"]))

---

# Environments

In [None]:
# 3x4 robot grid world, indexed as [row, column] with row 0 at the top.
# Every cell is 'normal' except the gem, the stone and the hole.
EnvRobot = np.array([
    ['normal', 'normal', 'normal', 'gem'],
    ['normal', 'stone', 'normal', 'hole'],
    ['normal', 'normal', 'normal', 'normal'],
])

![EnvRobot.png](EnvRobot.png)

---

# Dynamics of the environment

We determine the next state and reward given the current state and action.

In [None]:
def NextStateAndReward(Position, Action, Rewards={'Tired':0, 'Stone':0, 'Gem':1, 'Hole':-1}, Environment=EnvRobot):
    """Return the next position and the reward for taking ``Action`` in ``Position``.

    Parameters
    ----------
    Position : 2-tuple ``(row, column)``
        Current cell of the agent.
    Action : str
        One of ``'up'``, ``'down'``, ``'left'``, ``'right'``.
    Rewards : dict
        Rewards keyed by ``'Tired'`` (entering a normal cell), ``'Stone'``
        (bumping into a stone), ``'Gem'`` and ``'Hole'``. The default dict is
        only read, never modified.
    Environment : 2-D array of cell types
        Grid world; each entry is ``'normal'``, ``'stone'``, ``'gem'`` or ``'hole'``.

    Returns
    -------
    tuple
        ``(NextPosition, Reward)``. The agent stays in ``Position`` when the
        move would leave the grid (reward fixed at 0) or hit a stone
        (reward ``Rewards['Stone']``).

    Raises
    ------
    ValueError
        If ``Action`` is not one of the four known actions. Previously any
        unknown value (a typo, or ``None``) was silently treated as ``'right'``.
    """
    # (row, column) offsets; row 0 is the top row, so 'up' decreases the row index.
    Moves = {'up': (-1, 0), 'down': (1, 0), 'left': (0, -1), 'right': (0, 1)}
    if Action not in Moves:
        raise ValueError(
            "Unknown action {!r}; expected one of {}".format(Action, list(Moves))
        )

    Worldsize = Environment.shape
    DeltaRow, DeltaColumn = Moves[Action]
    Candidate = (Position[0] + DeltaRow, Position[1] + DeltaColumn)

    InsideGrid = (
        0 <= Candidate[0] <= Worldsize[0] - 1
        and 0 <= Candidate[1] <= Worldsize[1] - 1
    )
    if not InsideGrid:
        # Moving off the grid: stay put. This reward is a fixed 0 and is not
        # taken from Rewards.
        return (Position, 0)

    CellType = Environment[Candidate]
    if CellType == 'stone':
        # Stones block the move: the agent stays where it was.
        return (Position, Rewards['Stone'])
    if CellType == 'normal':
        return (Candidate, Rewards['Tired'])
    if CellType == 'gem':
        return (Candidate, Rewards['Gem'])
    # Any remaining cell type is treated as a hole.
    return (Candidate, Rewards['Hole'])

## Slippery dynamics

If the floor is slippery, the action that is actually executed is stochastic: it may differ from the action the agent chose.

In [None]:
# Placeholder slip models, still to be filled in.
# NOTE(review): presumably each maps an intended action to a dict
# {actual action: probability} - confirm before filling these in.

# One (currently empty) entry per intended action.
SlipperyOpposite = {Intended: {} for Intended in ('up', 'down', 'left', 'right')}

SlipperyRandom = dict()

SlipperyRandomNotOpposite = dict()

SlipperyCompletelyRandom = dict()

---

# Iterative Policy Evaluation

In [None]:
def PolicyEvaluation(
        Policy,
        Environment = EnvRobot,
        Rewards={'Tired':0, 'Stone':0, 'Gem':1, 'Hole':-1},
        DiscountRate=0.9,
        NumberIterations=100,
        DeterministicPolicy=True,
        Slippery=False,
        SlipperyDistribution=None,
        Story=False
    ):
    """Iterative policy evaluation: estimate the state-value function of ``Policy``.

    Each iteration is a synchronous sweep: the new values of all cells are
    computed from the values of the previous iteration. Only ``'normal'``
    cells are updated; every other cell keeps the value 0.

    Parameters
    ----------
    Policy : callable
        Called as ``Policy((i, j))``. With ``DeterministicPolicy=True`` it must
        return a single action. Otherwise it is assumed to return a dict
        ``{action: probability}`` (TODO confirm once the stochastic policies
        are filled in).
    Environment : 2-D array of cell types
        Grid world to evaluate the policy in.
    Rewards : dict
        Forwarded unchanged to ``NextStateAndReward``.
    DiscountRate : float
        Discount factor applied to the value of the next state.
    NumberIterations : int
        Number of sweeps over the grid.
    DeterministicPolicy : bool
        Whether ``Policy`` returns one action or a distribution over actions.
    Slippery : bool
        If True, the executed action is drawn from
        ``SlipperyDistribution[chosen action]``.
    SlipperyDistribution : dict or None
        Required when ``Slippery`` is True. Assumed to map each chosen action
        to a dict ``{actual action: probability}`` (TODO confirm).
    Story : bool
        If True, print the rounded value table after every sweep.

    Returns
    -------
    numpy.ndarray
        Array with the shape of ``Environment`` holding the estimated values.

    Raises
    ------
    ValueError
        If ``Slippery`` is True but no ``SlipperyDistribution`` is given.
    """
    if Slippery and SlipperyDistribution is None:
        raise ValueError("SlipperyDistribution must be provided when Slippery is True")

    Worldsize = Environment.shape

    StateValue = np.zeros(Worldsize)

    for _ in range(NumberIterations):
        NewStateValue = np.zeros(Worldsize)
        for i in range(Worldsize[0]):
            for j in range(Worldsize[1]):
                if Environment[i,j] != 'normal':
                    # Stones, gems and holes are never updated.
                    continue

                # Distribution over the actions the policy chooses in (i, j).
                # A deterministic policy is a distribution with one entry of
                # probability 1, which leaves the arithmetic unchanged.
                if DeterministicPolicy:
                    ChosenActions = {Policy((i,j)): 1.0}
                else:
                    ChosenActions = Policy((i,j))

                for Action, ProbAction in ChosenActions.items():
                    # Distribution over the action actually executed.
                    if Slippery:
                        ActualActions = SlipperyDistribution[Action]
                    else:
                        ActualActions = {Action: 1.0}

                    for ActualAction, ProbActualAction in ActualActions.items():
                        NextPosition, Reward = NextStateAndReward((i,j), ActualAction, Rewards, Environment)
                        NewStateValue[i,j] += ProbAction * ProbActualAction * (
                            Reward + DiscountRate * StateValue[NextPosition]
                        )

        StateValue = np.copy(NewStateValue)

        if Story:
            print(StateValue.round(2))
            print()

    return StateValue

---

# Iterative action-value evaluation

In [None]:
def ConvertStateActionToPositionTable(s, a, Environment=EnvRobot):
    """Translate a (state, action) pair into a (row, column) index of the Q-table.

    The row is the row-major flat index of cell ``s = (i, j)`` in
    ``Environment``; the column is the position of action ``a`` in ``ACTIONS``.
    """
    NumberColumns = Environment.shape[1]
    StateRow = s[0] * NumberColumns + s[1]
    ActionColumn = ACTIONS.index(a)
    return (StateRow, ActionColumn)

In [None]:
def PolicyActionEvaluation(
        Policy,
        Environment=EnvRobot,
        Rewards={'Tired':0, 'Stone':0, 'Gem':1, 'Hole':-1},
        DiscountRate=0.9,
        NumberIterations=100,
        DeterministicPolicy=True,
        Slippery=False,
        SlipperyDistribution=None,
        Story=False
    ):
    """Iterative evaluation of the action-value function (Q-table) of ``Policy``.

    The Q-table has one row per grid cell (row-major order) and one column per
    entry of ``ACTIONS``; see ``ConvertStateActionToPositionTable``. Each
    iteration is a synchronous sweep computed from the previous table. Only
    ``'normal'`` cells are updated, and within a cell only the actions the
    policy can choose there; every other entry stays 0.

    Parameters
    ----------
    Policy : callable
        Called as ``Policy((i, j))``. With ``DeterministicPolicy=True`` it must
        return a single action. Otherwise it is assumed to return a dict
        ``{action: probability}`` (TODO confirm once the stochastic policies
        are filled in).
    Environment : 2-D array of cell types
        Grid world to evaluate the policy in.
    Rewards : dict
        Forwarded unchanged to ``NextStateAndReward``.
    DiscountRate : float
        Discount factor applied to the value of the next state-action pair.
    NumberIterations : int
        Number of sweeps over the grid.
    DeterministicPolicy : bool
        Whether ``Policy`` returns one action or a distribution over actions.
    Slippery : bool
        If True, the executed action is drawn from
        ``SlipperyDistribution[chosen action]``.
    SlipperyDistribution : dict or None
        Required when ``Slippery`` is True. Assumed to map each chosen action
        to a dict ``{actual action: probability}`` (TODO confirm).
    Story : bool
        If True, print the rounded Q-table after every sweep.

    Returns
    -------
    numpy.ndarray
        Array of shape ``(rows * columns, len(ACTIONS))``.

    Raises
    ------
    ValueError
        If ``Slippery`` is True but no ``SlipperyDistribution`` is given.
    """
    if Slippery and SlipperyDistribution is None:
        raise ValueError("SlipperyDistribution must be provided when Slippery is True")

    Worldsize = Environment.shape

    QTable = np.zeros((Worldsize[0] * Worldsize[1], len(ACTIONS)))

    def ActionDistribution(Position):
        # Actions the policy may choose in Position, with their probabilities.
        if DeterministicPolicy:
            return {Policy(Position): 1.0}
        return Policy(Position)

    for _ in range(NumberIterations):
        NewQTable = np.zeros(QTable.shape)
        for i in range(Worldsize[0]):
            for j in range(Worldsize[1]):
                if Environment[i,j] != 'normal':
                    # Stones, gems and holes are never updated.
                    continue

                for Action in ActionDistribution((i,j)):
                    Index = ConvertStateActionToPositionTable((i,j), Action, Environment)

                    # Distribution over the action actually executed.
                    if Slippery:
                        ActualActions = SlipperyDistribution[Action]
                    else:
                        ActualActions = {Action: 1.0}

                    for ActualAction, ProbActualAction in ActualActions.items():
                        NextPosition, Reward = NextStateAndReward((i,j), ActualAction, Rewards, Environment)

                        # Expected value of continuing with the policy from the
                        # next state. Non-normal next states contribute nothing.
                        Aux = 0
                        if Environment[NextPosition] == 'normal':
                            for ActionPrime, ProbActionPrime in ActionDistribution(NextPosition).items():
                                Aux += ProbActionPrime * QTable[
                                    ConvertStateActionToPositionTable(NextPosition, ActionPrime, Environment)
                                ]

                        NewQTable[Index] += ProbActualAction * (Reward + DiscountRate * Aux)

        QTable = np.copy(NewQTable)

        if Story:
            print(QTable.round(2))
            print()

    return QTable

In [None]:
def PolicyExtraction(QTable, Environment=EnvRobot):
    """Build the greedy policy of a Q-table as a grid of arrow symbols.

    ``QTable`` has one row per grid cell (row-major order) and one column per
    entry of ``ACTIONS``. For every cell the action with the largest value is
    looked up in ``ACTIONSYMBOLS``. Cells of ``Environment`` that are not
    ``'normal'`` are shown as an empty string.
    """
    Worldsize = Environment.shape

    # Best action index per state, translated to its arrow symbol.
    BestActionIndices = np.argmax(QTable, axis=1)
    Symbols = [ACTIONSYMBOLS[ACTIONS[Index]] for Index in BestActionIndices]
    Policy = np.array(Symbols).reshape((Worldsize[0], Worldsize[1]))

    # Blank out every cell that is not 'normal'.
    Policy[Environment != 'normal'] = ''

    return Policy

---

# Policies

## Deterministic policies

In [None]:
def Policy1(Position):
    """Deterministic policy 1: return the action chosen in ``Position``.

    ``Position`` must be a 2-tuple ``(i, j)``. Cells without an action are
    stored as ``None``.
    """
    ActionGrid = np.array(
        [
            ['right', 'right', 'right', None],
            ['up', None, 'up', None],
            ['up', 'right', 'up', 'left'],
        ],
        dtype=object,
    )

    return ActionGrid[Position]

![Policy1.png](Policy1.png)

In [None]:
def Policy2(Position):
    """Deterministic policy 2: return the action chosen in ``Position``.

    ``Position`` must be a 2-tuple ``(i, j)``. Cells without an action are
    stored as ``None``.
    """
    ActionGrid = np.array(
        [
            ['right', 'right', 'right', None],
            ['up', None, 'right', None],
            ['right', 'right', 'right', 'up'],
        ],
        dtype=object,
    )

    return ActionGrid[Position]

![Policy2.png](Policy2.png)

In [None]:
def Policy3(Position):
    """Deterministic policy 3: return the action chosen in ``Position``.

    ``Position`` must be a 2-tuple ``(i, j)``. Cells without an action are
    stored as ``None``.
    """
    ActionGrid = np.array(
        [
            ['right', 'right', 'left', None],
            ['down', None, 'right', None],
            ['up', 'left', 'up', 'left'],
        ],
        dtype=object,
    )

    return ActionGrid[Position]

![Policy3.png](Policy3.png)

## Stochastic policies

In [None]:
def Policy4(Position, Environment=EnvRobot):
    """Stochastic policy 4: return the action distribution for ``Position``.

    ``Position`` must be a 2-tuple ``(i, j)``. Returns ``None`` for any cell of
    ``Environment`` that is not ``'normal'``. The distribution itself is still
    an empty placeholder.
    """
    if Environment[Position] != 'normal':
        return None

    # Placeholder, to be filled in.
    # NOTE(review): presumably {action: probability} - confirm.
    Distribution = {}
    return Distribution

![Policy4.png](Policy4.png)

---

# Deterministic policy and deterministic environment

## Policy 1

## Policy 2

## Policy 3

---

# Deterministic policy and stochastic environment

## Policy 1, Opposite slippery

## Policy 2, Opposite slippery

## Policy 3, Opposite slippery

---

# Stochastic policy and deterministic environment

## Policy 4

---

# Stochastic policy and stochastic environment

## Policy 4, Opposite slippery

---

# Action-value function

## Robot Environment, no slippery

In [None]:
# PolicyActionEvaluation(Policy1).round(2)

In [None]:
# PolicyActionEvaluation(Policy2).round(2)

In [None]:
# Q-table of the stochastic policy Policy4 in the robot environment, no slippery floor.
QTablePolicy4 = PolicyActionEvaluation(Policy4, DeterministicPolicy=False)

In [None]:
# Greedy policy (grid of arrow symbols) derived from the Q-table; shown as the cell output.
PolicyExtraction(QTablePolicy4)