# 1 Preamble

In [None]:
import numpy as np
import random

import matplotlib.pyplot as plt
import seaborn as sns

In [None]:
# Kinds of cell that can appear in the grid world.
TYPE_STATE = [
    'normal',
    'stone',
    'gem',
    'hole',
]

# Moves available to the robot. The position of an action in this list is
# used (via ACTIONS.index) as its column in the Q-table.
ACTIONS = [
    'up',
    'down',
    'left',
    'right',
]

# Arrow drawn for each action when a policy is displayed; the empty string
# stands for "no action" and is drawn as an empty annotation.
ACTIONSYMBOLS = {
    'up': "↑",
    'down': "↓",
    'left': "←",
    'right': "→",
    '': '',
}

# 2 Environment

In [None]:
# 3x4 grid world, written out cell by cell: the gem sits in the top-right
# corner, the hole directly below it, and a stone blocks cell (1, 1).
EnvRobot = np.array([
    ['normal', 'normal', 'normal', 'gem'],
    ['normal', 'stone', 'normal', 'hole'],
    ['normal', 'normal', 'normal', 'normal'],
])

![EnvRobot.png](EnvRobot.png)

# 3 Dynamics of the environment

We determine the next state and reward given the current state and action.

In [None]:
def NextStateAndReward(Position, Action, Rewards={'tired':0, 'stone':0, 'gem':1, 'hole':-1}, Environment=EnvRobot):
    """Return the position reached and the reward obtained by one move.

    Parameters
    ----------
    Position : tuple
        Current (row, column) position in ``Environment``.
    Action : str
        One of 'up', 'down', 'left', 'right'.
    Rewards : dict
        Reward for each outcome, keyed by 'tired' (ordinary move or bumping
        into the border), 'stone', 'gem' and 'hole'. Only read, never mutated.
    Environment : numpy.ndarray
        2-D array holding the type of every cell.

    Returns
    -------
    tuple
        ``(NextPosition, Reward)``. The robot stays at ``Position`` when the
        move would leave the grid or would enter a 'stone' cell.

    Raises
    ------
    ValueError
        If ``Action`` is not a known action. Previously any unknown value
        was silently treated as 'right'.
    """
    # (row step, column step) for every legal action.
    Moves = {'up': (-1, 0), 'down': (1, 0), 'left': (0, -1), 'right': (0, 1)}
    if Action not in Moves:
        raise ValueError(
            "Unknown action {!r}; expected one of {}".format(Action, list(Moves))
        )

    RowStep, ColumnStep = Moves[Action]
    Candidate = (Position[0] + RowStep, Position[1] + ColumnStep)

    # A move that would leave the grid keeps the robot where it is.
    Worldsize = Environment.shape
    InsideGrid = (
        0 <= Candidate[0] <= Worldsize[0] - 1
        and 0 <= Candidate[1] <= Worldsize[1] - 1
    )
    if not InsideGrid:
        return (Position, Rewards['tired'])

    CellType = Environment[Candidate]
    if CellType == 'normal':
        return (Candidate, Rewards['tired'])
    if CellType == 'stone':
        # The stone blocks the move: the robot does not change position.
        return (Position, Rewards['stone'])
    if CellType == 'gem':
        return (Candidate, Rewards['gem'])
    # Any remaining cell type is handled as a hole, as in the original code.
    return (Candidate, Rewards['hole'])

## 3.1 Stochastic environment

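If the floor is slippery, the action actually executed is stochastic: the robot may end up moving in a direction other than the one it chose. Each distribution below gives, for every intended action, the probability of each action actually being executed.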

In [None]:
# Each slippery distribution maps the intended action to a dict
# {actual action: probability}.

# The intended move is executed with probability 0.8 and the opposite move
# with probability 0.2; the two perpendicular moves never happen.
SlipperyOpposite = {
    'up': {'up':0.8, 'down':0.2, 'left':0, 'right':0},
    'down': {'up':0.2, 'down':0.8, 'left':0, 'right':0},
    'left': {'up':0, 'down':0, 'left':0.8, 'right':0.2},
    'right': {'up':0, 'down':0, 'left':0.2, 'right':0.8}
}

# NOTE(review): the three distributions below are still empty placeholders.
# Indexing any of them by an action raises KeyError until they are filled in.
# The intended probabilities are not defined anywhere in this notebook —
# TODO confirm with the author.
SlipperyRandom = {
}

SlipperyRandomNotOpposite = {
}

SlipperyCompletelyRandom = {
    
}

# 4 Iterative state value evaluation

In [None]:
def PolicyValueEvaluation(
        Policy,
        Environment = EnvRobot,
        Rewards={'tired':0, 'stone':0, 'gem':1, 'hole':-1},
        DiscountRate=0.9,
        NumberIterations=100,
        DeterministicPolicy=True,
        Slippery=False,
        SlipperyDistribution=None,
        Story=False
    ):
    """Estimate the state values of ``Policy`` by iterative policy evaluation.

    Parameters
    ----------
    Policy : callable
        Called as ``Policy((row, column))`` for 'normal' cells only. Returns
        an action name when ``DeterministicPolicy`` is True, otherwise a dict
        ``{action: probability}``.
    Environment : numpy.ndarray
        2-D array holding the type of every cell.
    Rewards : dict
        Rewards forwarded to ``NextStateAndReward``.
    DiscountRate : float
        Discount applied to the value of the next state.
    NumberIterations : int
        Number of full sweeps over the grid.
    DeterministicPolicy : bool
        Whether ``Policy`` returns a single action or a distribution.
    Slippery : bool
        When True the executed action is drawn from
        ``SlipperyDistribution[intended action]``.
    SlipperyDistribution : dict or None
        ``{intended action: {actual action: probability}}``; required when
        ``Slippery`` is True.
    Story : bool
        When True, print the rounded value table after every sweep.

    Returns
    -------
    numpy.ndarray
        Array with the shape of ``Environment``. Cells that are not 'normal'
        are never updated and keep the value 0.

    Raises
    ------
    ValueError
        If ``Slippery`` is True but no ``SlipperyDistribution`` is given.
    """
    if Slippery and SlipperyDistribution is None:
        raise ValueError("SlipperyDistribution must be provided when Slippery is True")

    Worldsize = Environment.shape

    StateValue = np.zeros(Worldsize)

    for _ in range(NumberIterations):
        # Every sweep reads only the previous table (synchronous update).
        NewStateValue = np.zeros(Worldsize)
        for i in range(Worldsize[0]):
            for j in range(Worldsize[1]):
                if Environment[i,j] != 'normal':
                    continue

                # Distribution over the actions the policy chooses here; a
                # deterministic policy puts all the mass on one action.
                if DeterministicPolicy:
                    ActionDistribution = {Policy((i,j)): 1.0}
                else:
                    ActionDistribution = Policy((i,j))

                for Action, ProbAction in ActionDistribution.items():
                    # Distribution over the actions actually executed.
                    if Slippery:
                        ActualDistribution = SlipperyDistribution[Action]
                    else:
                        ActualDistribution = {Action: 1.0}

                    for ActualAction, ProbActualAction in ActualDistribution.items():
                        NextPosition, Reward = NextStateAndReward((i,j), ActualAction, Rewards, Environment)
                        NewStateValue[i,j] += ProbAction * ProbActualAction * (
                            Reward + DiscountRate * StateValue[NextPosition]
                        )

        # NewStateValue is a fresh array each sweep, so no copy is needed.
        StateValue = NewStateValue

        if Story:
            print(StateValue.round(2))
            print()

    return StateValue

In [None]:
def PlotPolicyValue(Policy, Value, DeterministicPolicy=True):
    """Draw ``Value`` as a heatmap on a fixed [-1, 1] colour scale.

    For a deterministic policy every cell is annotated with the arrow of
    the action stored in the ``Policy`` array; otherwise ``Policy`` is not
    used and the heatmap is drawn without annotations.
    """
    # Options shared by both variants of the plot.
    HeatmapOptions = dict(
        cmap='RdBu',
        vmin=-1,
        vmax=1,
        linewidths=0.7,
        linecolor="black",
        xticklabels=[],
        yticklabels=[],
        square=True,
    )

    if DeterministicPolicy:
        # Translate action names into arrow symbols, cell by cell.
        Arrows = np.vectorize(ACTIONSYMBOLS.get)(Policy)
        HeatmapOptions.update(annot=Arrows, fmt='')

    sns.heatmap(Value, **HeatmapOptions)

# 5 Iterative state-action value evaluation

In [None]:
def ConvertStateActionToPositionTable(s, a, Environment=EnvRobot):
    """Map a state ``s`` = (row, column) and action ``a`` to a Q-table index.

    The row of the table is the row-major index of the cell in
    ``Environment``; the column is the position of ``a`` in ``ACTIONS``.
    """
    NumberColumns = Environment.shape[1]
    StateIndex = s[0] * NumberColumns + s[1]
    ActionIndex = ACTIONS.index(a)
    return (StateIndex, ActionIndex)

In [None]:
def PolicyActionEvaluation(
        Policy,
        Environment = EnvRobot,
        Rewards={'tired':0, 'stone':0, 'gem':1, 'hole':-1},
        DiscountRate=0.9,
        NumberIterations=100,
        DeterministicPolicy=True,
        Slippery=False,
        SlipperyDistribution=None,
        Story=False
    ):
    """Estimate the state-action values of ``Policy`` iteratively.

    For every 'normal' cell and every action in ``ACTIONS`` the table stores
    the expected reward of taking that action once plus the discounted
    value of following ``Policy`` afterwards. Next states that are not
    'normal' contribute no future value.

    Parameters
    ----------
    Policy : callable
        Called as ``Policy((row, column))`` for 'normal' cells only. Returns
        an action name when ``DeterministicPolicy`` is True, otherwise a dict
        ``{action: probability}``.
    Environment : numpy.ndarray
        2-D array holding the type of every cell.
    Rewards : dict
        Rewards forwarded to ``NextStateAndReward``.
    DiscountRate : float
        Discount applied to the value of the next state.
    NumberIterations : int
        Number of full sweeps over the grid.
    DeterministicPolicy : bool
        Whether ``Policy`` returns a single action or a distribution.
    Slippery : bool
        When True the executed action is drawn from
        ``SlipperyDistribution[intended action]``.
    SlipperyDistribution : dict or None
        ``{intended action: {actual action: probability}}``; required when
        ``Slippery`` is True.
    Story : bool
        When True, print the rounded Q-table after every sweep.

    Returns
    -------
    numpy.ndarray
        Table of shape ``(rows * columns, len(ACTIONS))`` indexed as given
        by ``ConvertStateActionToPositionTable``. Rows of cells that are not
        'normal' stay at 0.

    Raises
    ------
    ValueError
        If ``Slippery`` is True but no ``SlipperyDistribution`` is given.
    """
    if Slippery and SlipperyDistribution is None:
        raise ValueError("SlipperyDistribution must be provided when Slippery is True")

    Worldsize = Environment.shape

    QTable = np.zeros((Worldsize[0] * Worldsize[1], len(ACTIONS)))

    for _ in range(NumberIterations):
        # Every sweep reads only the previous table (synchronous update).
        NewQTable = np.zeros(QTable.shape)
        for i in range(Worldsize[0]):
            for j in range(Worldsize[1]):
                if Environment[i,j] != 'normal':
                    continue

                # Q is filled for every action, not only the one the policy
                # picks, so that a greedy improvement step can compare them.
                for Action in ACTIONS:
                    if Slippery:
                        ActualDistribution = SlipperyDistribution[Action]
                    else:
                        ActualDistribution = {Action: 1.0}

                    Index = ConvertStateActionToPositionTable((i,j), Action, Environment)

                    for ActualAction, ProbActualAction in ActualDistribution.items():
                        NextPosition, Reward = NextStateAndReward((i,j), ActualAction, Rewards, Environment)

                        # Expected value of following the policy from the
                        # next state; only 'normal' cells have a policy.
                        Aux = 0
                        if Environment[NextPosition] == 'normal':
                            if DeterministicPolicy:
                                NextDistribution = {Policy(NextPosition): 1.0}
                            else:
                                NextDistribution = Policy(NextPosition)
                            for ActionPrime, ProbActionPrime in NextDistribution.items():
                                IndexPrime = ConvertStateActionToPositionTable(
                                    NextPosition, ActionPrime, Environment
                                )
                                Aux += ProbActionPrime * QTable[IndexPrime]

                        NewQTable[Index] += ProbActualAction * (Reward + DiscountRate * Aux)

        # NewQTable is a fresh array each sweep, so no copy is needed.
        QTable = NewQTable

        if Story:
            print(QTable.round(2))
            print()

    return QTable

# 6 Iterative policy improvement

In [None]:
def GreedyPolicy(QTable, Environment=EnvRobot):
    """Build the policy array that is greedy with respect to ``QTable``.

    Each row of ``QTable`` holds the action values of one cell, in
    row-major cell order. The best action is chosen per row; when all
    values in a row are (numerically) equal the action is drawn at random.
    Cells of ``Environment`` that are not 'normal' get the empty action ''.

    Returns an array of action names with the first two dimensions of
    ``Environment``.
    """
    NumberRows, NumberColumns = Environment.shape[0], Environment.shape[1]

    Choices = []
    for ActionValues in QTable:
        AllTied = np.all(np.isclose(ActionValues, ActionValues[0]))
        if AllTied:
            # No action stands out: pick one uniformly at random.
            Choices.append(random.choice(ACTIONS))
        else:
            Choices.append(ACTIONS[np.argmax(ActionValues)])

    Policy = np.array(Choices).reshape((NumberRows, NumberColumns))

    # No action is defined outside the 'normal' cells.
    Policy[Environment != 'normal'] = ''

    return Policy

In [None]:
def ValueFromQTable(QTable, Policy, Environment=EnvRobot, DeterministicPolicy=True):
    """Derive the state values of ``Policy`` from its state-action values.

    Parameters
    ----------
    QTable : numpy.ndarray
        Table indexed as given by ``ConvertStateActionToPositionTable``.
    Policy : callable
        Called as ``Policy((row, column))`` for 'normal' cells only. Returns
        an action name when ``DeterministicPolicy`` is True, otherwise a dict
        ``{action: probability}``.
    Environment : numpy.ndarray
        2-D array holding the type of every cell.
    DeterministicPolicy : bool
        Whether ``Policy`` returns a single action or a distribution.

    Returns
    -------
    numpy.ndarray
        Array with the shape of ``Environment``; cells that are not 'normal'
        keep the value 0.
    """
    Worldsize = Environment.shape

    StateValue = np.zeros(Worldsize)

    for i in range(Worldsize[0]):
        for j in range(Worldsize[1]):
            if Environment[i,j] != 'normal':
                continue

            if DeterministicPolicy:
                # The state value is the Q-value of the action the policy picks.
                Action = Policy((i,j))
                StateValue[i,j] = QTable[
                    ConvertStateActionToPositionTable((i,j), Action, Environment)
                ]
            else:
                # The state value is the policy-weighted average of the Q-values.
                for Action, ProbAction in Policy((i,j)).items():
                    StateValue[i,j] += ProbAction * QTable[
                        ConvertStateActionToPositionTable((i,j), Action, Environment)
                    ]

    return StateValue

In [None]:
def PolicyIterativeEstimation(
        Policy,
        Environment = EnvRobot,
        Rewards={'tired':0, 'stone':0, 'gem':1, 'hole':-1},
        DiscountRate=0.9,
        NumberIterations=100,
        DeterministicPolicy=True,
        Slippery=False,
        SlipperyDistribution=None,
        Story=False
    ):
    """Improve ``Policy`` by alternating evaluation and greedy improvement.

    Each iteration evaluates the current policy with
    ``PolicyActionEvaluation`` (run for ``NumberIterations`` sweeps),
    derives its state values with ``ValueFromQTable`` and replaces the
    policy by the greedy one given by ``GreedyPolicy``. After the first
    improvement the policy is always deterministic.

    Parameters
    ----------
    Policy : callable
        Starting policy; see ``PolicyActionEvaluation`` for its contract.
    Environment, Rewards, DiscountRate, Slippery, SlipperyDistribution
        Forwarded to ``PolicyActionEvaluation``.
    NumberIterations : int
        Number of evaluation/improvement rounds; must be at least 1.
    DeterministicPolicy : bool
        Whether the starting ``Policy`` returns a single action or a
        distribution.
    Story : bool
        When True, print the Q-table, state values and policy of every round.

    Returns
    -------
    tuple
        ``(PolicyArray, ValuePolicy)``: the last greedy policy as an array
        of action names, and the state values of the policy it was derived
        from (the one evaluated in the last round).

    Raises
    ------
    ValueError
        If ``NumberIterations`` is smaller than 1, since no policy would be
        produced.
    """
    if NumberIterations < 1:
        raise ValueError("NumberIterations must be at least 1")

    for Iteration in range(NumberIterations):
        # Policy Evaluation
        QTable = PolicyActionEvaluation(
            Policy,
            Environment=Environment,
            Rewards=Rewards,
            DiscountRate=DiscountRate,
            NumberIterations=NumberIterations,
            DeterministicPolicy=DeterministicPolicy,
            Slippery=Slippery,
            SlipperyDistribution=SlipperyDistribution
        )

        ValuePolicy = ValueFromQTable(
            QTable,
            Policy,
            Environment=Environment,
            DeterministicPolicy=DeterministicPolicy
        )

        # Policy Improvement
        PolicyArray = GreedyPolicy(QTable, Environment=Environment)

        # The greedy policy is deterministic whatever the starting policy was.
        DeterministicPolicy = True

        # Bind the current array as a default so the function keeps
        # referring to this round's policy.
        def Policy(Position, _PolicyArray=PolicyArray):
            return _PolicyArray[Position]

        if Story:
            print("QTable:\n")
            print(QTable.round(2))
            print()

            print("State Value:\n")
            print(ValuePolicy.round(2))
            print()

            print("k:{}\n".format(Iteration+1))
            print("Policy:\n")
            print(np.vectorize(ACTIONSYMBOLS.get)(PolicyArray))
            print()

    return PolicyArray, ValuePolicy

---

# 1 Policies

## 1.1 Deterministic policies

In [None]:
# Action prescribed by Policy 1 in every cell of the 3x4 grid; '' marks the
# cells where no action is defined.
Policy1_array = np.array(
    [
        ['right', 'right', 'right', ''],
        ['up', '', 'up', ''],
        ['up', 'right', 'up', 'left'],
    ]
)


def Policy1(Position):
    """Return the action Policy 1 prescribes at ``Position`` = (row, column)."""
    Action = Policy1_array[Position]
    return Action

![Policy1.png](Policy1.png)

In [None]:
# Action prescribed by Policy 2 in every cell of the 3x4 grid; '' marks the
# cells where no action is defined.
Policy2_array = np.array(
    [
        ['right', 'right', 'right', ''],
        ['up', '', 'right', ''],
        ['right', 'right', 'right', 'up'],
    ]
)


def Policy2(Position):
    """Return the action Policy 2 prescribes at ``Position`` = (row, column)."""
    Action = Policy2_array[Position]
    return Action

![Policy2.png](Policy2.png)

In [None]:
# Action prescribed by Policy 3 in every cell of the 3x4 grid; '' marks the
# cells where no action is defined.
Policy3_array = np.array(
    [
        ['right', 'right', 'left', ''],
        ['down', '', 'right', ''],
        ['up', 'left', 'up', 'left'],
    ]
)


def Policy3(Position):
    """Return the action Policy 3 prescribes at ``Position`` = (row, column)."""
    Action = Policy3_array[Position]
    return Action

![Policy3.png](Policy3.png)

## 1.2 Stochastic policy

In [None]:
def Policy4(Position, Environment=EnvRobot):
    """Uniformly random policy.

    Returns a dict giving probability 0.25 to each of the four actions when
    the cell at ``Position`` is 'normal', and ``None`` for any other cell.
    """
    if Environment[Position] != 'normal':
        return None

    return dict.fromkeys(('up', 'down', 'left', 'right'), 0.25)

![Policy4.png](Policy4.png)

# 2 State value function evaluation

## 2.1 Deterministic policy and deterministic environment

### 2.1.1 Policy 1

In [None]:
# Show the first 5 evaluation sweeps for Policy 1 (Story=True prints the
# value table after every sweep).
PolicyValueEvaluation(
    Policy=Policy1,
    NumberIterations=5,
    Story=True
).round(2)

In [None]:
# Evaluate Policy 1 with the default number of sweeps and keep the result
# for the plot below.
ValuePolicy1 = PolicyValueEvaluation(Policy=Policy1)
ValuePolicy1.round(2)

In [None]:
# Heatmap of the state values, annotated with the arrows of Policy 1.
PlotPolicyValue(Policy1_array, ValuePolicy1)

### 2.1.2 Policy 2

In [None]:
# Show the first 5 evaluation sweeps for Policy 2.
PolicyValueEvaluation(
    Policy=Policy2,
    NumberIterations=5,
    Story=True
).round(2)

In [None]:
# Evaluate Policy 2 with the default number of sweeps and keep the result
# for the plot below.
ValuePolicy2 = PolicyValueEvaluation(Policy=Policy2)
ValuePolicy2.round(2)

In [None]:
# Heatmap of the state values, annotated with the arrows of Policy 2.
PlotPolicyValue(Policy2_array, ValuePolicy2)

### 2.1.3 Policy 3

In [None]:
# Show the first 5 evaluation sweeps for Policy 3.
PolicyValueEvaluation(
    Policy=Policy3,
    NumberIterations=5,
    Story=True
).round(2)

In [None]:
# Evaluate Policy 3 with the default number of sweeps and keep the result
# for the plot below.
ValuePolicy3 = PolicyValueEvaluation(Policy=Policy3)
ValuePolicy3.round(2)

In [None]:
# Heatmap of the state values, annotated with the arrows of Policy 3.
PlotPolicyValue(Policy3_array, ValuePolicy3)

## 2.2 Deterministic policy and stochastic environment

### 2.2.1 Policy 1, Opposite slippery

In [None]:
# Show the first 5 evaluation sweeps for Policy 1 when the executed action
# follows the SlipperyOpposite distribution.
PolicyValueEvaluation(
    Policy=Policy1,
    NumberIterations=5,
    Slippery=True,
    SlipperyDistribution=SlipperyOpposite,
    Story=True
).round(2)

In [None]:
# Evaluate Policy 1 on the slippery floor with the default number of sweeps.
ValueSlipperyPolicy1 = PolicyValueEvaluation(
    Policy=Policy1,
    Slippery=True,
    SlipperyDistribution=SlipperyOpposite
)

ValueSlipperyPolicy1.round(2)

In [None]:
# Heatmap of the slippery-floor state values, annotated with Policy 1.
PlotPolicyValue(Policy1_array, ValueSlipperyPolicy1)

### 2.2.2 Policy 2, Opposite slippery

In [None]:
# Show the first 5 evaluation sweeps for Policy 2 on the slippery floor.
PolicyValueEvaluation(
    Policy=Policy2,
    NumberIterations=5,
    Slippery=True,
    SlipperyDistribution=SlipperyOpposite,
    Story=True
).round(2)

In [None]:
# Evaluate Policy 2 on the slippery floor with the default number of sweeps.
ValueSlipperyPolicy2 = PolicyValueEvaluation(
    Policy=Policy2,
    Slippery=True,
    SlipperyDistribution=SlipperyOpposite
)

ValueSlipperyPolicy2.round(2)

In [None]:
# Heatmap of the slippery-floor state values, annotated with Policy 2.
PlotPolicyValue(Policy2_array, ValueSlipperyPolicy2)

### 2.2.3 Policy 3, Opposite slippery

In [None]:
# Show the first 5 evaluation sweeps for Policy 3 on the slippery floor.
PolicyValueEvaluation(
    Policy=Policy3,
    NumberIterations=5,
    Slippery=True,
    SlipperyDistribution=SlipperyOpposite,
    Story=True
).round(2)

In [None]:
# Evaluate Policy 3 on the slippery floor with the default number of sweeps.
ValueSlipperyPolicy3 = PolicyValueEvaluation(
    Policy=Policy3,
    Slippery=True,
    SlipperyDistribution=SlipperyOpposite
)

ValueSlipperyPolicy3.round(2)

In [None]:
# Heatmap of the slippery-floor state values, annotated with Policy 3.
PlotPolicyValue(Policy3_array, ValueSlipperyPolicy3)

## 2.3 Stochastic policy and deterministic environment

### 2.3.1 Policy 4

In [None]:
PolicyValueEvaluation(
    
).round(2)

In [None]:
ValuePolicy4 = PolicyValueEvaluation()

ValuePolicy4.round(2)

In [None]:
PlotPolicyValue()

## 2.4 Stochastic policy and stochastic environment

### 2.4.1 Policy 4, Opposite slippery

In [None]:
PolicyValueEvaluation(
    
).round(2)

In [None]:
ValueSlipperyPolicy4 = PolicyValueEvaluation(
    
)

In [None]:
PlotPolicyValue()

# 3 State-action value function evaluation

## 3.1 Deterministic policies and deterministic environment

### 3.1.1 Policy 1

In [None]:
QTablePolicy1 = PolicyActionEvaluation(Policy1)

In [None]:
QTablePolicy1.round(2)

In [None]:
ValueFromQTable(QTablePolicy1, Policy1).round(2)

### 3.1.2 Policy 2

In [None]:
QTablePolicy2 = PolicyActionEvaluation()

In [None]:
QTablePolicy2.round(2)

In [None]:
ValueFromQTable().round(2)

### 3.1.3 Policy 3

In [None]:
QTablePolicy3 = PolicyActionEvaluation()

In [None]:
QTablePolicy3.round(2)

In [None]:
ValueFromQTable().round(2)

## 3.2 Deterministic policy and stochastic environment

### 3.2.1 Policy 1, Opposite slippery

In [None]:
QTableSlipperyPolicy1 = PolicyActionEvaluation(
    
)

In [None]:
QTableSlipperyPolicy1.round(2)

In [None]:
ValueFromQTable().round(2)

### 3.2.2 Policy 2, Opposite slippery

In [None]:
QTableSlipperyPolicy2 = PolicyActionEvaluation(
    
)

In [None]:
QTableSlipperyPolicy2.round(2)

In [None]:
ValueFromQTable().round(2)

### 3.2.3 Policy 3, Opposite slippery

In [None]:
QTableSlipperyPolicy3 = PolicyActionEvaluation(
    
)

In [None]:
QTableSlipperyPolicy3.round(2)

In [None]:
ValueFromQTable().round(2)

## 3.3 Stochastic policy and deterministic environment

### 3.3.1 Policy 4

In [None]:
QTablePolicy4 = PolicyActionEvaluation(
    
)

In [None]:
QTablePolicy4.round(2)

In [None]:
ValueFromQTable().round(2)

## 3.4 Stochastic policy and stochastic environment

### 3.4.1 Policy 4, Opposite slippery

In [None]:
QTableSlipperyPolicy4 = PolicyActionEvaluation(
    
)

In [None]:
QTableSlipperyPolicy4.round(2)

In [None]:
ValueFromQTable().round(2)

# 4 Iterative policy improvement

## 4.1 Starting from Policy 2, by hand

In [None]:
Policy2_array

In [None]:
np.vectorize(ACTIONSYMBOLS.get)(Policy2_array)

Evaluation step

In [None]:
QTable2_0 = PolicyActionEvaluation()
ValuePolicy2_0 = ValueFromQTable()

In [None]:
QTable2_0.round(2)

In [None]:
ValuePolicy2_0.round(2)

Improvement step

In [None]:
Policy2_1_array = GreedyPolicy()

In [None]:
def Policy2_1(Position):
    return Policy2_1_array[Position]

In [None]:
Policy2_1_array

In [None]:
np.vectorize(ACTIONSYMBOLS.get)(Policy2_1_array)

Evaluation step

In [None]:
QTable2_1 = PolicyActionEvaluation()
ValuePolicy2_1 = ValueFromQTable()

In [None]:
QTable2_1

In [None]:
ValuePolicy2_1

Improvement step

In [None]:
Policy2_2_array = GreedyPolicy()

In [None]:
def Policy2_2(Position):
    return Policy2_2_array[Position]

In [None]:
Policy2_2_array

In [None]:
np.vectorize(ACTIONSYMBOLS.get)(Policy2_2_array)

Evaluation step

In [None]:
QTable2_2 = PolicyActionEvaluation()
ValuePolicy2_2 = ValueFromQTable()

In [None]:
QTable2_2

In [None]:
ValuePolicy2_2

## 4.2 Starting from Policy 2

In [None]:
# Run the evaluation/improvement loop starting from the deterministic Policy 2.
PolicyStar, ValueStar = PolicyIterativeEstimation(Policy2)

In [None]:
# Heatmap of the returned state values, annotated with the returned policy.
PlotPolicyValue(PolicyStar, ValueStar)

## 4.3 Starting from Policy 4

In [None]:
# Same loop starting from the stochastic Policy 4; PolicyStar and ValueStar
# from the previous section are overwritten.
PolicyStar, ValueStar = PolicyIterativeEstimation(Policy4, DeterministicPolicy=False)

In [None]:
# Heatmap of the returned state values, annotated with the returned policy.
PlotPolicyValue(PolicyStar, ValueStar)