# State Value Function

We want to calculate $V_{\pi}(s)$ (the state-value-function given a policy)
![mdp.png](mdp.png)

# Policy Evaluation by Dynamic Programming (page 16)

In [None]:
import numpy as np

# Row s of `policy` holds the probabilities of moving from state s to each of
# the three states (each row sums to 1).
policy = np.array([[0.3, 0.2, 0.5], [0.5, 0.4, 0.1], [0.8, 0.1, 0.1]])
# The placeholder must actually be filled in, otherwise a literal "{}" is printed.
print('This is the optimal policy: {}'.format(policy))
# Reward collected in each of the three states.
rewards = np.array([10., 2., 3.])

# V_0 = 0. A float array keeps the printed format of V_0 consistent with the
# later (float) iterates.
state_value_function = np.zeros(3)

# Iterative policy evaluation: V_{k+1} = R + 0.1 * P V_k (discount factor 0.1).
for i in range(20):
    print('V_{}={}'.format(i, state_value_function))
    state_value_function = rewards + 0.1 * np.matmul(policy, state_value_function)
print('\nV={}'.format(state_value_function))

# Policy Evaluation by Solving a Linear System (page 15)

The state-value function can also be computed directly, without iterating: the Bellman equation $V = R + \gamma P V$ is a linear system, so $V = (I - \gamma P)^{-1} R$ (as shown on page 15):


In [None]:
# Closed-form policy evaluation: solve (I - 0.1 * P) V = R for V.
# np.linalg.solve is used instead of multiplying by an explicit inverse;
# it is the numerically preferable way to solve a linear system.
solution = np.linalg.solve(np.eye(3) - 0.1 * policy, rewards)
print(solution)

The result stays the same.

# Monte Carlo Policy Evaluation (page 20)


Monte Carlo policy evaluation can be used as well: instead of solving the Bellman equation, it estimates the state-value function by averaging the discounted returns observed in sampled episodes.

In [None]:
import random
from collections import defaultdict
# Per-state accumulators for the Monte Carlo estimate:
# the sum of sampled discounted returns and the number of visits.
reward_counter = np.zeros(3)
visit_counter = np.zeros(3)

def gt(rewardlist, gamma=0.1):
    '''
    Return the total discounted reward of a reward sequence.

    The reward at position k is weighted by gamma**k and the weighted
    rewards are added up in order; an empty sequence yields 0.
    For example, gt([10, 2, 3], gamma=0.1) is 10 + 0.1*2 + 0.01*3,
    i.e. roughly 10.23 (up to floating-point rounding).
    '''
    discounted_total = 0
    step = 0
    for reward in rewardlist:
        discounted_total = discounted_total + gamma ** step * reward
        step += 1
    return discounted_total


# Every-visit Monte Carlo policy evaluation: sample 400 episodes of 250 steps
# each and average, per state, the discounted return observed after every
# visit to that state.
for episode in range(400):
    start_state = random.randint(0, 2)
    next_state = start_state
    visited_states = []  # state occupied at each time step
    rewardlist = []      # reward collected at each time step
    for step in range(250):
        visited_states.append(next_state)
        rewardlist.append(rewards[next_state])
        # The sampled "action" is used directly as the index of the next state.
        action = np.random.choice(np.arange(0, 3), p=policy[next_state])
        next_state = action

    # Walk the episode backwards so each return is obtained in O(1) from the
    # one after it: G_t = r_t + 0.1 * G_{t+1}. This replaces re-summing every
    # suffix of the reward list, which was quadratic in the episode length.
    # Returns close to the end of the episode are truncated at the horizon.
    discounted_return = 0.
    for state, reward in zip(reversed(visited_states), reversed(rewardlist)):
        discounted_return = reward + 0.1 * discounted_return
        reward_counter[state] += discounted_return
        visit_counter[state] += 1


print(reward_counter / visit_counter)

As can be seen, the result is nearly the same as the state-value function calculated above.

# Policy Optimization by Q-Learning (page 26)

This code solves a very easy problem: using the rewards, it calculates the optimal action-value function.

It samples a state-action pair randomly, so that all state-action pairs can be seen.

In [None]:
# Tabular Q-learning on the 3-state problem; q_table[state, action] holds the
# current action-value estimate.
q_table = np.zeros((3, 3))
for i in range(1001):
    # Draw the state-action pair uniformly at random so every pair gets updated.
    state = random.randint(0, 2)
    action = random.randint(0, 2)
    # The chosen action is used directly as the index of the next state.
    next_state = action
    reward = rewards[next_state]
    # Greedy value of the next state.
    next_q = q_table[next_state].max()
    # Learning rate 1, discount factor 0.1.
    q_table[state, action] += 1 * (reward + 0.1 * next_q - q_table[state, action])  #Q-Table update
    if not i % 100:
        print(q_table)

## Score Function Gradient Estimator (page 32)

In [None]:
import torch

import numpy as np
import random
import matplotlib.pyplot as plt
%matplotlib inline
class Model(torch.nn.Module):
    """Tabular softmax policy for 3 states and 3 actions.

    ``t_policy`` is a (3, 3) tensor of unnormalised action preferences
    (logits), one row per state. It is the only trainable tensor.
    """

    def __init__(self):
        super().__init__()
        # Identical logits in every row give a uniform initial policy.
        # Using nn.Parameter (instead of the deprecated autograd.Variable)
        # registers the tensor with the module, so it appears in
        # self.parameters() while remaining accessible as model.t_policy.
        self.t_policy = torch.nn.Parameter(
            torch.full((3, 3), 1 / 3, dtype=torch.float32)
        )

    def forward(self):
        """Return the (3, 3) tensor of log-probabilities, normalised per row."""
        # dim=1 normalises over the actions of each state. This is what the
        # deprecated implicit-dim call resolved to for a 2-D tensor, now made
        # explicit so no deprecation warning is raised.
        return torch.nn.functional.log_softmax(self.t_policy, dim=1)

def gt(rewardlist, gamma=0.1):
    '''
    Compute the total discounted return from time-step t.

    Each reward is multiplied by gamma raised to its position in
    rewardlist and the products are accumulated in order; an empty
    sequence gives 0. For example, gt([10, 2, 3], gamma=0.1) is
    10 + 0.1*2 + 0.01*3, i.e. roughly 10.23 (up to floating-point rounding).
    '''
    ret = 0
    weighted_rewards = (
        (gamma ** k) * reward for k, reward in enumerate(rewardlist)
    )
    for term in weighted_rewards:
        ret += term
    return ret

# Score-function (REINFORCE-style) policy optimisation: sample an episode with
# the current policy, weight the log-probabilities of the chosen actions by the
# episode's discounted return and take one SGD step.
valuelist = []  # undiscounted return of every sampled episode (for plotting)
rewards = np.array([10., 2., 3.]) / 10
model = Model()
optim = torch.optim.SGD([model.t_policy], lr=0.0001)
for i in range(10001):
    # Current action probabilities as a plain numpy array for sampling.
    # dim=1 normalises over the actions of each state (explicit instead of the
    # deprecated implicit dimension).
    poli = torch.nn.functional.softmax(model.t_policy, dim=1).detach().numpy()
    state_action_list = []
    start_state = random.randint(0, 2)
    next_state = start_state
    rewardlist = []

    for k in range(40):
        rewardlist.append(rewards[next_state])
        action = np.random.choice(np.arange(0, 3), p=poli[next_state])
        state_action_list.append((next_state, action))
        # The chosen action is used directly as the index of the next state.
        next_state = action

    # Discounted return of the whole episode; used as the weight of every step.
    rew = gt(rewardlist[:], 0.99)

    # Gradients must be cleared before each backward pass; otherwise every
    # step would use the sum of the gradients of all previous episodes.
    optim.zero_grad()
    # The log-probabilities do not change within one episode, so evaluate the
    # model once instead of once per step.
    log_probs = model()
    chosen = torch.stack(
        [log_probs[state, action] for state, action in state_action_list]
    )
    loss = (-chosen * float(rew)).mean()
    loss.backward()
    optim.step()
    value = gt(rewardlist, 1)
    valuelist.append(value)

    if i % 100 == 0:
        print(poli)
        print(rewardlist)
        plt.plot(valuelist)
        plt.show()

# Value Iteration



In [None]:
import numpy as np

# Reward collected in each of the three states.
rewards = np.array([10., 2., 3.])

# The value vector must be a float array: with an integer array every
# in-place assignment below would silently truncate the backed-up value.
state_value_function = np.zeros(3)
print('V_{} = {}'.format(0, state_value_function))
for i in range(1000):
    # Values of the previous sweep; all states are updated from this snapshot.
    s_v_f = state_value_function.copy()
    for s in range(3):
        # Bellman optimality backup with discount factor 0.1: any state can be
        # chosen as successor, so take the best one.
        state_value_function[s] = max(
            rewards[s] + 0.1 * s_v_f[s_prime] for s_prime in range(3)
        )
    if i % 100 == 99:
        print('V_{} = {}'.format(i + 1, state_value_function))