In [1]:
# test state transition matrix.

import numpy as np
import math

# ------------------------------------------------------------------------------
# Function calPoisson: return a poisson distribution value given number n and
#                      lambda l.
# ------------------------------------------------------------------------------

def calPoisson(n, l):
    
    return (l ** n / math.factorial(n)) * math.exp(-l)

# ------------------------------------------------------------------------------
# init2rentRemains_matrix：
# rentRemains2return
# ------------------------------------------------------------------------------

carparkA_init2rentRemains_matrix = np.zeros(((21, 21, 3))) # column = initStates, row = rents.
carparkA_rentRemains2returnRemains_matrix = np.zeros(((21, 21, 2))) # column = rentRemains, row = returns.
carparkB_init2rentRemains_matrix = np.zeros(((21, 21, 3))) # column = initStates, row = rents.
carparkB_rentRemains2returnRemains_matrix = np.zeros(((21, 21, 2))) # column = rentRemains, row = returns.

# generate carparkA_init2rentRemains_matrix.
for init in range(21):
    for rents in range(21):
        
        # variables in init2rentRemains_matrix.
        rentRemains = init - rents
        if rentRemains <= 0:
            rentRemains = 0
        
        if rents == 20:
            sigma_prob = 0
            for i in range(rents):
                sigma_prob = sigma_prob + calPoisson(n=i, l=3)
            prob = 1 - sigma_prob
        else:
            prob = calPoisson(n=rents, l=3)
        
        if init >= rents:
            reward = rents * 10
        else:
            reward = init * 10
        
        carparkA_init2rentRemains_matrix[init, rents] = np.array([rentRemains, prob, reward])

# generate carparkA_rentRemains2returnRemains_matrix.
for rentRemains in range(21):
    for returns in range(21):
        
        # variables in rentRemains2returnRemains_matrix.
        returnRemains = rentRemains + returns
        if returnRemains >= 20:
            returnRemains = 20
            
        if returns == 20:
            sigma_prob = 0
            for i in range(returns):
                sigma_prob = sigma_prob + calPoisson(n=i, l=3)
            prob = 1 - sigma_prob
        else:
            prob = calPoisson(n=returns, l=3)
            
        carparkA_rentRemains2returnRemains_matrix[rentRemains, returns] = np.array([returnRemains, prob])
    
# generate carparkB_init2rentRemains_matrix.
for init in range(21):
    for rents in range(21):
        
        # variables in init2rentRemains_matrix.
        rentRemains = init - rents
        if rentRemains <= 0:
            rentRemains = 0
        
        if rents == 20:
            sigma_prob = 0
            for i in range(rents):
                sigma_prob = sigma_prob + calPoisson(n=i, l=4)
            prob = 1 - sigma_prob
        else:
            prob = calPoisson(n=rents, l=4)
        
        if init >= rents:
            reward = rents * 10
        else:
            reward = init * 10
        
        carparkB_init2rentRemains_matrix[init, rents] = np.array([rentRemains, prob, reward])

# generate carparkB_rentRemains2returnRemains_matrix.
for rentRemains in range(21):
    for returns in range(21):
        
        # variables in rentRemains2returnRemains_matrix.
        returnRemains = rentRemains + returns
        if returnRemains >= 20:
            returnRemains = 20
            
        if returns == 20:
            sigma_prob = 0
            for i in range(returns):
                sigma_prob = sigma_prob + calPoisson(n=i, l=2)
            prob = 1 - sigma_prob
        else:
            prob = calPoisson(n=returns, l=2)
            
        carparkB_rentRemains2returnRemains_matrix[rentRemains, returns] = np.array([returnRemains, prob])

In [2]:
# ----------------------------------------------------------------
# verify the correct of transition probability. 
# ----------------------------------------------------------------

sigma = 0

for i in range(21):
    
    sigma += carparkA_init2rentRemains_matrix[0][i][1]
    
print(sigma)
print(carparkB_init2rentRemains_matrix[0][20][1])


1.0
1.020052231570645e-08


In [3]:
carparkA_init2rentRemains_matrix[20]

array([[2.00000000e+01, 4.97870684e-02, 0.00000000e+00],
       [1.90000000e+01, 1.49361205e-01, 1.00000000e+01],
       [1.80000000e+01, 2.24041808e-01, 2.00000000e+01],
       [1.70000000e+01, 2.24041808e-01, 3.00000000e+01],
       [1.60000000e+01, 1.68031356e-01, 4.00000000e+01],
       [1.50000000e+01, 1.00818813e-01, 5.00000000e+01],
       [1.40000000e+01, 5.04094067e-02, 6.00000000e+01],
       [1.30000000e+01, 2.16040315e-02, 7.00000000e+01],
       [1.20000000e+01, 8.10151179e-03, 8.00000000e+01],
       [1.10000000e+01, 2.70050393e-03, 9.00000000e+01],
       [1.00000000e+01, 8.10151179e-04, 1.00000000e+02],
       [9.00000000e+00, 2.20950322e-04, 1.10000000e+02],
       [8.00000000e+00, 5.52375804e-05, 1.20000000e+02],
       [7.00000000e+00, 1.27471339e-05, 1.30000000e+02],
       [6.00000000e+00, 2.73152870e-06, 1.40000000e+02],
       [5.00000000e+00, 5.46305740e-07, 1.50000000e+02],
       [4.00000000e+00, 1.02432326e-07, 1.60000000e+02],
       [3.00000000e+00, 1.80762

In [10]:
# -----------------------------------------------
# test policy evaluation.
# -----------------------------------------------

import copy

state_value_matrix = np.zeros((21, 21))
gamma = 0.9
theta = 0.1

while True:
    
    state_value_matrix_record = copy.deepcopy(state_value_matrix)
    state_value_matrix = np.zeros((21, 21))
    
    for A_state in range(21):
        for B_state in range(21):
            
            # calculate state value.
            for A_rents in range(21):
                for A_returns in range(21):
                    
                    # To accelerate algorithm.
                    prob_A = carparkA_init2rentRemains_matrix[A_state][A_rents][1] *\
                             carparkA_rentRemains2returnRemains_matrix[int(carparkA_init2rentRemains_matrix[A_state][A_rents][0])][A_returns][1]
                    
                    for B_rents in range(21):
                        for B_returns in range(21):
                            
                            prob_B = carparkB_init2rentRemains_matrix[B_state][B_rents][1] *\
                            carparkB_rentRemains2returnRemains_matrix[int(carparkB_init2rentRemains_matrix[B_state][B_rents][0])][B_returns][1]
                            
                            prob = prob_A * prob_B
                            
                            reward = carparkA_init2rentRemains_matrix[A_state][A_rents][2] + carparkB_init2rentRemains_matrix[B_state][B_rents][2]
                            
                            next_sub_value = state_value_matrix_record\
                            [int(carparkA_rentRemains2returnRemains_matrix[int(carparkA_init2rentRemains_matrix[A_state][A_rents][0])][A_returns][0])]\
                            [int(carparkB_rentRemains2returnRemains_matrix[int(carparkB_init2rentRemains_matrix[B_state][B_rents][0])][B_returns][0])]
                            
                            sub_value = prob * (reward + gamma * next_sub_value)
                            
                            state_value_matrix[A_state][B_state] += sub_value
    
    value_error = abs(state_value_matrix - state_value_matrix_record)
    
    delta = np.amax(value_error)

    print('delta: ', delta)
    
    if delta < theta:

        
        break
    
    else:
        
        print('state value:', state_value_matrix[0][0])

delta:  69.999999976433
state value: 0.0
delta:  62.99964904783437
state value: 34.04921932650149
delta:  56.675705553743
state value: 68.21991744997968
delta:  50.79282387292241
state value: 100.2515133212078


KeyboardInterrupt: 

In [11]:
carparkA_init2rentRemains_matrix[0][0][1]

0.049787068367863944

In [12]:
state_value_matrix

array([[129.71121004, 139.68246735, 149.53134399, 159.11107751,
        168.27550117, 176.9207554 , 184.98818   , 192.44457805,
        199.26470389, 205.42569304, 210.91009576, 215.7113227 ,
        219.83778383, 223.3145966 , 226.18295657, 228.49760961,
        230.32296117, 231.72841909, 232.78360542, 233.55412593,
        234.09867241],
       [139.49468587, 149.46594318, 159.31481981, 168.89455334,
        178.058977  , 186.70423122, 194.77165583, 202.22805388,
        209.04817972, 215.20916887, 220.69357159, 225.49479853,
        229.62125966, 233.09807243, 235.96643239, 238.28108544,
        240.106437  , 241.51189491, 242.56708125, 243.33760175,
        243.88214823],
       [148.57181824, 158.54307555, 168.39195218, 177.9716857 ,
        187.13610937, 195.78136359, 203.8487882 , 211.30518625,
        218.12531209, 224.28630124, 229.77070396, 234.5719309 ,
        238.69839203, 242.1752048 , 245.04356476, 247.35821781,
        249.18356937, 250.58902728, 251.64421362, 252.4147

In [13]:
# -----------------------------------------------
# test policy improvement.
# -----------------------------------------------

actions = np.array([5, 4, 3, 2, 1, 0, -1, -2, -3, -4, -5])
policy_matrix = np.zeros((21, 21))

for A_state in range(1):
    A_state = 11
    for B_state in range(21):
            
            # initialize optimal action value.
            optimal_action_value = 0.0
            optimal_action = 0
            
            for act in actions:
                
                action_value = 0.0
                
                # calculate action value.
                for A_rents in range(21):
                    for A_returns in range(21):
                    
                        # To accelerate algorithm.
                        prob_A = carparkA_init2rentRemains_matrix[A_state][A_rents][1] *\
                        carparkA_rentRemains2returnRemains_matrix[int(carparkA_init2rentRemains_matrix[A_state][A_rents][0])][A_returns][1]
                    
                        for B_rents in range(21):
                            for B_returns in range(21):
                            
                                prob_B = carparkB_init2rentRemains_matrix[B_state][B_rents][1] *\
                                carparkB_rentRemains2returnRemains_matrix[int(carparkB_init2rentRemains_matrix[B_state][B_rents][0])][B_returns][1]
                            
                                prob = prob_A * prob_B
                            
                                # the next state when the agent takes action.
                                next_A = int(carparkA_rentRemains2returnRemains_matrix[int(carparkA_init2rentRemains_matrix[A_state][A_rents][0])][A_returns][0] + act)
                                next_B = int(carparkB_rentRemains2returnRemains_matrix[int(carparkB_init2rentRemains_matrix[B_state][B_rents][0])][B_returns][0] - act)
                                
                                if next_A >= 20:
                                    next_A = 20
                                elif next_A < 0:
                                    break
                                    
                                if next_B >= 20:
                                    next_B = 20
                                elif next_B < 0:
                                    break

                                # the reward when the agent takes action.
                                reward = carparkA_init2rentRemains_matrix[A_state][A_rents][2] + carparkB_init2rentRemains_matrix[B_state][B_rents][2] - 2 * abs(act)
                                
                                next_sub_value = state_value_matrix[next_A][next_B]
                            
                                sub_value = prob * (reward + gamma * next_sub_value)
                                
                                action_value += sub_value 
                
                # renew the optimal action.
                if action_value > optimal_action_value:
                    
                    optimal_action_value = action_value
                    optimal_action = act
                print('value of action ' + str(act), action_value)
                    
            print('optimal action value of state ' + str([A_state, B_state]), optimal_action_value)
            print('optimal action of state ' + str([A_state, B_state]), optimal_action)
            
            policy_matrix[A_state][B_state] = optimal_action
                            
                                

value of action 5 0.0
value of action 4 0.0
value of action 3 0.0
value of action 2 0.0
value of action 1 0.0
value of action 0 195.4092997022048
value of action -1 208.1808631472294
value of action -2 216.69345953272716
value of action -3 222.22332579770395
value of action -4 225.51765578521022
value of action -5 226.67810671045396
optimal action value of state [11, 0] 226.67810671045396
optimal action of state [11, 0] -5
value of action 5 0.0
value of action 4 0.0
value of action 3 0.0
value of action 2 0.0
value of action 1 3.3067584298376254
value of action 0 205.37029136440566
value of action -1 218.14187330073324
value of action -2 226.64847322635345
value of action -3 232.1670082083097
value of action -4 235.4413776830209
value of action -5 236.5607920866947
optimal action value of state [11, 1] 236.5607920866947
optimal action of state [11, 1] -5
value of action 5 0.0
value of action 4 0.0
value of action 3 0.0
value of action 2 2.8945190160309915
value of action 1 17.398834777

KeyboardInterrupt: 

In [53]:
policy_matrix

array([[-5., -5., -5., -5., -5., -5., -5., -5., -5.,  0.,  0.,  0.,  0.,
         0.,  0.,  0.,  0.,  0.,  0.,  1.,  1.],
       [-5., -5., -5., -5., -5., -5., -5., -5., -5.,  0.,  0.,  0.,  0.,
         0.,  0.,  0.,  0.,  0.,  0.,  1.,  1.],
       [-5., -5., -5., -5., -5., -5., -5., -5.,  0.,  0.,  0.,  0.,  0.,
         0.,  0.,  0.,  0.,  0.,  0.,  0.,  1.],
       [-5., -5., -5., -5., -5., -5., -5.,  0.,  0.,  0.,  0.,  0.,  0.,
         0.,  0.,  0.,  0.,  0.,  0.,  0.,  0.],
       [-5., -5., -5., -5., -5., -5.,  0.,  0.,  0.,  0.,  0.,  0.,  0.,
         0.,  0.,  0.,  0.,  0.,  0.,  0.,  0.],
       [-5., -5., -5., -5., -5., -2., -1., -1.,  0.,  0.,  0.,  0.,  0.,
         0.,  0.,  0.,  0.,  0.,  0.,  0.,  0.],
       [-3., -3., -3., -3., -3., -2., -2., -1., -1., -1.,  0.,  0.,  0.,
         0.,  0.,  0.,  0.,  0.,  0.,  0.,  0.],
       [-3., -3., -3., -3., -3., -3., -2., -2., -2., -1., -1., -1., -1.,
         0.,  0.,  0.,  0.,  0.,  0.,  0.,  0.],
       [-4., -4., -4., -