In [3]:
import numpy as np
# Seed NumPy's legacy global generator so the random draws made through
# np.random in this notebook are repeatable.
np.random.seed(0)

# Transition matrix of a 6-state Markov reward process. Row i holds the
# probabilities of leaving state i+1; the last state only loops onto itself.
P = np.array([
    [0.9, 0.1, 0.0, 0.0, 0.0, 0.0],
    [0.5, 0.0, 0.5, 0.0, 0.0, 0.0],
    [0.0, 0.0, 0.0, 0.6, 0.0, 0.4],
    [0.0, 0.0, 0.0, 0.0, 0.3, 0.7],
    [0.0, 0.2, 0.3, 0.5, 0.0, 0.0],
    [0.0, 0.0, 0.0, 0.0, 0.0, 1.0],
])

# Reward attached to each state, listed in the same order as the rows of P.
rewards = [-1, -2, -2, 10, 1, 0]
# Discount factor applied to future rewards.
gamma = 0.5

def calculate_return(start_index, chain, gamma, reward_table=None):
    """Compute the discounted return of ``chain`` measured from ``start_index``.

    The reward collected ``k`` steps after ``start_index`` is weighted by
    ``gamma ** k``, so the step at ``start_index`` itself is undiscounted.

    Args:
        start_index: Position in ``chain`` from which the return is measured.
        chain: Sequence of 1-based state numbers visited in order.
        gamma: Discount factor.
        reward_table: Per-state rewards, indexed by ``state - 1``. Defaults to
            the module-level ``rewards`` list.

    Returns:
        The discounted sum of rewards from ``start_index`` to the end of
        ``chain``; ``0`` when ``start_index`` is at or past the end.
    """
    if reward_table is None:
        reward_table = rewards
    out = 0
    # Walk the chain backwards: G_t = r_t + gamma * G_{t+1}. This keeps the
    # discount relative to start_index instead of the absolute position.
    for i in reversed(range(start_index, len(chain))):
        out = reward_table[chain[i] - 1] + gamma * out
    return out

# Worked example: return of the state sequence 1 -> 2 -> 3 -> 6,
# measured from its first step.
chain = [1, 2, 3, 6]
print(calculate_return(start_index=0, chain=chain, gamma=gamma))

-2.5


In [4]:
def calculate_state_value(P, rewards, gamma, state_num, num_iterations=10):
    """Compute the state values of a Markov reward process in two ways.

    Args:
        P: ``state_num x state_num`` transition matrix (array or nested list).
        rewards: One reward per state.
        gamma: Discount factor.
        state_num: Number of states.
        num_iterations: Number of Bellman sweeps for the iterative estimate.

    Returns:
        A tuple ``(state_value1, state_value2)`` of ``(state_num, 1)`` arrays:
        the solution of ``(I - gamma * P) V = R`` and the estimate reached
        after ``num_iterations`` sweeps starting from all zeros.

    Raises:
        numpy.linalg.LinAlgError: If ``I - gamma * P`` is singular.
    """
    P = np.asarray(P, dtype=float)
    rewards = np.asarray(rewards, dtype=float).reshape(-1, 1)

    # Closed form: solve the linear system directly rather than forming the
    # inverse explicitly, which is cheaper and numerically better behaved.
    state_value1 = np.linalg.solve(np.eye(state_num) - gamma * P, rewards)

    # Iterative form: repeatedly apply V <- R + gamma * P V.
    state_value2 = np.zeros((state_num, 1))
    for _ in range(num_iterations):
        state_value2 = rewards + gamma * P.dot(state_value2)
    return state_value1, state_value2

# Show both estimates (direct solve, then iterative) for the 6-state MRP.
print(calculate_state_value(P, rewards, gamma, state_num=len(rewards)))

(array([[-2.01950168],
       [-2.21451846],
       [ 1.16142785],
       [10.53809283],
       [ 3.58728554],
       [ 0.        ]]), array([[-2.01855393],
       [-2.21401787],
       [ 1.16144318],
       [10.53812503],
       [ 3.58740383],
       [ 0.        ]]))


In [5]:
# State set; sample() treats "s5" as the state that ends an episode.
S = ["s1", "s2", "s3", "s4", "s5"]
# Action set: "保持s1" = stay in s1, "前往sX" = go to sX,
# "概率前往" = move to a randomly chosen state.
A = ["保持s1", "前往s1", "前往s2", "前往s3", "前往s4", "前往s5", "概率前往"]

# Transition probabilities, keyed "<state>-<action>-<next state>".
# NOTE: this rebinds the name P, which held the MRP transition matrix above.
P = {
    # deterministic moves
    "s1-保持s1-s1": 1.0,
    "s1-前往s2-s2": 1.0,
    "s2-前往s1-s1": 1.0,
    "s2-前往s3-s3": 1.0,
    "s3-前往s4-s4": 1.0,
    "s3-前往s5-s5": 1.0,
    "s4-前往s5-s5": 1.0,
    # the only stochastic action
    "s4-概率前往-s2": 0.2,
    "s4-概率前往-s3": 0.4,
    "s4-概率前往-s4": 0.4,
}

# Rewards, keyed "<state>-<action>".
R = {
    "s1-保持s1": -1,
    "s1-前往s2": 0,
    "s2-前往s1": -1,
    "s2-前往s3": -2,
    "s3-前往s4": -2,
    "s3-前往s5": 0,
    "s4-前往s5": 10,
    "s4-概率前往": 1,
}

# Discount factor, and the MDP bundled as (states, actions, P, R, gamma).
gamma = 0.5
MDP = (S, A, P, R, gamma)

# Policy 1: probability 0.5 for every (state, action) pair that has a reward,
# i.e. uniform over the two actions available in each non-terminal state.
Pi_1 = dict.fromkeys(R, 0.5)

# Policy 2: hand-picked action probabilities, keyed "<state>-<action>".
Pi_2 = {
    "s1-保持s1": 0.6,
    "s1-前往s2": 0.4,
    "s2-前往s1": 0.3,
    "s2-前往s3": 0.7,
    "s3-前往s4": 0.5,
    "s3-前往s5": 0.5,
    "s4-前往s5": 0.1,
    "s4-概率前往": 0.9,
}

def join(str1, str2):
    """Return ``str1`` and ``str2`` glued together with a single hyphen."""
    return '-'.join((str1, str2))

In [6]:
# Markov reward process obtained from the MDP under policy Pi_1: each row
# averages the per-action transition probabilities with the policy weights.
P_from_mdp_to_mrp = np.array([
    [0.5, 0.5, 0.0, 0.0, 0.0],
    [0.5, 0.0, 0.5, 0.0, 0.0],
    [0.0, 0.0, 0.0, 0.5, 0.5],
    [0.0, 0.1, 0.2, 0.2, 0.5],
    [0.0, 0.0, 0.0, 0.0, 1.0],
])
# Policy-weighted reward of each state, in the same order as the rows above.
R_from_mdp_to_mrp = [-0.5, -1.5, -1.0, 5.5, 0]

# State values of the MDP under Pi_1, computed on the equivalent MRP.
print(calculate_state_value(
    P_from_mdp_to_mrp,
    R_from_mdp_to_mrp,
    gamma,
    state_num=len(R_from_mdp_to_mrp),
))

(array([[-1.22555411],
       [-1.67666232],
       [ 0.51890482],
       [ 6.0756193 ],
       [ 0.        ]]), array([[-1.22558266],
       [-1.6766816 ],
       [ 0.51890139],
       [ 6.07561448],
       [ 0.        ]]))


In [11]:
def _draw_by_cumulative_prob(options, prob_of):
    """Pick one of ``options`` by walking their cumulative probability.

    Consumes exactly one ``np.random.rand()`` draw. Returns the first option
    whose cumulative probability exceeds the draw. If rounding keeps the total
    just below the draw, the last option with positive probability is
    returned; if no option has positive probability, returns ``None``.
    """
    threshold = np.random.rand()
    cumulative = 0
    last_possible = None
    for option in options:
        prob = prob_of(option)
        if prob > 0:
            last_possible = option
        cumulative += prob
        if cumulative > threshold:
            return option
    return last_possible


def sample(MDP, Pi, episode_maxlength, number, terminal_state='s5'):
    """Sample episodes from an MDP by following policy ``Pi``.

    Args:
        MDP: Tuple ``(S, A, P, R, gamma)``. ``P`` is keyed
            ``"<state>-<action>-<next state>"`` and ``R`` is keyed
            ``"<state>-<action>"``; ``gamma`` is not used here.
        Pi: Policy mapping ``"<state>-<action>"`` to a probability.
        episode_maxlength: Maximum number of steps recorded per episode.
        number: Number of episodes to sample.
        terminal_state: State that ends an episode; episodes never start in it.

    Returns:
        A list of ``number`` episodes, each a list of
        ``(state, action, reward, next_state)`` tuples.

    Raises:
        ValueError: If there is no non-terminal state to start from, if ``Pi``
            gives a visited state no action with positive probability, or if
            ``P`` gives a chosen state-action pair no successor.
    """
    S, A, P, R, _gamma = MDP
    start_states = [state for state in S if state != terminal_state]
    if not start_states:
        raise ValueError("S contains no non-terminal state to start from")

    episodes = []
    for _ in range(number):
        episode = []
        # Start uniformly at random among the non-terminal states.
        s = start_states[np.random.randint(len(start_states))]
        # Strict '<' so an episode holds at most episode_maxlength steps.
        while s != terminal_state and len(episode) < episode_maxlength:
            a = _draw_by_cumulative_prob(
                A, lambda action: Pi.get(join(s, action), 0))
            if a is None:
                raise ValueError(f"policy defines no action for state {s!r}")
            r = R.get(join(s, a))
            s_next = _draw_by_cumulative_prob(
                S, lambda state: P.get(join(join(s, a), state), 0))
            if s_next is None:
                raise ValueError(
                    f"no transition defined for state {s!r} and action {a!r}")
            episode.append((s, a, r, s_next))
            s = s_next
        episodes.append(episode)
    return episodes

# Draw five episodes of at most 20 steps under Pi_1 and print each one,
# followed by blank lines as a separator.
episodes = sample(MDP, Pi_1, 20, 5)
for episode in episodes:
    print(episode)
    print('\n')

[('s4', '概率前往', 1, 's3'), ('s3', '前往s5', 0, 's5')]


[('s4', '概率前往', 1, 's4'), ('s4', '概率前往', 1, 's3'), ('s3', '前往s4', -2, 's4'), ('s4', '前往s5', 10, 's5')]


[('s2', '前往s1', -1, 's1'), ('s1', '保持s1', -1, 's1'), ('s1', '保持s1', -1, 's1'), ('s1', '前往s2', 0, 's2'), ('s2', '前往s1', -1, 's1'), ('s1', '保持s1', -1, 's1'), ('s1', '保持s1', -1, 's1'), ('s1', '前往s2', 0, 's2'), ('s2', '前往s1', -1, 's1'), ('s1', '保持s1', -1, 's1'), ('s1', '前往s2', 0, 's2'), ('s2', '前往s1', -1, 's1'), ('s1', '前往s2', 0, 's2'), ('s2', '前往s3', -2, 's3'), ('s3', '前往s5', 0, 's5')]


[('s1', '保持s1', -1, 's1'), ('s1', '前往s2', 0, 's2'), ('s2', '前往s3', -2, 's3'), ('s3', '前往s5', 0, 's5')]


[('s3', '前往s4', -2, 's4'), ('s4', '前往s5', 10, 's5')]




In [None]:
def MCBasics():