In [8]:
import os
os.chdir("..")
from rl.monte_carlo import mc_prediction
from operator import itemgetter
from itertools import groupby
from typing import Sequence, Tuple, Mapping

S = str
DataType = Sequence[Sequence[Tuple[S, float]]]
ProbFunc = Mapping[S, Mapping[S, float]]
RewardFunc = Mapping[S, float]
ValueFunc = Mapping[S, float]
from collections import defaultdict
import numpy as np

In [38]:
def get_state_return_samples(
    data: DataType
) -> Sequence[Tuple[S, float]]:
    """
    prepare sequence of (state, return) pairs.
    Note: (state, return) pairs is not same as (state, reward) pairs.
    """
    return [(s, sum(r for (_, r) in l[i:]))
            for l in data for i, (s, _) in enumerate(l)]


def get_mc_value_function(
    state_return_samples: Sequence[Tuple[S, float]]
) -> ValueFunc:
    """
    Implement tabular MC Value Function compatible with the interface defined above.
    """
    n: Dict[S, int] = defaultdict(lambda:0)
    s: Dict[S, float] = defaultdict(lambda:0)

    for i, step in enumerate(state_return_samples):
        n[step[0]]= n[step[0]]+1
        s[step[0]]= s[step[0]]+step[1]
    d1={s_: s[s_] / n[s_] for s_ in n.keys()}
    return d1


def get_state_reward_next_state_samples(
    data: DataType
) -> Sequence[Tuple[S, float, S]]:
    """
    prepare sequence of (state, reward, next_state) triples.
    """
    return [(s, r, l[i+1][0] if i < len(l) - 1 else 'T')
            for l in data for i, (s, r) in enumerate(l)]

def get_mrp_value_function(
    prob_func: ProbFunc,
    reward_func: RewardFunc
) -> ValueFunc:
    """
    Implement code that calculates the MRP Value Function from the probability
    transitions and reward function, compatible with the interface defined above.
    Hint: Use the MRP Bellman Equation and simple linear algebra
    """
    s = list(reward_func.keys())
    nt_states = list(prob_func.keys())
    gamma = 0.5 
    
    P = np.zeros((len(s), len(s)))
    R = np.zeros((len(s), 1))

    for i, s_ in enumerate(nt_states):
        for j, s_nxt in enumerate(s):
            if s_nxt in prob_func[s_].keys():
                P[i, j] = prob_func[s_][s_nxt]

    for j, s_nxt in enumerate(s):
        R[j] = reward_func[s_]
 
    val_function=np.matmul(np.linalg.inv((np.identity(len(s)) - gamma * P)), R)
    return val_function


def get_td_value_function(
    srs_samples: Sequence[Tuple[S, float, S]],
    num_updates: int = 300000,
    learning_rate: float = 0.3,
    learning_rate_decay: int = 30
) -> ValueFunc:
    """
    Implement tabular TD(0) (with experience replay) Value Function compatible
    with the interface defined above. Let the step size (alpha) be:
    learning_rate * (updates / learning_rate_decay + 1) ** -0.5
    so that Robbins-Monro condition is satisfied for the sequence of step sizes.
    """
    gamma = 0.5
    val_function = defaultdict(lambda: 0)
    for i in range(num_updates):
        nxt_sample = srs_samples[np.random.randint(len(srs_samples))]
        alpha = learning_rate * (i / learning_rate_decay + 1) ** (-1/2)
        r=nxt_sample[1]
        val_function[nxt_sample[0]] += alpha * (r + \
        gamma * val_function[nxt_sample[2]] - val_function[nxt_sample[0]]
        )

    return val_function


def get_probability_and_reward_functions(
    srs_samples: Sequence[Tuple[S, float, S]]
) -> Tuple[ProbFunc, RewardFunc]:
    """
    Implement code that produces the probability transitions and the
    reward function compatible with the interface defined above.
    """
    d = {s: [(r, s1) for _, r, s1 in l] for s, l in
         groupby(sorted(srs_samples, key=itemgetter(0)), itemgetter(0))}

    prob_func = {s: {s1: len(list(l1)) / len(l) for s1, l1 in
                     groupby(sorted(l, key=itemgetter(1)), itemgetter(1))
                     if s1 != 'T'} for s, l in d.items()}
    reward_func = {s: np.mean([r for r, _ in l]) for s, l in d.items()}

    return prob_func, reward_func

def get_lstd_value_function(
    srs_samples: Sequence[Tuple[S, float, S]]
) -> ValueFunc:
    """
    Implement LSTD Value Function compatible with the interface defined above.
    Hint: Tabular is a special case of linear function approx where each feature
    is an indicator variables for a corresponding state and each parameter is
    the value function for the corresponding state.
    """
    num_N = len(nt_states)
    feature = np.identity(num_N)
    # initialization
    A = np.zeros((num_N, num_N))
    b = np.zeros(num_N)
    for s, r, s1 in srs_samples:
        p1 = feature[nt_states.index(s)]
        if s1 != 'T': 
            p2 = feature[nt_states.index(s1)] 
        else :
            np.zeros(num_N)
        A += p1@(p1 - p2)
        b += p1 @ r
    value_function={nt_states[i]: v for i, v in enumerate(np.linalg.inv(A).dot(b))}
    return value_function 


if __name__ == '__main__':
    given_data: DataType = [
        [('A', 2.), ('A', 6.), ('B', 1.), ('B', 2.)],
        [('A', 3.), ('B', 2.), ('A', 4.), ('B', 2.), ('B', 0.)],
        [('B', 3.), ('B', 6.), ('A', 1.), ('B', 1.)],
        [('A', 0.), ('B', 2.), ('A', 4.), ('B', 4.), ('B', 2.), ('B', 3.)],
        [('B', 8.), ('B', 2.)]
    ]

    sr_samps = get_state_return_samples(given_data)

    print("------------- MONTE CARLO VALUE FUNCTION --------------")
    print(get_mc_value_function(sr_samps))

    srs_samps = get_state_reward_next_state_samples(given_data)

    pfunc, rfunc = get_probability_and_reward_functions(srs_samps)

    print("-------------- MRP VALUE FUNCTION ----------")
    print(get_mrp_value_function(pfunc, rfunc))

    print("------------- TD VALUE FUNCTION --------------")
    print(get_td_value_function(srs_samps))


------------- MONTE CARLO VALUE FUNCTION --------------
{'A': 9.571428571428571, 'B': 5.642857142857143}
-------------- MRP VALUE FUNCTION ----------
[[4.82089552]
 [4.1119403 ]]
------------- TD VALUE FUNCTION --------------
defaultdict(<function get_td_value_function.<locals>.<lambda> at 0x00000279ADDE6B80>, {'B': 4.301795248155684, 'T': 0, 'A': 5.001503479910988})
