In [51]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
from ERS import ErgodicRiverswim

In [39]:
def fun_p(data, env, alpha):
    """Estimate transition probabilities P(s2 | s1, a) from logged transitions.

    Every entry is the smoothed relative frequency

        (N(s1, a, s2) + alpha) / (max(1, N(s1, a)) + alpha * l)

    where N(...) counts the matching rows of ``data``.

    Parameters
    ----------
    data : pandas.DataFrame
        Transition log with "state", "action" and "state+1" columns. The
        legacy column name "s+1" is used when "state+1" is absent.
    env : object
        Must expose ``l`` (states are enumerated as ``range(l)``) and
        ``action`` (actions are enumerated as ``range(action)``);
        ``num_action`` is accepted as a fallback for ``action``.
    alpha : float
        Additive smoothing constant.

    Returns
    -------
    numpy.ndarray
        Array of shape ``(l, n_actions, l)`` indexed as ``[s1, a, s2]``.
    """
    l = env.l
    # Every other function in this notebook reads `env.action`; prefer it and
    # only fall back to the name this function used previously.
    n_actions = env.action if hasattr(env, "action") else env.num_action
    # `simulation` writes the successor state under "state+1".
    next_col = "state+1" if "state+1" in data.columns else "s+1"

    matrix = np.zeros((l, n_actions, l))
    for s1 in range(l):
        for a in range(n_actions):
            visits = data.loc[(data["state"] == s1) & (data["action"] == a)]
            # max(1, .) avoids 0/0 for a never-visited pair when alpha == 0.
            # NOTE(review): for alpha > 0 this makes the row of an unvisited
            # pair sum to alpha*l / (1 + alpha*l) instead of 1 -- confirm
            # that this is the intended estimator.
            denominator = max(1, len(visits)) + alpha * l
            for s2 in range(l):
                # True transition count: an unobserved transition contributes
                # 0, not 1, so empirical rows sum to exactly 1 when alpha == 0.
                count = int((visits[next_col] == s2).sum())
                if denominator == 0:
                    matrix[s1, a, s2] = 0.0
                else:
                    matrix[s1, a, s2] = (count + alpha) / denominator
    return matrix

In [40]:
def fun_r(data, env, alpha):
    """Estimate the mean reward of every (state, action) pair from a log.

    Each entry equals

        (sum of logged rewards for (s, a) + alpha) / (max(1, N(s, a)) + alpha)

    where N(s, a) is the number of rows of ``data`` matching the pair.

    Parameters
    ----------
    data : pandas.DataFrame
        Transition log with "state", "action" and "reward" columns.
    env : object
        Must expose ``l`` and ``action``; states and actions are enumerated
        as ``range(env.l)`` and ``range(env.action)``.
    alpha : float
        Additive smoothing constant.

    Returns
    -------
    numpy.ndarray
        Array of shape ``(env.l, env.action)`` indexed as ``[s, a]``.
    """
    reward_table = np.zeros((env.l, env.action))
    for state, act in np.ndindex(env.l, env.action):
        is_match = (data["state"] == state) & (data["action"] == act)
        matching = data.loc[is_match]
        # A pair that never occurs is treated as one visit, so the
        # denominator is non-zero for alpha == 0.
        denominator = max(1, len(matching)) + alpha
        numerator = matching["reward"].sum() + alpha
        if denominator == 0:
            reward_table[state, act] = 0.0
        else:
            reward_table[state, act] = numerator / denominator
    return reward_table

In [41]:
def simulation(env, size=100):
    simulation = []
    for _ in range(size):
        s = env.state
        action = np.random.choice(env.action)
        s_1, reward = env.step(action)
        simulation.append([s, action, reward, s_1])
    return pd.DataFrame(simulation, columns=["state", "action", "reward", "state+1"])

In [44]:
def model_based_learning(data, env, alpha, *, step_size):
    """Compute model-based Q-value estimates on growing prefixes of the log.

    For every ``t`` in ``range(1, len(data), step_size)`` the first ``t``
    rows are turned into a transition estimate (``fun_p``) and a reward
    estimate (``fun_r``), which are passed to ``q_value_iteration``.
    ``q_value_iteration`` is not defined in the visible part of this
    notebook and must already be in scope when this function is called.

    Parameters
    ----------
    data : pandas.DataFrame
        Transition log as consumed by ``fun_p`` and ``fun_r``.
    env : object
        Must expose ``l`` and ``action``.
    alpha : float
        Smoothing constant forwarded to ``fun_p`` and ``fun_r``.
    step_size : int
        Spacing between consecutive prefix lengths (keyword-only).

    Returns
    -------
    numpy.ndarray
        The stacked results of ``q_value_iteration``, one per prefix.
    """
    n_states, n_actions = env.l, env.action

    def _solve_prefix(horizon):
        prefix = data[:horizon]
        transitions = fun_p(prefix, env, alpha)
        rewards = fun_r(prefix, env, alpha)
        return q_value_iteration(rewards, transitions, n_states, n_actions)

    print("Starting the Model-based learning algorithm...", end="\n")
    estimates = [_solve_prefix(t) for t in range(1, len(data), step_size)]
    print(
        f"Done computing Q-value estimates for {model_based_learning.__name__}")
    return np.array(estimates)

In [45]:
def q_learning(data, env, alpha, gamma=0.95, *, step_size):
    """Run tabular Q-learning over a transition log and snapshot the table.

    Every row of ``data`` triggers one update

        Q[s, a] += alpha(t, n) * (r + gamma * max_b Q[s', b] - Q[s, a])

    where ``n`` is the number of times ``(s, a)`` has been updated so far,
    including the current row. A copy of the table is stored after the
    update at ``t = 0, step_size, 2 * step_size, ...``, so the number of
    snapshots equals ``len(range(0, len(data), step_size))``.

    Parameters
    ----------
    data : pandas.DataFrame
        Transition log with "state", "action", "reward" and "state+1"
        columns; rows are consumed in positional order.
    env : object
        Must expose ``l`` and ``action``; the Q-table has shape
        ``(env.l, env.action)``.
    alpha : callable
        ``alpha(t, n)`` returns the step size for row index ``t`` and visit
        count ``n``.
    gamma : float, optional
        Weight of the bootstrapped next-state value (default 0.95).
    step_size : int
        Spacing between stored snapshots (keyword-only).

    Returns
    -------
    numpy.ndarray
        Stacked snapshots, shape ``(n_snapshots, env.l, env.action)``.
    """
    q_table = np.zeros((env.l, env.action))
    num_selected = np.zeros((env.l, env.action))

    # Positional numpy views: faster than per-row Series lookups and
    # independent of the DataFrame's index labels.
    states = data["state"].to_numpy()
    actions = data["action"].to_numpy()
    rewards = data["reward"].to_numpy()
    next_states = data["state+1"].to_numpy()

    q_values = []
    print("Starting the Q-value learning algorithm...", end="\n")
    # Learn from every transition; only the snapshots are subsampled.
    for t in range(len(data)):
        s, a, s_1 = states[t], actions[t], next_states[t]
        num_selected[s, a] += 1
        # Max over all actions of the next state, whatever env.action is.
        delta = rewards[t] + gamma * q_table[s_1].max() - q_table[s, a]
        q_table[s, a] += alpha(t, num_selected[s, a]) * delta
        if t % step_size == 0:
            q_values.append(q_table.copy())
    print(f"Done computing Q-value estimates for {q_learning.__name__}")
    return np.array(q_values)

In [47]:
def get_error(q_estimates, env):
    """Return the sup-norm distance of each Q estimate to the reference table.

    The reference is obtained once from ``env.q_value_iteration()``. The
    error of an estimate is the largest absolute entry of
    ``q_estimate - q_star``. ``np.linalg.norm(..., ord=np.inf)`` is not used
    because for a 2-D array it returns the maximum absolute *row sum*, which
    adds the errors of all actions of a state together.

    Parameters
    ----------
    q_estimates : iterable of numpy.ndarray
        Q-tables with the same shape as the reference table.
    env : object
        Must expose ``q_value_iteration()`` returning the reference table.

    Returns
    -------
    list
        One error value per estimate, in input order.
    """
    q_star = env.q_value_iteration()
    return [np.max(np.abs(q_estimate - q_star)) for q_estimate in q_estimates]

In [50]:
def _save_error_plot(x_values, curves, title, path, labels=None, log_scale=False):
    """Plot error curves against T, save the figure to ``path`` and show it."""
    for idx, errors in enumerate(curves):
        plt.plot(x_values, errors, label=labels[idx] if labels else None)
    if log_scale:
        plt.yscale("log")
    plt.xlabel(r"$T$")
    plt.ylabel(r"$\|Q^{*} - Q_{t}\|_{\infty}$")
    plt.title(title)
    if labels:
        plt.legend()
    plt.savefig(path)
    plt.show()


def main(seed=None):
    """Run the model-based and Q-learning experiments and plot their errors.

    A log of 10**6 random-policy transitions is simulated, Q-value estimates
    are computed every 1000 steps by ``model_based_learning`` and by
    ``q_learning`` under two step-size schedules, and three figures are
    written to ``./plots`` (created if missing) and shown.

    Parameters
    ----------
    seed : int, optional
        When given, seeds NumPy's global random generator before simulating
        so that a run can be reproduced. Default ``None`` leaves it untouched.
    """
    import os  # local import: only needed to create the output directory

    if seed is not None:
        np.random.seed(seed)

    # NOTE(review): the recorded run of this notebook ended with
    # "TypeError: ErgodicRiverswim() takes no arguments" -- confirm that the
    # constructor in the ERS module accepts this positional argument.
    env = ErgodicRiverswim(4)
    size = 10 ** 6
    step_size = 1000
    data = simulation(env, size)

    # run algorithm and compute errors for model based opo
    q_estimates_model = model_based_learning(data, env, 0, step_size=step_size)
    model_errors = get_error(q_estimates_model, env)
    # run algorithm and compute errors for ql for alpha and alpha prime
    q_estimates_ql_a = q_learning(
        data, env, lambda t, num: 2 / (t ** 0.66 + 1), step_size=step_size)
    q_estimates_ql_ap = q_learning(
        data, env, lambda t, num: 2 / (num ** 0.66 + 1), step_size=step_size)
    ql_errors_a = get_error(q_estimates_ql_a, env)
    ql_errors_ap = get_error(q_estimates_ql_ap, env)

    # plots
    os.makedirs("./plots", exist_ok=True)  # savefig does not create directories
    x_values = np.arange(1, size, step_size)
    model_title = r"Model-Based Method for OPO with $\alpha$ = 0"
    ql_title = "Q-Learning Method for OPO"
    ql_legend = [r"Q-Learning with $\alpha$", r"Q-Learning with $\alpha'$"]

    _save_error_plot(x_values, [model_errors], model_title,
                     "./plots/model_based.png")
    _save_error_plot(x_values, [ql_errors_a, ql_errors_ap], ql_title,
                     "./plots/q_learning.png", labels=ql_legend)
    # Plot the raw errors on a logarithmic axis. Taking log10 first and then
    # also using a log axis would drop every error below 1, because its
    # log10 is negative and cannot be drawn on a log scale.
    _save_error_plot(x_values, [ql_errors_a, ql_errors_ap], ql_title,
                     "./plots/log_q_learning.png", labels=ql_legend,
                     log_scale=True)


# Entry point: run the whole experiment when this module's __name__ is
# "__main__" (i.e. it is being executed rather than imported).
if __name__ == "__main__":
    main()


TypeError: ErgodicRiverswim() takes no arguments