In [1]:
import gym
import numpy as np
import random
from timeit import default_timer as timer
from datetime import timedelta
import matplotlib.pylab as plt
import pandas as pd
import seaborn as sns

from hiive.mdptoolbox.mdp import ValueIteration, PolicyIteration, QLearning
from hiive.mdptoolbox.example import forest

np.random.seed(2023)

In [2]:
def get_avg_reward(P, R, policy, test_count=100, gamma=0.9):
    num_state = P.shape[-1]
    total_episode = num_state * test_count
    total_reward = 0
    for state in range(num_state):
        state_reward = 0
        for state_episode in range(test_count):
            episode_reward = 0
            disc_rate = 1
            while True:
                action = policy[state]
                probs = P[action][state]
                candidates = list(range(len(P[action][state])))
                next_state = np.random.choice(candidates, 1, p=probs)[0]
                reward = R[state][action] * disc_rate
                episode_reward += reward
                disc_rate *= gamma
                if next_state == 0:
                    break
            state_reward += episode_reward
        total_reward += state_reward
    return total_reward / total_episode


def run_vi(P, R, gamma=None, epsilon=None):
    if epsilon is None:
        epsilon = [1e-9]
    if gamma is None:
        gamma = [0.9]
        
    res = pd.DataFrame(columns=["Epsilon", "Gamma", "Policy", "Iteration", 
                                  "Time", "Reward", "Value Function"])
    i = 0
    for eps in epsilon:
        for ga in gamma:
            vi = ValueIteration(P, R, gamma=ga, epsilon=eps, max_iter=1e6)
            vi.run()
            reward = get_avg_reward(P, R, vi.policy)
            info = [eps, ga, vi.policy, vi.iter, vi.time, reward, vi.V]
            res.loc[i, :] = info
            i += 1
            print(f'Finished: eps={eps}, gamma={ga}')
    return res


def run_pi(P, R, gamma=None, epsilon=None):
    if epsilon is None:
        epsilon = [1e-9]
    if gamma is None:
        gamma = [0.9]
    
    res = pd.DataFrame(columns=["Epsilon", "Gamma", "Policy", "Iteration", 
                                  "Time", "Reward", "Value Function"])
    i = 0
    for eps in epsilon:
        for ga in gamma:
            pi = PolicyIteration(P, R, gamma=ga, max_iter=1e6)
            pi.run()
            pi_pol = pi.policy
            reward = get_avg_reward(P, R, pi_pol)
            info = [eps, ga, pi.policy, pi.iter, pi.time, reward, pi.V]
            res.loc[i, :] = info
            i += 1
            print(f'Finished: eps={eps}, gamma={ga}')
    return res


def run_qlearning(P, R, discount=0.9, alpha_dec=None, alpha_min=None, 
            epsilon=None, epsilon_decay=None, n_iter=1e6):
    
    if alpha_dec is None:
        alpha_dec=[0.99]
    if alpha_min is None:
        alpha_min=[0.001]
    if epsilon is None:
        epsilon = [1.0]
    if epsilon_decay is None:
        epsilon_decay=[0.99]
        
    res = pd.DataFrame(columns=["Iterations", "Alpha Decay", "Alpha Min", 
                                 "Epsilon", "Epsilon Decay", "Reward",
                                 "Time", "Policy", "Value Function",
                                 "Training Rewards"])
    
    count = 0
    for eps in epsilon:
        for eps_dec in epsilon_decay:
            for a_dec in alpha_dec:
                for a_min in alpha_min:
                    q = QLearning(P, R, discount, alpha_decay=a_dec, 
                                  alpha_min=a_min, epsilon=eps, 
                                  epsilon_decay=eps_dec, n_iter=n_iter)
                    q.run()
                    reward = get_avg_reward(P, R, q.policy)
                    print("{}: {}".format(count, reward))
                    rews = [s['Reward'] for s in q.run_stats]
                    info = [count, a_dec, a_min, eps, eps_dec, reward, 
                            q.time, q.policy, q.V, rews]

                    res.loc[count, :] = info
                    count += 1
    return res

In [3]:
P, R = forest(S=500, r1=100, r2= 15, p=0.01)

# Value Iteration

In [30]:
vi_df = run_vi(P, R, epsilon=[1e-3, 1e-6, 1e-9, 1e-12], gamma=[0.9, 0.99])
vi_df

Finished: eps=0.001, gamma=0.9
Finished: eps=0.001, gamma=0.99
Finished: eps=1e-06, gamma=0.9
Finished: eps=1e-06, gamma=0.99
Finished: eps=1e-09, gamma=0.9
Finished: eps=1e-09, gamma=0.99
Finished: eps=1e-12, gamma=0.9
Finished: eps=1e-12, gamma=0.99


Unnamed: 0,Epsilon,Gamma,Policy,Iteration,Time,Reward,Value Function
0,0.001,0.9,"(0, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, ...",119,0.023896,2.767571,"(4.7117745667154995, 5.240595870281114, 5.2405..."
1,0.001,0.99,"(0, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, ...",802,0.09748,2.327321,"(49.481788993702146, 49.98681394207038, 49.986..."
2,1e-06,0.9,"(0, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, ...",179,0.021702,2.740544,"(4.711792669916437, 5.240613400253226, 5.24061..."
3,1e-06,0.99,"(0, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, ...",1146,0.147842,2.318291,"(49.49700500097145, 50.00202999965886, 50.0020..."
4,0.0,0.9,"(0, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, ...",239,0.03037,2.667377,"(4.711792702216012, 5.240613431989174, 5.24061..."
5,0.0,0.99,"(0, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, ...",1489,0.215634,2.290279,"(49.49748436514349, 50.00250936388094, 50.0025..."
6,0.0,0.9,"(0, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, ...",299,0.036595,2.757743,"(4.711792702273827, 5.240613432046434, 5.24061..."
7,0.0,0.99,"(0, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, ...",1833,0.224226,2.343564,"(49.497499629556565, 50.00252462829403, 50.002..."


In [40]:
vi_df[['Epsilon', 'Gamma', 'Iteration', 'Time', 'Reward']]

Unnamed: 0,Epsilon,Gamma,Iteration,Time,Reward
0,0.001,0.9,119,0.023896,2.767571
1,0.001,0.99,802,0.09748,2.327321
2,1e-06,0.9,179,0.021702,2.740544
3,1e-06,0.99,1146,0.147842,2.318291
4,0.0,0.9,239,0.03037,2.667377
5,0.0,0.99,1489,0.215634,2.290279
6,0.0,0.9,299,0.036595,2.757743
7,0.0,0.99,1833,0.224226,2.343564


In [42]:
vi_df[['Epsilon', 'Iteration', 'Time', 'Reward']].groupby('Epsilon').mean()

Unnamed: 0_level_0,Iteration,Time,Reward
Epsilon,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
1e-12,1066.0,0.13041,2.550653
1e-09,864.0,0.123002,2.478828
1e-06,662.5,0.084772,2.529418
0.001,460.5,0.060688,2.547446


In [43]:
vi_df[['Gamma', 'Iteration', 'Time', 'Reward']].groupby('Gamma').mean()

Unnamed: 0_level_0,Iteration,Time,Reward
Gamma,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
0.9,209.0,0.028141,2.733309
0.99,1317.5,0.171295,2.319864


# Policy Iteration

In [32]:
pi_df = run_pi(P, R, epsilon=[1e-3, 1e-6, 1e-9, 1e-12], gamma=[0.9, 0.99])

Finished: eps=0.001, gamma=0.9
Finished: eps=0.001, gamma=0.99
Finished: eps=1e-06, gamma=0.9
Finished: eps=1e-06, gamma=0.99
Finished: eps=1e-09, gamma=0.9
Finished: eps=1e-09, gamma=0.99
Finished: eps=1e-12, gamma=0.9
Finished: eps=1e-12, gamma=0.99


In [44]:
pi_df[['Epsilon', 'Gamma', 'Iteration', 'Time', 'Reward']]

Unnamed: 0,Epsilon,Gamma,Iteration,Time,Reward
0,0.001,0.9,46,0.233041,2.76267
1,0.001,0.99,264,1.282655,2.317722
2,1e-06,0.9,46,0.220595,2.799463
3,1e-06,0.99,264,1.343199,2.338921
4,0.0,0.9,46,0.289921,2.778699
5,0.0,0.99,264,1.350103,2.376883
6,0.0,0.9,46,0.238062,2.707856
7,0.0,0.99,264,1.326248,2.369715


In [45]:
pi_df[['Epsilon', 'Iteration', 'Time', 'Reward']].groupby('Epsilon').mean()

Unnamed: 0_level_0,Iteration,Time,Reward
Epsilon,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
1e-12,155.0,0.782155,2.538786
1e-09,155.0,0.820012,2.577791
1e-06,155.0,0.781897,2.569192
0.001,155.0,0.757848,2.540196


In [46]:
pi_df[['Gamma', 'Iteration', 'Time', 'Reward']].groupby('Gamma').mean()

Unnamed: 0_level_0,Iteration,Time,Reward
Gamma,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
0.9,46.0,0.245405,2.762172
0.99,264.0,1.325551,2.350811


# Q Learning

In [34]:
alpha_decs = [0.99, 0.999]
alpha_mins =[0.001, 0.0001]
eps = [10.0, 1.0]
eps_dec = [0.99, 0.999]
q_df = run_qlearning(P, R, discount=0.9, alpha_dec=alpha_decs, alpha_min=alpha_mins, 
            epsilon=eps, epsilon_decay=eps_dec, n_iter=1e6)

0: 2.645198698214663
1: 2.647742018130157
2: 2.6210366560490432
3: 2.613258625915354
4: 0.832
5: 2.6866480590497273
6: 2.5855550030005925
7: 2.663230354595905
8: 2.6508040919431064
9: 2.6466229049409007
10: 2.6513424540759196
11: 2.510908427734945
12: 2.5985486725307423
13: 2.6182337445895443
14: 0.802
15: 2.673702980674238


In [38]:
q_df[[ 'Alpha Decay', 'Alpha Min', 'Epsilon', 'Epsilon Decay',
       'Reward', 'Time']]

Unnamed: 0,Alpha Decay,Alpha Min,Epsilon,Epsilon Decay,Reward,Time
0,0.99,0.001,10.0,0.99,2.645199,56.542301
1,0.99,0.0001,10.0,0.99,2.647742,56.019397
2,0.999,0.001,10.0,0.99,2.621037,56.331551
3,0.999,0.0001,10.0,0.99,2.613259,56.17677
4,0.99,0.001,10.0,0.999,0.832,55.90587
5,0.99,0.0001,10.0,0.999,2.686648,56.292351
6,0.999,0.001,10.0,0.999,2.585555,55.61275
7,0.999,0.0001,10.0,0.999,2.66323,58.351593
8,0.99,0.001,1.0,0.99,2.650804,56.510756
9,0.99,0.0001,1.0,0.99,2.646623,56.217086


In [48]:
q_df[[ 'Alpha Decay',
       'Reward', 'Time']].groupby('Alpha Decay').mean()

Unnamed: 0_level_0,Reward,Time
Alpha Decay,Unnamed: 1_level_1,Unnamed: 2_level_1
0.99,2.415725,56.630793
0.999,2.390129,58.898493


In [49]:
q_df[[ 'Alpha Min',
       'Reward', 'Time']].groupby('Alpha Min').mean()

Unnamed: 0_level_0,Reward,Time
Alpha Min,Unnamed: 1_level_1,Unnamed: 2_level_1
0.0001,2.632543,58.30426
0.001,2.173311,57.225026


In [50]:
q_df[[ 'Epsilon', 
       'Reward', 'Time']].groupby('Epsilon').mean()

Unnamed: 0_level_0,Reward,Time
Epsilon,Unnamed: 1_level_1,Unnamed: 2_level_1
1.0,2.39402,59.125213
10.0,2.411834,56.404073


In [51]:
q_df[[ 'Epsilon Decay',
       'Reward', 'Time']].groupby('Epsilon Decay').mean()

Unnamed: 0_level_0,Reward,Time
Epsilon Decay,Unnamed: 1_level_1,Unnamed: 2_level_1
0.99,2.623364,57.251282
0.999,2.18249,58.278004


## Compare

In [4]:
vi_df_comp = run_vi(P, R, epsilon=[1e-3], gamma=[0.9])

Finished: eps=0.001, gamma=0.9


In [5]:
pi_df_comp = run_pi(P, R, epsilon=[1e-6], gamma=[0.9])

Finished: eps=1e-06, gamma=0.9


In [7]:
q_df_comp = run_qlearning(P, R, discount=0.9, alpha_dec=[0.99], alpha_min=[0.0001], 
            epsilon=[10], epsilon_decay=[0.999], n_iter=1e6)

0: 2.679298855966302


In [13]:
vi_opt_pol = vi_df_comp.loc[0,'Policy']
pi_opt_pol = pi_df_comp.loc[0,'Policy']
q_opt_pol = q_df_comp.loc[0,'Policy']

In [15]:
vi_opt_pol == pi_opt_pol

True

In [16]:
vi_opt_pol == q_opt_pol

False

In [17]:
count = 0
for i in range(len(vi_opt_pol)):
    if vi_opt_pol[i] != q_opt_pol[i]:
        count += 1

In [18]:
count

116

In [19]:
count / len(vi_opt_pol)

0.232