In [4]:
from hiive.mdptoolbox.mdp import ValueIteration, PolicyIteration, QLearning
from hiive.mdptoolbox.example import forest
# import hiive_mdptoolbox.example
# import hiive_mdptoolbox
import gym
import numpy as np
import sys
import os
from numpy.random import choice
import pandas as pd
import seaborn as sns
np.random.seed(44)

In [5]:
# Build the forest MDP. P and R are the transition and reward inputs handed to
# every solver in this notebook and to test_policy, which reads them as
# P[action][state] and R[state][action].
P, R = forest(S=500, r1=100, r2= 15, p=0.01)

In [6]:
def running_mean(x, N):
    """Return the moving average of ``x`` over sliding windows of length ``N``.

    A zero is prepended to ``x`` so that differences of the prefix sums taken
    ``N`` positions apart give the sum of each window; dividing by ``N`` turns
    those sums into means. For ``1 <= N <= len(x)`` the result has
    ``len(x) - N + 1`` entries.
    """
    prefix_sums = np.cumsum(np.insert(x, 0, 0))
    window_totals = prefix_sums[N:] - prefix_sums[:-N]
    return window_totals / float(N)

In [7]:
def test_policy(P, R, policy, test_count=100, gamma=0.9):
    num_state = P.shape[-1]
    total_episode = num_state * test_count
    # start in each state
    total_reward = 0
    for state in range(num_state):
        state_reward = 0
        for state_episode in range(test_count):
            episode_reward = 0
            disc_rate = 1
            while True:
                # take step
                action = policy[state]
                # get next step using P
                probs = P[action][state]
                candidates = list(range(len(P[action][state])))
                next_state =  choice(candidates, 1, p=probs)[0]
                # get the reward
                reward = R[state][action] * disc_rate
                episode_reward += reward
                # when go back to 0 ended
                disc_rate *= gamma
                if next_state == 0:
                    break
            state_reward += episode_reward
        total_reward += state_reward
    return total_reward / total_episode


In [8]:
def trainVI(P, R, discount=0.9, epsilon=(1e-9,)):
    """Run value iteration once per ``epsilon`` value and tabulate the results.

    Parameters
    ----------
    P, R :
        Transition and reward inputs passed straight to ``ValueIteration`` and
        ``test_policy``.
    discount : float
        Discount factor, used both to solve the MDP and to score the
        resulting policy so the two stay consistent.
    epsilon : iterable of float
        Values passed as ``ValueIteration``'s ``epsilon`` argument; one
        result row is produced per value.

    Returns
    -------
    pandas.DataFrame
        One row per epsilon with columns ``Epsilon``, ``Policy``,
        ``Iteration``, ``Time``, ``Reward`` and ``Value Function``.
    """
    columns = ["Epsilon", "Policy", "Iteration", "Time", "Reward", "Value Function"]
    records = []
    for eps in epsilon:
        vi = ValueIteration(P, R, gamma=discount, epsilon=eps, max_iter=int(1e15))
        vi.run()
        # Score the policy with the same discount it was optimised for.
        reward = test_policy(P, R, vi.policy, gamma=discount)
        records.append({
            "Epsilon": float(eps),
            "Policy": vi.policy,
            "Iteration": vi.iter,
            "Time": vi.time,
            "Reward": reward,
            "Value Function": vi.V,
        })
    # Build the frame once instead of growing it row by row.
    return pd.DataFrame(records, columns=columns)

In [9]:
# Sweep value iteration over progressively smaller epsilon values; trainVI
# returns one result row per value, displayed below.
vi_df = trainVI(P, R, epsilon=[1e-1, 1e-3, 1e-6, 1e-9, 1e-12, 1e-15])
vi_df

Unnamed: 0,Epsilon,Policy,Iteration,Time,Reward,Value Function
0,0.1,"(0, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, ...",79,0.038722,2.790057,"(4.710556185449387, 5.239434944489701, 5.23943..."
1,0.001,"(0, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, ...",119,0.031787,2.726258,"(4.7117745667154995, 5.240595870281114, 5.2405..."
2,1e-06,"(0, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, ...",179,0.045519,2.737932,"(4.711792669916437, 5.240613400253226, 5.24061..."
3,1e-09,"(0, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, ...",239,0.063116,2.801772,"(4.711792702216012, 5.240613431989174, 5.24061..."
4,1e-12,"(0, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, ...",299,0.158067,2.736673,"(4.711792702273827, 5.240613432046434, 5.24061..."
5,1e-15,"(0, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, ...",349,0.166646,2.728691,"(4.7117927022739305, 5.240613432046538, 5.2406..."


In [21]:
# Compact view of the value-iteration sweep: scalar columns only.
table_df = vi_df[["Epsilon", "Iteration", "Time", "Reward"]]

# Font sizes applied to header (th) and body (td) cells of styled tables.
heading_properties = [("font-size", "18px")]
cell_properties = [("font-size", "16px")]
dfstyle = [
    {"selector": "th", "props": heading_properties},
    {"selector": "td", "props": cell_properties},
]

table_df.style.set_table_styles(dfstyle)

Unnamed: 0,Epsilon,Iteration,Time,Reward
0,0.1,79,0.038722,2.790057
1,0.001,119,0.031787,2.726258
2,1e-06,179,0.045519,2.737932
3,0.0,239,0.063116,2.801772
4,0.0,299,0.158067,2.736673
5,0.0,349,0.166646,2.728691


In [10]:
# Solve the same MDP with policy iteration, then score the resulting policy
# by simulation with test_policy.
pi = PolicyIteration(P, R, gamma=0.9, max_iter=1e6)
pi.run()
pi_pol, pi_iter, pi_time = pi.policy, pi.iter, pi.time
pi_reward = test_policy(P, R, pi_pol)
(pi_iter, pi_time, pi_reward)

(46, 12.021312236785889, 2.7411655523985554)

# Q-Learning

In [11]:
def trainQ(P, R, discount=0.9, alpha_dec=(0.99,), alpha_min=(0.001,),
            epsilon=(1.0,), epsilon_decay=(0.99,), n_iter=(1000000,)):
    """Grid-search Q-learning hyperparameters and tabulate every run.

    One ``QLearning`` run is performed for each combination of the supplied
    values, iterating ``n_iter`` slowest, then ``epsilon``, ``epsilon_decay``,
    ``alpha_dec`` and ``alpha_min`` fastest. After each run the learned policy
    is scored with ``test_policy`` and a ``"<run number>: <reward>"`` progress
    line is printed.

    Parameters
    ----------
    P, R :
        Transition and reward inputs passed straight to ``QLearning`` and
        ``test_policy``.
    discount : float
        Discount factor, used both for learning and for scoring the policy so
        the two stay consistent.
    alpha_dec, alpha_min, epsilon, epsilon_decay, n_iter : iterable
        Candidate values for the ``QLearning`` arguments ``alpha_decay``,
        ``alpha_min``, ``epsilon``, ``epsilon_decay`` and ``n_iter``.

    Returns
    -------
    pandas.DataFrame
        One row per combination with columns ``Iterations``, ``Alpha Decay``,
        ``Alpha Min``, ``Epsilon``, ``Epsilon Decay``, ``Reward``, ``Time``,
        ``Policy``, ``Value Function`` and ``Training Rewards`` (the
        ``'Reward'`` entry of every element of ``run_stats``).
    """
    from itertools import product

    columns = ["Iterations", "Alpha Decay", "Alpha Min",
               "Epsilon", "Epsilon Decay", "Reward",
               "Time", "Policy", "Value Function",
               "Training Rewards"]
    records = []

    # product() varies its last argument fastest, matching the documented order.
    grid = product(n_iter, epsilon, epsilon_decay, alpha_dec, alpha_min)
    for count, (i, eps, eps_dec, a_dec, a_min) in enumerate(grid, start=1):
        q = QLearning(P, R, discount, alpha_decay=a_dec,
                      alpha_min=a_min, epsilon=eps,
                      epsilon_decay=eps_dec, n_iter=i)
        q.run()
        # Score the policy with the same discount it was learned under.
        reward = test_policy(P, R, q.policy, gamma=discount)
        print("{}: {}".format(count, reward))
        rews = [s['Reward'] for s in q.run_stats]
        records.append({
            "Iterations": i,
            "Alpha Decay": a_dec,
            "Alpha Min": a_min,
            "Epsilon": eps,
            "Epsilon Decay": eps_dec,
            "Reward": reward,
            "Time": q.time,
            "Policy": q.policy,
            "Value Function": q.V,
            "Training Rewards": rews,
        })
    # Build the frame once instead of growing it row by row.
    return pd.DataFrame(records, columns=columns)

In [12]:
# Hyperparameter grid for Q-learning: two candidates per setting, so trainQ
# performs 2*2*2*2*2 = 32 runs (16 at 1,000,000 iterations and 16 at 10,000,000).
alpha_decs = [0.99, 0.999]
alpha_mins =[0.001, 0.0001]
# NOTE(review): 10.0 is above 1; if QLearning treats epsilon as an exploration
# probability this presumably only delays the point where decay brings it
# below 1 — confirm this value is intended.
eps = [10.0, 1.0]
eps_dec = [0.99, 0.999]
iters = [1000000, 10000000]
# trainQ prints one "<run number>: <reward>" line per combination as it goes.
q_df = trainQ(P, R, discount=0.9, alpha_dec=alpha_decs, alpha_min=alpha_mins, 
            epsilon=eps, epsilon_decay=eps_dec, n_iter=iters)

1: 2.650183487415402
2: 2.6363744404711285
3: 2.6072561523039504
4: 2.6117818590299353
5: 2.57748663663716
6: 2.67399605909375
7: 2.6278609803234336
8: 2.626661209650027
9: 2.5297012242759016
10: 2.655280375155995
11: 2.6601539567558454
12: 2.608396759527677
13: 2.6790831093841865
14: 2.6850597811143957
15: 0.822
16: 2.676218333466832
17: 2.7135200868055835
18: 2.8117761382427333
19: 2.764109280376738
20: 2.765967761281483
21: 2.7403159646717272
22: 2.861893456347953
23: 2.656581016247709
24: 2.8327739465709403
25: 2.812239418250017
26: 2.7756577123354447
27: 2.7755963806781256
28: 2.7871103415832237
29: 2.753035768577412
30: 2.850375830314547
31: 2.7556116413006757
32: 2.8514372672801582


In [13]:
# Show the raw sweep results, including the per-run Policy, Value Function and
# Training Rewards columns.
q_df

Unnamed: 0,Iterations,Alpha Decay,Alpha Min,Epsilon,Epsilon Decay,Reward,Time,Policy,Value Function,Training Rewards
0,1000000,0.99,0.001,10.0,0.99,2.650183,68.845374,"(0, 1, 1, 1, 0, 1, 0, 1, 0, 1, 1, 1, 1, 1, 1, ...","(4.712210801507589, 5.241277084192105, 5.24121...","[0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, ..."
1,1000000,0.99,0.0001,10.0,0.99,2.636374,69.441975,"(0, 1, 1, 0, 1, 1, 1, 1, 1, 1, 1, 1, 1, 0, 1, ...","(4.672188906786895, 5.200874840421224, 4.37178...","[0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, ..."
2,1000000,0.999,0.001,10.0,0.99,2.607256,69.540786,"(0, 1, 1, 1, 1, 1, 0, 1, 1, 1, 1, 0, 1, 1, 0, ...","(4.711828804944402, 5.241218072985039, 5.24151...","[0.0, 0.0, 0.0, 0.0, 0.0, 1.0, 0.0, 0.0, 0.0, ..."
3,1000000,0.999,0.0001,10.0,0.99,2.611782,68.276857,"(0, 1, 1, 1, 1, 1, 1, 1, 1, 0, 1, 1, 1, 0, 0, ...","(4.710461366899375, 5.239581376346029, 5.09524...","[0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 1.0, 0.0, ..."
4,1000000,0.99,0.001,10.0,0.999,2.577487,68.128824,"(0, 1, 1, 1, 1, 1, 1, 1, 0, 1, 1, 1, 0, 1, 0, ...","(4.71057876795351, 5.240113535858624, 5.240027...","[0.0, 0.0, 0.0, 1.0, 1.0, 0.0, 1.0, 1.0, 0.0, ..."
5,1000000,0.99,0.0001,10.0,0.999,2.673996,69.000589,"(0, 1, 1, 1, 1, 1, 0, 1, 1, 1, 0, 0, 1, 0, 1, ...","(4.666537393022336, 5.1952190427092955, 4.3410...","[0.0, 1.0, 0.0, 0.0, 0.0, 1.0, 0.0, 0.0, 0.0, ..."
6,1000000,0.999,0.001,10.0,0.999,2.627861,68.723511,"(0, 1, 1, 1, 1, 0, 1, 1, 1, 1, 1, 1, 1, 1, 1, ...","(4.713478207004994, 5.242419702750562, 5.24130...","[1.0, 1.0, 1.0, 1.0, 0.0, 1.0, 1.0, 1.0, 0.0, ..."
7,1000000,0.999,0.0001,10.0,0.999,2.626661,67.591351,"(0, 1, 1, 1, 1, 0, 1, 1, 1, 1, 0, 1, 0, 1, 1, ...","(4.708203589152052, 5.237402752385588, 5.13867...","[1.0, 0.0, 1.0, 0.0, 0.0, 0.0, 1.0, 0.0, 0.0, ..."
8,1000000,0.99,0.001,1.0,0.99,2.529701,67.769544,"(0, 1, 1, 0, 1, 1, 1, 1, 1, 1, 1, 0, 1, 1, 1, ...","(4.711667962492642, 5.240921762076372, 5.24098...","[0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, ..."
9,1000000,0.99,0.0001,1.0,0.99,2.65528,67.983159,"(0, 1, 1, 1, 1, 0, 1, 1, 1, 0, 0, 1, 1, 0, 1, ...","(4.67205487152553, 5.200652330776339, 4.374608...","[0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, ..."


In [22]:
# Scalar-only view of the Q-learning sweep: the sequence-valued columns are removed.
q_df_table = q_df.drop(columns=["Value Function", "Training Rewards", "Policy"])

# Font sizes applied to header (th) and body (td) cells of styled tables.
heading_properties = [("font-size", "18px")]
cell_properties = [("font-size", "16px")]
dfstyle = [
    {"selector": "th", "props": heading_properties},
    {"selector": "td", "props": cell_properties},
]

q_df_table.style.set_table_styles(dfstyle)

Unnamed: 0,Iterations,Alpha Decay,Alpha Min,Epsilon,Epsilon Decay,Reward,Time
0,1000000,0.99,0.001,10.0,0.99,2.650183,68.845374
1,1000000,0.99,0.0001,10.0,0.99,2.636374,69.441975
2,1000000,0.999,0.001,10.0,0.99,2.607256,69.540786
3,1000000,0.999,0.0001,10.0,0.99,2.611782,68.276857
4,1000000,0.99,0.001,10.0,0.999,2.577487,68.128824
5,1000000,0.99,0.0001,10.0,0.999,2.673996,69.000589
6,1000000,0.999,0.001,10.0,0.999,2.627861,68.723511
7,1000000,0.999,0.0001,10.0,0.999,2.626661,67.591351
8,1000000,0.99,0.001,1.0,0.99,2.529701,67.769544
9,1000000,0.99,0.0001,1.0,0.99,2.65528,67.983159


In [14]:
# For each sweep row, check whether the Q-learning policy is identical to the
# policy found by policy iteration (whole-policy comparison, one boolean per row).
q_df["Policy"].map(lambda q_policy: q_policy == pi_pol)

0     False
1     False
2     False
3     False
4     False
5     False
6     False
7     False
8     False
9     False
10    False
11    False
12    False
13    False
14    False
15    False
16    False
17    False
18    False
19    False
20    False
21    False
22    False
23    False
24    False
25    False
26    False
27    False
28    False
29    False
30    False
31    False
Name: Policy, dtype: bool

In [23]:
# Mean of the scalar sweep columns per iteration budget. The sequence-valued
# columns are removed first: pandas >= 2.0 raises on them in mean() instead of
# silently leaving them out of the result.
(
    q_df.drop(columns=["Policy", "Value Function", "Training Rewards"])
    .groupby("Iterations")
    .mean()
    .style.set_table_styles(dfstyle)
)

Unnamed: 0_level_0,Alpha Decay,Alpha Min,Epsilon,Epsilon Decay,Reward,Time
Iterations,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1
1000000,0.9945,0.00055,5.5,0.9945,2.520468,68.393224
10000000,0.9945,0.00055,5.5,0.9945,2.78175,665.461295


In [24]:
# Mean of the scalar sweep columns per epsilon-decay value. The sequence-valued
# columns are removed first: pandas >= 2.0 raises on them in mean() instead of
# silently leaving them out of the result.
(
    q_df.drop(columns=["Policy", "Value Function", "Training Rewards"])
    .groupby("Epsilon Decay")
    .mean()
    .style.set_table_styles(dfstyle)
)

Unnamed: 0_level_0,Alpha Decay,Alpha Min,Epsilon,Reward,Time
Epsilon Decay,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1
0.99,0.9945,0.00055,5.5,2.697819,368.438054
0.999,0.9945,0.00055,5.5,2.604399,365.416465


In [25]:
# Mean of the scalar sweep columns per alpha-decay value. The sequence-valued
# columns are removed first: pandas >= 2.0 raises on them in mean() instead of
# silently leaving them out of the result.
(
    q_df.drop(columns=["Policy", "Value Function", "Training Rewards"])
    .groupby("Alpha Decay")
    .mean()
    .style.set_table_styles(dfstyle)
)

Unnamed: 0_level_0,Alpha Min,Epsilon,Epsilon Decay,Reward,Time
Alpha Decay,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1
0.99,0.00055,5.5,0.9945,2.712874,366.509178
0.999,0.00055,5.5,0.9945,2.589345,367.345341


In [26]:
# NOTE(review): this repeats the "Epsilon Decay" grouping from an earlier cell,
# while "Epsilon" is the only swept hyperparameter without a summary table —
# confirm whether grouping by "Epsilon" was intended here.
# The sequence-valued columns are removed first: pandas >= 2.0 raises on them
# in mean() instead of silently leaving them out of the result.
(
    q_df.drop(columns=["Policy", "Value Function", "Training Rewards"])
    .groupby("Epsilon Decay")
    .mean()
    .style.set_table_styles(dfstyle)
)


Unnamed: 0_level_0,Alpha Decay,Alpha Min,Epsilon,Reward,Time
Epsilon Decay,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1
0.99,0.9945,0.00055,5.5,2.697819,368.438054
0.999,0.9945,0.00055,5.5,2.604399,365.416465


In [27]:
# Mean of the scalar sweep columns per alpha-min value. The sequence-valued
# columns are removed first: pandas >= 2.0 raises on them in mean() instead of
# silently leaving them out of the result.
(
    q_df.drop(columns=["Policy", "Value Function", "Training Rewards"])
    .groupby("Alpha Min")
    .mean()
    .style.set_table_styles(dfstyle)
)

Unnamed: 0_level_0,Alpha Decay,Epsilon,Epsilon Decay,Reward,Time
Alpha Min,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1
0.0001,0.9945,5.5,0.9945,2.731923,366.680707
0.001,0.9945,5.5,0.9945,2.570296,367.173812
