In [1]:
from hiive.mdptoolbox.mdp import ValueIteration, PolicyIteration, QLearning
from hiive.mdptoolbox.example import forest
# import hiive_mdptoolbox.example
# import hiive_mdptoolbox
import gym
import numpy as np
import sys
import os
from numpy.random import choice
import pandas as pd
import seaborn as sns
np.random.seed(1)

In [2]:
# Build the MDP used by every experiment below via mdptoolbox's `forest`
# example; P is indexed as P[action][state] and R as R[state][action] by
# the code that consumes them in this notebook.
# NOTE(review): S is presumably the number of states, r1/r2 the rewards for
# waiting/cutting in the oldest state and p the wildfire probability --
# confirm against the mdptoolbox `forest` documentation.
P, R = forest(S=500, r1=100, r2= 15, p=0.01)

In [3]:
def running_mean(x, N):
    """Return the moving average of ``x`` over a sliding window of ``N`` samples.

    Parameters
    ----------
    x : array-like
        One-dimensional sequence of numbers.
    N : int
        Window length.

    Returns
    -------
    numpy.ndarray
        The mean of every full window; for ``1 <= N <= len(x)`` it holds
        ``len(x) - N + 1`` values.
    """
    # Prefix sums with a leading zero: any window total is then the
    # difference between two entries that sit N positions apart.
    prefix = np.cumsum(np.insert(x, 0, 0))
    window_totals = prefix[N:] - prefix[:-N]
    return window_totals / float(N)

In [4]:
def test_policy(P, R, policy, test_count=100, gamma=0.9):
    num_state = P.shape[-1]
    total_episode = num_state * test_count
    # start in each state
    total_reward = 0
    for state in range(num_state):
        state_reward = 0
        for state_episode in range(test_count):
            episode_reward = 0
            disc_rate = 1
            while True:
                # take step
                action = policy[state]
                # get next step using P
                probs = P[action][state]
                candidates = list(range(len(P[action][state])))
                next_state =  choice(candidates, 1, p=probs)[0]
                # get the reward
                reward = R[state][action] * disc_rate
                episode_reward += reward
                # when go back to 0 ended
                disc_rate *= gamma
                if next_state == 0:
                    break
            state_reward += episode_reward
        total_reward += state_reward
    return total_reward / total_episode

In [5]:
def trainVI(P, R, discount=0.9, epsilon=(1e-9,)):
    """Run value iteration once per stopping threshold and tabulate the results.

    Parameters
    ----------
    P, R
        Transition and reward arrays, passed to ``ValueIteration`` and to
        ``test_policy``.
    discount : float, optional
        Discount factor used both for solving and for evaluating the
        resulting policy.
    epsilon : iterable of float, optional
        Stopping thresholds to try; one solver run is made per value.

    Returns
    -------
    pandas.DataFrame
        One row per epsilon with the columns ``Epsilon``, ``Policy``,
        ``Iteration``, ``Time``, ``Reward`` and ``Value Function``.
    """
    columns = ["Epsilon", "Policy", "Iteration",
               "Time", "Reward", "Value Function"]
    rows = []
    for eps in epsilon:
        vi = ValueIteration(P, R, gamma=discount, epsilon=eps, max_iter=int(1e15))
        vi.run()
        # Evaluate with the same discount the solver optimised for;
        # otherwise the reported reward silently assumes gamma=0.9.
        reward = test_policy(P, R, vi.policy, gamma=discount)
        rows.append([float(eps), vi.policy, vi.iter, vi.time, reward, vi.V])
    # Build the frame once instead of enlarging it row by row.
    return pd.DataFrame(rows, columns=columns)

In [6]:
# Sweep value iteration over progressively tighter epsilon values
# (1e-1 down to 1e-15) and display the summary table, one row per epsilon.
vi_df = trainVI(P, R, epsilon=[1e-1, 1e-3, 1e-6, 1e-9, 1e-12, 1e-15])
vi_df

Unnamed: 0,Epsilon,Policy,Iteration,Time,Reward,Value Function
0,0.1,"(0, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, ...",79,0.008999,2.684473,"(4.710556185449387, 5.239434944489701, 5.23943..."
1,0.001,"(0, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, ...",119,0.013,2.68553,"(4.7117745667154995, 5.240595870281114, 5.2405..."
2,1e-06,"(0, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, ...",179,0.019622,2.703008,"(4.711792669916437, 5.240613400253226, 5.24061..."
3,1e-09,"(0, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, ...",239,0.026042,2.728511,"(4.711792702216012, 5.240613431989174, 5.24061..."
4,1e-12,"(0, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, ...",299,0.033547,2.76384,"(4.711792702273827, 5.240613432046434, 5.24061..."
5,1e-15,"(0, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, ...",349,0.038574,2.74039,"(4.7117927022739305, 5.240613432046538, 5.2406..."


In [7]:
# Solve the same MDP with policy iteration, then keep the resulting policy,
# its simulated reward from test_policy, and the solver's `iter` and `time`
# attributes for comparison with the other solvers.
pi = PolicyIteration(P, R, gamma=0.9, max_iter=1e6)
pi.run()
pi_pol = pi.policy
# test_policy samples transitions, so this value depends on the random state.
pi_reward = test_policy(P, R, pi_pol)
pi_iter = pi.iter
pi_time = pi.time
# Displayed as (iterations, time, reward).
pi_iter, pi_time, pi_reward

(46, 0.2581815719604492, 2.7514575862277217)

Q Learning

In [8]:
def trainQ(P, R, discount=0.9, alpha_dec=(0.99,), alpha_min=(0.001,),
           epsilon=(1.0,), epsilon_decay=(0.99,), n_iter=(1000000,)):
    """Grid-search Q-learning hyperparameters and tabulate every run.

    One ``QLearning`` run is made for each combination of the given values.
    After each run the learned policy is evaluated with ``test_policy`` and
    a progress line ``"<run number>: <reward>"`` is printed.

    Parameters
    ----------
    P, R
        Transition and reward arrays, passed to ``QLearning`` and to
        ``test_policy``.
    discount : float, optional
        Discount factor used both for learning and for evaluating the
        learned policy.
    alpha_dec, alpha_min, epsilon, epsilon_decay, n_iter : iterable, optional
        Candidate values for the ``QLearning`` arguments ``alpha_decay``,
        ``alpha_min``, ``epsilon``, ``epsilon_decay`` and ``n_iter``.

    Returns
    -------
    pandas.DataFrame
        One row per combination with the columns ``Iterations``,
        ``Alpha Decay``, ``Alpha Min``, ``Epsilon``, ``Epsilon Decay``,
        ``Reward``, ``Time``, ``Policy``, ``Value Function`` and
        ``Training Rewards`` (the ``'Reward'`` entry of each element of the
        learner's ``run_stats``).
    """
    from itertools import product

    columns = ["Iterations", "Alpha Decay", "Alpha Min",
               "Epsilon", "Epsilon Decay", "Reward",
               "Time", "Policy", "Value Function",
               "Training Rewards"]

    # Argument order mirrors the nesting n_iter > epsilon > epsilon_decay >
    # alpha_dec > alpha_min, so rows come out in the same order as with
    # explicit nested loops.
    grid = product(n_iter, epsilon, epsilon_decay, alpha_dec, alpha_min)

    rows = []
    for count, (i, eps, eps_dec, a_dec, a_min) in enumerate(grid, start=1):
        q = QLearning(P, R, discount, alpha_decay=a_dec,
                      alpha_min=a_min, epsilon=eps,
                      epsilon_decay=eps_dec, n_iter=i)
        q.run()
        # Evaluate with the same discount the learner used; otherwise the
        # reported reward silently assumes gamma=0.9.
        reward = test_policy(P, R, q.policy, gamma=discount)
        print("{}: {}".format(count, reward))
        rews = [s['Reward'] for s in q.run_stats]
        rows.append([i, a_dec, a_min, eps, eps_dec, reward,
                     q.time, q.policy, q.V, rews])
    # Build the frame once instead of enlarging it row by row.
    return pd.DataFrame(rows, columns=columns)

In [9]:
# Hyperparameter grid for the Q-learning sweep: 2 values for each of the
# five settings, i.e. 32 runs in total.
alpha_decs = [0.99, 0.999]
alpha_mins =[0.001, 0.0001]
# NOTE(review): epsilon looks like an exploration probability, so 10.0 is
# outside [0, 1] -- verify how QLearning treats it (if it is clipped to 1,
# both values here describe the same setting).
eps = [10.0, 1.0]
eps_dec = [0.99, 0.999]
# The recorded outputs show roughly 36 s per 1e6-iteration run and
# roughly 355 s per 1e7-iteration run, so this cell is slow to re-execute.
iters = [1000000, 10000000]
q_df = trainQ(P, R, discount=0.9, alpha_dec=alpha_decs, alpha_min=alpha_mins, 
            epsilon=eps, epsilon_decay=eps_dec, n_iter=iters)

1: 2.687266872467038
2: 2.6941917725267146
3: 2.6130856934861337
4: 2.6354183457293745
5: 2.5946525439456187
6: 2.578153028298264
7: 2.605474004646821
8: 2.596000510175219
9: 2.6879958688866883
10: 2.632698070930168
11: 2.5620752690210344
12: 2.655248908925676
13: 2.6350142697702115
14: 2.5972195023801383
15: 2.5993041742440663
16: 2.64146729611628
17: 2.7428394253877504
18: 2.8023140156328004
19: 2.801215257481723
20: 2.866498607953126
21: 2.7342701838054717
22: 2.808678194062356
23: 2.7476693805997607
24: 2.8049362439302796
25: 2.8037095062176873
26: 2.786840065915448
27: 2.7798216843030823
28: 2.7870238293999026
29: 2.7399805986055767
30: 2.872250144174092
31: 2.818059720307557
32: 2.821077918235218


In [10]:
# Display the Q-learning sweep results, one row per hyperparameter combination.
q_df

Unnamed: 0,Iterations,Alpha Decay,Alpha Min,Epsilon,Epsilon Decay,Reward,Time,Policy,Value Function,Training Rewards
0,1000000,0.99,0.001,10.0,0.99,2.687267,35.644933,"(0, 1, 1, 1, 1, 1, 0, 1, 1, 1, 0, 1, 0, 1, 1, ...","(4.712867366396945, 5.241544637419329, 5.24207...","[0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 1.0, 0.0, 0.0, ..."
1,1000000,0.99,0.0001,10.0,0.99,2.694192,35.704317,"(0, 1, 1, 1, 1, 1, 0, 1, 1, 1, 1, 1, 1, 1, 1, ...","(4.673288623632353, 5.201489510785222, 4.36982...","[0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, ..."
2,1000000,0.999,0.001,10.0,0.99,2.613086,35.545834,"(0, 1, 1, 1, 1, 1, 0, 1, 1, 1, 1, 1, 1, 1, 1, ...","(4.713994218416997, 5.2425928115063485, 5.2402...","[0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, ..."
3,1000000,0.999,0.0001,10.0,0.99,2.635418,35.276222,"(0, 1, 1, 0, 1, 0, 1, 1, 1, 0, 0, 0, 1, 1, 1, ...","(4.710179611958519, 5.238968370463088, 5.07877...","[0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 1.0, ..."
4,1000000,0.99,0.001,10.0,0.999,2.594653,35.489483,"(0, 1, 1, 1, 1, 1, 1, 1, 1, 1, 0, 1, 1, 1, 0, ...","(4.710597570993974, 5.239911102598716, 5.23988...","[1.0, 0.0, 0.0, 0.0, 1.0, 1.0, 0.0, 0.0, 0.0, ..."
5,1000000,0.99,0.0001,10.0,0.999,2.578153,35.274308,"(0, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 0, ...","(4.666988347406458, 5.195525021182049, 4.31935...","[0.0, 0.0, 0.0, 0.0, 0.0, 1.0, 0.0, 1.0, 1.0, ..."
6,1000000,0.999,0.001,10.0,0.999,2.605474,35.734805,"(0, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 0, 1, 1, ...","(4.709580409103406, 5.23863480376842, 5.239289...","[0.0, 1.0, 1.0, 0.0, 1.0, 0.0, 0.0, 0.0, 0.0, ..."
7,1000000,0.999,0.0001,10.0,0.999,2.596001,35.459876,"(0, 1, 1, 1, 1, 1, 1, 1, 0, 1, 1, 1, 1, 1, 1, ...","(4.709151181333247, 5.237589206038483, 5.14132...","[1.0, 0.0, 0.0, 1.0, 0.0, 0.0, 0.0, 0.0, 0.0, ..."
8,1000000,0.99,0.001,1.0,0.99,2.687996,35.916738,"(0, 1, 1, 1, 0, 1, 1, 1, 1, 0, 1, 1, 1, 0, 1, ...","(4.713245523606477, 5.241856288512258, 5.24102...","[0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, ..."
9,1000000,0.99,0.0001,1.0,0.99,2.632698,35.855484,"(0, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 0, 1, ...","(4.6725296754938155, 5.201291410650221, 4.3874...","[0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 1.0, ..."


In [11]:
# Check, row by row, whether a Q-learning policy is identical to the
# policy-iteration policy; the recorded output has one boolean per sweep row.
# NOTE(review): this relies on pi_pol being treated as a single value (a
# tuple, presumably) rather than being broadcast element-wise -- confirm.
pi_pol == q_df.Policy

0     False
1     False
2     False
3     False
4     False
5     False
6     False
7     False
8     False
9     False
10    False
11    False
12    False
13    False
14    False
15    False
16    False
17    False
18    False
19    False
20    False
21    False
22    False
23    False
24    False
25    False
26    False
27    False
28    False
29    False
30    False
31    False
Name: Policy, dtype: bool

In [12]:
# Average the sweep results per iteration budget.
# NOTE(review): the frame also holds non-numeric columns (Policy, Value
# Function, Training Rewards); pandas >= 2.0 no longer drops those silently
# in GroupBy.mean, so numeric_only=True may be needed there -- confirm
# against the installed pandas version.
q_df.groupby("Iterations").mean()

Unnamed: 0_level_0,Alpha Decay,Alpha Min,Epsilon,Epsilon Decay,Reward,Time
Iterations,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1
1000000,0.9945,0.00055,5.5,0.9945,2.625954,35.701204
10000000,0.9945,0.00055,5.5,0.9945,2.794824,355.424679


In [13]:
# Average the sweep results per epsilon-decay value.
q_df.groupby("Epsilon Decay").mean()

Unnamed: 0_level_0,Alpha Decay,Alpha Min,Epsilon,Reward,Time
Epsilon Decay,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1
0.99,0.9945,0.00055,5.5,2.72114,195.465613
0.999,0.9945,0.00055,5.5,2.699638,195.66027


In [14]:
# Average the sweep results per alpha-decay value.
q_df.groupby("Alpha Decay").mean()

Unnamed: 0_level_0,Alpha Min,Epsilon,Epsilon Decay,Reward,Time
Alpha Decay,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1
0.99,0.00055,5.5,0.9945,2.71238,195.592162
0.999,0.00055,5.5,0.9945,2.708399,195.53372


In [15]:
# Average the sweep results per epsilon-decay value.
# NOTE(review): this repeats the "Epsilon Decay" grouping from an earlier
# cell verbatim, while "Epsilon" is the one swept setting with no grouping
# among the cells visible here -- probably groupby("Epsilon") was intended;
# confirm and re-run.
q_df.groupby("Epsilon Decay").mean()

Unnamed: 0_level_0,Alpha Decay,Alpha Min,Epsilon,Reward,Time
Epsilon Decay,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1
0.99,0.9945,0.00055,5.5,2.72114,195.465613
0.999,0.9945,0.00055,5.5,2.699638,195.66027


In [16]:
# Average the sweep results per alpha-min value.
q_df.groupby("Alpha Min").mean()

Unnamed: 0_level_0,Alpha Decay,Epsilon,Epsilon Decay,Reward,Time
Alpha Min,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1
0.0001,0.9945,5.5,0.9945,2.723751,194.786881
0.001,0.9945,5.5,0.9945,2.697027,196.339002
