In [2]:
from hiive.mdptoolbox.mdp import ValueIteration, PolicyIteration, QLearning
from hiive.mdptoolbox.example import forest
# import hiive_mdptoolbox.example
# import hiive_mdptoolbox
import gym
import numpy as np
import sys
import os
from numpy.random import choice
import pandas as pd
import seaborn as sns
# Seed NumPy's global RNG: test_policy samples transitions through numpy.random.choice,
# so this fixes the sequence of draws it sees on a fresh run.
np.random.seed(44)

In [3]:
# Build the transition model P (indexed P[action][state]) and reward table R
# (indexed R[state][action]) used by every solver below.
# NOTE(review): S is presumably the number of states, p the per-step wildfire
# probability and r1/r2 the rewards in the oldest state — confirm against the
# hiive.mdptoolbox `forest` documentation.
P, R = forest(S=20, r1=10, r2=6, p=0.1)

In [4]:
def running_mean(x, N):
    """Return the moving average of ``x`` over a sliding window of ``N`` samples.

    A zero is prepended to ``x`` and the cumulative sum is taken, so that each
    window total is the difference of two cumulative sums ``N`` positions
    apart. For ``1 <= N <= len(x)`` the result has ``len(x) - N + 1`` entries.
    """
    padded = np.insert(x, 0, 0)
    running_total = padded.cumsum()
    # running_total[k] is the sum of the first k samples of x.
    window_totals = running_total[N:] - running_total[:-N]
    return window_totals / float(N)

In [5]:
def test_policy(P, R, policy, test_count=1000, gamma=0.9):
    num_state = P.shape[-1]
    total_episode = num_state * test_count
    # start in each state
    total_reward = 0
    for state in range(num_state):
        state_reward = 0
        for state_episode in range(test_count):
            episode_reward = 0
            disc_rate = 1
            while True:
                # take step
                action = policy[state]
                # get next step using P
                probs = P[action][state]
                candidates = list(range(len(P[action][state])))
                next_state =  choice(candidates, 1, p=probs)[0]
                # get the reward
                reward = R[state][action] * disc_rate
                episode_reward += reward
                # when go back to 0 ended
                disc_rate *= gamma
                if next_state == 0:
                    break
            state_reward += episode_reward
        total_reward += state_reward
    return total_reward / total_episode


In [6]:
def trainVI(P, R, discount=0.9, epsilon=[1e-9]):
    """Run value iteration once per epsilon and tabulate the outcomes.

    Parameters
    ----------
    P, R
        Transition and reward definitions, passed unchanged to
        ``ValueIteration`` and ``test_policy``.
    discount : float
        Discount factor used both by the solver and when scoring the
        resulting policy.
    epsilon : iterable of float
        Epsilon values to try; one solver run (and one output row) per value.
        The default list is only read, never modified.

    Returns
    -------
    pandas.DataFrame
        One row per epsilon with the columns ``Epsilon``, ``Policy``,
        ``Iteration``, ``Time``, ``Reward`` and ``Value Function``, in the
        order the epsilons were given.
    """
    columns = ["Epsilon", "Policy", "Iteration",
               "Time", "Reward", "Value Function"]
    rows = []
    for eps in epsilon:
        # max_iter is set very high, presumably so that epsilon rather than
        # the iteration cap ends the run.
        vi = ValueIteration(P, R, gamma=discount, epsilon=eps, max_iter=int(1e15))
        vi.run()
        # Score the policy with the same discount the solver optimised for,
        # instead of test_policy's default gamma.
        reward = test_policy(P, R, vi.policy, gamma=discount)
        rows.append({
            "Epsilon": float(eps),
            "Policy": vi.policy,
            "Iteration": vi.iter,
            "Time": vi.time,
            "Reward": reward,
            "Value Function": vi.V,
        })
    # Build the frame once so pandas can infer numeric dtypes per column.
    return pd.DataFrame(rows, columns=columns)

In [7]:
# Run value iteration once for each epsilon, from 1e-1 down to 1e-15.
vi_df = trainVI(P, R, epsilon=[1e-1, 1e-3, 1e-6, 1e-9, 1e-12, 1e-15])
# Display the resulting table (one row per epsilon).
vi_df

Unnamed: 0,Epsilon,Policy,Iteration,Time,Reward,Value Function
0,0.1,"(0, 1, 1, 1, 1, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, ...",33,0.002749,2.843259,"(4.328504830081768, 4.881518644971712, 4.88151..."
1,0.001,"(0, 1, 1, 1, 1, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, ...",55,0.004467,2.842526,"(4.460720290173723, 5.013211594807497, 5.01321..."
2,1e-06,"(0, 1, 1, 1, 1, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, ...",87,0.007052,2.920512,"(4.474643139169861, 5.027129333047953, 5.02712..."
3,1e-09,"(0, 1, 1, 1, 1, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, ...",120,0.009778,2.884589,"(4.475122825121185, 5.027609012960728, 5.02760..."
4,1e-12,"(0, 1, 1, 1, 1, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, ...",153,0.012374,2.885044,"(4.475137648839068, 5.027623836684378, 5.02762..."
5,1e-15,"(0, 1, 1, 1, 1, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, ...",186,0.015038,2.913634,"(4.4751381069387985, 5.027624294784101, 5.0276..."


In [24]:
# Keep only the summary columns for the report table; "Value Function" is left out.
table_df = vi_df.loc[:, ["Epsilon", "Policy", "Iteration", "Time", "Reward"]]

In [26]:
# CSS overrides for the rendered table: header cells slightly larger than data cells.
heading_properties = [("font-size", "18px")]
cell_properties = [("font-size", "16px")]

# One rule per CSS selector, passed to Styler.set_table_styles below.
dfstyle = [
    {"selector": "th", "props": heading_properties},
    {"selector": "td", "props": cell_properties},
]

table_df.style.set_table_styles(dfstyle)

Unnamed: 0,Epsilon,Policy,Iteration,Time,Reward
0,0.1,"(0, 1, 1, 1, 1, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0)",33,0.002749,2.843259
1,0.001,"(0, 1, 1, 1, 1, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0)",55,0.004467,2.842526
2,1e-06,"(0, 1, 1, 1, 1, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0)",87,0.007052,2.920512
3,0.0,"(0, 1, 1, 1, 1, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0)",120,0.009778,2.884589
4,0.0,"(0, 1, 1, 1, 1, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0)",153,0.012374,2.885044
5,0.0,"(0, 1, 1, 1, 1, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0)",186,0.015038,2.913634


In [8]:
# Solve the same MDP with policy iteration, then score the resulting policy
# with the Monte-Carlo evaluator.
pi = PolicyIteration(P, R, gamma=0.9, max_iter=1e6)
pi.run()
pi_pol, pi_iter, pi_time = pi.policy, pi.iter, pi.time
pi_reward = test_policy(P, R, pi_pol)
# Show (iterations, solver time, mean discounted reward).
(pi_iter, pi_time, pi_reward)

(14, 0.10313749313354492, 2.8669479098730797)

In [9]:
# Display the policy found by policy iteration (one action per state).
pi_pol

(0, 1, 1, 1, 1, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0)

# Q-Learning

In [10]:
def trainQ(P, R, discount=0.9, alpha_dec=[0.99], alpha_min=[0.001],
            epsilon=[1.0], epsilon_decay=[0.99], n_iter=[1000000]):
    """Grid-search Q-learning hyper-parameters and tabulate every run.

    One ``QLearning`` run is made for each combination of the given values.
    Combinations are visited with ``n_iter`` varying slowest, then
    ``epsilon``, ``epsilon_decay``, ``alpha_dec`` and finally ``alpha_min``
    varying fastest. After each run the learned policy is scored with
    ``test_policy`` and a progress line ``"<run number>: <reward>"`` is printed.

    Parameters
    ----------
    P, R
        Transition and reward definitions, passed unchanged to ``QLearning``
        and ``test_policy``.
    discount : float
        Discount factor used both by the learner and when scoring the policy.
    alpha_dec, alpha_min, epsilon, epsilon_decay, n_iter : iterable
        Candidate values for the corresponding ``QLearning`` arguments
        (``alpha_decay``, ``alpha_min``, ``epsilon``, ``epsilon_decay``,
        ``n_iter``). The default lists are only read, never modified.

    Returns
    -------
    pandas.DataFrame
        One row per combination, in visiting order, with the columns
        ``Iterations``, ``Alpha Decay``, ``Alpha Min``, ``Epsilon``,
        ``Epsilon Decay``, ``Reward``, ``Time``, ``Policy``,
        ``Value Function`` and ``Training Rewards`` (the ``'Reward'`` entry of
        every element of ``run_stats``).
    """
    # Local import: itertools is not among the notebook's top-level imports.
    from itertools import product

    columns = ["Iterations", "Alpha Decay", "Alpha Min",
               "Epsilon", "Epsilon Decay", "Reward",
               "Time", "Policy", "Value Function",
               "Training Rewards"]

    # Same visiting order as five nested loops, outermost argument first.
    grid = product(n_iter, epsilon, epsilon_decay, alpha_dec, alpha_min)

    rows = []
    for count, (i, eps, eps_dec, a_dec, a_min) in enumerate(grid, start=1):
        q = QLearning(P, R, discount, alpha_decay=a_dec,
                      alpha_min=a_min, epsilon=eps,
                      epsilon_decay=eps_dec, n_iter=i)
        q.run()
        # Score the policy with the same discount the learner used, instead
        # of test_policy's default gamma.
        reward = test_policy(P, R, q.policy, gamma=discount)
        print("{}: {}".format(count, reward))
        rews = [s['Reward'] for s in q.run_stats]
        rows.append({
            "Iterations": i,
            "Alpha Decay": a_dec,
            "Alpha Min": a_min,
            "Epsilon": eps,
            "Epsilon Decay": eps_dec,
            "Reward": reward,
            "Time": q.time,
            "Policy": q.policy,
            "Value Function": q.V,
            "Training Rewards": rews,
        })
    # Build the frame once so pandas can infer numeric dtypes per column.
    return pd.DataFrame(rows, columns=columns)

In [11]:
# Hyper-parameter grid for Q-learning: 2 candidate values for each of 5
# settings, i.e. 32 runs in total.
# The recorded output shows roughly 64 s per 1e6-iteration run and roughly
# 656 s per 1e7-iteration run, so this cell is expensive to re-execute.
alpha_decs = [0.99, 0.999]
alpha_mins =[0.001, 0.0001]
# NOTE(review): 10.0 is greater than 1; if QLearning treats epsilon as an
# exploration probability this is out of range — confirm it is intended.
eps = [10.0, 1.0]
eps_dec = [0.99, 0.999]
iters = [1000000, 10000000]
q_df = trainQ(P, R, discount=0.9, alpha_dec=alpha_decs, alpha_min=alpha_mins, 
            epsilon=eps, epsilon_decay=eps_dec, n_iter=iters)

1: 3.2419625985195655
2: 3.45468712711018
3: 2.9725909845507967
4: 3.1199088702065234
5: 1.1
6: 3.4605589330321034
7: 3.1957922700013963
8: 0.8
9: 3.3193457298294207
10: 3.358040492544265
11: 3.052963918650888
12: 3.165223845408274
13: 3.4057947654678107
14: 3.4913868295783455
15: 3.200831840749126
16: 3.1403480871645533
17: 3.3697538256736776
18: 3.3341063241755533
19: 2.8207445098572754
20: 0.85
21: 0.95
22: 3.486291960372364
23: 3.271429634595206
24: 0.95
25: 3.1392181548026974
26: 3.4049870182429904
27: 3.1145079958705306
28: 2.9350139196004505
29: 3.455509245021752
30: 3.4166385707092233
31: 3.3223197902165595
32: 1.0


In [12]:
# Row by row: did value iteration arrive at the policy-iteration policy?
# The tuple is compared as a single value against each cell of the column.
vi_df["Policy"] == pi_pol

0    True
1    True
2    True
3    True
4    True
5    True
Name: Policy, dtype: bool

In [13]:
# Re-score the policy stored in the row labelled 18 of the Q-learning results.
test_policy(P, R, q_df.loc[18, "Policy"])

2.9311811474308658

In [14]:
# Display the full Q-learning results table (one row per hyper-parameter combination).
q_df

Unnamed: 0,Iterations,Alpha Decay,Alpha Min,Epsilon,Epsilon Decay,Reward,Time,Policy,Value Function,Training Rewards
0,1000000,0.99,0.001,10.0,0.99,3.241963,63.144067,"(0, 1, 1, 1, 1, 0, 0, 1, 0, 1, 1, 1, 0, 1, 1, ...","(4.469516616179936, 5.021068518048445, 5.02559...","[0.0, 0.0, 1.0, 0.0, 1.0, 0.0, 0.0, 0.0, 1.0, ..."
1,1000000,0.99,0.0001,10.0,0.99,3.454687,62.832406,"(0, 1, 1, 1, 1, 1, 1, 1, 1, 1, 0, 1, 0, 1, 1, ...","(4.439242057739486, 4.991535438072977, 4.02348...","[0.0, 0.0, 1.0, 0.0, 1.0, 0.0, 1.0, 1.0, 1.0, ..."
2,1000000,0.999,0.001,10.0,0.99,2.972591,62.451478,"(0, 1, 1, 1, 1, 0, 1, 0, 0, 1, 0, 0, 1, 0, 0, ...","(4.484346439269151, 5.034010306485361, 5.03050...","[1.0, 0.0, 0.0, 0.0, 0.0, 0.0, 1.0, 0.0, 0.0, ..."
3,1000000,0.999,0.0001,10.0,0.99,3.119909,62.65347,"(0, 1, 1, 1, 0, 0, 1, 1, 0, 1, 0, 1, 1, 0, 1, ...","(4.472149848246118, 5.025254217665293, 4.80726...","[0.0, 0.0, 0.0, 1.0, 0.0, 1.0, 0.0, 0.0, 1.0, ..."
4,1000000,0.99,0.001,10.0,0.999,1.1,62.286294,"(0, 1, 1, 1, 1, 1, 1, 0, 1, 1, 1, 1, 1, 1, 1, ...","(4.475160791588194, 5.030119252396098, 5.03305...","[6.0, 1.0, 0.0, 0.0, 1.0, 1.0, 1.0, 1.0, 0.0, ..."
5,1000000,0.99,0.0001,10.0,0.999,3.460559,65.386859,"(0, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, ...","(4.4320373705496445, 4.983831695810947, 4.0022...","[1.0, 1.0, 0.0, 1.0, 0.0, 1.0, 0.0, 0.0, 1.0, ..."
6,1000000,0.999,0.001,10.0,0.999,3.195792,65.176648,"(0, 1, 1, 1, 1, 1, 1, 0, 1, 1, 0, 0, 0, 0, 1, ...","(4.4755272438345965, 5.026659913734769, 5.0244...","[10.0, 0.0, 1.0, 0.0, 1.0, 0.0, 0.0, 1.0, 1.0,..."
7,1000000,0.999,0.0001,10.0,0.999,0.8,66.51105,"(0, 1, 1, 1, 1, 1, 1, 0, 0, 0, 1, 1, 0, 0, 0, ...","(4.47387595179908, 5.02543011236457, 4.8995375...","[0.0, 0.0, 0.0, 0.0, 1.0, 10.0, 0.0, 0.0, 0.0,..."
8,1000000,0.99,0.001,1.0,0.99,3.319346,65.185775,"(0, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 0, 0, 1, ...","(4.4715302899745675, 5.025254103920376, 5.0260...","[1.0, 0.0, 1.0, 0.0, 0.0, 0.0, 10.0, 0.0, 0.0,..."
9,1000000,0.99,0.0001,1.0,0.99,3.35804,67.566188,"(0, 1, 1, 1, 1, 1, 1, 0, 1, 1, 1, 1, 1, 0, 1, ...","(4.440501282624428, 4.9917634204759755, 4.0250...","[0.0, 10.0, 0.0, 0.0, 10.0, 0.0, 0.0, 0.0, 1.0..."


In [32]:
# Report view of the Q-learning results: every column except the bulky
# per-run objects (policy, value function, per-step training rewards).
q_df_table = q_df[
    [
        column
        for column in q_df.columns
        if column not in ("Value Function", "Training Rewards", "Policy")
    ]
]

In [34]:
# CSS overrides for the rendered table: header cells slightly larger than data cells.
heading_properties = [("font-size", "18px")]
cell_properties = [("font-size", "16px")]

# One rule per CSS selector, passed to Styler.set_table_styles below.
dfstyle = [
    {"selector": "th", "props": heading_properties},
    {"selector": "td", "props": cell_properties},
]

q_df_table.style.set_table_styles(dfstyle)

Unnamed: 0,Iterations,Alpha Decay,Alpha Min,Epsilon,Epsilon Decay,Reward,Time
0,1000000,0.99,0.001,10.0,0.99,3.241963,63.144067
1,1000000,0.99,0.0001,10.0,0.99,3.454687,62.832406
2,1000000,0.999,0.001,10.0,0.99,2.972591,62.451478
3,1000000,0.999,0.0001,10.0,0.99,3.119909,62.65347
4,1000000,0.99,0.001,10.0,0.999,1.1,62.286294
5,1000000,0.99,0.0001,10.0,0.999,3.460559,65.386859
6,1000000,0.999,0.001,10.0,0.999,3.195792,65.176648
7,1000000,0.999,0.0001,10.0,0.999,0.8,66.51105
8,1000000,0.99,0.001,1.0,0.99,3.319346,65.185775
9,1000000,0.99,0.0001,1.0,0.99,3.35804,67.566188


In [35]:
# Average the numeric results per iteration budget.
# The numeric columns are selected and cast explicitly: q_df also holds
# tuple/list columns (Policy, Value Function, Training Rewards), and
# groupby(...).mean() raises TypeError on those in pandas >= 2.0 instead of
# silently dropping them.
(
    q_df[["Alpha Decay", "Alpha Min", "Epsilon", "Epsilon Decay", "Reward", "Time"]]
    .astype(float)
    .groupby(q_df["Iterations"])
    .mean()
    .style.set_table_styles(dfstyle)
)

Unnamed: 0_level_0,Alpha Decay,Alpha Min,Epsilon,Epsilon Decay,Reward,Time
Iterations,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1
1000000,0.9945,0.00055,5.5,0.9945,2.967465,64.352203
10000000,0.9945,0.00055,5.5,0.9945,2.676283,656.289999


In [36]:
# Average the numeric results per epsilon-decay setting.
# The numeric columns are selected and cast explicitly: q_df also holds
# tuple/list columns (Policy, Value Function, Training Rewards), and
# groupby(...).mean() raises TypeError on those in pandas >= 2.0 instead of
# silently dropping them. The column set matches the recorded output, which
# does not include "Iterations".
(
    q_df[["Alpha Decay", "Alpha Min", "Epsilon", "Reward", "Time"]]
    .astype(float)
    .groupby(q_df["Epsilon Decay"])
    .mean()
    .style.set_table_styles(dfstyle)
)

Unnamed: 0_level_0,Alpha Decay,Alpha Min,Epsilon,Reward,Time
Epsilon Decay,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1
0.99,0.9945,0.00055,5.5,3.040816,359.545867
0.999,0.9945,0.00055,5.5,2.602931,361.096335
