## Assignment 4 - MDP

## Importing Libraries

In [2]:
import cs7641assn4 as a4
import numpy as np
import pandas as pd
import warnings
import xlrd

# Let pandas show up to 35 columns so the wide per-run results table is not elided.
# To go back to the pandas default: pd.reset_option("display.max_columns")
pd.options.display.max_columns = 35

## Importing Settings

In [36]:

# Location of the settings spreadsheet ('default_settings.xlsx').
# Kept relative to the notebook's working directory so the notebook is not tied
# to one user's machine; point this at another workbook to run a different sweep.
# NOTE(review): assumes the workbook sits next to the notebook -- confirm.
SETTINGS_PATH = 'default_settings.xlsx'

# Read the experiment settings. After .to_dict() the layout is
# {column name: {row index: value}}, so settings['rH'][n] is the 'rH' value
# for run n.
settings = pd.read_excel(SETTINGS_PATH).to_dict()

# Determine the number of runs (one spreadsheet row per run)
n_settings = len(settings['rH'])

In [37]:
for n in range(n_settings):
    # Load settings
    rH = settings['rH'][n] # -1 #-5 # reward for H(ole)
    rG = settings['rG'][n] #1 # 10 # reward for G(oal)
    rF = settings['rF'][n] #-0.2# reward includes S(tart) and F(rozen)
    size = settings['size'][n] #4 # height and width of square gridworld, [4, 8, 16] are included in cs7641assn4.py 
    p = settings['p'][n] #0.8 # if generating a random map probability that a grid will be F(rozen)
    map_name = 'x'.join([str(size)]*2) # None, if you want a random map
    desc = a4.MAPS[map_name] # None, if you want a random map
    is_slippery = settings['is_slippery'][n] #False
    render_initial = settings['render_initial'][n] # True

    epsilon = settings['epsilon'][n] #1e-8 # convergence threshold for policy/value iteration
    gamma = settings['gamma'][n] #0.8 # discount parameter for past policy/value iterations
    max_iter = settings['max_iter'][n] #10000 # maximum iterations for slowly converging policy/value iteration 

    # Qlearning(env, rH=0, rG=1, rF=0, qepsilon=0.1, lr=0.8, gamma=0.95, episodes=10000)
    qepsilon = settings['qepsilon'][n] #0.1 # epsilon value for the Q-learning epsilon greedy strategy
    lr = settings['lr'][n] #0.8 # Q-learning rate
    qgamma = settings['qgamma'][n] #0.95 # Q-Learning discount factor
    episodes = settings['episodes'][n] #10000 # number of Q-learning episodes
    initial = settings['initial'][n] #0 # value to initialize the Q grid
    decay = settings['decay'][n] #True

    # Printing options
    report = settings['report'][n] #True # For cs7641assn4.py policy and value iteration functions
    display_print = settings['display_print'][n] #True # For this script

    # Create Environment
    env = a4.getEnv(env_id='hw4-FrozenLake-v0', rH=rH, rG=rG, rF=rF, 
                    desc=desc,  
                    is_slippery=is_slippery, render_initial=True)

    # Store a representation of the map
    env_desc = env.desc.astype('<U8')

    # Store a representation of the state rewards
    env_rs = a4.getStateReward(env)

    if display_print:
        # Display reward at each state
        print('\n--Reward Values at Each State--')
        a4.matprint(a4.print_value(env_rs, width=size, height=size))
        
    ## Policy Iteration
    print('\n--Policy Iteration TimeIt--')
    pi_time = %timeit -o a4.policy_iteration(env, epsilon, gamma, max_iter, report=False)
    
    pi_V, pi_policy, pi_epochs = a4.policy_iteration(env, epsilon, gamma, max_iter, report=report)

    pi_policy_arrows = a4.print_policy(pi_policy, width=size, height=size)

    if display_print:
        # Display values
        print('\n--Policy Iteration Values in grid order--')
        a4.matprint(a4.print_value(pi_V, width=size, height=size))

        # Display policy
        print('\n--Policy Iteration Policy Matrix--')
        a4.matprint(pi_policy_arrows)
        
    ## Value Iteration
    print('\n--Value Iteration TimeIt--')
    vi_time = %timeit -o a4.valueIteration(env, epsilon, gamma, max_iter, report=False)
    
    vi_V, vi_epochs = a4.valueIteration(env, epsilon, gamma, max_iter, report=report)

    vi_policy = a4.value_to_policy(env, V=vi_V, gamma=gamma)

    vi_policy_arrows = a4.print_policy(vi_policy, width=size, height=size)

    if display_print:
        # display value function:
        print('\n--Value Iteration Values in grid order--')
        a4.matprint(a4.print_value(vi_V, width=size, height=size))
        
        # display policy
        print('\n--Value Iteration Policy Matrix--')
        a4.matprint(vi_policy_arrows)
        
    ## Q-Learning
    print('\n--Q-Learning TimeIt--')
    Q_time = %timeit -o a4.Qlearning(env, qepsilon, lr, qgamma, episodes, initial, decay, report=False)
        
    Q, Q_epochs = a4.Qlearning(env, qepsilon, lr, qgamma, episodes, initial, decay, report)

    maxQ = np.max(Q,axis=1)

    Q_policy = a4.Q_to_policy(Q)

    Q_policy_arrows = a4.print_policy(Q_policy, width=size, height=size)

    if display_print: 
        print('--Q with all options--')
        a4.matprint(Q)
        print('\n--argmax(Q) in grid order--')
        a4.matprint(a4.print_value(maxQ, width=size, height=size))
        print('\n--Q-Learning Policy Matrix--')
        a4.matprint(Q_policy_arrows)
        
    ## Save results to DataFrame
    results = pd.DataFrame({'rH': [rH], 
                        'rG': [rG], 
                        'rF': [rF], 
                        'size': [size], 
                        'p': [p], 
                        'desc': [desc], 
                        'map_name': [map_name],                        
                        'is_slippery': [is_slippery],
                        'epsilon': [epsilon],
                        'gamma': [gamma], 
                        'max_iter': [max_iter], 
                        'qepsilon': [qepsilon], 
                        'lr': [lr], 
                        'qgamma': [qgamma], 
                        'episodes': [episodes], 
                        'initial': [initial],
                        'env_desc': [env_desc],
                        'env_rs': [env_rs],
                        'pi_time': [pi_time.average],
                        'pi_V': [pi_V],
                        'pi_epochs': [pi_epochs],
                        'pi_policy': [pi_policy],
                        'pi_policy_arrows': [pi_policy_arrows],
                        'vi_time': [vi_time.average],
                        'vi_V': [vi_V],
                        'vi_epochs': [vi_epochs],
                        'vi_policy': [vi_policy],
                        'vi_policy_arrows': [vi_policy_arrows],
                        'Q_time': [Q_time.average],
                        'Q': [Q],
                        'Q_epochs': [Q_epochs],
                        'Q_V': [maxQ],
                        'Q_policy': [Q_policy],
                        'Q_policy_arrows': [Q_policy_arrows]})
    
    if display_print: 
        display(results)
        
    ## Save results to disk
    warnings.simplefilter(action='ignore', category=pd.errors.PerformanceWarning)
    try:
        dataset = pd.read_hdf('data.h5', key='dataset', mode='a')
    except FileNotFoundError:
        results.to_hdf('data.h5', key='dataset', mode='a')
    else:
        dataset.append(
            other=results, 
            ignore_index=True,
            sort=False
            ).to_hdf(
            path_or_buf='data.h5', 
            key='dataset', 
            mode='a')
        
    if display_print:
        pd.read_hdf('data.h5', key='dataset', mode='a')
    
    
print('Complete!')   

--Board--

[41mS[0mFFF
FHFH
FFFH
HFFG

--Actions for Position to the Left of the Goal--
{0: [(1.0, 13, -0.2, False)],
 1: [(1.0, 14, -0.2, False)],
 2: [(1.0, 15, 1, True)],
 3: [(1.0, 10, -0.2, False)]}

--Reward Values at Each State--
-0.2  -0.2  -0.2  -0.2  
-0.2    -1  -0.2    -1  
-0.2  -0.2  -0.2    -1  
  -1  -0.2  -0.2     1  

--Policy Iteration TimeIt--
1.05 ms ± 18.1 µs per loop (mean ± std. dev. of 7 runs, 1000 loops each)
Policy iteration converged after  7 epochs

--Policy Iteration Values in grid order--
0.9661  1.4576  2.072  1.4576  
1.4576      -5   2.84      -5  
 2.072    2.84    3.8      -5  
    -5     3.8      5       5  

--Policy Iteration Policy Matrix--
↓  →  ↓  ←  
↓  ←  ↓  ←  
→  ↓  ↓  ←  
←  →  →  ←  

--Value Iteration TimeIt--
6.98 ms ± 69.3 µs per loop (mean ± std. dev. of 7 runs, 100 loops each)
Value iteration converged after  91 epochs

--Value Iteration Values in grid order--
0.9661  1.4576  2.072  1.4576  
1.4576      -5   2.84      -5  
 2.072  

Unnamed: 0,rH,rG,rF,size,p,desc,map_name,is_slippery,epsilon,gamma,max_iter,qepsilon,lr,qgamma,episodes,initial,env_desc,env_rs,pi_time,pi_V,pi_epochs,pi_policy,pi_policy_arrows,vi_time,vi_V,vi_epochs,vi_policy,vi_policy_arrows,Q_time,Q,Q_epochs,Q_V,Q_policy,Q_policy_arrows
0,-1,1,-0.2,4,0.8,"[SFFF, FHFH, FFFH, HFFG]",4x4,False,1e-08,0.8,10000,0.1,0.8,0.95,10000,1,"[[S, F, F, F], [F, H, F, H], [F, F, F, H], [H,...","[-0.2, -0.2, -0.2, -0.2, -0.2, -1.0, -0.2, -1....",0.001048,"[0.9660800000000008, 1.457600000000001, 2.0720...",7,"[1, 2, 1, 0, 1, 0, 1, 0, 2, 1, 1, 0, 0, 2, 2, 0]","[[↓, →, ↓, ←], [↓, ←, ↓, ←], [→, ↓, ↓, ←], [←,...",0.00698,"[0.9660799905143127, 1.4575999905143127, 2.071...",91,"[1, 2, 1, 0, 1, 0, 1, 0, 2, 1, 1, 0, 0, 2, 2, 0]","[[↓, →, ↓, ←], [↓, ←, ↓, ←], [→, ↓, ↓, ←], [←,...",0.012021,"[[0.3737963529136319, 0.6039965355670365, 0.55...",43,"[0.6039965355670365, 0.8340003724087158, 1.099...","[1, 2, 1, 2, 1, 0, 1, 0, 2, 1, 1, 0, 0, 2, 2, 0]","[[↓, →, ↓, →], [↓, ←, ↓, ←], [→, ↓, ↓, ←], [←,..."


--Board--

[41mS[0mFFF
FHFH
FFFH
HFFG

--Actions for Position to the Left of the Goal--
{0: [(1.0, 13, -0.2, False)],
 1: [(1.0, 14, -0.2, False)],
 2: [(1.0, 15, 1, True)],
 3: [(1.0, 10, -0.2, False)]}

--Reward Values at Each State--
-0.2  -0.2  -0.2  -0.2  
-0.2    -1  -0.2    -1  
-0.2  -0.2  -0.2    -1  
  -1  -0.2  -0.2     1  

--Policy Iteration TimeIt--
1.03 ms ± 8.53 µs per loop (mean ± std. dev. of 7 runs, 1000 loops each)
Policy iteration converged after  7 epochs

--Policy Iteration Values in grid order--
0.9661  1.4576  2.072  1.4576  
1.4576      -5   2.84      -5  
 2.072    2.84    3.8      -5  
    -5     3.8      5       5  

--Policy Iteration Policy Matrix--
↓  →  ↓  ←  
↓  ←  ↓  ←  
→  ↓  ↓  ←  
←  →  →  ←  

--Value Iteration TimeIt--
7 ms ± 103 µs per loop (mean ± std. dev. of 7 runs, 100 loops each)
Value iteration converged after  91 epochs

--Value Iteration Values in grid order--
0.9661  1.4576  2.072  1.4576  
1.4576      -5   2.84      -5  
 2.072    2.

Unnamed: 0,rH,rG,rF,size,p,desc,map_name,is_slippery,epsilon,gamma,max_iter,qepsilon,lr,qgamma,episodes,initial,env_desc,env_rs,pi_time,pi_V,pi_epochs,pi_policy,pi_policy_arrows,vi_time,vi_V,vi_epochs,vi_policy,vi_policy_arrows,Q_time,Q,Q_epochs,Q_V,Q_policy,Q_policy_arrows
0,-1,1,-0.2,4,0.8,"[SFFF, FHFH, FFFH, HFFG]",4x4,False,1e-08,0.8,10000,0.1,0.6,0.95,10000,1,"[[S, F, F, F], [F, H, F, H], [F, F, F, H], [H,...","[-0.2, -0.2, -0.2, -0.2, -0.2, -1.0, -0.2, -1....",0.001032,"[0.9660800000000008, 1.457600000000001, 2.0720...",7,"[1, 2, 1, 0, 1, 0, 1, 0, 2, 1, 1, 0, 0, 2, 2, 0]","[[↓, →, ↓, ←], [↓, ←, ↓, ←], [→, ↓, ↓, ←], [←,...",0.007002,"[0.9660799905143127, 1.4575999905143127, 2.071...",91,"[1, 2, 1, 0, 1, 0, 1, 0, 2, 1, 1, 0, 0, 2, 2, 0]","[[↓, →, ↓, ←], [↓, ←, ↓, ←], [→, ↓, ↓, ←], [←,...",0.015068,"[[0.37379406083697947, 0.6039964852531244, 0.5...",64,"[0.6039964852531244, 0.8443149112725706, 1.101...","[1, 2, 1, 0, 1, 0, 1, 0, 2, 2, 1, 0, 0, 2, 2, 0]","[[↓, →, ↓, ←], [↓, ←, ↓, ←], [→, →, ↓, ←], [←,..."


--Board--

[41mS[0mFFF
FHFH
FFFH
HFFG

--Actions for Position to the Left of the Goal--
{0: [(1.0, 13, -0.2, False)],
 1: [(1.0, 14, -0.2, False)],
 2: [(1.0, 15, 1, True)],
 3: [(1.0, 10, -0.2, False)]}

--Reward Values at Each State--
-0.2  -0.2  -0.2  -0.2  
-0.2    -1  -0.2    -1  
-0.2  -0.2  -0.2    -1  
  -1  -0.2  -0.2     1  

--Policy Iteration TimeIt--
1.15 ms ± 84.8 µs per loop (mean ± std. dev. of 7 runs, 1000 loops each)
Policy iteration converged after  7 epochs

--Policy Iteration Values in grid order--
0.9661  1.4576  2.072  1.4576  
1.4576      -5   2.84      -5  
 2.072    2.84    3.8      -5  
    -5     3.8      5       5  

--Policy Iteration Policy Matrix--
↓  →  ↓  ←  
↓  ←  ↓  ←  
→  ↓  ↓  ←  
←  →  →  ←  

--Value Iteration TimeIt--
7.47 ms ± 615 µs per loop (mean ± std. dev. of 7 runs, 100 loops each)
Value iteration converged after  91 epochs

--Value Iteration Values in grid order--
0.9661  1.4576  2.072  1.4576  
1.4576      -5   2.84      -5  
 2.072   

Unnamed: 0,rH,rG,rF,size,p,desc,map_name,is_slippery,epsilon,gamma,max_iter,qepsilon,lr,qgamma,episodes,initial,env_desc,env_rs,pi_time,pi_V,pi_epochs,pi_policy,pi_policy_arrows,vi_time,vi_V,vi_epochs,vi_policy,vi_policy_arrows,Q_time,Q,Q_epochs,Q_V,Q_policy,Q_policy_arrows
0,-1,1,-0.2,4,0.8,"[SFFF, FHFH, FFFH, HFFG]",4x4,False,1e-08,0.8,10000,0.1,0.4,0.95,10000,1,"[[S, F, F, F], [F, H, F, H], [F, F, F, H], [H,...","[-0.2, -0.2, -0.2, -0.2, -0.2, -1.0, -0.2, -1....",0.001147,"[0.9660800000000008, 1.457600000000001, 2.0720...",7,"[1, 2, 1, 0, 1, 0, 1, 0, 2, 1, 1, 0, 0, 2, 2, 0]","[[↓, →, ↓, ←], [↓, ←, ↓, ←], [→, ↓, ↓, ←], [←,...",0.00747,"[0.9660799905143127, 1.4575999905143127, 2.071...",91,"[1, 2, 1, 0, 1, 0, 1, 0, 2, 1, 1, 0, 0, 2, 2, 0]","[[↓, →, ↓, ←], [↓, ←, ↓, ←], [→, ↓, ↓, ←], [←,...",0.022006,"[[0.3737190795658745, 0.6039944756997857, 0.59...",95,"[0.6039944756997857, 0.8412160114463848, 1.100...","[1, 2, 1, 0, 1, 0, 1, 0, 2, 2, 1, 0, 0, 2, 2, 0]","[[↓, →, ↓, ←], [↓, ←, ↓, ←], [→, →, ↓, ←], [←,..."


--Board--

[41mS[0mFFF
FHFH
FFFH
HFFG

--Actions for Position to the Left of the Goal--
{0: [(1.0, 13, -0.2, False)],
 1: [(1.0, 14, -0.2, False)],
 2: [(1.0, 15, 1, True)],
 3: [(1.0, 10, -0.2, False)]}

--Reward Values at Each State--
-0.2  -0.2  -0.2  -0.2  
-0.2    -1  -0.2    -1  
-0.2  -0.2  -0.2    -1  
  -1  -0.2  -0.2     1  

--Policy Iteration TimeIt--
1.08 ms ± 57.6 µs per loop (mean ± std. dev. of 7 runs, 1000 loops each)
Policy iteration converged after  7 epochs

--Policy Iteration Values in grid order--
0.9661  1.4576  2.072  1.4576  
1.4576      -5   2.84      -5  
 2.072    2.84    3.8      -5  
    -5     3.8      5       5  

--Policy Iteration Policy Matrix--
↓  →  ↓  ←  
↓  ←  ↓  ←  
→  ↓  ↓  ←  
←  →  →  ←  

--Value Iteration TimeIt--
7.05 ms ± 118 µs per loop (mean ± std. dev. of 7 runs, 100 loops each)
Value iteration converged after  91 epochs

--Value Iteration Values in grid order--
0.9661  1.4576  2.072  1.4576  
1.4576      -5   2.84      -5  
 2.072   

Unnamed: 0,rH,rG,rF,size,p,desc,map_name,is_slippery,epsilon,gamma,max_iter,qepsilon,lr,qgamma,episodes,initial,env_desc,env_rs,pi_time,pi_V,pi_epochs,pi_policy,pi_policy_arrows,vi_time,vi_V,vi_epochs,vi_policy,vi_policy_arrows,Q_time,Q,Q_epochs,Q_V,Q_policy,Q_policy_arrows
0,-1,1,-0.2,4,0.8,"[SFFF, FHFH, FFFH, HFFG]",4x4,False,1e-08,0.8,10000,0.1,0.2,0.95,10000,1,"[[S, F, F, F], [F, H, F, H], [F, F, F, H], [H,...","[-0.2, -0.2, -0.2, -0.2, -0.2, -1.0, -0.2, -1....",0.001085,"[0.9660800000000008, 1.457600000000001, 2.0720...",7,"[1, 2, 1, 0, 1, 0, 1, 0, 2, 1, 1, 0, 0, 2, 2, 0]","[[↓, →, ↓, ←], [↓, ←, ↓, ←], [→, ↓, ↓, ←], [←,...",0.007053,"[0.9660799905143127, 1.4575999905143127, 2.071...",91,"[1, 2, 1, 0, 1, 0, 1, 0, 2, 1, 1, 0, 0, 2, 2, 0]","[[↓, →, ↓, ←], [↓, ←, ↓, ←], [→, ↓, ↓, ←], [←,...",0.039579,"[[0.3735967020792117, 0.6039960735176774, 0.59...",200,"[0.6039960735176774, 0.8440913768860469, 1.100...","[1, 2, 1, 0, 1, 0, 1, 0, 2, 2, 1, 0, 0, 2, 2, 0]","[[↓, →, ↓, ←], [↓, ←, ↓, ←], [→, →, ↓, ←], [←,..."


--Board--

[41mS[0mFFF
FHFH
FFFH
HFFG

--Actions for Position to the Left of the Goal--
{0: [(1.0, 13, -0.2, False)],
 1: [(1.0, 14, -0.2, False)],
 2: [(1.0, 15, 1, True)],
 3: [(1.0, 10, -0.2, False)]}

--Reward Values at Each State--
-0.2  -0.2  -0.2  -0.2  
-0.2    -1  -0.2    -1  
-0.2  -0.2  -0.2    -1  
  -1  -0.2  -0.2     1  

--Policy Iteration TimeIt--
1.07 ms ± 85.2 µs per loop (mean ± std. dev. of 7 runs, 1000 loops each)
Policy iteration converged after  7 epochs

--Policy Iteration Values in grid order--
0.9661  1.4576  2.072  1.4576  
1.4576      -5   2.84      -5  
 2.072    2.84    3.8      -5  
    -5     3.8      5       5  

--Policy Iteration Policy Matrix--
↓  →  ↓  ←  
↓  ←  ↓  ←  
→  ↓  ↓  ←  
←  →  →  ←  

--Value Iteration TimeIt--
7.4 ms ± 749 µs per loop (mean ± std. dev. of 7 runs, 100 loops each)
Value iteration converged after  91 epochs

--Value Iteration Values in grid order--
0.9661  1.4576  2.072  1.4576  
1.4576      -5   2.84      -5  
 2.072    

Unnamed: 0,rH,rG,rF,size,p,desc,map_name,is_slippery,epsilon,gamma,max_iter,qepsilon,lr,qgamma,episodes,initial,env_desc,env_rs,pi_time,pi_V,pi_epochs,pi_policy,pi_policy_arrows,vi_time,vi_V,vi_epochs,vi_policy,vi_policy_arrows,Q_time,Q,Q_epochs,Q_V,Q_policy,Q_policy_arrows
0,-1,1,-0.2,4,0.8,"[SFFF, FHFH, FFFH, HFFG]",4x4,False,1e-08,0.8,10000,0.1,0.8,0.95,10000,1,"[[S, F, F, F], [F, H, F, H], [F, F, F, H], [H,...","[-0.2, -0.2, -0.2, -0.2, -0.2, -1.0, -0.2, -1....",0.001069,"[0.9660800000000008, 1.457600000000001, 2.0720...",7,"[1, 2, 1, 0, 1, 0, 1, 0, 2, 1, 1, 0, 0, 2, 2, 0]","[[↓, →, ↓, ←], [↓, ←, ↓, ←], [→, ↓, ↓, ←], [←,...",0.007397,"[0.9660799905143127, 1.4575999905143127, 2.071...",91,"[1, 2, 1, 0, 1, 0, 1, 0, 2, 1, 1, 0, 0, 2, 2, 0]","[[↓, →, ↓, ←], [↓, ←, ↓, ←], [→, ↓, ↓, ←], [←,...",0.01168,"[[0.373796048188534, 0.15160444744571291, 0.60...",39,"[0.6039965768084528, 0.8463121873560248, 1.101...","[2, 2, 1, 0, 3, 0, 1, 0, 2, 1, 1, 0, 0, 2, 2, 0]","[[→, →, ↓, ←], [↑, ←, ↓, ←], [→, ↓, ↓, ←], [←,..."


--Board--

[41mS[0mFFF
FHFH
FFFH
HFFG

--Actions for Position to the Left of the Goal--
{0: [(1.0, 13, -0.2, False)],
 1: [(1.0, 14, -0.2, False)],
 2: [(1.0, 15, 1, True)],
 3: [(1.0, 10, -0.2, False)]}

--Reward Values at Each State--
-0.2  -0.2  -0.2  -0.2  
-0.2    -1  -0.2    -1  
-0.2  -0.2  -0.2    -1  
  -1  -0.2  -0.2     1  

--Policy Iteration TimeIt--
1.07 ms ± 63.6 µs per loop (mean ± std. dev. of 7 runs, 1000 loops each)
Policy iteration converged after  7 epochs

--Policy Iteration Values in grid order--
-0.2667  -0.1112  0.148  -0.1112  
-0.1112     -2.5   0.58     -2.5  
  0.148     0.58    1.3     -2.5  
   -2.5      1.3    2.5      2.5  

--Policy Iteration Policy Matrix--
↓  →  ↓  ←  
↓  ←  ↓  ←  
→  ↓  ↓  ←  
←  →  →  ←  

--Value Iteration TimeIt--
3.09 ms ± 28.3 µs per loop (mean ± std. dev. of 7 runs, 100 loops each)
Value iteration converged after  41 epochs

--Value Iteration Values in grid order--
-0.2667  -0.1112  0.148  -0.1112  
-0.1112     -2.5   0.58  

Unnamed: 0,rH,rG,rF,size,p,desc,map_name,is_slippery,epsilon,gamma,max_iter,qepsilon,lr,qgamma,episodes,initial,env_desc,env_rs,pi_time,pi_V,pi_epochs,pi_policy,pi_policy_arrows,vi_time,vi_V,vi_epochs,vi_policy,vi_policy_arrows,Q_time,Q,Q_epochs,Q_V,Q_policy,Q_policy_arrows
0,-1,1,-0.2,4,0.8,"[SFFF, FHFH, FFFH, HFFG]",4x4,False,1e-08,0.6,10000,0.1,0.8,0.95,10000,1,"[[S, F, F, F], [F, H, F, H], [F, F, F, H], [H,...","[-0.2, -0.2, -0.2, -0.2, -0.2, -1.0, -0.2, -1....",0.001068,"[-0.26672000000000007, -0.11120000000000006, 0...",7,"[1, 2, 1, 0, 1, 0, 1, 0, 2, 1, 1, 0, 0, 2, 2, 0]","[[↓, →, ↓, ←], [↓, ←, ↓, ←], [→, ↓, ↓, ←], [←,...",0.00309,"[-0.2667200033418737, -0.11120000334187365, 0....",41,"[1, 2, 1, 0, 1, 0, 1, 0, 2, 1, 1, 0, 0, 2, 2, 0]","[[↓, →, ↓, ←], [↓, ←, ↓, ←], [→, ↓, ↓, ←], [←,...",0.012003,"[[0.3737475163092963, 0.6039113509679276, 0.60...",48,"[0.6039965780005706, 0.8463121874938313, 1.101...","[2, 2, 1, 0, 1, 0, 1, 0, 2, 2, 1, 0, 0, 2, 2, 0]","[[→, →, ↓, ←], [↓, ←, ↓, ←], [→, →, ↓, ←], [←,..."


--Board--

[41mS[0mFFF
FHFH
FFFH
HFFG

--Actions for Position to the Left of the Goal--
{0: [(1.0, 13, -0.2, False)],
 1: [(1.0, 14, -0.2, False)],
 2: [(1.0, 15, 1, True)],
 3: [(1.0, 10, -0.2, False)]}

--Reward Values at Each State--
-0.2  -0.2  -0.2  -0.2  
-0.2    -1  -0.2    -1  
-0.2  -0.2  -0.2    -1  
  -1  -0.2  -0.2     1  

--Policy Iteration TimeIt--
1.06 ms ± 33.8 µs per loop (mean ± std. dev. of 7 runs, 1000 loops each)
Policy iteration converged after  7 epochs

--Policy Iteration Values in grid order--
-0.3129  -0.2821  -0.2053  -0.2821  
-0.2821  -1.6667  -0.0133  -1.6667  
-0.2053  -0.0133   0.4667  -1.6667  
-1.6667   0.4667   1.6667   1.6667  

--Policy Iteration Policy Matrix--
↓  →  ↓  ←  
↓  ←  ↓  ←  
→  ↓  ↓  ←  
←  →  →  ←  

--Value Iteration TimeIt--
1.86 ms ± 37.5 µs per loop (mean ± std. dev. of 7 runs, 1000 loops each)
Value iteration converged after  24 epochs

--Value Iteration Values in grid order--
-0.3129  -0.2821  -0.2053  -0.2821  
-0.2821  -1.66

Unnamed: 0,rH,rG,rF,size,p,desc,map_name,is_slippery,epsilon,gamma,max_iter,qepsilon,lr,qgamma,episodes,initial,env_desc,env_rs,pi_time,pi_V,pi_epochs,pi_policy,pi_policy_arrows,vi_time,vi_V,vi_epochs,vi_policy,vi_policy_arrows,Q_time,Q,Q_epochs,Q_V,Q_policy,Q_policy_arrows
0,-1,1,-0.2,4,0.8,"[SFFF, FHFH, FFFH, HFFG]",4x4,False,1e-08,0.4,10000,0.1,0.8,0.95,10000,1,"[[S, F, F, F], [F, H, F, H], [F, F, F, H], [H,...","[-0.2, -0.2, -0.2, -0.2, -0.2, -1.0, -0.2, -1....",0.001056,"[-0.31285333333333337, -0.28213333333333335, -...",7,"[1, 2, 1, 0, 1, 0, 1, 0, 2, 1, 1, 0, 0, 2, 2, 0]","[[↓, →, ↓, ←], [↓, ←, ↓, ←], [→, ↓, ↓, ←], [←,...",0.001862,"[-0.3128533345061458, -0.28213333450614575, -0...",24,"[1, 2, 1, 0, 1, 0, 1, 0, 2, 1, 1, 0, 0, 2, 2, 0]","[[↓, →, ↓, ←], [↓, ←, ↓, ←], [→, ↓, ↓, ←], [←,...",0.011756,"[[0.3737338917466591, 0.6039965777340898, 0.55...",45,"[0.6039965777340898, 0.8272964838965975, 1.097...","[1, 2, 1, 3, 1, 0, 1, 0, 2, 2, 1, 0, 0, 2, 2, 0]","[[↓, →, ↓, ↑], [↓, ←, ↓, ←], [→, →, ↓, ←], [←,..."


--Board--

[41mS[0mFFF
FHFH
FFFH
HFFG

--Actions for Position to the Left of the Goal--
{0: [(1.0, 13, -0.2, False)],
 1: [(1.0, 14, -0.2, False)],
 2: [(1.0, 15, 1, True)],
 3: [(1.0, 10, -0.2, False)]}

--Reward Values at Each State--
-0.2  -0.2  -0.2  -0.2  
-0.2    -1  -0.2    -1  
-0.2  -0.2  -0.2    -1  
  -1  -0.2  -0.2     1  

--Policy Iteration TimeIt--
1.04 ms ± 30 µs per loop (mean ± std. dev. of 7 runs, 1000 loops each)
Policy iteration converged after  7 epochs

--Policy Iteration Values in grid order--
-0.2495  -0.2476  -0.238  -0.2476  
-0.2476    -1.25   -0.19    -1.25  
 -0.238    -0.19    0.05    -1.25  
  -1.25     0.05    1.25     1.25  

--Policy Iteration Policy Matrix--
↓  →  ↓  ←  
↓  ←  ↓  ←  
→  ↓  ↓  ←  
←  →  →  ←  

--Value Iteration TimeIt--
1.18 ms ± 62.8 µs per loop (mean ± std. dev. of 7 runs, 1000 loops each)
Value iteration converged after  15 epochs

--Value Iteration Values in grid order--
-0.2495  -0.2476  -0.238  -0.2476  
-0.2476    -1.25   -0

Unnamed: 0,rH,rG,rF,size,p,desc,map_name,is_slippery,epsilon,gamma,max_iter,qepsilon,lr,qgamma,episodes,initial,env_desc,env_rs,pi_time,pi_V,pi_epochs,pi_policy,pi_policy_arrows,vi_time,vi_V,vi_epochs,vi_policy,vi_policy_arrows,Q_time,Q,Q_epochs,Q_V,Q_policy,Q_policy_arrows
0,-1,1,-0.2,4,0.8,"[SFFF, FHFH, FFFH, HFFG]",4x4,False,1e-08,0.2,10000,0.1,0.8,0.95,10000,1,"[[S, F, F, F], [F, H, F, H], [F, F, F, H], [H,...","[-0.2, -0.2, -0.2, -0.2, -0.2, -1.0, -0.2, -1....",0.001044,"[-0.24952000000000002, -0.24760000000000001, -...",7,"[1, 2, 1, 0, 1, 0, 1, 0, 2, 1, 1, 0, 0, 2, 2, 0]","[[↓, →, ↓, ←], [↓, ←, ↓, ←], [→, ↓, ↓, ←], [←,...",0.001177,"[-0.24952000020480003, -0.24760000020480002, -...",15,"[1, 2, 1, 0, 1, 0, 1, 0, 2, 1, 1, 0, 0, 2, 2, 0]","[[↓, →, ↓, ←], [↓, ←, ↓, ←], [→, ↓, ↓, ←], [←,...",0.011811,"[[0.37375058649803716, 0.5326134791180845, 0.6...",42,"[0.6039949336726316, 0.8463117678348976, 1.101...","[2, 2, 1, 0, 1, 0, 1, 0, 2, 1, 1, 0, 0, 2, 2, 0]","[[→, →, ↓, ←], [↓, ←, ↓, ←], [→, ↓, ↓, ←], [←,..."


--Board--

[41mS[0mFFFFFHHFFFFFFFF
FFFFHFFHFFFFFFHF
FHFFFFFHFHFHFFFF
HFFHFFFFFFFFFHFH
FHFFFFHHFFFHHFHF
FHFFHFHFFFFHFFFF
FFHFFFFHFFFHHHHF
HFHFFFFFFFFFFFFH
FFFFFFFFFHFFFFHF
FFFFFFHHFHFFFFFH
FFHFFFFFFHFFFHFF
FFFHHFHFFFFFHFFF
FFFFFHFHFFFFHFFF
HHHHHFFFHFFHFFFF
FHFFHFFFFFFFFFFF
FFHFHHFFHFFFFFFG

--Actions for Position to the Left of the Goal--
{0: [(1.0, 253, -0.2, False)],
 1: [(1.0, 254, -0.2, False)],
 2: [(1.0, 255, 1, True)],
 3: [(1.0, 238, -0.2, False)]}

--Reward Values at Each State--
-0.2  -0.2  -0.2  -0.2  -0.2  -0.2    -1    -1  -0.2  -0.2  -0.2  -0.2  -0.2  -0.2  -0.2  -0.2  
-0.2  -0.2  -0.2  -0.2    -1  -0.2  -0.2    -1  -0.2  -0.2  -0.2  -0.2  -0.2  -0.2    -1  -0.2  
-0.2    -1  -0.2  -0.2  -0.2  -0.2  -0.2    -1  -0.2    -1  -0.2    -1  -0.2  -0.2  -0.2  -0.2  
  -1  -0.2  -0.2    -1  -0.2  -0.2  -0.2  -0.2  -0.2  -0.2  -0.2  -0.2  -0.2    -1  -0.2    -1  
-0.2    -1  -0.2  -0.2  -0.2  -0.2    -1    -1  -0.2  -0.2  -0.2    -1    -1  -0.2    -1  -0.2  
-0.2    -1  -0.2  -0

358 ms ± 61.2 ms per loop (mean ± std. dev. of 7 runs, 1 loop each)
Q-Learning converged after  301 epochs
--Q with all options--
  -2.24232    -2.14987     -2.14986    -2.24232  
  -2.21599     -2.0525     -2.05766     -2.0696  
  -2.10152    -1.98243     -1.98502    -2.00718  
  -2.02809     -1.9081     -1.90729    -1.94728  
  -1.93752     -1.9264     -1.82321    -1.83534  
  -1.88501    -1.71206     -1.94981    -1.71186  
        -1          -1           -1          -1  
        -1          -1           -1          -1  
     -1.36    -1.18436     -1.26097    -1.17663  
  -1.17715    -1.25283     -1.28377    -1.20439  
  -1.22214    -1.18229     -1.28439    -1.17663  
  -1.19728    -1.19515      -1.2884    -1.17663  
  -1.19453    -1.23604      -1.2879    -1.28957  
  -1.22658    -1.18336     -1.28859    -1.17663  
   -1.1968       -1.36     -1.29052    -1.28957  
  -1.27678    -1.20068     -1.28957    -1.28957  
  -2.13506     -2.0525      -2.0525    -2.09785  
  -2.13076       -1.

Unnamed: 0,rH,rG,rF,size,p,desc,map_name,is_slippery,epsilon,gamma,max_iter,qepsilon,lr,qgamma,episodes,initial,env_desc,env_rs,pi_time,pi_V,pi_epochs,pi_policy,pi_policy_arrows,vi_time,vi_V,vi_epochs,vi_policy,vi_policy_arrows,Q_time,Q,Q_epochs,Q_V,Q_policy,Q_policy_arrows
0,-1,1,-0.2,16,0.8,"[SFFFFFHHFFFFFFFF, FFFFHFFHFFFFFFHF, FHFFFFFHF...",16x16,False,1e-08,0.8,10000,0.1,0.8,0.95,10000,1,"[[S, F, F, F, F, F, H, H, F, F, F, F, F, F, F,...","[-0.2, -0.2, -0.2, -0.2, -0.2, -0.2, -1.0, -1....",0.117051,"[-0.99071544970536, -0.9883943121316999, -0.98...",19,"[1, 1, 1, 1, 2, 1, 0, 0, 2, 1, 1, 0, 0, 0, 0, ...","[[↓, ↓, ↓, ↓, →, ↓, ←, ←, →, ↓, ↓, ←, ←, ←, ←,...",0.130137,"[-0.9907154521919761, -0.9883943146183161, -0....",97,"[1, 1, 1, 1, 2, 1, 0, 0, 1, 1, 1, 1, 1, 1, 0, ...","[[↓, ↓, ↓, ↓, →, ↓, ←, ←, ↓, ↓, ↓, ↓, ↓, ↓, ←,...",0.358326,"[[-2.242318303603281, -2.149866264722395, -2.1...",301,"[-2.149860665277897, -2.052499668424097, -1.98...","[2, 1, 1, 2, 2, 3, 0, 0, 3, 0, 3, 3, 0, 3, 0, ...","[[→, ↓, ↓, →, →, ↑, ←, ←, ↑, ←, ↑, ↑, ←, ↑, ←,..."


--Board--

[41mS[0mFFFFFHHFFFFFFFF
FFFFHFFHFFFFFFHF
FHFFFFFHFHFHFFFF
HFFHFFFFFFFFFHFH
FHFFFFHHFFFHHFHF
FHFFHFHFFFFHFFFF
FFHFFFFHFFFHHHHF
HFHFFFFFFFFFFFFH
FFFFFFFFFHFFFFHF
FFFFFFHHFHFFFFFH
FFHFFFFFFHFFFHFF
FFFHHFHFFFFFHFFF
FFFFFHFHFFFFHFFF
HHHHHFFFHFFHFFFF
FHFFHFFFFFFFFFFF
FFHFHHFFHFFFFFFG

--Actions for Position to the Left of the Goal--
{0: [(1.0, 253, -0.2, False)],
 1: [(1.0, 254, -0.2, False)],
 2: [(1.0, 255, 1, True)],
 3: [(1.0, 238, -0.2, False)]}

--Reward Values at Each State--
-0.2  -0.2  -0.2  -0.2  -0.2  -0.2    -1    -1  -0.2  -0.2  -0.2  -0.2  -0.2  -0.2  -0.2  -0.2  
-0.2  -0.2  -0.2  -0.2    -1  -0.2  -0.2    -1  -0.2  -0.2  -0.2  -0.2  -0.2  -0.2    -1  -0.2  
-0.2    -1  -0.2  -0.2  -0.2  -0.2  -0.2    -1  -0.2    -1  -0.2    -1  -0.2  -0.2  -0.2  -0.2  
  -1  -0.2  -0.2    -1  -0.2  -0.2  -0.2  -0.2  -0.2  -0.2  -0.2  -0.2  -0.2    -1  -0.2    -1  
-0.2    -1  -0.2  -0.2  -0.2  -0.2    -1    -1  -0.2  -0.2  -0.2    -1    -1  -0.2    -1  -0.2  
-0.2    -1  -0.2  -0

381 ms ± 73.3 ms per loop (mean ± std. dev. of 7 runs, 1 loop each)
Q-Learning converged after  372 epochs
--Q with all options--
  -2.24232    -2.14986      -2.14986     -2.24233  
  -2.22465     -2.0525      -2.05575     -2.05479  
  -2.07098    -1.97269      -1.95806     -1.97821  
  -1.85529    -1.89664      -1.85981     -1.87095  
  -1.84249    -1.94517      -1.76716     -1.83687  
  -1.78023    -1.67779       -1.7612     -1.66513  
        -1          -1            -1           -1  
        -1          -1            -1           -1  
   -1.7612   -0.998624     -0.931289    -0.928732  
 -0.924093   -0.888602     -0.899569    -0.919305  
 -0.900893   -0.890036     -0.899159    -0.833744  
 -0.843608   -0.876263     -0.824617    -0.833744  
 -0.876694   -0.825227     -0.826193    -0.833744  
 -0.848498   -0.774427     -0.806406    -0.833744  
 -0.834176       -0.77     -0.791161    -0.833744  
 -0.837793   -0.822959     -0.781849    -0.790625  
  -2.12752    -2.05249      -2.05249  

←  →  ↓  ←  →  ↓  →  →  ↓  →  ↑  ←  ↑  ←  ←  ←  
↑  ←  ↓  ↓  →  ←  ←  ←  →  ↓  ↓  ←  ←  ←  ←  ←  
→  ←  →  ←  ←  ↓  ←  ↑  ↓  ↓  ↓  ←  ←  ←  ←  ←  
↓  →  ←  →  ↓  ↑  ↓  ←  ↓  ↑  ←  ←  ←  ←  ←  ←  
←  →  ←  ↑  ↑  ↓  →  ←  ↑  →  →  →  →  →  ←  ←  
←  ↓  ←  →  ←  ←  ←  →  ↓  ←  ↓  →  ↓  ↓  ←  ←  
↓  ↓  ↑  →  ↑  ←  ←  ←  ↓  ←  ↓  →  →  →  ↓  ←  
↓  ↓  ←  ↑  ↑  ←  →  ↓  ←  ←  ↓  ↓  ↑  ←  ↓  ↓  
→  →  ↓  ←  ←  ↓  ←  →  ←  ←  ↓  ↓  ←  →  ↓  ↓  
↑  ↑  ←  →  ←  ←  ↓  ←  →  ↓  ↓  ←  ←  ↓  →  ↓  
←  ←  ←  ←  ←  →  ↑  ↓  ←  →  ↓  ←  ↓  ←  →  ↓  
←  ←  ←  ←  ←  ↑  ↑  ↑  →  →  →  →  ↓  ↓  ↓  ↓  
←  ←  ←  ←  ←  ←  ↓  ←  ←  →  →  →  →  →  →  ←  


Unnamed: 0,rH,rG,rF,size,p,desc,map_name,is_slippery,epsilon,gamma,max_iter,qepsilon,lr,qgamma,episodes,initial,env_desc,env_rs,pi_time,pi_V,pi_epochs,pi_policy,pi_policy_arrows,vi_time,vi_V,vi_epochs,vi_policy,vi_policy_arrows,Q_time,Q,Q_epochs,Q_V,Q_policy,Q_policy_arrows
0,-1,1,-0.2,16,0.8,"[SFFFFFHHFFFFFFFF, FFFFHFFHFFFFFFHF, FHFFFFFHF...",16x16,False,1e-08,0.8,10000,0.1,0.6,0.95,10000,1,"[[S, F, F, F, F, F, H, H, F, F, F, F, F, F, F,...","[-0.2, -0.2, -0.2, -0.2, -0.2, -0.2, -1.0, -1....",0.10706,"[-0.99071544970536, -0.9883943121316999, -0.98...",19,"[1, 1, 1, 1, 2, 1, 0, 0, 2, 1, 1, 0, 0, 0, 0, ...","[[↓, ↓, ↓, ↓, →, ↓, ←, ←, →, ↓, ↓, ←, ←, ←, ←,...",0.1344,"[-0.9907154521919761, -0.9883943146183161, -0....",97,"[1, 1, 1, 1, 2, 1, 0, 0, 1, 1, 1, 1, 1, 1, 0, ...","[[↓, ↓, ↓, ↓, →, ↓, ←, ←, ↓, ↓, ↓, ↓, ↓, ↓, ←,...",0.381263,"[[-2.2423236997312137, -2.1498565253506303, -2...",372,"[-2.1498565253506303, -2.0524993331067183, -1....","[1, 1, 2, 0, 2, 3, 0, 0, 3, 1, 3, 2, 1, 1, 1, ...","[[↓, ↓, →, ←, →, ↑, ←, ←, ↑, ↓, ↑, →, ↓, ↓, ↓,..."


--Board--

[41mS[0mFFFFFHHFFFFFFFF
FFFFHFFHFFFFFFHF
FHFFFFFHFHFHFFFF
HFFHFFFFFFFFFHFH
FHFFFFHHFFFHHFHF
FHFFHFHFFFFHFFFF
FFHFFFFHFFFHHHHF
HFHFFFFFFFFFFFFH
FFFFFFFFFHFFFFHF
FFFFFFHHFHFFFFFH
FFHFFFFFFHFFFHFF
FFFHHFHFFFFFHFFF
FFFFFHFHFFFFHFFF
HHHHHFFFHFFHFFFF
FHFFHFFFFFFFFFFF
FFHFHHFFHFFFFFFG

--Actions for Position to the Left of the Goal--
{0: [(1.0, 253, -0.2, False)],
 1: [(1.0, 254, -0.2, False)],
 2: [(1.0, 255, 1, True)],
 3: [(1.0, 238, -0.2, False)]}

--Reward Values at Each State--
-0.2  -0.2  -0.2  -0.2  -0.2  -0.2    -1    -1  -0.2  -0.2  -0.2  -0.2  -0.2  -0.2  -0.2  -0.2  
-0.2  -0.2  -0.2  -0.2    -1  -0.2  -0.2    -1  -0.2  -0.2  -0.2  -0.2  -0.2  -0.2    -1  -0.2  
-0.2    -1  -0.2  -0.2  -0.2  -0.2  -0.2    -1  -0.2    -1  -0.2    -1  -0.2  -0.2  -0.2  -0.2  
  -1  -0.2  -0.2    -1  -0.2  -0.2  -0.2  -0.2  -0.2  -0.2  -0.2  -0.2  -0.2    -1  -0.2    -1  
-0.2    -1  -0.2  -0.2  -0.2  -0.2    -1    -1  -0.2  -0.2  -0.2    -1    -1  -0.2    -1  -0.2  
-0.2    -1  -0.2  -0

515 ms ± 93.1 ms per loop (mean ± std. dev. of 7 runs, 1 loop each)
Q-Learning converged after  552 epochs
--Q with all options--
  -2.2416     -2.14981   -2.14985   -2.24188  
 -2.21572      -2.0525   -2.06023   -2.14692  
 -2.11086     -1.96883   -1.97102   -1.99881  
 -1.98545      -1.8849   -1.87319   -1.87686  
 -1.79324     -1.93216   -1.79851   -1.79941  
 -1.77303     -1.71226   -1.72061   -1.70694  
       -1           -1         -1         -1  
       -1           -1         -1         -1  
  -1.3128    -0.992144   -1.03556   -1.02308  
 -1.01097     -1.03153   -1.05105   -1.04302  
 -1.02549     -1.01964  -0.974965  -0.982676  
-0.980025    -0.941317   -0.97827  -0.954885  
-0.962539    -0.933186  -0.974729  -0.921098  
-0.939822     -0.93612  -0.962899  -0.921098  
-0.924785      -1.3128  -0.904316  -0.921098  
-0.924201    -0.906627  -0.920517  -0.921098  
 -2.13475     -2.05245   -2.05247   -2.22723  
 -2.11244        -1.95   -1.95063    -2.1427  
 -1.94284     -1.87057  

 -0.888  -0.9173       -1  -1.2612  -1.1885  -1.1733  -1.0863       -1   -0.888  -0.8386  -0.7908       -1       -1       -1       -1        1  
     -1  -0.9108       -1  -1.1654  -1.1414  -1.0831  -1.0134  -0.9193  -0.8382  -0.7746  -0.6642  -0.5447   -0.424  -0.2877   -0.234       -1  
-0.9181  -0.9598  -1.0116   -1.075  -1.0898  -1.0338   -0.993  -0.9141  -0.8212       -1  -0.5429  -0.4461  -0.3319  -0.1658       -1        1  
-0.9136  -0.9177  -0.9734   -1.017  -1.0314  -0.9786       -1       -1  -0.7282       -1  -0.4241  -0.3447  -0.1626   0.0824     0.33       -1  
-0.9185  -0.9195       -1  -0.9639  -0.9676  -0.9094  -0.8078  -0.6974  -0.6024       -1  -0.3121  -0.2608    -0.18       -1   0.5838   0.4426  
-0.9103   -0.904   -0.888       -1       -1   -0.888       -1  -0.6003   -0.474  -0.3325  -0.2303  -0.2159       -1   0.6428   0.8397   0.5472  
 -0.888   -0.888   -0.888  -0.8746  -0.8819       -1   0.1247       -1  -0.3549  -0.1893  -0.0013  -0.1518       -1   0.7789      

Unnamed: 0,rH,rG,rF,size,p,desc,map_name,is_slippery,epsilon,gamma,max_iter,qepsilon,lr,qgamma,episodes,initial,env_desc,env_rs,pi_time,pi_V,pi_epochs,pi_policy,pi_policy_arrows,vi_time,vi_V,vi_epochs,vi_policy,vi_policy_arrows,Q_time,Q,Q_epochs,Q_V,Q_policy,Q_policy_arrows
0,-1,1,-0.2,16,0.8,"[SFFFFFHHFFFFFFFF, FFFFHFFHFFFFFFHF, FHFFFFFHF...",16x16,False,1e-08,0.8,10000,0.1,0.4,0.95,10000,1,"[[S, F, F, F, F, F, H, H, F, F, F, F, F, F, F,...","[-0.2, -0.2, -0.2, -0.2, -0.2, -0.2, -1.0, -1....",0.108923,"[-0.99071544970536, -0.9883943121316999, -0.98...",19,"[1, 1, 1, 1, 2, 1, 0, 0, 2, 1, 1, 0, 0, 0, 0, ...","[[↓, ↓, ↓, ↓, →, ↓, ←, ←, →, ↓, ↓, ←, ←, ←, ←,...",0.13187,"[-0.9907154521919761, -0.9883943146183161, -0....",97,"[1, 1, 1, 1, 2, 1, 0, 0, 1, 1, 1, 1, 1, 1, 0, ...","[[↓, ↓, ↓, ↓, →, ↓, ←, ←, ↓, ↓, ↓, ↓, ↓, ↓, ←,...",0.51478,"[[-2.241604020157463, -2.1498080874948906, -2....",552,"[-2.1498080874948906, -2.052498389185882, -1.9...","[1, 1, 1, 2, 0, 3, 0, 0, 1, 0, 2, 1, 3, 3, 2, ...","[[↓, ↓, ↓, →, ←, ↑, ←, ←, ↓, ←, →, ↓, ↑, ↑, →,..."


--Board--

[41mS[0mFFFFFHHFFFFFFFF
FFFFHFFHFFFFFFHF
FHFFFFFHFHFHFFFF
HFFHFFFFFFFFFHFH
FHFFFFHHFFFHHFHF
FHFFHFHFFFFHFFFF
FFHFFFFHFFFHHHHF
HFHFFFFFFFFFFFFH
FFFFFFFFFHFFFFHF
FFFFFFHHFHFFFFFH
FFHFFFFFFHFFFHFF
FFFHHFHFFFFFHFFF
FFFFFHFHFFFFHFFF
HHHHHFFFHFFHFFFF
FHFFHFFFFFFFFFFF
FFHFHHFFHFFFFFFG

--Actions for Position to the Left of the Goal--
{0: [(1.0, 253, -0.2, False)],
 1: [(1.0, 254, -0.2, False)],
 2: [(1.0, 255, 1, True)],
 3: [(1.0, 238, -0.2, False)]}

--Reward Values at Each State--
-0.2  -0.2  -0.2  -0.2  -0.2  -0.2    -1    -1  -0.2  -0.2  -0.2  -0.2  -0.2  -0.2  -0.2  -0.2  
-0.2  -0.2  -0.2  -0.2    -1  -0.2  -0.2    -1  -0.2  -0.2  -0.2  -0.2  -0.2  -0.2    -1  -0.2  
-0.2    -1  -0.2  -0.2  -0.2  -0.2  -0.2    -1  -0.2    -1  -0.2    -1  -0.2  -0.2  -0.2  -0.2  
  -1  -0.2  -0.2    -1  -0.2  -0.2  -0.2  -0.2  -0.2  -0.2  -0.2  -0.2  -0.2    -1  -0.2    -1  
-0.2    -1  -0.2  -0.2  -0.2  -0.2    -1    -1  -0.2  -0.2  -0.2    -1    -1  -0.2    -1  -0.2  
-0.2    -1  -0.2  -0

907 ms ± 82.2 ms per loop (mean ± std. dev. of 7 runs, 1 loop each)
Q-Learning converged after  1022 epochs
--Q with all options--
   -2.2419   -2.14975    -2.14974    -2.24164  
  -2.22008   -2.05248    -2.05343    -2.11957  
  -2.04231   -1.95737    -1.96012     -1.9869  
  -1.92379   -1.86534    -1.86915    -1.87354  
  -1.78459   -1.78782    -1.77972    -1.78274  
  -1.70263   -1.68395     -1.6966    -1.68452  
        -1         -1          -1          -1  
        -1         -1          -1          -1  
 -0.983344  -0.904111   -0.922041   -0.913549  
 -0.893628  -0.894153   -0.869394   -0.882373  
 -0.822767   -0.83536   -0.866855   -0.819073  
  -0.81371  -0.854504   -0.837191   -0.848074  
  -0.80734  -0.814014   -0.828298   -0.819073  
  -0.80621  -0.791009   -0.809756   -0.813836  
 -0.779016  -0.983344   -0.787881   -0.786942  
 -0.774978  -0.782352   -0.783989   -0.754487  
  -2.12934   -2.05241    -2.05242    -2.22802  
  -2.07664      -1.95    -1.96852     -2.1216  
   -1

↓  ↑  ←  ↓  →  ←  ←  ←  →  →  ↓  ←  ←  ←  ←  ←  
←  ↓  ←  ↓  ←  →  ←  ↓  ↓  →  ↓  ↓  ↓  ↓  ←  ←  
←  ↓  ←  ↓  ↓  ←  →  →  ↓  ←  ↓  →  ↓  ←  ←  ←  
↓  ↑  ↑  ←  ↑  ←  ←  ←  ↓  ←  ↓  →  →  →  ↓  ←  
←  ↑  ←  ↑  ←  →  →  ↓  ↓  ←  ↓  →  ←  ←  ↓  ↓  
↑  →  ↑  ←  ←  ↑  ←  →  ↓  →  ↓  ←  ←  ↓  →  ↓  
↑  ←  ↓  ↑  ↓  ←  ↓  ←  →  ↓  ↓  ↑  ←  ↓  →  ↓  
←  ←  ←  ←  ←  ↓  ↑  ↑  ←  ↓  ↓  ←  →  ←  ↓  ↓  
←  ←  ←  ←  ←  →  ↑  ↓  ←  →  →  →  ↓  →  ↓  ↓  
←  ←  ←  ←  ←  ←  ↑  ↓  ←  ↓  →  →  →  →  →  ←  


Unnamed: 0,rH,rG,rF,size,p,desc,map_name,is_slippery,epsilon,gamma,max_iter,qepsilon,lr,qgamma,episodes,initial,env_desc,env_rs,pi_time,pi_V,pi_epochs,pi_policy,pi_policy_arrows,vi_time,vi_V,vi_epochs,vi_policy,vi_policy_arrows,Q_time,Q,Q_epochs,Q_V,Q_policy,Q_policy_arrows
0,-1,1,-0.2,16,0.8,"[SFFFFFHHFFFFFFFF, FFFFHFFHFFFFFFHF, FHFFFFFHF...",16x16,False,1e-08,0.8,10000,0.1,0.2,0.95,10000,1,"[[S, F, F, F, F, F, H, H, F, F, F, F, F, F, F,...","[-0.2, -0.2, -0.2, -0.2, -0.2, -0.2, -1.0, -1....",0.106978,"[-0.99071544970536, -0.9883943121316999, -0.98...",19,"[1, 1, 1, 1, 2, 1, 0, 0, 2, 1, 1, 0, 0, 0, 0, ...","[[↓, ↓, ↓, ↓, →, ↓, ←, ←, →, ↓, ↓, ←, ←, ←, ←,...",0.133026,"[-0.9907154521919761, -0.9883943146183161, -0....",97,"[1, 1, 1, 1, 2, 1, 0, 0, 1, 1, 1, 1, 1, 1, 0, ...","[[↓, ↓, ↓, ↓, →, ↓, ←, ←, ↓, ↓, ↓, ↓, ↓, ↓, ←,...",0.906675,"[[-2.24189556909803, -2.14975081797252, -2.149...",1022,"[-2.1497404950135057, -2.0524790779077176, -1....","[2, 1, 1, 1, 2, 1, 0, 0, 1, 2, 3, 0, 0, 1, 0, ...","[[→, ↓, ↓, ↓, →, ↓, ←, ←, ↓, →, ↑, ←, ←, ↓, ←,..."


--Board--

[41mS[0mFFFFFHHFFFFFFFF
FFFFHFFHFFFFFFHF
FHFFFFFHFHFHFFFF
HFFHFFFFFFFFFHFH
FHFFFFHHFFFHHFHF
FHFFHFHFFFFHFFFF
FFHFFFFHFFFHHHHF
HFHFFFFFFFFFFFFH
FFFFFFFFFHFFFFHF
FFFFFFHHFHFFFFFH
FFHFFFFFFHFFFHFF
FFFHHFHFFFFFHFFF
FFFFFHFHFFFFHFFF
HHHHHFFFHFFHFFFF
FHFFHFFFFFFFFFFF
FFHFHHFFHFFFFFFG

--Actions for Position to the Left of the Goal--
{0: [(1.0, 253, -0.2, False)],
 1: [(1.0, 254, -0.2, False)],
 2: [(1.0, 255, 1, True)],
 3: [(1.0, 238, -0.2, False)]}

--Reward Values at Each State--
-0.2  -0.2  -0.2  -0.2  -0.2  -0.2    -1    -1  -0.2  -0.2  -0.2  -0.2  -0.2  -0.2  -0.2  -0.2  
-0.2  -0.2  -0.2  -0.2    -1  -0.2  -0.2    -1  -0.2  -0.2  -0.2  -0.2  -0.2  -0.2    -1  -0.2  
-0.2    -1  -0.2  -0.2  -0.2  -0.2  -0.2    -1  -0.2    -1  -0.2    -1  -0.2  -0.2  -0.2  -0.2  
  -1  -0.2  -0.2    -1  -0.2  -0.2  -0.2  -0.2  -0.2  -0.2  -0.2  -0.2  -0.2    -1  -0.2    -1  
-0.2    -1  -0.2  -0.2  -0.2  -0.2    -1    -1  -0.2  -0.2  -0.2    -1    -1  -0.2    -1  -0.2  
-0.2    -1  -0.2  -0

345 ms ± 56.9 ms per loop (mean ± std. dev. of 7 runs, 1 loop each)
Q-Learning converged after  323 epochs
--Q with all options--
  -2.24236    -2.14987   -2.14987    -2.24237  
  -2.23944     -2.0525   -2.07276    -2.14983  
  -2.10823    -1.99157   -2.02638    -2.05388  
  -2.01868    -1.97172     -1.971      -1.966  
  -1.93455     -1.9264    -1.9003    -1.89903  
  -1.88843    -1.81041     -1.832    -1.82017  
        -1          -1         -1          -1  
        -1          -1         -1          -1  
     -1.36    -1.35453   -1.28157    -1.31293  
  -1.31341    -1.35671   -1.29584    -1.28957  
  -1.19885     -1.2999   -1.19226    -1.17663  
  -1.15709    -1.21654   -1.17763    -1.17663  
  -1.21797    -1.19479   -1.27561    -1.17663  
  -1.26775    -1.25678   -1.29042    -1.28957  
  -1.31167       -1.36   -1.26525    -1.28957  
   -1.3053    -1.27858   -1.28957    -1.28957  
  -2.14978     -2.0525    -2.0525    -2.21673  
   -2.0463       -1.95   -1.97708    -2.00336  
  -2.0

↓  ↓  ↓  ↑  ↑  ↓  ←  ←  →  ↑  ↑  ←  ↑  ↓  →  ↓  
↓  ↓  ↓  ↓  ←  ↓  ←  ←  ↑  ←  →  ↑  ←  ↓  ←  ↓  
→  ←  ↓  →  →  ↓  ↓  ←  ↑  ←  ↓  ←  →  →  →  ←  
←  →  ↓  ←  →  ↑  ←  →  →  ↓  ↓  →  ←  ←  ↑  ←  
←  ←  ↓  →  →  ↓  ←  ←  →  ←  ↓  ←  ←  ←  ←  ←  
↑  ←  →  ↓  ←  ↓  ←  →  ↓  ←  ←  ←  ←  ←  ←  ←  
←  ←  ←  →  ↓  ←  ↓  ←  ↓  ↓  ↑  ←  ←  ←  ←  ←  
←  ↑  ←  →  ↓  →  ↓  →  ↓  ↑  →  ↓  →  →  ←  ←  
→  ↑  →  ↑  →  ↓  ↑  →  ↓  ←  →  →  →  ↓  ←  ←  
→  ↓  ←  ←  ←  ↓  ←  ←  ↓  ←  →  →  →  →  ↓  ←  
→  ↓  ←  ↑  →  →  →  ↓  ←  ←  ↓  ↑  ←  ←  ↓  ↓  
↓  ↑  ←  ←  ←  ↑  ←  ↑  →  ↓  ↓  ←  ←  →  →  ↓  
←  ↑  ↑  →  ←  ←  ↓  ←  →  ↓  ↓  ←  ←  →  ←  ↓  
←  ←  ←  ←  ←  ↓  ↑  ←  ←  ↓  ↓  ←  →  ↓  ↓  ↓  
←  ←  ←  ←  ←  ↑  →  ↑  →  →  →  →  →  ↓  ↓  ↓  
←  ←  ←  ←  ←  ←  ↓  ←  ←  →  →  →  →  →  →  ←  


Unnamed: 0,rH,rG,rF,size,p,desc,map_name,is_slippery,epsilon,gamma,max_iter,qepsilon,lr,qgamma,episodes,initial,env_desc,env_rs,pi_time,pi_V,pi_epochs,pi_policy,pi_policy_arrows,vi_time,vi_V,vi_epochs,vi_policy,vi_policy_arrows,Q_time,Q,Q_epochs,Q_V,Q_policy,Q_policy_arrows
0,-1,1,-0.2,16,0.8,"[SFFFFFHHFFFFFFFF, FFFFHFFHFFFFFFHF, FHFFFFFHF...",16x16,False,1e-08,0.8,10000,0.1,0.8,0.95,10000,1,"[[S, F, F, F, F, F, H, H, F, F, F, F, F, F, F,...","[-0.2, -0.2, -0.2, -0.2, -0.2, -0.2, -1.0, -1....",0.107953,"[-0.99071544970536, -0.9883943121316999, -0.98...",19,"[1, 1, 1, 1, 2, 1, 0, 0, 2, 1, 1, 0, 0, 0, 0, ...","[[↓, ↓, ↓, ↓, →, ↓, ←, ←, →, ↓, ↓, ←, ←, ←, ←,...",0.141809,"[-0.9907154521919761, -0.9883943146183161, -0....",97,"[1, 1, 1, 1, 2, 1, 0, 0, 1, 1, 1, 1, 1, 1, 0, ...","[[↓, ↓, ↓, ↓, →, ↓, ←, ←, ↓, ↓, ↓, ↓, ↓, ↓, ←,...",0.344643,"[[-2.2423641824564347, -2.1498734395300088, -2...",323,"[-2.1498734395300088, -2.052499972271947, -1.9...","[1, 1, 1, 3, 3, 1, 0, 0, 2, 3, 3, 0, 3, 1, 2, ...","[[↓, ↓, ↓, ↑, ↑, ↓, ←, ←, →, ↑, ↑, ←, ↑, ↓, →,..."


--Board--

[41mS[0mFFFFFHHFFFFFFFF
FFFFHFFHFFFFFFHF
FHFFFFFHFHFHFFFF
HFFHFFFFFFFFFHFH
FHFFFFHHFFFHHFHF
FHFFHFHFFFFHFFFF
FFHFFFFHFFFHHHHF
HFHFFFFFFFFFFFFH
FFFFFFFFFHFFFFHF
FFFFFFHHFHFFFFFH
FFHFFFFFFHFFFHFF
FFFHHFHFFFFFHFFF
FFFFFHFHFFFFHFFF
HHHHHFFFHFFHFFFF
FHFFHFFFFFFFFFFF
FFHFHHFFHFFFFFFG

--Actions for Position to the Left of the Goal--
{0: [(1.0, 253, -0.2, False)],
 1: [(1.0, 254, -0.2, False)],
 2: [(1.0, 255, 1, True)],
 3: [(1.0, 238, -0.2, False)]}

--Reward Values at Each State--
-0.2  -0.2  -0.2  -0.2  -0.2  -0.2    -1    -1  -0.2  -0.2  -0.2  -0.2  -0.2  -0.2  -0.2  -0.2  
-0.2  -0.2  -0.2  -0.2    -1  -0.2  -0.2    -1  -0.2  -0.2  -0.2  -0.2  -0.2  -0.2    -1  -0.2  
-0.2    -1  -0.2  -0.2  -0.2  -0.2  -0.2    -1  -0.2    -1  -0.2    -1  -0.2  -0.2  -0.2  -0.2  
  -1  -0.2  -0.2    -1  -0.2  -0.2  -0.2  -0.2  -0.2  -0.2  -0.2  -0.2  -0.2    -1  -0.2    -1  
-0.2    -1  -0.2  -0.2  -0.2  -0.2    -1    -1  -0.2  -0.2  -0.2    -1    -1  -0.2    -1  -0.2  
-0.2    -1  -0.2  -0

343 ms ± 59.2 ms per loop (mean ± std. dev. of 7 runs, 1 loop each)
Q-Learning converged after  303 epochs
--Q with all options--
 -2.23758    -2.14968    -2.14987    -2.22603  
 -2.05307     -2.0525    -2.06323    -2.13174  
 -2.06547    -1.97004    -1.96431    -1.98921  
 -1.94879    -1.89795    -1.88839    -1.89172  
 -1.85723    -1.94528    -1.80478     -1.8009  
 -1.78322    -1.73237     -1.9264    -1.78606  
       -1          -1          -1          -1  
       -1          -1          -1          -1  
    -1.36     -1.1037    -1.22768    -1.17663  
  -1.1768    -1.13804    -1.16802    -1.17663  
 -1.16095    -1.14013    -1.06648    -1.05899  
 -1.05741    -1.11829    -1.02879    -1.10307  
-0.974868    -1.09087   -0.991111   -0.972002  
-0.990005    -1.02331   -0.939272   -0.936451  
  -1.0666       -1.36    -1.05292    -1.05899  
 -1.07888    -1.12685    -1.05899    -1.05899  
 -2.12711    -2.05247    -2.05237    -2.20224  
 -2.04254       -1.95    -1.98361    -2.06928  
 -1.89

↓  ←  ←  ↓  ↓  ↓  ←  ←  ↓  ←  →  ↓  ←  →  ↓  ←  
→  ↓  ←  →  →  →  →  →  ↓  ←  ↓  ←  ↑  ←  ↓  ↓  
←  ←  ←  ←  ←  ↑  ←  →  →  ↓  ↓  ↓  ←  →  →  ↓  
↑  ←  →  ←  ←  ←  ↑  ←  →  ↓  ↓  ←  ←  ↓  ↓  ↓  
←  ←  ←  ←  ←  →  ←  ←  ←  ↓  ↓  ←  →  ↓  ↓  ↓  
←  ←  ←  ←  ←  ↑  ←  ↓  ←  →  ↓  →  →  ↓  ↓  ↓  
←  ←  ←  ←  ←  ←  ↓  ↑  ←  ↓  →  →  →  →  →  ←  


Unnamed: 0,rH,rG,rF,size,p,desc,map_name,is_slippery,epsilon,gamma,max_iter,qepsilon,lr,qgamma,episodes,initial,env_desc,env_rs,pi_time,pi_V,pi_epochs,pi_policy,pi_policy_arrows,vi_time,vi_V,vi_epochs,vi_policy,vi_policy_arrows,Q_time,Q,Q_epochs,Q_V,Q_policy,Q_policy_arrows
0,-1,1,-0.2,16,0.8,"[SFFFFFHHFFFFFFFF, FFFFHFFHFFFFFFHF, FHFFFFFHF...",16x16,False,1e-08,0.6,10000,0.1,0.8,0.95,10000,1,"[[S, F, F, F, F, F, H, H, F, F, F, F, F, F, F,...","[-0.2, -0.2, -0.2, -0.2, -0.2, -0.2, -1.0, -1....",0.117467,"[-0.4999988946304014, -0.49999815771733574, -0...",21,"[1, 1, 2, 2, 2, 1, 0, 0, 2, 1, 1, 0, 1, 0, 0, ...","[[↓, ↓, →, →, →, ↓, ←, ←, →, ↓, ↓, ←, ↓, ←, ←,...",0.056136,"[-0.4999988953522461, -0.4999981584391804, -0....",44,"[1, 1, 1, 1, 2, 1, 0, 0, 1, 1, 1, 1, 1, 1, 0, ...","[[↓, ↓, ↓, ↓, →, ↓, ←, ←, ↓, ↓, ↓, ↓, ↓, ↓, ←,...",0.342958,"[[-2.237583534539702, -2.149678017161159, -2.1...",303,"[-2.149678017161159, -2.0524998582434955, -1.9...","[1, 1, 2, 2, 3, 1, 0, 0, 1, 1, 3, 2, 3, 3, 2, ...","[[↓, ↓, →, →, ↑, ↓, ←, ←, ↓, ↓, ↑, →, ↑, ↑, →,..."


--Board--

[41mS[0mFFFFFHHFFFFFFFF
FFFFHFFHFFFFFFHF
FHFFFFFHFHFHFFFF
HFFHFFFFFFFFFHFH
FHFFFFHHFFFHHFHF
FHFFHFHFFFFHFFFF
FFHFFFFHFFFHHHHF
HFHFFFFFFFFFFFFH
FFFFFFFFFHFFFFHF
FFFFFFHHFHFFFFFH
FFHFFFFFFHFFFHFF
FFFHHFHFFFFFHFFF
FFFFFHFHFFFFHFFF
HHHHHFFFHFFHFFFF
FHFFHFFFFFFFFFFF
FFHFHHFFHFFFFFFG

--Actions for Position to the Left of the Goal--
{0: [(1.0, 253, -0.2, False)],
 1: [(1.0, 254, -0.2, False)],
 2: [(1.0, 255, 1, True)],
 3: [(1.0, 238, -0.2, False)]}

--Reward Values at Each State--
-0.2  -0.2  -0.2  -0.2  -0.2  -0.2    -1    -1  -0.2  -0.2  -0.2  -0.2  -0.2  -0.2  -0.2  -0.2  
-0.2  -0.2  -0.2  -0.2    -1  -0.2  -0.2    -1  -0.2  -0.2  -0.2  -0.2  -0.2  -0.2    -1  -0.2  
-0.2    -1  -0.2  -0.2  -0.2  -0.2  -0.2    -1  -0.2    -1  -0.2    -1  -0.2  -0.2  -0.2  -0.2  
  -1  -0.2  -0.2    -1  -0.2  -0.2  -0.2  -0.2  -0.2  -0.2  -0.2  -0.2  -0.2    -1  -0.2    -1  
-0.2    -1  -0.2  -0.2  -0.2  -0.2    -1    -1  -0.2  -0.2  -0.2    -1    -1  -0.2    -1  -0.2  
-0.2    -1  -0.2  -0

342 ms ± 59.2 ms per loop (mean ± std. dev. of 7 runs, 1 loop each)
Q-Learning converged after  316 epochs
--Q with all options--
  -2.24234    -2.14987    -2.14987    -2.24238  
  -2.23665     -2.0525     -2.0714    -2.14784  
  -2.04459    -1.99869    -1.99292    -1.99512  
  -2.06658    -1.97835    -1.95288    -2.00092  
  -1.98983    -1.94996    -1.86352    -1.86935  
  -1.85758     -1.7849      -1.832    -1.84754  
        -1          -1          -1          -1  
        -1          -1          -1          -1  
    -1.832    -1.18113    -1.21323    -1.14224  
  -1.12732    -1.23153    -1.25825    -1.17663  
  -1.17498    -1.25577    -1.16097    -1.17663  
  -1.17411    -1.17577    -1.14684    -1.05899  
  -1.13981    -1.19934    -1.17265    -1.17663  
  -1.17449    -1.17899    -1.17561    -1.17663  
  -1.18433      -1.832    -1.15686    -1.17663  
  -1.15227    -1.15064    -1.17663    -1.17663  
  -2.14969     -2.0525     -2.0525    -2.23778  
  -2.07124       -1.95    -2.00785   

-1.1686  -1.1768  -1.1908  -1.1762   -1.064       -1   0.6176       -1  -0.3176  -0.0527   0.1549  -0.0935       -1   0.4694    0.416   1.3699  
     -1       -1       -1       -1       -1   0.5411   0.7504   0.5315       -1   0.1551   0.3738       -1   0.5472   0.4541   1.2172   1.6525  
      1        1        1        1       -1    0.608   0.5984   0.4059   0.3317   0.3738    0.604   0.8463   1.1014    0.608   1.4762     1.95  
      1        1        1        1        1       -1   0.4237   0.4237       -1   0.2467   0.3055    1.066   1.3699   1.6525     1.95        1  

--Q-Learning Policy Matrix--
↓  ↓  →  →  →  ↓  ←  ←  ↑  ←  →  ↑  ←  ←  →  ↓  
↓  ↓  ↓  ↓  ←  ↓  ↓  ←  ↑  ↑  →  ←  →  ↓  ←  ↑  
→  ←  ↓  →  ↓  ↓  ↓  ←  ↑  ←  ↓  ←  ↓  ↑  ←  ←  
←  →  ↓  ←  ↓  ↓  →  →  →  →  →  →  ←  ←  ↑  ←  
←  ←  ↓  ↓  →  ↓  ←  ←  →  ↓  ←  ←  ←  ←  ←  ←  
←  ←  →  ↓  ←  ←  ←  →  ↑  ↓  ↓  ←  ←  ←  ←  ←  
←  ←  ←  →  ↓  ←  ↓  ←  ↑  ←  ↓  ←  ←  ←  ←  ←  
←  ↓  ←  →  →  ↑  →  ↓  →  →  ↑  →  →  →  ←  ← 

Unnamed: 0,rH,rG,rF,size,p,desc,map_name,is_slippery,epsilon,gamma,max_iter,qepsilon,lr,qgamma,episodes,initial,env_desc,env_rs,pi_time,pi_V,pi_epochs,pi_policy,pi_policy_arrows,vi_time,vi_V,vi_epochs,vi_policy,vi_policy_arrows,Q_time,Q,Q_epochs,Q_V,Q_policy,Q_policy_arrows
0,-1,1,-0.2,16,0.8,"[SFFFFFHHFFFFFFFF, FFFFHFFHFFFFFFHF, FHFFFFFHF...",16x16,False,1e-08,0.4,10000,0.1,0.8,0.95,10000,1,"[[S, F, F, F, F, F, H, H, F, F, F, F, F, F, F,...","[-0.2, -0.2, -0.2, -0.2, -0.2, -0.2, -1.0, -1....",0.106519,"[-0.33333333332756876, -0.3333333333189219, -0...",19,"[1, 1, 1, 1, 2, 1, 0, 0, 2, 1, 1, 0, 0, 0, 0, ...","[[↓, ↓, ↓, ↓, →, ↓, ←, ←, →, ↓, ↓, ←, ←, ←, ←,...",0.032775,"[-0.33333333323950837, -0.33333333323950837, -...",25,"[0, 0, 0, 0, 0, 1, 0, 0, 1, 1, 1, 1, 1, 1, 0, ...","[[←, ←, ←, ←, ←, ↓, ←, ←, ↓, ↓, ↓, ↓, ↓, ↓, ←,...",0.342001,"[[-2.2423428805237444, -2.149873352808323, -2....",316,"[-2.149873352808323, -2.0524998819832887, -1.9...","[1, 1, 2, 2, 2, 1, 0, 0, 3, 0, 2, 3, 0, 0, 2, ...","[[↓, ↓, →, →, →, ↓, ←, ←, ↑, ←, →, ↑, ←, ←, →,..."


--Board--

[41mS[0mFFFFFHHFFFFFFFF
FFFFHFFHFFFFFFHF
FHFFFFFHFHFHFFFF
HFFHFFFFFFFFFHFH
FHFFFFHHFFFHHFHF
FHFFHFHFFFFHFFFF
FFHFFFFHFFFHHHHF
HFHFFFFFFFFFFFFH
FFFFFFFFFHFFFFHF
FFFFFFHHFHFFFFFH
FFHFFFFFFHFFFHFF
FFFHHFHFFFFFHFFF
FFFFFHFHFFFFHFFF
HHHHHFFFHFFHFFFF
FHFFHFFFFFFFFFFF
FFHFHHFFHFFFFFFG

--Actions for Position to the Left of the Goal--
{0: [(1.0, 253, -0.2, False)],
 1: [(1.0, 254, -0.2, False)],
 2: [(1.0, 255, 1, True)],
 3: [(1.0, 238, -0.2, False)]}

--Reward Values at Each State--
-0.2  -0.2  -0.2  -0.2  -0.2  -0.2    -1    -1  -0.2  -0.2  -0.2  -0.2  -0.2  -0.2  -0.2  -0.2  
-0.2  -0.2  -0.2  -0.2    -1  -0.2  -0.2    -1  -0.2  -0.2  -0.2  -0.2  -0.2  -0.2    -1  -0.2  
-0.2    -1  -0.2  -0.2  -0.2  -0.2  -0.2    -1  -0.2    -1  -0.2    -1  -0.2  -0.2  -0.2  -0.2  
  -1  -0.2  -0.2    -1  -0.2  -0.2  -0.2  -0.2  -0.2  -0.2  -0.2  -0.2  -0.2    -1  -0.2    -1  
-0.2    -1  -0.2  -0.2  -0.2  -0.2    -1    -1  -0.2  -0.2  -0.2    -1    -1  -0.2    -1  -0.2  
-0.2    -1  -0.2  -0

Q-Learning converged after  322 epochs
--Q with all options--
  -2.23081    -2.14987    -2.14987    -2.24225  
  -2.24211     -2.0525     -2.0794    -2.12173  
  -2.06298    -1.99609    -1.99514    -2.04676  
  -2.01677    -1.91385    -1.90045    -1.90548  
  -1.89022    -1.94906    -1.85035    -1.81092  
  -1.78616    -1.79414    -1.94528    -1.77602  
        -1          -1          -1          -1  
        -1          -1          -1          -1  
     -1.36    -1.18106    -1.28956    -1.17663  
  -1.20012    -1.24448    -1.28957    -1.28957  
  -1.29534    -1.26025    -1.34759    -1.28957  
  -1.27795    -1.34707    -1.37599    -1.34657  
  -1.32913    -1.36169    -1.28322    -1.28957  
  -1.34113    -1.28127     -1.2405    -1.28957  
  -1.28843       -1.36    -1.22993    -1.17663  
  -1.28529    -1.19713    -1.28957    -1.28957  
  -2.14894     -2.0525     -2.0525    -2.21985  
   -2.1388       -1.95    -1.99312     -2.0048  
  -1.92241    -1.91024    -1.92624     -2.0313  
  -1.98

↑  →  →  →  ←  ←  ↓  ←  →  →  ↓  ←  ←  →  ↓  ↓  
←  ←  ←  ←  ←  →  ↑  ←  ←  ↓  ↓  ←  ↓  ←  →  ↓  
←  ←  ←  ←  ←  ↑  ↓  ←  →  →  →  →  ↓  ↓  ↓  ↓  
←  ←  ←  ←  ←  ←  ↑  ↓  ←  ↓  ←  →  →  →  →  ←  


Unnamed: 0,rH,rG,rF,size,p,desc,map_name,is_slippery,epsilon,gamma,max_iter,qepsilon,lr,qgamma,episodes,initial,env_desc,env_rs,pi_time,pi_V,pi_epochs,pi_policy,pi_policy_arrows,vi_time,vi_V,vi_epochs,vi_policy,vi_policy_arrows,Q_time,Q,Q_epochs,Q_V,Q_policy,Q_policy_arrows
0,-1,1,-0.2,16,0.8,"[SFFFFFHHFFFFFFFF, FFFFHFFHFFFFFFHF, FHFFFFFHF...",16x16,False,1e-08,0.2,10000,0.1,0.8,0.95,10000,1,"[[S, F, F, F, F, F, H, H, F, F, F, F, F, F, F,...","[-0.2, -0.2, -0.2, -0.2, -0.2, -0.2, -1.0, -1....",0.067487,"[-0.25, -0.25, -0.25, -0.25, -0.25, -0.25, -1....",12,"[0, 0, 0, 0, 0, 0, 0, 0, 1, 1, 1, 0, 0, 0, 0, ...","[[←, ←, ←, ←, ←, ←, ←, ←, ↓, ↓, ↓, ←, ←, ←, ←,...",0.023044,"[-0.24999999995904001, -0.24999999995904001, -...",15,"[0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, ...","[[←, ←, ←, ←, ←, ←, ←, ←, ↓, ←, ←, ←, ←, ←, ←,...",0.352674,"[[-2.2308136923961635, -2.1498745270375994, -2...",322,"[-2.1498745270375994, -2.0524999426173984, -1....","[1, 1, 2, 2, 3, 3, 0, 0, 3, 0, 1, 0, 2, 2, 3, ...","[[↓, ↓, →, →, ↑, ↑, ←, ←, ↑, ←, ↓, ←, →, →, ↑,..."


Complete!


# Notes

Default rewards in OpenAI Gym's FrozenLake-v0 are 1 for reaching the G(oal) and 0 for everything else.

Maps are drawn according to the following logic

```
if desc and map_name are both None,
   then a default random map of size 8 (8x8) is drawn
        using frozen_lake.generate_random_map(size=8, p=0.8)
elif desc is None and a map_name is given
   then map_name must be either '4x4' or '8x8'
        and the map is drawn from the dict MAPS in frozen_lake.py
elif desc is given
   then it must be in the form of a list of strings, one string per grid row,
        where each character is S(tart), F(rozen), H(ole), or G(oal)

By default (`is_slippery=True`), the agent moves in the chosen direction with probability 1/3 and in each of the two directions at right angles to it with probability 1/3; it never moves in the reverse of the chosen direction. If `is_slippery=False`, then P=1 for the chosen action and 0 for all other actions.

|ACTION|Value|Symbol|
|------|-----|------|
|LEFT  | 0   | ←    |
|DOWN  | 1   | ↓    |
|RIGHT | 2   | →    |
|UP    | 3   | ↑    |

# Sources

- Environment: <https://gym.openai.com/envs/FrozenLake-v0/>
- Code: <https://github.com/Twice22/HandsOnRL>
- Tutorial: <https://twice22.github.io/>