## Assignment 4 - MDP

## Importing Libraries

In [2]:
import cs7641assn4 as a4
import numpy as np
import pandas as pd
import warnings
import xlrd

# Let pandas show up to 35 columns so the 34-column per-run results table
# is displayed without elided columns.
# (Restore the default with pd.reset_option("display.max_columns").)
pd.options.display.max_columns = 35

## Importing Settings

In [33]:

# Read the run settings from the 'default_settings.xlsx' workbook.
# NOTE(review): this is an absolute, machine-specific path; point it at your
# own copy of the spreadsheet before running.
settings_path = 'C:\\Users\\Mike\\Documents\\Georgia Tech\\Machine Learning\\hw4-ncook-solution-master\\default_settings.xlsx'

# DataFrame.to_dict() yields {column_name: {row_index: value}}, so an
# individual setting is looked up as settings[column][row_index].
settings_frame = pd.read_excel(settings_path)
settings = settings_frame.to_dict()

# One spreadsheet row per run: the length of any column is the run count.
n_settings = len(settings['rH'])

In [35]:
for n in range(n_settings):
    # Load settings
    rH = settings['rH'][n] # -1 #-5 # reward for H(ole)
    rG = settings['rG'][n] #1 # 10 # reward for G(oal)
    rF = settings['rF'][n] #-0.2# reward includes S(tart) and F(rozen)
    size = settings['size'][n] #4 # height and width of square gridworld, [4, 8, 16] are included in cs7641assn4.py 
    p = settings['p'][n] #0.8 # if generating a random map probability that a grid will be F(rozen)
    map_name = 'x'.join([str(size)]*2) # None, if you want a random map
    desc = a4.MAPS[map_name] # None, if you want a random map
    is_slippery = settings['is_slippery'][n] #False
    render_initial = settings['render_initial'][n] # True

    epsilon = settings['epsilon'][n] #1e-8 # convergence threshold for policy/value iteration
    gamma = settings['gamma'][n] #0.8 # discount parameter for past policy/value iterations
    max_iter = settings['max_iter'][n] #10000 # maximum iterations for slowly converging policy/value iteration 

    # Qlearning(env, rH=0, rG=1, rF=0, qepsilon=0.1, lr=0.8, gamma=0.95, episodes=10000)
    qepsilon = settings['qepsilon'][n] #0.1 # epsilon value for the Q-learning epsilon greedy strategy
    lr = settings['lr'][n] #0.8 # Q-learning rate
    qgamma = settings['qgamma'][n] #0.95 # Q-Learning discount factor
    episodes = settings['episodes'][n] #10000 # number of Q-learning episodes
    initial = settings['initial'][n] #0 # value to initialize the Q grid
    decay = settings['decay'][n] #True

    # Printing options
    report = settings['report'][n] #True # For cs7641assn4.py policy and value iteration functions
    display_print = settings['display_print'][n] #True # For this script

    # Create Environment
    env = a4.getEnv(env_id='hw4-FrozenLake-v0', rH=rH, rG=rG, rF=rF, 
                    desc=desc,  
                    is_slippery=is_slippery, render_initial=True)

    # Store a representation of the map
    env_desc = env.desc.astype('<U8')

    # Store a representation of the state rewards
    env_rs = a4.getStateReward(env)

    if display_print:
        # Display reward at each state
        print('\n--Reward Values at Each State--')
        a4.matprint(a4.print_value(env_rs, width=size, height=size))
        
    ## Policy Iteration
    print('\n--Policy Iteration TimeIt--')
    pi_time = %timeit -o a4.policy_iteration(env, epsilon, gamma, max_iter, report=False)
    
    pi_V, pi_policy, pi_epochs = a4.policy_iteration(env, epsilon, gamma, max_iter, report=report)

    pi_policy_arrows = a4.print_policy(pi_policy, width=size, height=size)

    if display_print:
        # Display values
        print('\n--Policy Iteration Values in grid order--')
        a4.matprint(a4.print_value(pi_V, width=size, height=size))

        # Display policy
        print('\n--Policy Iteration Policy Matrix--')
        a4.matprint(pi_policy_arrows)
        
    ## Value Iteration
    print('\n--Value Iteration TimeIt--')
    vi_time = %timeit -o a4.valueIteration(env, epsilon, gamma, max_iter, report=False)
    
    vi_V, vi_epochs = a4.valueIteration(env, epsilon, gamma, max_iter, report=report)

    vi_policy = a4.value_to_policy(env, V=vi_V, gamma=gamma)

    vi_policy_arrows = a4.print_policy(vi_policy, width=size, height=size)

    if display_print:
        # display value function:
        print('\n--Value Iteration Values in grid order--')
        a4.matprint(a4.print_value(vi_V, width=size, height=size))
        
        # display policy
        print('\n--Value Iteration Policy Matrix--')
        a4.matprint(vi_policy_arrows)
        
    ## Q-Learning
    print('\n--Q-Learning TimeIt--')
    Q_time = %timeit -o a4.Qlearning(env, qepsilon, lr, qgamma, episodes, initial, decay, report=False)
        
    Q, Q_epochs = a4.Qlearning(env, qepsilon, lr, qgamma, episodes, initial, decay, report)

    maxQ = np.max(Q,axis=1)

    Q_policy = a4.Q_to_policy(Q)

    Q_policy_arrows = a4.print_policy(Q_policy, width=size, height=size)

    if display_print: 
        print('--Q with all options--')
        a4.matprint(Q)
        print('\n--argmax(Q) in grid order--')
        a4.matprint(a4.print_value(maxQ, width=size, height=size))
        print('\n--Q-Learning Policy Matrix--')
        a4.matprint(Q_policy_arrows)
        
    ## Save results to DataFrame
    results = pd.DataFrame({'rH': [rH], 
                        'rG': [rG], 
                        'rF': [rF], 
                        'size': [size], 
                        'p': [p], 
                        'desc': [desc], 
                        'map_name': [map_name],                        
                        'is_slippery': [is_slippery],
                        'epsilon': [epsilon],
                        'gamma': [gamma], 
                        'max_iter': [max_iter], 
                        'qepsilon': [qepsilon], 
                        'lr': [lr], 
                        'qgamma': [qgamma], 
                        'episodes': [episodes], 
                        'initial': [initial],
                        'env_desc': [env_desc],
                        'env_rs': [env_rs],
                        'pi_time': [pi_time.average],
                        'pi_V': [pi_V],
                        'pi_epochs': [pi_epochs],
                        'pi_policy': [pi_policy],
                        'pi_policy_arrows': [pi_policy_arrows],
                        'vi_time': [vi_time.average],
                        'vi_V': [vi_V],
                        'vi_epochs': [vi_epochs],
                        'vi_policy': [vi_policy],
                        'vi_policy_arrows': [vi_policy_arrows],
                        'Q_time': [Q_time.average],
                        'Q': [Q],
                        'Q_epochs': [Q_epochs],
                        'Q_V': [maxQ],
                        'Q_policy': [Q_policy],
                        'Q_policy_arrows': [Q_policy_arrows]})
    
    if display_print: 
        display(results)
        
    ## Save results to disk
    warnings.simplefilter(action='ignore', category=pd.errors.PerformanceWarning)
    try:
        dataset = pd.read_hdf('data.h5', key='dataset', mode='a')
    except FileNotFoundError:
        results.to_hdf('data.h5', key='dataset', mode='a')
    else:
        dataset.append(
            other=results, 
            ignore_index=True,
            sort=False
            ).to_hdf(
            path_or_buf='data.h5', 
            key='dataset', 
            mode='a')
        
    if display_print:
        pd.read_hdf('data.h5', key='dataset', mode='a')
    
    
print('Complete!')   

--Board--

[41mS[0mFFFFFHHFFFFFFFF
FFFFHFFHFFFFFFHF
FHFFFFFHFHFHFFFF
HFFHFFFFFFFFFHFH
FHFFFFHHFFFHHFHF
FHFFHFHFFFFHFFFF
FFHFFFFHFFFHHHHF
HFHFFFFFFFFFFFFH
FFFFFFFFFHFFFFHF
FFFFFFHHFHFFFFFH
FFHFFFFFFHFFFHFF
FFFHHFHFFFFFHFFF
FFFFFHFHFFFFHFFF
HHHHHFFFHFFHFFFF
FHFFHFFFFFFFFFFF
FFHFHHFFHFFFFFFG

--Actions for Position to the Left of the Goal--
{0: [(1.0, 253, -0.2, False)],
 1: [(1.0, 254, -0.2, False)],
 2: [(1.0, 255, 1, True)],
 3: [(1.0, 238, -0.2, False)]}

--Reward Values at Each State--
-0.2  -0.2  -0.2  -0.2  -0.2  -0.2    -1    -1  -0.2  -0.2  -0.2  -0.2  -0.2  -0.2  -0.2  -0.2  
-0.2  -0.2  -0.2  -0.2    -1  -0.2  -0.2    -1  -0.2  -0.2  -0.2  -0.2  -0.2  -0.2    -1  -0.2  
-0.2    -1  -0.2  -0.2  -0.2  -0.2  -0.2    -1  -0.2    -1  -0.2    -1  -0.2  -0.2  -0.2  -0.2  
  -1  -0.2  -0.2    -1  -0.2  -0.2  -0.2  -0.2  -0.2  -0.2  -0.2  -0.2  -0.2    -1  -0.2    -1  
-0.2    -1  -0.2  -0.2  -0.2  -0.2    -1    -1  -0.2  -0.2  -0.2    -1    -1  -0.2    -1  -0.2  
-0.2    -1  -0.2  -0

Q-Learning converged after  327 epochs
--Q with all options--
 -2.24236    -2.14987    -2.14987   -2.24152  
 -2.21983     -2.0525    -2.06099   -2.13791  
 -2.12644    -1.96818     -1.9682   -2.01351  
 -1.90041    -1.90377    -1.89183    -1.9232  
 -1.92103    -1.94906    -1.79556   -1.84672  
 -1.80672    -1.75565     -1.9264    -1.7751  
       -1          -1          -1         -1  
       -1          -1          -1         -1  
    -1.36    -1.25286    -1.20068   -1.17663  
 -1.25126    -1.22076    -1.13908   -1.23426  
 -1.08653    -1.14916    -1.11126   -1.05899  
 -1.08255    -1.17061    -1.06769   -1.05899  
 -1.15071    -1.06228    -1.05433   -1.05899  
 -1.16336    -1.06322    -1.15696   -1.14707  
 -1.15155       -1.36     -1.1718   -1.17663  
  -1.1726    -1.15772    -1.17663   -1.17663  
 -2.14953     -2.0525     -2.0525    -2.2399  
 -2.14531       -1.95    -1.97525   -2.12574  
 -1.99273    -1.88998     -1.9115   -1.93421  
 -1.85797    -1.84804    -1.94996   -1.94759 

Unnamed: 0,rH,rG,rF,size,p,desc,map_name,is_slippery,epsilon,gamma,max_iter,qepsilon,lr,qgamma,episodes,initial,env_desc,env_rs,pi_time,pi_V,pi_epochs,pi_policy,pi_policy_arrows,vi_time,vi_V,vi_epochs,vi_policy,vi_policy_arrows,Q_time,Q,Q_epochs,Q_V,Q_policy,Q_policy_arrows
0,-1,1,-0.2,16,0.8,"[SFFFFFHHFFFFFFFF, FFFFHFFHFFFFFFHF, FHFFFFFHF...",16x16,False,1e-08,0.8,10000,0.1,0.8,0.95,10000,1,"[[S, F, F, F, F, F, H, H, F, F, F, F, F, F, F,...","[-0.2, -0.2, -0.2, -0.2, -0.2, -0.2, -1.0, -1....",0.171301,"[-0.99071544970536, -0.9883943121316999, -0.98...",19,"[1, 1, 1, 1, 2, 1, 0, 0, 2, 1, 1, 0, 0, 0, 0, ...","[[↓, ↓, ↓, ↓, →, ↓, ←, ←, →, ↓, ↓, ←, ←, ←, ←,...",0.240601,"[-0.9907154521919761, -0.9883943146183161, -0....",97,"[1, 1, 1, 1, 2, 1, 0, 0, 1, 1, 1, 1, 1, 1, 0, ...","[[↓, ↓, ↓, ↓, →, ↓, ←, ←, ↓, ↓, ↓, ↓, ↓, ↓, ←,...",0.753117,"[[-2.2423624612839927, -2.149871306879552, -2....",327,"[-2.149871306879552, -2.0524998803547048, -1.9...","[1, 1, 1, 2, 2, 1, 0, 0, 3, 2, 3, 3, 2, 1, 0, ...","[[↓, ↓, ↓, →, →, ↓, ←, ←, ↑, →, ↑, ↑, →, ↓, ←,..."


--Board--

[41mS[0mFFFFFHHFFFFFFFF
FFFFHFFHFFFFFFHF
FHFFFFFHFHFHFFFF
HFFHFFFFFFFFFHFH
FHFFFFHHFFFHHFHF
FHFFHFHFFFFHFFFF
FFHFFFFHFFFHHHHF
HFHFFFFFFFFFFFFH
FFFFFFFFFHFFFFHF
FFFFFFHHFHFFFFFH
FFHFFFFFFHFFFHFF
FFFHHFHFFFFFHFFF
FFFFFHFHFFFFHFFF
HHHHHFFFHFFHFFFF
FHFFHFFFFFFFFFFF
FFHFHHFFHFFFFFFG

--Actions for Position to the Left of the Goal--
{0: [(1.0, 253, -0.2, False)],
 1: [(1.0, 254, -0.2, False)],
 2: [(1.0, 255, 1, True)],
 3: [(1.0, 238, -0.2, False)]}

--Reward Values at Each State--
-0.2  -0.2  -0.2  -0.2  -0.2  -0.2    -1    -1  -0.2  -0.2  -0.2  -0.2  -0.2  -0.2  -0.2  -0.2  
-0.2  -0.2  -0.2  -0.2    -1  -0.2  -0.2    -1  -0.2  -0.2  -0.2  -0.2  -0.2  -0.2    -1  -0.2  
-0.2    -1  -0.2  -0.2  -0.2  -0.2  -0.2    -1  -0.2    -1  -0.2    -1  -0.2  -0.2  -0.2  -0.2  
  -1  -0.2  -0.2    -1  -0.2  -0.2  -0.2  -0.2  -0.2  -0.2  -0.2  -0.2  -0.2    -1  -0.2    -1  
-0.2    -1  -0.2  -0.2  -0.2  -0.2    -1    -1  -0.2  -0.2  -0.2    -1    -1  -0.2    -1  -0.2  
-0.2    -1  -0.2  -0

628 ms ± 136 ms per loop (mean ± std. dev. of 7 runs, 1 loop each)
Q-Learning converged after  373 epochs
--Q with all options--
  -2.24226     -2.14983    -2.14987     -2.24192  
  -2.20738      -2.0525    -2.07026     -2.13427  
  -2.10225     -1.98052     -1.9846      -2.0001  
  -1.92081     -1.89383    -1.91244     -1.92536  
  -1.85074     -1.87448    -1.84938     -1.88276  
  -1.86531     -1.76037    -1.93792      -1.7846  
        -1           -1          -1           -1  
        -1           -1          -1           -1  
    -1.478    -0.942276    -0.94134    -0.969528  
 -0.943025    -0.940913   -0.850254    -0.906326  
 -0.807601    -0.905352    -0.83509    -0.833744  
  -0.79334    -0.839984   -0.823406    -0.735819  
 -0.733877    -0.780649   -0.827635    -0.735819  
 -0.747871    -0.832601   -0.834318    -0.833744  
 -0.777785       -1.478    -0.83374    -0.735819  
 -0.738711     -0.83427   -0.833744    -0.833744  
   -2.1493     -2.05247    -2.05249      -2.2375  
  -2

  0.631765     0.621565      0.6463       0.7045  
    0.7045       0.7045     1.36981         0.85  
  0.974294       1.6525           1            1  
         1            1           1            1  
         1            1           1            1  
         1            1           1            1  
         1            1           1            1  
        -1           -1          -1           -1  
    -1.478        -0.77  -0.0191817    0.0559039  
-0.0171967    0.0419718   0.0564693  -0.00147396  
 0.0768684    0.0801671  -0.0418203    0.0628544  
  0.056201        -0.77  -0.0500817        -0.77  
 -0.012856    0.0535682   0.0972317     0.026418  
  0.123384     0.161213    0.420882      0.02977  
  0.180872     0.338252    0.778386        -0.77  
  0.534794      1.08833    0.556765     0.553765  
  0.573685     0.985398      0.6805     0.631765  
    0.6703      1.24486        0.85         0.85  
      0.85         1.95           1            1  
         1            1        

Unnamed: 0,rH,rG,rF,size,p,desc,map_name,is_slippery,epsilon,gamma,max_iter,qepsilon,lr,qgamma,episodes,initial,env_desc,env_rs,pi_time,pi_V,pi_epochs,pi_policy,pi_policy_arrows,vi_time,vi_V,vi_epochs,vi_policy,vi_policy_arrows,Q_time,Q,Q_epochs,Q_V,Q_policy,Q_policy_arrows
0,-1,1,-0.2,16,0.8,"[SFFFFFHHFFFFFFFF, FFFFHFFHFFFFFFHF, FHFFFFFHF...",16x16,False,1e-08,0.8,10000,0.1,0.6,0.95,10000,1,"[[S, F, F, F, F, F, H, H, F, F, F, F, F, F, F,...","[-0.2, -0.2, -0.2, -0.2, -0.2, -0.2, -1.0, -1....",0.248898,"[-0.99071544970536, -0.9883943121316999, -0.98...",19,"[1, 1, 1, 1, 2, 1, 0, 0, 2, 1, 1, 0, 0, 0, 0, ...","[[↓, ↓, ↓, ↓, →, ↓, ←, ←, →, ↓, ↓, ←, ←, ←, ←,...",0.267423,"[-0.9907154521919761, -0.9883943146183161, -0....",97,"[1, 1, 1, 1, 2, 1, 0, 0, 1, 1, 1, 1, 1, 1, 0, ...","[[↓, ↓, ↓, ↓, →, ↓, ←, ←, ↓, ↓, ↓, ↓, ↓, ↓, ←,...",0.627539,"[[-2.2422605493051204, -2.1498273803827126, -2...",373,"[-2.1498273803827126, -2.052499345395234, -1.9...","[1, 1, 1, 1, 2, 1, 0, 0, 2, 2, 0, 3, 0, 0, 3, ...","[[↓, ↓, ↓, ↓, →, ↓, ←, ←, →, →, ←, ↑, ←, ←, ↑,..."


--Board--

[41mS[0mFFFFFHHFFFFFFFF
FFFFHFFHFFFFFFHF
FHFFFFFHFHFHFFFF
HFFHFFFFFFFFFHFH
FHFFFFHHFFFHHFHF
FHFFHFHFFFFHFFFF
FFHFFFFHFFFHHHHF
HFHFFFFFFFFFFFFH
FFFFFFFFFHFFFFHF
FFFFFFHHFHFFFFFH
FFHFFFFFFHFFFHFF
FFFHHFHFFFFFHFFF
FFFFFHFHFFFFHFFF
HHHHHFFFHFFHFFFF
FHFFHFFFFFFFFFFF
FFHFHHFFHFFFFFFG

--Actions for Position to the Left of the Goal--
{0: [(1.0, 253, -0.2, False)],
 1: [(1.0, 254, -0.2, False)],
 2: [(1.0, 255, 1, True)],
 3: [(1.0, 238, -0.2, False)]}

--Reward Values at Each State--
-0.2  -0.2  -0.2  -0.2  -0.2  -0.2    -1    -1  -0.2  -0.2  -0.2  -0.2  -0.2  -0.2  -0.2  -0.2  
-0.2  -0.2  -0.2  -0.2    -1  -0.2  -0.2    -1  -0.2  -0.2  -0.2  -0.2  -0.2  -0.2    -1  -0.2  
-0.2    -1  -0.2  -0.2  -0.2  -0.2  -0.2    -1  -0.2    -1  -0.2    -1  -0.2  -0.2  -0.2  -0.2  
  -1  -0.2  -0.2    -1  -0.2  -0.2  -0.2  -0.2  -0.2  -0.2  -0.2  -0.2  -0.2    -1  -0.2    -1  
-0.2    -1  -0.2  -0.2  -0.2  -0.2    -1    -1  -0.2  -0.2  -0.2    -1    -1  -0.2    -1  -0.2  
-0.2    -1  -0.2  -0

860 ms ± 214 ms per loop (mean ± std. dev. of 7 runs, 1 loop each)
Q-Learning converged after  538 epochs
--Q with all options--
 -2.24104    -2.14952   -2.14981    -2.24005  
 -2.22497    -2.05248   -2.06145    -2.14604  
 -2.05811    -1.97006   -1.96837    -2.01929  
 -1.96971    -1.87302   -1.87926    -1.90399  
 -1.84543    -1.92027   -1.78457    -1.79144  
 -1.75671     -1.6979   -1.86742    -1.71074  
       -1          -1         -1          -1  
       -1          -1         -1          -1  
  -1.3128   -0.962108   -1.00805   -0.982676  
-0.986608    -1.02501  -0.983322   -0.966229  
 -0.99146   -0.914923  -0.946476   -0.921098  
-0.919767   -0.916666  -0.938462   -0.921098  
-0.922689   -0.924414  -0.941965   -0.921098  
-0.936865   -0.877619   -0.92566   -0.921098  
-0.918484     -1.3128  -0.905815   -0.921098  
-0.936025    -0.91132  -0.921098   -0.921098  
 -2.08538    -2.05233   -2.05235    -2.19395  
 -2.12001       -1.95   -1.96348    -1.97076  
 -1.94218    -1.88042   -

-0.8583  -0.8577       -1  -1.3071   -1.244  -1.2278  -1.1359       -1   -0.888   -0.843    -0.78       -1       -1       -1       -1        1  
     -1  -0.9103       -1  -1.2182  -1.1561  -1.1441  -1.0562  -0.9639  -0.8565  -0.7828   -0.665  -0.5292  -0.4295  -0.3294  -0.1898       -1  
-0.9233  -0.9596   -1.026  -1.1219   -1.084  -1.0339  -0.9771  -0.8716  -0.8333       -1  -0.5445  -0.4517  -0.3869   -0.272       -1        1  
-0.9211  -0.9493  -1.0014  -1.0347  -0.9982  -0.9762       -1       -1  -0.7131       -1  -0.4546  -0.3836   -0.291   -0.114   0.1388       -1  
-0.9211  -0.9418       -1  -0.9768  -0.9376  -0.8906  -0.7865  -0.6917  -0.6249       -1  -0.3253  -0.2991   -0.257       -1   0.4453   0.4673  
-0.9117  -0.9271   -0.903       -1       -1   -0.888       -1  -0.6129  -0.4798   -0.348  -0.1985  -0.2304       -1    0.505   0.7682   1.0742  
 -0.888   -0.888   -0.888   -0.888   -0.888       -1   0.1477       -1  -0.3539    -0.17   0.0676    -0.18       -1   0.5829   0.7

Unnamed: 0,rH,rG,rF,size,p,desc,map_name,is_slippery,epsilon,gamma,max_iter,qepsilon,lr,qgamma,episodes,initial,env_desc,env_rs,pi_time,pi_V,pi_epochs,pi_policy,pi_policy_arrows,vi_time,vi_V,vi_epochs,vi_policy,vi_policy_arrows,Q_time,Q,Q_epochs,Q_V,Q_policy,Q_policy_arrows
0,-1,1,-0.2,16,0.8,"[SFFFFFHHFFFFFFFF, FFFFHFFHFFFFFFHF, FHFFFFFHF...",16x16,False,1e-08,0.8,10000,0.1,0.4,0.95,10000,1,"[[S, F, F, F, F, F, H, H, F, F, F, F, F, F, F,...","[-0.2, -0.2, -0.2, -0.2, -0.2, -0.2, -1.0, -1....",0.238555,"[-0.99071544970536, -0.9883943121316999, -0.98...",19,"[1, 1, 1, 1, 2, 1, 0, 0, 2, 1, 1, 0, 0, 0, 0, ...","[[↓, ↓, ↓, ↓, →, ↓, ←, ←, →, ↓, ↓, ←, ←, ←, ←,...",0.247414,"[-0.9907154521919761, -0.9883943146183161, -0....",97,"[1, 1, 1, 1, 2, 1, 0, 0, 1, 1, 1, 1, 1, 1, 0, ...","[[↓, ↓, ↓, ↓, →, ↓, ←, ←, ↓, ↓, ↓, ↓, ↓, ↓, ←,...",0.859631,"[[-2.2410395891410584, -2.1495235200479392, -2...",538,"[-2.1495235200479392, -2.0524790131094566, -1....","[1, 1, 2, 1, 2, 1, 0, 0, 1, 3, 1, 1, 3, 1, 2, ...","[[↓, ↓, →, ↓, →, ↓, ←, ←, ↓, ↑, ↓, ↓, ↑, ↓, →,..."


--Board--

[41mS[0mFFFFFHHFFFFFFFF
FFFFHFFHFFFFFFHF
FHFFFFFHFHFHFFFF
HFFHFFFFFFFFFHFH
FHFFFFHHFFFHHFHF
FHFFHFHFFFFHFFFF
FFHFFFFHFFFHHHHF
HFHFFFFFFFFFFFFH
FFFFFFFFFHFFFFHF
FFFFFFHHFHFFFFFH
FFHFFFFFFHFFFHFF
FFFHHFHFFFFFHFFF
FFFFFHFHFFFFHFFF
HHHHHFFFHFFHFFFF
FHFFHFFFFFFFFFFF
FFHFHHFFHFFFFFFG

--Actions for Position to the Left of the Goal--
{0: [(1.0, 253, -0.2, False)],
 1: [(1.0, 254, -0.2, False)],
 2: [(1.0, 255, 1, True)],
 3: [(1.0, 238, -0.2, False)]}

--Reward Values at Each State--
-0.2  -0.2  -0.2  -0.2  -0.2  -0.2    -1    -1  -0.2  -0.2  -0.2  -0.2  -0.2  -0.2  -0.2  -0.2  
-0.2  -0.2  -0.2  -0.2    -1  -0.2  -0.2    -1  -0.2  -0.2  -0.2  -0.2  -0.2  -0.2    -1  -0.2  
-0.2    -1  -0.2  -0.2  -0.2  -0.2  -0.2    -1  -0.2    -1  -0.2    -1  -0.2  -0.2  -0.2  -0.2  
  -1  -0.2  -0.2    -1  -0.2  -0.2  -0.2  -0.2  -0.2  -0.2  -0.2  -0.2  -0.2    -1  -0.2    -1  
-0.2    -1  -0.2  -0.2  -0.2  -0.2    -1    -1  -0.2  -0.2  -0.2    -1    -1  -0.2    -1  -0.2  
-0.2    -1  -0.2  -0

1.63 s ± 158 ms per loop (mean ± std. dev. of 7 runs, 1 loop each)
Q-Learning converged after  1044 epochs
--Q with all options--
  -2.24183   -2.14965    -2.14981    -2.24181  
  -2.20648   -2.05249    -2.05791    -2.13976  
  -2.05687   -1.96228    -1.96422    -1.97272  
  -1.88515   -1.87972    -1.87091    -1.88478  
  -1.81386   -1.89686    -1.78035    -1.77765  
  -1.70644   -1.68732    -1.74728    -1.69415  
        -1         -1          -1          -1  
        -1         -1          -1          -1  
 -0.983344  -0.937685   -0.936875   -0.913496  
 -0.910331  -0.923774   -0.905486   -0.913549  
 -0.884467  -0.858049   -0.870952   -0.870667  
 -0.833148  -0.830433   -0.840365   -0.845036  
 -0.815124  -0.792082   -0.814114   -0.819073  
 -0.786366  -0.798621   -0.781556   -0.786942  
 -0.775667   -0.74168   -0.751185   -0.754487  
 -0.739104  -0.751543   -0.743111   -0.754487  
  -2.13339   -2.05235    -2.05235     -2.2211  
  -2.01556      -1.95     -1.9604    -2.11443  
  -1.9

  0.387502   0.358087    0.373333    0.353277  
   0.32471     -0.062     0.27659      -0.062  
  0.220379   0.241961    0.319969    0.192148  
  0.354717   0.347339    0.588685    0.352769  
  0.498271   0.841228    0.554354        0.41  
  0.674689   0.872465    0.742346    0.721728  
  0.836219    1.19117    0.878733    0.842527  
      0.95    1.42339        0.95        0.95  
      0.95     1.9494           1     1.07023  
         1          1           1           1  
         1          1           1           1  
         1          1           1           1  
         1          1           1           1  
         1          1           1           1  
        -1         -1          -1          -1  
      0.41   0.387605     0.37599    0.413942  
   0.39836   0.387605      -0.062    0.373433  
        -1         -1          -1          -1  
    -0.062   0.343729    0.369105    0.306682  
  0.456036   0.476691    0.631873    0.447892  
  0.554932   0.655059     1.10014      0

Unnamed: 0,rH,rG,rF,size,p,desc,map_name,is_slippery,epsilon,gamma,max_iter,qepsilon,lr,qgamma,episodes,initial,env_desc,env_rs,pi_time,pi_V,pi_epochs,pi_policy,pi_policy_arrows,vi_time,vi_V,vi_epochs,vi_policy,vi_policy_arrows,Q_time,Q,Q_epochs,Q_V,Q_policy,Q_policy_arrows
0,-1,1,-0.2,16,0.8,"[SFFFFFHHFFFFFFFF, FFFFHFFHFFFFFFHF, FHFFFFFHF...",16x16,False,1e-08,0.8,10000,0.1,0.2,0.95,10000,1,"[[S, F, F, F, F, F, H, H, F, F, F, F, F, F, F,...","[-0.2, -0.2, -0.2, -0.2, -0.2, -0.2, -1.0, -1....",0.208761,"[-0.99071544970536, -0.9883943121316999, -0.98...",19,"[1, 1, 1, 1, 2, 1, 0, 0, 2, 1, 1, 0, 0, 0, 0, ...","[[↓, ↓, ↓, ↓, →, ↓, ←, ←, →, ↓, ↓, ←, ←, ←, ←,...",0.214375,"[-0.9907154521919761, -0.9883943146183161, -0....",97,"[1, 1, 1, 1, 2, 1, 0, 0, 1, 1, 1, 1, 1, 1, 0, ...","[[↓, ↓, ↓, ↓, →, ↓, ←, ←, ↓, ↓, ↓, ↓, ↓, ↓, ←,...",1.628355,"[[-2.241830936586645, -2.1496544835276685, -2....",1044,"[-2.1496544835276685, -2.0524868143105395, -1....","[1, 1, 1, 2, 3, 1, 0, 0, 3, 2, 1, 1, 1, 2, 1, ...","[[↓, ↓, ↓, →, ↑, ↓, ←, ←, ↑, →, ↓, ↓, ↓, →, ↓,..."


Complete!


# Notes

Default rewards in OpenAI Gym's FrozenLake-v0 are 1 for reaching the G(oal) and 0 for everything else.

Maps are drawn according to the following logic

```
if desc and map_name are both None,
   then a default random map of size 8 is drawn
        using frozen_lake.generate_random_map(size=8, p=0.8)
elif desc is None and a map_name is given
   then map_name must be either '4x4' or '8x8'
        and the map is drawn from the dict MAPS in frozen_lake.py
elif desc is given
   then it must be in the form of a list with one string per grid row,
        all of equal length, using the tile letters S, F, H and G
```

Default action probabilities are 1/3 for the chosen action, 1/3 for each of the two directions at right angles to the chosen action, and 0 for the reverse of the chosen action. This is set with `is_slippery=True`. If `is_slippery=False`, then P=1 for the chosen action and 0 for all other actions.

|ACTION|Value|Symbol|
|------|-----|------|
|LEFT  | 0   | ←    |
|DOWN  | 1   | ↓    |
|RIGHT | 2   | →    |
|UP    | 3   | ↑    |

# Sources

- Environment: <https://gym.openai.com/envs/FrozenLake-v0/>
- Code: <https://github.com/Twice22/HandsOnRL>
- Tutorial: <https://twice22.github.io/>