In [1]:
import gym
import numpy as np
import random

# Initialise the tabular state: one row per state (16), one column per action (4).
# qvalue is the action-value table that training updates and evaluation reads.
qvalue = np.zeros((16, 4))
# Module-level eligibility-trace table with the same shape, also all zeros.
evalue = np.zeros((16, 4))

# Slippery variant (stock environment), kept here as a one-line switch:
# env = gym.make('FrozenLake-v0')

# Non-slippery variant: register the 4x4 FrozenLake map with slipping turned
# off under its own id, then build the environment from that id.
from gym.envs.registration import register

register(
    id='FrozenLakeNotSlippery-v0',
    entry_point='gym.envs.toy_text:FrozenLakeEnv',
    max_episode_steps=100,
    reward_threshold=0.8196,  # optimum = .8196; changing this seems to have no influence
    kwargs=dict(map_name='4x4', is_slippery=False),
)
env = gym.make('FrozenLakeNotSlippery-v0')

# Hyperparameters
alphaLearningRate = 0.3       # step size applied to every TD error
faresightLambda = 0.95        # weight on the next state-action value in the TD target
decayRate = 0.0               # extra per-step factor applied to the eligibility traces
decay_rate_of_epsilon = 1e-3  # rate of the exponential per-episode epsilon schedule
max_epsilon, min_epsilon = 1.0, 1e-5  # upper and lower bounds of that schedule

# Score record
score = []

def choose_action(state, epsilon):
    """Pick an action for ``state`` with an epsilon-greedy rule.

    One uniform draw from ``random.random()`` is compared with ``epsilon``.
    If the draw is greater than or equal to ``epsilon`` the action with the
    largest entry in ``qvalue[state]`` is returned (``np.argmax`` resolves
    ties in favour of the lowest index); otherwise one of the four actions
    is drawn uniformly with ``random.randrange(4)``.

    Args:
        state: row index into the module-level ``qvalue`` table.
        epsilon: exploration threshold; ``0`` always yields the greedy action.

    Returns:
        The chosen action index.
    """
    exploit = random.random() >= epsilon
    return np.argmax(qvalue[state]) if exploit else random.randrange(4)

def SARSAcontrol(episodes=20000):
    """Train the module-level Q-table ``qvalue`` in place with SARSA and eligibility traces.

    Each episode starts from ``env.reset()`` with a fresh, all-zero trace
    table. On every step the TD error

        reward + faresightLambda * Q[s', a'] - Q[s, a]

    is computed for the on-policy next action ``a'``, the trace of the visited
    pair ``(s, a)`` is incremented by one, every table entry is moved by
    ``alphaLearningRate * trace * error``, and all traces are then scaled by
    ``decayRate * faresightLambda``. Actions come from ``choose_action`` with
    an epsilon that starts at ``max_epsilon`` and, after each episode, is set
    to ``min_epsilon + (max_epsilon - min_epsilon) * exp(-decay_rate_of_epsilon * i)``
    for the zero-based episode index ``i``.

    When training finishes, "training end" and the resulting table are printed.

    Args:
        episodes: number of training episodes to run (default 20000).

    Returns:
        None. ``qvalue`` is updated in place.
    """
    epsilon = max_epsilon
    for episode in range(episodes):
        previousState = env.reset()
        previousAction = choose_action(previousState, epsilon)
        # Traces are local and rebuilt per episode so credit never leaks
        # from one episode into the next.
        traces = np.zeros_like(qvalue)
        done = False
        while not done:
            currentState, reward, done, _ = env.step(previousAction)
            currentAction = choose_action(currentState, epsilon)
            error = (reward
                     + faresightLambda * qvalue[currentState][currentAction]
                     - qvalue[previousState][previousAction])
            traces[previousState][previousAction] += 1

            # One elementwise update: entries whose trace is zero receive a
            # zero increment. Subscript form keeps this an in-place write to
            # the module-level table instead of rebinding a local name.
            qvalue[...] += alphaLearningRate * traces * error

            traces *= decayRate * faresightLambda

            previousState = currentState
            previousAction = currentAction
        epsilon = min_epsilon + (max_epsilon - min_epsilon) * np.exp(-decay_rate_of_epsilon * episode)

    print("training end")
    print(qvalue)
    
def evaluation():
    """Run 10000 greedy episodes and print the mean of their final rewards.

    Every action is requested from ``choose_action`` with epsilon ``0``.
    Only the reward returned by the step that ends an episode is recorded;
    the mean of those rewards is printed as ``accuracy: <mean>``.
    """
    final_rewards = []
    for _ in range(10000):
        state = env.reset()
        finished = False
        while not finished:
            greedy_action = choose_action(state, 0)
            state, reward, finished, _info = env.step(greedy_action)
        # The loop exits right after the terminating step, so `reward`
        # holds that step's reward.
        final_rewards.append(reward)

    print("accuracy: {}".format(np.mean(final_rewards)))


In [2]:
# Ten rounds of training, each followed by a greedy evaluation. qvalue is a
# module-level table that is never reset, so every round continues from the
# table left by the previous one, while each SARSAcontrol() call restarts
# its own epsilon schedule.
for batch in range(10):
    SARSAcontrol()
    evaluation()

training end
[[0.59628938 0.77378094 0.54002364 0.58100452]
 [0.54321918 0.         0.26026425 0.30348757]
 [0.34614753 0.16257388 0.08793539 0.20285719]
 [0.13943466 0.         0.05167362 0.05246588]
 [0.65781859 0.81450625 0.         0.57031194]
 [0.         0.         0.         0.        ]
 [0.         0.89672656 0.         0.17839505]
 [0.         0.         0.         0.        ]
 [0.64936244 0.         0.857375   0.57724564]
 [0.71768466 0.9025     0.712975   0.        ]
 [0.83633485 0.95       0.         0.7900671 ]
 [0.         0.         0.         0.        ]
 [0.         0.         0.         0.        ]
 [0.         0.88346659 0.95       0.83040634]
 [0.89166222 0.9497491  1.         0.87916188]
 [0.         0.         0.         0.        ]]
accuracy: 1.0
training end
[[0.62154908 0.58763485 0.77378094 0.62434858]
 [0.64238321 0.         0.81450625 0.59464561]
 [0.57002464 0.857375   0.5721133  0.64435625]
 [0.58006389 0.         0.40733216 0.44950603]
 [0.3285734  0.3841