In [27]:
import gym
from gym.envs.toy_text.frozen_lake import generate_random_map
import numpy as np
from tqdm.auto import tqdm
import time
from IPython.display import clear_output

In [36]:

def create_frozen_lake_env(map_name, map_size=4, is_slippery=False):
    """Build a ``FrozenLake-v1`` environment on a freshly generated random map.

    Args:
        map_name: Forwarded to ``gym.make`` as ``map_name``.
            NOTE(review): an explicit ``desc`` is always supplied as well; per
            gym's FrozenLake docs ``map_name`` is presumably only consulted
            when ``desc`` is None, so it may have no effect here -- confirm.
        map_size: Side length handed to ``generate_random_map``.
        is_slippery: Forwarded to ``gym.make`` as ``is_slippery``.

    Returns:
        Whatever ``gym.make`` returns, created with ``render_mode="ansi"``.
    """
    # A new random layout is drawn on every call.
    random_layout = generate_random_map(size=map_size)
    env_options = {
        "desc": random_layout,
        "map_name": map_name,
        "is_slippery": is_slippery,
        "render_mode": "ansi",
    }
    return gym.make("FrozenLake-v1", **env_options)


def train(env, n_episode=10_000, alpha=0.5, gamma=0.5):
    """Learn a tabular Q-function for ``env`` with one-step Q-learning.

    Args:
        env: Environment with discrete ``observation_space.n`` and
            ``action_space.n``, whose ``reset()`` returns ``(state, info)``
            and whose ``step(action)`` returns
            ``(state, reward, terminated, truncated, info)``.
        n_episode: Number of episodes to run.
        alpha: Learning rate applied to the temporal-difference error.
        gamma: Discount factor applied to the best next-state value.

    Returns:
        ``numpy.ndarray`` of shape ``(n_states, n_actions)`` holding the
        learned action values.
    """
    q_table = np.zeros((env.observation_space.n, env.action_space.n))

    for _ in tqdm(range(n_episode)):
        state = env.reset()[0]
        done = False

        while not done:
            action_values = q_table[state]
            # Act greedily once anything has been learned for this state;
            # otherwise (all values still 0) fall back to a random action.
            if np.all(action_values == 0):
                action = env.action_space.sample()
            else:
                action = np.argmax(action_values)

            new_state, reward, terminated, truncated, _info = env.step(action)

            # Q(s, a) <- Q(s, a) + alpha * (r + gamma * max_a' Q(s', a') - Q(s, a))
            td_target = reward + gamma * np.max(q_table[new_state])
            q_table[state, action] += alpha * (td_target - q_table[state, action])

            state = new_state
            # The episode ends on a terminal state OR on truncation (e.g. a
            # step limit); either way the env must be reset before stepping.
            done = terminated or truncated
    return q_table

def play(env, q_table):
    """Run one episode with the policy stored in ``q_table`` and animate it.

    After every step the cell output is cleared, ``env.render()`` is printed
    and the loop sleeps for 0.25 s so the frames can be followed by eye.

    Args:
        env: Environment whose ``reset()`` returns ``(state, info)`` and whose
            ``step(action)`` returns
            ``(state, reward, terminated, truncated, info)``.
        q_table: Action values indexed as ``q_table[state]``.
    """
    state = env.reset()[0]
    done = False

    while not done:
        state_list = q_table[state]
        # Same rule as in training: greedy action, or a random one when
        # nothing has been learned for this state (all values are 0).
        if np.all(state_list == 0):
            action = env.action_space.sample()
        else:
            action = np.argmax(state_list)

        state, _reward, terminated, truncated, _info = env.step(action)
        # Stop on a terminal state OR on truncation (e.g. a step limit);
        # stepping past a truncated episode is not valid.
        done = terminated or truncated

        clear_output(wait=True)
        print(env.render())
        time.sleep(0.25)


In [42]:
# Build a FrozenLake environment on a randomly generated 4x4 map (non-slippery by default).
# NOTE(review): "normal" is forwarded as map_name; since a random `desc` is always
# supplied too, this value presumably has no effect -- confirm against the gym docs.
env = create_frozen_lake_env("normal", map_size=4)
# Learn the Q-table using train()'s default hyperparameters.
q_table = train(env)
# Replay one episode with the learned table, rendering each step as text.
play(env, q_table)

  (Down)
SFFF
HHFF
FHFF
HFF[41mG[0m

