In [1]:
import numpy as np
import gym

In [2]:
# Shared environment instance; the training calls and the rendering loop further
# down all operate on this one object.
# NOTE(review): the "-v0" id is only registered by older Gym releases — confirm
# against the installed gym version.
env = gym.make("FrozenLake-v0")

In [3]:
def decay_schedule(init_value,
                   min_value,
                   decay_ratio,
                   max_steps,
                   log_start=-2,
                   log_base=10):
    """Build a per-step schedule that decays from ``init_value`` to ``min_value``.

    The first ``int(max_steps * decay_ratio)`` entries follow a reversed
    log-spaced curve rescaled to span ``[min_value, init_value]``; the
    remaining entries repeat the final (minimum) value.

    Args:
        init_value: value of the first entry of the schedule.
        min_value: value the schedule decays to and then holds.
        decay_ratio: fraction of ``max_steps`` spent decaying.
        max_steps: total length of the returned array.
        log_start: exponent of the smallest point of the log-spaced curve
            (the largest is always ``log_base ** 0``).
        log_base: base of the log-spaced curve.

    Returns:
        1-D float ``numpy.ndarray`` of length ``max_steps``.
    """
    decay_steps = int(max_steps * decay_ratio)
    rem_steps = max_steps - decay_steps

    if decay_steps < 2:
        # A curve needs at least two points: normalising a single point divides
        # by zero and an empty array has no min/max. Fall back to at most one
        # step at init_value followed by the floor value.
        head = np.full(decay_steps, init_value, dtype=float)
        tail = np.full(rem_steps, min_value, dtype=float)
        return np.concatenate((head, tail))

    values = np.logspace(log_start,
                         0,
                         decay_steps,
                         base=log_base,
                         endpoint=True)[::-1]
    # Normalise the curve to [0, 1] ...
    values = (values - values.min()) / (values.max() - values.min())
    # ... then stretch it onto [min_value, init_value] so the caller's bounds
    # are actually honoured.
    values = (init_value - min_value) * values + min_value
    # Hold the last (minimum) value for the steps after the decay phase.
    values = np.pad(values, (0, rem_steps), 'edge')
    return values

In [19]:
def generate_trajectory(env, select_action, Q, epsilon, max_steps=200):
    """Roll out one complete episode and return its transitions.

    The environment is reset and stepped with ``select_action`` until it
    reports ``done``. If an attempt has not terminated after ``max_steps``
    steps, it is discarded and a fresh episode is started, so the returned
    trajectory always ends in a terminal transition and never holds more than
    ``max_steps`` rows.

    Args:
        env: object exposing ``reset()`` and ``step(action)``, where ``step``
            returns ``(next_state, reward, done, info)``.
        select_action: callable ``(state, Q, epsilon) -> action``.
        Q: action-value table handed through to ``select_action``.
        epsilon: exploration rate handed through to ``select_action``.
        max_steps: maximum number of steps allowed for a single attempt.

    Returns:
        ``numpy.ndarray`` of dtype ``object`` with one row per step, laid out
        as ``(state, action, reward, next_state)``.

    Raises:
        ValueError: if ``max_steps`` is smaller than 1 (no step could ever be
            taken, so the episode could never finish).
    """
    if max_steps < 1:
        raise ValueError("max_steps must be at least 1")

    done = False
    while not done:
        # Every attempt starts from a clean slate; a previous attempt that ran
        # out of steps is thrown away rather than silently extended.
        trajectory = []
        state = env.reset()
        for _step in range(max_steps):
            action = select_action(state, Q, epsilon)
            next_state, reward, done, _info = env.step(action)
            trajectory.append((state, action, reward, next_state))
            if done:
                break
            state = next_state
    # `np.object` was removed from NumPy; the builtin `object` is the dtype.
    return np.array(trajectory, dtype=object)

In [20]:
def mc_control(env,
               gamma=0.99,
               init_alpha=0.5,
               min_alpha=0.01,
               alpha_decay_ratio=0.5,
               init_epsilon=1.0,
               min_epsilon=0.1,
               epsilon_decay_ratio=0.9,
               n_episodes=3000,
               max_steps=200,
               first_visit=True
              ):
    """Monte Carlo control with an epsilon-greedy behaviour policy.

    For each episode a full trajectory is sampled, the discounted return from
    every (state, action) occurrence is computed, and ``Q`` is moved towards
    that return with the episode's step size.

    Args:
        env: environment with discrete ``observation_space`` / ``action_space``.
        gamma: discount factor.
        init_alpha, min_alpha, alpha_decay_ratio: forwarded to
            ``decay_schedule`` to build the per-episode step sizes.
        init_epsilon, min_epsilon, epsilon_decay_ratio: forwarded to
            ``decay_schedule`` to build the per-episode exploration rates.
        n_episodes: number of episodes to sample.
        max_steps: step budget forwarded to ``generate_trajectory``; also the
            initial size of the discount table.
        first_visit: when true, only the first occurrence of each
            (state, action) pair in an episode updates ``Q``.

    Returns:
        ``(Q, V, policy, Q_track, policy_track)`` where ``Q`` is the final
        action-value table, ``V`` its row-wise maximum, ``policy`` a callable
        mapping a state to its greedy action, ``Q_track`` a copy of ``Q``
        after every episode and ``policy_track`` the greedy actions after
        every episode.
    """
    nS, nA = env.observation_space.n, env.action_space.n

    # discounts[k] == gamma ** k for k = 0 .. max_steps - 1
    discounts = np.logspace(0,
                            max_steps,
                            num=max_steps,
                            base=gamma,
                            endpoint=False)

    alphas = decay_schedule(init_alpha,
                            min_alpha,
                            alpha_decay_ratio,
                            n_episodes)
    epsilons = decay_schedule(init_epsilon,
                              min_epsilon,
                              epsilon_decay_ratio,
                              n_episodes)

    policy_track = []
    Q = np.zeros((nS, nA))
    Q_track = np.zeros((n_episodes, nS, nA))

    def select_action(state, Q, epsilon):
        # Epsilon-greedy: exploit when the draw exceeds epsilon, else explore.
        if np.random.random() > epsilon:
            return np.argmax(Q[state])
        return np.random.randint(len(Q[state]))

    for e in range(n_episodes):
        # Forward max_steps so the rollout budget matches the discount table.
        trajectory = generate_trajectory(env, select_action, Q, epsilons[e], max_steps)

        if len(trajectory) > len(discounts):
            # Guard against a trajectory longer than the precomputed discount
            # table so the element-wise product below cannot hit a shape
            # mismatch.
            discounts = np.logspace(0,
                                    len(trajectory),
                                    num=len(trajectory),
                                    base=gamma,
                                    endpoint=False)

        visited = np.zeros((nS, nA), dtype=bool)

        for t, (state, action, reward, next_state) in enumerate(trajectory):
            if visited[state][action] and first_visit:
                continue
            visited[state][action] = True

            # Column 2 of each (state, action, reward, next_state) row is the reward.
            rewards = trajectory[t:, 2].astype(float)
            G = np.sum(discounts[:len(rewards)] * rewards)
            Q[state][action] += alphas[e] * (G - Q[state][action])

        Q_track[e] = Q
        policy_track.append(np.argmax(Q, axis=1))

    V = np.max(Q, axis=1)
    policy = lambda s: {s: a for s, a in enumerate(np.argmax(Q, axis=1))}[s]
    return Q, V, policy, Q_track, policy_track

In [33]:
def sarsa(env,
          gamma=0.99,
          init_alpha=0.5,
          min_alpha=0.01,
          alpha_decay_ratio=0.5,
          init_epsilon=1.0,
          min_epsilon=0.1,
          epsilon_decay_ratio=0.9,
          n_ep=3000):
    """On-policy one-step TD control (SARSA) with epsilon-greedy exploration.

    Each update moves ``Q[state][action]`` towards
    ``reward + gamma * Q[next_state][next_action]``, where ``next_action`` is
    the action the same epsilon-greedy policy picks in ``next_state`` and is
    the action actually executed on the following step.

    Args:
        env: environment with discrete ``observation_space`` / ``action_space``
            whose ``step`` returns ``(next_state, reward, done, info)``.
        gamma: discount factor.
        init_alpha, min_alpha, alpha_decay_ratio: forwarded to
            ``decay_schedule`` to build the per-episode step sizes.
        init_epsilon, min_epsilon, epsilon_decay_ratio: forwarded to
            ``decay_schedule`` to build the per-episode exploration rates.
        n_ep: number of training episodes.

    Returns:
        ``(Q, V, policy_, Q_track, policy_track)`` where ``Q`` is the final
        action-value table, ``V`` its row-wise maximum, ``policy_`` a callable
        mapping a state to its greedy action, ``Q_track`` a copy of ``Q``
        after every episode and ``policy_track`` the greedy actions after
        every episode.
    """
    nS = env.observation_space.n
    nA = env.action_space.n
    Q = np.zeros((nS, nA))
    Q_track = np.zeros((n_ep, nS, nA))
    policy_track = []

    def select_action(state, Q, epsilon):
        # Epsilon-greedy: exploit when the draw exceeds epsilon, else explore.
        if np.random.random() > epsilon:
            return np.argmax(Q[state])
        return np.random.randint(len(Q[state]))

    alphas = decay_schedule(init_alpha,
                            min_alpha,
                            alpha_decay_ratio,
                            n_ep)
    epsilons = decay_schedule(init_epsilon,
                              min_epsilon,
                              epsilon_decay_ratio,
                              n_ep)

    for e in range(n_ep):
        state, done = env.reset(), False
        # SARSA commits to an action before stepping so that the action used
        # in the target is the one that really gets executed next.
        action = select_action(state, Q, epsilons[e])
        while not done:
            next_state, reward, done, _ = env.step(action)
            next_action = select_action(next_state, Q, epsilons[e])
            # No bootstrapping past the end of the episode.
            td_target = reward + gamma * Q[next_state][next_action] * (not done)
            td_error = td_target - Q[state][action]
            Q[state][action] += alphas[e] * td_error
            state, action = next_state, next_action
        Q_track[e] = Q
        # argmax yields the index of the best action, max yields its value.
        policy_track.append(np.argmax(Q, axis=1))

    V = np.max(Q, axis=1)
    policy_ = lambda s: {s: a for s, a in enumerate(np.argmax(Q, axis=1))}[s]
    return Q, V, policy_, Q_track, policy_track

In [40]:
def q_learning(env,
               gamma=0.99,
               init_alpha=0.5,
               min_alpha=0.01,
               alpha_decay_ratio=0.5,
               init_epsilon=1.0,
               min_epsilon=0.1,
               epsilon_decay_ratio=0.9,
               n_ep=3000):
    """Off-policy one-step TD control (Q-learning) with epsilon-greedy exploration.

    Every transition moves ``Q[state][action]`` towards
    ``reward + gamma * max(Q[next_state])``; the bootstrap term is zeroed when
    the environment reports the episode as done.

    Args:
        env: environment with discrete ``observation_space`` / ``action_space``
            whose ``step`` returns ``(next_state, reward, done, info)``.
        gamma: discount factor.
        init_alpha, min_alpha, alpha_decay_ratio: forwarded to
            ``decay_schedule`` to build the per-episode step sizes.
        init_epsilon, min_epsilon, epsilon_decay_ratio: forwarded to
            ``decay_schedule`` to build the per-episode exploration rates.
        n_ep: number of training episodes.

    Returns:
        ``(Q, V, policy_, Q_track, policy_track)`` where ``Q`` is the final
        action-value table, ``V`` its row-wise maximum, ``policy_`` a callable
        mapping a state to its greedy action, ``Q_track`` a copy of ``Q``
        after every episode and ``policy_track`` the greedy actions after
        every episode.
    """
    n_states = env.observation_space.n
    n_actions = env.action_space.n

    def epsilon_greedy(state, q_values, epsilon):
        # The uniform draw comes first; only an exploring step consumes a
        # second random number for the action index.
        if np.random.random() > epsilon:
            return np.argmax(q_values[state])
        return np.random.randint(len(q_values[state]))

    step_sizes = decay_schedule(init_alpha, min_alpha, alpha_decay_ratio, n_ep)
    explore_rates = decay_schedule(init_epsilon, min_epsilon, epsilon_decay_ratio, n_ep)

    Q = np.zeros((n_states, n_actions))
    Q_track = np.zeros((n_ep, n_states, n_actions))
    policy_track = []

    for episode in range(n_ep):
        state = env.reset()
        finished = False
        while not finished:
            action = epsilon_greedy(state, Q, explore_rates[episode])
            next_state, reward, finished, _ = env.step(action)
            # Greedy bootstrap from the next state, masked at episode end.
            target = reward + gamma * Q[next_state].max() * (not finished)
            Q[state][action] += step_sizes[episode] * (target - Q[state][action])
            state = next_state
        Q_track[episode] = Q
        policy_track.append(Q.argmax(axis=1))

    V = Q.max(axis=1)

    def policy_(s):
        # Greedy action lookup, rebuilt from Q on every call.
        return dict(enumerate(np.argmax(Q, axis=1)))[s]

    return Q, V, policy_, Q_track, policy_track

In [50]:
def double_q_learning(env,
                      gamma=0.99,
                      init_alpha=0.5,
                      min_alpha=0.01,
                      alpha_decay_ratio=0.5,
                      init_epsilon=1.0,
                      min_epsilon=0.1,
                      epsilon_decay_ratio=0.9,
                      n_ep=3000):
    """Double Q-learning with epsilon-greedy exploration on the averaged table.

    Two tables are maintained. On each step a fair coin picks which table to
    update; that table chooses the greedy next action while the *other* table
    supplies its value for the TD target.

    Args:
        env: environment with discrete ``observation_space`` / ``action_space``
            whose ``step`` returns ``(next_state, reward, done, info)``.
        gamma: discount factor.
        init_alpha, min_alpha, alpha_decay_ratio: forwarded to
            ``decay_schedule`` to build the per-episode step sizes.
        init_epsilon, min_epsilon, epsilon_decay_ratio: forwarded to
            ``decay_schedule`` to build the per-episode exploration rates.
        n_ep: number of training episodes.

    Returns:
        ``(Q, V, policy_, Q_track, policy_track)`` where ``Q`` is the mean of
        the two final tables, ``V`` its row-wise maximum, ``policy_`` a
        callable mapping a state to its greedy action, ``Q_track`` the mean of
        the two tables after every episode and ``policy_track`` the greedy
        actions of that mean after every episode.
    """
    nS, nA = env.observation_space.n, env.action_space.n
    Q1 = np.zeros((nS, nA))
    Q2 = np.zeros((nS, nA))
    policy_track = []
    Q1_track = np.zeros((n_ep, nS, nA))
    Q2_track = np.zeros((n_ep, nS, nA))

    def select_action(state, Q, epsilon):
        # Epsilon-greedy: exploit when the draw exceeds epsilon, else explore.
        if np.random.random() > epsilon:
            return np.argmax(Q[state])
        return np.random.randint(len(Q[state]))

    alphas = decay_schedule(init_alpha,
                            min_alpha,
                            alpha_decay_ratio,
                            n_ep)
    epsilons = decay_schedule(init_epsilon,
                              min_epsilon,
                              epsilon_decay_ratio,
                              n_ep)

    for e in range(n_ep):
        state, done = env.reset(), False
        while not done:
            # Behave greedily with respect to the average of both estimates.
            action = select_action(state, (Q1 + Q2) / 2, epsilons[e])
            next_state, reward, done, _ = env.step(action)

            if np.random.randint(2):
                # Q1 picks the action, Q2 evaluates it.
                best_next = np.argmax(Q1[next_state])
                td_target = reward + gamma * Q2[next_state][best_next] * (not done)
                td_error = td_target - Q1[state][action]
                Q1[state][action] += alphas[e] * td_error
            else:
                # Q2 picks the action, Q1 evaluates it.
                best_next = np.argmax(Q2[next_state])
                td_target = reward + gamma * Q1[next_state][best_next] * (not done)
                td_error = td_target - Q2[state][action]
                Q2[state][action] += alphas[e] * td_error

            state = next_state

        Q1_track[e] = Q1
        Q2_track[e] = Q2
        # Parenthesise the sum: the greedy policy is taken over the mean table.
        policy_track.append(np.argmax((Q1 + Q2) / 2, axis=1))

    Q = (Q1 + Q2) / 2
    V = np.max(Q, axis=1)
    # Build the history locally instead of leaking in a notebook-level name.
    Q_track = (Q1_track + Q2_track) / 2
    policy_ = lambda s: {s: a for s, a in enumerate(np.argmax(Q, axis=1))}[s]
    return Q, V, policy_, Q_track, policy_track

In [22]:
# Train with Monte Carlo control using the function's default hyper-parameters.
# The results are bound to notebook-level names that the other training cells reuse.
Q, V, POLICY, Q_track, policy_track = mc_control(env)

In [34]:
# Train with SARSA (one-step TD control) using the function's default hyper-parameters.
# Rebinds the same notebook-level result names as the other training cells.
Q, V, POLICY, Q_track, policy_track = sarsa(env)  # TD(0)

In [41]:
# Train with Q-learning using the function's default hyper-parameters.
# Rebinds the same notebook-level result names as the other training cells.
Q, V, POLICY, Q_track, policy_track = q_learning(env)

In [58]:
# Train with double Q-learning using the function's default hyper-parameters.
# Rebinds the same notebook-level result names as the other training cells.
Q, V, POLICY, Q_track, policy_track = double_q_learning(env)

In [60]:
# Alias whichever Q-table was bound to `Q` last; the rendering loop reads `q_table`.
q_table = Q

In [61]:
# Replay ten episodes with the greedy policy read from `q_table`, redrawing the
# grid in place after every step.
import time
from IPython.display import clear_output

maxStepsPerEpisode = 100

try:
    for episode in range(10):
        state = env.reset()
        print("EPISODE: ", episode+1)
        time.sleep(2)

        for step in range(maxStepsPerEpisode):
            clear_output(wait=True)
            env.render()
            time.sleep(0.5)

            action = np.argmax(q_table[state, :])
            new_state, reward, done, info = env.step(action)

            if done:
                clear_output(wait=True)
                env.render()
                if reward == 1:
                    print("successfull goal")
                    time.sleep(2)
                elif not info.get("TimeLimit.truncated", False):
                    # FrozenLake pays 0 (never -1) for falling into a hole, so
                    # a finished episode without the goal reward is a hole
                    # unless the step limit cut it short.
                    print("hole")
                    time.sleep(2)
                clear_output(wait=True)
                break

            # Advance inside the step loop; otherwise every action would be
            # chosen for the start state.
            state = new_state
finally:
    # Release the environment even when the replay is interrupted.
    env.close()


[41mS[0mFFF
FHFH
FFFH
HFFG


KeyboardInterrupt: 