In [1]:
import numpy as np
import gym

In [2]:
def decay_schedule(init_value,
                   min_value,
                   decay_ratio,
                   max_steps,
                   log_start = -2,
                   log_base=10
):
    """Build a per-step schedule that decays from ``init_value`` to ``min_value``.

    The first ``int(max_steps * decay_ratio)`` entries follow a reversed
    log-spaced curve that starts at ``init_value`` and ends at ``min_value``;
    the remaining entries repeat the last decayed value.

    Parameters
    ----------
    init_value : float
        Value of the first entry of the schedule.
    min_value : float
        Value the schedule decays to and then holds.
    decay_ratio : float
        Fraction of ``max_steps`` spent decaying.
    max_steps : int
        Total length of the returned schedule.
    log_start : float, optional
        Start exponent handed to ``np.logspace`` (the stop exponent is 0).
    log_base : float, optional
        Base handed to ``np.logspace``.

    Returns
    -------
    numpy.ndarray
        1-D float array of length ``max_steps``.
    """
    decay_steps = int(max_steps * decay_ratio)
    rem_steps = max_steps - decay_steps

    # No decay phase at all: the schedule is already at its floor.
    if decay_steps <= 0:
        return np.full(max_steps, min_value, dtype=float)

    values = np.logspace(log_start,
                         0,
                         decay_steps,
                         base = log_base,
                         endpoint = True)[::-1]

    # Normalize the curve to [0, 1]. With a single decay step the span is
    # zero; treat that point as fully decayed instead of producing 0/0 = NaN.
    span = values.max() - values.min()
    if span > 0:
        values = (values - values.min()) / span
    else:
        values = np.zeros_like(values)

    # Map the normalized curve onto [min_value, init_value].
    values = (init_value - min_value) * values + min_value

    # Hold the final decayed value for the rest of the schedule.
    values = np.pad(values, (0, rem_steps), 'edge')
    return values

In [3]:
def generate_trajectory(env, select_action, Q, epsilon, max_steps=200):
    """Roll out one complete episode and return its experience tuples.

    An attempt that does not reach a terminal state within ``max_steps``
    steps is discarded and a fresh episode is started from ``env.reset()``,
    so the returned trajectory always runs from a start state to a terminal
    state and has at most ``max_steps`` entries.

    Parameters
    ----------
    env : object
        Environment exposing ``reset() -> state`` and
        ``step(action) -> (next_state, reward, done, info)``.
    select_action : callable
        Called as ``select_action(state, Q, epsilon)``; returns an action.
    Q : any
        Passed through to ``select_action`` unchanged.
    epsilon : float
        Passed through to ``select_action`` unchanged.
    max_steps : int, optional
        Maximum number of steps allowed in a single attempt.

    Returns
    -------
    numpy.ndarray
        Object array with one ``(state, action, reward, next_state)`` row
        per step.

    Raises
    ------
    ValueError
        If ``max_steps`` is smaller than 1 (no episode could ever finish).
    """
    if max_steps < 1:
        raise ValueError("max_steps must be at least 1")

    while True:
        trajectory = []
        state = env.reset()
        for _step in range(max_steps):
            action = select_action(state, Q, epsilon)
            next_state, reward, done, _ = env.step(action)
            trajectory.append((state, action, reward, next_state))
            if done:
                # `np.object` was removed from NumPy; the builtin `object`
                # is the supported spelling of the same dtype.
                return np.array(trajectory, dtype=object)
            state = next_state
        # Step budget exhausted without termination: drop this attempt and
        # start over with a fresh episode.

In [16]:
def dyna_q(env,
         gamma=0.99,
         init_alpha=0.5,
         min_alpha=0.01,
         alpha_decay_ratio=0.5,
         init_epsilon=1.0,
         min_epsilon=0.1,
         epsilon_decay_ratio=0.5,
         n_planning=7,
         n_ep=3000):
    """Train a tabular Dyna-Q agent on a discrete environment.

    After every real environment step the agent (1) updates a count-based
    transition model and a running-mean reward model, (2) applies a
    Q-learning update from the real transition, and (3) applies up to
    ``n_planning`` additional Q-learning updates from transitions sampled
    out of the learned model.

    Parameters
    ----------
    env : object
        Environment with ``observation_space.n``, ``action_space.n``,
        ``reset() -> state`` and
        ``step(action) -> (next_state, reward, done, info)``.
    gamma : float
        Discount factor used in the TD targets.
    init_alpha, min_alpha, alpha_decay_ratio : float
        Arguments forwarded to ``decay_schedule`` to build the per-episode
        learning rates.
    init_epsilon, min_epsilon, epsilon_decay_ratio : float
        Arguments forwarded to ``decay_schedule`` to build the per-episode
        exploration rates for epsilon-greedy action selection.
    n_planning : int
        Maximum number of model-based updates after each real step.
    n_ep : int
        Number of training episodes.

    Returns
    -------
    Q : numpy.ndarray
        Action-value table of shape ``(nS, nA)``.
    V : numpy.ndarray
        ``Q`` maximized over actions, shape ``(nS,)``.
    """
    nS, nA = env.observation_space.n, env.action_space.n
    Q = np.zeros((nS, nA))
    # Learned model: visit counts and running-mean reward for every
    # (state, action, next_state) triple.
    transition_count = np.zeros((nS, nA, nS))
    reward_model = np.zeros((nS, nA, nS))

    def select_action(state, Q, epsilon):
        # Epsilon-greedy: exploit when the draw exceeds epsilon, else explore.
        if np.random.random() > epsilon:
            return np.argmax(Q[state])
        return np.random.randint(len(Q[state]))

    alphas = decay_schedule(init_alpha, min_alpha, alpha_decay_ratio, n_ep)
    epsilons = decay_schedule(init_epsilon, min_epsilon, epsilon_decay_ratio, n_ep)

    for e in range(n_ep):
        state = env.reset()
        done = False
        while not done:
            action = select_action(state, Q, epsilons[e])
            next_state, reward, done, _ = env.step(action)

            # --- model learning -------------------------------------------
            transition_count[state, action, next_state] += 1
            reward_diff = reward - reward_model[state, action, next_state]
            reward_model[state, action, next_state] += (
                reward_diff / transition_count[state, action, next_state])

            # --- direct RL (Q-learning, hence the max) --------------------
            # `(not done)` zeroes the bootstrap term on terminal transitions.
            td_target = reward + gamma * np.max(Q[next_state]) * (not done)
            Q[state, action] += alphas[e] * (td_target - Q[state, action])

            # --- planning -------------------------------------------------
            # The model does not change while planning, so the set of states
            # with at least one recorded transition is computed once.
            visited_states = np.where(
                np.sum(transition_count, axis=(1, 2)) > 0)[0]
            for _ in range(n_planning):
                # Nothing has been learned yet: every simulated update would
                # be a no-op, so skip planning. (The call parentheses matter:
                # comparing the bound method `Q.sum` to 0 is always False.)
                if Q.sum() == 0:
                    break
                # Simulated experience uses its own names so the real
                # `state`/`next_state` are left untouched.
                sim_state = np.random.choice(visited_states)
                actions_taken = np.where(
                    np.sum(transition_count[sim_state], axis=1) > 0)[0]
                sim_action = np.random.choice(actions_taken)
                counts = transition_count[sim_state, sim_action]
                probs = counts / counts.sum()
                sim_next = np.random.choice(np.arange(nS), size=1, p=probs)[0]
                sim_reward = reward_model[sim_state, sim_action, sim_next]
                sim_target = sim_reward + gamma * np.max(Q[sim_next])
                Q[sim_state, sim_action] += alphas[e] * (
                    sim_target - Q[sim_state, sim_action])

            state = next_state

    # Greedy state values, derived once from the final table.
    V = np.max(Q, axis=1)
    return Q, V

In [17]:
# Create the FrozenLake environment and train a Dyna-Q agent on it for
# 10,000 episodes. `Q` is the learned action-value table and `V` is the
# per-state maximum of `Q` over actions (both returned by dyna_q).
# NOTE(review): no random seed is set in the cells shown here, so the
# learned table will differ from run to run.
env = gym.make("FrozenLake-v0")
Q,V = dyna_q(env, n_ep=10000)

In [20]:
# Replay the greedy policy derived from the learned Q-table for 10 episodes,
# rendering every step in place.
import time
from IPython.display import clear_output

q_table = Q

maxStepsPerEpisode = 100

try:
    for episode in range(10):
        state = env.reset()
        print("EPISODE: ", episode+1)
        time.sleep(2)

        for step in range(maxStepsPerEpisode):
            # Redraw the board in place instead of appending a new frame.
            clear_output(wait=True)
            env.render()
            time.sleep(0.5)

            # Always act greedily with respect to the learned values.
            action = np.argmax(q_table[state, :])
            new_state, reward, done, info = env.step(action)
            state = new_state
            if done:
                clear_output(wait=True)
                env.render()
                if reward == 1:
                    print("successfull goal")
                else:
                    # FrozenLake pays 0 (not -1) when the episode ends
                    # without reaching the goal, so every non-goal ending
                    # is reported here; this also covers an episode cut
                    # off by the environment's own step limit.
                    print("hole")
                time.sleep(2)
                clear_output(wait=True)
                break
finally:
    # Release the environment even if playback is interrupted
    # (e.g. by stopping the cell).
    env.close()

  (Left)
[41mS[0mFFF
FHFH
FFFH
HFFG


KeyboardInterrupt: 

In [None]:
#Q