### Model-Free Control

<img src="question.jpeg">

In [2]:
from gridworld import GridWorld
import numpy as np
import time

pygame 1.9.6
Hello from the pygame community. https://www.pygame.org/contribute.html


In [3]:
# ASCII layout handed to GridWorld, one character per grid cell.
# NOTE(review): 'w' presumably marks walls, 'a' the agent start and 'g' the
# goal — confirm against the gridworld module.
world=\
    """
    wwwwwwwwww
    wa       w
    w    wwwww
    wwww     w
    w        w
    w        w
    w        w
    w g  wwwww
    w        w
    wwwwwwwwww
    """
env=GridWorld(world)
# NOTE(review): this overrides a private attribute of the environment;
# presumably the maximum number of steps per episode — confirm.
env._max_epi_step=400

In [4]:
# Render the environment to inspect the layout visually.
env.render()

In [5]:
# Close the environment again once the layout has been inspected.
env.close()

<img src="problem_window.png">

### Monte Carlo

In [7]:
def episode(policy):
    """Roll out one episode in the global ``env`` by following ``policy``.

    Parameters
    ----------
    policy : sequence indexed by state
        ``policy[state]`` is the action taken whenever the agent is in ``state``.

    Returns
    -------
    list of tuple
        One ``(state, action, reward)`` triple per step, in the order taken.
        The last entry is the step on which ``env.step`` reported ``done``.
    """
    trajectory = []
    state = env.reset()
    while True:
        chosen = policy[state]
        # env.step yields (next_state, reward, done, info); info is not used.
        successor, reward, finished, _ = env.step(chosen)
        trajectory.append((state, chosen, reward))
        if finished:
            return trajectory
        state = successor
        

In [8]:
def returns(episodes,gamma):
    """Compute the discounted return that follows every step of an episode.

    Parameters
    ----------
    episodes : sequence of (state, action, reward)
        The steps of a single episode in chronological order.
    gamma : float
        Discount factor applied per time step.

    Returns
    -------
    list of tuple
        ``(state, action, G)`` for every step, in the same order as the input,
        where ``G = r_t + gamma*r_{t+1} + gamma**2*r_{t+2} + ...``.
        An empty episode yields an empty list.
    """
    discounted=[]
    running=0.0
    # Walk the episode backwards so each return is built from the one after
    # it (G_t = r_t + gamma * G_{t+1}). This is a single O(n) pass instead of
    # re-summing the whole tail of the episode for every step; results can
    # differ from the explicit power sum only by floating-point rounding.
    for s,a,r in reversed(episodes):
        running=r+gamma*running
        discounted.append((s,a,running))
    discounted.reverse()
    return discounted
            

In [9]:
def greedy(Q,e,actions):
    """Sample an epsilon-greedy action for every state (row) of ``Q``.

    Parameters
    ----------
    Q : 2-D numpy array
        Action values; rows are indexed by state, columns by action.
    e : float
        Exploration rate. With probability ``1 - e`` the argmax action of the
        row is kept, otherwise an action is drawn from ``actions``.
    actions : sequence
        Candidate actions for the exploratory draw.

    Returns
    -------
    list
        One chosen action per row of ``Q``.
    """
    best_per_state = np.argmax(Q, axis=1)
    sampled = []
    for state in range(Q.shape[0]):
        # One uniform draw per state decides exploit vs. explore; a random
        # action is only drawn when exploring.
        if np.random.random() <= (1 - e):
            sampled.append(best_per_state[state])
        else:
            sampled.append(np.random.choice(actions))
    return sampled

In [10]:
def get_e(epi,steady_e=0.01,steady_epi=1000):
    """Linearly decayed exploration rate for episode number ``epi``.

    Epsilon starts at 1 for ``epi == 0`` and falls on a straight line towards
    ``steady_e`` at ``epi == steady_epi``; for any later episode it is held at
    ``steady_e``.
    """
    if epi > steady_epi:
        return steady_e
    slope = (steady_e - 1) / (steady_epi)
    return 1 + slope * epi

In [31]:
def get_pi(n_episodes=50000,gamma=0.99):
    """Every-visit Monte Carlo control with an epsilon-greedy behaviour policy.

    Uses the notebook-level ``env``, ``episode``, ``returns``, ``get_e``,
    ``tqdm`` and the whole-table ``greedy(Q, e, actions)`` helper.

    Parameters
    ----------
    n_episodes : int, optional
        Number of episodes to sample (default 50000).
    gamma : float, optional
        Discount factor used for the returns (default 0.99).

    Returns
    -------
    list
        The greedy action for every state under the learned ``Q``. The
        exploratory policy used during training is not returned, so the
        result carries no leftover random actions.
    """
    actions=[0,1,2,3]
    Q=np.zeros((env.state_count,len(actions)))
    N=np.zeros((env.state_count,len(actions)))
    # epsilon=1: the first episode follows an (almost surely) random policy.
    pi=greedy(Q,1,actions)

    for epi in tqdm(range(n_episodes)):
        for s,a,G in returns(episode(pi),gamma):
            # Incremental mean of all returns observed for (s, a).
            N[s,a]+=1
            Q[s,a]+=(1/N[s,a])*(G-Q[s,a])
        pi=greedy(Q,get_e(epi,steady_e=0.01, steady_epi=10000),actions)
    # epsilon=0 makes greedy() keep the argmax action in every state.
    return greedy(Q,0,actions)


In [32]:
from tqdm import tqdm

In [33]:
# Run Monte Carlo control (the recorded run below took about 2m48s).
pi=get_pi()

100%|██████████| 50000/50000 [02:48<00:00, 296.35it/s] 


In [34]:
# Hand the policy returned by get_pi to the environment.
env.setPolicy(pi)

In [35]:
# Open the interactive view with the policy overlay enabled.
env.play_as_human(show_policy=True)

<img src="route_mc.png">

### SARSA

In [46]:
def get_tuple():
    """Endless generator of SARSA transitions from the global ``env``.

    Every item is ``(state, action, reward, next_state, next_action)``. Actions
    are looked up in the global ``policy`` table, so changes the consumer makes
    to ``policy`` between two ``next()`` calls affect later lookups. When
    ``env.step`` reports ``done`` the environment is reset and the stream
    continues with a new episode.
    """
    state = env.reset()
    act = policy[state]
    while True:
        successor, reward, finished, _ = env.step(act)
        # The follow-up action is read before yielding, i.e. before the
        # consumer gets a chance to modify the policy table.
        successor_act = policy[successor]
        yield (state, act, reward, successor, successor_act)
        if finished:
            state = env.reset()
            act = policy[state]
        else:
            state, act = successor, successor_act
            

In [47]:
def get_e(step,steady_e=0.01,steady_step=10000):
    """Linearly decayed exploration rate for training step ``step``.

    Epsilon is 1 at ``step == 0``, decreases on a straight line towards
    ``steady_e`` at ``step == steady_step`` and is held at ``steady_e`` for
    every later step.
    """
    return steady_e if step > steady_step else ((steady_e - 1) / (steady_step)) * step + 1

In [54]:
def greedy(Q,s,e,actions):
    """Epsilon-greedy action for the single state ``s``.

    With probability ``1 - e`` the argmax of row ``Q[s]`` is returned,
    otherwise an action drawn at random from ``actions``.
    """
    best = np.argmax(Q[s])
    if np.random.random() <= (1 - e):
        return best
    return np.random.choice(actions)

In [55]:
def get_pi(actions):
    """Learn a policy with one-step SARSA on the global ``env``.

    Uses the notebook-level ``env``, ``tqdm``, ``get_e``, the single-state
    ``greedy(Q, s, e, actions)`` helper and the SARSA ``get_tuple`` generator.
    The behaviour policy is kept in the global ``policy`` table because
    ``get_tuple`` reads it from there.

    Parameters
    ----------
    actions : sequence of int
        Available actions. They are used as column indices of ``Q``, so they
        are expected to be ``0 .. len(actions)-1``.

    Returns
    -------
    list
        The greedy action for every state under the learned ``Q``.
    """
    global policy
    Q=np.zeros((env.state_count,len(actions)))
    policy=np.random.choice(actions,size=env.state_count)
    alpha=0.001
    gamma=0.99
    gen=get_tuple()

    for step in tqdm(range(1000000)):
        s,a,r,s_p,a_p=next(gen)
        # On-policy TD target: bootstrap from the action actually chosen next.
        Q[s,a]+=alpha*((r+gamma*Q[s_p,a_p])-Q[s,a])
        policy[s]=greedy(Q,s,get_e(step,steady_e=0.01,steady_step=100000),actions)

    # Return the policy learned here. The previous `return pi` referred to a
    # name that is never assigned in this function, so it handed back whatever
    # global `pi` an earlier cell had left behind (or raised NameError).
    return list(np.argmax(Q,axis=1))


In [56]:
# Run the SARSA training loop over the four actions (about 56s in the recorded run).
pi=get_pi([0,1,2,3])

100%|██████████| 1000000/1000000 [00:56<00:00, 17693.28it/s]


In [57]:
# Hand the policy returned by get_pi to the environment.
env.setPolicy(pi)

In [58]:
# Open the interactive view; True is passed positionally
# (NOTE(review): presumably the show_policy flag — confirm).
env.play_as_human(True)

<img src="Route_sarsa.png">

### Q-learning

In [60]:
def get_tuple():
    """Endless generator of ``(state, action, reward, next_state)`` transitions.

    The action for each step is looked up in the global ``policy`` table at
    the moment the step is taken. When ``env.step`` reports ``done`` the
    global ``env`` is reset and the stream continues with a new episode.
    """
    state = env.reset()
    while True:
        move = policy[state]
        successor, reward, finished, _ = env.step(move)

        yield (state, move, reward, successor)

        # Start over from a fresh episode once the current one has ended.
        state = env.reset() if finished else successor


In [64]:
def get_pi(actions):
    """Learn a policy with one-step Q-learning on the global ``env``.

    Uses the notebook-level ``env``, ``tqdm``, ``get_e``, the single-state
    ``greedy(Q, s, e, actions)`` helper and the ``get_tuple`` generator.
    The behaviour policy is kept in the global ``policy`` table because
    ``get_tuple`` reads it from there.

    Parameters
    ----------
    actions : sequence of int
        Available actions. They are used as column indices of ``Q``, so they
        are expected to be ``0 .. len(actions)-1``.

    Returns
    -------
    list
        The greedy action for every state under the learned ``Q``.
    """
    global policy
    Q=np.zeros((env.state_count,len(actions)))
    policy=np.random.choice(actions,size=env.state_count)
    alpha=0.001
    gamma=0.99
    gen=get_tuple()

    for step in tqdm(range(1000000)):
        s,a,r,s_p=next(gen)
        # Off-policy TD target: bootstrap from the best action in s_p.
        Q[s,a]+=alpha*((r+gamma*np.max(Q[s_p]))-Q[s,a])
        policy[s]=greedy(Q,s,get_e(step,steady_e=0.01,steady_step=100000),actions)

    # Return the policy learned here. The previous `return pi` referred to a
    # name that is never assigned in this function, so it handed back whatever
    # global `pi` an earlier cell had left behind (or raised NameError).
    return list(np.argmax(Q,axis=1))


In [65]:
# Run the Q-learning training loop over the four actions (about 1m09s in the recorded run).
pi=get_pi([0,1,2,3])

100%|██████████| 1000000/1000000 [01:09<00:00, 14320.58it/s]


In [66]:
# Hand the policy returned by get_pi to the environment.
env.setPolicy(pi)

In [67]:
# Open the interactive view; True is passed positionally
# (NOTE(review): presumably the show_policy flag — confirm).
env.play_as_human(True)

<img src="Route_q.png">

### Double Q-learning

In [68]:
def get_tuple():
    """Stream ``(state, action, reward, next_state)`` transitions forever.

    Each action comes from the global ``policy`` table, read right before the
    step is taken in the global ``env``. After a step that ends the episode
    the environment is reset and the next transition starts from the state
    returned by ``env.reset()``.
    """
    here = env.reset()
    while True:
        chosen = policy[here]
        there, reward, episode_over, _ = env.step(chosen)

        yield (here, chosen, reward, there)

        if episode_over:
            there = env.reset()
        here = there


In [69]:
def get_pi(actions):
    """Learn a policy with the notebook's max/min-averaged Q update.

    Two tables are updated on every transition, one bootstrapping from the
    maximum and one from the minimum of ``Q[s_p]``; ``Q`` is their average
    and drives action selection.

    NOTE(review): this is not the standard Double Q-learning scheme (two
    independent estimators, one selecting and the other evaluating the next
    action). The update rule is kept unchanged here — confirm it is intended.

    Uses the notebook-level ``env``, ``tqdm``, ``get_e``, the single-state
    ``greedy(Q, s, e, actions)`` helper and the ``get_tuple`` generator.
    The behaviour policy is kept in the global ``policy`` table because
    ``get_tuple`` reads it from there.

    Parameters
    ----------
    actions : sequence of int
        Available actions. They are used as column indices of the Q tables,
        so they are expected to be ``0 .. len(actions)-1``.

    Returns
    -------
    list
        The greedy action for every state under the averaged ``Q``.
    """
    global policy
    Q_max=np.zeros((env.state_count,len(actions)))
    Q_min=np.zeros((env.state_count,len(actions)))
    Q = np.zeros((env.state_count,len(actions)))
    policy=np.random.choice(actions,size=env.state_count)
    alpha=0.001
    gamma=0.99
    gen=get_tuple()

    for step in tqdm(range(1000000)):
        s,a,r,s_p=next(gen)
        # Both TD errors are measured against the averaged estimate Q[s,a].
        Q_max[s,a]+=alpha*((r+gamma*np.max(Q[s_p]))-Q[s,a])
        Q_min[s,a]+=alpha*((r+gamma*np.min(Q[s_p]))-Q[s,a])
        Q[s,a]=(Q_max[s,a]+Q_min[s,a])/2
        policy[s]=greedy(Q,s,get_e(step,steady_e=0.01,steady_step=100000),actions)

    # Return the policy learned here. The previous `return pi` referred to a
    # name that is never assigned in this function, so it handed back whatever
    # global `pi` an earlier cell had left behind (or raised NameError).
    return list(np.argmax(Q,axis=1))


In [70]:
# Run the Double Q training loop over the four actions (about 1m18s in the recorded run).
pi=get_pi([0,1,2,3])

100%|██████████| 1000000/1000000 [01:18<00:00, 12759.02it/s]


In [71]:
# Hand the policy returned by get_pi to the environment.
env.setPolicy(pi)

In [72]:
# Open the interactive view; True is passed positionally
# (NOTE(review): presumably the show_policy flag — confirm).
env.play_as_human(True)

<img src="Route_double_q.png">