In [1]:
import numpy as np
import numpy.testing as npt
import matplotlib.pyplot as plt
%matplotlib inline
import scipy.stats
import logging
from collections import defaultdict

In [2]:
import gym
env = gym.make('FrozenLake-v0' )

[2018-01-16 13:25:13,611] Making new env: FrozenLake-v0


In [3]:
state  = env.reset()

In [4]:
def act(action):
    state, reward, done, info = env.step(action)
    print (state)
    env.render()  

In [5]:
act(3)

0
  (Up)
[41mS[0mFFF
FHFH
FFFH
HFFG


In [6]:
pi = {0:1, 1:2, 2:1, 3:0, 4:1, 6:1, 8:2, 9:0, 10:1, 13:2, 14:2}

In [7]:
state  = env.reset()
print (state)
done = False
while not done:
    env.render()
    action = pi[state]
    print ("random action:", action)
    state, reward, done, info = env.step(action)
    print (state, reward, done, info)
    if done:
        print ("return:", reward) # return for all visited states is here last reward

env.render(close=True)

0

[41mS[0mFFF
FHFH
FFFH
HFFG
random action: 1
0 0.0 False {'prob': 0.3333333333333333}
  (Down)
[41mS[0mFFF
FHFH
FFFH
HFFG
random action: 1
0 0.0 False {'prob': 0.3333333333333333}
  (Down)
[41mS[0mFFF
FHFH
FFFH
HFFG
random action: 1
4 0.0 False {'prob': 0.3333333333333333}
  (Down)
SFFF
[41mF[0mHFH
FFFH
HFFG
random action: 1
8 0.0 False {'prob': 0.3333333333333333}
  (Down)
SFFF
FHFH
[41mF[0mFFH
HFFG
random action: 2
4 0.0 False {'prob': 0.3333333333333333}
  (Right)
SFFF
[41mF[0mHFH
FFFH
HFFG
random action: 1
4 0.0 False {'prob': 0.3333333333333333}
  (Down)
SFFF
[41mF[0mHFH
FFFH
HFFG
random action: 1
8 0.0 False {'prob': 0.3333333333333333}
  (Down)
SFFF
FHFH
[41mF[0mFFH
HFFG
random action: 2
9 0.0 False {'prob': 0.3333333333333333}
  (Right)
SFFF
FHFH
F[41mF[0mFH
HFFG
random action: 0
8 0.0 False {'prob': 0.3333333333333333}
  (Left)
SFFF
FHFH
[41mF[0mFFH
HFFG
random action: 2
12 0.0 True {'prob': 0.3333333333333333}
return: 0.0


In [8]:
# V(s)
def every_visit_monte_carlo_prediction_v(mc_policy, mc_env, target_state, nb_episodes=10000, discount=0.9):
    n_s = np.zeros(mc_env.observation_space.n)
    v = np.zeros(mc_env.observation_space.n)
    for i in range(1, nb_episodes + 1):
        episode = []
        state = mc_env.reset()
        done = False
        # V
        while not done:
            action = mc_policy[state]
            next_state, reward, done, _ = mc_env.step(action)
            episode.append((state, action, reward))
            if done:
                break
            state = next_state
        every_visit_mc_v(episode,v,discount,n_s)
    return v

def every_visit_mc_v(episode,v,discount,n_s):
        sum = 0
        for x in reversed(range(len(episode))):
            s,a,r = episode[x]
            sum = r + discount * sum
            n_s[[s,a]] += 1
            alpha = 1/n_s[[s,a]]
            v[[s,a]] = v[[s,a]] + alpha * (sum - v[[s,a]])

# Q(s,a)
def every_visit_monte_carlo_prediction_q(mc_policy, mc_env, target_state, nb_episodes=10000, discount=0.9):
    n_sa = np.zeros([mc_env.observation_space.n, mc_env.action_space.n])
    q_sa = np.zeros([mc_env.observation_space.n, mc_env.action_space.n])
    for e in range(nb_episodes):
        done = False
        state = mc_env.reset()
        episode = []
        # Q
        while not done:
            action = mc_policy[state]
            next_state, reward, done, _ = mc_env.step(action)
            episode.append([state,action,reward])
            if done:
                break
            state = next_state
        # Q updaten
        every_visit_mc_q(episode,q_sa,discount,n_sa)     
    return q_sa
        
def every_visit_mc_q(episode,q_sa,discount,n_sa):
    _sum = 0
    for x in reversed(range(len(episode))):
        s,a,r = episode[x]
        _sum = r + discount * _sum
        n_sa[s,a] += 1
        alpha = 1/n_sa[s,a]
        q_sa[s,a] = q_sa[s,a] + alpha * (_sum - q_sa[s,a])

# control
def montecarlo_control(mc_env,discount=0.9,nb_episodes=10000):
    n_sa = np.zeros([mc_env.observation_space.n, mc_env.action_space.n])
    q_sa = np.zeros([mc_env.observation_space.n, mc_env.action_space.n])
    policy = np.random.randint(0,mc_env.action_space.n,mc_env.observation_space.n)
    pol_old = policy
    for e in range(1,nb_episodes+1):
        done = False
        state = mc_env.reset()
        episode = [] 
        while not done:
            action = policy[state]
            next_state, reward, done, _ = mc_env.step(action)
            episode.append([state,action,reward])
            if done:
                break
            state = next_state
        every_visit_mc_q(episode,q_sa,discount,n_sa)
        policy = policy_update(mc_env,q_sa,e)
        if e % 1000 == 0:
            print('Expected Reward post episode ' + str(e) + '')
            pol_new =np.argmax(np.random.random(q_sa.shape) * (q_sa.T==q_sa.max(axis=1)).T, axis=1)
            print(evaluate(mc_env, pol_new))
            if np.allclose(pol_new,pol_old):
                print("Policy hasn't changed")
                return n_sa,q_sa,pol_new
            pol_old = pol_new
    return n_sa,q_sa,np.argmax(np.random.random(q_sa.shape) * (q_sa.T==q_sa.max(axis=1)).T, axis=1)

def policy_update(mc_env,q_sa,k):
    epsilon = 0.1
    policy = np.zeros(mc_env.observation_space.n,dtype=int)
    for s in range(mc_env.observation_space.n):
        if np.random.rand() < epsilon:
            policy[s] = mc_env.action_space.sample()
        else:
            mostValued = np.where(q_sa[s] == np.amax(q_sa[s]))[0] # [0] da np.where returns Tupel
            choose_one = np.random.randint(0,len(mostValued),1)[0] 
            policy[s] = mostValued[choose_one]
    return policy
    
def evaluate(mc_env, policy,trials=100):
    rewardAll = 0
    counter = 0
    for i in range(trials):
        state = mc_env.reset()
        done = False
        rewardTrial = 0
        while not done:
            state, reward, done, _ = mc_env.step(policy[state])
            rewardTrial += reward
        rewardAll += rewardTrial
        counter += 1
    return rewardAll/counter

In [9]:
print(pi)
# V(s)
v_s = every_visit_monte_carlo_prediction_v(pi, env, 0, 10000)
print (v_s)

{0: 1, 1: 2, 2: 1, 3: 0, 4: 1, 6: 1, 8: 2, 9: 0, 10: 1, 13: 2, 14: 2}
[ 0.01590991  0.01658739  0.05003502  0.01042981  0.01263779  0.
  0.05589583  0.          0.02940273  0.08301691  0.23365018  0.          0.
  0.27365458  0.62428006  0.        ]


In [10]:
evaluate(env, pi)

0.01

In [11]:
print(pi)
q = every_visit_monte_carlo_prediction_q(pi,env,0, 10000)
print('Q_SA')
print(q)
print('V')
print(np.max(q,axis=1))

{0: 1, 1: 2, 2: 1, 3: 0, 4: 1, 6: 1, 8: 2, 9: 0, 10: 1, 13: 2, 14: 2}
Q_SA
[[ 0.          0.00870252  0.          0.        ]
 [ 0.          0.          0.0100307   0.        ]
 [ 0.          0.02381675  0.          0.        ]
 [ 0.01261329  0.          0.          0.        ]
 [ 0.          0.01366911  0.          0.        ]
 [ 0.          0.          0.          0.        ]
 [ 0.          0.05763451  0.          0.        ]
 [ 0.          0.          0.          0.        ]
 [ 0.          0.          0.03017671  0.        ]
 [ 0.08761386  0.          0.          0.        ]
 [ 0.          0.18287804  0.          0.        ]
 [ 0.          0.          0.          0.        ]
 [ 0.          0.          0.          0.        ]
 [ 0.          0.          0.26263611  0.        ]
 [ 0.          0.          0.5541214   0.        ]
 [ 0.          0.          0.          0.        ]]
V
[ 0.00870252  0.0100307   0.02381675  0.01261329  0.01366911  0.
  0.05763451  0.          0.03017671  0.0

In [12]:
policy_new = np.random.randint(0,env.action_space.n,env.observation_space.n)
policy_new

array([1, 1, 1, 0, 0, 3, 0, 2, 1, 3, 3, 2, 1, 3, 2, 1])

In [13]:
n,q,p = montecarlo_control(env,nb_episodes=100000)

Expected Reward post episode 1000
0.56
Expected Reward post episode 2000
0.6
Expected Reward post episode 3000
0.78
Expected Reward post episode 4000
0.75
Expected Reward post episode 5000
0.65
Expected Reward post episode 6000
0.63
Expected Reward post episode 7000
0.76
Expected Reward post episode 8000
0.72
Expected Reward post episode 9000
0.75
Expected Reward post episode 10000
0.64
Expected Reward post episode 11000
0.62
Expected Reward post episode 12000
0.72
Expected Reward post episode 13000
0.68
Expected Reward post episode 14000
0.81
Expected Reward post episode 15000
0.67
Expected Reward post episode 16000
0.74
Expected Reward post episode 17000
0.71
Expected Reward post episode 18000
0.68
Expected Reward post episode 19000
0.68
Expected Reward post episode 20000
0.75
Expected Reward post episode 21000
0.75
Expected Reward post episode 22000
0.72
Expected Reward post episode 23000
0.61
Expected Reward post episode 24000
0.69
Expected Reward post episode 25000
0.71
Expected R

In [15]:
np.random.randint(0,env.action_space.n,env.observation_space.n)

array([1, 1, 3, 2, 3, 3, 0, 2, 1, 1, 3, 0, 3, 1, 0, 2])

In [16]:
n,q,p = montecarlo_control(env, nb_episodes=100000)

Expected Reward post episode 1000
0.51
Expected Reward post episode 2000
0.42
Expected Reward post episode 3000
0.44
Expected Reward post episode 4000
0.55
Expected Reward post episode 5000
0.51
Expected Reward post episode 6000
0.59
Expected Reward post episode 7000
0.5
Expected Reward post episode 8000
0.45
Expected Reward post episode 9000
0.52
Expected Reward post episode 10000
0.57
Expected Reward post episode 11000
0.49
Expected Reward post episode 12000
0.47
Expected Reward post episode 13000
0.45
Expected Reward post episode 14000
0.48
Expected Reward post episode 15000
0.46
Expected Reward post episode 16000
0.5
Expected Reward post episode 17000
0.51
Expected Reward post episode 18000
0.5
Expected Reward post episode 19000
0.53
Expected Reward post episode 20000
0.54
Expected Reward post episode 21000
0.54
Expected Reward post episode 22000
0.54
Expected Reward post episode 23000
0.42
Expected Reward post episode 24000
0.54
Expected Reward post episode 25000
0.62
Expected Rew