In [1]:
import numpy as np
import matplotlib.pyplot as plt
%matplotlib inline
import gym

In [2]:
# parameters
env = gym.make('FrozenLake-v0')
pi = {0:1, 1:2, 2:1, 3:0, 4:1, 6:1, 8:2, 9:0, 10:1, 13:2, 14:2}

[2018-01-16 13:10:51,786] Making new env: FrozenLake-v0


In [3]:
class TDZero:
    
    def __init__(self,env,alpha=0.05,gamma=0.99,epsilon=0.1):
        self.env = env
        self.alpha = alpha
        self.gamma = gamma
        self.epsilon = epsilon
        self.q_sa = np.zeros([env.observation_space.n, env.action_space.n])

    def run(self,episodes=10000):
        for episode in range(episodes):
            done = False
            state = self.env.reset()
            while not done:
                action = self.get_epsilon_greedy_action(state)
                next_state,reward,done,_ = self.env.step(action)
                self.q_sa[state,action] = self.q_sa[state,action] + self.alpha*(reward + self.gamma * np.max(self.q_sa[next_state, : ]) - self.q_sa[state,action])
                state = next_state
                
    def get_epsilon_greedy_action(self,state):
        if np.random.rand() < self.epsilon:
            # Random exploration
            return self.env.action_space.sample()
        else:
            # Return greedy action
            greedy = np.where(self.q_sa[state] == np.amax(self.q_sa[state]))[0]
            select = np.random.randint(0,len(greedy),1)[0] 
            return greedy[select]

In [4]:
env.observation_space.n

16

In [5]:
env.action_space.n

4

In [6]:
tdz = TDZero(env)

In [7]:
tdz.run()

In [8]:
tdz.q_sa

array([[ 0.51025902,  0.49281263,  0.48164008,  0.49347   ],
       [ 0.31986352,  0.23431611,  0.27712259,  0.4475331 ],
       [ 0.37752282,  0.27988847,  0.24019309,  0.25492194],
       [ 0.03480865,  0.13571008,  0.00960432,  0.08385311],
       [ 0.52892773,  0.37836702,  0.38925102,  0.36177912],
       [ 0.        ,  0.        ,  0.        ,  0.        ],
       [ 0.35205438,  0.13860158,  0.24467419,  0.08614557],
       [ 0.        ,  0.        ,  0.        ,  0.        ],
       [ 0.35333473,  0.36828908,  0.36438721,  0.56432136],
       [ 0.51392118,  0.62887009,  0.52859803,  0.3259483 ],
       [ 0.64781015,  0.46964517,  0.33018241,  0.30104954],
       [ 0.        ,  0.        ,  0.        ,  0.        ],
       [ 0.        ,  0.        ,  0.        ,  0.        ],
       [ 0.42265795,  0.52903203,  0.69153769,  0.46977955],
       [ 0.74050786,  0.89184757,  0.77824732,  0.75820722],
       [ 0.        ,  0.        ,  0.        ,  0.        ]])

In [9]:
def evaluate(env,policy,trials=100):
    sum_rewards = 0
    n = 0
    for i in range(trials):
        state = env.reset()
        done = False
        reward_i = 0
        while not done:
            state, reward, done, _ = env.step(policy[state])
            reward_i += reward
        sum_rewards += reward_i
        n += 1
    return sum_rewards/n

In [10]:
# pol = np.argmax(td0.q_sa, axis=1)
# evaluate(env,pol)
evaluate(env,pi)

0.04

In [11]:
# QLearning
class TDZeroQ:
    
    def __init__(self,env):
        self.env = env
        
        
    def run(self,alpha=0.05,gamma=0.99,epsilon=0.1,episodes=10000):
        q_sa = np.zeros([self.env.observation_space.n, self.env.action_space.n])
        n_sa = np.zeros([self.env.observation_space.n, self.env.action_space.n])
        sum_rewards = 0
        for e in range(1,episodes+1):
            done = False
            state = self.env.reset()
            while not done:
                action = self.get_epsilon_greedy_action(state,q_sa,epsilon)
                next_state,reward,done,_ = self.env.step(action)
                n_sa[state,action] += 1
                alpha = 1/n_sa[state,action]
                q_sa[state,action] = q_sa[state,action] + alpha*(reward + gamma * np.max(q_sa[next_state, : ]) - q_sa[state,action])
                state = next_state
                sum_rewards += reward 
            
            if e % 10000 == 0:
                print('Averaged Reward after ' + str(e) + ' episodes')
                print(sum_rewards/10000)
                sum_rewards = 0
            
        return q_sa, np.argmax(np.random.random(q_sa.shape) * (q_sa.T==q_sa.max(axis=1)).T, axis=1)
                
    def get_epsilon_greedy_action(self,state,q_sa,epsilon):
        if np.random.rand() < epsilon:
            # Random exploration
            return self.env.action_space.sample()
        else:
            # Return greedy action
            greedy = np.where(q_sa[state] == np.amax(q_sa[state]))[0]
            select = np.random.randint(0,len(greedy),1)[0] 
            return greedy[select]

In [12]:
tdz_q = TDZeroQ(env)

In [13]:
q, pi = tdz_q.run(gamma=0.9,episodes=500000)

Averaged Reward after 10000 episodes
0.1763
Averaged Reward after 20000 episodes
0.2866
Averaged Reward after 30000 episodes
0.2807
Averaged Reward after 40000 episodes
0.2874
Averaged Reward after 50000 episodes
0.2807
Averaged Reward after 60000 episodes
0.2811
Averaged Reward after 70000 episodes
0.2917
Averaged Reward after 80000 episodes
0.2802
Averaged Reward after 90000 episodes
0.2829
Averaged Reward after 100000 episodes
0.2864
Averaged Reward after 110000 episodes
0.2848
Averaged Reward after 120000 episodes
0.2891
Averaged Reward after 130000 episodes
0.2848
Averaged Reward after 140000 episodes
0.2775
Averaged Reward after 150000 episodes
0.2814
Averaged Reward after 160000 episodes
0.2781
Averaged Reward after 170000 episodes
0.2809
Averaged Reward after 180000 episodes
0.277
Averaged Reward after 190000 episodes
0.275
Averaged Reward after 200000 episodes
0.2833
Averaged Reward after 210000 episodes
0.2857
Averaged Reward after 220000 episodes
0.2782
Averaged Reward after

In [14]:
evaluate(env,pi)

0.42

In [15]:
# SARSA
class TDZeroSARSA:
    
    def __init__(self,env):
        self.env = env
        
    def run(self,alpha=0.05,gamma=0.99,epsilon=0.1,episodes=100000):
        q_sa = np.zeros([self.env.observation_space.n, self.env.action_space.n])
        n_sa = np.zeros([self.env.observation_space.n, self.env.action_space.n])
        for e in range(1,episodes+1):
            done = False
            state = self.env.reset()
            while not done:
                action = self.get_epsilon_greedy_action(state,q_sa,epsilon)
                next_state,reward,done,_ = self.env.step(action)
                n_sa[state,action] += 1
                alpha = 0.02
                next_action = self.get_epsilon_greedy_action(next_state,q_sa,epsilon)
                q_sa[state,action] = q_sa[state,action] + alpha*(reward + gamma * np.max(q_sa[next_state, : ]) - q_sa[state,action])
                state = next_state
            
            if (e-1) % 1000 == 0:
                print('Averaged Reward after ' + str(e) + ' episodes')
                new_policy = np.argmax(np.random.random(q_sa.shape) * (q_sa.T==q_sa.max(axis=1)).T, axis=1)
                print(evaluate(self.env, new_policy))
            
        return q_sa, np.argmax(np.random.random(q_sa.shape) * (q_sa.T==q_sa.max(axis=1)).T, axis=1)
                
    def get_epsilon_greedy_action(self,state,q_sa,epsilon):
        if np.random.rand() < epsilon:
            # Random exploration
            return self.env.action_space.sample()
        else:
            # Return greedy action
            greedy = np.where(q_sa[state] == np.amax(q_sa[state]))[0]
            select = np.random.randint(0,len(greedy),1)[0] 
            return greedy[select]

In [16]:
tdz_sarsa = TDZeroSARSA(env)

In [17]:
q, pi = tdz_sarsa.run()

Averaged Reward after 1 episodes
0.0
Averaged Reward after 1001 episodes
0.03
Averaged Reward after 2001 episodes
0.05
Averaged Reward after 3001 episodes
0.06
Averaged Reward after 4001 episodes
0.18
Averaged Reward after 5001 episodes
0.24
Averaged Reward after 6001 episodes
0.47
Averaged Reward after 7001 episodes
0.47
Averaged Reward after 8001 episodes
0.53
Averaged Reward after 9001 episodes
0.42
Averaged Reward after 10001 episodes
0.73
Averaged Reward after 11001 episodes
0.75
Averaged Reward after 12001 episodes
0.7
Averaged Reward after 13001 episodes
0.76
Averaged Reward after 14001 episodes
0.69
Averaged Reward after 15001 episodes
0.75
Averaged Reward after 16001 episodes
0.76
Averaged Reward after 17001 episodes
0.65
Averaged Reward after 18001 episodes
0.8
Averaged Reward after 19001 episodes
0.73
Averaged Reward after 20001 episodes
0.68
Averaged Reward after 21001 episodes
0.74
Averaged Reward after 22001 episodes
0.71
Averaged Reward after 23001 episodes
0.75
Averaged

In [18]:
evaluate(env,pi)

0.72

In [19]:
pi

array([0, 3, 0, 0, 0, 2, 2, 1, 3, 1, 0, 0, 0, 2, 1, 0])

In [20]:
q

array([[ 0.5409863 ,  0.5142835 ,  0.5109796 ,  0.50515038],
       [ 0.3746719 ,  0.30424755,  0.26857674,  0.46560099],
       [ 0.40003723,  0.32396464,  0.3006988 ,  0.31704239],
       [ 0.1760727 ,  0.13680972,  0.08807234,  0.14432618],
       [ 0.55962609,  0.38828871,  0.35343693,  0.4031182 ],
       [ 0.        ,  0.        ,  0.        ,  0.        ],
       [ 0.30196425,  0.18718252,  0.32699659,  0.14156722],
       [ 0.        ,  0.        ,  0.        ,  0.        ],
       [ 0.36325864,  0.44866339,  0.44806309,  0.58973894],
       [ 0.4033352 ,  0.64322845,  0.40870175,  0.37107211],
       [ 0.64555936,  0.53369552,  0.42501489,  0.29117539],
       [ 0.        ,  0.        ,  0.        ,  0.        ],
       [ 0.        ,  0.        ,  0.        ,  0.        ],
       [ 0.44207545,  0.57377942,  0.74320409,  0.46465264],
       [ 0.71734577,  0.85649115,  0.80398268,  0.75661743],
       [ 0.        ,  0.        ,  0.        ,  0.        ]])