In [1]:
import numpy as np
import matplotlib.pyplot as plt
%matplotlib inline
import gym

In [2]:
# parameters
# Shared FrozenLake environment used by every later cell.
env = gym.make('FrozenLake-v0')
# Hand-written deterministic policy: state index -> action index to take there.
# States 5, 7, 11, 12 and 15 have no entry (presumably the states in which an
# episode ends, so no action is ever requested for them -- TODO confirm).
pi = {0:1, 1:2, 2:1, 3:0, 4:1, 6:1, 8:2, 9:0, 10:1, 13:2, 14:2}

[2018-01-16 12:26:48,898] Making new env: FrozenLake-v0


In [3]:
class TDZero:
    """Tabular one-step TD control agent with an epsilon-greedy behaviour policy.

    The update bootstraps from the maximum action value of the successor
    state (a Q-learning-style target). Learned values live in ``self.q_sa``,
    an array with one row per state and one column per action.

    Parameters
    ----------
    env : environment exposing ``observation_space.n``, ``action_space.n``,
        ``action_space.sample()``, ``reset()`` and ``step(action)`` returning
        ``(next_state, reward, done, info)``.
    alpha : step size applied to the TD error.
    gamma : discount factor applied to the bootstrapped value.
    epsilon : probability of choosing a random exploratory action.
    """

    def __init__(self,env,alpha=0.05,gamma=0.99,epsilon=0.1):
        self.env, self.alpha, self.gamma, self.epsilon = env, alpha, gamma, epsilon
        table_shape = [env.observation_space.n, env.action_space.n]
        # Every state-action value starts at zero.
        self.q_sa = np.zeros(table_shape)

    def run(self,episodes=10000):
        """Interact with the environment for ``episodes`` episodes, updating ``self.q_sa`` in place."""
        for _ in range(episodes):
            state = self.env.reset()
            finished = False
            while not finished:
                action = self.get_epsilon_greedy_action(state)
                successor, reward, finished, _info = self.env.step(action)
                # One-step target: immediate reward plus discounted best successor value.
                td_target = reward + self.gamma * np.max(self.q_sa[successor, :])
                td_error = td_target - self.q_sa[state, action]
                self.q_sa[state, action] += self.alpha * td_error
                state = successor

    def get_epsilon_greedy_action(self,state):
        """Return a random action with probability ``epsilon``, otherwise a greedy one.

        Ties between equally valued greedy actions are broken uniformly at random.
        """
        explore = np.random.rand() < self.epsilon
        if explore:
            # Random exploration
            return self.env.action_space.sample()
        # Greedy branch: gather the indices of every action attaining the row maximum.
        action_values = self.q_sa[state]
        best_actions = np.flatnonzero(action_values == np.amax(action_values))
        pick = np.random.randint(0, len(best_actions), 1)[0]
        return best_actions[pick]

In [4]:
# Size of the observation space; this is the number of rows of the Q-table.
env.observation_space.n

16

In [5]:
# Size of the action space; this is the number of columns of the Q-table.
env.action_space.n

4

In [6]:
# Agent with the default hyper-parameters (alpha=0.05, gamma=0.99, epsilon=0.1).
td0 = TDZero(env)

In [7]:
# Train for the default 10000 episodes; updates td0.q_sa in place.
td0.run()

In [8]:
# Inspect the learned action values (rows: states, columns: actions).
td0.q_sa

array([[ 0.53311282,  0.52600281,  0.52105612,  0.5198663 ],
       [ 0.349514  ,  0.33043194,  0.34076468,  0.49810644],
       [ 0.42502067,  0.44044209,  0.40920359,  0.47274627],
       [ 0.26938632,  0.3061944 ,  0.22859394,  0.46461435],
       [ 0.54576869,  0.33998453,  0.3472335 ,  0.43304992],
       [ 0.        ,  0.        ,  0.        ,  0.        ],
       [ 0.31771471,  0.1881106 ,  0.32232126,  0.16808559],
       [ 0.        ,  0.        ,  0.        ,  0.        ],
       [ 0.36647102,  0.3956311 ,  0.43303531,  0.58421574],
       [ 0.41154575,  0.6464113 ,  0.41867214,  0.36441642],
       [ 0.60471077,  0.48332618,  0.26687804,  0.38931827],
       [ 0.        ,  0.        ,  0.        ,  0.        ],
       [ 0.        ,  0.        ,  0.        ,  0.        ],
       [ 0.37695597,  0.58697422,  0.75559452,  0.52799243],
       [ 0.75768639,  0.89512845,  0.79532928,  0.811195  ],
       [ 0.        ,  0.        ,  0.        ,  0.        ]])

In [9]:
def evaluate(env,policy,trials=100):
    """Return the mean undiscounted episode return of ``policy`` over ``trials`` episodes.

    ``policy`` is indexed by the current state (``policy[state]``) to obtain
    the action passed to ``env.step``; each episode runs until the environment
    reports ``done``.
    """
    def rollout_return():
        # Play one episode from reset and add up every reward received.
        obs = env.reset()
        episode_return = 0
        finished = False
        while not finished:
            obs, step_reward, finished, _info = env.step(policy[obs])
            episode_return += step_reward
        return episode_return

    schedule = range(trials)
    total_return = sum(rollout_return() for _ in schedule)
    return total_return/len(schedule)

In [10]:
# Disabled alternative: evaluate the greedy policy read off the learned Q-table.
# pol = np.argmax(td0.q_sa, axis=1)
# evaluate(env,pol)
# Score the hand-written policy `pi` over the default 100 trials.
evaluate(env,pi)

0.04

In [11]:
# QLearning
class TDZeroQ:
    """Tabular Q-learning agent that keeps no learned state between runs.

    Each call to ``run`` starts from a fresh all-zero table and returns the
    learned table together with a greedy policy derived from it.

    Parameters
    ----------
    env : environment exposing ``observation_space.n``, ``action_space.n``,
        ``action_space.sample()``, ``reset()`` and ``step(action)`` returning
        ``(next_state, reward, done, info)``.
    """

    def __init__(self,env):
        self.env = env

    def run(self,alpha=0.05,gamma=0.99,epsilon=0.1,episodes=10000,decay_alpha=True,report_every=10000):
        """Train with one-step Q-learning and return ``(q_sa, policy)``.

        Parameters
        ----------
        alpha : constant step size; only used when ``decay_alpha`` is False.
        gamma : discount factor applied to the bootstrapped value.
        epsilon : probability of choosing a random exploratory action.
        episodes : number of training episodes.
        decay_alpha : when True (default) the step size for a state-action
            pair is ``1 / N(s, a)``, where ``N`` counts visits to that pair;
            when False the constant ``alpha`` is used.
        report_every : print the average episode reward every this many
            episodes; a falsy value (``0`` / ``None``) disables reporting.

        Returns
        -------
        q_sa : array of shape (n_states, n_actions) with the learned values.
        policy : array of length n_states holding one greedy action per
            state, ties broken uniformly at random.
        """
        n_states = self.env.observation_space.n
        n_actions = self.env.action_space.n
        q_sa = np.zeros([n_states, n_actions])
        # Visit counts per state-action pair; they drive the 1/N step size.
        n_sa = np.zeros([n_states, n_actions])
        sum_rewards = 0
        for e in range(1,episodes+1):
            done = False
            state = self.env.reset()
            while not done:
                action = self.get_epsilon_greedy_action(state,q_sa,epsilon)
                next_state,reward,done,_ = self.env.step(action)
                n_sa[state,action] += 1
                # Use a separate local so the caller's ``alpha`` is never overwritten.
                step_size = 1.0/n_sa[state,action] if decay_alpha else alpha
                td_target = reward + gamma * np.max(q_sa[next_state, : ])
                q_sa[state,action] += step_size*(td_target - q_sa[state,action])
                state = next_state
                sum_rewards += reward

            if report_every and e % report_every == 0:
                print('Averaged Reward after ' + str(e) + ' episodes')
                print(sum_rewards/report_every)
                sum_rewards = 0

        # Multiplying the "is a row maximum" mask by random noise makes argmax
        # pick uniformly among tied greedy actions instead of always the first.
        is_best = q_sa == q_sa.max(axis=1, keepdims=True)
        policy = np.argmax(np.random.random(q_sa.shape) * is_best, axis=1)
        return q_sa, policy

    def get_epsilon_greedy_action(self,state,q_sa=None,epsilon=None):
        """Return a random action with probability ``epsilon``, otherwise a greedy one.

        ``q_sa`` and ``epsilon`` default to the instance attributes of the
        same name when omitted; ``__init__`` does not set those attributes, so
        the caller must have assigned them (AttributeError otherwise). Ties
        between greedy actions are broken uniformly at random.
        """
        if q_sa is None:
            q_sa = self.q_sa
        if epsilon is None:
            epsilon = self.epsilon
        if np.random.rand() < epsilon:
            # Random exploration
            return self.env.action_space.sample()
        # Greedy branch; np.where returns a tuple, hence the [0].
        greedy = np.where(q_sa[state] == np.amax(q_sa[state]))[0]
        return greedy[np.random.randint(0,len(greedy))]