In [1]:
import numpy as np
import numpy.testing as npt
import matplotlib.pyplot as plt
%matplotlib inline
import scipy.stats

import sys
sys.path.append('/Users/johannesthurn/gym')
import gym

# Frozen Lake Environment laden
envFrozLake = gym.make('FrozenLake-v0' )

[2017-12-07 15:17:00,656] Making new env: FrozenLake-v0


In [None]:
def TD0(S,action,r,S_p):
    '''
    S is the last state,
    action is the action taken,
    r is the immediate reward, 
    S_P is the following State

    '''
    

In [2]:
class TemporalDifferenceZero:
    
    def __init__(self,env,alpha=0.8,gamma=0.95,epsilon=0.1):
        self.env = env
        self.alpha = alpha
        self.gamma = gamma
        self.epsilon = epsilon
        self.Q_SA = np.zeros([env.observation_space.n, env.action_space.n])
        
    def run(self,episodes=100000):
        
        for e in range(episodes):
            done = False
            s = self.env.reset()
            while not done:
                a = self.getGreedyAction(s)
                s_prime,reward,done,_ = self.env.step(a)
                self.Q_SA[s,a] = self.Q_SA[s,a] + self.alpha*(reward + self.gamma * np.max(self.Q_SA[s_prime, : ]) - self.Q_SA[s,a])
                s = s_prime
                
    def getGreedyAction(self,s):
        if np.random.rand() < self.epsilon:
            # Random Action
            return self.env.action_space.sample()
        else:
            mostValued = np.where(self.Q_SA[s] == np.amax(self.Q_SA[s]))[0] # [0] da np.where returns Tupel
            choose_one = np.random.randint(0,len(mostValued),1)[0] 
            return mostValued[choose_one]
        

In [5]:
env.observation_space.n

16

In [6]:
env.action_space.n

4

In [71]:
t = TemporalDifferenceZero(envFrozLake, alpha=0.2)

In [72]:
t.run()

In [5]:
t.Q_SA

array([[  4.24317711e-01,   5.65609772e-02,   8.74065836e-02,
          2.55576584e-01],
       [  7.50422781e-03,   9.54072035e-03,   1.47371702e-03,
          1.12163206e-01],
       [  8.84574292e-03,   1.66441572e-02,   2.96632447e-02,
          4.11438548e-02],
       [  6.86686348e-05,   1.07238984e-02,   1.03655359e-02,
          3.80294536e-02],
       [  3.71401844e-01,   3.64219646e-02,   2.50489149e-01,
          1.29694529e-01],
       [  0.00000000e+00,   0.00000000e+00,   0.00000000e+00,
          0.00000000e+00],
       [  5.60116062e-02,   4.60974432e-12,   6.02708003e-03,
          2.66762276e-09],
       [  0.00000000e+00,   0.00000000e+00,   0.00000000e+00,
          0.00000000e+00],
       [  3.00964571e-02,   9.30273535e-02,   7.16745632e-02,
          4.31325687e-01],
       [  1.61322285e-01,   6.08034744e-01,   1.47142173e-01,
          1.22751421e-01],
       [  5.19039371e-01,   1.36525639e-01,   3.78526662e-03,
          1.77258411e-03],
       [  0.00000000e

In [6]:
def evaluate(env,policy,trials=100):
    rewardAll = 0
    counter = 0
    for i in range(trials):
        state = env.reset()
        done = False
        rewardTrial = 0
        while not done:
            state, reward, done, _ = env.step(policy[state])
            rewardTrial += reward
        rewardAll += rewardTrial
        counter += 1
    return rewardAll/counter

In [73]:
pol = np.argmax(t.Q_SA, axis=1)

In [75]:
evaluate(envFrozLake,pol)

0.23

In [119]:
# QLearning
class TemporalDifferenceZero2:
    
    def __init__(self,env):
        self.env = env
        
    def run(self,gamma=0.95,epsilon=0.1,episodes=100000):
        Q_SA = np.zeros([self.env.observation_space.n, self.env.action_space.n])
        N_SA = np.zeros([self.env.observation_space.n, self.env.action_space.n])
        rAll = 0
        for e in range(1,episodes+1):
            done = False
            s = self.env.reset()
            while not done:
                a = self.getGreedyAction(s,Q_SA,epsilon)
                s_prime,reward,done,_ = self.env.step(a)
                N_SA[s,a] += 1   # Aplha nicht sinnvoll
                alpha = 1/N_SA[s,a]    #.       Q(S_Prime) ist nun nicht von Pi abhaengig -> Q-Learning
                Q_SA[s,a] = Q_SA[s,a] + alpha*(reward + gamma * np.max(Q_SA[s_prime, : ]) - Q_SA[s,a])
                s = s_prime
                rAll += reward 
            
            if e % 10000 == 0:
                print('Averaged Reward after ' + str(e) + ' episodes')
                print(rAll/10000)
                rAll = 0
            
        return Q_SA, np.argmax(np.random.random(Q_SA.shape) * (Q_SA.T==Q_SA.max(axis=1)).T, axis=1)
                
    def getGreedyAction(self,s,Q_SA,epsilon):
        if np.random.rand() < epsilon:
            # Random Action
            return self.env.action_space.sample()
        else:
            mostValued = np.where(Q_SA[s] == np.amax(Q_SA[s]))[0] # [0] da np.where returns Tupel
            choose_one = np.random.randint(0,len(mostValued),1)[0] 
            return mostValued[choose_one]

In [120]:
t2 = TemporalDifferenceZero2(envFrozLake)

In [121]:
q, pol = t2.run(gamma=0.9,episodes=500000)

Averaged Reward after 10000 episodes
0.2271
Averaged Reward after 20000 episodes
0.2755
Averaged Reward after 30000 episodes
0.2856
Averaged Reward after 40000 episodes
0.2876
Averaged Reward after 50000 episodes
0.2784
Averaged Reward after 60000 episodes
0.2823
Averaged Reward after 70000 episodes
0.279
Averaged Reward after 80000 episodes
0.2877
Averaged Reward after 90000 episodes
0.2822
Averaged Reward after 100000 episodes
0.2777
Averaged Reward after 110000 episodes
0.2868
Averaged Reward after 120000 episodes
0.2805
Averaged Reward after 130000 episodes
0.2792
Averaged Reward after 140000 episodes
0.2873
Averaged Reward after 150000 episodes
0.2779
Averaged Reward after 160000 episodes
0.2755
Averaged Reward after 170000 episodes
0.2857
Averaged Reward after 180000 episodes
0.2871
Averaged Reward after 190000 episodes
0.2857
Averaged Reward after 200000 episodes
0.2853
Averaged Reward after 210000 episodes
0.2892
Averaged Reward after 220000 episodes
0.2776
Averaged Reward afte

In [122]:
evaluate(envFrozLake,pol)

0.31

In [123]:
#Sarsa -> on Policy
class TemporalDifferenceZero3:
    
    def __init__(self,env):
        self.env = env
        
    def run(self,gamma=0.95,epsilon=0.05,episodes=100000):
        Q_SA = np.zeros([self.env.observation_space.n, self.env.action_space.n])
        N_SA = np.zeros([self.env.observation_space.n, self.env.action_space.n])
        
        for e in range(1,episodes+1):
            done = False
            s = self.env.reset()
            while not done:
                a = self.getGreedyAction(s,Q_SA,epsilon)
                s_prime,reward,done,_ = self.env.step(a)
                N_SA[s,a] += 1
                alpha = 0.02 #1/e - ganz schlecht #1/N_SA[s,a]
                a_prime = self.getGreedyAction(s_prime,Q_SA,epsilon)
                Q_SA[s,a] = Q_SA[s,a] + alpha*(reward + gamma * Q_SA[s_prime,a_prime] - Q_SA[s,a])
                s = s_prime
            
            if (e-1) % 1000 == 0:
                print('Expected Reward after ' + str(e) + ' episodes')
                pol_new =np.argmax(np.random.random(Q_SA.shape) * (Q_SA.T==Q_SA.max(axis=1)).T, axis=1)
                print(evaluate(self.env,pol_new))
            
        return Q_SA, np.argmax(np.random.random(Q_SA.shape) * (Q_SA.T==Q_SA.max(axis=1)).T, axis=1)
                
    def getGreedyAction(self,s,Q_SA,epsilon):
        if np.random.rand() < epsilon:
            # Random Action
            return self.env.action_space.sample()
        else:
            mostValued = np.where(Q_SA[s] == np.amax(Q_SA[s]))[0] # [0] da np.where returns Tupel
            choose_one = np.random.randint(0,len(mostValued),1)[0] 
            return mostValued[choose_one]

In [124]:
t3 = TemporalDifferenceZero3(envFrozLake)

In [125]:
q, pol = t3.run(gamma=0.95)

Expected Reward after 1 episodes
0.0
Expected Reward after 1001 episodes
0.0
Expected Reward after 2001 episodes
0.04
Expected Reward after 3001 episodes
0.01
Expected Reward after 4001 episodes
0.07
Expected Reward after 5001 episodes
0.14
Expected Reward after 6001 episodes
0.13
Expected Reward after 7001 episodes
0.1
Expected Reward after 8001 episodes
0.15
Expected Reward after 9001 episodes
0.53
Expected Reward after 10001 episodes
0.68
Expected Reward after 11001 episodes
0.65
Expected Reward after 12001 episodes
0.6
Expected Reward after 13001 episodes
0.73
Expected Reward after 14001 episodes
0.66
Expected Reward after 15001 episodes
0.74
Expected Reward after 16001 episodes
0.69
Expected Reward after 17001 episodes
0.77
Expected Reward after 18001 episodes
0.73
Expected Reward after 19001 episodes
0.7
Expected Reward after 20001 episodes
0.68
Expected Reward after 21001 episodes
0.27
Expected Reward after 22001 episodes
0.64
Expected Reward after 23001 episodes
0.76
Expected R

In [132]:
evaluate(envFrozLake,pol)

0.75

In [133]:
pol

array([0, 3, 0, 3, 0, 1, 2, 1, 3, 1, 0, 3, 3, 2, 1, 3])

In [134]:
q

array([[ 0.14827263,  0.13735084,  0.13826603,  0.13072047],
       [ 0.08030282,  0.08344967,  0.07712953,  0.12773562],
       [ 0.12870363,  0.11882963,  0.12006722,  0.11399978],
       [ 0.02125939,  0.02389113,  0.0299297 ,  0.10909998],
       [ 0.17830151,  0.11123817,  0.11725045,  0.09993456],
       [ 0.        ,  0.        ,  0.        ,  0.        ],
       [ 0.14037805,  0.11877892,  0.16442737,  0.02518008],
       [ 0.        ,  0.        ,  0.        ,  0.        ],
       [ 0.11979872,  0.17256026,  0.16758132,  0.24122842],
       [ 0.2147155 ,  0.35557153,  0.25344186,  0.20779372],
       [ 0.36820489,  0.29373512,  0.3108812 ,  0.16224576],
       [ 0.        ,  0.        ,  0.        ,  0.        ],
       [ 0.        ,  0.        ,  0.        ,  0.        ],
       [ 0.21581934,  0.3535727 ,  0.46847   ,  0.28152295],
       [ 0.49758102,  0.68940032,  0.60770646,  0.58204025],
       [ 0.        ,  0.        ,  0.        ,  0.        ]])

In [None]:
import tslearn